Importing rustc-1.56.0
Change-Id: I98941481270706fa55f8fb2cb91686ae3bd30f38
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64.h b/src/llvm-project/llvm/lib/Target/AArch64/AArch64.h
index d2170a9..658d447 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64.h
@@ -42,6 +42,7 @@
FunctionPass *createAArch64IndirectThunks();
FunctionPass *createAArch64SpeculationHardeningPass();
FunctionPass *createAArch64LoadStoreOptimizationPass();
+ModulePass *createAArch64LowerHomogeneousPrologEpilogPass();
FunctionPass *createAArch64SIMDInstrOptPass();
ModulePass *createAArch64PromoteConstantPass();
FunctionPass *createAArch64ConditionOptimizerPass();
@@ -58,7 +59,8 @@
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
AArch64Subtarget &, AArch64RegisterBankInfo &);
-FunctionPass *createAArch64PreLegalizerCombiner(bool IsOptNone);
+FunctionPass *createAArch64O0PreLegalizerCombiner();
+FunctionPass *createAArch64PreLegalizerCombiner();
FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone);
FunctionPass *createAArch64PostLegalizerLowering();
FunctionPass *createAArch64PostSelectOptimize();
@@ -79,7 +81,9 @@
void initializeAArch64SLSHardeningPass(PassRegistry&);
void initializeAArch64SpeculationHardeningPass(PassRegistry&);
void initializeAArch64LoadStoreOptPass(PassRegistry&);
+void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
void initializeAArch64SIMDInstrOptPass(PassRegistry&);
+void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
void initializeAArch64PostLegalizerCombinerPass(PassRegistry &);
void initializeAArch64PostLegalizerLoweringPass(PassRegistry &);
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64.td
index 7628552..d8dd9d1 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64.td
@@ -147,12 +147,12 @@
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
"Has zero-cycle zeroing instructions for generic registers">;
-def FeatureZCZeroingFP : SubtargetFeature<"zcz-fp", "HasZeroCycleZeroingFP", "true",
- "Has zero-cycle zeroing instructions for FP registers">;
+def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false",
+ "Has no zero-cycle zeroing instructions for FP registers">;
def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
"Has zero-cycle zeroing instructions",
- [FeatureZCZeroingGP, FeatureZCZeroingFP]>;
+ [FeatureZCZeroingGP]>;
/// ... but the floating-point version doesn't quite work in rare cases on older
/// CPUs.
@@ -174,9 +174,6 @@
def FeatureCallSavedX#i : SubtargetFeature<"call-saved-x"#i,
"CustomCallSavedXRegs["#i#"]", "true", "Make X"#i#" callee saved.">;
-def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
- "Use alias analysis during codegen">;
-
def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
"true",
"balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
@@ -429,6 +426,20 @@
SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization",
"true", "Enable enhanced counter virtualization extension">;
+def FeatureRME : SubtargetFeature<"rme", "HasRME",
+ "true", "Enable Realm Management Extension">;
+
+// FIXME: SME should only imply the subset of SVE(2) instructions that are
+// legal in streaming mode.
+def FeatureSME : SubtargetFeature<"sme", "HasSME", "true",
+ "Enable Scalable Matrix Extension (SME)", [FeatureSVE2, FeatureBF16]>;
+
+def FeatureSMEF64 : SubtargetFeature<"sme-f64", "HasSMEF64", "true",
+ "Enable Scalable Matrix Extension (SME) F64F64 instructions", [FeatureSME]>;
+
+def FeatureSMEI64 : SubtargetFeature<"sme-i64", "HasSMEI64", "true",
+ "Enable Scalable Matrix Extension (SME) I16I64 instructions", [FeatureSME]>;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -526,6 +537,9 @@
def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr",
"HardenSlsBlr", "true",
"Harden against straight line speculation across BLR instructions">;
+def FeatureHardenSlsNoComdat : SubtargetFeature<"harden-sls-nocomdat",
+ "HardenSlsNoComdat", "true",
+ "Generate thunk code for SLS mitigation in the normal text section">;
//===----------------------------------------------------------------------===//
// AArch64 Processors supported.
@@ -546,6 +560,10 @@
let F = [HasPAuth];
}
+def SMEUnsupported : AArch64Unsupported {
+ let F = [HasSME, HasSMEF64, HasSMEI64];
+}
+
include "AArch64SchedA53.td"
include "AArch64SchedA55.td"
include "AArch64SchedA57.td"
@@ -581,7 +599,6 @@
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
- FeatureUseAA
]>;
def ProcA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
@@ -594,7 +611,8 @@
FeatureFullFP16,
FeatureDotProd,
FeatureRCPC,
- FeaturePerfMon
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
]>;
def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
@@ -728,7 +746,6 @@
"CortexR82",
"Cortex-R82 ARM Processors", [
FeaturePostRAScheduler,
- // TODO: crypto and FuseAES
// All other features are implied by v8_0r ops:
HasV8_0rOps,
]>;
@@ -913,8 +930,7 @@
FeatureLSLFast,
FeaturePerfMon,
FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive,
- FeatureZCZeroingFP]>;
+ FeaturePredictableSelectIsExpensive]>;
def ProcExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M4 processors",
@@ -977,6 +993,8 @@
FeatureNEON,
FeatureRCPC,
FeatureSSBS,
+ FeaturePostRAScheduler,
+ FeatureFuseAES,
]>;
def ProcNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily",
@@ -991,6 +1009,8 @@
FeatureRCPC,
FeatureSPE,
FeatureSSBS,
+ FeaturePostRAScheduler,
+ FeatureFuseAES,
]>;
def ProcNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily",
@@ -1003,7 +1023,11 @@
FeatureMTE,
FeatureSVE2,
FeatureSVE2BitPerm,
- FeatureTRBE]>;
+ FeatureTRBE,
+ FeaturePostRAScheduler,
+ FeatureCrypto,
+ FeatureFuseAES,
+ ]>;
def ProcNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily",
"NeoverseV1",
@@ -1066,7 +1090,6 @@
FeaturePredictableSelectIsExpensive,
FeatureLSE,
FeaturePAuth,
- FeatureUseAA,
FeatureBalanceFPOps,
FeaturePerfMon,
FeatureStrictAlign,
@@ -1195,12 +1218,15 @@
def : ProcessorModel<"apple-a13", CycloneModel, [ProcAppleA13]>;
def : ProcessorModel<"apple-a14", CycloneModel, [ProcAppleA14]>;
+// Mac CPUs
+def : ProcessorModel<"apple-m1", CycloneModel, [ProcAppleA14]>;
+
// watch CPUs.
def : ProcessorModel<"apple-s4", CycloneModel, [ProcAppleA12]>;
def : ProcessorModel<"apple-s5", CycloneModel, [ProcAppleA12]>;
// Alias for the latest Apple processor model supported by LLVM.
-def : ProcessorModel<"apple-latest", CycloneModel, [ProcAppleA13]>;
+def : ProcessorModel<"apple-latest", CycloneModel, [ProcAppleA14]>;
// Fujitsu A64FX
def : ProcessorModel<"a64fx", A64FXModel, [ProcA64FX]>;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index a0c5498..3ab9b25 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -109,9 +109,9 @@
typedef std::tuple<unsigned, bool, uint32_t> HwasanMemaccessTuple;
std::map<HwasanMemaccessTuple, MCSymbol *> HwasanMemaccessSymbols;
void LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI);
- void EmitHwasanMemaccessSymbols(Module &M);
+ void emitHwasanMemaccessSymbols(Module &M);
- void EmitSled(const MachineInstr &MI, SledKind Kind);
+ void emitSled(const MachineInstr &MI, SledKind Kind);
/// tblgen'erated driver function for lowering simple MI->MC
/// pseudo instructions.
@@ -178,10 +178,10 @@
AArch64FunctionInfo *AArch64FI = nullptr;
/// Emit the LOHs contained in AArch64FI.
- void EmitLOHs();
+ void emitLOHs();
/// Emit instruction to set float register to zero.
- void EmitFMov0(const MachineInstr &MI);
+ void emitFMov0(const MachineInstr &MI);
using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>;
@@ -191,7 +191,32 @@
} // end anonymous namespace
void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) {
- if (!TM.getTargetTriple().isOSBinFormatELF())
+ const Triple &TT = TM.getTargetTriple();
+
+ if (TT.isOSBinFormatCOFF()) {
+ // Emit an absolute @feat.00 symbol. This appears to be some kind of
+ // compiler features bitfield read by link.exe.
+ MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00"));
+ OutStreamer->BeginCOFFSymbolDef(S);
+ OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+ OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
+ OutStreamer->EndCOFFSymbolDef();
+ int64_t Feat00Flags = 0;
+
+ if (M.getModuleFlag("cfguard")) {
+ Feat00Flags |= 0x800; // Object is CFG-aware.
+ }
+
+ if (M.getModuleFlag("ehcontguard")) {
+ Feat00Flags |= 0x4000; // Object also has EHCont.
+ }
+
+ OutStreamer->emitSymbolAttribute(S, MCSA_Global);
+ OutStreamer->emitAssignment(
+ S, MCConstantExpr::create(Feat00Flags, MMI->getContext()));
+ }
+
+ if (!TT.isOSBinFormatELF())
return;
// Assemble feature flags that may require creation of a note section.
@@ -235,21 +260,18 @@
return;
}
- EmitSled(MI, SledKind::FUNCTION_ENTER);
+ emitSled(MI, SledKind::FUNCTION_ENTER);
}
-void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI)
-{
- EmitSled(MI, SledKind::FUNCTION_EXIT);
+void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI) {
+ emitSled(MI, SledKind::FUNCTION_EXIT);
}
-void AArch64AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI)
-{
- EmitSled(MI, SledKind::TAIL_CALL);
+void AArch64AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI) {
+ emitSled(MI, SledKind::TAIL_CALL);
}
-void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
-{
+void AArch64AsmPrinter::emitSled(const MachineInstr &MI, SledKind Kind) {
static const int8_t NoopsInSledCount = 7;
// We want to emit the following pattern:
//
@@ -312,7 +334,7 @@
.addExpr(MCSymbolRefExpr::create(Sym, OutContext)));
}
-void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
+void AArch64AsmPrinter::emitHwasanMemaccessSymbols(Module &M) {
if (HwasanMemaccessSymbols.empty())
return;
@@ -352,7 +374,7 @@
OutStreamer->SwitchSection(OutContext.getELFSection(
".text.hot", ELF::SHT_PROGBITS,
ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
- Sym->getName()));
+ Sym->getName(), /*IsComdat=*/true));
OutStreamer->emitSymbolAttribute(Sym, MCSA_ELF_TypeFunction);
OutStreamer->emitSymbolAttribute(Sym, MCSA_Weak);
@@ -539,7 +561,7 @@
}
void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) {
- EmitHwasanMemaccessSymbols(M);
+ emitHwasanMemaccessSymbols(M);
const Triple &TT = TM.getTargetTriple();
if (TT.isOSBinFormatMachO()) {
@@ -557,7 +579,7 @@
}
-void AArch64AsmPrinter::EmitLOHs() {
+void AArch64AsmPrinter::emitLOHs() {
SmallVector<MCSymbol *, 3> MCArgs;
for (const auto &D : AArch64FI->getLOHContainer()) {
@@ -574,7 +596,7 @@
void AArch64AsmPrinter::emitFunctionBodyEnd() {
if (!AArch64FI->getLOHRelated().empty())
- EmitLOHs();
+ emitLOHs();
}
/// GetCPISymbol - Return the symbol for the specified constant pool entry.
@@ -631,6 +653,9 @@
case 'x':
Reg = getXRegFromWReg(Reg);
break;
+ case 't':
+ Reg = getXRegFromXRegTuple(Reg);
+ break;
}
O << AArch64InstPrinter::getRegisterName(Reg);
@@ -727,6 +752,10 @@
AArch64::GPR64allRegClass.contains(Reg))
return printAsmMRegister(MO, 'x', O);
+ // If this is an x register tuple, print an x register.
+ if (AArch64::GPR64x8ClassRegClass.contains(Reg))
+ return printAsmMRegister(MO, 't', O);
+
unsigned AltName = AArch64::NoRegAltName;
const TargetRegisterClass *RegClass;
if (AArch64::ZPRRegClass.contains(Reg)) {
@@ -768,11 +797,15 @@
OS << MI->getDebugVariable()->getName();
OS << " <- ";
// Frame address. Currently handles register +- offset only.
- assert(MI->getDebugOperand(0).isReg() && MI->isDebugOffsetImm());
+ assert(MI->isIndirectDebugValue());
OS << '[';
- printOperand(MI, 0, OS);
- OS << '+';
- printOperand(MI, 1, OS);
+ for (unsigned I = 0, E = std::distance(MI->debug_operands().begin(),
+ MI->debug_operands().end());
+ I < E; ++I) {
+ if (I != 0)
+ OS << ", ";
+ printOperand(MI, I, OS);
+ }
OS << ']';
OS << "+";
printOperand(MI, NOps - 2, OS);
@@ -1059,20 +1092,19 @@
OutStreamer->emitInstruction(MI, getSubtargetInfo());
}
-void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
+void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
Register DestReg = MI.getOperand(0).getReg();
if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
- // Convert H/S/D register to corresponding Q register
+ // Convert H/S register to corresponding D register
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
- DestReg = AArch64::Q0 + (DestReg - AArch64::H0);
+ DestReg = AArch64::D0 + (DestReg - AArch64::H0);
else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31)
- DestReg = AArch64::Q0 + (DestReg - AArch64::S0);
- else {
+ DestReg = AArch64::D0 + (DestReg - AArch64::S0);
+ else
assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
- DestReg = AArch64::Q0 + (DestReg - AArch64::D0);
- }
+
MCInst MOVI;
- MOVI.setOpcode(AArch64::MOVIv2d_ns);
+ MOVI.setOpcode(AArch64::MOVID);
MOVI.addOperand(MCOperand::createReg(DestReg));
MOVI.addOperand(MCOperand::createImm(0));
EmitToStreamer(*OutStreamer, MOVI);
@@ -1185,7 +1217,8 @@
}
break;
- case AArch64::DBG_VALUE: {
+ case AArch64::DBG_VALUE:
+ case AArch64::DBG_VALUE_LIST: {
if (isVerbose() && OutStreamer->hasRawTextSupport()) {
SmallString<128> TmpStr;
raw_svector_ostream OS(TmpStr);
@@ -1200,13 +1233,13 @@
ExceptionHandlingType != ExceptionHandling::ARM)
return;
- if (needsCFIMoves() == CFI_M_None)
+ if (getFunctionCFISectionType(*MF) == CFISection::None)
return;
OutStreamer->emitCFIBKeyFrame();
return;
}
- }
+ }
// Tail calls use pseudo instructions so they have the proper code-gen
// attributes (isCall, isReturn, etc.). We lower them to the real
@@ -1322,7 +1355,7 @@
case AArch64::FMOVH0:
case AArch64::FMOVS0:
case AArch64::FMOVD0:
- EmitFMov0(*MI);
+ emitFMov0(*MI);
return;
case TargetOpcode::STACKMAP:
@@ -1355,29 +1388,29 @@
return;
case AArch64::SEH_StackAlloc:
- TS->EmitARM64WinCFIAllocStack(MI->getOperand(0).getImm());
+ TS->emitARM64WinCFIAllocStack(MI->getOperand(0).getImm());
return;
case AArch64::SEH_SaveFPLR:
- TS->EmitARM64WinCFISaveFPLR(MI->getOperand(0).getImm());
+ TS->emitARM64WinCFISaveFPLR(MI->getOperand(0).getImm());
return;
case AArch64::SEH_SaveFPLR_X:
assert(MI->getOperand(0).getImm() < 0 &&
"Pre increment SEH opcode must have a negative offset");
- TS->EmitARM64WinCFISaveFPLRX(-MI->getOperand(0).getImm());
+ TS->emitARM64WinCFISaveFPLRX(-MI->getOperand(0).getImm());
return;
case AArch64::SEH_SaveReg:
- TS->EmitARM64WinCFISaveReg(MI->getOperand(0).getImm(),
+ TS->emitARM64WinCFISaveReg(MI->getOperand(0).getImm(),
MI->getOperand(1).getImm());
return;
case AArch64::SEH_SaveReg_X:
assert(MI->getOperand(1).getImm() < 0 &&
"Pre increment SEH opcode must have a negative offset");
- TS->EmitARM64WinCFISaveRegX(MI->getOperand(0).getImm(),
- -MI->getOperand(1).getImm());
+ TS->emitARM64WinCFISaveRegX(MI->getOperand(0).getImm(),
+ -MI->getOperand(1).getImm());
return;
case AArch64::SEH_SaveRegP:
@@ -1385,13 +1418,13 @@
MI->getOperand(0).getImm() <= 28) {
assert((MI->getOperand(0).getImm() - 19) % 2 == 0 &&
"Register paired with LR must be odd");
- TS->EmitARM64WinCFISaveLRPair(MI->getOperand(0).getImm(),
+ TS->emitARM64WinCFISaveLRPair(MI->getOperand(0).getImm(),
MI->getOperand(2).getImm());
return;
}
assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
"Non-consecutive registers not allowed for save_regp");
- TS->EmitARM64WinCFISaveRegP(MI->getOperand(0).getImm(),
+ TS->emitARM64WinCFISaveRegP(MI->getOperand(0).getImm(),
MI->getOperand(2).getImm());
return;
@@ -1400,26 +1433,26 @@
"Non-consecutive registers not allowed for save_regp_x");
assert(MI->getOperand(2).getImm() < 0 &&
"Pre increment SEH opcode must have a negative offset");
- TS->EmitARM64WinCFISaveRegPX(MI->getOperand(0).getImm(),
+ TS->emitARM64WinCFISaveRegPX(MI->getOperand(0).getImm(),
-MI->getOperand(2).getImm());
return;
case AArch64::SEH_SaveFReg:
- TS->EmitARM64WinCFISaveFReg(MI->getOperand(0).getImm(),
+ TS->emitARM64WinCFISaveFReg(MI->getOperand(0).getImm(),
MI->getOperand(1).getImm());
return;
case AArch64::SEH_SaveFReg_X:
assert(MI->getOperand(1).getImm() < 0 &&
"Pre increment SEH opcode must have a negative offset");
- TS->EmitARM64WinCFISaveFRegX(MI->getOperand(0).getImm(),
+ TS->emitARM64WinCFISaveFRegX(MI->getOperand(0).getImm(),
-MI->getOperand(1).getImm());
return;
case AArch64::SEH_SaveFRegP:
assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
"Non-consecutive registers not allowed for save_regp");
- TS->EmitARM64WinCFISaveFRegP(MI->getOperand(0).getImm(),
+ TS->emitARM64WinCFISaveFRegP(MI->getOperand(0).getImm(),
MI->getOperand(2).getImm());
return;
@@ -1428,32 +1461,32 @@
"Non-consecutive registers not allowed for save_regp_x");
assert(MI->getOperand(2).getImm() < 0 &&
"Pre increment SEH opcode must have a negative offset");
- TS->EmitARM64WinCFISaveFRegPX(MI->getOperand(0).getImm(),
+ TS->emitARM64WinCFISaveFRegPX(MI->getOperand(0).getImm(),
-MI->getOperand(2).getImm());
return;
case AArch64::SEH_SetFP:
- TS->EmitARM64WinCFISetFP();
+ TS->emitARM64WinCFISetFP();
return;
case AArch64::SEH_AddFP:
- TS->EmitARM64WinCFIAddFP(MI->getOperand(0).getImm());
+ TS->emitARM64WinCFIAddFP(MI->getOperand(0).getImm());
return;
case AArch64::SEH_Nop:
- TS->EmitARM64WinCFINop();
+ TS->emitARM64WinCFINop();
return;
case AArch64::SEH_PrologEnd:
- TS->EmitARM64WinCFIPrologEnd();
+ TS->emitARM64WinCFIPrologEnd();
return;
case AArch64::SEH_EpilogStart:
- TS->EmitARM64WinCFIEpilogStart();
+ TS->emitARM64WinCFIEpilogStart();
return;
case AArch64::SEH_EpilogEnd:
- TS->EmitARM64WinCFIEpilogEnd();
+ TS->emitARM64WinCFIEpilogEnd();
return;
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
index d3b5166..d2acd1d 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
@@ -64,7 +64,6 @@
LLVM_DEBUG(
dbgs() << "********** AArch64 Branch Targets **********\n"
<< "********** Function: " << MF.getName() << '\n');
- const Function &F = MF.getFunction();
// LLVM does not consider basic blocks which are the targets of jump tables
// to be address-taken (the address can't escape anywhere else), but they are
@@ -78,13 +77,16 @@
bool MadeChange = false;
for (MachineBasicBlock &MBB : MF) {
bool CouldCall = false, CouldJump = false;
- // If the function is address-taken or externally-visible, it could be
- // indirectly called. PLT entries and tail-calls use BR, but when they are
+ // Even in cases where a function has internal linkage and is only called
+ // directly in its translation unit, it can still be called indirectly if
+ // the linker decides to add a thunk to it for whatever reason (say, for
+ // example, if it is finally placed far from its call site and a BL is not
+ // long-range enough). PLT entries and tail-calls use BR, but when they are
// are in guarded pages should all use x16 or x17 to hold the called
// address, so we don't need to set CouldJump here. BR instructions in
// non-guarded pages (which might be non-BTI-aware code) are allowed to
// branch to a "BTI c" using any register.
- if (&MBB == &*MF.begin() && (F.hasAddressTaken() || !F.hasLocalLinkage()))
+ if (&MBB == &*MF.begin())
CouldCall = true;
// If the block itself is address-taken, it could be indirectly branched
@@ -119,8 +121,10 @@
auto MBBI = MBB.begin();
- // Skip the meta instuctions, those will be removed anyway.
- for (; MBBI != MBB.end() && MBBI->isMetaInstruction(); ++MBBI)
+ // Skip the meta instructions, those will be removed anyway.
+ for (; MBBI != MBB.end() &&
+ (MBBI->isMetaInstruction() || MBBI->getOpcode() == AArch64::EMITBKEY);
+ ++MBBI)
;
// SCTLR_EL1.BT[01] is set to 0 by default which means
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index c51dd48..bfcafc6 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -88,13 +88,8 @@
}
unsigned Size = LocVT.getSizeInBits() / 8;
- const Align StackAlign =
- State.getMachineFunction().getDataLayout().getStackAlignment();
- const Align OrigAlign = ArgFlags.getNonZeroOrigAlign();
- const Align Alignment = std::min(OrigAlign, StackAlign);
-
for (auto &It : PendingMembers) {
- It.convertToMem(State.AllocateStack(Size, std::max(Alignment, SlotAlign)));
+ It.convertToMem(State.AllocateStack(Size, SlotAlign));
State.addLoc(It);
SlotAlign = Align(1);
}
@@ -197,7 +192,12 @@
State.AllocateReg(Reg);
}
- const Align SlotAlign = Subtarget.isTargetDarwin() ? Align(1) : Align(8);
+ const Align StackAlign =
+ State.getMachineFunction().getDataLayout().getStackAlignment();
+ const Align MemAlign = ArgFlags.getNonZeroMemAlign();
+ Align SlotAlign = std::min(MemAlign, StackAlign);
+ if (!Subtarget.isTargetDarwin())
+ SlotAlign = std::max(SlotAlign, Align(8));
return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign);
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index fdcc890..4b7ce56 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -69,6 +69,10 @@
// A SwiftError is passed in X21.
CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>,
+ // Pass SwiftAsync in an otherwise callee saved register so that it will be
+ // preserved for normal function calls.
+ CCIfSwiftAsync<CCIfType<[i64], CCAssignToRegWithShadow<[X22], [W22]>>>,
+
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
@@ -166,7 +170,8 @@
// Vararg functions on windows pass floats in integer registers
let Entry = 1 in
def CC_AArch64_Win64_VarArg : CallingConv<[
- CCIfType<[f16, bf16, f32], CCPromoteToType<f64>>,
+ CCIfType<[f16, bf16], CCBitConvertToType<i16>>,
+ CCIfType<[f32], CCBitConvertToType<i32>>,
CCIfType<[f64], CCBitConvertToType<i64>>,
CCDelegateTo<CC_AArch64_AAPCS>
]>;
@@ -202,6 +207,10 @@
// A SwiftError is passed in X21.
CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>,
+ // Pass SwiftAsync in an otherwise callee saved register so that it will be
+ // preserved for normal function calls.
+ CCIfSwiftAsync<CCIfType<[i64], CCAssignToRegWithShadow<[X22], [W22]>>>,
+
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
@@ -412,6 +421,9 @@
X19, X20, X21, X22, X23, X24,
X25, X26, X27, X28, LR, FP)>;
+def CSR_AArch64_AAPCS_SwiftTail
+ : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X20, X22)>;
+
// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
// 'this' and the pointer return value are both passed in X0 in these cases,
// this can be partially modelled by treating X0 as a callee-saved register;
@@ -464,6 +476,9 @@
def CSR_Darwin_AArch64_AAPCS_SwiftError
: CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>;
+def CSR_Darwin_AArch64_AAPCS_SwiftTail
+ : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X20, X22)>;
+
// The function used by Darwin to obtain the address of a thread-local variable
// guarantees more than a normal AAPCS function. x16 and x17 are used on the
// fast path for calculation, but other registers except X0 (argument/return)
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index efdb113..ac24334 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -419,13 +419,37 @@
++NumADRPToLDR;
}
break;
- case MCLOH_AdrpAddLdr:
+ case MCLOH_AdrpAddLdr: {
+ // There is a possibility that the linker may try to rewrite:
+ // adrp x0, @sym@PAGE
+ // add x1, x0, @sym@PAGEOFF
+ // [x0 = some other def]
+ // ldr x2, [x1]
+ // ...into...
+ // adrp x0, @sym
+ // nop
+ // [x0 = some other def]
+ // ldr x2, [x0]
+ // ...if the offset to the symbol won't fit within a literal load.
+ // This causes the load to use the result of the adrp, which in this
+ // case has already been clobbered.
+ // FIXME: Implement proper liveness tracking for all registers. For now,
+ // don't emit the LOH if there are any instructions between the add and
+ // the ldr.
+ MachineInstr *AddMI = const_cast<MachineInstr *>(Info.MI1);
+ const MachineInstr *LdrMI = Info.MI0;
+ auto AddIt = MachineBasicBlock::iterator(AddMI);
+ auto EndIt = AddMI->getParent()->end();
+ if (AddMI->getIterator() == EndIt || LdrMI != &*next_nodbg(AddIt, EndIt))
+ break;
+
LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddLdr:\n"
<< '\t' << MI << '\t' << *Info.MI1 << '\t'
<< *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpAddLdr, {&MI, Info.MI1, Info.MI0});
++NumADDToLDR;
break;
+ }
case MCLOH_AdrpAddStr:
if (Info.MI1 != nullptr) {
LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n"
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64Combine.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64Combine.td
index b1e7146..d938008 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -17,14 +17,39 @@
[{ return matchFConstantToConstant(*${root}, MRI); }]),
(apply [{ applyFConstantToConstant(*${root}); }])>;
+def icmp_redundant_trunc_matchdata : GIDefMatchData<"Register">;
+def icmp_redundant_trunc : GICombineRule<
+ (defs root:$root, icmp_redundant_trunc_matchdata:$matchinfo),
+ (match (wip_match_opcode G_ICMP):$root,
+ [{ return matchICmpRedundantTrunc(*${root}, MRI, Helper.getKnownBits(), ${matchinfo}); }]),
+ (apply [{ applyICmpRedundantTrunc(*${root}, MRI, B, Observer, ${matchinfo}); }])>;
+
+// AArch64-specific offset folding for G_GLOBAL_VALUE.
+def fold_global_offset_matchdata : GIDefMatchData<"std::pair<uint64_t, uint64_t>">;
+def fold_global_offset : GICombineRule<
+ (defs root:$root, fold_global_offset_matchdata:$matchinfo),
+ (match (wip_match_opcode G_GLOBAL_VALUE):$root,
+ [{ return matchFoldGlobalOffset(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ return applyFoldGlobalOffset(*${root}, MRI, B, Observer, ${matchinfo});}])
+>;
+
def AArch64PreLegalizerCombinerHelper: GICombinerHelper<
"AArch64GenPreLegalizerCombinerHelper", [all_combines,
- fconstant_to_constant]> {
+ fconstant_to_constant,
+ icmp_redundant_trunc,
+ fold_global_offset]> {
let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule";
let StateClass = "AArch64PreLegalizerCombinerHelperState";
let AdditionalArguments = [];
}
+def AArch64O0PreLegalizerCombinerHelper: GICombinerHelper<
+ "AArch64GenO0PreLegalizerCombinerHelper", [optnone_combines]> {
+ let DisableRuleOption = "aarch64O0prelegalizercombiner-disable-rule";
+ let StateClass = "AArch64O0PreLegalizerCombinerHelperState";
+ let AdditionalArguments = [];
+}
+
// Matchdata for combines which replace a G_SHUFFLE_VECTOR with a
// target-specific opcode.
def shuffle_matchdata : GIDefMatchData<"ShuffleVectorPseudo">;
@@ -71,9 +96,13 @@
(apply [{ applyEXT(*${root}, ${matchinfo}); }])
>;
-// Combines which replace a G_SHUFFLE_VECTOR with a target-specific pseudo
-// instruction.
-def shuffle_vector_pseudos : GICombineGroup<[dup, rev, ext, zip, uzp, trn]>;
+def shuf_to_ins_matchdata : GIDefMatchData<"std::tuple<Register, int, Register, int>">;
+def shuf_to_ins: GICombineRule <
+ (defs root:$root, shuf_to_ins_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchINS(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ return applyINS(*${root}, MRI, B, ${matchinfo}); }])
+>;
def vashr_vlshr_imm_matchdata : GIDefMatchData<"int64_t">;
def vashr_vlshr_imm : GICombineRule<
@@ -92,6 +121,10 @@
(apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }])
>;
+def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn,
+ form_duplane,
+ shuf_to_ins]>;
+
def adjust_icmp_imm_matchdata :
GIDefMatchData<"std::pair<uint64_t, CmpInst::Predicate>">;
def adjust_icmp_imm : GICombineRule <
@@ -101,7 +134,14 @@
(apply [{ applyAdjustICmpImmAndPred(*${root}, ${matchinfo}, B, Observer); }])
>;
-def icmp_lowering : GICombineGroup<[adjust_icmp_imm]>;
+def swap_icmp_operands : GICombineRule <
+ (defs root:$root),
+ (match (wip_match_opcode G_ICMP):$root,
+ [{ return trySwapICmpOperands(*${root}, MRI); }]),
+ (apply [{ applySwapICmpOperands(*${root}, Observer); }])
+>;
+
+def icmp_lowering : GICombineGroup<[adjust_icmp_imm, swap_icmp_operands]>;
def extractvecelt_pairwise_add_matchdata : GIDefMatchData<"std::tuple<unsigned, LLT, Register>">;
def extractvecelt_pairwise_add : GICombineRule<
@@ -119,13 +159,44 @@
(apply [{ applyAArch64MulConstCombine(*${root}, MRI, B, ${matchinfo}); }])
>;
+def build_vector_to_dup : GICombineRule<
+ (defs root:$root),
+ (match (wip_match_opcode G_BUILD_VECTOR):$root,
+ [{ return matchBuildVectorToDup(*${root}, MRI); }]),
+ (apply [{ return applyBuildVectorToDup(*${root}, MRI, B); }])
+>;
+
+def build_vector_lowering : GICombineGroup<[build_vector_to_dup]>;
+
+def lower_vector_fcmp : GICombineRule<
+ (defs root:$root),
+ (match (wip_match_opcode G_FCMP):$root,
+ [{ return lowerVectorFCMP(*${root}, MRI, B); }]),
+ (apply [{}])>;
+
+def form_truncstore_matchdata : GIDefMatchData<"Register">;
+def form_truncstore : GICombineRule<
+ (defs root:$root, form_truncstore_matchdata:$matchinfo),
+ (match (wip_match_opcode G_STORE):$root,
+ [{ return matchFormTruncstore(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyFormTruncstore(*${root}, MRI, B, Observer, ${matchinfo}); }])
+>;
+
+def fold_merge_to_zext : GICombineRule<
+ (defs root:$d),
+ (match (wip_match_opcode G_MERGE_VALUES):$d,
+ [{ return matchFoldMergeToZext(*${d}, MRI); }]),
+ (apply [{ applyFoldMergeToZext(*${d}, MRI, B, Observer); }])
+>;
+
// Post-legalization combines which should happen at all optimization levels.
// (E.g. ones that facilitate matching for the selector) For example, matching
// pseudos.
def AArch64PostLegalizerLoweringHelper
: GICombinerHelper<"AArch64GenPostLegalizerLoweringHelper",
- [shuffle_vector_pseudos, vashr_vlshr_imm,
- icmp_lowering, form_duplane]> {
+ [shuffle_vector_lowering, vashr_vlshr_imm,
+ icmp_lowering, build_vector_lowering,
+ lower_vector_fcmp, form_truncstore]> {
let DisableRuleOption = "aarch64postlegalizerlowering-disable-rule";
}
@@ -137,6 +208,10 @@
hoist_logic_op_with_same_opcode_hands,
redundant_and, xor_of_and_with_same_reg,
extractvecelt_pairwise_add, redundant_or,
- mul_const]> {
+ mul_const, redundant_sext_inreg,
+ form_bitfield_extract, rotate_out_of_range,
+ icmp_to_true_false_known_bits, merge_unmerge,
+ select_combines, fold_merge_to_zext,
+ constant_fold, identity_combines]> {
let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule";
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
index c764af8..d98a5cf 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
@@ -14,9 +14,8 @@
#include "AArch64ExpandImm.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
-namespace llvm {
-
-namespace AArch64_IMM {
+using namespace llvm;
+using namespace llvm::AArch64_IMM;
/// Helper function which extracts the specified 16-bit chunk from a
/// 64-bit value.
@@ -302,8 +301,8 @@
/// Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
/// real move-immediate instructions to synthesize the immediate.
-void expandMOVImm(uint64_t Imm, unsigned BitSize,
- SmallVectorImpl<ImmInsnModel> &Insn) {
+void AArch64_IMM::expandMOVImm(uint64_t Imm, unsigned BitSize,
+ SmallVectorImpl<ImmInsnModel> &Insn) {
const unsigned Mask = 0xFFFF;
// Scan the immediate and count the number of 16-bit chunks which are either
@@ -405,7 +404,3 @@
// four-instruction sequence.
expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
}
-
-} // end namespace AArch64_AM
-
-} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index e57650a..b2eee28 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -85,6 +86,8 @@
unsigned N);
bool expandCALL_RVMARKER(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
+ bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
};
} // end anonymous namespace
@@ -276,21 +279,46 @@
Register NewLoReg = MI.getOperand(6).getReg();
Register NewHiReg = MI.getOperand(7).getReg();
+ unsigned LdxpOp, StxpOp;
+
+ switch (MI.getOpcode()) {
+ case AArch64::CMP_SWAP_128_MONOTONIC:
+ LdxpOp = AArch64::LDXPX;
+ StxpOp = AArch64::STXPX;
+ break;
+ case AArch64::CMP_SWAP_128_RELEASE:
+ LdxpOp = AArch64::LDXPX;
+ StxpOp = AArch64::STLXPX;
+ break;
+ case AArch64::CMP_SWAP_128_ACQUIRE:
+ LdxpOp = AArch64::LDAXPX;
+ StxpOp = AArch64::STXPX;
+ break;
+ case AArch64::CMP_SWAP_128:
+ LdxpOp = AArch64::LDAXPX;
+ StxpOp = AArch64::STLXPX;
+ break;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto FailBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
MF->insert(++MBB.getIterator(), LoadCmpBB);
MF->insert(++LoadCmpBB->getIterator(), StoreBB);
- MF->insert(++StoreBB->getIterator(), DoneBB);
+ MF->insert(++StoreBB->getIterator(), FailBB);
+ MF->insert(++FailBB->getIterator(), DoneBB);
// .Lloadcmp:
// ldaxp xDestLo, xDestHi, [xAddr]
// cmp xDestLo, xDesiredLo
// sbcs xDestHi, xDesiredHi
// b.ne .Ldone
- BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
+ BuildMI(LoadCmpBB, DL, TII->get(LdxpOp))
.addReg(DestLo.getReg(), RegState::Define)
.addReg(DestHi.getReg(), RegState::Define)
.addReg(AddrReg);
@@ -312,23 +340,37 @@
.addImm(AArch64CC::EQ);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW))
.addUse(StatusReg, getKillRegState(StatusDead))
- .addMBB(DoneBB);
- LoadCmpBB->addSuccessor(DoneBB);
+ .addMBB(FailBB);
+ LoadCmpBB->addSuccessor(FailBB);
LoadCmpBB->addSuccessor(StoreBB);
// .Lstore:
// stlxp wStatus, xNewLo, xNewHi, [xAddr]
// cbnz wStatus, .Lloadcmp
- BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
+ BuildMI(StoreBB, DL, TII->get(StxpOp), StatusReg)
.addReg(NewLoReg)
.addReg(NewHiReg)
.addReg(AddrReg);
BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
.addReg(StatusReg, getKillRegState(StatusDead))
.addMBB(LoadCmpBB);
+ BuildMI(StoreBB, DL, TII->get(AArch64::B)).addMBB(DoneBB);
StoreBB->addSuccessor(LoadCmpBB);
StoreBB->addSuccessor(DoneBB);
+ // .Lfail:
+ // stlxp wStatus, xDestLo, xDestHi, [xAddr]
+ // cbnz wStatus, .Lloadcmp
+ BuildMI(FailBB, DL, TII->get(StxpOp), StatusReg)
+ .addReg(DestLo.getReg())
+ .addReg(DestHi.getReg())
+ .addReg(AddrReg);
+ BuildMI(FailBB, DL, TII->get(AArch64::CBNZW))
+ .addReg(StatusReg, getKillRegState(StatusDead))
+ .addMBB(LoadCmpBB);
+ FailBB->addSuccessor(LoadCmpBB);
+ FailBB->addSuccessor(DoneBB);
+
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
@@ -340,9 +382,13 @@
// Recompute liveness bottom up.
LivePhysRegs LiveRegs;
computeAndAddLiveIns(LiveRegs, *DoneBB);
+ computeAndAddLiveIns(LiveRegs, *FailBB);
computeAndAddLiveIns(LiveRegs, *StoreBB);
computeAndAddLiveIns(LiveRegs, *LoadCmpBB);
+
// Do an extra pass in the loop to get the loop carried dependencies right.
+ FailBB->clearLiveIns();
+ computeAndAddLiveIns(LiveRegs, *FailBB);
StoreBB->clearLiveIns();
computeAndAddLiveIns(LiveRegs, *StoreBB);
LoadCmpBB->clearLiveIns();
@@ -405,7 +451,7 @@
assert(DstReg != MI.getOperand(3).getReg());
bool UseRev = false;
- unsigned PredIdx, DOPIdx, SrcIdx;
+ unsigned PredIdx, DOPIdx, SrcIdx, Src2Idx;
switch (DType) {
case AArch64::DestructiveBinaryComm:
case AArch64::DestructiveBinaryCommWithRev:
@@ -419,7 +465,22 @@
case AArch64::DestructiveBinary:
case AArch64::DestructiveBinaryImm:
std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3);
- break;
+ break;
+ case AArch64::DestructiveUnaryPassthru:
+ std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(2, 3, 3);
+ break;
+ case AArch64::DestructiveTernaryCommWithRev:
+ std::tie(PredIdx, DOPIdx, SrcIdx, Src2Idx) = std::make_tuple(1, 2, 3, 4);
+ if (DstReg == MI.getOperand(3).getReg()) {
+ // FMLA Zd, Pg, Za, Zd, Zm ==> FMAD Zdn, Pg, Zm, Za
+ std::tie(PredIdx, DOPIdx, SrcIdx, Src2Idx) = std::make_tuple(1, 3, 4, 2);
+ UseRev = true;
+ } else if (DstReg == MI.getOperand(4).getReg()) {
+ // FMLA Zd, Pg, Za, Zm, Zd ==> FMAD Zdn, Pg, Zm, Za
+ std::tie(PredIdx, DOPIdx, SrcIdx, Src2Idx) = std::make_tuple(1, 4, 3, 2);
+ UseRev = true;
+ }
+ break;
default:
llvm_unreachable("Unsupported Destructive Operand type");
}
@@ -436,9 +497,16 @@
DstReg != MI.getOperand(DOPIdx).getReg() ||
MI.getOperand(DOPIdx).getReg() != MI.getOperand(SrcIdx).getReg();
break;
+ case AArch64::DestructiveUnaryPassthru:
case AArch64::DestructiveBinaryImm:
DOPRegIsUnique = true;
break;
+ case AArch64::DestructiveTernaryCommWithRev:
+ DOPRegIsUnique =
+ DstReg != MI.getOperand(DOPIdx).getReg() ||
+ (MI.getOperand(DOPIdx).getReg() != MI.getOperand(SrcIdx).getReg() &&
+ MI.getOperand(DOPIdx).getReg() != MI.getOperand(Src2Idx).getReg());
+ break;
}
#endif
@@ -514,6 +582,11 @@
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead));
switch (DType) {
+ case AArch64::DestructiveUnaryPassthru:
+ DOP.addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
+ .add(MI.getOperand(PredIdx))
+ .add(MI.getOperand(SrcIdx));
+ break;
case AArch64::DestructiveBinaryImm:
case AArch64::DestructiveBinaryComm:
case AArch64::DestructiveBinaryCommWithRev:
@@ -521,6 +594,12 @@
.addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
.add(MI.getOperand(SrcIdx));
break;
+ case AArch64::DestructiveTernaryCommWithRev:
+ DOP.add(MI.getOperand(PredIdx))
+ .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
+ .add(MI.getOperand(SrcIdx))
+ .add(MI.getOperand(Src2Idx));
+ break;
}
if (PRFX) {
@@ -648,8 +727,10 @@
// Skip register arguments. Those are added during ISel, but are not
// needed for the concrete branch.
while (!MI.getOperand(RegMaskStartIdx).isRegMask()) {
- assert(MI.getOperand(RegMaskStartIdx).isReg() &&
- "should only skip register operands");
+ auto MOP = MI.getOperand(RegMaskStartIdx);
+ assert(MOP.isReg() && "can only add register operands");
+ OriginalCall->addOperand(MachineOperand::CreateReg(
+ MOP.getReg(), /*Def=*/false, /*Implicit=*/true));
RegMaskStartIdx++;
}
for (; RegMaskStartIdx < MI.getNumOperands(); ++RegMaskStartIdx)
@@ -669,6 +750,63 @@
return true;
}
+bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
+ Register CtxReg = MBBI->getOperand(0).getReg();
+ Register BaseReg = MBBI->getOperand(1).getReg();
+ int Offset = MBBI->getOperand(2).getImm();
+ DebugLoc DL(MBBI->getDebugLoc());
+ auto &STI = MBB.getParent()->getSubtarget<AArch64Subtarget>();
+
+ if (STI.getTargetTriple().getArchName() != "arm64e") {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXui))
+ .addUse(CtxReg)
+ .addUse(BaseReg)
+ .addImm(Offset / 8)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MBBI->eraseFromParent();
+ return true;
+ }
+
+ // We need to sign the context in an address-discriminated way. 0xc31a is a
+ // fixed random value, chosen as part of the ABI.
+ // add x16, xBase, #Offset
+ // movk x16, #0xc31a, lsl #48
+ // mov x17, x22/xzr
+ // pacdb x17, x16
+ // str x17, [xBase, #Offset]
+ unsigned Opc = Offset >= 0 ? AArch64::ADDXri : AArch64::SUBXri;
+ BuildMI(MBB, MBBI, DL, TII->get(Opc), AArch64::X16)
+ .addUse(BaseReg)
+ .addImm(abs(Offset))
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X16)
+ .addUse(AArch64::X16)
+ .addImm(0xc31a)
+ .addImm(48)
+ .setMIFlag(MachineInstr::FrameSetup);
+ // We're not allowed to clobber X22 (and couldn't clobber XZR if we tried), so
+ // move it somewhere before signing.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::X17)
+ .addUse(AArch64::XZR)
+ .addUse(CtxReg)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACDB), AArch64::X17)
+ .addUse(AArch64::X17)
+ .addUse(AArch64::X16)
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXui))
+ .addUse(AArch64::X17)
+ .addUse(BaseReg)
+ .addImm(Offset / 8)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ MBBI->eraseFromParent();
+ return true;
+}
+
/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -877,11 +1015,36 @@
MI.eraseFromParent();
return true;
}
+ case AArch64::MOVaddrBA: {
+ MachineFunction &MF = *MI.getParent()->getParent();
+ if (MF.getSubtarget<AArch64Subtarget>().isTargetMachO()) {
+ // blockaddress expressions have to come from a constant pool because the
+ // largest addend (and hence offset within a function) allowed for ADRP is
+ // only 8MB.
+ const BlockAddress *BA = MI.getOperand(1).getBlockAddress();
+ assert(MI.getOperand(1).getOffset() == 0 && "unexpected offset");
+ MachineConstantPool *MCP = MF.getConstantPool();
+ unsigned CPIdx = MCP->getConstantPoolIndex(BA, Align(8));
+
+ Register DstReg = MI.getOperand(0).getReg();
+ auto MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
+ .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
+ auto MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(AArch64::LDRXui), DstReg)
+ .addUse(DstReg)
+ .addConstantPoolIndex(
+ CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ transferImpOps(MI, MIB1, MIB2);
+ MI.eraseFromParent();
+ return true;
+ }
+ }
+ LLVM_FALLTHROUGH;
case AArch64::MOVaddr:
case AArch64::MOVaddrJT:
case AArch64::MOVaddrCP:
- case AArch64::MOVaddrBA:
case AArch64::MOVaddrTLS:
case AArch64::MOVaddrEXT: {
// Expand into ADRP + ADD.
@@ -982,6 +1145,9 @@
AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
AArch64::XZR, NextMBBI);
case AArch64::CMP_SWAP_128:
+ case AArch64::CMP_SWAP_128_RELEASE:
+ case AArch64::CMP_SWAP_128_ACQUIRE:
+ case AArch64::CMP_SWAP_128_MONOTONIC:
return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);
case AArch64::AESMCrrTied:
@@ -1058,6 +1224,8 @@
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
case AArch64::BLR_RVMARKER:
return expandCALL_RVMARKER(MBB, MBBI);
+ case AArch64::StoreSwiftAsyncContext:
+ return expandStoreSwiftAsyncContext(MBB, MBBI);
}
return false;
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 9801036..9acda17 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -195,34 +195,32 @@
const Value *Cond);
bool optimizeIntExtLoad(const Instruction *I, MVT RetVT, MVT SrcVT);
bool optimizeSelect(const SelectInst *SI);
- std::pair<unsigned, bool> getRegForGEPIndex(const Value *Idx);
+ unsigned getRegForGEPIndex(const Value *Idx);
// Emit helper routines.
unsigned emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
const Value *RHS, bool SetFlags = false,
bool WantResult = true, bool IsZExt = false);
unsigned emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
- bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
- bool SetFlags = false, bool WantResult = true);
+ unsigned RHSReg, bool SetFlags = false,
+ bool WantResult = true);
unsigned emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg,
- bool LHSIsKill, uint64_t Imm, bool SetFlags = false,
+ uint64_t Imm, bool SetFlags = false,
bool WantResult = true);
unsigned emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
- bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
- AArch64_AM::ShiftExtendType ShiftType,
+ unsigned RHSReg, AArch64_AM::ShiftExtendType ShiftType,
uint64_t ShiftImm, bool SetFlags = false,
bool WantResult = true);
unsigned emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
- bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
- AArch64_AM::ShiftExtendType ExtType,
- uint64_t ShiftImm, bool SetFlags = false,
+ unsigned RHSReg, AArch64_AM::ShiftExtendType ExtType,
+ uint64_t ShiftImm, bool SetFlags = false,
bool WantResult = true);
// Emit functions.
bool emitCompareAndBranch(const BranchInst *BI);
bool emitCmp(const Value *LHS, const Value *RHS, bool IsZExt);
bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt);
- bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
+ bool emitICmp_ri(MVT RetVT, unsigned LHSReg, uint64_t Imm);
bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS);
unsigned emitLoad(MVT VT, MVT ResultVT, Address Addr, bool WantZExt = true,
MachineMemOperand *MMO = nullptr);
@@ -235,42 +233,34 @@
unsigned emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
bool SetFlags = false, bool WantResult = true,
bool IsZExt = false);
- unsigned emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm);
+ unsigned emitAdd_ri_(MVT VT, unsigned Op0, int64_t Imm);
unsigned emitSub(MVT RetVT, const Value *LHS, const Value *RHS,
bool SetFlags = false, bool WantResult = true,
bool IsZExt = false);
- unsigned emitSubs_rr(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
- unsigned RHSReg, bool RHSIsKill, bool WantResult = true);
- unsigned emitSubs_rs(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
- unsigned RHSReg, bool RHSIsKill,
+ unsigned emitSubs_rr(MVT RetVT, unsigned LHSReg, unsigned RHSReg,
+ bool WantResult = true);
+ unsigned emitSubs_rs(MVT RetVT, unsigned LHSReg, unsigned RHSReg,
AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm,
bool WantResult = true);
unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS,
const Value *RHS);
unsigned emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg,
- bool LHSIsKill, uint64_t Imm);
+ uint64_t Imm);
unsigned emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg,
- bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
- uint64_t ShiftImm);
- unsigned emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
- unsigned emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill);
- unsigned emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill);
- unsigned emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill);
- unsigned emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
- unsigned Op1Reg, bool Op1IsKill);
- unsigned emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
- uint64_t Imm, bool IsZExt = true);
- unsigned emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
- unsigned Op1Reg, bool Op1IsKill);
- unsigned emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
- uint64_t Imm, bool IsZExt = true);
- unsigned emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
- unsigned Op1Reg, bool Op1IsKill);
- unsigned emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
- uint64_t Imm, bool IsZExt = false);
+ unsigned RHSReg, uint64_t ShiftImm);
+ unsigned emitAnd_ri(MVT RetVT, unsigned LHSReg, uint64_t Imm);
+ unsigned emitMul_rr(MVT RetVT, unsigned Op0, unsigned Op1);
+ unsigned emitSMULL_rr(MVT RetVT, unsigned Op0, unsigned Op1);
+ unsigned emitUMULL_rr(MVT RetVT, unsigned Op0, unsigned Op1);
+ unsigned emitLSL_rr(MVT RetVT, unsigned Op0Reg, unsigned Op1Reg);
+ unsigned emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, uint64_t Imm,
+ bool IsZExt = true);
+ unsigned emitLSR_rr(MVT RetVT, unsigned Op0Reg, unsigned Op1Reg);
+ unsigned emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, uint64_t Imm,
+ bool IsZExt = true);
+ unsigned emitASR_rr(MVT RetVT, unsigned Op0Reg, unsigned Op1Reg);
+ unsigned emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, uint64_t Imm,
+ bool IsZExt = false);
unsigned materializeInt(const ConstantInt *CI, MVT VT);
unsigned materializeFP(const ConstantFP *CFP, MVT VT);
@@ -414,8 +404,8 @@
return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
}
- // For the MachO large code model materialize the FP constant in code.
- if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+ // For the large code model materialize the FP constant in code.
+ if (TM.getCodeModel() == CodeModel::Large) {
unsigned Opc1 = Is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm;
const TargetRegisterClass *RC = Is64Bit ?
&AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
@@ -526,10 +516,7 @@
MVT VT = CEVT.getSimpleVT();
// arm64_32 has 32-bit pointers held in 64-bit registers. Because of that,
// 'null' pointers need to have a somewhat special treatment.
- if (const auto *CPN = dyn_cast<ConstantPointerNull>(C)) {
- (void)CPN;
- assert(CPN->getType()->getPointerAddressSpace() == 0 &&
- "Unexpected address space");
+ if (isa<ConstantPointerNull>(C)) {
assert(VT == MVT::i64 && "Expected 64-bit pointers");
return materializeInt(ConstantInt::get(Type::getInt64Ty(*Context), 0), VT);
}
@@ -557,7 +544,7 @@
bool Is64Bit = (VT == MVT::f64);
unsigned ZReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
unsigned Opc = Is64Bit ? AArch64::FMOVXDr : AArch64::FMOVWSr;
- return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true);
+ return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg);
}
/// Check if the multiply is by a power-of-2 constant.
@@ -767,9 +754,7 @@
unsigned Reg = getRegForValue(LHS);
if (!Reg)
return false;
- bool RegIsKill = hasTrivialKill(LHS);
- Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill,
- AArch64::sub_32);
+ Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, AArch64::sub_32);
Addr.setOffsetReg(Reg);
return true;
}
@@ -865,9 +850,7 @@
unsigned Reg = getRegForValue(LHS);
if (!Reg)
return false;
- bool RegIsKill = hasTrivialKill(LHS);
- Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill,
- AArch64::sub_32);
+ Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, AArch64::sub_32);
Addr.setOffsetReg(Reg);
return true;
}
@@ -1067,26 +1050,22 @@
if (Addr.getExtendType() == AArch64_AM::SXTW ||
Addr.getExtendType() == AArch64_AM::UXTW )
ResultReg = emitAddSub_rx(/*UseAdd=*/true, MVT::i64, Addr.getReg(),
- /*TODO:IsKill=*/false, Addr.getOffsetReg(),
- /*TODO:IsKill=*/false, Addr.getExtendType(),
+ Addr.getOffsetReg(), Addr.getExtendType(),
Addr.getShift());
else
ResultReg = emitAddSub_rs(/*UseAdd=*/true, MVT::i64, Addr.getReg(),
- /*TODO:IsKill=*/false, Addr.getOffsetReg(),
- /*TODO:IsKill=*/false, AArch64_AM::LSL,
+ Addr.getOffsetReg(), AArch64_AM::LSL,
Addr.getShift());
} else {
if (Addr.getExtendType() == AArch64_AM::UXTW)
ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(),
- /*Op0IsKill=*/false, Addr.getShift(),
- /*IsZExt=*/true);
+ Addr.getShift(), /*IsZExt=*/true);
else if (Addr.getExtendType() == AArch64_AM::SXTW)
ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(),
- /*Op0IsKill=*/false, Addr.getShift(),
- /*IsZExt=*/false);
+ Addr.getShift(), /*IsZExt=*/false);
else
ResultReg = emitLSL_ri(MVT::i64, MVT::i64, Addr.getOffsetReg(),
- /*Op0IsKill=*/false, Addr.getShift());
+ Addr.getShift());
}
if (!ResultReg)
return false;
@@ -1103,7 +1082,7 @@
unsigned ResultReg;
if (Addr.getReg())
// Try to fold the immediate into the add instruction.
- ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), /*IsKill=*/false, Offset);
+ ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), Offset);
else
ResultReg = fastEmit_i(MVT::i64, MVT::i64, ISD::Constant, Offset);
@@ -1202,7 +1181,6 @@
unsigned LHSReg = getRegForValue(LHS);
if (!LHSReg)
return 0;
- bool LHSIsKill = hasTrivialKill(LHS);
if (NeedExtend)
LHSReg = emitIntExt(SrcVT, LHSReg, RetVT, IsZExt);
@@ -1211,15 +1189,14 @@
if (const auto *C = dyn_cast<ConstantInt>(RHS)) {
uint64_t Imm = IsZExt ? C->getZExtValue() : C->getSExtValue();
if (C->isNegative())
- ResultReg = emitAddSub_ri(!UseAdd, RetVT, LHSReg, LHSIsKill, -Imm,
- SetFlags, WantResult);
+ ResultReg = emitAddSub_ri(!UseAdd, RetVT, LHSReg, -Imm, SetFlags,
+ WantResult);
else
- ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, Imm, SetFlags,
+ ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, Imm, SetFlags,
WantResult);
} else if (const auto *C = dyn_cast<Constant>(RHS))
if (C->isNullValue())
- ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, 0, SetFlags,
- WantResult);
+ ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, 0, SetFlags, WantResult);
if (ResultReg)
return ResultReg;
@@ -1233,17 +1210,14 @@
unsigned RHSReg = getRegForValue(SI->getOperand(0));
if (!RHSReg)
return 0;
- bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
- return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, ExtendType, C->getZExtValue(),
- SetFlags, WantResult);
+ return emitAddSub_rx(UseAdd, RetVT, LHSReg, RHSReg, ExtendType,
+ C->getZExtValue(), SetFlags, WantResult);
}
unsigned RHSReg = getRegForValue(RHS);
if (!RHSReg)
return 0;
- bool RHSIsKill = hasTrivialKill(RHS);
- return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
- ExtendType, 0, SetFlags, WantResult);
+ return emitAddSub_rx(UseAdd, RetVT, LHSReg, RHSReg, ExtendType, 0,
+ SetFlags, WantResult);
}
// Check if the mul can be folded into the instruction.
@@ -1261,10 +1235,8 @@
unsigned RHSReg = getRegForValue(MulLHS);
if (!RHSReg)
return 0;
- bool RHSIsKill = hasTrivialKill(MulLHS);
- ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, AArch64_AM::LSL, ShiftVal, SetFlags,
- WantResult);
+ ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, RHSReg, AArch64_AM::LSL,
+ ShiftVal, SetFlags, WantResult);
if (ResultReg)
return ResultReg;
}
@@ -1286,10 +1258,8 @@
unsigned RHSReg = getRegForValue(SI->getOperand(0));
if (!RHSReg)
return 0;
- bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
- ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, ShiftType, ShiftVal, SetFlags,
- WantResult);
+ ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, RHSReg, ShiftType,
+ ShiftVal, SetFlags, WantResult);
if (ResultReg)
return ResultReg;
}
@@ -1300,18 +1270,15 @@
unsigned RHSReg = getRegForValue(RHS);
if (!RHSReg)
return 0;
- bool RHSIsKill = hasTrivialKill(RHS);
if (NeedExtend)
RHSReg = emitIntExt(SrcVT, RHSReg, RetVT, IsZExt);
- return emitAddSub_rr(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
- SetFlags, WantResult);
+ return emitAddSub_rr(UseAdd, RetVT, LHSReg, RHSReg, SetFlags, WantResult);
}
unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
- bool LHSIsKill, unsigned RHSReg,
- bool RHSIsKill, bool SetFlags,
+ unsigned RHSReg, bool SetFlags,
bool WantResult) {
assert(LHSReg && RHSReg && "Invalid register number.");
@@ -1342,14 +1309,14 @@
LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
- .addReg(LHSReg, getKillRegState(LHSIsKill))
- .addReg(RHSReg, getKillRegState(RHSIsKill));
+ .addReg(LHSReg)
+ .addReg(RHSReg);
return ResultReg;
}
unsigned AArch64FastISel::emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg,
- bool LHSIsKill, uint64_t Imm,
- bool SetFlags, bool WantResult) {
+ uint64_t Imm, bool SetFlags,
+ bool WantResult) {
assert(LHSReg && "Invalid register number.");
if (RetVT != MVT::i32 && RetVT != MVT::i64)
@@ -1386,15 +1353,14 @@
const MCInstrDesc &II = TII.get(Opc);
LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
- .addReg(LHSReg, getKillRegState(LHSIsKill))
+ .addReg(LHSReg)
.addImm(Imm)
.addImm(getShifterImm(AArch64_AM::LSL, ShiftImm));
return ResultReg;
}
unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
- bool LHSIsKill, unsigned RHSReg,
- bool RHSIsKill,
+ unsigned RHSReg,
AArch64_AM::ShiftExtendType ShiftType,
uint64_t ShiftImm, bool SetFlags,
bool WantResult) {
@@ -1429,15 +1395,14 @@
LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
- .addReg(LHSReg, getKillRegState(LHSIsKill))
- .addReg(RHSReg, getKillRegState(RHSIsKill))
+ .addReg(LHSReg)
+ .addReg(RHSReg)
.addImm(getShifterImm(ShiftType, ShiftImm));
return ResultReg;
}
unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
- bool LHSIsKill, unsigned RHSReg,
- bool RHSIsKill,
+ unsigned RHSReg,
AArch64_AM::ShiftExtendType ExtType,
uint64_t ShiftImm, bool SetFlags,
bool WantResult) {
@@ -1474,8 +1439,8 @@
LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
- .addReg(LHSReg, getKillRegState(LHSIsKill))
- .addReg(RHSReg, getKillRegState(RHSIsKill))
+ .addReg(LHSReg)
+ .addReg(RHSReg)
.addImm(getArithExtendImm(ExtType, ShiftImm));
return ResultReg;
}
@@ -1508,9 +1473,8 @@
IsZExt) != 0;
}
-bool AArch64FastISel::emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
- uint64_t Imm) {
- return emitAddSub_ri(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, Imm,
+bool AArch64FastISel::emitICmp_ri(MVT RetVT, unsigned LHSReg, uint64_t Imm) {
+ return emitAddSub_ri(/*UseAdd=*/false, RetVT, LHSReg, Imm,
/*SetFlags=*/true, /*WantResult=*/false) != 0;
}
@@ -1528,24 +1492,22 @@
unsigned LHSReg = getRegForValue(LHS);
if (!LHSReg)
return false;
- bool LHSIsKill = hasTrivialKill(LHS);
if (UseImm) {
unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDri : AArch64::FCMPSri;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
- .addReg(LHSReg, getKillRegState(LHSIsKill));
+ .addReg(LHSReg);
return true;
}
unsigned RHSReg = getRegForValue(RHS);
if (!RHSReg)
return false;
- bool RHSIsKill = hasTrivialKill(RHS);
unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDrr : AArch64::FCMPSrr;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
- .addReg(LHSReg, getKillRegState(LHSIsKill))
- .addReg(RHSReg, getKillRegState(RHSIsKill));
+ .addReg(LHSReg)
+ .addReg(RHSReg);
return true;
}
@@ -1560,13 +1522,12 @@
/// First try to emit an add with an immediate operand using emitAddSub_ri. If
/// that fails, then try to materialize the immediate into a register and use
/// emitAddSub_rr instead.
-unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill,
- int64_t Imm) {
+unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, int64_t Imm) {
unsigned ResultReg;
if (Imm < 0)
- ResultReg = emitAddSub_ri(false, VT, Op0, Op0IsKill, -Imm);
+ ResultReg = emitAddSub_ri(false, VT, Op0, -Imm);
else
- ResultReg = emitAddSub_ri(true, VT, Op0, Op0IsKill, Imm);
+ ResultReg = emitAddSub_ri(true, VT, Op0, Imm);
if (ResultReg)
return ResultReg;
@@ -1575,7 +1536,7 @@
if (!CReg)
return 0;
- ResultReg = emitAddSub_rr(true, VT, Op0, Op0IsKill, CReg, true);
+ ResultReg = emitAddSub_rr(true, VT, Op0, CReg);
return ResultReg;
}
@@ -1586,20 +1547,17 @@
}
unsigned AArch64FastISel::emitSubs_rr(MVT RetVT, unsigned LHSReg,
- bool LHSIsKill, unsigned RHSReg,
- bool RHSIsKill, bool WantResult) {
- return emitAddSub_rr(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, /*SetFlags=*/true, WantResult);
+ unsigned RHSReg, bool WantResult) {
+ return emitAddSub_rr(/*UseAdd=*/false, RetVT, LHSReg, RHSReg,
+ /*SetFlags=*/true, WantResult);
}
unsigned AArch64FastISel::emitSubs_rs(MVT RetVT, unsigned LHSReg,
- bool LHSIsKill, unsigned RHSReg,
- bool RHSIsKill,
+ unsigned RHSReg,
AArch64_AM::ShiftExtendType ShiftType,
uint64_t ShiftImm, bool WantResult) {
- return emitAddSub_rs(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, ShiftType, ShiftImm, /*SetFlags=*/true,
- WantResult);
+ return emitAddSub_rs(/*UseAdd=*/false, RetVT, LHSReg, RHSReg, ShiftType,
+ ShiftImm, /*SetFlags=*/true, WantResult);
}
unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
@@ -1622,12 +1580,11 @@
unsigned LHSReg = getRegForValue(LHS);
if (!LHSReg)
return 0;
- bool LHSIsKill = hasTrivialKill(LHS);
unsigned ResultReg = 0;
if (const auto *C = dyn_cast<ConstantInt>(RHS)) {
uint64_t Imm = C->getZExtValue();
- ResultReg = emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, LHSIsKill, Imm);
+ ResultReg = emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, Imm);
}
if (ResultReg)
return ResultReg;
@@ -1648,9 +1605,7 @@
unsigned RHSReg = getRegForValue(MulLHS);
if (!RHSReg)
return 0;
- bool RHSIsKill = hasTrivialKill(MulLHS);
- ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, ShiftVal);
+ ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, RHSReg, ShiftVal);
if (ResultReg)
return ResultReg;
}
@@ -1664,9 +1619,7 @@
unsigned RHSReg = getRegForValue(SI->getOperand(0));
if (!RHSReg)
return 0;
- bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
- ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, ShiftVal);
+ ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, RHSReg, ShiftVal);
if (ResultReg)
return ResultReg;
}
@@ -1675,20 +1628,18 @@
unsigned RHSReg = getRegForValue(RHS);
if (!RHSReg)
return 0;
- bool RHSIsKill = hasTrivialKill(RHS);
MVT VT = std::max(MVT::i32, RetVT.SimpleTy);
- ResultReg = fastEmit_rr(VT, VT, ISDOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+ ResultReg = fastEmit_rr(VT, VT, ISDOpc, LHSReg, RHSReg);
if (RetVT >= MVT::i8 && RetVT <= MVT::i16) {
uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
- ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask);
}
return ResultReg;
}
unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT,
- unsigned LHSReg, bool LHSIsKill,
- uint64_t Imm) {
+ unsigned LHSReg, uint64_t Imm) {
static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR),
"ISD nodes are not consecutive!");
static const unsigned OpcTable[3][2] = {
@@ -1723,18 +1674,17 @@
return 0;
unsigned ResultReg =
- fastEmitInst_ri(Opc, RC, LHSReg, LHSIsKill,
+ fastEmitInst_ri(Opc, RC, LHSReg,
AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
if (RetVT >= MVT::i8 && RetVT <= MVT::i16 && ISDOpc != ISD::AND) {
uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
- ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask);
}
return ResultReg;
}
unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT,
- unsigned LHSReg, bool LHSIsKill,
- unsigned RHSReg, bool RHSIsKill,
+ unsigned LHSReg, unsigned RHSReg,
uint64_t ShiftImm) {
static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR),
"ISD nodes are not consecutive!");
@@ -1766,18 +1716,18 @@
break;
}
unsigned ResultReg =
- fastEmitInst_rri(Opc, RC, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+ fastEmitInst_rri(Opc, RC, LHSReg, RHSReg,
AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftImm));
if (RetVT >= MVT::i8 && RetVT <= MVT::i16) {
uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
- ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask);
}
return ResultReg;
}
-unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg,
uint64_t Imm) {
- return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, LHSIsKill, Imm);
+ return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, Imm);
}
unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr,
@@ -1898,7 +1848,7 @@
// Loading an i1 requires special handling.
if (VT == MVT::i1) {
- unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1);
+ unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, 1);
assert(ANDReg && "Unexpected AND instruction emission failure.");
ResultReg = ANDReg;
}
@@ -2052,7 +2002,6 @@
removeDeadCode(I, std::next(I));
} else
ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg,
- /*IsKill=*/true,
AArch64::sub_32);
}
updateValueMap(I, ResultReg);
@@ -2160,7 +2109,7 @@
// Storing an i1 requires special handling.
if (VTIsi1 && SrcReg != AArch64::WZR) {
- unsigned ANDReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
+ unsigned ANDReg = emitAnd_ri(MVT::i32, SrcReg, 1);
assert(ANDReg && "Unexpected AND instruction emission failure.");
SrcReg = ANDReg;
}
@@ -2393,11 +2342,9 @@
unsigned SrcReg = getRegForValue(LHS);
if (!SrcReg)
return false;
- bool SrcIsKill = hasTrivialKill(LHS);
if (BW == 64 && !Is64Bit)
- SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill,
- AArch64::sub_32);
+ SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, AArch64::sub_32);
if ((BW < 32) && !IsBitTest)
SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*isZExt=*/true);
@@ -2406,7 +2353,7 @@
SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs());
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
- .addReg(SrcReg, getKillRegState(SrcIsKill));
+ .addReg(SrcReg);
if (IsBitTest)
MIB.addImm(TestBit);
MIB.addMBB(TBB);
@@ -2524,7 +2471,6 @@
unsigned CondReg = getRegForValue(BI->getCondition());
if (CondReg == 0)
return false;
- bool CondRegIsKill = hasTrivialKill(BI->getCondition());
// i1 conditions come as i32 values, test the lowest bit with tb(n)z.
unsigned Opcode = AArch64::TBNZW;
@@ -2537,7 +2483,7 @@
unsigned ConstrainedCondReg
= constrainOperandRegClass(II, CondReg, II.getNumDefs());
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
- .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill))
+ .addReg(ConstrainedCondReg)
.addImm(0)
.addMBB(TBB);
@@ -2687,19 +2633,16 @@
unsigned Src1Reg = getRegForValue(Src1Val);
if (!Src1Reg)
return false;
- bool Src1IsKill = hasTrivialKill(Src1Val);
unsigned Src2Reg = getRegForValue(Src2Val);
if (!Src2Reg)
return false;
- bool Src2IsKill = hasTrivialKill(Src2Val);
- if (NeedExtraOp) {
- Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, Src1IsKill, 1);
- Src1IsKill = true;
- }
+ if (NeedExtraOp)
+ Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, 1);
+
unsigned ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32RegClass, Src1Reg,
- Src1IsKill, Src2Reg, Src2IsKill);
+ Src2Reg);
updateValueMap(SI, ResultReg);
return true;
}
@@ -2771,9 +2714,6 @@
unsigned SrcReg = getRegForValue(FoldSelect);
if (!SrcReg)
return false;
- unsigned UseReg = lookUpRegForValue(SI);
- if (UseReg)
- MRI.clearKillFlags(UseReg);
updateValueMap(I, SrcReg);
return true;
@@ -2802,7 +2742,6 @@
unsigned CondReg = getRegForValue(Cond);
if (!CondReg)
return false;
- bool CondIsKill = hasTrivialKill(Cond);
const MCInstrDesc &II = TII.get(AArch64::ANDSWri);
CondReg = constrainOperandRegClass(II, CondReg, 1);
@@ -2810,26 +2749,20 @@
// Emit a TST instruction (ANDS wzr, reg, #imm).
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II,
AArch64::WZR)
- .addReg(CondReg, getKillRegState(CondIsKill))
+ .addReg(CondReg)
.addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
}
unsigned Src1Reg = getRegForValue(SI->getTrueValue());
- bool Src1IsKill = hasTrivialKill(SI->getTrueValue());
-
unsigned Src2Reg = getRegForValue(SI->getFalseValue());
- bool Src2IsKill = hasTrivialKill(SI->getFalseValue());
if (!Src1Reg || !Src2Reg)
return false;
- if (ExtraCC != AArch64CC::AL) {
- Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg,
- Src2IsKill, ExtraCC);
- Src2IsKill = true;
- }
- unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg,
- Src2IsKill, CC);
+ if (ExtraCC != AArch64CC::AL)
+ Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src2Reg, ExtraCC);
+
+ unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src2Reg, CC);
updateValueMap(I, ResultReg);
return true;
}
@@ -2914,7 +2847,6 @@
unsigned SrcReg = getRegForValue(I->getOperand(0));
if (!SrcReg)
return false;
- bool SrcIsKill = hasTrivialKill(I->getOperand(0));
EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true);
@@ -2924,7 +2856,6 @@
emitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed);
if (!SrcReg)
return false;
- SrcIsKill = true;
}
unsigned Opc;
@@ -2940,8 +2871,7 @@
Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri;
}
- unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg,
- SrcIsKill);
+ unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg);
updateValueMap(I, ResultReg);
return true;
}
@@ -2969,6 +2899,7 @@
Arg.hasAttribute(Attribute::InReg) ||
Arg.hasAttribute(Attribute::StructRet) ||
Arg.hasAttribute(Attribute::SwiftSelf) ||
+ Arg.hasAttribute(Attribute::SwiftAsync) ||
Arg.hasAttribute(Attribute::SwiftError) ||
Arg.hasAttribute(Attribute::Nest))
return false;
@@ -3227,7 +3158,7 @@
for (auto Flag : CLI.OutFlags)
if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal() ||
- Flag.isSwiftSelf() || Flag.isSwiftError())
+ Flag.isSwiftSelf() || Flag.isSwiftAsync() || Flag.isSwiftError())
return false;
// Set up the argument vectors.
@@ -3494,7 +3425,7 @@
unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
while (Depth--) {
DestReg = fastEmitInst_ri(AArch64::LDRXui, &AArch64::GPR64RegClass,
- SrcReg, /*IsKill=*/true, 0);
+ SrcReg, 0);
assert(DestReg && "Unexpected LDR instruction emission failure.");
SrcReg = DestReg;
}
@@ -3640,10 +3571,9 @@
unsigned SrcReg = getRegForValue(II->getOperand(0));
if (!SrcReg)
return false;
- bool SrcRegIsKill = hasTrivialKill(II->getOperand(0));
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addReg(SrcReg, getKillRegState(SrcRegIsKill));
+ .addReg(SrcReg);
updateValueMap(II, ResultReg);
return true;
}
@@ -3666,9 +3596,8 @@
unsigned Op0Reg = getRegForValue(II->getOperand(0));
if (!Op0Reg)
return false;
- bool Op0IsKill = hasTrivialKill(II->getOperand(0));
- unsigned ResultReg = fastEmit_r(VT, VT, ISD::FSQRT, Op0Reg, Op0IsKill);
+ unsigned ResultReg = fastEmit_r(VT, VT, ISD::FSQRT, Op0Reg);
if (!ResultReg)
return false;
@@ -3745,33 +3674,28 @@
unsigned LHSReg = getRegForValue(LHS);
if (!LHSReg)
return false;
- bool LHSIsKill = hasTrivialKill(LHS);
unsigned RHSReg = getRegForValue(RHS);
if (!RHSReg)
return false;
- bool RHSIsKill = hasTrivialKill(RHS);
if (VT == MVT::i32) {
- MulReg = emitSMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
- unsigned ShiftReg = emitLSR_ri(MVT::i64, MVT::i64, MulReg,
- /*IsKill=*/false, 32);
- MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
- AArch64::sub_32);
- ShiftReg = fastEmitInst_extractsubreg(VT, ShiftReg, /*IsKill=*/true,
- AArch64::sub_32);
- emitSubs_rs(VT, ShiftReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
- AArch64_AM::ASR, 31, /*WantResult=*/false);
+ MulReg = emitSMULL_rr(MVT::i64, LHSReg, RHSReg);
+ unsigned MulSubReg =
+ fastEmitInst_extractsubreg(VT, MulReg, AArch64::sub_32);
+ // cmp xreg, wreg, sxtw
+ emitAddSub_rx(/*UseAdd=*/false, MVT::i64, MulReg, MulSubReg,
+ AArch64_AM::SXTW, /*ShiftImm=*/0, /*SetFlags=*/true,
+ /*WantResult=*/false);
+ MulReg = MulSubReg;
} else {
assert(VT == MVT::i64 && "Unexpected value type.");
// LHSReg and RHSReg cannot be killed by this Mul, since they are
// reused in the next instruction.
- MulReg = emitMul_rr(VT, LHSReg, /*IsKill=*/false, RHSReg,
- /*IsKill=*/false);
- unsigned SMULHReg = fastEmit_rr(VT, VT, ISD::MULHS, LHSReg, LHSIsKill,
- RHSReg, RHSIsKill);
- emitSubs_rs(VT, SMULHReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
- AArch64_AM::ASR, 63, /*WantResult=*/false);
+ MulReg = emitMul_rr(VT, LHSReg, RHSReg);
+ unsigned SMULHReg = fastEmit_rr(VT, VT, ISD::MULHS, LHSReg, RHSReg);
+ emitSubs_rs(VT, SMULHReg, MulReg, AArch64_AM::ASR, 63,
+ /*WantResult=*/false);
}
break;
}
@@ -3780,30 +3704,26 @@
unsigned LHSReg = getRegForValue(LHS);
if (!LHSReg)
return false;
- bool LHSIsKill = hasTrivialKill(LHS);
unsigned RHSReg = getRegForValue(RHS);
if (!RHSReg)
return false;
- bool RHSIsKill = hasTrivialKill(RHS);
if (VT == MVT::i32) {
- MulReg = emitUMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
- emitSubs_rs(MVT::i64, AArch64::XZR, /*IsKill=*/true, MulReg,
- /*IsKill=*/false, AArch64_AM::LSR, 32,
- /*WantResult=*/false);
- MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
- AArch64::sub_32);
+ MulReg = emitUMULL_rr(MVT::i64, LHSReg, RHSReg);
+ // tst xreg, #0xffffffff00000000
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::ANDSXri), AArch64::XZR)
+ .addReg(MulReg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(0xFFFFFFFF00000000, 64));
+ MulReg = fastEmitInst_extractsubreg(VT, MulReg, AArch64::sub_32);
} else {
assert(VT == MVT::i64 && "Unexpected value type.");
// LHSReg and RHSReg cannot be killed by this Mul, since they are
// reused in the next instruction.
- MulReg = emitMul_rr(VT, LHSReg, /*IsKill=*/false, RHSReg,
- /*IsKill=*/false);
- unsigned UMULHReg = fastEmit_rr(VT, VT, ISD::MULHU, LHSReg, LHSIsKill,
- RHSReg, RHSIsKill);
- emitSubs_rr(VT, AArch64::XZR, /*IsKill=*/true, UMULHReg,
- /*IsKill=*/false, /*WantResult=*/false);
+ MulReg = emitMul_rr(VT, LHSReg, RHSReg);
+ unsigned UMULHReg = fastEmit_rr(VT, VT, ISD::MULHU, LHSReg, RHSReg);
+ emitSubs_rr(VT, AArch64::XZR, UMULHReg, /*WantResult=*/false);
}
break;
}
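// A minimal illustrative sketch, in plain C++, of the overflow checks the
// i32 paths above select (helper names are hypothetical): the signed case
// compares the 64-bit product with the sign-extension of its low half
// ("cmp xreg, wreg, sxtw"); the unsigned case tests whether any of the high
// 32 bits are set ("tst xreg, #0xffffffff00000000").
#include <cstdint>

bool smul32Overflows(int32_t A, int32_t B) {
  int64_t Prod = int64_t(A) * int64_t(B);      // SMULL
  return Prod != int64_t(int32_t(Prod));       // product != sext(low 32 bits)
}

bool umul32Overflows(uint32_t A, uint32_t B) {
  uint64_t Prod = uint64_t(A) * uint64_t(B);   // UMULL
  return (Prod & 0xFFFFFFFF00000000ULL) != 0;  // any high bit set => overflow
}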
@@ -3819,8 +3739,8 @@
return false;
ResultReg2 = fastEmitInst_rri(AArch64::CSINCWr, &AArch64::GPR32RegClass,
- AArch64::WZR, /*IsKill=*/true, AArch64::WZR,
- /*IsKill=*/true, getInvertedCondCode(CC));
+ AArch64::WZR, AArch64::WZR,
+ getInvertedCondCode(CC));
(void)ResultReg2;
assert((ResultReg1 + 1) == ResultReg2 &&
"Nonconsecutive result registers.");
@@ -3894,7 +3814,7 @@
return false;
// Vectors (of > 1 lane) in big endian need tricky handling.
- if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1 &&
+ if (RVEVT.isVector() && RVEVT.getVectorElementCount().isVector() &&
!Subtarget->isLittleEndian())
return false;
@@ -3920,7 +3840,7 @@
// "Callee" (i.e. value producer) zero extends pointers at function
// boundary.
if (Subtarget->isTargetILP32() && RV->getType()->isPointerTy())
- SrcReg = emitAnd_ri(MVT::i64, SrcReg, false, 0xffffffff);
+ SrcReg = emitAnd_ri(MVT::i64, SrcReg, 0xffffffff);
// Make the copy.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -3962,7 +3882,6 @@
unsigned SrcReg = getRegForValue(Op);
if (!SrcReg)
return false;
- bool SrcIsKill = hasTrivialKill(Op);
// If we're truncating from i64 to a smaller non-legal type then generate an
// AND. Otherwise, we know the high bits are undefined and a truncate only
@@ -3987,16 +3906,16 @@
break;
}
// Issue an extract_subreg to get the lower 32-bits.
- unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill,
+ unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg,
AArch64::sub_32);
// Create the AND instruction which performs the actual truncation.
- ResultReg = emitAnd_ri(MVT::i32, Reg32, /*IsKill=*/true, Mask);
+ ResultReg = emitAnd_ri(MVT::i32, Reg32, Mask);
assert(ResultReg && "Unexpected AND instruction emission failure.");
} else {
ResultReg = createResultReg(&AArch64::GPR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(SrcReg, getKillRegState(SrcIsKill));
+ .addReg(SrcReg);
}
updateValueMap(I, ResultReg);
@@ -4012,7 +3931,7 @@
DestVT = MVT::i32;
if (IsZExt) {
- unsigned ResultReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
+ unsigned ResultReg = emitAnd_ri(MVT::i32, SrcReg, 1);
assert(ResultReg && "Unexpected AND instruction emission failure.");
if (DestVT == MVT::i64) {
// We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the
@@ -4032,12 +3951,11 @@
return 0;
}
return fastEmitInst_rii(AArch64::SBFMWri, &AArch64::GPR32RegClass, SrcReg,
- /*TODO:IsKill=*/false, 0, 0);
+ 0, 0);
}
}
-unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill) {
+unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, unsigned Op1) {
unsigned Opc, ZReg;
switch (RetVT.SimpleTy) {
default: return 0;
@@ -4052,32 +3970,27 @@
const TargetRegisterClass *RC =
(RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
- return fastEmitInst_rrr(Opc, RC, Op0, Op0IsKill, Op1, Op1IsKill,
- /*IsKill=*/ZReg, true);
+ return fastEmitInst_rrr(Opc, RC, Op0, Op1, ZReg);
}
-unsigned AArch64FastISel::emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill) {
+unsigned AArch64FastISel::emitSMULL_rr(MVT RetVT, unsigned Op0, unsigned Op1) {
if (RetVT != MVT::i64)
return 0;
return fastEmitInst_rrr(AArch64::SMADDLrrr, &AArch64::GPR64RegClass,
- Op0, Op0IsKill, Op1, Op1IsKill,
- AArch64::XZR, /*IsKill=*/true);
+ Op0, Op1, AArch64::XZR);
}
-unsigned AArch64FastISel::emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill) {
+unsigned AArch64FastISel::emitUMULL_rr(MVT RetVT, unsigned Op0, unsigned Op1) {
if (RetVT != MVT::i64)
return 0;
return fastEmitInst_rrr(AArch64::UMADDLrrr, &AArch64::GPR64RegClass,
- Op0, Op0IsKill, Op1, Op1IsKill,
- AArch64::XZR, /*IsKill=*/true);
+ Op0, Op1, AArch64::XZR);
}
-unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
- unsigned Op1Reg, bool Op1IsKill) {
+unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg,
+ unsigned Op1Reg) {
unsigned Opc = 0;
bool NeedTrunc = false;
uint64_t Mask = 0;
@@ -4091,20 +4004,17 @@
const TargetRegisterClass *RC =
(RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
- if (NeedTrunc) {
- Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
- Op1IsKill = true;
- }
- unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
- Op1IsKill);
if (NeedTrunc)
- ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Mask);
+
+ unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg);
+ if (NeedTrunc)
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask);
return ResultReg;
}
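// A minimal illustrative sketch, in plain C++, of the narrow-shift handling
// in emitLSL_rr above (the i8/i16 NeedTrunc path): both the shift amount and
// the result are masked back to the narrow width because the shift executes
// on a full 32-bit register. Assumes Op1 is a valid IR shift amount, i.e.
// smaller than the narrow bit width.
#include <cstdint>

uint32_t lslNarrow(uint32_t Op0, uint32_t Op1, uint32_t Mask /* 0xff or 0xffff */) {
  Op1 &= Mask;                 // emitAnd_ri on the shift amount
  uint32_t Res = Op0 << Op1;   // the 32-bit register shift
  return Res & Mask;           // emitAnd_ri on the result
}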
unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
- bool Op0IsKill, uint64_t Shift,
- bool IsZExt) {
+ uint64_t Shift, bool IsZExt) {
assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
"Unexpected source/return type pair.");
assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
@@ -4126,7 +4036,7 @@
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(Op0, getKillRegState(Op0IsKill));
+ .addReg(Op0);
return ResultReg;
} else
return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
@@ -4174,16 +4084,15 @@
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), TmpReg)
.addImm(0)
- .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op0)
.addImm(AArch64::sub_32);
Op0 = TmpReg;
- Op0IsKill = true;
}
- return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+ return fastEmitInst_rii(Opc, RC, Op0, ImmR, ImmS);
}
-unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
- unsigned Op1Reg, bool Op1IsKill) {
+unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg,
+ unsigned Op1Reg) {
unsigned Opc = 0;
bool NeedTrunc = false;
uint64_t Mask = 0;
@@ -4198,20 +4107,17 @@
const TargetRegisterClass *RC =
(RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
if (NeedTrunc) {
- Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Op0IsKill, Mask);
- Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
- Op0IsKill = Op1IsKill = true;
+ Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Mask);
+ Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Mask);
}
- unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
- Op1IsKill);
+ unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg);
if (NeedTrunc)
- ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask);
return ResultReg;
}
unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
- bool Op0IsKill, uint64_t Shift,
- bool IsZExt) {
+ uint64_t Shift, bool IsZExt) {
assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
"Unexpected source/return type pair.");
assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
@@ -4233,7 +4139,7 @@
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(Op0, getKillRegState(Op0IsKill));
+ .addReg(Op0);
return ResultReg;
} else
return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
@@ -4277,7 +4183,6 @@
Op0 = emitIntExt(SrcVT, Op0, RetVT, IsZExt);
if (!Op0)
return 0;
- Op0IsKill = true;
SrcVT = RetVT;
SrcBits = SrcVT.getSizeInBits();
IsZExt = true;
@@ -4295,16 +4200,15 @@
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), TmpReg)
.addImm(0)
- .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op0)
.addImm(AArch64::sub_32);
Op0 = TmpReg;
- Op0IsKill = true;
}
- return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+ return fastEmitInst_rii(Opc, RC, Op0, ImmR, ImmS);
}
-unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
- unsigned Op1Reg, bool Op1IsKill) {
+unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg,
+ unsigned Op1Reg) {
unsigned Opc = 0;
bool NeedTrunc = false;
uint64_t Mask = 0;
@@ -4320,19 +4224,16 @@
(RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
if (NeedTrunc) {
Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*isZExt=*/false);
- Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
- Op0IsKill = Op1IsKill = true;
+ Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Mask);
}
- unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
- Op1IsKill);
+ unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg);
if (NeedTrunc)
- ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask);
return ResultReg;
}
unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
- bool Op0IsKill, uint64_t Shift,
- bool IsZExt) {
+ uint64_t Shift, bool IsZExt) {
assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
"Unexpected source/return type pair.");
assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
@@ -4354,7 +4255,7 @@
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(Op0, getKillRegState(Op0IsKill));
+ .addReg(Op0);
return ResultReg;
} else
return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
@@ -4404,12 +4305,11 @@
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), TmpReg)
.addImm(0)
- .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op0)
.addImm(AArch64::sub_32);
Op0 = TmpReg;
- Op0IsKill = true;
}
- return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+ return fastEmitInst_rii(Opc, RC, Op0, ImmR, ImmS);
}
unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
@@ -4470,7 +4370,7 @@
const TargetRegisterClass *RC =
(DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
- return fastEmitInst_rii(Opc, RC, SrcReg, /*TODO:IsKill=*/false, 0, Imm);
+ return fastEmitInst_rii(Opc, RC, SrcReg, 0, Imm);
}
static bool isZExtLoad(const MachineInstr *LI) {
@@ -4593,7 +4493,6 @@
unsigned SrcReg = getRegForValue(I->getOperand(0));
if (!SrcReg)
return false;
- bool SrcIsKill = hasTrivialKill(I->getOperand(0));
// Try to optimize already sign-/zero-extended values from function arguments.
bool IsZExt = isa<ZExtInst>(I);
@@ -4604,17 +4503,10 @@
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), ResultReg)
.addImm(0)
- .addReg(SrcReg, getKillRegState(SrcIsKill))
+ .addReg(SrcReg)
.addImm(AArch64::sub_32);
SrcReg = ResultReg;
}
- // Conservatively clear all kill flags from all uses, because we are
- // replacing a sign-/zero-extend instruction at IR level with a nop at MI
- // level. The result of the instruction at IR level might have been
- // trivially dead, which is now not longer true.
- unsigned UseReg = lookUpRegForValue(I);
- if (UseReg)
- MRI.clearKillFlags(UseReg);
updateValueMap(I, SrcReg);
return true;
@@ -4654,23 +4546,18 @@
unsigned Src0Reg = getRegForValue(I->getOperand(0));
if (!Src0Reg)
return false;
- bool Src0IsKill = hasTrivialKill(I->getOperand(0));
unsigned Src1Reg = getRegForValue(I->getOperand(1));
if (!Src1Reg)
return false;
- bool Src1IsKill = hasTrivialKill(I->getOperand(1));
const TargetRegisterClass *RC =
(DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
- unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, /*IsKill=*/false,
- Src1Reg, /*IsKill=*/false);
+ unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, Src1Reg);
assert(QuotReg && "Unexpected DIV instruction emission failure.");
// The remainder is computed as numerator - (quotient * denominator) using the
// MSUB instruction.
- unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, /*IsKill=*/true,
- Src1Reg, Src1IsKill, Src0Reg,
- Src0IsKill);
+ unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, Src1Reg, Src0Reg);
updateValueMap(I, ResultReg);
return true;
}
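// A minimal illustrative sketch, in plain C++, of the remainder expansion
// described above (function name is hypothetical; Den assumed non-zero):
// divide, then multiply-subtract the quotient back out.
long long remViaDivMsub(long long Num, long long Den) {
  long long Quot = Num / Den;   // SDIV (UDIV for the unsigned case)
  return Num - Quot * Den;      // MSUB: Num - Quot * Den
}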
@@ -4718,10 +4605,9 @@
unsigned Src0Reg = getRegForValue(Src0);
if (!Src0Reg)
return false;
- bool Src0IsKill = hasTrivialKill(Src0);
unsigned ResultReg =
- emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt);
+ emitLSL_ri(VT, SrcVT, Src0Reg, ShiftVal, IsZExt);
if (ResultReg) {
updateValueMap(I, ResultReg);
@@ -4732,14 +4618,12 @@
unsigned Src0Reg = getRegForValue(I->getOperand(0));
if (!Src0Reg)
return false;
- bool Src0IsKill = hasTrivialKill(I->getOperand(0));
unsigned Src1Reg = getRegForValue(I->getOperand(1));
if (!Src1Reg)
return false;
- bool Src1IsKill = hasTrivialKill(I->getOperand(1));
- unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill);
+ unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src1Reg);
if (!ResultReg)
return false;
@@ -4785,18 +4669,17 @@
unsigned Op0Reg = getRegForValue(Op0);
if (!Op0Reg)
return false;
- bool Op0IsKill = hasTrivialKill(Op0);
switch (I->getOpcode()) {
default: llvm_unreachable("Unexpected instruction.");
case Instruction::Shl:
- ResultReg = emitLSL_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+ ResultReg = emitLSL_ri(RetVT, SrcVT, Op0Reg, ShiftVal, IsZExt);
break;
case Instruction::AShr:
- ResultReg = emitASR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+ ResultReg = emitASR_ri(RetVT, SrcVT, Op0Reg, ShiftVal, IsZExt);
break;
case Instruction::LShr:
- ResultReg = emitLSR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+ ResultReg = emitLSR_ri(RetVT, SrcVT, Op0Reg, ShiftVal, IsZExt);
break;
}
if (!ResultReg)
@@ -4809,24 +4692,22 @@
unsigned Op0Reg = getRegForValue(I->getOperand(0));
if (!Op0Reg)
return false;
- bool Op0IsKill = hasTrivialKill(I->getOperand(0));
unsigned Op1Reg = getRegForValue(I->getOperand(1));
if (!Op1Reg)
return false;
- bool Op1IsKill = hasTrivialKill(I->getOperand(1));
unsigned ResultReg = 0;
switch (I->getOpcode()) {
default: llvm_unreachable("Unexpected instruction.");
case Instruction::Shl:
- ResultReg = emitLSL_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+ ResultReg = emitLSL_rr(RetVT, Op0Reg, Op1Reg);
break;
case Instruction::AShr:
- ResultReg = emitASR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+ ResultReg = emitASR_rr(RetVT, Op0Reg, Op1Reg);
break;
case Instruction::LShr:
- ResultReg = emitLSR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+ ResultReg = emitLSR_rr(RetVT, Op0Reg, Op1Reg);
break;
}
@@ -4868,9 +4749,8 @@
unsigned Op0Reg = getRegForValue(I->getOperand(0));
if (!Op0Reg)
return false;
- bool Op0IsKill = hasTrivialKill(I->getOperand(0));
- unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg, Op0IsKill);
+ unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg);
if (!ResultReg)
return false;
@@ -4933,10 +4813,9 @@
unsigned Src0Reg = getRegForValue(I->getOperand(0));
if (!Src0Reg)
return false;
- bool Src0IsKill = hasTrivialKill(I->getOperand(0));
if (cast<BinaryOperator>(I)->isExact()) {
- unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Src0IsKill, Lg2);
+ unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Lg2);
if (!ResultReg)
return false;
updateValueMap(I, ResultReg);
@@ -4944,12 +4823,12 @@
}
int64_t Pow2MinusOne = (1ULL << Lg2) - 1;
- unsigned AddReg = emitAdd_ri_(VT, Src0Reg, /*IsKill=*/false, Pow2MinusOne);
+ unsigned AddReg = emitAdd_ri_(VT, Src0Reg, Pow2MinusOne);
if (!AddReg)
return false;
// (Src0 < 0) ? Pow2 - 1 : 0;
- if (!emitICmp_ri(VT, Src0Reg, /*IsKill=*/false, 0))
+ if (!emitICmp_ri(VT, Src0Reg, 0))
return false;
unsigned SelectOpc;
@@ -4961,9 +4840,8 @@
SelectOpc = AArch64::CSELWr;
RC = &AArch64::GPR32RegClass;
}
- unsigned SelectReg =
- fastEmitInst_rri(SelectOpc, RC, AddReg, /*IsKill=*/true, Src0Reg,
- Src0IsKill, AArch64CC::LT);
+ unsigned SelectReg = fastEmitInst_rri(SelectOpc, RC, AddReg, Src0Reg,
+ AArch64CC::LT);
if (!SelectReg)
return false;
@@ -4972,10 +4850,10 @@
unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
unsigned ResultReg;
if (C.isNegative())
- ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, /*IsKill=*/true,
- SelectReg, /*IsKill=*/true, AArch64_AM::ASR, Lg2);
+ ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, SelectReg,
+ AArch64_AM::ASR, Lg2);
else
- ResultReg = emitASR_ri(VT, VT, SelectReg, /*IsKill=*/true, Lg2);
+ ResultReg = emitASR_ri(VT, VT, SelectReg, Lg2);
if (!ResultReg)
return false;
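// A minimal illustrative sketch, in plain C++, of the power-of-two signed
// division sequence selected above (function name is hypothetical; assumes
// arithmetic right shift on signed values): add Pow2-1 only when the
// dividend is negative, shift right by Lg2, and negate when the original
// divisor was negative.
#include <cstdint>

int64_t sdivByPow2(int64_t Src0, unsigned Lg2, bool DivisorIsNegative) {
  int64_t Add = Src0 + ((int64_t(1) << Lg2) - 1);  // emitAdd_ri_ with Pow2-1
  int64_t Sel = (Src0 < 0) ? Add : Src0;           // CSEL ..., LT after emitICmp_ri
  return DivisorIsNegative ? -(Sel >> Lg2)         // 0 - (Sel >> Lg2) via emitAddSub_rs
                           : (Sel >> Lg2);         // emitASR_ri
}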
@@ -4987,23 +4865,20 @@
/// This is mostly a copy of the existing FastISel getRegForGEPIndex code. We
/// have to duplicate it for AArch64, because otherwise we would fail during the
/// sign-extend emission.
-std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) {
+unsigned AArch64FastISel::getRegForGEPIndex(const Value *Idx) {
unsigned IdxN = getRegForValue(Idx);
if (IdxN == 0)
// Unhandled operand. Halt "fast" selection and bail.
- return std::pair<unsigned, bool>(0, false);
-
- bool IdxNIsKill = hasTrivialKill(Idx);
+ return 0;
// If the index is smaller or larger than intptr_t, truncate or extend it.
MVT PtrVT = TLI.getPointerTy(DL);
EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
if (IdxVT.bitsLT(PtrVT)) {
IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*isZExt=*/false);
- IdxNIsKill = true;
} else if (IdxVT.bitsGT(PtrVT))
llvm_unreachable("AArch64 FastISel doesn't support types larger than i64");
- return std::pair<unsigned, bool>(IdxN, IdxNIsKill);
+ return IdxN;
}
/// This is mostly a copy of the existing FastISel GEP code, but we have to
@@ -5017,7 +4892,6 @@
unsigned N = getRegForValue(I->getOperand(0));
if (!N)
return false;
- bool NIsKill = hasTrivialKill(I->getOperand(0));
// Keep a running tab of the total offset to coalesce multiple N = N + Offset
// into a single N = N + TotalOffset.
@@ -5044,18 +4918,15 @@
continue;
}
if (TotalOffs) {
- N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
+ N = emitAdd_ri_(VT, N, TotalOffs);
if (!N)
return false;
- NIsKill = true;
TotalOffs = 0;
}
// N = N + Idx * ElementSize;
uint64_t ElementSize = DL.getTypeAllocSize(Ty);
- std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx);
- unsigned IdxN = Pair.first;
- bool IdxNIsKill = Pair.second;
+ unsigned IdxN = getRegForGEPIndex(Idx);
if (!IdxN)
return false;
@@ -5063,18 +4934,17 @@
unsigned C = fastEmit_i(VT, VT, ISD::Constant, ElementSize);
if (!C)
return false;
- IdxN = emitMul_rr(VT, IdxN, IdxNIsKill, C, true);
+ IdxN = emitMul_rr(VT, IdxN, C);
if (!IdxN)
return false;
- IdxNIsKill = true;
}
- N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill);
+ N = fastEmit_rr(VT, VT, ISD::ADD, N, IdxN);
if (!N)
return false;
}
}
if (TotalOffs) {
- N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
+ N = emitAdd_ri_(VT, N, TotalOffs);
if (!N)
return false;
}
@@ -5230,11 +5100,7 @@
return selectOperator(I, I->getOpcode());
}
-namespace llvm {
-
FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) {
return new AArch64FastISel(FuncInfo, LibInfo);
}
-
-} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 65ee501..f6a528c 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -47,8 +47,9 @@
// | callee-saved gpr registers | <--.
// | | | On Darwin platforms these
// |- - - - - - - - - - - - - - - - - -| | callee saves are swapped,
-// | | | (frame record first)
-// | prev_fp, prev_lr | <--'
+// | prev_lr | | (frame record first)
+// | prev_fp | <--'
+// | async context if needed |
// | (a.k.a. "frame record") |
// |-----------------------------------| <- fp(=x29)
// | |
@@ -107,8 +108,14 @@
// so large that the offset can't be encoded in the immediate fields of loads
// or stores.
//
+// Outgoing function arguments must be at the bottom of the stack frame when
+// calling another function. If we do not have variable-sized stack objects, we
+// can allocate a "reserved call frame" area at the bottom of the local
+// variable area, large enough for all outgoing calls. If we do have VLAs, then
+// the stack pointer must be decremented and incremented around each call to
+// make space for the arguments below the VLAs.
+//
// FIXME: also explain the redzone concept.
-// FIXME: also explain the concept of reserved call frames.
//
//===----------------------------------------------------------------------===//
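// A minimal illustrative sketch, in plain C++, of the two cases described in
// the comment above. The callee is hypothetical and takes more arguments than
// fit in x0-x7, so the tail of the list is passed on the stack; with only
// fixed-size locals that outgoing-argument space can be reserved once in the
// prologue, while a variable-sized allocation forces the stack pointer to be
// adjusted around each call instead.
#include <cstddef>

void callee(long, long, long, long, long, long, long, long, long, long);

void fixedLocals() {                  // eligible for a reserved call frame
  long Buf[4] = {0, 1, 2, 3};
  callee(Buf[0], Buf[1], Buf[2], Buf[3], 4, 5, 6, 7, 8, 9);
}

void variableLocals(std::size_t N) {  // SP bumped around the call instead
  char *P = static_cast<char *>(__builtin_alloca(N));
  callee(P[0], 0, 0, 0, 0, 0, 0, 0, 0, 0);
}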
@@ -179,11 +186,21 @@
cl::desc("sort stack allocations"),
cl::init(true), cl::Hidden);
+cl::opt<bool> EnableHomogeneousPrologEpilog(
+ "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden,
+ cl::desc("Emit homogeneous prologue and epilogue for the size "
+ "optimization (default = off)"));
+
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
-/// Returns the argument pop size.
-static uint64_t getArgumentPopSize(MachineFunction &MF,
- MachineBasicBlock &MBB) {
+/// Returns how much of the incoming argument stack area (in bytes) we should
+/// clean up in an epilogue. For the C calling convention this will be 0; for
+/// guaranteed tail call conventions it can be positive (a normal return or a
+/// tail call to a function that uses less stack space for arguments) or
+/// negative (for a tail call to a function that needs more stack space than us
+/// for arguments).
+static int64_t getArgumentStackToRestore(MachineFunction &MF,
+ MachineBasicBlock &MBB) {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
bool IsTailCallReturn = false;
if (MBB.end() != MBBI) {
@@ -194,7 +211,7 @@
}
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- uint64_t ArgumentPopSize = 0;
+ int64_t ArgumentPopSize = 0;
if (IsTailCallReturn) {
MachineOperand &StackAdjust = MBBI->getOperand(1);
@@ -213,6 +230,47 @@
return ArgumentPopSize;
}
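// A minimal illustrative sketch, in plain C++, of the sign convention
// documented above (the function and its parameters are hypothetical; the
// real value is read from the tail call's stack-adjustment operand):
#include <cstdint>

int64_t illustrateArgumentStackToRestore(int64_t CallerIncomingArgBytes,
                                         int64_t CalleeIncomingArgBytes,
                                         bool IsGuaranteedTailCall) {
  if (!IsGuaranteedTailCall)
    return 0;  // e.g. the plain C calling convention
  // Positive when the callee needs less argument stack than we received,
  // negative when it needs more and the epilogue must grow the area.
  return CallerIncomingArgBytes - CalleeIncomingArgBytes;
}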
+static bool produceCompactUnwindFrame(MachineFunction &MF);
+static bool needsWinCFI(const MachineFunction &MF);
+static StackOffset getSVEStackSize(const MachineFunction &MF);
+
+/// Returns true if homogeneous prolog or epilog code can be emitted
+/// for the size optimization. If possible, a frame helper call is injected.
+/// When an Exit block is given, this check is for the epilog.
+bool AArch64FrameLowering::homogeneousPrologEpilog(
+ MachineFunction &MF, MachineBasicBlock *Exit) const {
+ if (!MF.getFunction().hasMinSize())
+ return false;
+ if (!EnableHomogeneousPrologEpilog)
+ return false;
+ if (ReverseCSRRestoreSeq)
+ return false;
+ if (EnableRedZone)
+ return false;
+
+ // TODO: Windows CFI is not supported yet.
+ if (needsWinCFI(MF))
+ return false;
+ // TODO: SVE is not supported yet.
+ if (getSVEStackSize(MF))
+ return false;
+
+ // Bail on stack adjustment needed on return for simplicity.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
+ return false;
+ if (Exit && getArgumentStackToRestore(MF, *Exit))
+ return false;
+
+ return true;
+}
+
+/// Returns true if CSRs should be paired.
+bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
+ return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
+}
+
/// This is the biggest offset to the stack pointer we can encode in aarch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exception here are vector stores/loads which cannot encode any
@@ -258,10 +316,10 @@
const AArch64FunctionInfo *AFI, bool IsWin64,
bool IsFunclet) {
if (!IsWin64 || IsFunclet) {
- // Only Win64 uses fixed objects, and then only for the function (not
- // funclets)
- return 0;
+ return AFI->getTailCallReservedStack();
} else {
+ if (AFI->getTailCallReservedStack() != 0)
+ report_fatal_error("cannot generate ABI-changing tail call for Win64");
// Var args are stored here in the primary function.
const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
// To support EH funclets we allocate an UnwindHelp object
@@ -279,16 +337,20 @@
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
if (!EnableRedZone)
return false;
+
// Don't use the red zone if the function explicitly asks us not to.
// This is typically used for kernel code.
- if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const unsigned RedZoneSize =
+ Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
+ if (!RedZoneSize)
return false;
const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
uint64_t NumBytes = AFI->getLocalStackSize();
- return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
+ return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
getSVEStackSize(MF));
}
@@ -307,7 +369,7 @@
return true;
if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
MFI.hasStackMap() || MFI.hasPatchPoint() ||
- RegInfo->needsStackRealignment(MF))
+ RegInfo->hasStackRealignment(MF))
return true;
// With large callframes around we may need to use FP to access the scavenging
// emergency spillslot.
@@ -560,7 +622,7 @@
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// Don't need a scratch register if we're not going to re-align the stack.
- if (!RegInfo->needsStackRealignment(*MF))
+ if (!RegInfo->hasStackRealignment(*MF))
return true;
// Otherwise, we can use any block as long as it has a scratch register
// available.
@@ -596,6 +658,8 @@
const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ if (homogeneousPrologEpilog(MF))
+ return false;
if (AFI->getLocalStackSize() == 0)
return false;
@@ -620,7 +684,7 @@
if (MFI.hasVarSizedObjects())
return false;
- if (RegInfo->needsStackRealignment(MF))
+ if (RegInfo->hasStackRealignment(MF))
return false;
// This isn't strictly necessary, but it simplifies things a bit since the
@@ -828,21 +892,17 @@
++MBBI;
}
unsigned NewOpc;
- int Scale = 1;
switch (MBBI->getOpcode()) {
default:
llvm_unreachable("Unexpected callee-save save/restore opcode!");
case AArch64::STPXi:
NewOpc = AArch64::STPXpre;
- Scale = 8;
break;
case AArch64::STPDi:
NewOpc = AArch64::STPDpre;
- Scale = 8;
break;
case AArch64::STPQi:
NewOpc = AArch64::STPQpre;
- Scale = 16;
break;
case AArch64::STRXui:
NewOpc = AArch64::STRXpre;
@@ -855,15 +915,12 @@
break;
case AArch64::LDPXi:
NewOpc = AArch64::LDPXpost;
- Scale = 8;
break;
case AArch64::LDPDi:
NewOpc = AArch64::LDPDpost;
- Scale = 8;
break;
case AArch64::LDPQi:
NewOpc = AArch64::LDPQpost;
- Scale = 16;
break;
case AArch64::LDRXui:
NewOpc = AArch64::LDRXpost;
@@ -882,6 +939,25 @@
SEH->eraseFromParent();
}
+ TypeSize Scale = TypeSize::Fixed(1);
+ unsigned Width;
+ int64_t MinOffset, MaxOffset;
+ bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
+ NewOpc, Scale, Width, MinOffset, MaxOffset);
+ (void)Success;
+ assert(Success && "unknown load/store opcode");
+
+ // If the first store isn't right where we want SP then we can't fold the
+ // update in, so create a normal arithmetic instruction instead.
+ if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
+ CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(CSStackSizeInc), TII,
+ InProlog ? MachineInstr::FrameSetup
+ : MachineInstr::FrameDestroy);
+ return std::prev(MBBI);
+ }
+
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
MIB.addReg(AArch64::SP, RegState::Define);
@@ -897,7 +973,7 @@
assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
"Unexpected base register in callee-save save/restore instruction!");
assert(CSStackSizeInc % Scale == 0);
- MIB.addImm(CSStackSizeInc / Scale);
+ MIB.addImm(CSStackSizeInc / (int)Scale);
MIB.setMIFlags(MBBI->getFlags());
MIB.setMemRefs(MBBI->memoperands());
@@ -1053,16 +1129,23 @@
const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
if (MFnI.shouldSignReturnAddress()) {
+
+ unsigned PACI;
if (MFnI.shouldSignWithBKey()) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
.setMIFlag(MachineInstr::FrameSetup);
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP))
- .setMIFlag(MachineInstr::FrameSetup);
+ PACI = Subtarget.hasPAuth() ? AArch64::PACIB : AArch64::PACIBSP;
} else {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
- .setMIFlag(MachineInstr::FrameSetup);
+ PACI = Subtarget.hasPAuth() ? AArch64::PACIA : AArch64::PACIASP;
}
+ auto MI = BuildMI(MBB, MBBI, DL, TII->get(PACI));
+ if (Subtarget.hasPAuth())
+ MI.addReg(AArch64::LR, RegState::Define)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::SP, RegState::InternalRead);
+ MI.setMIFlag(MachineInstr::FrameSetup);
+
unsigned CFIIndex =
MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
@@ -1070,6 +1153,18 @@
.setMIFlags(MachineInstr::FrameSetup);
}
+ // We signal the presence of a Swift extended frame to external tools by
+ // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
+ // ORR is sufficient; it is assumed a Swift kernel would initialize the TBI
+ // bits so that this is still true.
+ if (HasFP && AFI->hasSwiftAsyncContext()) {
+ // ORR x29, x29, #0x1000_0000_0000_0000
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
+ .addUse(AArch64::FP)
+ .addImm(0x1100)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
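// A minimal illustrative sketch, in plain C++, of the bit-60 tag described in
// the comment above (names are hypothetical): the prologue ORRs 0b0001 into
// bits 63:60 of FP, and the matching epilogue code further below clears it
// again before returning.
#include <cstdint>

constexpr uint64_t SwiftExtendedFrameBit = uint64_t(1) << 60;

uint64_t tagFP(uint64_t FP)   { return FP | SwiftExtendedFrameBit; }   // ORR x29, x29, #0x1000000000000000
uint64_t untagFP(uint64_t FP) { return FP & ~SwiftExtendedFrameBit; }  // BIC x29, x29, #0x1000000000000000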
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
@@ -1139,12 +1234,16 @@
// All of the remaining stack allocations are for locals.
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+ bool HomPrologEpilog = homogeneousPrologEpilog(MF);
if (CombineSPBump) {
assert(!SVEStackSize && "Cannot combine SP bump with SVE");
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(-NumBytes), TII,
MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
NumBytes = 0;
+ } else if (HomPrologEpilog) {
+ // Stack has been already adjusted.
+ NumBytes -= PrologueSaveSize;
} else if (PrologueSaveSize != 0) {
MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
@@ -1172,13 +1271,35 @@
if (CombineSPBump)
FPOffset += AFI->getLocalStackSize();
- // Issue sub fp, sp, FPOffset or
- // mov fp,sp when FPOffset is zero.
- // Note: All stores of callee-saved registers are marked as "FrameSetup".
- // This code marks the instruction(s) that set the FP also.
- emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
- StackOffset::getFixed(FPOffset), TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ if (AFI->hasSwiftAsyncContext()) {
+ // Before we update the live FP we have to ensure there's a valid (or
+ // null) asynchronous context in its slot just before FP in the frame
+ // record, so store it now.
+ const auto &Attrs = MF.getFunction().getAttributes();
+ bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
+ if (HaveInitialContext)
+ MBB.addLiveIn(AArch64::X22);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
+ .addUse(HaveInitialContext ? AArch64::X22 : AArch64::XZR)
+ .addUse(AArch64::SP)
+ .addImm(FPOffset - 8)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+
+ if (HomPrologEpilog) {
+ auto Prolog = MBBI;
+ --Prolog;
+ assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
+ Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
+ } else {
+ // Issue sub fp, sp, FPOffset or
+ // mov fp,sp when FPOffset is zero.
+ // Note: All stores of callee-saved registers are marked as "FrameSetup".
+ // This code marks the instruction(s) that set the FP also.
+ emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
+ StackOffset::getFixed(FPOffset), TII,
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ }
}
if (windowsRequiresStackProbe(MF, NumBytes)) {
@@ -1306,7 +1427,7 @@
if (NumBytes) {
// Alignment is required for the parent frame, not the funclet
const bool NeedsRealignment =
- !IsFunclet && RegInfo->needsStackRealignment(MF);
+ !IsFunclet && RegInfo->hasStackRealignment(MF);
unsigned scratchSPReg = AArch64::SP;
if (NeedsRealignment) {
@@ -1561,9 +1682,9 @@
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
return;
- // Initial and residual are named for consistency with the prologue. Note that
- // in the epilogue, the residual adjustment is executed first.
- uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB);
+ // How much of the stack used by incoming arguments this function is expected
+ // to restore in this particular epilogue.
+ int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
// The stack frame should be like below,
//
@@ -1598,7 +1719,7 @@
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
- uint64_t AfterCSRPopSize = ArgumentPopSize;
+ int64_t AfterCSRPopSize = ArgumentStackToRestore;
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
// We cannot rely on the local stack size set in emitPrologue if the function
// has funclets, as funclets have different local stack size requirements, and
@@ -1606,6 +1727,25 @@
// function.
if (MF.hasEHFunclets())
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
+ if (homogeneousPrologEpilog(MF, &MBB)) {
+ assert(!NeedsWinCFI);
+ auto LastPopI = MBB.getFirstTerminator();
+ if (LastPopI != MBB.begin()) {
+ auto HomogeneousEpilog = std::prev(LastPopI);
+ if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
+ LastPopI = HomogeneousEpilog;
+ }
+
+ // Adjust local stack
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(AFI->getLocalStackSize()), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI);
+
+ // SP has already been adjusted while restoring callee save regs.
+ // We've already bailed out of the case that adjusts SP for arguments.
+ assert(AfterCSRPopSize == 0);
+ return;
+ }
bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
// Assume we can't combine the last pop with the sp restore.
@@ -1616,8 +1756,10 @@
// Converting the last ldp to a post-index ldp is valid only if the last
// ldp's offset is 0.
const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
- // If the offset is 0, convert it to a post-index ldp.
- if (OffsetOp.getImm() == 0)
+ // If the offset is 0 and the AfterCSR pop is not actually trying to
+ // allocate more stack for arguments (in space that an untimely interrupt
+ // may clobber), convert it to a post-index ldp.
+ if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0)
convertCalleeSaveRestoreToSPPrePostIncDec(
MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
else {
@@ -1657,6 +1799,18 @@
.setMIFlag(MachineInstr::FrameDestroy);
}
+ if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
+ // We need to reset FP to its untagged state on return. Bit 60 is currently
+ // used to show the presence of an extended frame.
+
+ // BIC x29, x29, #0x1000_0000_0000_0000
+ BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
+ AArch64::FP)
+ .addUse(AArch64::FP)
+ .addImm(0x10fe)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
+
const StackOffset &SVEStackSize = getSVEStackSize(MF);
// If there is a single SP update, insert it before the ret and we're done.
@@ -1776,6 +1930,8 @@
// assumes the SP is at the same location as it was after the callee-save save
// code in the prologue.
if (AfterCSRPopSize) {
+ assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
+ "interrupt may have clobbered");
// Find an insertion point for the first ldp so that it goes before the
// shadow call stack epilog instruction. This ensures that the restore of
// lr from x18 is placed after the restore from sp.
@@ -1791,7 +1947,7 @@
adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed((int64_t)AfterCSRPopSize), TII,
+ StackOffset::getFixed(AfterCSRPopSize), TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
}
if (HasWinCFI)
@@ -1893,13 +2049,13 @@
// Argument access should always use the FP.
if (isFixed) {
UseFP = hasFP(MF);
- } else if (isCSR && RegInfo->needsStackRealignment(MF)) {
+ } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
// References to the CSR area must use FP if we're re-aligning the stack
// since the dynamically-sized alignment padding is between the SP/BP and
// the CSR area.
assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
UseFP = true;
- } else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) {
+ } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
// If the FPOffset is negative and we're producing a signed immediate, we
// have to keep in mind that the available offset range for negative
// offsets is smaller than for positive ones. If an offset is available
@@ -1941,9 +2097,10 @@
}
}
- assert(((isFixed || isCSR) || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
- "In the presence of dynamic stack pointer realignment, "
- "non-argument/CSR objects cannot be accessed through the frame pointer");
+ assert(
+ ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
+ "In the presence of dynamic stack pointer realignment, "
+ "non-argument/CSR objects cannot be accessed through the frame pointer");
if (isSVE) {
StackOffset FPOffset =
@@ -1953,10 +2110,9 @@
StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
ObjectOffset);
// Always use the FP for SVE spills if available and beneficial.
- if (hasFP(MF) &&
- (SPOffset.getFixed() ||
- FPOffset.getScalable() < SPOffset.getScalable() ||
- RegInfo->needsStackRealignment(MF))) {
+ if (hasFP(MF) && (SPOffset.getFixed() ||
+ FPOffset.getScalable() < SPOffset.getScalable() ||
+ RegInfo->hasStackRealignment(MF))) {
FrameReg = RegInfo->getFrameRegister(MF);
return FPOffset;
}
@@ -2009,7 +2165,8 @@
AttributeList Attrs = MF.getFunction().getAttributes();
return Subtarget.isTargetMachO() &&
!(Subtarget.getTargetLowering()->supportSwiftError() &&
- Attrs.hasAttrSomewhere(Attribute::SwiftError));
+ Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
+ MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
}
static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
@@ -2123,6 +2280,7 @@
FirstReg = Count - 1;
}
int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
+ bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
// When iterating backwards, the loop condition relies on unsigned wraparound.
for (unsigned i = FirstReg; i < Count; i += RegInc) {
@@ -2221,22 +2379,27 @@
else
ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
+ // Swift's async context is directly before FP, so allocate an extra
+ // 8 bytes for it.
+ if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
+ RPI.Reg2 == AArch64::FP)
+ ByteOffset += StackFillDir * 8;
+
assert(!(RPI.isScalable() && RPI.isPaired()) &&
"Paired spill/fill instructions don't exist for SVE vectors");
// Round up size of non-pair to pair size if we need to pad the
// callee-save area to ensure 16-byte alignment.
- if (AFI->hasCalleeSaveStackFreeSpace() && !NeedsWinCFI &&
+ if (NeedGapToAlignStack && !NeedsWinCFI &&
!RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
- !RPI.isPaired()) {
+ !RPI.isPaired() && ByteOffset % 16 != 0) {
ByteOffset += 8 * StackFillDir;
- assert(ByteOffset % 16 == 0);
assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
// A stack frame with a gap looks like this, bottom up:
// d9, d8. x21, gap, x20, x19.
- // Set extra alignment on the x21 object (the only unpaired register)
- // to create the gap above it.
+ // Set extra alignment on the x21 object to create the gap above it.
MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
+ NeedGapToAlignStack = false;
}
int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
@@ -2244,6 +2407,12 @@
// If filling top down (default), we want the offset after incrementing it.
// If filling bottom up (WinCFI) we need the original offset.
int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
+
+ // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
+ // Swift context can directly precede FP.
+ if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
+ RPI.Reg2 == AArch64::FP)
+ Offset += 8;
RPI.Offset = Offset / Scale;
assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
@@ -2324,6 +2493,22 @@
MBB.addLiveIn(AArch64::X18);
}
+ if (homogeneousPrologEpilog(MF)) {
+ auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ for (auto &RPI : RegPairs) {
+ MIB.addReg(RPI.Reg1);
+ MIB.addReg(RPI.Reg2);
+
+ // Update register live in.
+ if (!MRI.isReserved(RPI.Reg1))
+ MBB.addLiveIn(RPI.Reg1);
+ if (!MRI.isReserved(RPI.Reg2))
+ MBB.addLiveIn(RPI.Reg2);
+ }
+ return true;
+ }
for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
++RPII) {
RegPairInfo RPI = *RPII;
@@ -2519,6 +2704,14 @@
for (const RegPairInfo &RPI : reverse(RegPairs))
if (!RPI.isScalable())
EmitMI(RPI);
+ } else if (homogeneousPrologEpilog(MF, &MBB)) {
+ auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
+ .setMIFlag(MachineInstr::FrameDestroy);
+ for (auto &RPI : RegPairs) {
+ MIB.addReg(RPI.Reg1, RegState::Define);
+ MIB.addReg(RPI.Reg2, RegState::Define);
+ }
+ return true;
} else
for (const RegPairInfo &RPI : RegPairs)
if (!RPI.isScalable())
@@ -2588,7 +2781,7 @@
// MachO's compact unwind format relies on all registers being stored in
// pairs.
// FIXME: the usual format is actually better if unwinding isn't needed.
- if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister &&
+ if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
!SavedRegs.test(PairedReg)) {
SavedRegs.set(PairedReg);
if (AArch64::GPR64RegClass.contains(PairedReg) &&
@@ -2667,7 +2860,7 @@
// MachO's compact unwind format relies on all registers being stored in
// pairs, so if we need to spill one extra for BigStack, then we need to
// store the pair.
- if (produceCompactUnwindFrame(MF))
+ if (producePairRegisters(MF))
SavedRegs.set(UnspilledCSGPRPaired);
ExtraCSSpill = UnspilledCSGPR;
}
@@ -2688,6 +2881,12 @@
// Adding the size of additional 64bit GPR saves.
CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
+
+ // A Swift asynchronous context extends the frame record with a pointer
+ // directly before FP.
+ if (hasFP(MF) && AFI->hasSwiftAsyncContext())
+ CSStackSize += 8;
+
uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
<< EstimatedStackSize + AlignedCSStackSize
@@ -2705,8 +2904,9 @@
}
bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
- MachineFunction &MF, const TargetRegisterInfo *TRI,
- std::vector<CalleeSavedInfo> &CSI) const {
+ MachineFunction &MF, const TargetRegisterInfo *RegInfo,
+ std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
+ unsigned &MaxCSFrameIndex) const {
bool NeedsWinCFI = needsWinCFI(MF);
// To match the canonical windows frame layout, reverse the list of
// callee saved registers to get them laid out by PrologEpilogInserter
@@ -2715,8 +2915,35 @@
// the top, thus have the CSI array start from the highest registers.)
if (NeedsWinCFI)
std::reverse(CSI.begin(), CSI.end());
- // Let the generic code do the rest of the setup.
- return false;
+
+ if (CSI.empty())
+ return true; // Early exit if no callee saved registers are modified!
+
+ // Now that we know which registers need to be saved and restored, allocate
+ // stack slots for them.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+ for (auto &CS : CSI) {
+ Register Reg = CS.getReg();
+ const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+
+ unsigned Size = RegInfo->getSpillSize(*RC);
+ Align Alignment(RegInfo->getSpillAlign(*RC));
+ int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
+ CS.setFrameIdx(FrameIdx);
+
+ if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
+ if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
+
+ // Grab 8 bytes below FP for the extended asynchronous frame info.
+ if (hasFP(MF) && AFI->hasSwiftAsyncContext() && Reg == AArch64::FP) {
+ FrameIdx = MFI.CreateStackObject(8, Alignment, true);
+ AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
+ if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
+ if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
+ }
+ }
+ return true;
}
bool AArch64FrameLowering::enableStackSlotScavenging(
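
For context on the Swift async context handling added in the frame-lowering hunks above: the following is a minimal standalone sketch, not the in-tree classes; the struct and helper names are invented for illustration. It shows the extended frame record layout the patch sets up, with an extra 8-byte slot directly below the saved FP so that FP - 8 addresses the async context (matching the SUBXri selection for swift_async_context_addr later in this patch).

#include <cstdint>

// Sketch only: the extended frame record is <async context, saved FP, saved
// LR>, so the context lives at FP - 8 and the FP/LR pair starts 8 bytes into
// the 24-byte region reserved for it.
struct ExtendedFrameRecord {
  uint64_t AsyncContext; // at FP - 8 (lowest address)
  uint64_t SavedFP;      // FP points here
  uint64_t SavedLR;      // at FP + 8
};

// The address handed back for swift_async_context_addr is simply FP - 8.
uintptr_t swiftAsyncContextAddr(uintptr_t FP) { return FP - 8; }

int main() {
  ExtendedFrameRecord Rec{};
  uintptr_t FP = reinterpret_cast<uintptr_t>(&Rec.SavedFP);
  return swiftAsyncContextAddr(FP) ==
                 reinterpret_cast<uintptr_t>(&Rec.AsyncContext)
             ? 0
             : 1;
}
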
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/src/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 80079a9..f8adaf3 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -67,10 +67,13 @@
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
- bool
- assignCalleeSavedSpillSlots(MachineFunction &MF,
- const TargetRegisterInfo *TRI,
- std::vector<CalleeSavedInfo> &CSI) const override;
+ bool hasSwiftExtendedFrame(const MachineFunction &MF) const;
+
+ bool assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI,
+ unsigned &MinCSFrameIndex,
+ unsigned &MaxCSFrameIndex) const override;
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
@@ -84,7 +87,7 @@
TargetStackID::Value getStackIDForScalableVectors() const override;
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
- RegScavenger *RS) const override;
+ RegScavenger *RS) const override;
void
processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
@@ -124,6 +127,16 @@
SmallVectorImpl<int> &ObjectsToAllocate) const override;
private:
+  /// Returns true if homogeneous prolog or epilog code can be emitted
+  /// as a size optimization. If so, HOM_Prolog/HOM_Epilog pseudo instructions
+  /// are emitted in place. When an Exit block is given, the check is for the
+  /// epilog.
+ bool homogeneousPrologEpilog(MachineFunction &MF,
+ MachineBasicBlock *Exit = nullptr) const;
+
+ /// Returns true if CSRs should be paired.
+ bool producePairRegisters(MachineFunction &MF) const;
+
bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
uint64_t StackBumpBytes) const;
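
As a rough illustration of what the homogeneous prolog/epilog hooks declared above are for: functions that save the same callee-saved register pairs can share one outlined prolog/epilog body instead of each emitting its own store/load sequence, which is the size win. The sketch below is invented for illustration (the map, the helper-naming scheme, and the function names are not the HOM_Prolog/HOM_Epilog expansion itself).

#include <map>
#include <string>
#include <utility>
#include <vector>

// Sketch only: key the shared prolog body by the ordered list of register
// pairs, mirroring how the HOM_Prolog pseudo carries the pairs as operands so
// a later pass can pick (or create) the shared body.
using RegPair = std::pair<std::string, std::string>;

static std::map<std::vector<RegPair>, std::string> EmittedHelpers;

std::string getOrCreatePrologHelper(const std::vector<RegPair> &Pairs) {
  auto It = EmittedHelpers.find(Pairs);
  if (It != EmittedHelpers.end())
    return It->second; // reuse the shared body: this is the size win
  std::string Name = "__prolog_helper_" + std::to_string(EmittedHelpers.size());
  EmittedHelpers.emplace(Pairs, Name);
  return Name;
}

int main() {
  std::vector<RegPair> A = {{"x19", "x20"}, {"x29", "x30"}};
  std::vector<RegPair> B = A;
  return getOrCreatePrologHelper(A) == getOrCreatePrologHelper(B) ? 0 : 1;
}
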
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/src/llvm-project/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index 528756b..87aef1d 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -29,6 +29,8 @@
{0, 32, AArch64::GPRRegBank},
// 7: GPR 64-bit value.
{0, 64, AArch64::GPRRegBank},
+ // 8: GPR 128-bit value.
+ {0, 128, AArch64::GPRRegBank},
};
// ValueMappings.
@@ -66,51 +68,55 @@
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
- // 22: GPR 64-bit value. <-- This must match Last3OpsIdx.
+ // 22: GPR 64-bit value.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ // 25: GPR 128-bit value. <-- This must match Last3OpsIdx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR128 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR128 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR128 - PMI_Min], 1},
// Cross register bank copies.
- // 25: FPR 16-bit value to GPR 16-bit. <-- This must match
+ // 28: FPR 16-bit value to GPR 16-bit. <-- This must match
// FirstCrossRegCpyIdx.
// Note: This is the kind of copy we see with physical registers.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
- // 27: FPR 32-bit value to GPR 32-bit value.
+ // 30: FPR 32-bit value to GPR 32-bit value.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
- // 29: FPR 64-bit value to GPR 64-bit value.
+ // 32: FPR 64-bit value to GPR 64-bit value.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
- // 31: FPR 128-bit value to GPR 128-bit value (invalid)
+ // 34: FPR 128-bit value to GPR 128-bit value (invalid)
{nullptr, 1},
{nullptr, 1},
- // 33: FPR 256-bit value to GPR 256-bit value (invalid)
+ // 36: FPR 256-bit value to GPR 256-bit value (invalid)
{nullptr, 1},
{nullptr, 1},
- // 35: FPR 512-bit value to GPR 512-bit value (invalid)
+ // 38: FPR 512-bit value to GPR 512-bit value (invalid)
{nullptr, 1},
{nullptr, 1},
- // 37: GPR 32-bit value to FPR 32-bit value.
+ // 40: GPR 32-bit value to FPR 32-bit value.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
- // 39: GPR 64-bit value to FPR 64-bit value. <-- This must match
+ // 42: GPR 64-bit value to FPR 64-bit value. <-- This must match
// LastCrossRegCpyIdx.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
- // 41: FPExt: 16 to 32. <-- This must match FPExt16To32Idx.
+ // 44: FPExt: 16 to 32. <-- This must match FPExt16To32Idx.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
- // 43: FPExt: 16 to 32. <-- This must match FPExt16To64Idx.
+  // 46: FPExt: 16 to 64. <-- This must match FPExt16To64Idx.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
- // 45: FPExt: 32 to 64. <-- This must match FPExt32To64Idx.
+ // 48: FPExt: 32 to 64. <-- This must match FPExt32To64Idx.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
- // 47: FPExt vector: 64 to 128. <-- This must match FPExt64To128Idx.
+ // 50: FPExt vector: 64 to 128. <-- This must match FPExt64To128Idx.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
- // 49: Shift scalar with 64 bit shift imm
+ // 52: Shift scalar with 64 bit shift imm
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
@@ -167,6 +173,8 @@
return 0;
if (Size <= 64)
return 1;
+ if (Size <= 128)
+ return 2;
return -1;
}
if (RBIdx == PMI_FirstFPR) {
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 94b5d77..17e530a 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -128,6 +128,24 @@
bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeUnscaled(N, 16, Base, OffImm);
}
+ template <unsigned Size, unsigned Max>
+ bool SelectAddrModeIndexedUImm(SDValue N, SDValue &Base, SDValue &OffImm) {
+ // Test if there is an appropriate addressing mode and check if the
+ // immediate fits.
+ bool Found = SelectAddrModeIndexed(N, Size, Base, OffImm);
+ if (Found) {
+ if (auto *CI = dyn_cast<ConstantSDNode>(OffImm)) {
+ int64_t C = CI->getSExtValue();
+ if (C <= Max)
+ return true;
+ }
+ }
+
+ // Otherwise, base only, materialize address in register.
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
+ return true;
+ }
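
The SelectAddrModeIndexedUImm template added above either keeps the scaled-immediate addressing mode (when a constant offset exists and is no larger than Max) or falls back to materializing the whole address and using offset 0. A standalone sketch of that decision follows, with illustrative types and names rather than the SelectionDAG API.

#include <cstdint>
#include <optional>

struct AddrMode {
  bool BaseOnly;  // true -> whole address materialized in a register, offset 0
  int64_t OffImm; // scaled immediate when BaseOnly is false
};

// Mirrors the shape of SelectAddrModeIndexedUImm<Size, Max>: keep the indexed
// form only when a constant offset exists and fits the immediate field;
// otherwise fall back to base-only addressing.
AddrMode selectIndexedUImm(std::optional<int64_t> ScaledImm, int64_t Max) {
  if (ScaledImm && *ScaledImm <= Max)
    return {false, *ScaledImm};
  return {true, 0};
}

int main() {
  AddrMode A = selectIndexedUImm(5, 7);    // fits: use [base, #5]
  AddrMode B = selectIndexedUImm(1000, 7); // too large: base only, offset 0
  return (!A.BaseOnly && A.OffImm == 5 && B.BaseOnly && B.OffImm == 0) ? 0 : 1;
}
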
template<int Width>
bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
@@ -186,9 +204,9 @@
return SelectSVEAddSubImm(N, VT, Imm, Shift);
}
- template<MVT::SimpleValueType VT>
+ template <MVT::SimpleValueType VT, bool Invert = false>
bool SelectSVELogicalImm(SDValue N, SDValue &Imm) {
- return SelectSVELogicalImm(N, VT, Imm);
+ return SelectSVELogicalImm(N, VT, Imm, Invert);
}
template <MVT::SimpleValueType VT>
@@ -223,6 +241,22 @@
return false;
}
+ template <signed Max, signed Scale>
+ bool SelectEXTImm(SDValue N, SDValue &Imm) {
+ if (!isa<ConstantSDNode>(N))
+ return false;
+
+ int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
+
+ if (MulImm >= 0 && MulImm <= Max) {
+ MulImm *= Scale;
+ Imm = CurDAG->getTargetConstant(MulImm, SDLoc(N), MVT::i32);
+ return true;
+ }
+
+ return false;
+ }
+
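
SelectEXTImm above only accepts constants in [0, Max] and multiplies the accepted value by Scale before emitting it as the target immediate. A minimal standalone sketch of the same check; the template below is illustrative, not the selector itself.

#include <cstdint>
#include <optional>

// Mirrors SelectEXTImm<Max, Scale>: constants in [0, Max] are accepted and the
// emitted immediate is the input scaled by Scale (e.g. an element count turned
// into a byte offset). Anything else falls through to other patterns.
template <int64_t Max, int64_t Scale>
std::optional<int64_t> selectEXTImm(int64_t MulImm) {
  if (MulImm >= 0 && MulImm <= Max)
    return MulImm * Scale;
  return std::nullopt;
}

int main() {
  auto A = selectEXTImm<31, 2>(7);  // accepted: 7 * 2 = 14
  auto B = selectEXTImm<31, 2>(40); // rejected: out of range
  return (A && *A == 14 && !B) ? 0 : 1;
}
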
/// Form sequences of consecutive 64/128-bit registers for use in NEON
/// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
/// between 1 and 4 elements. If it contains a single element that is returned
@@ -326,7 +360,7 @@
bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
- bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm);
+ bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm, bool Invert);
bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
bool SelectSVEShiftImm(SDValue N, uint64_t Low, uint64_t High,
@@ -335,6 +369,8 @@
bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
SDValue &Offset);
+
+ bool SelectAllActivePredicate(SDValue N);
};
} // end anonymous namespace
@@ -369,6 +405,7 @@
default:
llvm_unreachable("Unexpected asm memory constraint");
case InlineAsm::Constraint_m:
+ case InlineAsm::Constraint_o:
case InlineAsm::Constraint_Q:
// We need to make sure that this one operand does not end up in XZR, thus
// require the address to be in a PointerRegClass register.
@@ -816,7 +853,7 @@
// ldar and stlr have much more restrictive addressing modes (just a
// register).
- if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getOrdering()))
+ if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getSuccessOrdering()))
return false;
}
@@ -1339,6 +1376,11 @@
SDValue Ops[] = { Base, Offset, Chain };
SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
MVT::Other, Ops);
+
+ // Transfer memoperands.
+ MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(Res), {MemOp});
+
// Either way, we're replacing the node, so tell the caller that.
SDValue LoadedVal = SDValue(Res, 1);
if (InsertTo64) {
@@ -2298,10 +2340,10 @@
case AArch64::ORRWrs:
case AArch64::ORRXrs:
- if (UserNode->getOperand(1) != Orig)
- return;
- return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
- Depth);
+ if (UserNode->getOperand(0) != Orig && UserNode->getOperand(1) == Orig)
+ getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
+ Depth);
+ return;
case AArch64::BFMWri:
case AArch64::BFMXri:
return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
@@ -2910,6 +2952,7 @@
assert(AllIntFields &&
"Unexpected non-integer value in special register string.");
+ (void)AllIntFields;
// Need to combine the integer fields of the string into a single value
// based on the bit encoding of MRS/MSR instruction.
@@ -3092,20 +3135,32 @@
bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
- const int64_t ImmVal = CNode->getZExtValue();
+ const int64_t ImmVal = CNode->getSExtValue();
SDLoc DL(N);
switch (VT.SimpleTy) {
case MVT::i8:
+ // Can always select i8s, no shift, mask the immediate value to
+ // deal with sign-extended value from lowering.
+ Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(ImmVal & 0xFF, DL, MVT::i32);
+ return true;
+ case MVT::i16:
+ // i16 values get sign-extended to 32-bits during lowering.
if ((ImmVal & 0xFF) == ImmVal) {
Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
+ } else if ((ImmVal & 0xFF) == 0) {
+ assert((ImmVal >= -32768) && (ImmVal <= 32512));
+ Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant((ImmVal >> 8) & 0xFF, DL, MVT::i32);
+ return true;
}
break;
- case MVT::i16:
case MVT::i32:
case MVT::i64:
+ // Range of immediate won't trigger signedness problems for 32/64b.
if ((ImmVal & 0xFF) == ImmVal) {
Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
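
The reworked SelectSVEAddSubImm above accepts i8 immediates directly and, for i16, either a plain 8-bit value (shift 0) or a value of the form 0xNN00 encoded with shift 8. A standalone sketch of the i16 encoding decision follows; the helper name is invented for this sketch.

#include <cstdint>
#include <optional>
#include <utility>

// Returns (shift, imm8) for the i16 case, or nullopt if the value cannot be
// encoded: plain 8-bit values use shift 0, values with a zero low byte use
// shift 8.
std::optional<std::pair<unsigned, uint8_t>> encodeSVEAddSubImm16(int64_t ImmVal) {
  if ((ImmVal & 0xFF) == ImmVal)
    return std::make_pair(0u, static_cast<uint8_t>(ImmVal));
  if ((ImmVal & 0xFF) == 0 && ImmVal >= -32768 && ImmVal <= 32512)
    return std::make_pair(8u, static_cast<uint8_t>((ImmVal >> 8) & 0xFF));
  return std::nullopt;
}

int main() {
  auto A = encodeSVEAddSubImm16(0x2A);   // shift 0, imm 0x2A
  auto B = encodeSVEAddSubImm16(0x1200); // shift 8, imm 0x12
  auto C = encodeSVEAddSubImm16(0x1234); // not encodable
  return (A && A->first == 0 && B && B->first == 8 && !C) ? 0 : 1;
}
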
@@ -3164,32 +3219,36 @@
return false;
}
-bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) {
+bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm,
+ bool Invert) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
uint64_t ImmVal = CNode->getZExtValue();
SDLoc DL(N);
+ if (Invert)
+ ImmVal = ~ImmVal;
+
// Shift mask depending on type size.
switch (VT.SimpleTy) {
- case MVT::i8:
- ImmVal &= 0xFF;
- ImmVal |= ImmVal << 8;
- ImmVal |= ImmVal << 16;
- ImmVal |= ImmVal << 32;
- break;
- case MVT::i16:
- ImmVal &= 0xFFFF;
- ImmVal |= ImmVal << 16;
- ImmVal |= ImmVal << 32;
- break;
- case MVT::i32:
- ImmVal &= 0xFFFFFFFF;
- ImmVal |= ImmVal << 32;
- break;
- case MVT::i64:
- break;
- default:
- llvm_unreachable("Unexpected type");
+ case MVT::i8:
+ ImmVal &= 0xFF;
+ ImmVal |= ImmVal << 8;
+ ImmVal |= ImmVal << 16;
+ ImmVal |= ImmVal << 32;
+ break;
+ case MVT::i16:
+ ImmVal &= 0xFFFF;
+ ImmVal |= ImmVal << 16;
+ ImmVal |= ImmVal << 32;
+ break;
+ case MVT::i32:
+ ImmVal &= 0xFFFFFFFF;
+ ImmVal |= ImmVal << 32;
+ break;
+ case MVT::i64:
+ break;
+ default:
+ llvm_unreachable("Unexpected type");
}
uint64_t encoding;
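
SelectSVELogicalImm now optionally inverts the value and then replicates the element across all 64 bits before asking the bitmask-immediate encoder whether the result is representable. A standalone sketch of just the replication step follows (the encoder itself is not reproduced; the function name is invented).

#include <cstdint>

// Replicate an element-sized value across a 64-bit word. ElemBits is 8, 16,
// 32 or 64.
uint64_t replicateToI64(uint64_t ImmVal, unsigned ElemBits) {
  if (ElemBits >= 64)
    return ImmVal;
  ImmVal &= (UINT64_C(1) << ElemBits) - 1;
  for (unsigned Shift = ElemBits; Shift < 64; Shift *= 2)
    ImmVal |= ImmVal << Shift;
  return ImmVal;
}

int main() {
  // 0xF0 as an i8 element becomes 0xF0F0F0F0F0F0F0F0.
  return replicateToI64(0xF0, 8) == UINT64_C(0xF0F0F0F0F0F0F0F0) ? 0 : 1;
}
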
@@ -3881,6 +3940,18 @@
if (tryMULLV64LaneV128(IntNo, Node))
return;
break;
+ case Intrinsic::swift_async_context_addr: {
+ SDLoc DL(Node);
+ CurDAG->SelectNodeTo(Node, AArch64::SUBXri, MVT::i64,
+ CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+ AArch64::FP, MVT::i64),
+ CurDAG->getTargetConstant(8, DL, MVT::i32),
+ CurDAG->getTargetConstant(0, DL, MVT::i32));
+ auto &MF = CurDAG->getMachineFunction();
+ MF.getFrameInfo().setFrameAddressIsTaken(true);
+ MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
+ return;
+ }
}
break;
}
@@ -4963,6 +5034,24 @@
return true;
}
+ if (auto C = dyn_cast<ConstantSDNode>(RHS)) {
+ int64_t ImmOff = C->getSExtValue();
+ unsigned Size = 1 << Scale;
+
+ // To use the reg+reg addressing mode, the immediate must be a multiple of
+ // the vector element's byte size.
+ if (ImmOff % Size)
+ return false;
+
+ SDLoc DL(N);
+ Base = LHS;
+ Offset = CurDAG->getTargetConstant(ImmOff >> Scale, DL, MVT::i64);
+ SDValue Ops[] = {Offset};
+ SDNode *MI = CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
+ Offset = SDValue(MI, 0);
+ return true;
+ }
+
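
The new constant-offset path above folds an immediate into the scaled reg+reg SVE addressing mode only when it is a multiple of the element size, and then materializes the element index ImmOff >> Scale into a register. A standalone sketch of that computation, with an invented helper name:

#include <cstdint>
#include <optional>

// For a scaled reg+reg addressing mode (element size 1 << Scale bytes), a
// constant byte offset is usable only if it is a multiple of the element size;
// the register operand then holds the element index ImmOff >> Scale.
std::optional<int64_t> regRegIndexForConstOffset(int64_t ImmOff, unsigned Scale) {
  int64_t Size = int64_t(1) << Scale;
  if (ImmOff % Size)
    return std::nullopt;  // not a multiple of the element size
  return ImmOff >> Scale; // element index to materialize in a register
}

int main() {
  auto A = regRegIndexForConstOffset(32, 3); // 8-byte elements: index 4
  auto B = regRegIndexForConstOffset(12, 3); // not a multiple of 8: rejected
  return (A && *A == 4 && !B) ? 0 : 1;
}
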
// Check if the RHS is a shift node with a constant.
if (RHS.getOpcode() != ISD::SHL)
return false;
@@ -4977,3 +5066,10 @@
return false;
}
+
+bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
+ const AArch64TargetLowering *TLI =
+ static_cast<const AArch64TargetLowering *>(getTargetLowering());
+
+ return TLI->isAllActivePredicate(N);
+}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c522ee7..b27a02b 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29,7 +29,9 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -244,6 +246,12 @@
addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
+ if (Subtarget->hasLS64()) {
+ addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
+ setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
+ setOperationAction(ISD::STORE, MVT::i64x8, Custom);
+ }
+
if (Subtarget->hasFPARMv8()) {
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
@@ -343,6 +351,18 @@
setCondCodeAction(ISD::SETUGT, VT, Expand);
setCondCodeAction(ISD::SETUEQ, VT, Expand);
setCondCodeAction(ISD::SETUNE, VT, Expand);
+
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
}
}
@@ -458,6 +478,11 @@
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
+ setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
+
// Variable arguments.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Custom);
@@ -604,6 +629,7 @@
setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
setOperationAction(ISD::FRINT, MVT::f16, Promote);
setOperationAction(ISD::FROUND, MVT::f16, Promote);
+ setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote);
setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
@@ -623,6 +649,7 @@
setOperationAction(ISD::FABS, MVT::v4f16, Expand);
setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
+ setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand);
setOperationAction(ISD::FMA, MVT::v4f16, Expand);
setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
@@ -647,6 +674,7 @@
setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
+ setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand);
setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
@@ -666,6 +694,7 @@
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
+ setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
setOperationAction(ISD::FMINNUM, Ty, Legal);
setOperationAction(ISD::FMAXNUM, Ty, Legal);
setOperationAction(ISD::FMINIMUM, Ty, Legal);
@@ -683,6 +712,7 @@
setOperationAction(ISD::FRINT, MVT::f16, Legal);
setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
setOperationAction(ISD::FROUND, MVT::f16, Legal);
+ setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
@@ -692,6 +722,7 @@
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+ setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
@@ -857,15 +888,20 @@
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
+ // TODO: Do the same for FP_TO_*INT_SAT.
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
setTargetDAGCombine(ISD::FDIV);
+ // Try and combine setcc with csel
+ setTargetDAGCombine(ISD::SETCC);
+
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::VECTOR_SPLICE);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
@@ -873,9 +909,6 @@
if (Subtarget->supportsAddressTopByteIgnored())
setTargetDAGCombine(ISD::LOAD);
- setTargetDAGCombine(ISD::MGATHER);
- setTargetDAGCombine(ISD::MSCATTER);
-
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT);
@@ -886,6 +919,7 @@
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::VECREDUCE_ADD);
+ setTargetDAGCombine(ISD::STEP_VECTOR);
setTargetDAGCombine(ISD::GlobalAddress);
@@ -944,6 +978,7 @@
setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
setOperationAction(ISD::FREM, MVT::v1f64, Expand);
setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
+ setOperationAction(ISD::FROUNDEVEN, MVT::v1f64, Expand);
setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
@@ -968,9 +1003,11 @@
// elements smaller than i32, so promote the input to i32 first.
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
- // i8 vector elements also need promotion to i32 for v8i8
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
+ setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
+ setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
+
// Similarly, there is no direct i32 -> f64 vector conversion instruction.
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
@@ -997,6 +1034,12 @@
setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
+ setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
// AArch64 doesn't have MUL.2d:
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
@@ -1014,6 +1057,12 @@
setOperationAction(ISD::USUBSAT, VT, Legal);
}
+ for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
+ MVT::v4i32}) {
+ setOperationAction(ISD::ABDS, VT, Legal);
+ setOperationAction(ISD::ABDU, VT, Legal);
+ }
+
// Vector reductions
for (MVT VT : { MVT::v4f16, MVT::v2f32,
MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
@@ -1070,6 +1119,7 @@
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
+ setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
}
if (Subtarget->hasFullFP16()) {
@@ -1080,6 +1130,7 @@
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
+ setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
}
}
@@ -1087,12 +1138,16 @@
setOperationAction(ISD::VSCALE, MVT::i32, Custom);
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
}
if (Subtarget->hasSVE()) {
- // FIXME: Add custom lowering of MLOAD to handle different passthrus (not a
- // splat of 0 or undef) once vector selects supported in SVE codegen. See
- // D68877 for more details.
for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
setOperationAction(ISD::BITREVERSE, VT, Custom);
setOperationAction(ISD::BSWAP, VT, Custom);
@@ -1106,9 +1161,14 @@
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::MULHS, VT, Custom);
+ setOperationAction(ISD::MULHU, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
setOperationAction(ISD::SMIN, VT, Custom);
@@ -1127,6 +1187,12 @@
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
}
// Illegal unpacked integer vector types.
@@ -1135,6 +1201,11 @@
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
}
+ // Legalize unpacked bitcasts to REINTERPRET_CAST.
+ for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
+ MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
+ setOperationAction(ISD::BITCAST, VT, Custom);
+
for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
@@ -1145,6 +1216,10 @@
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+
// There are no legal MVT::nxv16f## based types.
if (VT != MVT::nxv16i1) {
setOperationAction(ISD::SINT_TO_FP, VT, Custom);
@@ -1152,18 +1227,50 @@
}
}
+ // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
+ for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
+ MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
+ MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Custom);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
+
+ for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
+ for (MVT InnerVT : MVT::fp_scalable_vector_valuetypes()) {
+ // Avoid marking truncating FP stores as legal to prevent the
+ // DAGCombiner from creating unsupported truncating stores.
+ setTruncStoreAction(VT, InnerVT, Expand);
+ // SVE does not have floating-point extending loads.
+ setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+ }
+ }
+
+ // SVE supports truncating stores of 64 and 128-bit vectors
+ setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+
for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
MVT::nxv4f32, MVT::nxv2f64}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::FDIV, VT, Custom);
setOperationAction(ISD::FMA, VT, Custom);
+ setOperationAction(ISD::FMAXIMUM, VT, Custom);
setOperationAction(ISD::FMAXNUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUM, VT, Custom);
setOperationAction(ISD::FMINNUM, VT, Custom);
setOperationAction(ISD::FMUL, VT, Custom);
setOperationAction(ISD::FNEG, VT, Custom);
@@ -1183,12 +1290,16 @@
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+ setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
+
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
}
for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Custom);
}
setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom);
@@ -1215,7 +1326,7 @@
for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
setOperationAction(ISD::TRUNCATE, VT, Custom);
for (auto VT : {MVT::v8f16, MVT::v4f32})
- setOperationAction(ISD::FP_ROUND, VT, Expand);
+ setOperationAction(ISD::FP_ROUND, VT, Custom);
// These operations are not supported on NEON but SVE can do them.
setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
@@ -1224,6 +1335,10 @@
setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
setOperationAction(ISD::MUL, MVT::v1i64, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
+ setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
+ setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
+ setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
setOperationAction(ISD::SDIV, MVT::v16i8, Custom);
setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
@@ -1272,12 +1387,17 @@
for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
}
+
+ setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
+ setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
+ setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
+ setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
-void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
+void AArch64TargetLowering::addTypeForNEON(MVT VT) {
assert(VT.isVector() && "VT should be a vector type");
if (VT.isFloatingPoint()) {
@@ -1296,11 +1416,13 @@
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
-
- // But we do support custom-lowering for FCOPYSIGN.
- setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
+ // But we do support custom-lowering for FCOPYSIGN.
+ if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+ ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
@@ -1367,48 +1489,93 @@
// We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ if (VT.isFloatingPoint()) {
+ setCondCodeAction(ISD::SETO, VT, Expand);
+ setCondCodeAction(ISD::SETOLT, VT, Expand);
+ setCondCodeAction(ISD::SETLT, VT, Expand);
+ setCondCodeAction(ISD::SETOLE, VT, Expand);
+ setCondCodeAction(ISD::SETLE, VT, Expand);
+ setCondCodeAction(ISD::SETULT, VT, Expand);
+ setCondCodeAction(ISD::SETULE, VT, Expand);
+ setCondCodeAction(ISD::SETUGE, VT, Expand);
+ setCondCodeAction(ISD::SETUGT, VT, Expand);
+ setCondCodeAction(ISD::SETUEQ, VT, Expand);
+ setCondCodeAction(ISD::SETUNE, VT, Expand);
+ }
+
+ // Mark integer truncating stores as having custom lowering
+ if (VT.isInteger()) {
+ MVT InnerVT = VT.changeVectorElementType(MVT::i8);
+ while (InnerVT != VT) {
+ setTruncStoreAction(VT, InnerVT, Custom);
+ InnerVT = InnerVT.changeVectorElementType(
+ MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
+ }
+ }
+
// Lower fixed length vector operations to scalable equivalents.
setOperationAction(ISD::ABS, VT, Custom);
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::AND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ setOperationAction(ISD::BITCAST, VT, Custom);
setOperationAction(ISD::BITREVERSE, VT, Custom);
setOperationAction(ISD::BSWAP, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::FCEIL, VT, Custom);
setOperationAction(ISD::FDIV, VT, Custom);
setOperationAction(ISD::FFLOOR, VT, Custom);
setOperationAction(ISD::FMA, VT, Custom);
+ setOperationAction(ISD::FMAXIMUM, VT, Custom);
setOperationAction(ISD::FMAXNUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUM, VT, Custom);
setOperationAction(ISD::FMINNUM, VT, Custom);
setOperationAction(ISD::FMUL, VT, Custom);
setOperationAction(ISD::FNEARBYINT, VT, Custom);
setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FP_EXTEND, VT, Custom);
+ setOperationAction(ISD::FP_ROUND, VT, Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::FRINT, VT, Custom);
setOperationAction(ISD::FROUND, VT, Custom);
+ setOperationAction(ISD::FROUNDEVEN, VT, Custom);
setOperationAction(ISD::FSQRT, VT, Custom);
setOperationAction(ISD::FSUB, VT, Custom);
setOperationAction(ISD::FTRUNC, VT, Custom);
setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::MULHS, VT, Custom);
+ setOperationAction(ISD::MULHU, VT, Custom);
setOperationAction(ISD::OR, VT, Custom);
setOperationAction(ISD::SDIV, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
+ setOperationAction(ISD::SINT_TO_FP, VT, Custom);
setOperationAction(ISD::SMAX, VT, Custom);
setOperationAction(ISD::SMIN, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
+ setOperationAction(ISD::UINT_TO_FP, VT, Custom);
setOperationAction(ISD::UMAX, VT, Custom);
setOperationAction(ISD::UMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
@@ -1418,11 +1585,13 @@
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::XOR, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
@@ -1430,12 +1599,12 @@
void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR64RegClass);
- addTypeForNEON(VT, MVT::v2i32);
+ addTypeForNEON(VT);
}
void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR128RegClass);
- addTypeForNEON(VT, MVT::v4i32);
+ addTypeForNEON(VT);
}
EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
@@ -1660,7 +1829,7 @@
}
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Subtarget->requiresStrictAlign())
return false;
@@ -1674,7 +1843,7 @@
// Code that uses clang vector extensions can mark that it
// wants unaligned accesses to be treated as fast by
// underspecifying alignment to be 1 or 2.
- Align <= 2 ||
+ Alignment <= 2 ||
// Disregard v2i64. Memcpy lowering produces those and splitting
// them regresses performance on micro-benchmarks and olden/bh.
@@ -1704,7 +1873,7 @@
// Disregard v2i64. Memcpy lowering produces those and splitting
// them regresses performance on micro-benchmarks and olden/bh.
- Ty == LLT::vector(2, 64);
+ Ty == LLT::fixed_vector(2, 64);
}
return true;
}
@@ -1730,7 +1899,6 @@
MAKE_CASE(AArch64ISD::RET_FLAG)
MAKE_CASE(AArch64ISD::BRCOND)
MAKE_CASE(AArch64ISD::CSEL)
- MAKE_CASE(AArch64ISD::FCSEL)
MAKE_CASE(AArch64ISD::CSINV)
MAKE_CASE(AArch64ISD::CSNEG)
MAKE_CASE(AArch64ISD::CSINC)
@@ -1738,6 +1906,8 @@
MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
MAKE_CASE(AArch64ISD::ADD_PRED)
MAKE_CASE(AArch64ISD::MUL_PRED)
+ MAKE_CASE(AArch64ISD::MULHS_PRED)
+ MAKE_CASE(AArch64ISD::MULHU_PRED)
MAKE_CASE(AArch64ISD::SDIV_PRED)
MAKE_CASE(AArch64ISD::SHL_PRED)
MAKE_CASE(AArch64ISD::SMAX_PRED)
@@ -1798,7 +1968,6 @@
MAKE_CASE(AArch64ISD::BICi)
MAKE_CASE(AArch64ISD::ORRi)
MAKE_CASE(AArch64ISD::BSP)
- MAKE_CASE(AArch64ISD::NEG)
MAKE_CASE(AArch64ISD::EXTR)
MAKE_CASE(AArch64ISD::ZIP1)
MAKE_CASE(AArch64ISD::ZIP2)
@@ -1810,6 +1979,7 @@
MAKE_CASE(AArch64ISD::REV32)
MAKE_CASE(AArch64ISD::REV64)
MAKE_CASE(AArch64ISD::EXT)
+ MAKE_CASE(AArch64ISD::SPLICE)
MAKE_CASE(AArch64ISD::VSHL)
MAKE_CASE(AArch64ISD::VLSHR)
MAKE_CASE(AArch64ISD::VASHR)
@@ -1839,6 +2009,8 @@
MAKE_CASE(AArch64ISD::URHADD)
MAKE_CASE(AArch64ISD::SHADD)
MAKE_CASE(AArch64ISD::UHADD)
+ MAKE_CASE(AArch64ISD::SDOT)
+ MAKE_CASE(AArch64ISD::UDOT)
MAKE_CASE(AArch64ISD::SMINV)
MAKE_CASE(AArch64ISD::UMINV)
MAKE_CASE(AArch64ISD::SMAXV)
@@ -1856,22 +2028,26 @@
MAKE_CASE(AArch64ISD::CLASTB_N)
MAKE_CASE(AArch64ISD::LASTA)
MAKE_CASE(AArch64ISD::LASTB)
- MAKE_CASE(AArch64ISD::REV)
MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
+ MAKE_CASE(AArch64ISD::LS64_BUILD)
+ MAKE_CASE(AArch64ISD::LS64_EXTRACT)
MAKE_CASE(AArch64ISD::TBL)
MAKE_CASE(AArch64ISD::FADD_PRED)
MAKE_CASE(AArch64ISD::FADDA_PRED)
MAKE_CASE(AArch64ISD::FADDV_PRED)
MAKE_CASE(AArch64ISD::FDIV_PRED)
MAKE_CASE(AArch64ISD::FMA_PRED)
+ MAKE_CASE(AArch64ISD::FMAX_PRED)
MAKE_CASE(AArch64ISD::FMAXV_PRED)
MAKE_CASE(AArch64ISD::FMAXNM_PRED)
MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
+ MAKE_CASE(AArch64ISD::FMIN_PRED)
MAKE_CASE(AArch64ISD::FMINV_PRED)
MAKE_CASE(AArch64ISD::FMINNM_PRED)
MAKE_CASE(AArch64ISD::FMINNMV_PRED)
MAKE_CASE(AArch64ISD::FMUL_PRED)
MAKE_CASE(AArch64ISD::FSUB_PRED)
+ MAKE_CASE(AArch64ISD::BIC)
MAKE_CASE(AArch64ISD::BIT)
MAKE_CASE(AArch64ISD::CBZ)
MAKE_CASE(AArch64ISD::CBNZ)
@@ -1882,6 +2058,7 @@
MAKE_CASE(AArch64ISD::SITOF)
MAKE_CASE(AArch64ISD::UITOF)
MAKE_CASE(AArch64ISD::NVCAST)
+ MAKE_CASE(AArch64ISD::MRS)
MAKE_CASE(AArch64ISD::SQSHL_I)
MAKE_CASE(AArch64ISD::UQSHL_I)
MAKE_CASE(AArch64ISD::SRSHR_I)
@@ -1989,8 +2166,7 @@
MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::INDEX_VECTOR)
- MAKE_CASE(AArch64ISD::UABD)
- MAKE_CASE(AArch64ISD::SABD)
+ MAKE_CASE(AArch64ISD::UADDLP)
MAKE_CASE(AArch64ISD::CALL_RVMARKER)
}
#undef MAKE_CASE
@@ -2095,6 +2271,31 @@
// Lowering Code
//===----------------------------------------------------------------------===//
+// Forward declarations of SVE fixed length lowering helpers
+static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
+static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
+static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
+static SDValue convertFixedMaskToScalableVector(SDValue Mask,
+ SelectionDAG &DAG);
+
+/// isZerosVector - Check whether SDNode N is a zero-filled vector.
+static bool isZerosVector(const SDNode *N) {
+ // Look through a bit convert.
+ while (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0).getNode();
+
+ if (ISD::isConstantSplatVectorAllZeros(N))
+ return true;
+
+ if (N->getOpcode() != AArch64ISD::DUP)
+ return false;
+
+ auto Opnd0 = N->getOperand(0);
+ auto *CINT = dyn_cast<ConstantSDNode>(Opnd0);
+ auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0);
+ return (CINT && CINT->isNullValue()) || (CFP && CFP->isZero());
+}
+
/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
@@ -2824,50 +3025,25 @@
CC = AArch64CC::NE;
bool IsSigned = Op.getOpcode() == ISD::SMULO;
if (Op.getValueType() == MVT::i32) {
+ // Extend to 64-bits, then perform a 64-bit multiply.
unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- // For a 32 bit multiply with overflow check we want the instruction
- // selector to generate a widening multiply (SMADDL/UMADDL). For that we
- // need to generate the following pattern:
- // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
- SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
- DAG.getConstant(0, DL, MVT::i64));
- // On AArch64 the upper 32 bits are always zero extended for a 32 bit
- // operation. We need to clear out the upper 32 bits, because we used a
- // widening multiply that wrote all 64 bits. In the end this should be a
- // noop.
- Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
+ Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
+
+ // Check that the result fits into a 32-bit integer.
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
if (IsSigned) {
- // The signed overflow check requires more than just a simple check for
- // any bit set in the upper 32 bits of the result. These bits could be
- // just the sign bits of a negative number. To perform the overflow
- // check we have to arithmetic shift right the 32nd bit of the result by
- // 31 bits. Then we compare the result to the upper 32 bits.
- SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
- DAG.getConstant(32, DL, MVT::i64));
- UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
- SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
- DAG.getConstant(31, DL, MVT::i64));
- // It is important that LowerBits is last, otherwise the arithmetic
- // shift will not be folded into the compare (SUBS).
- SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
- Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
- .getValue(1);
- } else {
- // The overflow check for unsigned multiply is easy. We only need to
- // check if any of the upper 32 bits are set. This can be done with a
- // CMP (shifted register). For that we need to generate the following
- // pattern:
- // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
- SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
- DAG.getConstant(32, DL, MVT::i64));
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ // cmp xreg, wreg, sxtw
+ SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
Overflow =
- DAG.getNode(AArch64ISD::SUBS, DL, VTs,
- DAG.getConstant(0, DL, MVT::i64),
- UpperBits).getValue(1);
+ DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
+ } else {
+ // tst xreg, #0xffffffff00000000
+ SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
+ Overflow =
+ DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
}
break;
}
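
The rewritten 32-bit SMULO/UMULO lowering above performs one 64-bit multiply and then checks whether the product still fits in 32 bits: by comparing against the sign-extension of the truncated result for the signed case, and by testing the upper 32 bits for the unsigned case. A standalone sketch of both checks (function names are invented for this sketch):

#include <cstdint>

// Signed case: overflow iff the 64-bit product differs from the sign-extension
// of its low 32 bits (the "cmp xreg, wreg, sxtw" above).
bool smulOverflows(int32_t A, int32_t B) {
  int64_t Mul = int64_t(A) * int64_t(B);
  return Mul != int64_t(int32_t(Mul));
}

// Unsigned case: overflow iff any of the upper 32 bits of the product are set
// (the "tst xreg, #0xffffffff00000000" above).
bool umulOverflows(uint32_t A, uint32_t B) {
  uint64_t Mul = uint64_t(A) * uint64_t(B);
  return (Mul & UINT64_C(0xFFFFFFFF00000000)) != 0;
}

int main() {
  bool Ok = !smulOverflows(46341, 46340) && // 2147441940 fits in i32
            smulOverflows(46341, 46341) &&  // 2147488281 exceeds INT32_MAX
            !umulOverflows(65535, 65537) && // 4294967295 == UINT32_MAX
            umulOverflows(65536, 65536);    // 4294967296 needs 33 bits
  return Ok ? 0 : 1;
}
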
@@ -3083,9 +3259,13 @@
SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
- if (Op.getValueType().isScalableVector())
+ EVT VT = Op.getValueType();
+ if (VT.isScalableVector())
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
+ if (useSVEForFixedLengthVectorVT(VT))
+ return LowerFixedLengthFPExtendToSVE(Op, DAG);
+
assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
return SDValue();
}
@@ -3099,6 +3279,9 @@
SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
EVT SrcVT = SrcVal.getValueType();
+ if (useSVEForFixedLengthVectorVT(SrcVT))
+ return LowerFixedLengthFPRoundToSVE(Op, DAG);
+
if (SrcVT != MVT::f128) {
// Expand cases where the input is a vector bigger than NEON.
if (useSVEForFixedLengthVectorVT(SrcVT))
@@ -3126,6 +3309,9 @@
return LowerToPredicatedOp(Op, DAG, Opcode);
}
+ if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
+ return LowerFixedLengthFPToIntToSVE(Op, DAG);
+
unsigned NumElts = InVT.getVectorNumElements();
// f16 conversions are promoted to f32 when full fp16 is not supported.
@@ -3186,6 +3372,44 @@
return SDValue();
}
+SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
+ SelectionDAG &DAG) const {
+ // AArch64 FP-to-int conversions saturate to the destination register size, so
+ // we can lower common saturating conversions to simple instructions.
+ SDValue SrcVal = Op.getOperand(0);
+
+ EVT SrcVT = SrcVal.getValueType();
+ EVT DstVT = Op.getValueType();
+
+ EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ uint64_t SatWidth = SatVT.getScalarSizeInBits();
+ uint64_t DstWidth = DstVT.getScalarSizeInBits();
+ assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
+
+ // TODO: Support lowering of NEON and SVE conversions.
+ if (SrcVT.isVector())
+ return SDValue();
+
+ // TODO: Saturate to SatWidth explicitly.
+ if (SatWidth != DstWidth)
+ return SDValue();
+
+ // In the absence of FP16 support, promote f32 to f16, like LowerFP_TO_INT().
+ if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16())
+ return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
+ DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal),
+ Op.getOperand(1));
+
+ // Cases that we can emit directly.
+ if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
+ (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
+ (DstVT == MVT::i64 || DstVT == MVT::i32))
+ return Op;
+
+ // For all other cases, fall back on the expanded form.
+ return SDValue();
+}
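
LowerFP_TO_INT_SAT above relies on AArch64's FCVTZS/FCVTZU already clamping out-of-range inputs to the limits of the destination register, so a saturating conversion whose saturation width equals the destination width can be emitted as the plain conversion. A standalone sketch of the reference semantics being matched (scalar f64 to i32, NaN mapping to 0; the function name is invented):

#include <cmath>
#include <cstdint>
#include <limits>

// Reference semantics for a saturating f64 -> i32 conversion: NaN -> 0,
// otherwise truncate toward zero and clamp to [INT32_MIN, INT32_MAX].
int32_t fptosiSat32(double X) {
  if (std::isnan(X))
    return 0;
  double T = std::trunc(X);
  if (T <= double(std::numeric_limits<int32_t>::min()))
    return std::numeric_limits<int32_t>::min();
  if (T >= double(std::numeric_limits<int32_t>::max()))
    return std::numeric_limits<int32_t>::max();
  return int32_t(T);
}

int main() {
  bool Ok = fptosiSat32(1e12) == INT32_MAX && fptosiSat32(-1e12) == INT32_MIN &&
            fptosiSat32(42.9) == 42 && fptosiSat32(std::nan("")) == 0;
  return Ok ? 0 : 1;
}
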
+
SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
@@ -3212,6 +3436,9 @@
return LowerToPredicatedOp(Op, DAG, Opcode);
}
+ if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
+ return LowerFixedLengthIntToFPToSVE(Op, DAG);
+
uint64_t VTSize = VT.getFixedSizeInBits();
uint64_t InVTSize = InVT.getFixedSizeInBits();
if (VTSize < InVTSize) {
@@ -3296,12 +3523,32 @@
return CallResult.first;
}
-static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
+static MVT getSVEContainerType(EVT ContentTy);
+
+SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
+ SelectionDAG &DAG) const {
EVT OpVT = Op.getValueType();
+ EVT ArgVT = Op.getOperand(0).getValueType();
+
+ if (useSVEForFixedLengthVectorVT(OpVT))
+ return LowerFixedLengthBitcastToSVE(Op, DAG);
+
+ if (OpVT.isScalableVector()) {
+ if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
+ assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
+ "Expected int->fp bitcast!");
+ SDValue ExtResult =
+ DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
+ Op.getOperand(0));
+ return getSVESafeBitCast(OpVT, ExtResult, DAG);
+ }
+ return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
+ }
+
if (OpVT != MVT::f16 && OpVT != MVT::bf16)
return SDValue();
- assert(Op.getOperand(0).getValueType() == MVT::i16);
+ assert(ArgVT == MVT::i16);
SDLoc DL(Op);
Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
@@ -3454,6 +3701,50 @@
return DAG.getMergeValues({AND, Chain}, dl);
}
+SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue RMValue = Op->getOperand(1);
+
+ // The rounding mode is in bits 23:22 of the FPCR.
+  // The mapping from the llvm.set.rounding argument to the rounding mode in
+  // FPCR is 0->3, 1->0, 2->1, 3->2. The formula used to implement this is
+  // ((arg - 1) & 3) << 22.
+  //
+  // The argument of llvm.set.rounding must be within the range [0, 3], so
+  // NearestTiesToAway (4) is not handled here. It is the responsibility of the
+  // code that generates llvm.set.rounding to ensure this condition.
+
+ // Calculate new value of FPCR[23:22].
+ RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
+ DAG.getConstant(1, DL, MVT::i32));
+ RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
+ DAG.getConstant(0x3, DL, MVT::i32));
+ RMValue =
+ DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
+ DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
+ RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
+
+ // Get current value of FPCR.
+ SDValue Ops[] = {
+ Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
+ SDValue FPCR =
+ DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
+ Chain = FPCR.getValue(1);
+ FPCR = FPCR.getValue(0);
+
+  // Put the new rounding mode into FPCR[23:22].
+ const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
+ FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
+ DAG.getConstant(RMMask, DL, MVT::i64));
+ FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
+ SDValue Ops2[] = {
+ Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
+ FPCR};
+ return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
+}
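
A standalone sketch of the FPCR update computed by LowerSET_ROUNDING above, assuming the FLT_ROUNDS-style argument encoding (0 = toward zero, 1 = nearest ties-to-even, 2 = toward +inf, 3 = toward -inf): the argument is remapped with ((arg - 1) & 3), shifted into bits 23:22, and merged into the previous FPCR value. The helper name is invented for this sketch.

#include <cstdint>

// FPCR.RMode occupies bits 23:22 (0b00 = nearest, 0b01 = +inf, 0b10 = -inf,
// 0b11 = toward zero).
constexpr unsigned RoundingBitsPos = 22;
constexpr uint64_t RoundingMask = UINT64_C(0x3) << RoundingBitsPos;

// The argument-to-RMode mapping 0->3, 1->0, 2->1, 3->2 is just ((arg - 1) & 3).
uint64_t setRoundingFPCR(uint64_t OldFPCR, uint32_t Arg) {
  uint64_t RMode = (uint64_t(Arg) - 1) & 0x3;
  return (OldFPCR & ~RoundingMask) | (RMode << RoundingBitsPos);
}

int main() {
  // Starting from round-to-nearest (0), switching to toward-zero sets 0b11.
  return setRoundingFPCR(0, /*TowardZero=*/0) == (UINT64_C(0x3) << 22) ? 0 : 1;
}
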
+
SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -3536,6 +3827,37 @@
DAG.getTargetConstant(Pattern, DL, MVT::i32));
}
+static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ EVT OutVT = Op.getValueType();
+ SDValue InOp = Op.getOperand(1);
+ EVT InVT = InOp.getValueType();
+
+ // Return the operand if the cast isn't changing type,
+ // i.e. <n x 16 x i1> -> <n x 16 x i1>
+ if (InVT == OutVT)
+ return InOp;
+
+ SDValue Reinterpret =
+ DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, InOp);
+
+ // If the argument converted to an svbool is a ptrue or a comparison, the
+ // lanes introduced by the widening are zero by construction.
+ switch (InOp.getOpcode()) {
+ case AArch64ISD::SETCC_MERGE_ZERO:
+ return Reinterpret;
+ case ISD::INTRINSIC_WO_CHAIN:
+ if (InOp.getConstantOperandVal(0) == Intrinsic::aarch64_sve_ptrue)
+ return Reinterpret;
+ }
+
+ // Otherwise, zero the newly introduced lanes.
+ SDValue Mask = getPTrue(DAG, DL, InVT, AArch64SVEPredPattern::all);
+ SDValue MaskReinterpret =
+ DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, Mask);
+ return DAG.getNode(ISD::AND, DL, OutVT, Reinterpret, MaskReinterpret);
+}
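
lowerConvertToSVBool above widens a narrower predicate to the full svbool type; the reinterpret alone is only safe when the producer (a ptrue or a compare) already guarantees that the widened-in lanes are zero, otherwise they are masked off. Below is a simplified standalone sketch of that decision; note that real SVE predicate registers interleave lanes rather than packing them into the low bits, so the bitmask model here is purely illustrative and the function name is invented.

#include <cstdint>

// Model a predicate as one bit per lane. Widening an N-lane predicate keeps
// the low N bits meaningful; the widened-in lanes must be zeroed unless the
// producer already guarantees zeros there.
uint16_t convertToSVBool(uint16_t Reinterpreted, unsigned SrcLanes,
                         bool KnownZeroUpperLanes) {
  if (KnownZeroUpperLanes)
    return Reinterpreted; // ptrue / compare: the reinterpret alone is enough
  uint16_t Mask = uint16_t((1u << SrcLanes) - 1);
  return uint16_t(Reinterpreted & Mask); // zero the widened-in lanes
}

int main() {
  uint16_t Raw = 0xAB5A; // an 8-lane predicate with junk in the upper lanes
  return convertToSVBool(Raw, 8, false) == 0x005A ? 0 : 1;
}
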
+
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -3597,7 +3919,7 @@
return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_rev:
- return DAG.getNode(AArch64ISD::REV, dl, Op.getValueType(),
+ return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
Op.getOperand(1));
case Intrinsic::aarch64_sve_tbl:
return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
@@ -3620,6 +3942,9 @@
case Intrinsic::aarch64_sve_zip2:
return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_splice:
+ return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::aarch64_sve_ptrue:
return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
Op.getOperand(1));
@@ -3639,6 +3964,8 @@
case Intrinsic::aarch64_sve_convert_from_svbool:
return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
Op.getOperand(1));
+ case Intrinsic::aarch64_sve_convert_to_svbool:
+ return lowerConvertToSVBool(Op, DAG);
case Intrinsic::aarch64_sve_fneg:
return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
@@ -3694,22 +4021,6 @@
case Intrinsic::aarch64_sve_neg:
return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
- case Intrinsic::aarch64_sve_convert_to_svbool: {
- EVT OutVT = Op.getValueType();
- EVT InVT = Op.getOperand(1).getValueType();
- // Return the operand if the cast isn't changing type,
- // i.e. <n x 16 x i1> -> <n x 16 x i1>
- if (InVT == OutVT)
- return Op.getOperand(1);
- // Otherwise, zero the newly introduced lanes.
- SDValue Reinterpret =
- DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1));
- SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all);
- SDValue MaskReinterpret =
- DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask);
- return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret);
- }
-
case Intrinsic::aarch64_sve_insr: {
SDValue Scalar = Op.getOperand(2);
EVT ScalarTy = Scalar.getValueType();
@@ -3814,18 +4125,40 @@
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
}
-
+ case Intrinsic::aarch64_neon_sabd:
case Intrinsic::aarch64_neon_uabd: {
- return DAG.getNode(AArch64ISD::UABD, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
+ unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU
+ : ISD::ABDS;
+ return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
}
- case Intrinsic::aarch64_neon_sabd: {
- return DAG.getNode(AArch64ISD::SABD, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_neon_uaddlp: {
+ unsigned Opcode = AArch64ISD::UADDLP;
+ return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
+ }
+ case Intrinsic::aarch64_neon_sdot:
+ case Intrinsic::aarch64_neon_udot:
+ case Intrinsic::aarch64_sve_sdot:
+ case Intrinsic::aarch64_sve_udot: {
+ unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
+ IntNo == Intrinsic::aarch64_sve_udot)
+ ? AArch64ISD::UDOT
+ : AArch64ISD::SDOT;
+ return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
}
}
}
+bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
+ if (VT.getVectorElementType() == MVT::i8 ||
+ VT.getVectorElementType() == MVT::i16) {
+ EltTy = MVT::i32;
+ return true;
+ }
+ return false;
+}
+
bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
if (VT.getVectorElementType() == MVT::i32 &&
VT.getVectorElementCount().getKnownMinValue() >= 4)
@@ -3939,6 +4272,12 @@
if (!isNullConstant(BasePtr))
return;
+ // FIXME: This will not match for fixed vector type codegen as the nodes in
+ // question will have fixed<->scalable conversions around them. This should be
+ // moved to a DAG combine or complex pattern so that it executes after all of
+ // the fixed vector inserts and extracts have been removed. This deficiency
+ // will result in a sub-optimal addressing mode being used, i.e. an ADD not
+ // being folded into the scatter/gather.
ConstantSDNode *Offset = nullptr;
if (Index.getOpcode() == ISD::ADD)
if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) {
@@ -3983,6 +4322,8 @@
MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
assert(MGT && "Can only custom lower gather load nodes");
+ bool IsFixedLength = MGT->getMemoryVT().isFixedLengthVector();
+
SDValue Index = MGT->getIndex();
SDValue Chain = MGT->getChain();
SDValue PassThru = MGT->getPassThru();
@@ -4001,6 +4342,7 @@
bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
EVT VT = PassThru.getSimpleValueType();
+ EVT IndexVT = Index.getSimpleValueType();
EVT MemVT = MGT->getMemoryVT();
SDValue InputVT = DAG.getValueType(MemVT);
@@ -4008,14 +4350,35 @@
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
return SDValue();
- // Handle FP data by using an integer gather and casting the result.
- if (VT.isFloatingPoint()) {
- EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount());
- PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG);
+ if (IsFixedLength) {
+ assert(Subtarget->useSVEForFixedLengthVectors() &&
+ "Cannot lower when not using SVE for fixed vectors");
+ if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
+ IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
+ MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
+ } else {
+ MemVT = getContainerForFixedLengthVector(DAG, MemVT);
+ IndexVT = MemVT.changeTypeToInteger();
+ }
+ InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
+ Mask = DAG.getNode(
+ ISD::ZERO_EXTEND, DL,
+ VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
+ }
+
+ if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
+ PassThru = SDValue();
+
+ if (VT.isFloatingPoint() && !IsFixedLength) {
+ // Handle FP data by using an integer gather and casting the result.
+ if (PassThru) {
+ EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount());
+ PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG);
+ }
InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
}
- SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);
+ SDVTList VTs = DAG.getVTList(IndexVT, MVT::Other);
if (getGatherScatterIndexIsExtended(Index))
Index = Index.getOperand(0);
@@ -4027,15 +4390,36 @@
if (ResNeedsSignExtend)
Opcode = getSignExtendedGatherOpcode(Opcode);
- SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
- SDValue Gather = DAG.getNode(Opcode, DL, VTs, Ops);
-
- if (VT.isFloatingPoint()) {
- SDValue Cast = getSVESafeBitCast(VT, Gather, DAG);
- return DAG.getMergeValues({Cast, Gather}, DL);
+ if (IsFixedLength) {
+ if (Index.getSimpleValueType().isFixedLengthVector())
+ Index = convertToScalableVector(DAG, IndexVT, Index);
+ if (BasePtr.getSimpleValueType().isFixedLengthVector())
+ BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
+ Mask = convertFixedMaskToScalableVector(Mask, DAG);
}
- return Gather;
+ SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT};
+ SDValue Result = DAG.getNode(Opcode, DL, VTs, Ops);
+ Chain = Result.getValue(1);
+
+ if (IsFixedLength) {
+ Result = convertFromScalableVector(
+ DAG, VT.changeVectorElementType(IndexVT.getVectorElementType()),
+ Result);
+ Result = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Result);
+ Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
+
+ if (PassThru)
+ Result = DAG.getSelect(DL, VT, MGT->getMask(), Result, PassThru);
+ } else {
+ if (PassThru)
+ Result = DAG.getSelect(DL, IndexVT, Mask, Result, PassThru);
+
+ if (VT.isFloatingPoint())
+ Result = getSVESafeBitCast(VT, Result, DAG);
+ }
+
+ return DAG.getMergeValues({Result, Chain}, DL);
}
SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
@@ -4044,6 +4428,8 @@
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
assert(MSC && "Can only custom lower scatter store nodes");
+ bool IsFixedLength = MSC->getMemoryVT().isFixedLengthVector();
+
SDValue Index = MSC->getIndex();
SDValue Chain = MSC->getChain();
SDValue StoreVal = MSC->getValue();
@@ -4060,6 +4446,7 @@
Index.getSimpleValueType().getVectorElementType() == MVT::i32;
EVT VT = StoreVal.getSimpleValueType();
+ EVT IndexVT = Index.getSimpleValueType();
SDVTList VTs = DAG.getVTList(MVT::Other);
EVT MemVT = MSC->getMemoryVT();
SDValue InputVT = DAG.getValueType(MemVT);
@@ -4068,8 +4455,29 @@
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
return SDValue();
- // Handle FP data by casting the data so an integer scatter can be used.
- if (VT.isFloatingPoint()) {
+ if (IsFixedLength) {
+ assert(Subtarget->useSVEForFixedLengthVectors() &&
+ "Cannot lower when not using SVE for fixed vectors");
+ if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
+ IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
+ MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
+ } else {
+ MemVT = getContainerForFixedLengthVector(DAG, MemVT);
+ IndexVT = MemVT.changeTypeToInteger();
+ }
+ InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
+
+ StoreVal =
+ DAG.getNode(ISD::BITCAST, DL, VT.changeTypeToInteger(), StoreVal);
+ StoreVal = DAG.getNode(
+ ISD::ANY_EXTEND, DL,
+ VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal);
+ StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal);
+ Mask = DAG.getNode(
+ ISD::ZERO_EXTEND, DL,
+ VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
+ } else if (VT.isFloatingPoint()) {
+ // Handle FP data by casting the data so an integer scatter can be used.
EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount());
StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG);
InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
@@ -4082,10 +4490,44 @@
selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
/*isGather=*/false, DAG);
+ if (IsFixedLength) {
+ if (Index.getSimpleValueType().isFixedLengthVector())
+ Index = convertToScalableVector(DAG, IndexVT, Index);
+ if (BasePtr.getSimpleValueType().isFixedLengthVector())
+ BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
+ Mask = convertFixedMaskToScalableVector(Mask, DAG);
+ }
+
SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
return DAG.getNode(Opcode, DL, VTs, Ops);
}
+SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
+ assert(LoadNode && "Expected custom lowering of a masked load node");
+ EVT VT = Op->getValueType(0);
+
+ if (useSVEForFixedLengthVectorVT(VT, true))
+ return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
+
+ SDValue PassThru = LoadNode->getPassThru();
+ SDValue Mask = LoadNode->getMask();
+
+ if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
+ return Op;
+
+ SDValue Load = DAG.getMaskedLoad(
+ VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
+ LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
+ LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
+ LoadNode->getExtensionType());
+
+ SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
+
+ return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
+}
+
// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
EVT VT, EVT MemVT,
@@ -4133,19 +4575,20 @@
EVT MemVT = StoreNode->getMemoryVT();
if (VT.isVector()) {
- if (useSVEForFixedLengthVectorVT(VT))
+ if (useSVEForFixedLengthVectorVT(VT, true))
return LowerFixedLengthVectorStoreToSVE(Op, DAG);
unsigned AS = StoreNode->getAddressSpace();
Align Alignment = StoreNode->getAlign();
if (Alignment < MemVT.getStoreSize() &&
- !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
StoreNode->getMemOperand()->getFlags(),
nullptr)) {
return scalarizeVectorStore(StoreNode, DAG);
}
- if (StoreNode->isTruncatingStore()) {
+ if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
+ MemVT == MVT::v4i8) {
return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
}
// 256 bit non-temporal stores can be lowered to STNP. Do this as part of
@@ -4186,11 +4629,79 @@
{StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
StoreNode->getMemoryVT(), StoreNode->getMemOperand());
return Result;
+ } else if (MemVT == MVT::i64x8) {
+ SDValue Value = StoreNode->getValue();
+ assert(Value->getValueType(0) == MVT::i64x8);
+ SDValue Chain = StoreNode->getChain();
+ SDValue Base = StoreNode->getBasePtr();
+ EVT PtrVT = Base.getValueType();
+ for (unsigned i = 0; i < 8; i++) {
+ SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
+ Value, DAG.getConstant(i, Dl, MVT::i32));
+ SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
+ DAG.getConstant(i * 8, Dl, PtrVT));
+ Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
+ StoreNode->getOriginalAlign());
+ }
+ return Chain;
}
return SDValue();
}
+SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+ assert(LoadNode && "Expected custom lowering of a load node");
+
+ if (LoadNode->getMemoryVT() == MVT::i64x8) {
+ SmallVector<SDValue, 8> Ops;
+ SDValue Base = LoadNode->getBasePtr();
+ SDValue Chain = LoadNode->getChain();
+ EVT PtrVT = Base.getValueType();
+ for (unsigned i = 0; i < 8; i++) {
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
+ DAG.getConstant(i * 8, DL, PtrVT));
+ SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
+ LoadNode->getPointerInfo(),
+ LoadNode->getOriginalAlign());
+ Ops.push_back(Part);
+ Chain = SDValue(Part.getNode(), 1);
+ }
+ SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
+ return DAG.getMergeValues({Loaded, Chain}, DL);
+ }
+
+ // Custom lowering for extending v4i8 vector loads.
+ EVT VT = Op->getValueType(0);
+ assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
+
+ if (LoadNode->getMemoryVT() != MVT::v4i8)
+ return SDValue();
+
+ unsigned ExtType;
+ if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
+ ExtType = ISD::SIGN_EXTEND;
+ else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
+ LoadNode->getExtensionType() == ISD::EXTLOAD)
+ ExtType = ISD::ZERO_EXTEND;
+ else
+ return SDValue();
+
+ SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
+ LoadNode->getBasePtr(), MachinePointerInfo());
+ SDValue Chain = Load.getValue(1);
+ SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
+ SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
+ SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
+ Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
+ DAG.getConstant(0, DL, MVT::i64));
+ if (VT == MVT::v4i32)
+ Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
+ return DAG.getMergeValues({Ext, Chain}, DL);
+}
+
// Generate SUBS and CSEL for integer abs.
SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
@@ -4340,10 +4851,9 @@
case ISD::SHL:
return LowerVectorSRA_SRL_SHL(Op, DAG);
case ISD::SHL_PARTS:
- return LowerShiftLeftParts(Op, DAG);
case ISD::SRL_PARTS:
case ISD::SRA_PARTS:
- return LowerShiftRightParts(Op, DAG);
+ return LowerShiftParts(Op, DAG);
case ISD::CTPOP:
return LowerCTPOP(Op, DAG);
case ISD::FCOPYSIGN:
@@ -4364,16 +4874,29 @@
case ISD::STRICT_FP_TO_SINT:
case ISD::STRICT_FP_TO_UINT:
return LowerFP_TO_INT(Op, DAG);
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ return LowerFP_TO_INT_SAT(Op, DAG);
case ISD::FSINCOS:
return LowerFSINCOS(Op, DAG);
case ISD::FLT_ROUNDS_:
return LowerFLT_ROUNDS_(Op, DAG);
+ case ISD::SET_ROUNDING:
+ return LowerSET_ROUNDING(Op, DAG);
case ISD::MUL:
return LowerMUL(Op, DAG);
+ case ISD::MULHS:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED,
+ /*OverrideNEON=*/true);
+ case ISD::MULHU:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED,
+ /*OverrideNEON=*/true);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
+ case ISD::MSTORE:
+ return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
case ISD::MGATHER:
return LowerMGATHER(Op, DAG);
case ISD::MSCATTER:
@@ -4417,18 +4940,24 @@
}
case ISD::TRUNCATE:
return LowerTRUNCATE(Op, DAG);
+ case ISD::MLOAD:
+ return LowerMLOAD(Op, DAG);
case ISD::LOAD:
if (useSVEForFixedLengthVectorVT(Op.getValueType()))
return LowerFixedLengthVectorLoadToSVE(Op, DAG);
- llvm_unreachable("Unexpected request to lower ISD::LOAD");
+ return LowerLOAD(Op, DAG);
case ISD::ADD:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
case ISD::AND:
return LowerToScalableOp(Op, DAG);
case ISD::SUB:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED);
+ case ISD::FMAXIMUM:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
case ISD::FMAXNUM:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
+ case ISD::FMINIMUM:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
case ISD::FMINNUM:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
case ISD::VSELECT:
@@ -4436,8 +4965,7 @@
case ISD::ABS:
return LowerABS(Op, DAG);
case ISD::BITREVERSE:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
- /*OverrideNEON=*/true);
+ return LowerBitreverse(Op, DAG);
case ISD::BSWAP:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
case ISD::CTLZ:
@@ -4445,6 +4973,8 @@
/*OverrideNEON=*/true);
case ISD::CTTZ:
return LowerCTTZ(Op, DAG);
+ case ISD::VECTOR_SPLICE:
+ return LowerVECTOR_SPLICE(Op, DAG);
}
}
@@ -4516,6 +5046,8 @@
case CallingConv::PreserveMost:
case CallingConv::CXX_FAST_TLS:
case CallingConv::Swift:
+ case CallingConv::SwiftTail:
+ case CallingConv::Tail:
if (Subtarget->isTargetWindows() && IsVarArg)
return CC_AArch64_Win64_VarArg;
if (!Subtarget->isTargetDarwin())
@@ -4579,7 +5111,10 @@
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
}
- CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+ bool UseVarArgCC = false;
+ if (IsWin64)
+ UseVarArgCC = isVarArg;
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
bool Res =
AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
assert(!Res && "Call operand has unhandled type");
@@ -4607,6 +5142,9 @@
continue;
}
+ if (Ins[i].Flags.isSwiftAsync())
+ MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
+
SDValue ArgValue;
if (VA.isRegLoc()) {
// Arguments stored in registers.
@@ -4710,7 +5248,6 @@
ExtType, DL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
MemVT);
-
}
if (VA.getLocInfo() == CCValAssign::Indirect) {
@@ -4986,8 +5523,9 @@
}
/// Return true if the calling convention is one that we can guarantee TCO for.
-static bool canGuaranteeTCO(CallingConv::ID CC) {
- return CC == CallingConv::Fast;
+static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
+ return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
+ CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
}
/// Return true if we might ever do TCO for calls with this calling convention.
@@ -4997,9 +5535,12 @@
case CallingConv::AArch64_SVE_VectorCall:
case CallingConv::PreserveMost:
case CallingConv::Swift:
+ case CallingConv::SwiftTail:
+ case CallingConv::Tail:
+ case CallingConv::Fast:
return true;
default:
- return canGuaranteeTCO(CC);
+ return false;
}
}
@@ -5015,11 +5556,11 @@
const Function &CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF.getCallingConv();
- // If this function uses the C calling convention but has an SVE signature,
- // then it preserves more registers and should assume the SVE_VectorCall CC.
+ // Functions using the C or Fast calling convention that have an SVE signature
+ // preserve more registers and should assume the SVE_VectorCall CC.
// The check for matching callee-saved regs will determine whether it is
// eligible for TCO.
- if (CallerCC == CallingConv::C &&
+ if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
AArch64RegisterInfo::hasSVEArgsOrReturn(&MF))
CallerCC = CallingConv::AArch64_SVE_VectorCall;
@@ -5051,8 +5592,8 @@
return false;
}
- if (getTargetMachine().Options.GuaranteedTailCallOpt)
- return canGuaranteeTCO(CalleeCC) && CCMatch;
+ if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
+ return CCMatch;
// Externally-defined functions with weak linkage should not be
// tail-called on AArch64 when the OS does not support dynamic
@@ -5183,7 +5724,8 @@
bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
bool TailCallOpt) const {
- return CallCC == CallingConv::Fast && TailCallOpt;
+ return (CallCC == CallingConv::Fast && TailCallOpt) ||
+ CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
}
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
@@ -5209,10 +5751,11 @@
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
bool IsSibCall = false;
+ bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CallConv);
// Check callee args/returns for SVE registers and set calling convention
// accordingly.
- if (CallConv == CallingConv::C) {
+ if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
return Out.VT.isScalableVector();
});
@@ -5228,19 +5771,21 @@
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
- if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
- report_fatal_error("failed to perform tail call elimination on a call "
- "site marked musttail");
// A sibling call is one where we're under the usual C ABI and not planning
// to change that but can still do a tail call:
- if (!TailCallOpt && IsTailCall)
+ if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
+ CallConv != CallingConv::SwiftTail)
IsSibCall = true;
if (IsTailCall)
++NumTailCalls;
}
+ if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
+
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
@@ -5258,8 +5803,12 @@
"currently not supported");
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
- CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
- /*IsVarArg=*/ !Outs[i].IsFixed);
+ bool UseVarArgCC = !Outs[i].IsFixed;
+ // On Windows, the fixed arguments in a vararg call are passed in GPRs
+ // too, so use the vararg CC to force them to integer registers.
+ if (IsCalleeWin64)
+ UseVarArgCC = true;
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
@@ -5321,6 +5870,11 @@
// can actually shrink the stack.
FPDiff = NumReusableBytes - NumBytes;
+ // Update the required reserved area if this is the tail call requiring the
+ // most argument stack space.
+ if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
+ FuncInfo->setTailCallReservedStack(-FPDiff);
+
// The stack pointer must be 16-byte aligned at all times it's used for a
// memory operation, which in practice means at *all* times and in
// particular across call boundaries. Therefore our own arguments started at
@@ -5332,7 +5886,7 @@
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
- Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
+ Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
getPointerTy(DAG.getDataLayout()));
@@ -5486,7 +6040,8 @@
// common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
unsigned OpSize;
- if (VA.getLocInfo() == CCValAssign::Indirect)
+ if (VA.getLocInfo() == CCValAssign::Indirect ||
+ VA.getLocInfo() == CCValAssign::Trunc)
OpSize = VA.getLocVT().getFixedSizeInBits();
else
OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
@@ -5589,7 +6144,7 @@
// we've carefully laid out the parameters so that when sp is reset they'll be
// in the correct location.
if (IsTailCall && !IsSibCall) {
- Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
InFlag = Chain.getValue(1);
}
@@ -5648,11 +6203,12 @@
}
unsigned CallOpc = AArch64ISD::CALL;
- // Calls marked with "rv_marker" are special. They should be expanded to the
- // call, directly followed by a special marker sequence. Use the CALL_RVMARKER
- // to do that.
- if (CLI.CB && CLI.CB->hasRetAttr("rv_marker")) {
- assert(!IsTailCall && "tail calls cannot be marked with rv_marker");
+ // Calls with operand bundle "clang.arc.attachedcall" are special. They should
+ // be expanded to the call, directly followed by a special marker sequence.
+ // Use the CALL_RVMARKER to do that.
+ if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
+ assert(!IsTailCall &&
+ "tail calls cannot be marked with clang.arc.attachedcall");
CallOpc = AArch64ISD::CALL_RVMARKER;
}
@@ -6585,6 +7141,56 @@
return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
}
+SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ if (VT.isScalableVector() ||
+ useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
+ /*OverrideNEON=*/true);
+
+ SDLoc DL(Op);
+ SDValue REVB;
+ MVT VST;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Invalid type for bitreverse!");
+
+ case MVT::v2i32: {
+ VST = MVT::v8i8;
+ REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
+
+ break;
+ }
+
+ case MVT::v4i32: {
+ VST = MVT::v16i8;
+ REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
+
+ break;
+ }
+
+ case MVT::v1i64: {
+ VST = MVT::v8i8;
+ REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
+
+ break;
+ }
+
+ case MVT::v2i64: {
+ VST = MVT::v16i8;
+ REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
+
+ break;
+ }
+ }
+
+ return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
+ DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
+}
+
SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
@@ -6701,13 +7307,26 @@
assert((LHS.getValueType() == RHS.getValueType()) &&
(LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
+ ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+ ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
+ ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
+ // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
+ // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
+ // supported types.
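+ // For example, with i32 operands this computes (lhs >> 31) | 1, which is 1
+ // for lhs = 7 and -1 for lhs = -7, i.e. 1 when lhs is non-negative and -1
+ // otherwise.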
+ if (CC == ISD::SETGT && RHSC && RHSC->isAllOnesValue() && CTVal && CFVal &&
+ CTVal->isOne() && CFVal->isAllOnesValue() &&
+ LHS.getValueType() == TVal.getValueType()) {
+ EVT VT = LHS.getValueType();
+ SDValue Shift =
+ DAG.getNode(ISD::SRA, dl, VT, LHS,
+ DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
+ return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
+ }
+
unsigned Opcode = AArch64ISD::CSEL;
// If both the TVal and the FVal are constants, see if we can swap them in
// order to form a CSINV or CSINC out of them.
- ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
- ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
-
if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
@@ -6862,6 +7481,16 @@
return CS1;
}
+SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ EVT Ty = Op.getValueType();
+ auto Idx = Op.getConstantOperandAPInt(2);
+ if (Idx.sge(-1) && Idx.slt(Ty.getVectorMinNumElements()))
+ return Op;
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@@ -6888,6 +7517,17 @@
return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
}
+ if (useSVEForFixedLengthVectorVT(Ty)) {
+ // FIXME: Ideally this would be the same as above using i1 types, however
+ // for the moment we can't deal with fixed i1 vector types properly, so
+ // instead extend the predicate to a result type sized integer vector.
+ MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
+ MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
+ SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
+ SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
+ return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
+ }
+
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
// instruction.
if (ISD::isOverflowIntrOpRes(CCVal)) {
@@ -6910,7 +7550,7 @@
if (CCVal.getOpcode() == ISD::SETCC) {
LHS = CCVal.getOperand(0);
RHS = CCVal.getOperand(1);
- CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
+ CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
} else {
LHS = CCVal;
RHS = DAG.getConstant(0, DL, CCVal.getValueType());
@@ -7294,112 +7934,13 @@
return SDValue(St, 0);
}
-/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
-/// i64 values and take a 2 x i64 value to shift plus a shift amount.
-SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
- SelectionDAG &DAG) const {
- assert(Op.getNumOperands() == 3 && "Not a double-shift!");
- EVT VT = Op.getValueType();
- unsigned VTBits = VT.getSizeInBits();
- SDLoc dl(Op);
- SDValue ShOpLo = Op.getOperand(0);
- SDValue ShOpHi = Op.getOperand(1);
- SDValue ShAmt = Op.getOperand(2);
- unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
-
- assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
-
- SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
- DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
- SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
-
- // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
- // is "undef". We wanted 0, so CSEL it directly.
- SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
- ISD::SETEQ, dl, DAG);
- SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
- HiBitsForLo =
- DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
- HiBitsForLo, CCVal, Cmp);
-
- SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i64));
-
- SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
- SDValue LoForNormalShift =
- DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
-
- Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
- dl, DAG);
- CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
- SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
- SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
- LoForNormalShift, CCVal, Cmp);
-
- // AArch64 shifts larger than the register width are wrapped rather than
- // clamped, so we can't just emit "hi >> x".
- SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
- SDValue HiForBigShift =
- Opc == ISD::SRA
- ? DAG.getNode(Opc, dl, VT, ShOpHi,
- DAG.getConstant(VTBits - 1, dl, MVT::i64))
- : DAG.getConstant(0, dl, VT);
- SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
- HiForNormalShift, CCVal, Cmp);
-
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
-}
-
-/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
-/// i64 values and take a 2 x i64 value to shift plus a shift amount.
-SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
- SelectionDAG &DAG) const {
- assert(Op.getNumOperands() == 3 && "Not a double-shift!");
- EVT VT = Op.getValueType();
- unsigned VTBits = VT.getSizeInBits();
- SDLoc dl(Op);
- SDValue ShOpLo = Op.getOperand(0);
- SDValue ShOpHi = Op.getOperand(1);
- SDValue ShAmt = Op.getOperand(2);
-
- assert(Op.getOpcode() == ISD::SHL_PARTS);
- SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
- DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
- SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
-
- // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
- // is "undef". We wanted 0, so CSEL it directly.
- SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
- ISD::SETEQ, dl, DAG);
- SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
- LoBitsForHi =
- DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
- LoBitsForHi, CCVal, Cmp);
-
- SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i64));
- SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
- SDValue HiForNormalShift =
- DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
-
- SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
-
- Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
- dl, DAG);
- CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
- SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
- HiForNormalShift, CCVal, Cmp);
-
- // AArch64 shifts of larger than register sizes are wrapped rather than
- // clamped, so we can't just emit "lo << a" if a is too big.
- SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
- SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
- SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
- LoForNormalShift, CCVal, Cmp);
-
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
+/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
+/// i64 values and take a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Lo, Hi;
+ expandShiftParts(Op.getNode(), Lo, Hi, DAG);
+ return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
}
bool AArch64TargetLowering::isOffsetFoldingLegal(
@@ -7690,6 +8231,8 @@
case 'r':
if (VT.isScalableVector())
return std::make_pair(0U, nullptr);
+ if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
+ return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
if (VT.getFixedSizeInBits() == 64)
return std::make_pair(0U, &AArch64::GPR64commonRegClass);
return std::make_pair(0U, &AArch64::GPR32commonRegClass);
@@ -7739,7 +8282,7 @@
: std::make_pair(0U, &AArch64::PPRRegClass);
}
}
- if (StringRef("{cc}").equals_lower(Constraint))
+ if (StringRef("{cc}").equals_insensitive(Constraint))
return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
// Use the default implementation in TargetLowering to convert the register
@@ -7777,6 +8320,15 @@
return Res;
}
+EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
+ llvm::Type *Ty,
+ bool AllowUnknown) const {
+ if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
+ return EVT(MVT::i64x8);
+
+ return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
+}
+
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
@@ -7815,10 +8367,6 @@
dyn_cast<BlockAddressSDNode>(Op)) {
Result =
DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
- } else if (const ExternalSymbolSDNode *ES =
- dyn_cast<ExternalSymbolSDNode>(Op)) {
- Result =
- DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0));
} else
return;
break;
@@ -7945,7 +8493,7 @@
SDLoc DL(V64Reg);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
- V64Reg, DAG.getConstant(0, DL, MVT::i32));
+ V64Reg, DAG.getConstant(0, DL, MVT::i64));
}
/// getExtFactor - Determine the adjustment factor for the position when
@@ -8793,6 +9341,9 @@
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ if (useSVEForFixedLengthVectorVT(VT))
+ return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
+
// Convert shuffles that are directly supported on NEON to target-specific
// DAG nodes, instead of keeping them as shuffles and matching them again
// during code selection. This is more efficient and avoids the possibility
@@ -8802,6 +9353,10 @@
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
+ assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
+ assert(ShuffleMask.size() == VT.getVectorNumElements() &&
+ "Unexpected VECTOR_SHUFFLE mask size!");
+
if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
// If this is undef splat, generate it via "just" vdup, if possible.
@@ -8848,6 +9403,14 @@
if (isREVMask(ShuffleMask, VT, 16))
return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
+ if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
+ (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
+ ShuffleVectorInst::isReverseMask(ShuffleMask)) {
+ SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
+ return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
+ DAG.getConstant(8, dl, MVT::i32));
+ }
+
bool ReverseEXT = false;
unsigned Imm;
if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
@@ -9028,9 +9591,7 @@
SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
// create the vector 0,1,0,1,...
- SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
- SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
- DL, MVT::nxv2i64, Zero, One);
+ SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
// create the vector idx64,idx64+1,idx64,idx64+1,...
@@ -9557,10 +10118,10 @@
}
if (i > 0)
isOnlyLowElement = false;
- if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+ if (!isIntOrFPConstant(V))
isConstant = false;
- if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
+ if (isIntOrFPConstant(V)) {
++NumConstantLanes;
if (!ConstantValue.getNode())
ConstantValue = V;
@@ -9585,7 +10146,7 @@
// Convert BUILD_VECTOR where all elements but the lowest are undef into
// SCALAR_TO_VECTOR, except for when we have a single-element constant vector
// as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
- if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) {
+ if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
"SCALAR_TO_VECTOR node\n");
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
@@ -9726,7 +10287,7 @@
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
- if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
+ if (!isIntOrFPConstant(V))
// Note that type legalization likely mucked about with the VT of the
// source operand, so we may have to convert it here before inserting.
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
@@ -9750,9 +10311,7 @@
if (PreferDUPAndInsert) {
// First, build a constant vector with the common element.
- SmallVector<SDValue, 8> Ops;
- for (unsigned I = 0; I < NumElts; ++I)
- Ops.push_back(Value);
+ SmallVector<SDValue, 8> Ops(NumElts, Value);
SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
// Next, insert the elements that do not match the common value.
for (unsigned I = 0; I < NumElts; ++I)
@@ -9814,6 +10373,9 @@
SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
+
assert(Op.getValueType().isScalableVector() &&
isTypeLegal(Op.getValueType()) &&
"Expected legal scalable vector type!");
@@ -9828,13 +10390,32 @@
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerFixedLengthInsertVectorElt(Op, DAG);
+
// Check for non-constant or out of range lane.
EVT VT = Op.getOperand(0).getValueType();
+
+ if (VT.getScalarType() == MVT::i1) {
+ EVT VectorVT = getPromotedVTForPredicate(VT);
+ SDLoc DL(Op);
+ SDValue ExtendedVector =
+ DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
+ SDValue ExtendedValue =
+ DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
+ VectorVT.getScalarType().getSizeInBits() < 32
+ ? MVT::i32
+ : VectorVT.getScalarType());
+ ExtendedVector =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
+ ExtendedValue, Op.getOperand(2));
+ return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
+ }
+
ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
-
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
@@ -9862,14 +10443,29 @@
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
+ EVT VT = Op.getOperand(0).getValueType();
+
+ if (VT.getScalarType() == MVT::i1) {
+ // We can't directly extract from an SVE predicate; extend it first.
+ // (This isn't the only possible lowering, but it's straightforward.)
+ EVT VectorVT = getPromotedVTForPredicate(VT);
+ SDLoc DL(Op);
+ SDValue Extend =
+ DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
+ MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
+ Extend, Op.getOperand(1));
+ return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
+ }
+
+ if (useSVEForFixedLengthVectorVT(VT))
+ return LowerFixedLengthExtractVectorElt(Op, DAG);
// Check for non-constant or out of range lane.
- EVT VT = Op.getOperand(0).getValueType();
ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
-
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
@@ -10160,7 +10756,8 @@
unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
: Intrinsic::aarch64_neon_ushl;
// negate the shift amount
- SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
+ SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ Op.getOperand(1));
SDValue NegShiftLeft =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
@@ -10268,11 +10865,8 @@
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
SelectionDAG &DAG) const {
- if (Op.getValueType().isScalableVector()) {
- if (Op.getOperand(0).getValueType().isFloatingPoint())
- return Op;
+ if (Op.getValueType().isScalableVector())
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
- }
if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
return LowerFixedLengthVectorSetccToSVE(Op, DAG);
@@ -11392,8 +11986,8 @@
if (Op.isAligned(AlignCheck))
return true;
bool Fast;
- return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
- &Fast) &&
+ return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
+ MachineMemOperand::MONone, &Fast) &&
Fast;
};
@@ -11423,14 +12017,14 @@
if (Op.isAligned(AlignCheck))
return true;
bool Fast;
- return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
- &Fast) &&
+ return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
+ MachineMemOperand::MONone, &Fast) &&
Fast;
};
if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
AlignmentIsAcceptable(MVT::v2i64, Align(16)))
- return LLT::vector(2, 64);
+ return LLT::fixed_vector(2, 64);
if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
return LLT::scalar(128);
if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
@@ -11483,8 +12077,12 @@
return false;
// FIXME: Update this method to support scalable addressing modes.
- if (isa<ScalableVectorType>(Ty))
- return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
+ if (isa<ScalableVectorType>(Ty)) {
+ uint64_t VecElemNumBytes =
+ DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
+ return AM.HasBaseReg && !AM.BaseOffs &&
+ (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
+ }
// check reg + imm case:
// i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
@@ -11522,9 +12120,8 @@
return true;
}
-int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
- const AddrMode &AM, Type *Ty,
- unsigned AS) const {
+InstructionCost AArch64TargetLowering::getScalingFactorCost(
+ const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const {
// Scaling factors are not free at all.
// Operands | Rt Latency
// -------------------------------------------
@@ -11547,6 +12144,8 @@
return false;
switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f16:
+ return Subtarget->hasFullFP16();
case MVT::f32:
case MVT::f64:
return true;
@@ -11568,6 +12167,11 @@
}
}
+bool AArch64TargetLowering::generateFMAsInMachineCombiner(
+ EVT VT, CodeGenOpt::Level OptLevel) const {
+ return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector();
+}
+
const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
// LR is a callee-save register, but we must treat it as clobbered by any call
@@ -11655,79 +12259,144 @@
return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
}
-// VECREDUCE_ADD( EXTEND(v16i8_type) ) to
-// VECREDUCE_ADD( DOTv16i8(v16i8_type) )
-static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
- const AArch64Subtarget *ST) {
- SDValue Op0 = N->getOperand(0);
- if (!ST->hasDotProd() || N->getValueType(0) != MVT::i32)
+// Given a vecreduce_add node, detect the pattern below and convert it to a
+// node sequence with UABDL, [S|U]ABD and UADDLP.
+//
+// i32 vecreduce_add(
+// v16i32 abs(
+// v16i32 sub(
+// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
+// =================>
+// i32 vecreduce_add(
+// v4i32 UADDLP(
+// v8i16 add(
+// v8i16 zext(
+// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
+// v8i16 zext(
+// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
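+//
+// For example, for a = <1, 4, 0, ..., 0> and b = <3, 1, 0, ..., 0> both forms
+// compute |1-3| + |4-1| = 5, the usual sum-of-absolute-differences shape.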
+static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
+ SelectionDAG &DAG) {
+ // Assumed i32 vecreduce_add
+ if (N->getValueType(0) != MVT::i32)
return SDValue();
- if (Op0.getValueType().getVectorElementType() != MVT::i32)
+ SDValue VecReduceOp0 = N->getOperand(0);
+ unsigned Opcode = VecReduceOp0.getOpcode();
+ // Assumed v16i32 abs
+ if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
+ return SDValue();
+
+ SDValue ABS = VecReduceOp0;
+ // Assumed v16i32 sub
+ if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
+ ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
+ return SDValue();
+
+ SDValue SUB = ABS->getOperand(0);
+ unsigned Opcode0 = SUB->getOperand(0).getOpcode();
+ unsigned Opcode1 = SUB->getOperand(1).getOpcode();
+ // Assumed v16i32 type
+ if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
+ SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
+ return SDValue();
+
+ // Assumed zext or sext
+ bool IsZExt = false;
+ if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
+ IsZExt = true;
+ } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
+ IsZExt = false;
+ } else
+ return SDValue();
+
+ SDValue EXT0 = SUB->getOperand(0);
+ SDValue EXT1 = SUB->getOperand(1);
+ // Assumed zext's operand has v16i8 type
+ if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
+ EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
+ return SDValue();
+
+ // The pattern is detected. Convert it to a sequence of nodes.
+ SDLoc DL(N);
+
+ // First, create the node pattern of UABD/SABD.
+ SDValue UABDHigh8Op0 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
+ DAG.getConstant(8, DL, MVT::i64));
+ SDValue UABDHigh8Op1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
+ DAG.getConstant(8, DL, MVT::i64));
+ SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
+ UABDHigh8Op0, UABDHigh8Op1);
+ SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
+
+ // Second, create the node pattern of UABAL.
+ SDValue UABDLo8Op0 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
+ DAG.getConstant(0, DL, MVT::i64));
+ SDValue UABDLo8Op1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
+ DAG.getConstant(0, DL, MVT::i64));
+ SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
+ UABDLo8Op0, UABDLo8Op1);
+ SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
+ SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
+
+ // Third, create the node of UADDLP.
+ SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
+
+ // Fourth, create the node of VECREDUCE_ADD.
+ return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
+}
+
+// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
+// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
+// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
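+//
+// This works because DOT(zero, A, one) accumulates, per i32 lane, the sum of
+// four elements of A (each multiplied by 1), so the trailing vecreduce.add
+// still produces the total lane sum of ext(A).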
+static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *ST) {
+ if (!ST->hasDotProd())
+ return performVecReduceAddCombineWithUADDLP(N, DAG);
+
+ SDValue Op0 = N->getOperand(0);
+ if (N->getValueType(0) != MVT::i32 ||
+ Op0.getValueType().getVectorElementType() != MVT::i32)
return SDValue();
unsigned ExtOpcode = Op0.getOpcode();
+ SDValue A = Op0;
+ SDValue B;
+ if (ExtOpcode == ISD::MUL) {
+ A = Op0.getOperand(0);
+ B = Op0.getOperand(1);
+ if (A.getOpcode() != B.getOpcode() ||
+ A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
+ return SDValue();
+ ExtOpcode = A.getOpcode();
+ }
if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
return SDValue();
- EVT Op0VT = Op0.getOperand(0).getValueType();
- if (Op0VT != MVT::v16i8)
+ EVT Op0VT = A.getOperand(0).getValueType();
+ if (Op0VT != MVT::v8i8 && Op0VT != MVT::v16i8)
return SDValue();
SDLoc DL(Op0);
- SDValue Ones = DAG.getConstant(1, DL, Op0VT);
- SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
- auto DotIntrisic = (ExtOpcode == ISD::ZERO_EXTEND)
- ? Intrinsic::aarch64_neon_udot
- : Intrinsic::aarch64_neon_sdot;
- SDValue Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Zeros.getValueType(),
- DAG.getConstant(DotIntrisic, DL, MVT::i32), Zeros,
- Ones, Op0.getOperand(0));
+ // For non-MLA reductions B can be set to 1. For MLA we take the operand of
+ // B's extend.
+ if (!B)
+ B = DAG.getConstant(1, DL, Op0VT);
+ else
+ B = B.getOperand(0);
+
+ SDValue Zeros =
+ DAG.getConstant(0, DL, Op0VT == MVT::v8i8 ? MVT::v2i32 : MVT::v4i32);
+ auto DotOpcode =
+ (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
+ SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
+ A.getOperand(0), B);
return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
}
-// Given a ABS node, detect the following pattern:
-// (ABS (SUB (EXTEND a), (EXTEND b))).
-// Generates UABD/SABD instruction.
-static SDValue performABSCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const AArch64Subtarget *Subtarget) {
- SDValue AbsOp1 = N->getOperand(0);
- SDValue Op0, Op1;
-
- if (AbsOp1.getOpcode() != ISD::SUB)
- return SDValue();
-
- Op0 = AbsOp1.getOperand(0);
- Op1 = AbsOp1.getOperand(1);
-
- unsigned Opc0 = Op0.getOpcode();
- // Check if the operands of the sub are (zero|sign)-extended.
- if (Opc0 != Op1.getOpcode() ||
- (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND))
- return SDValue();
-
- EVT VectorT1 = Op0.getOperand(0).getValueType();
- EVT VectorT2 = Op1.getOperand(0).getValueType();
- // Check if vectors are of same type and valid size.
- uint64_t Size = VectorT1.getFixedSizeInBits();
- if (VectorT1 != VectorT2 || (Size != 64 && Size != 128))
- return SDValue();
-
- // Check if vector element types are valid.
- EVT VT1 = VectorT1.getVectorElementType();
- if (VT1 != MVT::i8 && VT1 != MVT::i16 && VT1 != MVT::i32)
- return SDValue();
-
- Op0 = Op0.getOperand(0);
- Op1 = Op1.getOperand(0);
- unsigned ABDOpcode =
- (Opc0 == ISD::SIGN_EXTEND) ? AArch64ISD::SABD : AArch64ISD::UABD;
- SDValue ABD =
- DAG.getNode(ABDOpcode, SDLoc(N), Op0->getValueType(0), Op0, Op1);
- return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), ABD);
-}
-
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -11973,6 +12642,7 @@
// e.g. 6=3*2=(2+1)*2.
// TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
// which equals to (1+2)*16-(1+2).
+
// TrailingZeroes is used to test if the mul can be lowered to
// shift+add+shift.
unsigned TrailingZeroes = ConstValue.countTrailingZeros();
@@ -12351,6 +13021,11 @@
if (!VT.isVector())
return SDValue();
+ // The combining code currently only works for NEON vectors. In particular,
+ // it does not work for SVE when dealing with vectors wider than 128 bits.
+ if (!VT.is64BitVector() && !VT.is128BitVector())
+ return SDValue();
+
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() != ISD::AND)
return SDValue();
@@ -12359,6 +13034,44 @@
if (N1.getOpcode() != ISD::AND)
return SDValue();
+ // InstCombine does (not (neg a)) => (add a -1).
+ // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
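+ // For example, with 4-bit lanes and a = 0b0011: (neg a) = 0b1101 and
+ // (add a -1) = 0b0010 = (not (neg a)), so the two AND masks are complements
+ // of each other, which is exactly the bit-select (BSL/BSP) semantics.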
+ // Loop over all combinations of AND operands.
+ for (int i = 1; i >= 0; --i) {
+ for (int j = 1; j >= 0; --j) {
+ SDValue O0 = N0->getOperand(i);
+ SDValue O1 = N1->getOperand(j);
+ SDValue Sub, Add, SubSibling, AddSibling;
+
+ // Find a SUB and an ADD operand, one from each AND.
+ if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
+ Sub = O0;
+ Add = O1;
+ SubSibling = N0->getOperand(1 - i);
+ AddSibling = N1->getOperand(1 - j);
+ } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
+ Add = O0;
+ Sub = O1;
+ AddSibling = N0->getOperand(1 - i);
+ SubSibling = N1->getOperand(1 - j);
+ } else
+ continue;
+
+ if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
+ continue;
+
+ // The all-ones constant is always the right-hand operand of the Add.
+ if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
+ continue;
+
+ if (Sub.getOperand(1) != Add.getOperand(0))
+ continue;
+
+ return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
+ }
+ }
+
+ // (or (and a b) (and (not a) c)) => (bsl a b c)
// We only have to look for constant vectors here since the general, variable
// case can be handled in TableGen.
unsigned Bits = VT.getScalarSizeInBits();
@@ -13066,6 +13779,13 @@
SDValue RHS = Op->getOperand(1);
SetCCInfoAndKind InfoAndKind;
+ // If both operands are a SET_CC, then we don't want to perform this
+ // folding and create another csel as this results in more instructions
+ // (and higher register usage).
+ if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
+ isSetCCOrZExtSetCC(RHS, InfoAndKind))
+ return SDValue();
+
// If neither operand is a SET_CC, give up.
if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
std::swap(LHS, RHS);
@@ -13136,6 +13856,29 @@
DAG.getConstant(0, DL, MVT::i64));
}
+// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
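+// This is valid because the dot-product nodes accumulate into their first
+// operand, so a zero accumulator can simply be replaced by the other ADD
+// operand.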
+static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if (N->getOpcode() != ISD::ADD)
+ return SDValue();
+
+ SDValue Dot = N->getOperand(0);
+ SDValue A = N->getOperand(1);
+ // Handle commutativity
+ auto isZeroDot = [](SDValue Dot) {
+ return (Dot.getOpcode() == AArch64ISD::UDOT ||
+ Dot.getOpcode() == AArch64ISD::SDOT) &&
+ isZerosVector(Dot.getOperand(0).getNode());
+ };
+ if (!isZeroDot(Dot))
+ std::swap(Dot, A);
+ if (!isZeroDot(Dot))
+ return SDValue();
+
+ return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
+ Dot.getOperand(2));
+}
+
// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by
// patterns like:
@@ -13195,6 +13938,8 @@
// Try to change sum of two reductions.
if (SDValue Val = performUADDVCombine(N, DAG))
return Val;
+ if (SDValue Val = performAddDotCombine(N, DAG))
+ return Val;
return performAddSubLongCombine(N, DCI, DAG);
}
@@ -13336,15 +14081,16 @@
SDLoc DL(N);
SDValue Op1 = N->getOperand(1);
SDValue Op2 = N->getOperand(2);
- EVT ScalarTy = Op1.getValueType();
+ EVT ScalarTy = Op2.getValueType();
+ if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
+ ScalarTy = MVT::i32;
- if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) {
- Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
- Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
- }
-
- return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0),
- Op1, Op2);
+  // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
+ SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
+ SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
+ SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
+ return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
}
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
@@ -13534,20 +14280,47 @@
Zero);
}
+static bool isAllActivePredicate(SDValue N) {
+ unsigned NumElts = N.getValueType().getVectorMinNumElements();
+
+ // Look through cast.
+ while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
+ N = N.getOperand(0);
+ // When reinterpreting from a type with fewer elements the "new" elements
+ // are not active, so bail if they're likely to be used.
+ if (N.getValueType().getVectorMinNumElements() < NumElts)
+ return false;
+ }
+
+ // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
+ // or smaller than the implicit element type represented by N.
+ // NOTE: A larger element count implies a smaller element type.
+ if (N.getOpcode() == AArch64ISD::PTRUE &&
+ N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
+ return N.getValueType().getVectorMinNumElements() >= NumElts;
+
+ return false;
+}
+
// If a merged operation has no inactive lanes we can relax it to a predicated
// or unpredicated operation, which potentially allows better isel (perhaps
// using immediate forms) or relaxing register reuse requirements.
-static SDValue convertMergedOpToPredOp(SDNode *N, unsigned PredOpc,
- SelectionDAG &DAG) {
+static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
+ SelectionDAG &DAG,
+ bool UnpredOp = false) {
assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
SDValue Pg = N->getOperand(1);
// ISD way to specify an all active predicate.
- if ((Pg.getOpcode() == AArch64ISD::PTRUE) &&
- (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all))
- return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg,
- N->getOperand(2), N->getOperand(3));
+ if (isAllActivePredicate(Pg)) {
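+    // Every lane is active, so it is safe to drop the predicate entirely (for
+    // unpredicated opcodes) or to reuse it unchanged (for predicated ones).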
+ if (UnpredOp)
+ return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(2),
+ N->getOperand(3));
+ else
+ return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg,
+ N->getOperand(2), N->getOperand(3));
+ }
// FUTURE: SplatVector(true)
return SDValue();
@@ -13638,6 +14411,12 @@
N->getOperand(1));
case Intrinsic::aarch64_sve_ext:
return LowerSVEIntrinsicEXT(N, DAG);
+ case Intrinsic::aarch64_sve_mul:
+ return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
+ case Intrinsic::aarch64_sve_smulh:
+ return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
+ case Intrinsic::aarch64_sve_umulh:
+ return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
case Intrinsic::aarch64_sve_smin:
return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
case Intrinsic::aarch64_sve_umin:
@@ -13652,6 +14431,44 @@
return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
case Intrinsic::aarch64_sve_asr:
return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
+ case Intrinsic::aarch64_sve_fadd:
+ return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
+ case Intrinsic::aarch64_sve_fsub:
+ return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
+ case Intrinsic::aarch64_sve_fmul:
+ return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
+ case Intrinsic::aarch64_sve_add:
+ return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
+ case Intrinsic::aarch64_sve_sub:
+ return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
+ case Intrinsic::aarch64_sve_and:
+ return convertMergedOpToPredOp(N, ISD::AND, DAG, true);
+ case Intrinsic::aarch64_sve_bic:
+ return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true);
+ case Intrinsic::aarch64_sve_eor:
+ return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);
+ case Intrinsic::aarch64_sve_orr:
+ return convertMergedOpToPredOp(N, ISD::OR, DAG, true);
+ case Intrinsic::aarch64_sve_sqadd:
+ return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
+ case Intrinsic::aarch64_sve_sqsub:
+ return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true);
+ case Intrinsic::aarch64_sve_uqadd:
+ return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
+ case Intrinsic::aarch64_sve_uqsub:
+ return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true);
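+  // The _x variants carry no governing predicate, so they map directly onto
+  // the generic saturating add/subtract nodes.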
+ case Intrinsic::aarch64_sve_sqadd_x:
+ return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_sve_sqsub_x:
+ return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_sve_uqadd_x:
+ return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_sve_uqsub_x:
+ return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_sve_cmphs:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
@@ -13664,29 +14481,34 @@
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
break;
+ case Intrinsic::aarch64_sve_fcmpge:
case Intrinsic::aarch64_sve_cmpge:
- if (!N->getOperand(2).getValueType().isFloatingPoint())
- return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
- N->getValueType(0), N->getOperand(1), N->getOperand(2),
- N->getOperand(3), DAG.getCondCode(ISD::SETGE));
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETGE));
break;
+ case Intrinsic::aarch64_sve_fcmpgt:
case Intrinsic::aarch64_sve_cmpgt:
- if (!N->getOperand(2).getValueType().isFloatingPoint())
- return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
- N->getValueType(0), N->getOperand(1), N->getOperand(2),
- N->getOperand(3), DAG.getCondCode(ISD::SETGT));
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETGT));
break;
+ case Intrinsic::aarch64_sve_fcmpeq:
case Intrinsic::aarch64_sve_cmpeq:
- if (!N->getOperand(2).getValueType().isFloatingPoint())
- return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
- N->getValueType(0), N->getOperand(1), N->getOperand(2),
- N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
break;
+ case Intrinsic::aarch64_sve_fcmpne:
case Intrinsic::aarch64_sve_cmpne:
- if (!N->getOperand(2).getValueType().isFloatingPoint())
- return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
- N->getValueType(0), N->getOperand(1), N->getOperand(2),
- N->getOperand(3), DAG.getCondCode(ISD::SETNE));
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETNE));
+ break;
+ case Intrinsic::aarch64_sve_fcmpuo:
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETUO));
break;
case Intrinsic::aarch64_sve_fadda:
return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
@@ -13744,8 +14566,8 @@
// helps the backend to decide that an sabdl2 would be useful, saving a real
// extract_high operation.
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
- (N->getOperand(0).getOpcode() == AArch64ISD::UABD ||
- N->getOperand(0).getOpcode() == AArch64ISD::SABD)) {
+ (N->getOperand(0).getOpcode() == ISD::ABDU ||
+ N->getOperand(0).getOpcode() == ISD::ABDS)) {
SDNode *ABDNode = N->getOperand(0).getNode();
SDValue NewABD =
tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
@@ -13754,78 +14576,7 @@
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
}
-
- // This is effectively a custom type legalization for AArch64.
- //
- // Type legalization will split an extend of a small, legal, type to a larger
- // illegal type by first splitting the destination type, often creating
- // illegal source types, which then get legalized in isel-confusing ways,
- // leading to really terrible codegen. E.g.,
- // %result = v8i32 sext v8i8 %value
- // becomes
- // %losrc = extract_subreg %value, ...
- // %hisrc = extract_subreg %value, ...
- // %lo = v4i32 sext v4i8 %losrc
- // %hi = v4i32 sext v4i8 %hisrc
- // Things go rapidly downhill from there.
- //
- // For AArch64, the [sz]ext vector instructions can only go up one element
- // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
- // take two instructions.
- //
- // This implies that the most efficient way to do the extend from v8i8
- // to two v4i32 values is to first extend the v8i8 to v8i16, then do
- // the normal splitting to happen for the v8i16->v8i32.
-
- // This is pre-legalization to catch some cases where the default
- // type legalization will create ill-tempered code.
- if (!DCI.isBeforeLegalizeOps())
- return SDValue();
-
- // We're only interested in cleaning things up for non-legal vector types
- // here. If both the source and destination are legal, things will just
- // work naturally without any fiddling.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT ResVT = N->getValueType(0);
- if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
- return SDValue();
- // If the vector type isn't a simple VT, it's beyond the scope of what
- // we're worried about here. Let legalization do its thing and hope for
- // the best.
- SDValue Src = N->getOperand(0);
- EVT SrcVT = Src->getValueType(0);
- if (!ResVT.isSimple() || !SrcVT.isSimple())
- return SDValue();
-
- // If the source VT is a 64-bit fixed or scalable vector, we can play games
- // and get the better results we want.
- if (SrcVT.getSizeInBits().getKnownMinSize() != 64)
- return SDValue();
-
- unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
- ElementCount SrcEC = SrcVT.getVectorElementCount();
- SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC);
- SDLoc DL(N);
- Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
-
- // Now split the rest of the operation into two halves, each with a 64
- // bit source.
- EVT LoVT, HiVT;
- SDValue Lo, Hi;
- LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext());
-
- EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
- LoVT.getVectorElementCount());
- Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
- DAG.getConstant(0, DL, MVT::i64));
- Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
- DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64));
- Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
- Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
-
- // Now combine the parts back together so we still have a single result
- // like the combiner expects.
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
+ return SDValue();
}
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
@@ -14235,6 +14986,16 @@
S->getMemOperand()->getFlags());
}
+static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected opcode!");
+
+ // splice(pg, op1, undef) -> op1
+ if (N->getOperand(2).isUndef())
+ return N->getOperand(1);
+
+ return SDValue();
+}
+
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Op0 = N->getOperand(0);
@@ -14260,6 +15021,86 @@
return SDValue();
}
+static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opc = N->getOpcode();
+
+ assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
+ Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
+ (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
+ Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
+ "Invalid opcode.");
+
+ const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+ const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+ const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
+
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Pg = N->getOperand(1);
+ SDValue Base = N->getOperand(2);
+ SDValue Offset = N->getOperand(3);
+ SDValue Ty = N->getOperand(4);
+
+ EVT ResVT = N->getValueType(0);
+
+ const auto OffsetOpc = Offset.getOpcode();
+ const bool OffsetIsZExt =
+ OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
+ const bool OffsetIsSExt =
+ OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
+
+ // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
+ if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
+ SDValue ExtPg = Offset.getOperand(0);
+ VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
+ EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
+
+ // If the predicate for the sign- or zero-extended offset is the
+ // same as the predicate used for this load and the sign-/zero-extension
+    // was from a 32-bit element type, the extension can be folded away.
+ if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
+ SDValue UnextendedOffset = Offset.getOperand(1);
+
+ unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
+ if (Signed)
+ NewOpc = getSignExtendedGatherOpcode(NewOpc);
+
+ return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
+ {Chain, Pg, Base, UnextendedOffset, Ty});
+ }
+ }
+
+ return SDValue();
+}
+
+/// Optimize a vector shift instruction and its operand if shifted out
+/// bits are not used.
+static SDValue performVectorShiftCombine(SDNode *N,
+ const AArch64TargetLowering &TLI,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(N->getOpcode() == AArch64ISD::VASHR ||
+ N->getOpcode() == AArch64ISD::VLSHR);
+
+ SDValue Op = N->getOperand(0);
+ unsigned OpScalarSize = Op.getScalarValueSizeInBits();
+
+ unsigned ShiftImm = N->getConstantOperandVal(1);
+ assert(OpScalarSize > ShiftImm && "Invalid shift imm");
+
+ APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
+ APInt DemandedMask = ~ShiftedOutBits;
+
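+  // Only the bits that survive the right shift are demanded from the operand,
+  // so try to simplify it under that mask.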
+ if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
/// Target-specific DAG combine function for post-increment LD1 (lane) and
/// post-increment LD1R.
static SDValue performPostLD1Combine(SDNode *N,
@@ -14384,6 +15225,29 @@
return false;
}
+static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
+ assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
+ "Expected STORE dag node in input!");
+
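+  // Fold truncstore(*ext X) -> store X when the store's memory type matches
+  // the type of X.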
+ if (auto Store = dyn_cast<StoreSDNode>(N)) {
+ if (!Store->isTruncatingStore() || Store->isIndexed())
+ return SDValue();
+ SDValue Ext = Store->getValue();
+ auto ExtOpCode = Ext.getOpcode();
+ if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
+ ExtOpCode != ISD::ANY_EXTEND)
+ return SDValue();
+ SDValue Orig = Ext->getOperand(0);
+ if (Store->getMemoryVT() != Orig->getValueType(0))
+ return SDValue();
+ return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
+ Store->getBasePtr(), Store->getPointerInfo(),
+ Store->getAlign());
+ }
+
+ return SDValue();
+}
+
static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
@@ -14395,54 +15259,8 @@
performTBISimplification(N->getOperand(2), DCI, DAG))
return SDValue(N, 0);
- return SDValue();
-}
-
-static SDValue performMaskedGatherScatterCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
- MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
- assert(MGS && "Can only combine gather load or scatter store nodes");
-
- SDLoc DL(MGS);
- SDValue Chain = MGS->getChain();
- SDValue Scale = MGS->getScale();
- SDValue Index = MGS->getIndex();
- SDValue Mask = MGS->getMask();
- SDValue BasePtr = MGS->getBasePtr();
- ISD::MemIndexType IndexType = MGS->getIndexType();
-
- EVT IdxVT = Index.getValueType();
-
- if (DCI.isBeforeLegalize()) {
- // SVE gather/scatter requires indices of i32/i64. Promote anything smaller
- // prior to legalisation so the result can be split if required.
- if ((IdxVT.getVectorElementType() == MVT::i8) ||
- (IdxVT.getVectorElementType() == MVT::i16)) {
- EVT NewIdxVT = IdxVT.changeVectorElementType(MVT::i32);
- if (MGS->isIndexSigned())
- Index = DAG.getNode(ISD::SIGN_EXTEND, DL, NewIdxVT, Index);
- else
- Index = DAG.getNode(ISD::ZERO_EXTEND, DL, NewIdxVT, Index);
-
- if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
- SDValue PassThru = MGT->getPassThru();
- SDValue Ops[] = { Chain, PassThru, Mask, BasePtr, Index, Scale };
- return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
- PassThru.getValueType(), DL, Ops,
- MGT->getMemOperand(),
- MGT->getIndexType(), MGT->getExtensionType());
- } else {
- auto *MSC = cast<MaskedScatterSDNode>(MGS);
- SDValue Data = MSC->getValue();
- SDValue Ops[] = { Chain, Data, Mask, BasePtr, Index, Scale };
- return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
- MSC->getMemoryVT(), DL, Ops,
- MSC->getMemOperand(), IndexType,
- MSC->isTruncatingStore());
- }
- }
- }
+ if (SDValue Store = foldTruncStoreOfExt(DAG, N))
+ return Store;
return SDValue();
}
@@ -14903,6 +15721,67 @@
return SDValue();
}
+// Optimize CSEL instructions
+static SDValue performCSELCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ // CSEL x, x, cc -> x
+ if (N->getOperand(0) == N->getOperand(1))
+ return N->getOperand(0);
+
+ return performCONDCombine(N, DCI, DAG, 2, 3);
+}
+
+static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+
+ // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
+ if (Cond == ISD::SETNE && isOneConstant(RHS) &&
+ LHS->getOpcode() == AArch64ISD::CSEL &&
+ isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
+ LHS->hasOneUse()) {
+ SDLoc DL(N);
+
+ // Invert CSEL's condition.
+ auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
+ auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
+ auto NewCond = getInvertedCondCode(OldCond);
+
+ // csel 0, 1, !cond, X
+ SDValue CSEL =
+ DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
+ LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
+ LHS.getOperand(3));
+ return DAG.getZExtOrTrunc(CSEL, DL, N->getValueType(0));
+ }
+
+ return SDValue();
+}
+
+static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
+ "Unexpected opcode!");
+
+ SDValue Pred = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
+
+ // setcc_merge_zero pred (sign_extend (setcc_merge_zero ... pred ...)), 0, ne
+ // => inner setcc_merge_zero
+ if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
+ LHS->getOpcode() == ISD::SIGN_EXTEND &&
+ LHS->getOperand(0)->getValueType(0) == N->getValueType(0) &&
+ LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
+ LHS->getOperand(0)->getOperand(0) == Pred)
+ return LHS->getOperand(0);
+
+ return SDValue();
+}
+
// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
// as well as whether the test should be inverted. This code is required to
// catch these cases (as opposed to standard dag combines) because
@@ -15015,7 +15894,41 @@
SDValue N0 = N->getOperand(0);
EVT CCVT = N0.getValueType();
- if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
+ // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
+ // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
+ // supported types.
+ SDValue SetCC = N->getOperand(0);
+ if (SetCC.getOpcode() == ISD::SETCC &&
+ SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
+ SDValue CmpLHS = SetCC.getOperand(0);
+ EVT VT = CmpLHS.getValueType();
+ SDNode *CmpRHS = SetCC.getOperand(1).getNode();
+ SDNode *SplatLHS = N->getOperand(1).getNode();
+ SDNode *SplatRHS = N->getOperand(2).getNode();
+ APInt SplatLHSVal;
+ if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
+ VT.isSimple() &&
+ is_contained(
+ makeArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
+ MVT::v2i32, MVT::v4i32, MVT::v2i64}),
+ VT.getSimpleVT().SimpleTy) &&
+ ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
+ SplatLHSVal.isOneValue() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
+ ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 8> Ops(
+ NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
+ VT.getScalarType()));
+ SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
+
+ auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
+ auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
+ return Or;
+ }
+ }
+
+ if (N0.getOpcode() != ISD::SETCC ||
+ CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
CCVT.getVectorElementType() != MVT::i1)
return SDValue();
@@ -15028,10 +15941,9 @@
SDValue IfTrue = N->getOperand(1);
SDValue IfFalse = N->getOperand(2);
- SDValue SetCC =
- DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
- N0.getOperand(0), N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
+ N0.getOperand(0), N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
IfTrue, IfFalse);
}
@@ -15049,6 +15961,9 @@
if (N0.getOpcode() != ISD::SETCC)
return SDValue();
+ if (ResVT.isScalableVector())
+ return SDValue();
+
// Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
// scalar SetCCResultType. We also don't expect vectors, because we assume
// that selects fed by vector SETCCs are canonicalized to VSELECT.
@@ -15181,7 +16096,6 @@
/// [<Zn>.[S|D]{, #<imm>}]
///
/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
-
inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
unsigned ScalarSizeInBytes) {
// The immediate is not a multiple of the scalar size.
@@ -15589,6 +16503,97 @@
return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
}
+// Return true if the vector operation can guarantee that only the first lane of
+// its result contains data, with all bits in the other lanes set to zero.
+static bool isLanes1toNKnownZero(SDValue Op) {
+ switch (Op.getOpcode()) {
+ default:
+ return false;
+ case AArch64ISD::ANDV_PRED:
+ case AArch64ISD::EORV_PRED:
+ case AArch64ISD::FADDA_PRED:
+ case AArch64ISD::FADDV_PRED:
+ case AArch64ISD::FMAXNMV_PRED:
+ case AArch64ISD::FMAXV_PRED:
+ case AArch64ISD::FMINNMV_PRED:
+ case AArch64ISD::FMINV_PRED:
+ case AArch64ISD::ORV_PRED:
+ case AArch64ISD::SADDV_PRED:
+ case AArch64ISD::SMAXV_PRED:
+ case AArch64ISD::SMINV_PRED:
+ case AArch64ISD::UADDV_PRED:
+ case AArch64ISD::UMAXV_PRED:
+ case AArch64ISD::UMINV_PRED:
+ return true;
+ }
+}
+
+static SDValue removeRedundantInsertVectorElt(SDNode *N) {
+ assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
+ SDValue InsertVec = N->getOperand(0);
+ SDValue InsertElt = N->getOperand(1);
+ SDValue InsertIdx = N->getOperand(2);
+
+ // We only care about inserts into the first element...
+ if (!isNullConstant(InsertIdx))
+ return SDValue();
+ // ...of a zero'd vector...
+ if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
+ return SDValue();
+ // ...where the inserted data was previously extracted...
+ if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue ExtractVec = InsertElt.getOperand(0);
+ SDValue ExtractIdx = InsertElt.getOperand(1);
+
+ // ...from the first element of a vector.
+ if (!isNullConstant(ExtractIdx))
+ return SDValue();
+
+ // If we get here we are effectively trying to zero lanes 1-N of a vector.
+
+ // Ensure there's no type conversion going on.
+ if (N->getValueType(0) != ExtractVec.getValueType())
+ return SDValue();
+
+ if (!isLanes1toNKnownZero(ExtractVec))
+ return SDValue();
+
+ // The explicit zeroing is redundant.
+ return ExtractVec;
+}
+
+static SDValue
+performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ if (SDValue Res = removeRedundantInsertVectorElt(N))
+ return Res;
+
+ return performPostLD1Combine(N, DCI, true);
+}
+
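+// Lower non-integer VECTOR_SPLICE by performing the splice on a packed integer
+// type and bitcasting the result back to the original type.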
+SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT Ty = N->getValueType(0);
+ if (Ty.isInteger())
+ return SDValue();
+
+ EVT IntTy = Ty.changeVectorElementTypeToInteger();
+ EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
+ if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
+ IntTy.getVectorElementType().getScalarSizeInBits())
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
+ DL, ExtIntTy);
+ SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
+ DL, ExtIntTy);
+ SDValue Idx = N->getOperand(2);
+ SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
+ SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
+ return DAG.getBitcast(Ty, Trunc);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -15596,8 +16601,6 @@
default:
LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
break;
- case ISD::ABS:
- return performABSCombine(N, DAG, DCI, Subtarget);
case ISD::ADD:
case ISD::SUB:
return performAddSubCombine(N, DCI, DAG);
@@ -15635,30 +16638,53 @@
return performSelectCombine(N, DCI);
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
+ case ISD::SETCC:
+ return performSETCCCombine(N, DAG);
case ISD::LOAD:
if (performTBISimplification(N->getOperand(1), DCI, DAG))
return SDValue(N, 0);
break;
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
- case ISD::MGATHER:
- case ISD::MSCATTER:
- return performMaskedGatherScatterCombine(N, DCI, DAG);
+ case ISD::VECTOR_SPLICE:
+ return performSVESpliceCombine(N, DAG);
case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
case AArch64ISD::TBNZ:
case AArch64ISD::TBZ:
return performTBZCombine(N, DCI, DAG);
case AArch64ISD::CSEL:
- return performCONDCombine(N, DCI, DAG, 2, 3);
+ return performCSELCombine(N, DCI, DAG);
case AArch64ISD::DUP:
return performPostLD1Combine(N, DCI, false);
case AArch64ISD::NVCAST:
return performNVCASTCombine(N);
+ case AArch64ISD::SPLICE:
+ return performSpliceCombine(N, DAG);
case AArch64ISD::UZP1:
return performUzpCombine(N, DAG);
+ case AArch64ISD::SETCC_MERGE_ZERO:
+ return performSetccMergeZeroCombine(N, DAG);
+ case AArch64ISD::GLD1_MERGE_ZERO:
+ case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_IMM_MERGE_ZERO:
+ case AArch64ISD::GLD1S_MERGE_ZERO:
+ case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
+ return performGLD1Combine(N, DAG);
+ case AArch64ISD::VASHR:
+ case AArch64ISD::VLSHR:
+ return performVectorShiftCombine(N, *this, DCI);
case ISD::INSERT_VECTOR_ELT:
- return performPostLD1Combine(N, DCI, true);
+ return performInsertVectorEltCombine(N, DCI);
case ISD::EXTRACT_VECTOR_ELT:
return performExtractVectorEltCombine(N, DAG);
case ISD::VECREDUCE_ADD:
@@ -15882,6 +16908,24 @@
LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
return DAG.getMergeValues({Result, Chain}, DL);
}
+ case Intrinsic::aarch64_rndr:
+ case Intrinsic::aarch64_rndrrs: {
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ auto Register =
+ (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
+ : AArch64SysReg::RNDRRS);
+ SDLoc DL(N);
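+    // MRS reads the random-number register and also produces NZCV via glue;
+    // CSINC turns the flags into the i1 status value the intrinsic returns.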
+ SDValue A = DAG.getNode(
+ AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
+ N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
+ SDValue B = DAG.getNode(
+ AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
+ return DAG.getMergeValues(
+ {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
+ }
default:
break;
}
@@ -16008,13 +17052,22 @@
return true;
}
-static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) {
+void AArch64TargetLowering::ReplaceBITCASTResults(
+ SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
SDLoc DL(N);
SDValue Op = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = Op.getValueType();
- if (N->getValueType(0) != MVT::i16 ||
- (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16))
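+  // Scalable fp->int bitcasts with an illegal result type are lowered via the
+  // corresponding SVE container type and then truncated to the requested type.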
+ if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
+ assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
+ "Expected fp->int bitcast!");
+ SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
+ return;
+ }
+
+ if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
return;
Op = SDValue(
@@ -16108,6 +17161,7 @@
assert(N->getValueType(0) == MVT::i128 &&
"AtomicCmpSwap on types less than 128 should be legal");
+ MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
// LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
// so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
@@ -16118,10 +17172,8 @@
N->getOperand(0), // Chain in
};
- MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
-
unsigned Opcode;
- switch (MemOp->getOrdering()) {
+ switch (MemOp->getMergedOrdering()) {
case AtomicOrdering::Monotonic:
Opcode = AArch64::CASPX;
break;
@@ -16156,15 +17208,32 @@
return;
}
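+  // Without LSE, select the CMP_SWAP_128 pseudo that matches the cmpxchg's
+  // merged memory ordering.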
+ unsigned Opcode;
+ switch (MemOp->getMergedOrdering()) {
+ case AtomicOrdering::Monotonic:
+ Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
+ break;
+ case AtomicOrdering::Acquire:
+ Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
+ break;
+ case AtomicOrdering::Release:
+ Opcode = AArch64::CMP_SWAP_128_RELEASE;
+ break;
+ case AtomicOrdering::AcquireRelease:
+ case AtomicOrdering::SequentiallyConsistent:
+ Opcode = AArch64::CMP_SWAP_128;
+ break;
+ default:
+ llvm_unreachable("Unexpected ordering!");
+ }
+
auto Desired = splitInt128(N->getOperand(2), DAG);
auto New = splitInt128(N->getOperand(3), DAG);
SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
New.first, New.second, N->getOperand(0)};
SDNode *CmpSwap = DAG.getMachineNode(
- AArch64::CMP_SWAP_128, SDLoc(N),
- DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
-
- MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
+ Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
+ Ops);
DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
@@ -16242,6 +17311,10 @@
case ISD::EXTRACT_SUBVECTOR:
ReplaceExtractSubVectorResults(N, Results, DAG);
return;
+ case ISD::INSERT_SUBVECTOR:
+ // Custom lowering has been requested for INSERT_SUBVECTOR -- but delegate
+ // to common code for result type legalisation
+ return;
case ISD::INTRINSIC_WO_CHAIN: {
EVT VT = N->getValueType(0);
assert((VT == MVT::i8 || VT == MVT::i16) &&
@@ -16381,19 +17454,26 @@
// can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
return AtomicExpansionKind::None;
+
+ // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
+ // it.
+ unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
+ if (Size > 64)
+ return AtomicExpansionKind::None;
+
return AtomicExpansionKind::LLSC;
}
-Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
+ Type *ValueTy, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
bool IsAcquire = isAcquireOrStronger(Ord);
// Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
// intrinsic must return {i64, i64} and we have to recombine them into a
// single i128 here.
- if (ValTy->getPrimitiveSizeInBits() == 128) {
+ if (ValueTy->getPrimitiveSizeInBits() == 128) {
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
Function *Ldxr = Intrinsic::getDeclaration(M, Int);
@@ -16403,10 +17483,10 @@
Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
- Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
- Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+ Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
+ Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
return Builder.CreateOr(
- Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
+ Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
}
Type *Tys[] = { Addr->getType() };
@@ -16414,22 +17494,20 @@
IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
- Type *EltTy = cast<PointerType>(Addr->getType())->getElementType();
-
const DataLayout &DL = M->getDataLayout();
- IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy));
+ IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
- return Builder.CreateBitCast(Trunc, EltTy);
+ return Builder.CreateBitCast(Trunc, ValueTy);
}
void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
- IRBuilder<> &Builder) const {
+ IRBuilderBase &Builder) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
}
-Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
+Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
Value *Val, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -16466,15 +17544,17 @@
}
bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
- Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
- if (Ty->isArrayTy())
- return true;
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg,
+ const DataLayout &DL) const {
+ if (!Ty->isArrayTy()) {
+ const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
+ return TySize.isScalable() && TySize.getKnownMinSize() > 128;
+ }
- const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
- if (TySize.isScalable() && TySize.getKnownMinSize() > 128)
- return true;
-
- return false;
+  // All non-aggregate members of the type must have the same type
+ SmallVector<EVT> ValueVTs;
+ ComputeValueVTs(*this, DL, Ty, ValueVTs);
+ return is_splat(ValueVTs);
}
bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
@@ -16482,7 +17562,7 @@
return false;
}
-static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
+static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Function *ThreadPointerFunc =
Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
@@ -16492,7 +17572,7 @@
IRB.getInt8PtrTy()->getPointerTo(0));
}
-Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
// Android provides a fixed TLS slot for the stack cookie. See the definition
// of TLS_SLOT_STACK_GUARD in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
@@ -16541,7 +17621,8 @@
return TargetLowering::getSSPStackGuardCheck(M);
}
-Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+Value *
+AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
@@ -16866,6 +17947,66 @@
return DAG.getMergeValues(MergedValues, DL);
}
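+// Build an SVE predicate from a fixed-length vector mask by promoting the mask
+// to a scalable container and comparing it against zero.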
+static SDValue convertFixedMaskToScalableVector(SDValue Mask,
+ SelectionDAG &DAG) {
+ SDLoc DL(Mask);
+ EVT InVT = Mask.getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+
+ auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
+ auto Op2 = DAG.getConstant(0, DL, ContainerVT);
+ auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
+
+ EVT CmpVT = Pg.getValueType();
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
+ {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
+}
+
+// Convert all fixed length vector masked loads larger than NEON to SVE masked
+// loads.
+SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ auto Load = cast<MaskedLoadSDNode>(Op);
+
+ if (Load->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD)
+ return SDValue();
+
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG);
+
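+  // The SVE masked load is given a zero (or undef) pass-through; if the
+  // original pass-through was anything else, it is merged back in with a
+  // select after the load.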
+ SDValue PassThru;
+ bool IsPassThruZeroOrUndef = false;
+
+ if (Load->getPassThru()->isUndef()) {
+ PassThru = DAG.getUNDEF(ContainerVT);
+ IsPassThruZeroOrUndef = true;
+ } else {
+ if (ContainerVT.isInteger())
+ PassThru = DAG.getConstant(0, DL, ContainerVT);
+ else
+ PassThru = DAG.getConstantFP(0, DL, ContainerVT);
+ if (isZerosVector(Load->getPassThru().getNode()))
+ IsPassThruZeroOrUndef = true;
+ }
+
+ auto NewLoad = DAG.getMaskedLoad(
+ ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
+ Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
+ Load->getAddressingMode(), Load->getExtensionType());
+
+ if (!IsPassThruZeroOrUndef) {
+ SDValue OldPassThru =
+ convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
+ NewLoad = DAG.getSelect(DL, ContainerVT, Mask, NewLoad, OldPassThru);
+ }
+
+ auto Result = convertFromScalableVector(DAG, VT, NewLoad);
+ SDValue MergedValues[2] = {Result, Load->getChain()};
+ return DAG.getMergeValues(MergedValues, DL);
+}
+
// Convert all fixed length vector stores larger than NEON to masked_stores.
SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
SDValue Op, SelectionDAG &DAG) const {
@@ -16883,6 +18024,26 @@
Store->isTruncatingStore());
}
+SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ auto Store = cast<MaskedStoreSDNode>(Op);
+
+ if (Store->isTruncatingStore())
+ return SDValue();
+
+ SDLoc DL(Op);
+ EVT VT = Store->getValue().getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
+ SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
+
+ return DAG.getMaskedStore(
+ Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
+ Mask, Store->getMemoryVT(), Store->getMemOperand(),
+ Store->getAddressingMode(), Store->isTruncatingStore());
+}
+
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -16902,6 +18063,16 @@
EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT);
+ // If this is not a full vector, extend, div, and truncate it.
+ EVT WidenedVT = VT.widenIntegerVectorElementType(*DAG.getContext());
+ if (DAG.getTargetLoweringInfo().isTypeLegal(WidenedVT)) {
+ unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(0));
+ SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(1));
+ SDValue Div = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0, Op1);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
+ }
+
// Convert the operands to scalable vectors.
SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
@@ -17005,6 +18176,35 @@
return convertFromScalableVector(DAG, VT, Val);
}
+SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
+ SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ EVT InVT = Op.getOperand(0).getValueType();
+ assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+ SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
+}
+
+SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
+ SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ EVT InVT = Op.getOperand(0).getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+ SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
+
+ auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
+ Op.getOperand(1), Op.getOperand(2));
+
+ return convertFromScalableVector(DAG, VT, ScalableRes);
+}
+
// Convert vector operation 'Op' to an equivalent predicated operation whereby
// the original operation's type is used to construct a suitable predicate.
// NOTE: The results for inactive lanes are undefined.
@@ -17221,10 +18421,6 @@
assert(Op.getValueType() == InVT.changeTypeToInteger() &&
"Expected integer result of the same bit length as the inputs!");
- // Expand floating point vector comparisons.
- if (InVT.isFloatingPoint())
- return SDValue();
-
auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
@@ -17238,6 +18434,229 @@
return convertFromScalableVector(DAG, Op.getValueType(), Promote);
}
+SDValue
+AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ auto SrcOp = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+ EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT ContainerSrcVT =
+ getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
+
+ SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
+ Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
+ return convertFromScalableVector(DAG, VT, Op);
+}
+
+SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ unsigned NumOperands = Op->getNumOperands();
+
+ assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
+ "Unexpected number of operands in CONCAT_VECTORS");
+
+ auto SrcOp1 = Op.getOperand(0);
+ auto SrcOp2 = Op.getOperand(1);
+ EVT VT = Op.getValueType();
+ EVT SrcVT = SrcOp1.getValueType();
+
+ if (NumOperands > 2) {
+ SmallVector<SDValue, 4> Ops;
+ EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
+ for (unsigned I = 0; I < NumOperands; I += 2)
+ Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
+ Op->getOperand(I), Op->getOperand(I + 1)));
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
+ }
+
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
+ SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
+ SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
+
+ Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
+
+ return convertFromScalableVector(DAG, VT, Op);
+}
+
+SDValue
+AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ SDValue Pg = getPredicateForVector(DAG, DL, VT);
+ EVT SrcVT = Val.getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT ExtendVT = ContainerVT.changeVectorElementType(
+ SrcVT.getVectorElementType());
+
+ Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
+
+ Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
+ Val = getSVESafeBitCast(ExtendVT, Val, DAG);
+ Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
+ Pg, Val, DAG.getUNDEF(ContainerVT));
+
+ return convertFromScalableVector(DAG, VT, Val);
+}
+
+SDValue
+AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT SrcVT = Val.getValueType();
+ EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
+ EVT RoundVT = ContainerSrcVT.changeVectorElementType(
+ VT.getVectorElementType());
+ SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
+
+ Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
+ Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
+ Op.getOperand(1), DAG.getUNDEF(RoundVT));
+ Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
+ Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
+
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Val);
+}
+
+SDValue
+AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
+ unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
+ : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT SrcVT = Val.getValueType();
+ EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
+
+ if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
+ ContainerDstVT.getVectorElementType().getSizeInBits()) {
+ SDValue Pg = getPredicateForVector(DAG, DL, VT);
+
+ Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
+ VT.changeTypeToInteger(), Val);
+
+ Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
+ Val = getSVESafeBitCast(ContainerDstVT.changeTypeToInteger(), Val, DAG);
+    // Safe to use a larger-than-specified operand since we just unpacked the
+ // data, hence the upper bits are zero.
+ Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
+ DAG.getUNDEF(ContainerDstVT));
+ return convertFromScalableVector(DAG, VT, Val);
+ } else {
+ EVT CvtVT = ContainerSrcVT.changeVectorElementType(
+ ContainerDstVT.getVectorElementType());
+ SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
+
+ Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
+ Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
+ Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
+ Val = convertFromScalableVector(DAG, SrcVT, Val);
+
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Val);
+ }
+}
+
+SDValue
+AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
+ unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
+ : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT SrcVT = Val.getValueType();
+ EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
+
+ if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
+ ContainerDstVT.getVectorElementType().getSizeInBits()) {
+ EVT CvtVT = ContainerDstVT.changeVectorElementType(
+ ContainerSrcVT.getVectorElementType());
+ SDValue Pg = getPredicateForVector(DAG, DL, VT);
+
+ Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
+
+ Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
+ Val = getSVESafeBitCast(CvtVT, Val, DAG);
+ Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
+ DAG.getUNDEF(ContainerDstVT));
+ return convertFromScalableVector(DAG, VT, Val);
+ } else {
+ EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
+ SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
+
+    // Safe to use a larger-than-specified result since an fp_to_int where the
+ // result doesn't fit into the destination is undefined.
+ Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
+ Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
+ Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
+
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
+ }
+}
+
+SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ auto ShuffleMask = SVN->getMask();
+
+ SDLoc DL(Op);
+ SDValue Op1 = Op.getOperand(0);
+ SDValue Op2 = Op.getOperand(1);
+
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
+ Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
+
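+  // An EXT mask with immediate NumElts-1 takes the last element of the first
+  // source followed by the leading elements of the second, which maps to INSR.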
+ bool ReverseEXT = false;
+ unsigned Imm;
+ if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
+ Imm == VT.getVectorNumElements() - 1) {
+ if (ReverseEXT)
+ std::swap(Op1, Op2);
+
+ EVT ScalarTy = VT.getVectorElementType();
+ if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
+ ScalarTy = MVT::i32;
+ SDValue Scalar = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
+ DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
+ Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
+ return convertFromScalableVector(DAG, VT, Op);
+ }
+
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -17260,8 +18679,6 @@
EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
- assert((VT == PackedVT || InVT == PackedInVT) &&
- "Cannot cast between unpacked scalable vector types!");
// Pack input if required.
if (InVT != PackedInVT)
@@ -17275,3 +18692,60 @@
return Op;
}
+
+bool AArch64TargetLowering::isAllActivePredicate(SDValue N) const {
+ return ::isAllActivePredicate(N);
+}
+
+EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
+ return ::getPromotedVTForPredicate(VT);
+}
+
+bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
+ SDValue Op, const APInt &OriginalDemandedBits,
+ const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+ unsigned Depth) const {
+
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case AArch64ISD::VSHL: {
+ // Match (VSHL (VLSHR Val X) X)
+ SDValue ShiftL = Op;
+ SDValue ShiftR = Op->getOperand(0);
+ if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
+ return false;
+
+ if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
+ return false;
+
+ unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
+ unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
+
+ // Other cases can be handled as well, but this is not
+ // implemented.
+ if (ShiftRBits != ShiftLBits)
+ return false;
+
+ unsigned ScalarSize = Op.getScalarValueSizeInBits();
+ assert(ScalarSize > ShiftLBits && "Invalid shift imm");
+
+ APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
+ APInt UnusedBits = ~OriginalDemandedBits;
+
+ if ((ZeroBits & UnusedBits) != ZeroBits)
+ return false;
+
+ // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
+ // used - simplify to just Val.
+ return TLO.CombineTo(Op, ShiftR->getOperand(0));
+ }
+ }
+
+ return TargetLowering::SimplifyDemandedBitsForTargetNode(
+ Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
+}
+
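+// A constant unsigned bitfield extract is only treated as legal when both
+// types are the same 32- or 64-bit scalar.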
+bool AArch64TargetLowering::isConstantUnsignedBitfieldExtactLegal(
+ unsigned Opc, LLT Ty1, LLT Ty2) const {
+ return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
+}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/src/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 9550197..2b33725 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -16,6 +16,7 @@
#include "AArch64.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CallingConv.h"
@@ -50,6 +51,10 @@
WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
CALL, // Function call.
+  // Pseudo for an OBJC call that gets emitted together with a special `mov
+ // x29, x29` marker instruction.
+ CALL_RVMARKER,
+
// Produces the full sequence of instructions for getting the thread pointer
// offset of a variable into X0, using the TLSDesc model.
TLSDESC_CALLSEQ,
@@ -61,7 +66,6 @@
RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand.
BRCOND, // Conditional branch instruction; "b.cond".
CSEL,
- FCSEL, // Conditional move instruction.
CSINV, // Conditional select invert.
CSNEG, // Conditional select negate.
CSINC, // Conditional select increment.
@@ -79,9 +83,13 @@
FMA_PRED,
FMAXNM_PRED,
FMINNM_PRED,
+ FMAX_PRED,
+ FMIN_PRED,
FMUL_PRED,
FSUB_PRED,
MUL_PRED,
+ MULHS_PRED,
+ MULHU_PRED,
SDIV_PRED,
SHL_PRED,
SMAX_PRED,
@@ -93,6 +101,9 @@
UMAX_PRED,
UMIN_PRED,
+ // Unpredicated vector instructions
+ BIC,
+
// Predicated instructions with the result of inactive lanes provided by the
// last operand.
FABS_MERGE_PASSTHRU,
@@ -161,9 +172,6 @@
// element must be identical.
BSP,
- // Vector arithmetic negation
- NEG,
-
// Vector shuffles
ZIP1,
ZIP2,
@@ -175,6 +183,7 @@
REV32,
REV64,
EXT,
+ SPLICE,
// Vector shift by scalar
VSHL,
@@ -227,9 +236,12 @@
SRHADD,
URHADD,
- // Absolute difference
- UABD,
- SABD,
+ // Unsigned Add Long Pairwise
+ UADDLP,
+
+ // udot/sdot instructions
+ UDOT,
+ SDOT,
// Vector across-lanes min/max
// Only the lower result lane is defined.
@@ -274,6 +286,8 @@
/// mode without emitting such REV instructions.
NVCAST,
+ MRS, // MRS, also sets the flags via a glue.
+
SMULL,
UMULL,
@@ -292,7 +306,6 @@
CLASTB_N,
LASTA,
LASTB,
- REV,
TBL,
// Floating-point reductions.
@@ -317,6 +330,10 @@
// Cast between vectors of the same element type but differ in length.
REINTERPRET_CAST,
+ // Nodes to build an LD64B / ST64B 64-bit quantity out of i64, and vice versa
+ LS64_BUILD,
+ LS64_EXTRACT,
+
LD1_MERGE_ZERO,
LD1S_MERGE_ZERO,
LDNF1_MERGE_ZERO,
@@ -425,10 +442,6 @@
LDP,
STP,
STNP,
-
- // Pseudo for a OBJC call that gets emitted together with a special `mov
- // x29, x29` marker instruction.
- CALL_RVMARKER
};
} // end namespace AArch64ISD
@@ -440,16 +453,31 @@
// be copying from a truncate. But any other 32-bit operation will zero-extend
// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper
// 32 bits, they're probably just qualifying a CopyFromReg.
-// FIXME: X86 also checks for CMOV here. Do we need something similar?
static inline bool isDef32(const SDNode &N) {
unsigned Opc = N.getOpcode();
return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
Opc != ISD::CopyFromReg && Opc != ISD::AssertSext &&
- Opc != ISD::AssertZext;
+ Opc != ISD::AssertZext && Opc != ISD::AssertAlign &&
+ Opc != ISD::FREEZE;
}
} // end anonymous namespace
+namespace AArch64 {
+/// Possible values of current rounding mode, which is specified in bits
+/// 23:22 of FPCR.
+enum Rounding {
+ RN = 0, // Round to Nearest
+ RP = 1, // Round towards Plus infinity
+ RM = 2, // Round towards Minus infinity
+ RZ = 3, // Round towards Zero
+ rmMask = 3 // Bit mask selecting rounding mode
+};
+
+// Bit position of rounding mode bits in FPCR.
+const unsigned RoundingBitsPos = 22;
+} // namespace AArch64
+
class AArch64Subtarget;
class AArch64TargetMachine;
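// A minimal sketch of how the Rounding enum and RoundingBitsPos declared above
// are intended to be combined when decoding an FPCR value; the FPCR constant
// here is just an example value, not read from hardware.
#include <cassert>
#include <cstdint>

int main() {
  const unsigned RoundingBitsPos = 22; // FPCR bits 23:22 hold the rounding mode
  const unsigned rmMask = 3;
  const uint64_t FPCR = 2ull << RoundingBitsPos; // mode field set to RM
  unsigned Mode = (FPCR >> RoundingBitsPos) & rmMask;
  assert(Mode == 2); // AArch64::RM, round towards minus infinity
  return 0;
}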
@@ -488,7 +516,7 @@
/// Returns true if the target allows unaligned memory accesses of the
/// specified type.
bool allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace = 0, unsigned Align = 1,
+ EVT VT, unsigned AddrSpace = 0, Align Alignment = Align(1),
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *Fast = nullptr) const override;
/// LLT variant.
@@ -586,8 +614,8 @@
/// of the specified type.
/// If the AM is supported, the return value must be >= 0.
/// If the AM is not supported, it returns a negative value.
- int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
- unsigned AS) const override;
+ InstructionCost getScalingFactorCost(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
/// Return true if an FMA operation is faster than a pair of fmul and fadd
/// instructions. fmuladd intrinsics will be expanded to FMAs when this method
@@ -596,6 +624,9 @@
EVT VT) const override;
bool isFMAFasterThanFMulAndFAdd(const Function &F, Type *Ty) const override;
+ bool generateFMAsInMachineCombiner(EVT VT,
+ CodeGenOpt::Level OptLevel) const override;
+
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
/// Returns false if N is a bit extraction pattern of (X >> C) & Mask.
@@ -619,12 +650,12 @@
return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
}
- Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+ Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr,
AtomicOrdering Ord) const override;
- Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
- Value *Addr, AtomicOrdering Ord) const override;
+ Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr,
+ AtomicOrdering Ord) const override;
- void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
+ void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
@@ -641,7 +672,7 @@
/// If the target has a standard location for the stack protector cookie,
/// returns the address of that location. Otherwise, returns nullptr.
- Value *getIRStackGuard(IRBuilder<> &IRB) const override;
+ Value *getIRStackGuard(IRBuilderBase &IRB) const override;
void insertSSPDeclarations(Module &M) const override;
Value *getSDagStackGuard(const Module &M) const override;
@@ -649,7 +680,7 @@
/// If the target has a standard location for the unsafe stack pointer,
/// returns the address of that location. Otherwise, returns nullptr.
- Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+ Value *getSafeStackPointerLocation(IRBuilderBase &IRB) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
@@ -770,9 +801,10 @@
MachineMemOperand::Flags getTargetMMOFlags(
const Instruction &I) const override;
- bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
- CallingConv::ID CallConv,
- bool isVarArg) const override;
+ bool functionArgumentNeedsConsecutiveRegisters(
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg,
+ const DataLayout &DL) const override;
+
/// Used for exception handling on Win64.
bool needsFixedCatchObjects() const override;
@@ -786,6 +818,19 @@
/// vector types this override can be removed.
bool mergeStoresAfterLegalization(EVT VT) const override;
+ // If the platform/function should have a redzone, return the size in bytes.
+ unsigned getRedZoneSize(const Function &F) const {
+ if (F.hasFnAttribute(Attribute::NoRedZone))
+ return 0;
+ return 128;
+ }
+
+ bool isAllActivePredicate(SDValue N) const;
+ EVT getPromotedVTForPredicate(EVT VT) const;
+
+ EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty,
+ bool AllowUnknown = false) const override;
+
private:
/// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
@@ -793,7 +838,7 @@
bool isExtFreeImpl(const Instruction *Ext) const override;
- void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
+ void addTypeForNEON(MVT VT);
void addTypeForFixedLengthSVE(MVT VT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
@@ -814,12 +859,15 @@
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const;
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(
@@ -896,6 +944,7 @@
SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -906,27 +955,30 @@
SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
bool OverrideNEON = false) const;
SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBitreverse(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
@@ -944,6 +996,7 @@
SDValue LowerFixedLengthVectorIntExtendToSVE(SDValue Op,
SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthVectorMLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, SelectionDAG &DAG) const;
SDValue LowerPredReductionToSVE(SDValue ScalarOp, SelectionDAG &DAG) const;
SDValue LowerReductionToSVE(unsigned Opcode, SDValue ScalarOp,
@@ -951,8 +1004,21 @@
SDValue LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorSetccToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthVectorMStoreToSVE(SDValue Op,
+ SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorTruncateToSVE(SDValue Op,
SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthExtractVectorElt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthInsertVectorElt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthBitcastToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthConcatVectorsToSVE(SDValue Op,
+ SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthFPExtendToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthFPRoundToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthIntToFPToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthVECTOR_SHUFFLEToSVE(SDValue Op,
+ SelectionDAG &DAG) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
@@ -996,6 +1062,7 @@
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
+ bool shouldExtendGSIndex(EVT VT, EVT &EltTy) const override;
bool shouldRemoveExtendFromGSIndex(EVT VT) const override;
bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
@@ -1012,6 +1079,8 @@
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
+ void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
void ReplaceExtractSubVectorResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
@@ -1023,6 +1092,13 @@
bool shouldLocalize(const MachineInstr &MI,
const TargetTransformInfo *TTI) const override;
+ bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+ const APInt &OriginalDemandedBits,
+ const APInt &OriginalDemandedElts,
+ KnownBits &Known,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const override;
+
// Normally SVE is only used for byte size vectors that do not fit within a
// NEON vector. This changes when OverrideNEON is true, allowing SVE to be
// used for 64bit and 128bit vectors as well.
@@ -1038,6 +1114,9 @@
// to transition between unpacked and packed types of the same element type,
// with BITCAST used otherwise.
SDValue getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const;
+
+ bool isConstantUnsignedBitfieldExtactLegal(unsigned Opc, LLT Ty1,
+ LLT Ty2) const override;
};
namespace AArch64 {
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index 27e1d8e..84573da 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -429,11 +429,16 @@
}
let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $scratch",
- mayLoad = 1, mayStore = 1 in
-def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch),
- (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
- GPR64:$newLo, GPR64:$newHi), []>,
- Sched<[WriteAtomic]>;
+ mayLoad = 1, mayStore = 1 in {
+class cmp_swap_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32common:$scratch),
+ (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
+ GPR64:$newLo, GPR64:$newHi), []>,
+ Sched<[WriteAtomic]>;
+def CMP_SWAP_128 : cmp_swap_128;
+def CMP_SWAP_128_RELEASE : cmp_swap_128;
+def CMP_SWAP_128_ACQUIRE : cmp_swap_128;
+def CMP_SWAP_128_MONOTONIC : cmp_swap_128;
+}
// v8.1 Atomic instructions:
let Predicates = [HasLSE] in {
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index cf08f56..9bc2539 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -36,6 +36,7 @@
def DestructiveBinaryComm : DestructiveInstTypeEnum<6>;
def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>;
def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>;
+def DestructiveUnaryPassthru : DestructiveInstTypeEnum<9>;
class FalseLanesEnum<bits<2> val> {
bits<2> Value = val;
@@ -721,6 +722,7 @@
}
def Imm0_1Operand : AsmImmRange<0, 1>;
+def Imm0_3Operand : AsmImmRange<0, 3>;
def Imm0_7Operand : AsmImmRange<0, 7>;
def Imm0_15Operand : AsmImmRange<0, 15>;
def Imm0_31Operand : AsmImmRange<0, 31>;
@@ -890,6 +892,12 @@
let ParserMatchClass = Imm0_63Operand;
}
+def timm0_63 : Operand<i64>, TImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 64;
+}]> {
+ let ParserMatchClass = Imm0_63Operand;
+}
+
// imm0_31 predicate - True if the immediate is in the range [0,31]
def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 32;
@@ -940,6 +948,13 @@
let ParserMatchClass = Imm0_7Operand;
}
+// imm0_3 predicate - True if the immediate is in the range [0,3]
+def imm0_3 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 4;
+}]> {
+ let ParserMatchClass = Imm0_3Operand;
+}
+
// imm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7]
def imm32_0_7 : Operand<i32>, TImmLeaf<i32, [{
return ((uint32_t)Imm) < 8;
@@ -1149,36 +1164,44 @@
GIComplexPatternEquiv<arith_extended_reg32to64_i64>;
// Floating-point immediate.
-def fpimm16 : Operand<f16>,
- FPImmLeaf<f16, [{
- return AArch64_AM::getFP16Imm(Imm) != -1;
- }], SDNodeXForm<fpimm, [{
+
+def fpimm16XForm : SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = AArch64_AM::getFP16Imm(InVal);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
- }]>> {
- let ParserMatchClass = FPImmOperand;
- let PrintMethod = "printFPImmOperand";
-}
-def fpimm32 : Operand<f32>,
- FPImmLeaf<f32, [{
- return AArch64_AM::getFP32Imm(Imm) != -1;
- }], SDNodeXForm<fpimm, [{
+ }]>;
+
+def fpimm32XForm : SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = AArch64_AM::getFP32Imm(InVal);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
- }]>> {
+ }]>;
+
+def fpimm64XForm : SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = AArch64_AM::getFP64Imm(InVal);
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+ }]>;
+
+def fpimm16 : Operand<f16>,
+ FPImmLeaf<f16, [{
+ return AArch64_AM::getFP16Imm(Imm) != -1;
+ }], fpimm16XForm> {
+ let ParserMatchClass = FPImmOperand;
+ let PrintMethod = "printFPImmOperand";
+}
+
+def fpimm32 : Operand<f32>,
+ FPImmLeaf<f32, [{
+ return AArch64_AM::getFP32Imm(Imm) != -1;
+ }], fpimm32XForm> {
let ParserMatchClass = FPImmOperand;
let PrintMethod = "printFPImmOperand";
}
def fpimm64 : Operand<f64>,
FPImmLeaf<f64, [{
return AArch64_AM::getFP64Imm(Imm) != -1;
- }], SDNodeXForm<fpimm, [{
- APFloat InVal = N->getValueAPF();
- uint32_t enc = AArch64_AM::getFP64Imm(InVal);
- return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
- }]>> {
+ }], fpimm64XForm> {
let ParserMatchClass = FPImmOperand;
let PrintMethod = "printFPImmOperand";
}
@@ -1192,6 +1215,13 @@
return Imm.isExactlyValue(+0.0);
}]>;
+def gi_fpimm16 : GICustomOperandRenderer<"renderFPImm16">,
+ GISDNodeXFormEquiv<fpimm16XForm>;
+def gi_fpimm32 : GICustomOperandRenderer<"renderFPImm32">,
+ GISDNodeXFormEquiv<fpimm32XForm>;
+def gi_fpimm64 : GICustomOperandRenderer<"renderFPImm64">,
+ GISDNodeXFormEquiv<fpimm64XForm>;
+
// Vector lane operands
class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass {
let Name = NamePrefix # "IndexRange" # Min # "_" # Max;
@@ -1462,7 +1492,7 @@
// "bti" is an alias to "hint" only for certain values of CRm:Op2 fields.
if (!MCOp.isImm())
return false;
- return AArch64BTIHint::lookupBTIByEncoding((MCOp.getImm() ^ 32) >> 1) != nullptr;
+ return AArch64BTIHint::lookupBTIByEncoding(MCOp.getImm() ^ 32) != nullptr;
}];
}
@@ -1471,6 +1501,11 @@
bits<16> systemreg;
let Inst{20-5} = systemreg;
let DecoderNamespace = "Fallback";
+ // The MRS is modeled as an NZCV-setting instruction. Not all MRS instructions
+ // require this; the alternative was to model each one explicitly, but that
+ // seems unnecessary since setting the flags for all of them has no negative
+ // consequences.
+ let Defs = [NZCV];
}
// FIXME: Some of these def NZCV, others don't. Best way to model that?
@@ -1931,7 +1966,8 @@
}
class SignAuthOneData<bits<3> opcode_prefix, bits<2> opcode, string asm>
- : I<(outs GPR64:$Rd), (ins GPR64sp:$Rn), asm, "\t$Rd, $Rn", "",
+ : I<(outs GPR64:$Rd), (ins GPR64:$src, GPR64sp:$Rn), asm, "\t$Rd, $Rn",
+ "$Rd = $src",
[]>,
Sched<[WriteI, ReadI]> {
bits<5> Rd;
@@ -1944,7 +1980,8 @@
}
class SignAuthZero<bits<3> opcode_prefix, bits<2> opcode, string asm>
- : I<(outs GPR64:$Rd), (ins), asm, "\t$Rd", "", []>, Sched<[]> {
+ : I<(outs GPR64:$Rd), (ins GPR64:$src), asm, "\t$Rd", "$Rd = $src",
+ []>, Sched<[]> {
bits<5> Rd;
let Inst{31-15} = 0b11011010110000010;
let Inst{14-12} = opcode_prefix;
@@ -3099,6 +3136,13 @@
def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>;
def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>;
+// (unsigned immediate)
+// Indexed for 8-bit registers. offset is in range [0,63].
+def am_indexed8_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<1,63>", []>;
+def am_indexed16_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<2,63>", []>;
+def am_indexed32_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<4,63>", []>;
+def am_indexed64_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<8,63>", []>;
+
def gi_am_indexed8 :
GIComplexOperandMatcher<s64, "selectAddrModeIndexed<8>">,
GIComplexPatternEquiv<am_indexed8>;
@@ -3162,7 +3206,7 @@
let DecoderMethod = "DecodeUnsignedLdStInstruction";
}
-multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def ui : BaseLoadStoreUI<sz, V, opc, (outs regtype:$Rt),
@@ -3174,7 +3218,7 @@
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
-multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def ui : BaseLoadStoreUI<sz, V, opc, (outs),
@@ -3377,7 +3421,7 @@
def ro128 : ROAddrMode<ro_Windexed128, ro_Xindexed128, ro_Wextend128,
ro_Xextend128>;
-class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -3399,11 +3443,11 @@
let Inst{4-0} = Rt;
}
-class ROInstAlias<string asm, RegisterOperand regtype, Instruction INST>
+class ROInstAlias<string asm, DAGOperand regtype, Instruction INST>
: InstAlias<asm # "\t$Rt, [$Rn, $Rm]",
(INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
-multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore8RO<sz, V, opc, regtype, asm,
@@ -3430,7 +3474,7 @@
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
@@ -3455,7 +3499,7 @@
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -3477,7 +3521,7 @@
let Inst{4-0} = Rt;
}
-multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
@@ -3502,7 +3546,7 @@
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
@@ -3527,7 +3571,7 @@
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -3549,7 +3593,7 @@
let Inst{4-0} = Rt;
}
-multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
@@ -3574,7 +3618,7 @@
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
@@ -3599,7 +3643,7 @@
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -3621,7 +3665,7 @@
let Inst{4-0} = Rt;
}
-multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
@@ -3646,7 +3690,7 @@
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
@@ -3671,7 +3715,7 @@
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -3693,7 +3737,7 @@
let Inst{4-0} = Rt;
}
-multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
@@ -3718,7 +3762,7 @@
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
@@ -3834,7 +3878,7 @@
// Armv8.4 LDAPR & STLR with Immediate Offset instruction
multiclass BaseLoadUnscaleV84<string asm, bits<2> sz, bits<2> opc,
- RegisterOperand regtype > {
+ DAGOperand regtype > {
def i : BaseLoadStoreUnscale<sz, 0, opc, (outs regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset), asm, []>,
Sched<[WriteST]> {
@@ -3846,7 +3890,7 @@
}
multiclass BaseStoreUnscaleV84<string asm, bits<2> sz, bits<2> opc,
- RegisterOperand regtype > {
+ DAGOperand regtype > {
def i : BaseLoadStoreUnscale<sz, 0, opc, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
asm, []>,
@@ -3858,7 +3902,7 @@
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
-multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, list<dag> pattern> {
let AddedComplexity = 1 in // try this before LoadUI
def i : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt),
@@ -3869,7 +3913,7 @@
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
-multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
+multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, list<dag> pattern> {
let AddedComplexity = 1 in // try this before StoreUI
def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
@@ -4190,7 +4234,7 @@
let DecoderMethod = "DecodePairLdStInstruction";
}
-multiclass LoadPairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+multiclass LoadPairNoAlloc<bits<2> opc, bit V, DAGOperand regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
def i : BaseLoadStorePairNoAlloc<opc, V, 1,
@@ -4204,7 +4248,7 @@
GPR64sp:$Rn, 0)>;
}
-multiclass StorePairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+multiclass StorePairNoAlloc<bits<2> opc, bit V, DAGOperand regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in
def i : BaseLoadStorePairNoAlloc<opc, V, 0, (outs),
@@ -4612,7 +4656,7 @@
class BaseIntegerToFPUnscaled<bit isUnsigned,
RegisterClass srcType, RegisterClass dstType,
- ValueType dvt, string asm, SDNode node>
+ ValueType dvt, string asm, SDPatternOperator node>
: I<(outs dstType:$Rd), (ins srcType:$Rn),
asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>,
Sched<[WriteFCvt]> {
@@ -4627,7 +4671,7 @@
let Inst{4-0} = Rd;
}
-multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
+multiclass IntegerToFP<bit isUnsigned, string asm, SDPatternOperator node> {
// Unscaled
def UWHri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR16, f16, asm, node> {
let Inst{31} = 0; // 32-bit GPR flag
@@ -8932,10 +8976,13 @@
SDPatternOperator OpNode> {
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftL64, asm,
- [(set (v1i64 FPR64:$Rd),
- (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
+ [(set (i64 FPR64:$Rd),
+ (OpNode (i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
let Inst{21-16} = imm{5-0};
}
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))),
+ (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>;
}
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
@@ -10496,7 +10543,7 @@
(v2i32 (AArch64duplane32
(v4i32 V128:$Rm),
VectorIndexS:$idx)))),
- (i32 0))),
+ (i64 0))),
(i64 0))))),
(EXTRACT_SUBREG
(v2i32 (!cast<Instruction>(NAME # v2i32_indexed)
@@ -10912,8 +10959,8 @@
}
class CryptoRRTied<bits<1>op0, bits<2>op1, string asm, string asmops>
- : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm, asmops,
- "$Vm = $Vd", []> {
+ : BaseCryptoV82<(outs V128:$Vdst), (ins V128:$Vd, V128:$Vn), asm, asmops,
+ "$Vd = $Vdst", []> {
let Inst{31-25} = 0b1100111;
let Inst{24-21} = 0b0110;
let Inst{20-15} = 0b000001;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 25656fa..58b6dca 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -132,28 +132,81 @@
def G_EXT: AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$v1, type0:$v2, untyped_imm_0:$imm);
+ let hasSideEffects = 0;
}
// Represents a vector G_ASHR with an immediate.
def G_VASHR : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src1, untyped_imm_0:$imm);
+ let hasSideEffects = 0;
}
// Represents a vector G_LSHR with an immediate.
def G_VLSHR : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src1, untyped_imm_0:$imm);
+ let hasSideEffects = 0;
}
// Represents an integer to FP conversion on the FPR bank.
def G_SITOF : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
}
def G_UITOF : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
+def G_FCMEQ : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type1:$src2);
+ let hasSideEffects = 0;
+}
+
+def G_FCMGE : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type1:$src2);
+ let hasSideEffects = 0;
+}
+
+def G_FCMGT : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type1:$src2);
+ let hasSideEffects = 0;
+}
+
+def G_FCMEQZ : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
+def G_FCMGEZ : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
+def G_FCMGTZ : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
+def G_FCMLEZ : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
+def G_FCMLTZ : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
}
def : GINodeEquiv<G_REV16, AArch64rev16>;
@@ -176,9 +229,59 @@
def : GINodeEquiv<G_SITOF, AArch64sitof>;
def : GINodeEquiv<G_UITOF, AArch64uitof>;
+def : GINodeEquiv<G_FCMEQ, AArch64fcmeq>;
+def : GINodeEquiv<G_FCMGE, AArch64fcmge>;
+def : GINodeEquiv<G_FCMGT, AArch64fcmgt>;
+
+def : GINodeEquiv<G_FCMEQZ, AArch64fcmeqz>;
+def : GINodeEquiv<G_FCMGEZ, AArch64fcmgez>;
+def : GINodeEquiv<G_FCMGTZ, AArch64fcmgtz>;
+def : GINodeEquiv<G_FCMLEZ, AArch64fcmlez>;
+def : GINodeEquiv<G_FCMLTZ, AArch64fcmltz>;
+
def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
// These are patterns that we only use for GlobalISel via the importer.
def : Pat<(f32 (fadd (vector_extract (v2f32 FPR64:$Rn), (i64 0)),
(vector_extract (v2f32 FPR64:$Rn), (i64 1)))),
(f32 (FADDPv2i32p (v2f32 FPR64:$Rn)))>;
+
+let Predicates = [HasNEON] in {
+ def : Pat<(v2f64 (sint_to_fp v2i32:$src)),
+ (SCVTFv2f64 (SSHLLv2i32_shift V64:$src, 0))>;
+ def : Pat<(v2f64 (uint_to_fp v2i32:$src)),
+ (UCVTFv2f64 (USHLLv2i32_shift V64:$src, 0))>;
+ def : Pat<(v2f32 (sint_to_fp v2i64:$src)),
+ (FCVTNv2i32 (SCVTFv2f64 V128:$src))>;
+ def : Pat<(v2f32 (uint_to_fp v2i64:$src)),
+ (FCVTNv2i32 (UCVTFv2f64 V128:$src))>;
+
+ def : Pat<(v2i64 (fp_to_sint v2f32:$src)),
+ (FCVTZSv2f64 (FCVTLv2i32 V64:$src))>;
+ def : Pat<(v2i64 (fp_to_uint v2f32:$src)),
+ (FCVTZUv2f64 (FCVTLv2i32 V64:$src))>;
+ def : Pat<(v2i32 (fp_to_sint v2f64:$src)),
+ (XTNv2i32 (FCVTZSv2f64 V128:$src))>;
+ def : Pat<(v2i32 (fp_to_uint v2f64:$src)),
+ (XTNv2i32 (FCVTZUv2f64 V128:$src))>;
+
+}
+
+let Predicates = [HasNoLSE] in {
+def : Pat<(atomic_cmp_swap_8 GPR64:$addr, GPR32:$desired, GPR32:$new),
+ (CMP_SWAP_8 GPR64:$addr, GPR32:$desired, GPR32:$new)>;
+
+def : Pat<(atomic_cmp_swap_16 GPR64:$addr, GPR32:$desired, GPR32:$new),
+ (CMP_SWAP_16 GPR64:$addr, GPR32:$desired, GPR32:$new)>;
+
+def : Pat<(atomic_cmp_swap_32 GPR64:$addr, GPR32:$desired, GPR32:$new),
+ (CMP_SWAP_32 GPR64:$addr, GPR32:$desired, GPR32:$new)>;
+
+def : Pat<(atomic_cmp_swap_64 GPR64:$addr, GPR64:$desired, GPR64:$new),
+ (CMP_SWAP_64 GPR64:$addr, GPR64:$desired, GPR64:$new)>;
+}
+
+def : Pat<(int_aarch64_stlxp GPR64:$lo, GPR64:$hi, GPR64:$addr),
+ (STLXPX GPR64:$lo, GPR64:$hi, GPR64:$addr)>;
+def : Pat<(int_aarch64_stxp GPR64:$lo, GPR64:$hi, GPR64:$addr),
+ (STXPX GPR64:$lo, GPR64:$hi, GPR64:$addr)>;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 6b38e21..091a62a 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
@@ -134,6 +135,9 @@
case AArch64::SPACE:
NumBytes = MI.getOperand(1).getImm();
break;
+ case AArch64::StoreSwiftAsyncContext:
+ NumBytes = 20;
+ break;
case TargetOpcode::BUNDLE:
NumBytes = getInstBundleLength(MI);
break;
@@ -1116,6 +1120,16 @@
if (!MI.getOperand(1).isReg())
return false;
+ auto NormalizeCmpValue = [](int64_t Value) -> int {
+ // Comparison immediates may be 64-bit, but CmpValue is only an int.
+ // Normalize to 0/1/2 return value, where 2 indicates any value apart from
+ // 0 or 1.
+ // TODO: Switch CmpValue to int64_t in the API to avoid this.
+ if (Value == 0 || Value == 1)
+ return Value;
+ return 2;
+ };
+
switch (MI.getOpcode()) {
default:
break;
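// The normalization introduced above in miniature: only immediates 0 and 1 are
// interesting to the compare optimizations, and CmpValue is a plain int, so
// every other (possibly 64-bit) immediate is collapsed to the sentinel 2.
// Standalone sketch, not the LLVM code itself:
#include <cassert>
#include <cstdint>

static int normalizeCmpValue(int64_t Value) {
  return (Value == 0 || Value == 1) ? static_cast<int>(Value) : 2;
}

int main() {
  assert(normalizeCmpValue(0) == 0);
  assert(normalizeCmpValue(1) == 1);
  assert(normalizeCmpValue(INT64_MIN) == 2); // would be mangled by a cast to int
  return 0;
}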
@@ -1151,8 +1165,7 @@
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
- // FIXME: In order to convert CmpValue to 0 or 1
- CmpValue = MI.getOperand(2).getImm() != 0;
+ CmpValue = NormalizeCmpValue(MI.getOperand(2).getImm());
return true;
case AArch64::ANDSWri:
case AArch64::ANDSXri:
@@ -1161,14 +1174,9 @@
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
- // FIXME:The return val type of decodeLogicalImmediate is uint64_t,
- // while the type of CmpValue is int. When converting uint64_t to int,
- // the high 32 bits of uint64_t will be lost.
- // In fact it causes a bug in spec2006-483.xalancbmk
- // CmpValue is only used to compare with zero in OptimizeCompareInstr
- CmpValue = AArch64_AM::decodeLogicalImmediate(
+ CmpValue = NormalizeCmpValue(AArch64_AM::decodeLogicalImmediate(
MI.getOperand(2).getImm(),
- MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
+ MI.getOpcode() == AArch64::ANDSWri ? 32 : 64));
return true;
}
@@ -1365,6 +1373,18 @@
OpChanged = true;
break;
}
+ case AArch64::RDFFR_PPz: {
+ // rdffr p1.b, PredMask=p0/z <--- Definition of Pred
+ // ptest Mask=p0, Pred=p1.b <--- If equal masks, remove this and use
+ // `rdffrs p1.b, p0/z` above.
+ auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
+ if (Mask != PredMask)
+ return false;
+
+ NewOp = AArch64::RDFFRS_PPz;
+ OpChanged = true;
+ break;
+ }
default:
// Bail out if we don't recognize the input
return false;
@@ -1373,23 +1393,11 @@
const TargetRegisterInfo *TRI = &getRegisterInfo();
- // If the predicate is in a different block (possibly because its been
- // hoisted out), then assume the flags are set in between statements.
- if (Pred->getParent() != PTest->getParent())
+ // If another instruction between Pred and PTest accesses flags, don't remove
+ // the ptest or update the earlier instruction to modify them.
+ if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
return false;
- // If another instruction between the propagation and test sets the
- // flags, don't remove the ptest.
- MachineBasicBlock::iterator I = Pred, E = PTest;
- ++I; // Skip past the predicate op itself.
- for (; I != E; ++I) {
- const MachineInstr &Inst = *I;
-
- // TODO: If the ptest flags are unused, we could still remove it.
- if (Inst.modifiesRegister(AArch64::NZCV, TRI))
- return false;
- }
-
// If we pass all the checks, it's safe to remove the PTEST and use the flags
// as they are prior to PTEST. Sometimes this requires the tested PTEST
// operand to be replaced with an equivalent instruction that also sets the
@@ -1458,18 +1466,20 @@
if (CmpInstr.getOpcode() == AArch64::PTEST_PP)
return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
- // Continue only if we have a "ri" where immediate is zero.
- // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare
- // function.
- assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
- if (CmpValue != 0 || SrcReg2 != 0)
+ // Warning: CmpValue == 2 indicates *any* value apart from 0 or 1.
+ assert((CmpValue == 0 || CmpValue == 1 || CmpValue == 2) &&
+ "CmpValue must be 0, 1, or 2!");
+ if (SrcReg2 != 0)
return false;
// CmpInstr is a Compare instruction if destination register is not used.
if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
return false;
- return substituteCmpToZero(CmpInstr, SrcReg, MRI);
+ if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
+ return true;
+ return (CmpValue == 0 || CmpValue == 1) &&
+ removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
}
/// Get opcode of S version of Instr.
@@ -1523,13 +1533,44 @@
}
/// Check if AArch64::NZCV should be alive in successors of MBB.
-static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
+static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
for (auto *BB : MBB->successors())
if (BB->isLiveIn(AArch64::NZCV))
return true;
return false;
}
+/// \returns The condition code operand index for \p Instr if it is a branch
+/// or select and -1 otherwise.
+static int
+findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
+ switch (Instr.getOpcode()) {
+ default:
+ return -1;
+
+ case AArch64::Bcc: {
+ int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
+ assert(Idx >= 2);
+ return Idx - 2;
+ }
+
+ case AArch64::CSINVWr:
+ case AArch64::CSINVXr:
+ case AArch64::CSINCWr:
+ case AArch64::CSINCXr:
+ case AArch64::CSELWr:
+ case AArch64::CSELXr:
+ case AArch64::CSNEGWr:
+ case AArch64::CSNEGXr:
+ case AArch64::FCSELSrrr:
+ case AArch64::FCSELDrrr: {
+ int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
+ assert(Idx >= 1);
+ return Idx - 1;
+ }
+ }
+}
+
namespace {
struct UsedNZCV {
@@ -1555,31 +1596,10 @@
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
- switch (Instr.getOpcode()) {
- default:
- return AArch64CC::Invalid;
-
- case AArch64::Bcc: {
- int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
- assert(Idx >= 2);
- return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
- }
-
- case AArch64::CSINVWr:
- case AArch64::CSINVXr:
- case AArch64::CSINCWr:
- case AArch64::CSINCXr:
- case AArch64::CSELWr:
- case AArch64::CSELXr:
- case AArch64::CSNEGWr:
- case AArch64::CSNEGXr:
- case AArch64::FCSELSrrr:
- case AArch64::FCSELDrrr: {
- int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
- assert(Idx >= 1);
- return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
- }
- }
+ int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
+ return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
+ Instr.getOperand(CCIdx).getImm())
+ : AArch64CC::Invalid;
}
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
@@ -1626,6 +1646,41 @@
return UsedFlags;
}
+/// \returns The condition flags used after \p CmpInstr in its MachineBB if
+/// they do not contain the C or V flags and NZCV flags are not alive in
+/// successors of the common parent of \p CmpInstr and \p MI. \returns None
+/// otherwise.
+///
+/// Collects the instructions that use those flags in \p CCUseInstrs if
+/// provided.
+static Optional<UsedNZCV>
+examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
+ const TargetRegisterInfo &TRI,
+ SmallVectorImpl<MachineInstr *> *CCUseInstrs = nullptr) {
+ MachineBasicBlock *CmpParent = CmpInstr.getParent();
+ if (MI.getParent() != CmpParent)
+ return None;
+
+ if (areCFlagsAliveInSuccessors(CmpParent))
+ return None;
+
+ UsedNZCV NZCVUsedAfterCmp;
+ for (MachineInstr &Instr : instructionsWithoutDebug(
+ std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
+ if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
+ AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
+ if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
+ return None;
+ NZCVUsedAfterCmp |= getUsedNZCV(CC);
+ if (CCUseInstrs)
+ CCUseInstrs->push_back(&Instr);
+ }
+ if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
+ break;
+ }
+ if (NZCVUsedAfterCmp.C || NZCVUsedAfterCmp.V)
+ return None;
+ return NZCVUsedAfterCmp;
+}
+
static bool isADDSRegImm(unsigned Opcode) {
return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
}
@@ -1645,44 +1700,21 @@
/// or if MI opcode is not the S form there must be neither defs of flags
/// nor uses of flags between MI and CmpInstr.
/// - and C/V flags are not used after CmpInstr
-static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
- const TargetRegisterInfo *TRI) {
- assert(MI);
- assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
- assert(CmpInstr);
+static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
+ const TargetRegisterInfo &TRI) {
+ assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
- const unsigned CmpOpcode = CmpInstr->getOpcode();
+ const unsigned CmpOpcode = CmpInstr.getOpcode();
if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
return false;
- if (MI->getParent() != CmpInstr->getParent())
- return false;
-
- if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
+ if (!examineCFlagsUse(MI, CmpInstr, TRI))
return false;
AccessKind AccessToCheck = AK_Write;
- if (sForm(*MI) != MI->getOpcode())
+ if (sForm(MI) != MI.getOpcode())
AccessToCheck = AK_All;
- if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
- return false;
-
- UsedNZCV NZCVUsedAfterCmp;
- for (const MachineInstr &Instr :
- instructionsWithoutDebug(std::next(CmpInstr->getIterator()),
- CmpInstr->getParent()->instr_end())) {
- if (Instr.readsRegister(AArch64::NZCV, TRI)) {
- AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
- if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
- return false;
- NZCVUsedAfterCmp |= getUsedNZCV(CC);
- }
-
- if (Instr.modifiesRegister(AArch64::NZCV, TRI))
- break;
- }
-
- return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
+ return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
}
/// Substitute an instruction comparing to zero with another instruction
@@ -1691,20 +1723,19 @@
/// Return true on success.
bool AArch64InstrInfo::substituteCmpToZero(
MachineInstr &CmpInstr, unsigned SrcReg,
- const MachineRegisterInfo *MRI) const {
- assert(MRI);
+ const MachineRegisterInfo &MRI) const {
// Get the unique definition of SrcReg.
- MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
if (!MI)
return false;
- const TargetRegisterInfo *TRI = &getRegisterInfo();
+ const TargetRegisterInfo &TRI = getRegisterInfo();
unsigned NewOpc = sForm(*MI);
if (NewOpc == AArch64::INSTRUCTION_LIST_END)
return false;
- if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
+ if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
return false;
// Update the instruction to set NZCV.
@@ -1713,7 +1744,133 @@
bool succeeded = UpdateOperandRegClass(*MI);
(void)succeeded;
assert(succeeded && "Some operands reg class are incompatible!");
- MI->addRegisterDefined(AArch64::NZCV, TRI);
+ MI->addRegisterDefined(AArch64::NZCV, &TRI);
+ return true;
+}
+
+/// \returns True if \p CmpInstr can be removed.
+///
+/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
+/// codes used in \p CCUseInstrs must be inverted.
+static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
+ int CmpValue, const TargetRegisterInfo &TRI,
+ SmallVectorImpl<MachineInstr *> &CCUseInstrs,
+ bool &IsInvertCC) {
+ assert((CmpValue == 0 || CmpValue == 1) &&
+ "Only comparisons to 0 or 1 considered for removal!");
+
+ // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
+ unsigned MIOpc = MI.getOpcode();
+ if (MIOpc == AArch64::CSINCWr) {
+ if (MI.getOperand(1).getReg() != AArch64::WZR ||
+ MI.getOperand(2).getReg() != AArch64::WZR)
+ return false;
+ } else if (MIOpc == AArch64::CSINCXr) {
+ if (MI.getOperand(1).getReg() != AArch64::XZR ||
+ MI.getOperand(2).getReg() != AArch64::XZR)
+ return false;
+ } else {
+ return false;
+ }
+ AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
+ if (MICC == AArch64CC::Invalid)
+ return false;
+
+ // NZCV needs to be defined
+ if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
+ return false;
+
+ // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
+ const unsigned CmpOpcode = CmpInstr.getOpcode();
+ bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
+ if (CmpValue && !IsSubsRegImm)
+ return false;
+ if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
+ return false;
+
+ // MI conditions allowed: eq, ne, mi, pl
+ UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
+ if (MIUsedNZCV.C || MIUsedNZCV.V)
+ return false;
+
+ Optional<UsedNZCV> NZCVUsedAfterCmp =
+ examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
+ // Condition flags are not used in CmpInstr's basic-block successors, and only
+ // the Z or N flags are allowed to be used after CmpInstr within its basic
+ // block.
+ if (!NZCVUsedAfterCmp)
+ return false;
+ // The Z or N flag used after CmpInstr must correspond to the flag used in MI.
+ if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
+ (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
+ return false;
+ // If CmpInstr is a comparison to zero, MI conditions are limited to eq and ne.
+ if (MIUsedNZCV.N && !CmpValue)
+ return false;
+
+ // There must be no defs of flags between MI and CmpInstr
+ if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
+ return false;
+
+ // Condition code is inverted in the following cases:
+ // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
+ // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
+ IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
+ (!CmpValue && MICC == AArch64CC::NE);
+ return true;
+}
+
+/// Remove the comparison in a csinc-cmp sequence.
+///
+/// Examples:
+/// 1. \code
+/// csinc w9, wzr, wzr, ne
+/// cmp w9, #0
+/// b.eq
+/// \endcode
+/// to
+/// \code
+/// csinc w9, wzr, wzr, ne
+/// b.ne
+/// \endcode
+///
+/// 2. \code
+/// csinc x2, xzr, xzr, mi
+/// cmp x2, #1
+/// b.pl
+/// \endcode
+/// to
+/// \code
+/// csinc x2, xzr, xzr, mi
+/// b.pl
+/// \endcode
+///
+/// \param CmpInstr comparison instruction
+/// \return True when comparison removed
+bool AArch64InstrInfo::removeCmpToZeroOrOne(
+ MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
+ const MachineRegisterInfo &MRI) const {
+ MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
+ if (!MI)
+ return false;
+ const TargetRegisterInfo &TRI = getRegisterInfo();
+ SmallVector<MachineInstr *, 4> CCUseInstrs;
+ bool IsInvertCC = false;
+ if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
+ IsInvertCC))
+ return false;
+ // Make transformation
+ CmpInstr.eraseFromParent();
+ if (IsInvertCC) {
+ // Invert condition codes in CmpInstr CC users
+ for (MachineInstr *CCUseInstr : CCUseInstrs) {
+ int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
+ assert(Idx >= 0 && "Unexpected instruction using CC.");
+ MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
+ AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
+ static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
+ CCOperand.setImm(CCUse);
+ }
+ }
return true;
}
@@ -1751,6 +1908,68 @@
}
Register Reg = MI.getOperand(0).getReg();
+ Module &M = *MBB.getParent()->getFunction().getParent();
+ if (M.getStackProtectorGuard() == "sysreg") {
+ const AArch64SysReg::SysReg *SrcReg =
+ AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
+ if (!SrcReg)
+ report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
+
+ // mrs xN, sysreg
+ BuildMI(MBB, MI, DL, get(AArch64::MRS))
+ .addDef(Reg, RegState::Renamable)
+ .addImm(SrcReg->Encoding);
+ int Offset = M.getStackProtectorGuardOffset();
+ if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
+ // ldr xN, [xN, #offset]
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
+ .addDef(Reg)
+ .addUse(Reg, RegState::Kill)
+ .addImm(Offset / 8);
+ } else if (Offset >= -256 && Offset <= 255) {
+ // ldur xN, [xN, #offset]
+ BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
+ .addDef(Reg)
+ .addUse(Reg, RegState::Kill)
+ .addImm(Offset);
+ } else if (Offset >= -4095 && Offset <= 4095) {
+ if (Offset > 0) {
+ // add xN, xN, #offset
+ BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
+ .addDef(Reg)
+ .addUse(Reg, RegState::Kill)
+ .addImm(Offset)
+ .addImm(0);
+ } else {
+ // sub xN, xN, #offset
+ BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
+ .addDef(Reg)
+ .addUse(Reg, RegState::Kill)
+ .addImm(-Offset)
+ .addImm(0);
+ }
+ // ldr xN, [xN]
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
+ .addDef(Reg)
+ .addUse(Reg, RegState::Kill)
+ .addImm(0);
+ } else {
+ // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
+ // than 32760 (the maximum scaled LDR offset).
+ // It might be nice to use AArch64::MOVi32imm here, which would get
+ // expanded in PreSched2 after PostRA, but our lone scratch Reg already
+ // contains the MRS result. findScratchNonCalleeSaveRegister() in
+ // AArch64FrameLowering might help us find such a scratch register
+ // though. If we failed to find a scratch register, we could emit a
+ // stream of add instructions to build up the immediate. Or, we could try
+ // to insert an AArch64::MOVi32imm before register allocation so that we
+ // would not need to scavenge for a scratch register.
+ report_fatal_error("Unable to encode Stack Protector Guard Offset");
+ }
+ MBB.erase(MI);
+ return true;
+ }
+
const GlobalValue *GV =
cast<GlobalValue>((*MI.memoperands_begin())->getValue());
const TargetMachine &TM = MBB.getParent()->getTarget();
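// A hedged sketch of how the offset ranges in the sysreg stack-guard expansion
// above select an addressing form; the enum and function names are made up and
// only the numeric ranges mirror the code (no MachineInstrs are built here).
#include <cstdio>

enum class GuardLoadForm { LdrScaled, Ldur, AddOrSubThenLdr, Unencodable };

static GuardLoadForm classifyGuardOffset(int Offset) {
  if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0)
    return GuardLoadForm::LdrScaled;       // ldr xN, [xN, #Offset], imm = Offset/8
  if (Offset >= -256 && Offset <= 255)
    return GuardLoadForm::Ldur;            // ldur xN, [xN, #Offset]
  if (Offset >= -4095 && Offset <= 4095)
    return GuardLoadForm::AddOrSubThenLdr; // add/sub xN, xN, #|Offset|; ldr xN, [xN]
  return GuardLoadForm::Unencodable;       // report_fatal_error in the real code
}

int main() {
  // 40 is 8-aligned and in range: scaled LDR. 257 is neither 8-aligned nor in
  // [-256, 255], so it takes the add-then-load form.
  std::printf("%d %d\n", static_cast<int>(classifyGuardOffset(40)),
              static_cast<int>(classifyGuardOffset(257)));
  return 0;
}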
@@ -1968,22 +2187,32 @@
});
}
-bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
+bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
switch (Opc) {
default:
return false;
case AArch64::STURSi:
+ case AArch64::STRSpre:
case AArch64::STURDi:
+ case AArch64::STRDpre:
case AArch64::STURQi:
+ case AArch64::STRQpre:
case AArch64::STURBBi:
case AArch64::STURHHi:
case AArch64::STURWi:
+ case AArch64::STRWpre:
case AArch64::STURXi:
+ case AArch64::STRXpre:
case AArch64::LDURSi:
+ case AArch64::LDRSpre:
case AArch64::LDURDi:
+ case AArch64::LDRDpre:
case AArch64::LDURQi:
+ case AArch64::LDRQpre:
case AArch64::LDURWi:
+ case AArch64::LDRWpre:
case AArch64::LDURXi:
+ case AArch64::LDRXpre:
case AArch64::LDURSWi:
case AArch64::LDURHHi:
case AArch64::LDURBBi:
@@ -2075,6 +2304,22 @@
case AArch64::LD1B_D_IMM:
case AArch64::LD1SB_D_IMM:
case AArch64::ST1B_D_IMM:
+ case AArch64::LD1RB_IMM:
+ case AArch64::LD1RB_H_IMM:
+ case AArch64::LD1RB_S_IMM:
+ case AArch64::LD1RB_D_IMM:
+ case AArch64::LD1RSB_H_IMM:
+ case AArch64::LD1RSB_S_IMM:
+ case AArch64::LD1RSB_D_IMM:
+ case AArch64::LD1RH_IMM:
+ case AArch64::LD1RH_S_IMM:
+ case AArch64::LD1RH_D_IMM:
+ case AArch64::LD1RSH_S_IMM:
+ case AArch64::LD1RSH_D_IMM:
+ case AArch64::LD1RW_IMM:
+ case AArch64::LD1RW_D_IMM:
+ case AArch64::LD1RSW_IMM:
+ case AArch64::LD1RD_IMM:
return 3;
case AArch64::ADDG:
case AArch64::STGOffset:
@@ -2102,15 +2347,25 @@
case AArch64::LDRSWui:
// Unscaled instructions.
case AArch64::STURSi:
+ case AArch64::STRSpre:
case AArch64::STURDi:
+ case AArch64::STRDpre:
case AArch64::STURQi:
+ case AArch64::STRQpre:
case AArch64::STURWi:
+ case AArch64::STRWpre:
case AArch64::STURXi:
+ case AArch64::STRXpre:
case AArch64::LDURSi:
+ case AArch64::LDRSpre:
case AArch64::LDURDi:
+ case AArch64::LDRDpre:
case AArch64::LDURQi:
+ case AArch64::LDRQpre:
case AArch64::LDURWi:
+ case AArch64::LDRWpre:
case AArch64::LDURXi:
+ case AArch64::LDRXpre:
case AArch64::LDURSWi:
return true;
}
@@ -2207,20 +2462,36 @@
// Is this a candidate for ld/st merging or pairing? For example, we don't
// touch volatiles or load/stores that have a hint to avoid pair formation.
bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
+
+ bool IsPreLdSt = isPreLdSt(MI);
+
// If this is a volatile load/store, don't mess with it.
if (MI.hasOrderedMemoryRef())
return false;
// Make sure this is a reg/fi+imm (as opposed to an address reloc).
- assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
+ // For Pre-inc LD/ST, the operand is shifted by one.
+ assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
+ MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
"Expected a reg or frame index operand.");
- if (!MI.getOperand(2).isImm())
+
+ // For Pre-indexed addressing quadword instructions, the third operand is the
+ // immediate value.
+ bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
+
+ if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
return false;
// Can't merge/pair if the instruction modifies the base register.
// e.g., ldr x0, [x0]
// This case will never occur with an FI base.
- if (MI.getOperand(1).isReg()) {
+ // However, if the instruction is an LDR/STR<S,D,Q,W,X>pre, it can be merged.
+ // For example:
+ // ldr q0, [x11, #32]!
+ // ldr q1, [x11, #16]
+ // to
+ // ldp q0, q1, [x11, #32]!
+ if (MI.getOperand(1).isReg() && !IsPreLdSt) {
Register BaseReg = MI.getOperand(1).getReg();
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (MI.modifiesRegister(BaseReg, TRI))
@@ -2454,6 +2725,13 @@
MinOffset = 0;
MaxOffset = 4095;
break;
+ case AArch64::StoreSwiftAsyncContext:
+ // Store is an STRXui, but there might be an ADDXri in the expansion too.
+ Scale = TypeSize::Fixed(1);
+ Width = 8;
+ MinOffset = 0;
+ MaxOffset = 4095;
+ break;
case AArch64::LDPWi:
case AArch64::LDPSi:
case AArch64::LDNPWi:
@@ -2499,6 +2777,38 @@
MinOffset = 0;
MaxOffset = 4095;
break;
+ case AArch64::STPXpre:
+ case AArch64::LDPXpost:
+ case AArch64::STPDpre:
+ case AArch64::LDPDpost:
+ Scale = TypeSize::Fixed(8);
+ Width = 8;
+ MinOffset = -512;
+ MaxOffset = 504;
+ break;
+ case AArch64::STPQpre:
+ case AArch64::LDPQpost:
+ Scale = TypeSize::Fixed(16);
+ Width = 16;
+ MinOffset = -1024;
+ MaxOffset = 1008;
+ break;
+ case AArch64::STRXpre:
+ case AArch64::STRDpre:
+ case AArch64::LDRXpost:
+ case AArch64::LDRDpost:
+ Scale = TypeSize::Fixed(1);
+ Width = 8;
+ MinOffset = -256;
+ MaxOffset = 255;
+ break;
+ case AArch64::STRQpre:
+ case AArch64::LDRQpost:
+ Scale = TypeSize::Fixed(1);
+ Width = 16;
+ MinOffset = -256;
+ MaxOffset = 255;
+ break;
case AArch64::ADDG:
Scale = TypeSize::Fixed(16);
Width = 0;
@@ -2623,6 +2933,42 @@
MinOffset = -64;
MaxOffset = 63;
break;
+ case AArch64::LD1RB_IMM:
+ case AArch64::LD1RB_H_IMM:
+ case AArch64::LD1RB_S_IMM:
+ case AArch64::LD1RB_D_IMM:
+ case AArch64::LD1RSB_H_IMM:
+ case AArch64::LD1RSB_S_IMM:
+ case AArch64::LD1RSB_D_IMM:
+ Scale = TypeSize::Fixed(1);
+ Width = 1;
+ MinOffset = 0;
+ MaxOffset = 63;
+ break;
+ case AArch64::LD1RH_IMM:
+ case AArch64::LD1RH_S_IMM:
+ case AArch64::LD1RH_D_IMM:
+ case AArch64::LD1RSH_S_IMM:
+ case AArch64::LD1RSH_D_IMM:
+ Scale = TypeSize::Fixed(2);
+ Width = 2;
+ MinOffset = 0;
+ MaxOffset = 63;
+ break;
+ case AArch64::LD1RW_IMM:
+ case AArch64::LD1RW_D_IMM:
+ case AArch64::LD1RSW_IMM:
+ Scale = TypeSize::Fixed(4);
+ Width = 4;
+ MinOffset = 0;
+ MaxOffset = 63;
+ break;
+ case AArch64::LD1RD_IMM:
+ Scale = TypeSize::Fixed(8);
+ Width = 8;
+ MinOffset = 0;
+ MaxOffset = 63;
+ break;
}
return true;
@@ -2649,14 +2995,18 @@
return 2;
case AArch64::LDRSui:
case AArch64::LDURSi:
+ case AArch64::LDRSpre:
case AArch64::LDRSWui:
case AArch64::LDURSWi:
+ case AArch64::LDRWpre:
case AArch64::LDRWui:
case AArch64::LDURWi:
case AArch64::STRSui:
case AArch64::STURSi:
+ case AArch64::STRSpre:
case AArch64::STRWui:
case AArch64::STURWi:
+ case AArch64::STRWpre:
case AArch64::LDPSi:
case AArch64::LDPSWi:
case AArch64::LDPWi:
@@ -2665,12 +3015,16 @@
return 4;
case AArch64::LDRDui:
case AArch64::LDURDi:
+ case AArch64::LDRDpre:
case AArch64::LDRXui:
case AArch64::LDURXi:
+ case AArch64::LDRXpre:
case AArch64::STRDui:
case AArch64::STURDi:
+ case AArch64::STRDpre:
case AArch64::STRXui:
case AArch64::STURXi:
+ case AArch64::STRXpre:
case AArch64::LDPDi:
case AArch64::LDPXi:
case AArch64::STPDi:
@@ -2680,7 +3034,9 @@
case AArch64::LDURQi:
case AArch64::STRQui:
case AArch64::STURQi:
+ case AArch64::STRQpre:
case AArch64::LDPQi:
+ case AArch64::LDRQpre:
case AArch64::STPQi:
case AArch64::STGOffset:
case AArch64::STZGOffset:
@@ -2691,6 +3047,36 @@
}
}
+bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case AArch64::LDRWpre:
+ case AArch64::LDRXpre:
+ case AArch64::LDRSpre:
+ case AArch64::LDRDpre:
+ case AArch64::LDRQpre:
+ return true;
+ }
+}
+
+bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case AArch64::STRWpre:
+ case AArch64::STRXpre:
+ case AArch64::STRSpre:
+ case AArch64::STRDpre:
+ case AArch64::STRQpre:
+ return true;
+ }
+}
+
+bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
+ return isPreLd(MI) || isPreSt(MI);
+}
+
// Scale the unscaled offsets. Returns false if the unscaled offset can't be
// scaled.
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
@@ -2794,11 +3180,11 @@
// isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
- if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
+ if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
return false;
int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
- if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
+ if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
return false;
// Pairwise instructions have a 7-bit signed offset field.
@@ -3239,6 +3625,11 @@
return;
}
+#ifndef NDEBUG
+ const TargetRegisterInfo &TRI = getRegisterInfo();
+ errs() << TRI.getRegAsmName(DestReg) << " = COPY "
+ << TRI.getRegAsmName(SrcReg) << "\n";
+#endif
llvm_unreachable("unimplemented reg-to-reg copy");
}
@@ -4054,9 +4445,8 @@
return false;
}
-void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
- NopInst.setOpcode(AArch64::HINT);
- NopInst.addOperand(MCOperand::createImm(0));
+MCInst AArch64InstrInfo::getNop() const {
+ return MCInstBuilder(AArch64::HINT).addImm(0);
}
// AArch64 supports MachineCombiner.
@@ -6803,15 +7193,22 @@
// PACIASP EMITBKEY
// CFI_INSTRUCTION PACIBSP
// CFI_INSTRUCTION
+ unsigned PACI;
if (ShouldSignReturnAddrWithAKey) {
- BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP))
- .setMIFlag(MachineInstr::FrameSetup);
+ PACI = Subtarget.hasPAuth() ? AArch64::PACIA : AArch64::PACIASP;
} else {
BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
.setMIFlag(MachineInstr::FrameSetup);
- BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP))
- .setMIFlag(MachineInstr::FrameSetup);
+ PACI = Subtarget.hasPAuth() ? AArch64::PACIB : AArch64::PACIBSP;
}
+
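+  // Note: with FEAT_PAuth available this emits the explicit two-operand form,
+  // e.g. `pacia x30, sp` (or `pacib x30, sp`), which computes the same return
+  // address signature as PACIASP/PACIBSP but is encoded outside the HINT space.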
+ auto MI = BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(PACI));
+ if (Subtarget.hasPAuth())
+ MI.addReg(AArch64::LR, RegState::Define)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::SP, RegState::InternalRead);
+ MI.setMIFlag(MachineInstr::FrameSetup);
+
unsigned CFIIndex =
MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
@@ -7171,6 +7568,26 @@
return TargetInstrInfo::describeLoadedValue(MI, Reg);
}
+bool AArch64InstrInfo::isExtendLikelyToBeFolded(
+ MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
+ assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
+ ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
+ ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
+
+ // Anyexts are nops.
+ if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
+ return true;
+
+ Register DefReg = ExtMI.getOperand(0).getReg();
+ if (!MRI.hasOneNonDBGUse(DefReg))
+ return false;
+
+ // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
+ // addressing mode.
+ auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
+ return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
+}
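+// The shape this is looking for, roughly (generic MIR sketch):
+//   %off:_(s64) = G_SEXT %idx:_(s32)
+//   %addr:_(p0) = G_PTR_ADD %base, %off(s64)
+// where the extend can usually be folded into an extended-register addressing
+// mode such as [x0, w1, sxtw].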
+
uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
return get(Opc).TSFlags & AArch64::ElementSizeMask;
}
@@ -7183,6 +7600,11 @@
return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
}
+unsigned int
+AArch64InstrInfo::getTailDuplicateSize(CodeGenOpt::Level OptLevel) const {
+ return OptLevel >= CodeGenOpt::Aggressive ? 6 : 2;
+}
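+// That is, tail duplication may copy blocks of up to 6 instructions at
+// CodeGenOpt::Aggressive and up to 2 instructions at lower optimization levels.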
+
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
return AArch64::BLRNoIP;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 7434987..e25189e 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -79,10 +79,10 @@
/// Return true if the given load or store is a strided memory access.
static bool isStridedAccess(const MachineInstr &MI);
- /// Return true if this is an unscaled load/store.
- static bool isUnscaledLdSt(unsigned Opc);
- static bool isUnscaledLdSt(MachineInstr &MI) {
- return isUnscaledLdSt(MI.getOpcode());
+ /// Return true if it has an unscaled load/store offset.
+ static bool hasUnscaledLdStOffset(unsigned Opc);
+ static bool hasUnscaledLdStOffset(MachineInstr &MI) {
+ return hasUnscaledLdStOffset(MI.getOpcode());
}
/// Returns the unscaled load/store for the scaled load/store opcode,
@@ -95,6 +95,14 @@
return getMemScale(MI.getOpcode());
}
+ /// Returns whether the instruction is a pre-indexed load.
+ static bool isPreLd(const MachineInstr &MI);
+
+ /// Returns whether the instruction is a pre-indexed store.
+ static bool isPreSt(const MachineInstr &MI);
+
+ /// Returns whether the instruction is a pre-indexed load/store.
+ static bool isPreLdSt(const MachineInstr &MI);
/// Returns the index for the immediate for a given instruction.
static unsigned getLoadStoreImmIdx(unsigned Opc);
@@ -209,7 +217,7 @@
const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Cond, Register TrueReg,
Register FalseReg) const override;
- void getNoop(MCInst &NopInst) const override;
+ MCInst getNop() const override;
bool isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
@@ -299,6 +307,11 @@
Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI,
Register Reg) const override;
+ unsigned int getTailDuplicateSize(CodeGenOpt::Level OptLevel) const override;
+
+ bool isExtendLikelyToBeFolded(MachineInstr &ExtMI,
+ MachineRegisterInfo &MRI) const override;
+
static void decomposeStackOffsetForFrameOffsets(const StackOffset &Offset,
int64_t &NumBytes,
int64_t &NumPredicateVectors,
@@ -329,7 +342,9 @@
MachineBasicBlock *TBB,
ArrayRef<MachineOperand> Cond) const;
bool substituteCmpToZero(MachineInstr &CmpInstr, unsigned SrcReg,
- const MachineRegisterInfo *MRI) const;
+ const MachineRegisterInfo &MRI) const;
+ bool removeCmpToZeroOrOne(MachineInstr &CmpInstr, unsigned SrcReg,
+ int CmpValue, const MachineRegisterInfo &MRI) const;
/// Returns an unused general-purpose register which can be used for
/// constructing an outlined call if one exists. Returns 0 otherwise.
@@ -440,7 +455,7 @@
// struct TSFlags {
#define TSFLAG_ELEMENT_SIZE_TYPE(X) (X) // 3-bits
-#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 4-bit
+#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 4-bits
#define TSFLAG_FALSE_LANE_TYPE(X) ((X) << 7) // 2-bits
#define TSFLAG_INSTR_FLAGS(X) ((X) << 9) // 2-bits
// }
@@ -467,6 +482,7 @@
DestructiveBinaryComm = TSFLAG_DESTRUCTIVE_INST_TYPE(0x6),
DestructiveBinaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x7),
DestructiveTernaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x8),
+ DestructiveUnaryPassthru = TSFLAG_DESTRUCTIVE_INST_TYPE(0x9),
};
enum FalseLaneType {
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 171d3db..12744e4 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -95,6 +95,7 @@
AssemblerPredicate<(all_of FeatureCRC), "crc">;
def HasLSE : Predicate<"Subtarget->hasLSE()">,
AssemblerPredicate<(all_of FeatureLSE), "lse">;
+def HasNoLSE : Predicate<"!Subtarget->hasLSE()">;
def HasRAS : Predicate<"Subtarget->hasRAS()">,
AssemblerPredicate<(all_of FeatureRAS), "ras">;
def HasRDM : Predicate<"Subtarget->hasRDM()">,
@@ -121,6 +122,12 @@
AssemblerPredicate<(all_of FeatureSVE2SHA3), "sve2-sha3">;
def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">,
AssemblerPredicate<(all_of FeatureSVE2BitPerm), "sve2-bitperm">;
+def HasSME : Predicate<"Subtarget->hasSME()">,
+ AssemblerPredicate<(all_of FeatureSME), "sme">;
+def HasSMEF64 : Predicate<"Subtarget->hasSMEF64()">,
+ AssemblerPredicate<(all_of FeatureSMEF64), "sme-f64">;
+def HasSMEI64 : Predicate<"Subtarget->hasSMEI64()">,
+ AssemblerPredicate<(all_of FeatureSMEI64), "sme-i64">;
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
AssemblerPredicate<(all_of FeatureRCPC), "rcpc">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -247,6 +254,8 @@
def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisInt<3>]>;
def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+def SDT_AArch64Dot: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>, SDTCisSameAs<2,3>]>;
def SDT_AArch64vshiftinsert : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<3>,
SDTCisSameAs<0,1>,
@@ -268,6 +277,8 @@
def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
SDTCisPtrTy<1>]>;
+def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
+
def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
@@ -529,8 +540,6 @@
def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;
-def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>;
-
def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
@@ -561,6 +570,9 @@
def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
def AArch64frsqrts : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>;
+def AArch64sdot : SDNode<"AArch64ISD::SDOT", SDT_AArch64Dot>;
+def AArch64udot : SDNode<"AArch64ISD::UDOT", SDT_AArch64Dot>;
+
def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
@@ -573,16 +585,18 @@
def AArch64shadd : SDNode<"AArch64ISD::SHADD", SDT_AArch64binvec>;
def AArch64uhadd : SDNode<"AArch64ISD::UHADD", SDT_AArch64binvec>;
-def AArch64uabd_n : SDNode<"AArch64ISD::UABD", SDT_AArch64binvec>;
-def AArch64sabd_n : SDNode<"AArch64ISD::SABD", SDT_AArch64binvec>;
-
def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs),
- [(AArch64uabd_n node:$lhs, node:$rhs),
+ [(abdu node:$lhs, node:$rhs),
(int_aarch64_neon_uabd node:$lhs, node:$rhs)]>;
def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs),
- [(AArch64sabd_n node:$lhs, node:$rhs),
+ [(abds node:$lhs, node:$rhs),
(int_aarch64_neon_sabd node:$lhs, node:$rhs)]>;
+def AArch64uaddlp_n : SDNode<"AArch64ISD::UADDLP", SDT_AArch64uaddlp>;
+def AArch64uaddlp : PatFrags<(ops node:$src),
+ [(AArch64uaddlp_n node:$src),
+ (int_aarch64_neon_uaddlp node:$src)]>;
+
def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -602,7 +616,9 @@
def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
-
+def AArch64mrs : SDNode<"AArch64ISD::MRS",
+ SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
+ [SDNPHasChain, SDNPOutGlue]>;
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -632,6 +648,7 @@
include "AArch64InstrFormats.td"
include "SVEInstrFormats.td"
+include "SMEInstrFormats.td"
//===----------------------------------------------------------------------===//
@@ -831,10 +848,10 @@
// ARMv8.2-A Dot Product
let Predicates = [HasDotProd] in {
-defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>;
-defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", int_aarch64_neon_udot>;
-defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", int_aarch64_neon_sdot>;
-defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>;
+defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", AArch64sdot>;
+defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", AArch64udot>;
+defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", AArch64sdot>;
+defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", AArch64udot>;
}
// ARMv8.6-A BFloat
@@ -924,6 +941,45 @@
def EOR3 : CryptoRRRR_16B<0b00, "eor3">;
def BCAX : CryptoRRRR_16B<0b01, "bcax">;
def XAR : CryptoRRRi6<"xar">;
+
+class SHA3_pattern<Instruction INST, Intrinsic OpNode, ValueType VecTy>
+ : Pat<(VecTy (OpNode (VecTy V128:$Vd), (VecTy V128:$Vn), (VecTy V128:$Vm))),
+ (INST (VecTy V128:$Vd), (VecTy V128:$Vn), (VecTy V128:$Vm))>;
+
+def : Pat<(v2i64 (int_aarch64_crypto_sha512su0 (v2i64 V128:$Vn), (v2i64 V128:$Vm))),
+ (SHA512SU0 (v2i64 V128:$Vn), (v2i64 V128:$Vm))>;
+
+def : SHA3_pattern<SHA512H, int_aarch64_crypto_sha512h, v2i64>;
+def : SHA3_pattern<SHA512H2, int_aarch64_crypto_sha512h2, v2i64>;
+def : SHA3_pattern<SHA512SU1, int_aarch64_crypto_sha512su1, v2i64>;
+
+def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v16i8>;
+def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v8i16>;
+def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v4i32>;
+def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v2i64>;
+
+def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v16i8>;
+def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v8i16>;
+def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v4i32>;
+def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v2i64>;
+
+def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3s, v16i8>;
+def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3s, v8i16>;
+def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3s, v4i32>;
+def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3s, v2i64>;
+
+def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxs, v16i8>;
+def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxs, v8i16>;
+def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxs, v4i32>;
+def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxs, v2i64>;
+
+def : Pat<(v2i64 (int_aarch64_crypto_rax1 (v2i64 V128:$Vn), (v2i64 V128:$Vm))),
+ (RAX1 (v2i64 V128:$Vn), (v2i64 V128:$Vm))>;
+
+def : Pat<(v2i64 (int_aarch64_crypto_xar (v2i64 V128:$Vn), (v2i64 V128:$Vm), (i64 timm0_63:$imm))),
+ (XAR (v2i64 V128:$Vn), (v2i64 V128:$Vm), (timm0_63:$imm))>;
+
+
} // HasSHA3
let Predicates = [HasSM4] in {
@@ -936,6 +992,32 @@
def SM3PARTW2 : CryptoRRRTied_4S<0b1, 0b01, "sm3partw2">;
def SM4ENCKEY : CryptoRRR_4S<0b1, 0b10, "sm4ekey">;
def SM4E : CryptoRRTied_4S<0b0, 0b01, "sm4e">;
+
+def : Pat<(v4i32 (int_aarch64_crypto_sm3ss1 (v4i32 V128:$Vn), (v4i32 V128:$Vm), (v4i32 V128:$Va))),
+ (SM3SS1 (v4i32 V128:$Vn), (v4i32 V128:$Vm), (v4i32 V128:$Va))>;
+
+class SM3PARTW_pattern<Instruction INST, Intrinsic OpNode>
+ : Pat<(v4i32 (OpNode (v4i32 V128:$Vd), (v4i32 V128:$Vn), (v4i32 V128:$Vm))),
+ (INST (v4i32 V128:$Vd), (v4i32 V128:$Vn), (v4i32 V128:$Vm))>;
+
+class SM3TT_pattern<Instruction INST, Intrinsic OpNode>
+ : Pat<(v4i32 (OpNode (v4i32 V128:$Vd), (v4i32 V128:$Vn), (v4i32 V128:$Vm), (i64 VectorIndexS_timm:$imm) )),
+ (INST (v4i32 V128:$Vd), (v4i32 V128:$Vn), (v4i32 V128:$Vm), (VectorIndexS_timm:$imm))>;
+
+class SM4_pattern<Instruction INST, Intrinsic OpNode>
+ : Pat<(v4i32 (OpNode (v4i32 V128:$Vn), (v4i32 V128:$Vm))),
+ (INST (v4i32 V128:$Vn), (v4i32 V128:$Vm))>;
+
+def : SM3PARTW_pattern<SM3PARTW1, int_aarch64_crypto_sm3partw1>;
+def : SM3PARTW_pattern<SM3PARTW2, int_aarch64_crypto_sm3partw2>;
+
+def : SM3TT_pattern<SM3TT1A, int_aarch64_crypto_sm3tt1a>;
+def : SM3TT_pattern<SM3TT1B, int_aarch64_crypto_sm3tt1b>;
+def : SM3TT_pattern<SM3TT2A, int_aarch64_crypto_sm3tt2a>;
+def : SM3TT_pattern<SM3TT2B, int_aarch64_crypto_sm3tt2b>;
+
+def : SM4_pattern<SM4ENCKEY, int_aarch64_crypto_sm4ekey>;
+def : SM4_pattern<SM4E, int_aarch64_crypto_sm4e>;
} // HasSM4
let Predicates = [HasRCPC] in {
@@ -979,7 +1061,7 @@
}
}
-multiclass FCMLA_PATS<ValueType ty, RegisterClass Reg> {
+multiclass FCMLA_PATS<ValueType ty, DAGOperand Reg> {
def : Pat<(ty (int_aarch64_neon_vcmla_rot0 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))),
(!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 0)>;
def : Pat<(ty (int_aarch64_neon_vcmla_rot90 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))),
@@ -990,7 +1072,7 @@
(!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 3)>;
}
-multiclass FCMLA_LANE_PATS<ValueType ty, RegisterClass Reg, dag RHSDup> {
+multiclass FCMLA_LANE_PATS<ValueType ty, DAGOperand Reg, dag RHSDup> {
def : Pat<(ty (int_aarch64_neon_vcmla_rot0 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)),
(!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 0)>;
def : Pat<(ty (int_aarch64_neon_vcmla_rot90 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)),
@@ -1196,6 +1278,9 @@
def MSRpstateImm1 : MSRpstateImm0_1;
def MSRpstateImm4 : MSRpstateImm0_15;
+def : Pat<(AArch64mrs imm:$id),
+ (MRS imm:$id)>;
+
// The thread pointer (on Linux, at least, where this has been implemented) is
// TPIDR_EL0.
def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
@@ -1221,6 +1306,7 @@
// FPCR register
def : Pat<(i64 (int_aarch64_get_fpcr)), (MRS 0xda20)>;
+def : Pat<(int_aarch64_set_fpcr i64:$val), (MSR 0xda20, GPR64:$val)>;
// Generic system instructions
def SYSxt : SystemXtI<0, "sys">;
@@ -2086,6 +2172,11 @@
def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV),
(CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+def : Pat<(add GPR32:$val, (AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV)),
+ (CSINCWr GPR32:$val, GPR32:$val, (i32 imm:$cc))>;
+def : Pat<(add GPR64:$val, (zext (AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV))),
+ (CSINCXr GPR64:$val, GPR64:$val, (i32 imm:$cc))>;
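+// With the two patterns above, adding a 0/1 value derived from a condition,
+// e.g. `x + zext(icmp ...)`, selects a single CSINC instead of CSET plus ADD.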
+
// The inverse of the condition code from the alias instruction is what is used
// in the aliased instruction. The parser already inverts the condition code
// for these aliases.
@@ -3237,7 +3328,7 @@
} // AddedComplexity = 10
// Match stores from lane 0 to the appropriate subreg's store.
-multiclass VecStoreLane0Pat<Operand UIAddrMode, SDPatternOperator storeop,
+multiclass VecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
ValueType VTy, ValueType STy,
SubRegIndex SubRegIdx, Operand IndexType,
Instruction STR> {
@@ -3503,6 +3594,9 @@
(STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
simm9:$off)>;
+def : Pat<(post_store (bf16 FPR16:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRHpost FPR16:$Rt, GPR64sp:$addr, simm9:$off)>;
+
def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
@@ -3517,6 +3611,8 @@
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4bf16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
@@ -3532,6 +3628,8 @@
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v8bf16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
//===----------------------------------------------------------------------===//
// Load/store exclusive instructions.
@@ -3610,6 +3708,25 @@
defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>;
defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
+// AArch64's FCVT instructions saturate when out of range.
+multiclass FPToIntegerSatPats<SDNode to_int_sat, string INST> {
+ def : Pat<(i32 (to_int_sat f16:$Rn, i32)),
+ (!cast<Instruction>(INST # UWHr) f16:$Rn)>;
+ def : Pat<(i32 (to_int_sat f32:$Rn, i32)),
+ (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int_sat f64:$Rn, i32)),
+ (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
+ def : Pat<(i64 (to_int_sat f16:$Rn, i64)),
+ (!cast<Instruction>(INST # UXHr) f16:$Rn)>;
+ def : Pat<(i64 (to_int_sat f32:$Rn, i64)),
+ (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+ def : Pat<(i64 (to_int_sat f64:$Rn, i64)),
+ (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+}
+
+defm : FPToIntegerSatPats<fp_to_sint_sat, "FCVTZS">;
+defm : FPToIntegerSatPats<fp_to_uint_sat, "FCVTZU">;
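+// With these patterns, e.g. `llvm.fptosi.sat.i32.f32` selects a single FCVTZS
+// rather than an explicit clamp-and-convert sequence, since FCVTZS/FCVTZU
+// already saturate out-of-range values.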
+
multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>;
def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>;
@@ -3635,7 +3752,7 @@
defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;
defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">;
-multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
+multiclass FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode round, string INST> {
def : Pat<(i32 (to_int (round f32:$Rn))),
(!cast<Instruction>(INST # UWSr) f32:$Rn)>;
def : Pat<(i64 (to_int (round f32:$Rn))),
@@ -3644,16 +3761,32 @@
(!cast<Instruction>(INST # UWDr) f64:$Rn)>;
def : Pat<(i64 (to_int (round f64:$Rn))),
(!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+
+ // These instructions saturate like fp_to_[su]int_sat.
+ def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)),
+ (!cast<Instruction>(INST # UWHr) f16:$Rn)>;
+ def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)),
+ (!cast<Instruction>(INST # UXHr) f16:$Rn)>;
+ def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)),
+ (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
+ def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)),
+ (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int_sat (round f64:$Rn), i32)),
+ (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
+ def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)),
+ (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
}
-defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">;
-defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">;
-defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">;
-defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">;
-defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">;
-defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
-defm : FPToIntegerPats<fp_to_sint, fround, "FCVTAS">;
-defm : FPToIntegerPats<fp_to_uint, fround, "FCVTAU">;
+defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fceil, "FCVTPS">;
+defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fceil, "FCVTPU">;
+defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ffloor, "FCVTMS">;
+defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ffloor, "FCVTMU">;
+defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ftrunc, "FCVTZS">;
+defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ftrunc, "FCVTZU">;
+defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fround, "FCVTAS">;
+defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fround, "FCVTAU">;
+
+
let Predicates = [HasFullFP16] in {
def : Pat<(i32 (lround f16:$Rn)),
@@ -3720,12 +3853,9 @@
defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>;
defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
-defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>;
+defm FRINTN : SingleOperandFPData<0b1000, "frintn", froundeven>;
defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
-def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))),
- (FRINTNDr FPR64:$Rn)>;
-
defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
@@ -3734,10 +3864,10 @@
}
let Predicates = [HasFRInt3264] in {
- defm FRINT32Z : FRIntNNT<0b00, "frint32z">;
- defm FRINT64Z : FRIntNNT<0b10, "frint64z">;
- defm FRINT32X : FRIntNNT<0b01, "frint32x">;
- defm FRINT64X : FRIntNNT<0b11, "frint64x">;
+ defm FRINT32Z : FRIntNNT<0b00, "frint32z", int_aarch64_frint32z>;
+ defm FRINT64Z : FRIntNNT<0b10, "frint64z", int_aarch64_frint64z>;
+ defm FRINT32X : FRIntNNT<0b01, "frint32x", int_aarch64_frint32x>;
+ defm FRINT64X : FRIntNNT<0b11, "frint64x", int_aarch64_frint64x>;
} // HasFRInt3264
let Predicates = [HasFullFP16] in {
@@ -3896,6 +4026,14 @@
Sched<[]>;
}
+// Pseudo instructions for homogeneous prolog/epilog
+let isPseudo = 1 in {
+ // Save CSRs in order, {FPOffset}
+ def HOM_Prolog : Pseudo<(outs), (ins variable_ops), []>, Sched<[]>;
+ // Restore CSRs in order
+ def HOM_Epilog : Pseudo<(outs), (ins variable_ops), []>, Sched<[]>;
+}
+
//===----------------------------------------------------------------------===//
// Floating point immediate move.
//===----------------------------------------------------------------------===//
@@ -4006,16 +4144,16 @@
defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>;
defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
-defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>;
+defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", froundeven>;
defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;
let Predicates = [HasFRInt3264] in {
- defm FRINT32Z : FRIntNNTVector<0, 0, "frint32z">;
- defm FRINT64Z : FRIntNNTVector<0, 1, "frint64z">;
- defm FRINT32X : FRIntNNTVector<1, 0, "frint32x">;
- defm FRINT64X : FRIntNNTVector<1, 1, "frint64x">;
+ defm FRINT32Z : FRIntNNTVector<0, 0, "frint32z", int_aarch64_neon_frint32z>;
+ defm FRINT64Z : FRIntNNTVector<0, 1, "frint64z", int_aarch64_neon_frint64z>;
+ defm FRINT32X : FRIntNNTVector<1, 0, "frint32x", int_aarch64_neon_frint32x>;
+ defm FRINT64X : FRIntNNTVector<1, 1, "frint64x", int_aarch64_neon_frint64x>;
} // HasFRInt3264
defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>;
@@ -4029,14 +4167,6 @@
def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}",
(NOTv16i8 V128:$Vd, V128:$Vn)>;
-def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>;
-def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>;
-def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>;
-def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>;
-def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>;
-def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
-def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;
-
def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
@@ -4044,7 +4174,7 @@
def : Pat<(vnot (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>;
+defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", bitreverse>;
defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
@@ -4059,9 +4189,8 @@
defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>;
defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>;
defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
- BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >;
-defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",
- int_aarch64_neon_uaddlp>;
+ BinOpFrag<(add node:$LHS, (AArch64uaddlp node:$RHS))> >;
+defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", AArch64uaddlp>;
defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
@@ -4102,6 +4231,37 @@
defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
+// Constant vector values, used in the S/UQXTN patterns below.
+def VImmFF: PatLeaf<(AArch64NvCast (v2i64 (AArch64movi_edit (i32 85))))>;
+def VImmFFFF: PatLeaf<(AArch64NvCast (v2i64 (AArch64movi_edit (i32 51))))>;
+def VImm7F: PatLeaf<(AArch64movi_shift (i32 127), (i32 0))>;
+def VImm80: PatLeaf<(AArch64mvni_shift (i32 127), (i32 0))>;
+def VImm7FFF: PatLeaf<(AArch64movi_msl (i32 127), (i32 264))>;
+def VImm8000: PatLeaf<(AArch64mvni_msl (i32 127), (i32 264))>;
+
+// trunc(umin(X, 255)) -> UQXTN v8i8
+def : Pat<(v8i8 (trunc (umin (v8i16 V128:$Vn), (v8i16 VImmFF)))),
+ (UQXTNv8i8 V128:$Vn)>;
+// trunc(umin(X, 65535)) -> UQXTN v4i16
+def : Pat<(v4i16 (trunc (umin (v4i32 V128:$Vn), (v4i32 VImmFFFF)))),
+ (UQXTNv4i16 V128:$Vn)>;
+// trunc(smin(smax(X, -128), 127)) -> SQXTN
+// with reversed min/max
+def : Pat<(v8i8 (trunc (smin (smax (v8i16 V128:$Vn), (v8i16 VImm80)),
+ (v8i16 VImm7F)))),
+ (SQXTNv8i8 V128:$Vn)>;
+def : Pat<(v8i8 (trunc (smax (smin (v8i16 V128:$Vn), (v8i16 VImm7F)),
+ (v8i16 VImm80)))),
+ (SQXTNv8i8 V128:$Vn)>;
+// trunc(smin(smax(X, -32768), 32767)) -> SQXTN
+// with reversed min/max
+def : Pat<(v4i16 (trunc (smin (smax (v4i32 V128:$Vn), (v4i32 VImm8000)),
+ (v4i32 VImm7FFF)))),
+ (SQXTNv4i16 V128:$Vn)>;
+def : Pat<(v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)),
+ (v4i32 VImm8000)))),
+ (SQXTNv4i16 V128:$Vn)>;
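+// As a rough source-level sketch, clamping each lane to [-128, 127] before a
+// narrowing cast (e.g. `r[i] = (int8_t)clamp(x[i], -128, 127)` over int16_t
+// data) produces the smax/smin/trunc shape above and now selects SQXTN.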
+
//===----------------------------------------------------------------------===//
// Advanced SIMD three vector instructions.
//===----------------------------------------------------------------------===//
@@ -4114,6 +4274,9 @@
defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
+foreach VT = [ v8i8, v16i8, v4i16, v8i16, v2i32, v4i32, v2i64 ] in {
+def : Pat<(vnot (AArch64cmeqz VT:$Rn)), (!cast<Instruction>("CMTST"#VT) VT:$Rn, VT:$Rn)>;
+}
defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>;
let Predicates = [HasNEON] in {
foreach VT = [ v2f32, v4f32, v2f64 ] in
@@ -4561,8 +4724,6 @@
defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
int_aarch64_neon_usqadd>;
-def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>;
-
def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))),
(FCVTASv1i64 FPR64:$Rn)>;
def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))),
@@ -4657,6 +4818,27 @@
def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
(FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+// Some float -> int -> float conversion patterns for which we want to keep the
+// int values in FP registers using the corresponding NEON instructions to
+// avoid more costly int <-> fp register transfers.
+let Predicates = [HasNEON] in {
+def : Pat<(f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))),
+ (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
+def : Pat<(f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))),
+ (SCVTFv1i32 (i32 (FCVTZSv1i32 f32:$Rn)))>;
+def : Pat<(f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))),
+ (UCVTFv1i64 (i64 (FCVTZUv1i64 f64:$Rn)))>;
+def : Pat<(f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))),
+ (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>;
+
+let Predicates = [HasFullFP16] in {
+def : Pat<(f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))),
+ (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>;
+def : Pat<(f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))),
+ (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
+}
+}
+
// If an integer is about to be converted to a floating point value,
// just load it on the floating point unit.
// Here are the patterns for 8 and 16-bits to float.
@@ -5247,6 +5429,13 @@
(i32 0xffff)),
(i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn),
+ VectorIndexB:$idx)))), (i64 0xff))),
+ (SUBREG_TO_REG (i64 0), (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx)), sub_32)>;
+def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn),
+ VectorIndexH:$idx)))), (i64 0xffff))),
+ (SUBREG_TO_REG (i64 0), (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx)), sub_32)>;
+
defm INS : SIMDIns;
def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
@@ -5499,6 +5688,25 @@
defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
+// Patterns for uaddv(uaddlp(x)) ==> uaddlv
+def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef,
+ (v4i16 (AArch64uaddv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))),
+ (i64 0))), (i64 0))),
+ (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+ (UADDLVv4i16v V64:$op), ssub), ssub)>;
+def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (AArch64uaddlp
+ (v16i8 V128:$op))))), (i64 0))),
+ (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+ (UADDLVv16i8v V128:$op), hsub), ssub)>;
+def : Pat<(v4i32 (AArch64uaddv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (UADDLVv8i16v V128:$op), ssub)>;
+
+// Patterns for addp(uaddlp(x))) ==> uaddlv
+def : Pat<(v2i32 (AArch64uaddv (v2i32 (AArch64uaddlp (v4i16 V64:$op))))),
+ (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (UADDLVv4i16v V64:$op), ssub)>;
+def : Pat<(v2i64 (AArch64uaddv (v2i64 (AArch64uaddlp (v4i32 V128:$op))))),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLVv4i32v V128:$op), dsub)>;
+
// Patterns for across-vector intrinsics, that have a node equivalent, that
// returns a vector (with only the low lane defined) instead of a scalar.
// In effect, opNode is the same as (scalar_to_vector (IntNode)).
@@ -5525,7 +5733,7 @@
// If none did, fallback to the explicit patterns, consuming the vector_extract.
def : Pat<(i32 (vector_extract (insert_subvector undef, (v8i8 (opNode V64:$Rn)),
- (i32 0)), (i64 0))),
+ (i64 0)), (i64 0))),
(EXTRACT_SUBREG (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn),
bsub), ssub)>;
@@ -5534,7 +5742,7 @@
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn),
bsub), ssub)>;
def : Pat<(i32 (vector_extract (insert_subvector undef,
- (v4i16 (opNode V64:$Rn)), (i32 0)), (i64 0))),
+ (v4i16 (opNode V64:$Rn)), (i64 0)), (i64 0))),
(EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn),
hsub), ssub)>;
@@ -5555,7 +5763,7 @@
// If there is a sign extension after this intrinsic, consume it as smov already
// performed it
def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef,
- (opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), i8)),
+ (opNode (v8i8 V64:$Rn)), (i64 0)), (i64 0))), i8)),
(i32 (SMOVvi8to32
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
@@ -5567,7 +5775,7 @@
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
(i64 0)))>;
def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef,
- (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), i16)),
+ (opNode (v4i16 V64:$Rn)), (i64 0)), (i64 0))), i16)),
(i32 (SMOVvi16to32
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
@@ -5586,7 +5794,7 @@
// If there is a masking operation keeping only what has been actually
// generated, consume it.
def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef,
- (opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), maski8_or_more)),
+ (opNode (v8i8 V64:$Rn)), (i64 0)), (i64 0))), maski8_or_more)),
(i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
@@ -5598,7 +5806,7 @@
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
ssub))>;
def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef,
- (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), maski16_or_more)),
+ (opNode (v4i16 V64:$Rn)), (i64 0)), (i64 0))), maski16_or_more)),
(i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
@@ -5921,7 +6129,7 @@
(v2f32 (AArch64duplane32
(v4f32 (insert_subvector undef,
(v2f32 (fneg V64:$Rm)),
- (i32 0))),
+ (i64 0))),
VectorIndexS:$idx)))),
(FMLSv2i32_indexed V64:$Rd, V64:$Rn,
(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
@@ -5942,7 +6150,7 @@
(v4f32 (AArch64duplane32
(v4f32 (insert_subvector undef,
(v2f32 (fneg V64:$Rm)),
- (i32 0))),
+ (i64 0))),
VectorIndexS:$idx)))),
(FMLSv4i32_indexed V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
@@ -5973,7 +6181,7 @@
def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
(vector_extract (v4f32 (insert_subvector undef,
(v2f32 (fneg V64:$Rm)),
- (i32 0))),
+ (i64 0))),
VectorIndexS:$idx))),
(FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
(SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
@@ -6569,6 +6777,46 @@
def : Ld1Lane128Pat<load, VectorIndexH, v8f16, f16, LD1i16>;
def : Ld1Lane128Pat<load, VectorIndexH, v8bf16, bf16, LD1i16>;
+// Generate LD1 for extload if memory type does not match the
+// destination type, for example:
+//
+// (v4i32 (insert_vector_elt (load anyext from i8) idx))
+//
+// In this case, the index must be adjusted to match LD1 type.
+//
+class Ld1Lane128IdxOpPat<SDPatternOperator scalar_load, Operand
+ VecIndex, ValueType VTy, ValueType STy,
+ Instruction LD1, SDNodeXForm IdxOp>
+ : Pat<(vector_insert (VTy VecListOne128:$Rd),
+ (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+ (LD1 VecListOne128:$Rd, (IdxOp VecIndex:$idx), GPR64sp:$Rn)>;
+
+def VectorIndexStoH : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+def VectorIndexStoB : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+def VectorIndexHtoB : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
+def : Ld1Lane128IdxOpPat<extloadi16, VectorIndexS, v4i32, i32, LD1i16, VectorIndexStoH>;
+def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexS, v4i32, i32, LD1i8, VectorIndexStoB>;
+def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexH, v8i16, i32, LD1i8, VectorIndexHtoB>;
+
+// Same as above, but the first element is populated using
+// scalar_to_vector + insert_subvector instead of insert_vector_elt.
+class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
+ SDPatternOperator ExtLoad, Instruction LD1>
+ : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
+ (ResultTy (EXTRACT_SUBREG
+ (LD1 (VecTy (IMPLICIT_DEF)), 0, GPR64sp:$Rn), dsub))>;
+
+def : Ld1Lane128FirstElm<v2i32, v8i16, extloadi16, LD1i16>;
+def : Ld1Lane128FirstElm<v2i32, v16i8, extloadi8, LD1i8>;
+def : Ld1Lane128FirstElm<v4i16, v16i8, extloadi8, LD1i8>;
+
class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
: Pat<(vector_insert (VTy VecListOne64:$Rd),
@@ -7844,6 +8092,20 @@
// FIXME: add SVE dot-product patterns.
}
+// Custom DAG nodes and isel rules to make a 64-byte block out of eight GPRs,
+// so that it can be used as input to inline asm, and vice versa.
+def LS64_BUILD : SDNode<"AArch64ISD::LS64_BUILD", SDTypeProfile<1, 8, []>>;
+def LS64_EXTRACT : SDNode<"AArch64ISD::LS64_EXTRACT", SDTypeProfile<1, 2, []>>;
+def : Pat<(i64x8 (LS64_BUILD GPR64:$x0, GPR64:$x1, GPR64:$x2, GPR64:$x3,
+ GPR64:$x4, GPR64:$x5, GPR64:$x6, GPR64:$x7)),
+ (REG_SEQUENCE GPR64x8Class,
+ $x0, x8sub_0, $x1, x8sub_1, $x2, x8sub_2, $x3, x8sub_3,
+ $x4, x8sub_4, $x5, x8sub_5, $x6, x8sub_6, $x7, x8sub_7)>;
+foreach i = 0-7 in {
+ def : Pat<(i64 (LS64_EXTRACT (i64x8 GPR64x8:$val), (i32 i))),
+ (EXTRACT_SUBREG $val, !cast<SubRegIndex>("x8sub_"#i))>;
+}
+
let Predicates = [HasLS64] in {
def LD64B: LoadStore64B<0b101, "ld64b", (ins GPR64sp:$Rn),
(outs GPR64x8:$Rt)>;
@@ -7861,7 +8123,12 @@
def : ST64BPattern<int_aarch64_st64bv0, ST64BV0>;
}
+let Defs = [X16, X17], mayStore = 1, isCodeGenOnly = 1 in
+def StoreSwiftAsyncContext
+ : Pseudo<(outs), (ins GPR64:$ctx, GPR64sp:$base, simm9:$offset),
+ []>, Sched<[]>;
+
include "AArch64InstrAtomics.td"
include "AArch64SVEInstrInfo.td"
-
+include "AArch64SMEInstrInfo.td"
include "AArch64InstrGISel.td"
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index ad180cb..bf042c8 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
@@ -247,28 +248,38 @@
return std::numeric_limits<unsigned>::max();
case AArch64::STRDui:
case AArch64::STURDi:
+ case AArch64::STRDpre:
case AArch64::STRQui:
case AArch64::STURQi:
+ case AArch64::STRQpre:
case AArch64::STRBBui:
case AArch64::STURBBi:
case AArch64::STRHHui:
case AArch64::STURHHi:
case AArch64::STRWui:
+ case AArch64::STRWpre:
case AArch64::STURWi:
case AArch64::STRXui:
+ case AArch64::STRXpre:
case AArch64::STURXi:
case AArch64::LDRDui:
case AArch64::LDURDi:
+ case AArch64::LDRDpre:
case AArch64::LDRQui:
case AArch64::LDURQi:
+ case AArch64::LDRQpre:
case AArch64::LDRWui:
case AArch64::LDURWi:
+ case AArch64::LDRWpre:
case AArch64::LDRXui:
case AArch64::LDURXi:
+ case AArch64::LDRXpre:
case AArch64::STRSui:
case AArch64::STURSi:
+ case AArch64::STRSpre:
case AArch64::LDRSui:
case AArch64::LDURSi:
+ case AArch64::LDRSpre:
return Opc;
case AArch64::LDRSWui:
return AArch64::LDRWui;
@@ -303,33 +314,53 @@
case AArch64::STRSui:
case AArch64::STURSi:
return AArch64::STPSi;
+ case AArch64::STRSpre:
+ return AArch64::STPSpre;
case AArch64::STRDui:
case AArch64::STURDi:
return AArch64::STPDi;
+ case AArch64::STRDpre:
+ return AArch64::STPDpre;
case AArch64::STRQui:
case AArch64::STURQi:
return AArch64::STPQi;
+ case AArch64::STRQpre:
+ return AArch64::STPQpre;
case AArch64::STRWui:
case AArch64::STURWi:
return AArch64::STPWi;
+ case AArch64::STRWpre:
+ return AArch64::STPWpre;
case AArch64::STRXui:
case AArch64::STURXi:
return AArch64::STPXi;
+ case AArch64::STRXpre:
+ return AArch64::STPXpre;
case AArch64::LDRSui:
case AArch64::LDURSi:
return AArch64::LDPSi;
+ case AArch64::LDRSpre:
+ return AArch64::LDPSpre;
case AArch64::LDRDui:
case AArch64::LDURDi:
return AArch64::LDPDi;
+ case AArch64::LDRDpre:
+ return AArch64::LDPDpre;
case AArch64::LDRQui:
case AArch64::LDURQi:
return AArch64::LDPQi;
+ case AArch64::LDRQpre:
+ return AArch64::LDPQpre;
case AArch64::LDRWui:
case AArch64::LDURWi:
return AArch64::LDPWi;
+ case AArch64::LDRWpre:
+ return AArch64::LDPWpre;
case AArch64::LDRXui:
case AArch64::LDURXi:
return AArch64::LDPXi;
+ case AArch64::LDRXpre:
+ return AArch64::LDPXpre;
case AArch64::LDRSWui:
case AArch64::LDURSWi:
return AArch64::LDPSWi;
@@ -538,6 +569,37 @@
}
}
+static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
+
+ unsigned OpcA = FirstMI.getOpcode();
+ unsigned OpcB = MI.getOpcode();
+
+ switch (OpcA) {
+ default:
+ return false;
+ case AArch64::STRSpre:
+ return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
+ case AArch64::STRDpre:
+ return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
+ case AArch64::STRQpre:
+ return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
+ case AArch64::STRWpre:
+ return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
+ case AArch64::STRXpre:
+ return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
+ case AArch64::LDRSpre:
+ return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
+ case AArch64::LDRDpre:
+ return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
+ case AArch64::LDRQpre:
+ return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
+ case AArch64::LDRWpre:
+ return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
+ case AArch64::LDRXpre:
+ return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
+ }
+}
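+// For example (illustrative sequence, using x11 as the base register):
+//   str q0, [x11, #-32]!   // STRQpre
+//   str q1, [x11, #16]     // STRQui
+// is a candidate that the pairing code below can rewrite to:
+//   stp q0, q1, [x11, #-32]!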
+
// Returns the scale and offset range of pre/post indexed variants of MI.
static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
int &MinOffset, int &MaxOffset) {
@@ -560,17 +622,20 @@
static MachineOperand &getLdStRegOp(MachineInstr &MI,
unsigned PairedRegOp = 0) {
assert(PairedRegOp < 2 && "Unexpected register operand idx.");
- unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
+ bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
+ if (IsPreLdSt)
+ PairedRegOp += 1;
+ unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
return MI.getOperand(Idx);
}
static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
- unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
+ unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 : 1;
return MI.getOperand(Idx);
}
static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
- unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
+ unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 : 2;
return MI.getOperand(Idx);
}
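+// The operand layout of the pre-indexed forms assumed above (e.g. LDRXpre):
+//   operand 0: writeback base register (def)
+//   operand 1: Rt (transfer register)
+//   operand 2: base register
+//   operand 3: signed 9-bit immediate offset
+// which is why the register/base/offset indices are shifted by one relative
+// to the plain LDR/STR forms.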
@@ -580,10 +645,10 @@
assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
int LoadSize = TII->getMemScale(LoadInst);
int StoreSize = TII->getMemScale(StoreInst);
- int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
+ int UnscaledStOffset = TII->hasUnscaledLdStOffset(StoreInst)
? getLdStOffsetOp(StoreInst).getImm()
: getLdStOffsetOp(StoreInst).getImm() * StoreSize;
- int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
+ int UnscaledLdOffset = TII->hasUnscaledLdStOffset(LoadInst)
? getLdStOffsetOp(LoadInst).getImm()
: getLdStOffsetOp(LoadInst).getImm() * LoadSize;
return (UnscaledStOffset <= UnscaledLdOffset) &&
@@ -688,7 +753,7 @@
NextI = next_nodbg(NextI, E);
unsigned Opc = I->getOpcode();
- bool IsScaled = !TII->isUnscaledLdSt(Opc);
+ bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I);
bool MergeForward = Flags.getMergeForward();
@@ -794,7 +859,7 @@
int SExtIdx = Flags.getSExtIdx();
unsigned Opc =
SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
- bool IsUnscaled = TII->isUnscaledLdSt(Opc);
+ bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
bool MergeForward = Flags.getMergeForward();
@@ -875,7 +940,7 @@
int Offset = getLdStOffsetOp(*I).getImm();
int PairedOffset = getLdStOffsetOp(*Paired).getImm();
- bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
+ bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
if (IsUnscaled != PairedIsUnscaled) {
// We're trying to pair instructions that differ in how they are scaled. If
// I is scaled then scale the offset of Paired accordingly. Otherwise, do
@@ -893,8 +958,11 @@
}
// Which register is Rt and which is Rt2 depends on the offset order.
+  // However, for pre-indexed load/stores, Rt should be taken from the
+  // pre-indexed instruction.
MachineInstr *RtMI, *Rt2MI;
- if (Offset == PairedOffset + OffsetStride) {
+ if (Offset == PairedOffset + OffsetStride &&
+ !AArch64InstrInfo::isPreLdSt(*I)) {
RtMI = &*Paired;
Rt2MI = &*I;
// Here we swapped the assumption made for SExtIdx.
@@ -908,7 +976,7 @@
}
int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
// Scale the immediate offset, if necessary.
- if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
+ if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
"Unscaled offset cannot be scaled.");
OffsetImm /= TII->getMemScale(*RtMI);
@@ -939,13 +1007,20 @@
MI.clearRegisterKills(Reg, TRI);
}
}
- MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
- .add(RegOp0)
- .add(RegOp1)
- .add(BaseRegOp)
- .addImm(OffsetImm)
- .cloneMergedMemRefs({&*I, &*Paired})
- .setMIFlags(I->mergeFlagsWith(*Paired));
+
+ unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
+ MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));
+
+  // Add the writeback base register operand for pre-indexed ld/st pairs.
+ if (AArch64InstrInfo::isPreLdSt(*RtMI))
+ MIB.addReg(BaseRegOp.getReg(), RegState::Define);
+
+ MIB.add(RegOp0)
+ .add(RegOp1)
+ .add(BaseRegOp)
+ .addImm(OffsetImm)
+ .cloneMergedMemRefs({&*I, &*Paired})
+ .setMIFlags(I->mergeFlagsWith(*Paired));
(void)MIB;
@@ -1053,8 +1128,8 @@
// performance and correctness are verified only in little-endian.
if (!Subtarget->isLittleEndian())
return NextI;
- bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
- assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
+ bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
+ assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
"Unsupported ld/st match");
assert(LoadSize <= StoreSize && "Invalid load size");
int UnscaledLdOffset = IsUnscaled
@@ -1231,9 +1306,9 @@
unsigned OpcA = FirstMI.getOpcode();
unsigned OpcB = MI.getOpcode();
- // Opcodes match: nothing more to check.
+  // Opcodes match: nothing more to check, unless these are pre ld/st, which
+  // cannot be paired with an identical pre ld/st instruction.
if (OpcA == OpcB)
- return true;
+ return !AArch64InstrInfo::isPreLdSt(FirstMI);
// Try to match a sign-extended load/store with a zero-extended load/store.
bool IsValidLdStrOpc, PairIsValidLdStrOpc;
@@ -1256,8 +1331,14 @@
if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
return false;
+  // An STR<S,D,Q,W,X>pre followed by an STR<S,D,Q,W,X>ui, or an
+  // LDR<S,D,Q,W,X>pre followed by an LDR<S,D,Q,W,X>ui, is a candidate
+  // pair that can be merged.
+ if (isPreLdStPairCandidate(FirstMI, MI))
+ return true;
+
// Try to match an unscaled load/store with a scaled load/store.
- return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
+ return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
// FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
@@ -1447,7 +1528,7 @@
MBBI = next_nodbg(MBBI, E);
bool MayLoad = FirstMI.mayLoad();
- bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
+ bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
Register Reg = getLdStRegOp(FirstMI).getReg();
Register BaseReg = getLdStBaseOp(FirstMI).getReg();
int Offset = getLdStOffsetOp(FirstMI).getImm();
@@ -1495,7 +1576,7 @@
// a relocation.
Register MIBaseReg = getLdStBaseOp(MI).getReg();
int MIOffset = getLdStOffsetOp(MI).getImm();
- bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
+ bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
if (IsUnscaled != MIIsUnscaled) {
// We're trying to pair instructions that differ in how they are scaled.
// If FirstMI is scaled then scale the offset of MI accordingly.
@@ -1516,8 +1597,41 @@
}
}
- if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
- (Offset + OffsetStride == MIOffset))) {
+ bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
+
+ if (BaseReg == MIBaseReg) {
+      // If the offset of the second ld/st is not equal to the size of the
+      // destination register, it can't be combined into a pre-indexed ld/st
+      // pair. Additionally, if the base reg is used or modified, the
+      // operations can't be paired: bail and keep looking.
+ if (IsPreLdSt) {
+ bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
+ bool IsBaseRegUsed =
+ !UsedRegUnits.available(getLdStBaseOp(MI).getReg());
+ bool IsBaseRegModified =
+ !ModifiedRegUnits.available(getLdStBaseOp(MI).getReg());
+        // If the stored value and the base address of the second instruction
+        // are the same register, it would need to use the updated value of
+        // the base, so the instructions must not be folded.
+ bool IsMIRegTheSame =
+ getLdStRegOp(MI).getReg() == getLdStBaseOp(MI).getReg();
+ if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
+ IsMIRegTheSame) {
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
+ UsedRegUnits, TRI);
+ MemInsns.push_back(&MI);
+ continue;
+ }
+ } else {
+ if ((Offset != MIOffset + OffsetStride) &&
+ (Offset + OffsetStride != MIOffset)) {
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
+ UsedRegUnits, TRI);
+ MemInsns.push_back(&MI);
+ continue;
+ }
+ }
+
int MinOffset = Offset < MIOffset ? Offset : MIOffset;
if (FindNarrowMerge) {
// If the alignment requirements of the scaled wide load/store
@@ -1849,6 +1963,7 @@
MachineBasicBlock::iterator E = I->getParent()->end();
MachineInstr &MemMI = *I;
MachineBasicBlock::iterator MBBI = I;
+ MachineFunction &MF = *MemMI.getMF();
Register BaseReg = getLdStBaseOp(MemMI).getReg();
int Offset = getLdStOffsetOp(MemMI).getImm();
@@ -1876,11 +1991,16 @@
return E;
}
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ unsigned RedZoneSize =
+ Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
+
// Track which register units have been modified and used between the first
// insn (inclusive) and the second insn.
ModifiedRegUnits.clear();
UsedRegUnits.clear();
unsigned Count = 0;
+  bool MemAccessBeforeSPPreInc = false;
do {
MBBI = prev_nodbg(MBBI, B);
MachineInstr &MI = *MBBI;
@@ -1891,8 +2011,13 @@
++Count;
// If we found a match, return it.
- if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
+ if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
+ // Check that the update value is within our red zone limit (which may be
+ // zero).
+      if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
+ return E;
return MBBI;
+ }
// Update the status of what the instruction clobbered and used.
LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
@@ -1902,6 +2027,11 @@
if (!ModifiedRegUnits.available(BaseReg) ||
!UsedRegUnits.available(BaseReg))
return E;
+    // Keep track of whether we have seen a memory access before an SP
+    // pre-increment; in that case we need to validate later that the update
+    // amount respects the red zone.
+ if (BaseRegSP && MBBI->mayLoadOrStore())
+      MemAccessBeforeSPPreInc = true;
} while (MBBI != B && Count < Limit);
return E;
}
@@ -1968,7 +2098,7 @@
// Early exit if the offset is not possible to match. (6 bits of positive
// range, plus allow an extra one in case we find a later insn that matches
// with Offset-1)
- bool IsUnscaled = TII->isUnscaledLdSt(MI);
+ bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
int Offset = getLdStOffsetOp(MI).getImm();
int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
// Allow one more for offset.
@@ -1983,7 +2113,7 @@
findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
if (Paired != E) {
++NumPairCreated;
- if (TII->isUnscaledLdSt(MI))
+ if (TII->hasUnscaledLdStOffset(MI))
++NumUnscaledPairCreated;
// Keeping the iterator straight is a pain, so we let the merge routine tell
// us what the next instruction is after it's done mucking about.
@@ -2018,7 +2148,7 @@
}
// Don't know how to handle unscaled pre/post-index versions below, so bail.
- if (TII->isUnscaledLdSt(MI.getOpcode()))
+ if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
return false;
// Look back to try to find a pre-index instruction. For example,
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
new file mode 100644
index 0000000..be19d49
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
@@ -0,0 +1,614 @@
+//===- AArch64LowerHomogeneousPrologEpilog.cpp ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that lowers homogeneous prolog/epilog instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64InstPrinter.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+using namespace llvm;
+
+#define AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME \
+ "AArch64 homogeneous prolog/epilog lowering pass"
+
+cl::opt<int> FrameHelperSizeThreshold(
+ "frame-helper-size-threshold", cl::init(2), cl::Hidden,
+ cl::desc("The minimum number of instructions that are outlined in a frame "
+ "helper (default = 2)"));
+
+namespace {
+
+class AArch64LowerHomogeneousPE {
+public:
+ const AArch64InstrInfo *TII;
+
+ AArch64LowerHomogeneousPE(Module *M, MachineModuleInfo *MMI)
+ : M(M), MMI(MMI) {}
+
+ bool run();
+ bool runOnMachineFunction(MachineFunction &Fn);
+
+private:
+ Module *M;
+ MachineModuleInfo *MMI;
+
+ bool runOnMBB(MachineBasicBlock &MBB);
+ bool runOnMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+
+ /// Lower a HOM_Prolog pseudo instruction into a helper call
+ /// or a sequence of homogeneous stores.
+  /// When an FP setup follows, it can be optimized.
+ bool lowerProlog(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ /// Lower a HOM_Epilog pseudo instruction into a helper call
+ /// or a sequence of homogeneous loads.
+  /// When a return follows, it can be optimized.
+ bool lowerEpilog(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+};
+
+class AArch64LowerHomogeneousPrologEpilog : public ModulePass {
+public:
+ static char ID;
+
+ AArch64LowerHomogeneousPrologEpilog() : ModulePass(ID) {
+ initializeAArch64LowerHomogeneousPrologEpilogPass(
+ *PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
+ AU.setPreservesAll();
+ ModulePass::getAnalysisUsage(AU);
+ }
+ bool runOnModule(Module &M) override;
+
+ StringRef getPassName() const override {
+ return AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME;
+ }
+};
+
+} // end anonymous namespace
+
+char AArch64LowerHomogeneousPrologEpilog::ID = 0;
+
+INITIALIZE_PASS(AArch64LowerHomogeneousPrologEpilog,
+ "aarch64-lower-homogeneous-prolog-epilog",
+ AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME, false, false)
+
+bool AArch64LowerHomogeneousPrologEpilog::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ MachineModuleInfo *MMI =
+ &getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ return AArch64LowerHomogeneousPE(&M, MMI).run();
+}
+
+bool AArch64LowerHomogeneousPE::run() {
+ bool Changed = false;
+ for (auto &F : *M) {
+ if (F.empty())
+ continue;
+
+ MachineFunction *MF = MMI->getMachineFunction(F);
+ if (!MF)
+ continue;
+ Changed |= runOnMachineFunction(*MF);
+ }
+
+ return Changed;
+}
+enum FrameHelperType { Prolog, PrologFrame, Epilog, EpilogTail };
+
+/// Return a frame helper name with the given CSRs and the helper type.
+/// For instance, a prolog helper that saves x19 and x20 is named
+/// OUTLINED_FUNCTION_PROLOG_x19x20.
+static std::string getFrameHelperName(SmallVectorImpl<unsigned> &Regs,
+ FrameHelperType Type, unsigned FpOffset) {
+ std::ostringstream RegStream;
+ switch (Type) {
+ case FrameHelperType::Prolog:
+ RegStream << "OUTLINED_FUNCTION_PROLOG_";
+ break;
+ case FrameHelperType::PrologFrame:
+ RegStream << "OUTLINED_FUNCTION_PROLOG_FRAME" << FpOffset << "_";
+ break;
+ case FrameHelperType::Epilog:
+ RegStream << "OUTLINED_FUNCTION_EPILOG_";
+ break;
+ case FrameHelperType::EpilogTail:
+ RegStream << "OUTLINED_FUNCTION_EPILOG_TAIL_";
+ break;
+ }
+
+ for (auto Reg : Regs)
+ RegStream << AArch64InstPrinter::getRegisterName(Reg);
+
+ return RegStream.str();
+}
+
+/// Create a Function for the unique frame helper with the given name.
+/// Return a newly created MachineFunction with an empty MachineBasicBlock.
+static MachineFunction &createFrameHelperMachineFunction(Module *M,
+ MachineModuleInfo *MMI,
+ StringRef Name) {
+ LLVMContext &C = M->getContext();
+ Function *F = M->getFunction(Name);
+ assert(F == nullptr && "Function has been created before");
+ F = Function::Create(FunctionType::get(Type::getVoidTy(C), false),
+ Function::ExternalLinkage, Name, M);
+ assert(F && "Function was null!");
+
+ // Use ODR linkage to avoid duplication.
+ F->setLinkage(GlobalValue::LinkOnceODRLinkage);
+ F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ // Set no-opt/minsize, so we don't insert padding between outlined
+ // functions.
+ F->addFnAttr(Attribute::OptimizeNone);
+ F->addFnAttr(Attribute::NoInline);
+ F->addFnAttr(Attribute::MinSize);
+ F->addFnAttr(Attribute::Naked);
+
+ MachineFunction &MF = MMI->getOrCreateMachineFunction(*F);
+ // Remove unnecessary register liveness and set NoVRegs.
+ MF.getProperties().reset(MachineFunctionProperties::Property::TracksLiveness);
+ MF.getProperties().reset(MachineFunctionProperties::Property::IsSSA);
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+ MF.getRegInfo().freezeReservedRegs(MF);
+
+ // Create entry block.
+ BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
+ IRBuilder<> Builder(EntryBB);
+ Builder.CreateRetVoid();
+
+ // Insert the new block into the function.
+ MachineBasicBlock *MBB = MF.CreateMachineBasicBlock();
+ MF.insert(MF.begin(), MBB);
+
+ return MF;
+}
+
+/// Emit a store-pair instruction for frame-setup.
+static void emitStore(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Pos,
+ const TargetInstrInfo &TII, unsigned Reg1, unsigned Reg2,
+ int Offset, bool IsPreDec) {
+ bool IsFloat = AArch64::FPR64RegClass.contains(Reg1);
+ assert(!(IsFloat ^ AArch64::FPR64RegClass.contains(Reg2)));
+ unsigned Opc;
+ if (IsPreDec)
+ Opc = IsFloat ? AArch64::STPDpre : AArch64::STPXpre;
+ else
+ Opc = IsFloat ? AArch64::STPDi : AArch64::STPXi;
+
+ MachineInstrBuilder MIB = BuildMI(MBB, Pos, DebugLoc(), TII.get(Opc));
+ if (IsPreDec)
+ MIB.addDef(AArch64::SP);
+ MIB.addReg(Reg2)
+ .addReg(Reg1)
+ .addReg(AArch64::SP)
+ .addImm(Offset)
+ .setMIFlag(MachineInstr::FrameSetup);
+}
+
+/// Emit a load-pair instruction for frame-destroy.
+static void emitLoad(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Pos,
+ const TargetInstrInfo &TII, unsigned Reg1, unsigned Reg2,
+ int Offset, bool IsPostDec) {
+ bool IsFloat = AArch64::FPR64RegClass.contains(Reg1);
+ assert(!(IsFloat ^ AArch64::FPR64RegClass.contains(Reg2)));
+ unsigned Opc;
+ if (IsPostDec)
+ Opc = IsFloat ? AArch64::LDPDpost : AArch64::LDPXpost;
+ else
+ Opc = IsFloat ? AArch64::LDPDi : AArch64::LDPXi;
+
+ MachineInstrBuilder MIB = BuildMI(MBB, Pos, DebugLoc(), TII.get(Opc));
+ if (IsPostDec)
+ MIB.addDef(AArch64::SP);
+ MIB.addReg(Reg2, getDefRegState(true))
+ .addReg(Reg1, getDefRegState(true))
+ .addReg(AArch64::SP)
+ .addImm(Offset)
+ .setMIFlag(MachineInstr::FrameDestroy);
+}
+
+/// Return a unique function if a helper can be formed with the given Regs
+/// and frame type.
+/// 1) _OUTLINED_FUNCTION_PROLOG_x30x29x19x20x21x22:
+/// stp x22, x21, [sp, #-32]! ; x29/x30 has been stored at the caller
+/// stp x20, x19, [sp, #16]
+/// ret
+///
+/// 2) _OUTLINED_FUNCTION_PROLOG_FRAME32_x30x29x19x20x21x22:
+/// stp x22, x21, [sp, #-32]! ; x29/x30 has been stored at the caller
+/// stp x20, x19, [sp, #16]
+/// add fp, sp, #32
+/// ret
+///
+/// 3) _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22:
+/// mov x16, x30
+/// ldp x29, x30, [sp, #32]
+/// ldp x20, x19, [sp, #16]
+/// ldp x22, x21, [sp], #48
+/// ret x16
+///
+/// 4) _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20x21x22:
+/// ldp x29, x30, [sp, #32]
+/// ldp x20, x19, [sp, #16]
+/// ldp x22, x21, [sp], #48
+/// ret
+/// @param M module
+/// @param MMI machine module info
+/// @param Regs callee save regs that the helper will handle
+/// @param Type frame helper type
+/// @return a helper function
+static Function *getOrCreateFrameHelper(Module *M, MachineModuleInfo *MMI,
+ SmallVectorImpl<unsigned> &Regs,
+ FrameHelperType Type,
+ unsigned FpOffset = 0) {
+ assert(Regs.size() >= 2);
+ auto Name = getFrameHelperName(Regs, Type, FpOffset);
+ auto *F = M->getFunction(Name);
+ if (F)
+ return F;
+
+ auto &MF = createFrameHelperMachineFunction(M, MMI, Name);
+ MachineBasicBlock &MBB = *MF.begin();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+ int Size = (int)Regs.size();
+ switch (Type) {
+ case FrameHelperType::Prolog:
+ case FrameHelperType::PrologFrame: {
+ // Compute the remaining SP adjust beyond FP/LR.
+ auto LRIdx = std::distance(
+ Regs.begin(), std::find(Regs.begin(), Regs.end(), AArch64::LR));
+
+ // If the register stored to the lowest address is not LR, we must subtract
+ // more from SP here.
+ if (LRIdx != Size - 2) {
+ assert(Regs[Size - 2] != AArch64::LR);
+ emitStore(MF, MBB, MBB.end(), TII, Regs[Size - 2], Regs[Size - 1],
+ LRIdx - Size + 2, true);
+ }
+
+ // Store CSRs in the reverse order.
+ for (int I = Size - 3; I >= 0; I -= 2) {
+ // FP/LR has been stored at call-site.
+ if (Regs[I - 1] == AArch64::LR)
+ continue;
+ emitStore(MF, MBB, MBB.end(), TII, Regs[I - 1], Regs[I], Size - I - 1,
+ false);
+ }
+ if (Type == FrameHelperType::PrologFrame)
+ BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::ADDXri))
+ .addDef(AArch64::FP)
+ .addUse(AArch64::SP)
+ .addImm(FpOffset)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::RET))
+ .addReg(AArch64::LR);
+ break;
+ }
+ case FrameHelperType::Epilog:
+ case FrameHelperType::EpilogTail:
+ if (Type == FrameHelperType::Epilog)
+ // Stash LR to X16
+ BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::ORRXrs))
+ .addDef(AArch64::X16)
+ .addReg(AArch64::XZR)
+ .addUse(AArch64::LR)
+ .addImm(0);
+
+ for (int I = 0; I < Size - 2; I += 2)
+ emitLoad(MF, MBB, MBB.end(), TII, Regs[I], Regs[I + 1], Size - I - 2,
+ false);
+ // Restore the last CSR with post-increment of SP.
+ emitLoad(MF, MBB, MBB.end(), TII, Regs[Size - 2], Regs[Size - 1], Size,
+ true);
+
+ BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::RET))
+ .addReg(Type == FrameHelperType::Epilog ? AArch64::X16 : AArch64::LR);
+ break;
+ }
+
+ return M->getFunction(Name);
+}
+
+/// This function checks if a frame helper should be used for
+/// HOM_Prolog/HOM_Epilog pseudo instruction expansion.
+/// @param MBB machine basic block
+/// @param NextMBBI next instruction following HOM_Prolog/HOM_Epilog
+/// @param Regs callee save registers that are saved or restored.
+/// @param Type frame helper type
+/// @return True if using a helper is worthwhile.
+static bool shouldUseFrameHelper(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &NextMBBI,
+ SmallVectorImpl<unsigned> &Regs,
+ FrameHelperType Type) {
+ const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
+ auto RegCount = Regs.size();
+ assert(RegCount > 0 && (RegCount % 2 == 0));
+ // # of instructions that will be outlined.
+ int InstCount = RegCount / 2;
+
+ // Do not use a helper call when not saving LR.
+ if (std::find(Regs.begin(), Regs.end(), AArch64::LR) == Regs.end())
+ return false;
+
+ switch (Type) {
+ case FrameHelperType::Prolog:
+ // Prolog helper cannot save FP/LR.
+ InstCount--;
+ break;
+ case FrameHelperType::PrologFrame: {
+    // Effectively no change in InstCount since the FP adjustment is included.
+ break;
+ }
+ case FrameHelperType::Epilog:
+ // Bail-out if X16 is live across the epilog helper because it is used in
+ // the helper to handle X30.
+ for (auto NextMI = NextMBBI; NextMI != MBB.end(); NextMI++) {
+ if (NextMI->readsRegister(AArch64::W16, TRI))
+ return false;
+ }
+ // Epilog may not be in the last block. Check the liveness in successors.
+ for (const MachineBasicBlock *SuccMBB : MBB.successors()) {
+ if (SuccMBB->isLiveIn(AArch64::W16) || SuccMBB->isLiveIn(AArch64::X16))
+ return false;
+ }
+ // No change in InstCount for the regular epilog case.
+ break;
+ case FrameHelperType::EpilogTail: {
+ // EpilogTail helper includes the caller's return.
+ if (NextMBBI == MBB.end())
+ return false;
+ if (NextMBBI->getOpcode() != AArch64::RET_ReallyLR)
+ return false;
+ InstCount++;
+ break;
+ }
+ }
+
+ return InstCount >= FrameHelperSizeThreshold;
+}
+
+/// Lower a HOM_Epilog pseudo instruction into a helper call, creating the
+/// helper on demand, or emit a sequence of loads in place when a helper call
+/// is not used.
+///
+/// 1. With a helper including ret
+/// HOM_Epilog x30, x29, x19, x20, x21, x22 ; MBBI
+/// ret ; NextMBBI
+/// =>
+/// b _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20x21x22
+/// ... ; NextMBBI
+///
+/// 2. With a helper
+/// HOM_Epilog x30, x29, x19, x20, x21, x22
+/// =>
+/// bl _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22
+///
+/// 3. Without a helper
+/// HOM_Epilog x30, x29, x19, x20, x21, x22
+/// =>
+/// ldp x29, x30, [sp, #32]
+/// ldp x20, x19, [sp, #16]
+/// ldp x22, x21, [sp], #48
+bool AArch64LowerHomogeneousPE::lowerEpilog(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ auto &MF = *MBB.getParent();
+ MachineInstr &MI = *MBBI;
+
+ DebugLoc DL = MI.getDebugLoc();
+ SmallVector<unsigned, 8> Regs;
+ for (auto &MO : MI.operands())
+ if (MO.isReg())
+ Regs.push_back(MO.getReg());
+ int Size = (int)Regs.size();
+ if (Size == 0)
+ return false;
+  // Registers come in pairs.
+ assert(Size % 2 == 0);
+ assert(MI.getOpcode() == AArch64::HOM_Epilog);
+
+ auto Return = NextMBBI;
+ if (shouldUseFrameHelper(MBB, NextMBBI, Regs, FrameHelperType::EpilogTail)) {
+ // When MBB ends with a return, emit a tail-call to the epilog helper
+ auto *EpilogTailHelper =
+ getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::EpilogTail);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::TCRETURNdi))
+ .addGlobalAddress(EpilogTailHelper)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameDestroy)
+ .copyImplicitOps(MI)
+ .copyImplicitOps(*Return);
+ NextMBBI = std::next(Return);
+ Return->removeFromParent();
+ } else if (shouldUseFrameHelper(MBB, NextMBBI, Regs,
+ FrameHelperType::Epilog)) {
+ // The default epilog helper case.
+ auto *EpilogHelper =
+ getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::Epilog);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+ .addGlobalAddress(EpilogHelper)
+ .setMIFlag(MachineInstr::FrameDestroy)
+ .copyImplicitOps(MI);
+ } else {
+ // Fall back to no-helper.
+ for (int I = 0; I < Size - 2; I += 2)
+ emitLoad(MF, MBB, MBBI, *TII, Regs[I], Regs[I + 1], Size - I - 2, false);
+ // Restore the last CSR with post-increment of SP.
+ emitLoad(MF, MBB, MBBI, *TII, Regs[Size - 2], Regs[Size - 1], Size, true);
+ }
+
+ MBBI->removeFromParent();
+ return true;
+}
+
+/// Lower a HOM_Prolog pseudo instruction into a helper call, creating the
+/// helper on demand, or emit a sequence of stores in place when a helper call
+/// is not used.
+///
+/// 1. With a helper including frame-setup
+/// HOM_Prolog x30, x29, x19, x20, x21, x22, 32
+/// =>
+/// stp x29, x30, [sp, #-16]!
+/// bl _OUTLINED_FUNCTION_PROLOG_FRAME32_x30x29x19x20x21x22
+///
+/// 2. With a helper
+/// HOM_Prolog x30, x29, x19, x20, x21, x22
+/// =>
+/// stp x29, x30, [sp, #-16]!
+/// bl _OUTLINED_FUNCTION_PROLOG_x30x29x19x20x21x22
+///
+/// 3. Without a helper
+/// HOM_Prolog x30, x29, x19, x20, x21, x22
+/// =>
+/// stp x22, x21, [sp, #-48]!
+/// stp x20, x19, [sp, #16]
+/// stp x29, x30, [sp, #32]
+bool AArch64LowerHomogeneousPE::lowerProlog(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ auto &MF = *MBB.getParent();
+ MachineInstr &MI = *MBBI;
+
+ DebugLoc DL = MI.getDebugLoc();
+ SmallVector<unsigned, 8> Regs;
+ int LRIdx = 0;
+ Optional<int> FpOffset;
+ for (auto &MO : MI.operands()) {
+ if (MO.isReg()) {
+ if (MO.getReg() == AArch64::LR)
+ LRIdx = Regs.size();
+ Regs.push_back(MO.getReg());
+ } else if (MO.isImm()) {
+ FpOffset = MO.getImm();
+ }
+ }
+ int Size = (int)Regs.size();
+ if (Size == 0)
+ return false;
+  // Registers come in pairs.
+ assert(Size % 2 == 0);
+ assert(MI.getOpcode() == AArch64::HOM_Prolog);
+
+ if (FpOffset &&
+ shouldUseFrameHelper(MBB, NextMBBI, Regs, FrameHelperType::PrologFrame)) {
+ // FP/LR is stored at the top of stack before the prolog helper call.
+ emitStore(MF, MBB, MBBI, *TII, AArch64::LR, AArch64::FP, -LRIdx - 2, true);
+ auto *PrologFrameHelper = getOrCreateFrameHelper(
+ M, MMI, Regs, FrameHelperType::PrologFrame, *FpOffset);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+ .addGlobalAddress(PrologFrameHelper)
+ .setMIFlag(MachineInstr::FrameSetup)
+ .copyImplicitOps(MI)
+ .addReg(AArch64::FP, RegState::Implicit | RegState::Define)
+ .addReg(AArch64::SP, RegState::Implicit);
+ } else if (!FpOffset && shouldUseFrameHelper(MBB, NextMBBI, Regs,
+ FrameHelperType::Prolog)) {
+ // FP/LR is stored at the top of stack before the prolog helper call.
+ emitStore(MF, MBB, MBBI, *TII, AArch64::LR, AArch64::FP, -LRIdx - 2, true);
+ auto *PrologHelper =
+ getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::Prolog);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+ .addGlobalAddress(PrologHelper)
+ .setMIFlag(MachineInstr::FrameSetup)
+ .copyImplicitOps(MI);
+ } else {
+ // Fall back to no-helper.
+ emitStore(MF, MBB, MBBI, *TII, Regs[Size - 2], Regs[Size - 1], -Size, true);
+ for (int I = Size - 3; I >= 0; I -= 2)
+ emitStore(MF, MBB, MBBI, *TII, Regs[I - 1], Regs[I], Size - I - 1, false);
+ if (FpOffset) {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri))
+ .addDef(AArch64::FP)
+ .addUse(AArch64::SP)
+ .addImm(*FpOffset)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ MBBI->removeFromParent();
+ return true;
+}
+
+/// Process each machine instruction
+/// @param MBB machine basic block
+/// @param MBBI current instruction iterator
+/// @param NextMBBI next instruction iterator which can be updated
+/// @return True when IR is changed.
+bool AArch64LowerHomogeneousPE::runOnMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ break;
+ case AArch64::HOM_Prolog:
+ return lowerProlog(MBB, MBBI, NextMBBI);
+ case AArch64::HOM_Epilog:
+ return lowerEpilog(MBB, MBBI, NextMBBI);
+ }
+ return false;
+}
+
+bool AArch64LowerHomogeneousPE::runOnMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ Modified |= runOnMI(MBB, MBBI, NMBBI);
+ MBBI = NMBBI;
+ }
+
+ return Modified;
+}
+
+bool AArch64LowerHomogeneousPE::runOnMachineFunction(MachineFunction &MF) {
+ TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+ bool Modified = false;
+ for (auto &MBB : MF)
+ Modified |= runOnMBB(MBB);
+ return Modified;
+}
+
+ModulePass *llvm::createAArch64LowerHomogeneousPrologEpilogPass() {
+ return new AArch64LowerHomogeneousPrologEpilog();
+}
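
As a quick illustration of the naming scheme this new pass uses for its outlined helpers (see getFrameHelperName above), here is a standalone sketch that substitutes plain register-name strings for AArch64InstPrinter; it is illustrative only and not part of the patch.

#include <sstream>
#include <string>
#include <vector>

enum class FrameHelperKind { Prolog, PrologFrame, Epilog, EpilogTail };

// Rebuilds names such as OUTLINED_FUNCTION_PROLOG_FRAME32_x30x29x19x20x21x22.
static std::string frameHelperName(const std::vector<std::string> &Regs,
                                   FrameHelperKind Kind, unsigned FpOffset) {
  std::ostringstream OS;
  switch (Kind) {
  case FrameHelperKind::Prolog:
    OS << "OUTLINED_FUNCTION_PROLOG_";
    break;
  case FrameHelperKind::PrologFrame:
    OS << "OUTLINED_FUNCTION_PROLOG_FRAME" << FpOffset << "_";
    break;
  case FrameHelperKind::Epilog:
    OS << "OUTLINED_FUNCTION_EPILOG_";
    break;
  case FrameHelperKind::EpilogTail:
    OS << "OUTLINED_FUNCTION_EPILOG_TAIL_";
    break;
  }
  for (const std::string &R : Regs)
    OS << R; // register names are concatenated, e.g. x30x29x19x20x21x22
  return OS.str();
}

For example, frameHelperName({"x30", "x29", "x19", "x20", "x21", "x22"}, FrameHelperKind::PrologFrame, 32) produces the helper name referenced in the doc comments above.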
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
index 10e191f..450c270 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -39,7 +39,7 @@
unsigned TargetFlags = MO.getTargetFlags();
const Triple &TheTriple = Printer.TM.getTargetTriple();
if (!TheTriple.isOSBinFormatCOFF())
- return Printer.getSymbol(GV);
+ return Printer.getSymbolPreferLocal(*GV);
assert(TheTriple.isOSWindows() &&
"Windows is the only supported COFF target");
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 41343ba..6950675 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -73,8 +73,8 @@
const StringRef Key =
F.getFnAttribute("sign-return-address-key").getValueAsString();
- assert(Key.equals_lower("a_key") || Key.equals_lower("b_key"));
- return Key.equals_lower("b_key");
+ assert(Key.equals_insensitive("a_key") || Key.equals_insensitive("b_key"));
+ return Key.equals_insensitive("b_key");
}
AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF) : MF(MF) {
@@ -94,9 +94,11 @@
return;
}
- const StringRef BTIEnable = F.getFnAttribute("branch-target-enforcement").getValueAsString();
- assert(BTIEnable.equals_lower("true") || BTIEnable.equals_lower("false"));
- BranchTargetEnforcement = BTIEnable.equals_lower("true");
+ const StringRef BTIEnable =
+ F.getFnAttribute("branch-target-enforcement").getValueAsString();
+ assert(BTIEnable.equals_insensitive("true") ||
+ BTIEnable.equals_insensitive("false"));
+ BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
}
bool AArch64FunctionInfo::shouldSignReturnAddress(bool SpillsLR) const {
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/src/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index f60e2b6..e5e08e6 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -54,6 +54,12 @@
/// callee is expected to pop the args.
unsigned ArgumentStackToRestore = 0;
+ /// Space just below incoming stack pointer reserved for arguments being
+ /// passed on the stack during a tail call. This will be the difference
+ /// between the largest tail call argument space needed in this function and
+ /// what's already available by reusing space of incoming arguments.
+ unsigned TailCallReservedStack = 0;
+
/// HasStackFrame - True if this function has a stack frame. Set by
/// determineCalleeSaves().
bool HasStackFrame = false;
@@ -159,6 +165,14 @@
/// indirect branch destinations.
bool BranchTargetEnforcement = false;
+ /// Whether this function has an extended frame record [Ctx, FP, LR]. If so,
+ /// bit 60 of the in-memory FP will be 1 to enable other tools to detect the
+ /// extended record.
+ bool HasSwiftAsyncContext = false;
+
+ /// The stack slot where the Swift asynchronous context is stored.
+ int SwiftAsyncContextFrameIdx = std::numeric_limits<int>::max();
+
public:
explicit AArch64FunctionInfo(MachineFunction &MF);
@@ -172,6 +186,11 @@
ArgumentStackToRestore = bytes;
}
+ unsigned getTailCallReservedStack() const { return TailCallReservedStack; }
+ void setTailCallReservedStack(unsigned bytes) {
+ TailCallReservedStack = bytes;
+ }
+
bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; }
void setStackSizeSVE(uint64_t S) {
@@ -239,6 +258,13 @@
MaxOffset = std::max<int64_t>(Offset + ObjSize, MaxOffset);
}
+ if (SwiftAsyncContextFrameIdx != std::numeric_limits<int>::max()) {
+ int64_t Offset = MFI.getObjectOffset(getSwiftAsyncContextFrameIdx());
+ int64_t ObjSize = MFI.getObjectSize(getSwiftAsyncContextFrameIdx());
+ MinOffset = std::min<int64_t>(Offset, MinOffset);
+ MaxOffset = std::max<int64_t>(Offset + ObjSize, MaxOffset);
+ }
+
unsigned Size = alignTo(MaxOffset - MinOffset, 16);
assert((!HasCalleeSavedStackSize || getCalleeSavedStackSize() == Size) &&
"Invalid size calculated for callee saves");
@@ -372,6 +398,16 @@
bool branchTargetEnforcement() const { return BranchTargetEnforcement; }
+ void setHasSwiftAsyncContext(bool HasContext) {
+ HasSwiftAsyncContext = HasContext;
+ }
+ bool hasSwiftAsyncContext() const { return HasSwiftAsyncContext; }
+
+ void setSwiftAsyncContextFrameIdx(int FI) {
+ SwiftAsyncContextFrameIdx = FI;
+ }
+ int getSwiftAsyncContextFrameIdx() const { return SwiftAsyncContextFrameIdx; }
+
private:
// Hold the lists of LOHs.
MILOHContainer LOHContainerSet;
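
The new TailCallReservedStack field is easiest to see with numbers. A tiny sketch of the quantity it tracks, with invented values and a hypothetical helper name, based on the field's doc comment rather than on code in the patch:

// Sketch: the reserved space is the shortfall between the largest outgoing
// tail-call argument area this function needs and the incoming argument space
// it can reuse.
static unsigned tailCallReservedStack(unsigned LargestTailCallArgBytes,
                                      unsigned ReusableIncomingArgBytes) {
  return LargestTailCallArgBytes > ReusableIncomingArgBytes
             ? LargestTailCallArgBytes - ReusableIncomingArgBytes
             : 0;
}
// e.g. 48 bytes of outgoing tail-call arguments with 32 reusable incoming
// bytes leaves 16 bytes reserved just below the incoming stack pointer.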
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
index f3b8ef1..e8217ea 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -11,14 +11,13 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64MacroFusion.h"
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/MacroFusion.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;
-namespace {
-
/// CMN, CMP, TST followed by Bcc
static bool isArithmeticBccPair(const MachineInstr *FirstMI,
const MachineInstr &SecondMI, bool CmpOnly) {
@@ -410,13 +409,7 @@
return false;
}
-} // end namespace
-
-
-namespace llvm {
-
-std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation () {
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createAArch64MacroFusionDAGMutation() {
return createMacroFusionDAGMutation(shouldScheduleAdjacent);
}
-
-} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.h b/src/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.h
index 4e7ccbe..2999e7a 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.h
@@ -11,6 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACROFUSION_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACROFUSION_H
+
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
@@ -21,3 +24,5 @@
std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation();
} // llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64MACROFUSION_H
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
index d503c39..367594f 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -14,8 +14,6 @@
// independent, non-quadword FP/ASIMD floating-point multiply-accumulates.
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "aarch64-pbqp"
-
#include "AArch64PBQPRegAlloc.h"
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
@@ -28,6 +26,8 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#define DEBUG_TYPE "aarch64-pbqp"
+
using namespace llvm;
namespace {
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterBanks.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
index 7bbd992..615ce7d 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
@@ -10,7 +10,7 @@
//===----------------------------------------------------------------------===//
/// General Purpose Registers: W, X.
-def GPRRegBank : RegisterBank<"GPR", [GPR64all]>;
+def GPRRegBank : RegisterBank<"GPR", [XSeqPairsClass]>;
/// Floating Point/Vector Registers: B, H, S, D, Q.
def FPRRegBank : RegisterBank<"FPR", [QQQQ]>;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index f90856d..d1b901e 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -100,6 +100,8 @@
MF->getFunction().getAttributes().hasAttrSomewhere(
Attribute::SwiftError))
return CSR_AArch64_AAPCS_SwiftError_SaveList;
+ if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail)
+ return CSR_AArch64_AAPCS_SwiftTail_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
return CSR_AArch64_RT_MostRegs_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::Win64)
@@ -134,6 +136,8 @@
MF->getFunction().getAttributes().hasAttrSomewhere(
Attribute::SwiftError))
return CSR_Darwin_AArch64_AAPCS_SwiftError_SaveList;
+ if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail)
+ return CSR_Darwin_AArch64_AAPCS_SwiftTail_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
return CSR_Darwin_AArch64_RT_MostRegs_SaveList;
return CSR_Darwin_AArch64_AAPCS_SaveList;
@@ -199,6 +203,8 @@
->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
return CSR_Darwin_AArch64_AAPCS_SwiftError_RegMask;
+ if (CC == CallingConv::SwiftTail)
+ return CSR_Darwin_AArch64_AAPCS_SwiftTail_RegMask;
if (CC == CallingConv::PreserveMost)
return CSR_Darwin_AArch64_RT_MostRegs_RegMask;
return CSR_Darwin_AArch64_AAPCS_RegMask;
@@ -233,6 +239,11 @@
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
return SCS ? CSR_AArch64_AAPCS_SwiftError_SCS_RegMask
: CSR_AArch64_AAPCS_SwiftError_RegMask;
+ if (CC == CallingConv::SwiftTail) {
+ if (SCS)
+ report_fatal_error("ShadowCallStack attribute not supported with swifttail");
+ return CSR_AArch64_AAPCS_SwiftTail_RegMask;
+ }
if (CC == CallingConv::PreserveMost)
return SCS ? CSR_AArch64_RT_MostRegs_SCS_RegMask
: CSR_AArch64_RT_MostRegs_RegMask;
@@ -382,7 +393,7 @@
// stack needs to be dynamically re-aligned, the base pointer is the only
// reliable way to reference the locals.
if (MFI.hasVarSizedObjects() || MF.hasEHFunclets()) {
- if (needsStackRealignment(MF))
+ if (hasStackRealignment(MF))
return true;
if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
@@ -437,7 +448,7 @@
assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() ||
AFI->hasCalculatedStackSizeSVE()) &&
"Expected SVE area to be calculated by this point");
- return TFI.hasFP(MF) && !needsStackRealignment(MF) && !AFI->getStackSizeSVE();
+ return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeSVE();
}
bool AArch64RegisterInfo::requiresFrameIndexScavenging(
@@ -741,6 +752,9 @@
case AArch64::FPR128RegClassID:
return 32;
+ case AArch64::MatrixIndexGPR32_12_15RegClassID:
+ return 4;
+
case AArch64::DDRegClassID:
case AArch64::DDDRegClassID:
case AArch64::DDDDRegClassID:
@@ -761,7 +775,7 @@
const auto &MFI = MF.getFrameInfo();
if (!MF.hasEHFunclets() && !MFI.hasVarSizedObjects())
return AArch64::SP;
- else if (needsStackRealignment(MF))
+ else if (hasStackRealignment(MF))
return getBaseRegister();
return getFrameRegister(MF);
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 28d1988..67680e3 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -24,11 +24,9 @@
def bsub : SubRegIndex<8>;
def hsub : SubRegIndex<16>;
def ssub : SubRegIndex<32>;
- def dsub : SubRegIndex<32>;
+ def dsub : SubRegIndex<64>;
def sube32 : SubRegIndex<32>;
def subo32 : SubRegIndex<32>;
- def qhisub : SubRegIndex<64>;
- def qsub : SubRegIndex<64>;
def sube64 : SubRegIndex<64>;
def subo64 : SubRegIndex<64>;
// SVE
@@ -47,6 +45,16 @@
def qsub1 : SubRegIndex<128>;
def qsub2 : SubRegIndex<128>;
def qsub3 : SubRegIndex<128>;
+ // Note: Code depends on these having consecutive numbers
+ def zasubb : SubRegIndex<2048>; // (16 x 16)/1 bytes = 2048 bits
+ def zasubh0 : SubRegIndex<1024>; // (16 x 16)/2 bytes = 1024 bits
+ def zasubh1 : SubRegIndex<1024>; // (16 x 16)/2 bytes = 1024 bits
+ def zasubs0 : SubRegIndex<512>; // (16 x 16)/4 bytes = 512 bits
+ def zasubs1 : SubRegIndex<512>; // (16 x 16)/4 bytes = 512 bits
+ def zasubd0 : SubRegIndex<256>; // (16 x 16)/8 bytes = 256 bits
+ def zasubd1 : SubRegIndex<256>; // (16 x 16)/8 bytes = 256 bits
+ def zasubq0 : SubRegIndex<128>; // (16 x 16)/16 bytes = 128 bits
+ def zasubq1 : SubRegIndex<128>; // (16 x 16)/16 bytes = 128 bits
}
let Namespace = "AArch64" in {
@@ -724,7 +732,9 @@
!foreach(i, [0,1,2,3,4,5,6,7], !cast<SubRegIndex>("x8sub_"#i)),
!foreach(i, [0,1,2,3,4,5,6,7], (trunc (decimate (rotl GPR64, i), 2), 12))>;
-def GPR64x8Class : RegisterClass<"AArch64", [i64], 64, (trunc Tuples8X, 12)>;
+def GPR64x8Class : RegisterClass<"AArch64", [i64x8], 512, (trunc Tuples8X, 12)> {
+ let Size = 512;
+}
def GPR64x8AsmOp : AsmOperandClass {
let Name = "GPR64x8";
let ParserMethod = "tryParseGPR64x8";
@@ -1151,10 +1161,235 @@
let PrintMethod = "printRegWithShiftExtend<false, " # Scale # ", 'x', 0>";
}
-foreach Scale = [8, 16, 32, 64] in {
+foreach Scale = [8, 16, 32, 64, 128] in {
def GPR64shiftedAsmOpnd # Scale : GPR64ShiftExtendAsmOperand<"GPR64shifted", Scale, "GPR64">;
def GPR64shifted # Scale : GPR64ExtendRegisterOperand<"GPR64shiftedAsmOpnd" # Scale, Scale, GPR64>;
def GPR64NoXZRshiftedAsmOpnd # Scale : GPR64ShiftExtendAsmOperand<"GPR64NoXZRshifted", Scale, "GPR64common">;
def GPR64NoXZRshifted # Scale : GPR64ExtendRegisterOperand<"GPR64NoXZRshiftedAsmOpnd" # Scale, Scale, GPR64common>;
}
+
+// Accumulator array tiles.
+def ZAQ0 : AArch64Reg<0, "za0.q">;
+def ZAQ1 : AArch64Reg<1, "za1.q">;
+def ZAQ2 : AArch64Reg<2, "za2.q">;
+def ZAQ3 : AArch64Reg<3, "za3.q">;
+def ZAQ4 : AArch64Reg<4, "za4.q">;
+def ZAQ5 : AArch64Reg<5, "za5.q">;
+def ZAQ6 : AArch64Reg<6, "za6.q">;
+def ZAQ7 : AArch64Reg<7, "za7.q">;
+def ZAQ8 : AArch64Reg<8, "za8.q">;
+def ZAQ9 : AArch64Reg<9, "za9.q">;
+def ZAQ10 : AArch64Reg<10, "za10.q">;
+def ZAQ11 : AArch64Reg<11, "za11.q">;
+def ZAQ12 : AArch64Reg<12, "za12.q">;
+def ZAQ13 : AArch64Reg<13, "za13.q">;
+def ZAQ14 : AArch64Reg<14, "za14.q">;
+def ZAQ15 : AArch64Reg<15, "za15.q">;
+
+let SubRegIndices = [zasubq0, zasubq1] in {
+ def ZAD0 : AArch64Reg<0, "za0.d", [ZAQ0, ZAQ8]>;
+ def ZAD1 : AArch64Reg<1, "za1.d", [ZAQ1, ZAQ9]>;
+ def ZAD2 : AArch64Reg<2, "za2.d", [ZAQ2, ZAQ10]>;
+ def ZAD3 : AArch64Reg<3, "za3.d", [ZAQ3, ZAQ11]>;
+ def ZAD4 : AArch64Reg<4, "za4.d", [ZAQ4, ZAQ12]>;
+ def ZAD5 : AArch64Reg<5, "za5.d", [ZAQ5, ZAQ13]>;
+ def ZAD6 : AArch64Reg<6, "za6.d", [ZAQ6, ZAQ14]>;
+ def ZAD7 : AArch64Reg<7, "za7.d", [ZAQ7, ZAQ15]>;
+}
+
+let SubRegIndices = [zasubd0, zasubd1] in {
+ def ZAS0 : AArch64Reg<0, "za0.s", [ZAD0, ZAD4]>;
+ def ZAS1 : AArch64Reg<1, "za1.s", [ZAD1, ZAD5]>;
+ def ZAS2 : AArch64Reg<2, "za2.s", [ZAD2, ZAD6]>;
+ def ZAS3 : AArch64Reg<3, "za3.s", [ZAD3, ZAD7]>;
+}
+
+let SubRegIndices = [zasubs0, zasubs1] in {
+ def ZAH0 : AArch64Reg<0, "za0.h", [ZAS0, ZAS2]>;
+ def ZAH1 : AArch64Reg<1, "za1.h", [ZAS1, ZAS3]>;
+}
+
+let SubRegIndices = [zasubh0, zasubh1] in {
+ def ZAB0 : AArch64Reg<0, "za0.b", [ZAH0, ZAH1]>;
+}
+
+let SubRegIndices = [zasubb] in {
+ def ZA : AArch64Reg<0, "za", [ZAB0]>;
+}
+
+// SME Register Classes
+
+// Accumulator array
+def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> {
+ let Size = 2048;
+}
+
+// Accumulator array as single tiles
+def MPR8 : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> {
+ let Size = 2048;
+}
+def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> {
+ let Size = 1024;
+}
+def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> {
+ let Size = 512;
+}
+def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> {
+ let Size = 256;
+}
+def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> {
+ let Size = 128;
+}
+
+// SME Register Operands
+// There are three types of SME matrix register operands:
+// * Tiles:
+//
+// These tiles make up the larger accumulator matrix. The tile representation
+// has an element type suffix, e.g. za0.b or za15.q and can be any of the
+// registers:
+// ZAQ0..ZAQ15
+// ZAD0..ZAD7
+// ZAS0..ZAS3
+// ZAH0..ZAH1
+// or ZAB0
+//
+// * Tile vectors:
+//
+// Their representation is similar to regular tiles, but they have an extra
+// 'h' or 'v' to tell how the vector at [reg+offset] is laid out in the tile,
+// horizontally or vertically.
+//
+// e.g. za1h.h or za15v.q, which correspond to vectors in registers ZAH1 and
+// ZAQ15, respectively. The horizontal/vertical layout is a property of the
+// instruction rather than of the asm-operand itself or its register.
+// The distinction is required for the parsing/printing of the operand,
+// as from a compiler's perspective, the whole tile is read/written.
+//
+// * Accumulator matrix:
+//
+// This is the entire matrix accumulator register ZA (<=> ZAB0), printed as
+// 'za'.
+
+//
+// Tiles
+//
+
+class MatrixTileAsmOperand<string RC, int EltSize> : AsmOperandClass {
+ let Name = "MatrixTile" # EltSize;
+ let DiagnosticType = "Invalid" # Name;
+ let ParserMethod = "tryParseMatrixRegister";
+ let RenderMethod = "addMatrixOperands";
+ let PredicateMethod = "isMatrixRegOperand<"
+ # "MatrixKind::Tile" # ", "
+ # EltSize # ", AArch64::" # RC # "RegClassID>";
+}
+
+class MatrixTileOperand<int EltSize, int NumBitsForTile, RegisterClass RC>
+ : RegisterOperand<RC> {
+ let ParserMatchClass = MatrixTileAsmOperand<!cast<string>(RC), EltSize>;
+ let DecoderMethod = "DecodeMatrixTile<" # NumBitsForTile # ">";
+ let PrintMethod = "printMatrixTile";
+}
+
+def TileOp32 : MatrixTileOperand<32, 2, MPR32>;
+def TileOp64 : MatrixTileOperand<64, 3, MPR64>;
+
+//
+// Tile vectors (horizontal and vertical)
+//
+
+class MatrixTileVectorAsmOperand<string RC, int EltSize, int IsVertical>
+ : AsmOperandClass {
+ let Name = "MatrixTileVector" # !if(IsVertical, "V", "H") # EltSize;
+ let DiagnosticType = "Invalid" # Name;
+ let ParserMethod = "tryParseMatrixRegister";
+ let RenderMethod = "addMatrixOperands";
+ let PredicateMethod = "isMatrixRegOperand<"
+ # "MatrixKind::"
+ # !if(IsVertical, "Col", "Row") # ", "
+ # EltSize # ", AArch64::" # RC # "RegClassID>";
+}
+
+class MatrixTileVectorOperand<int EltSize, int NumBitsForTile,
+ RegisterClass RC, int IsVertical>
+ : RegisterOperand<RC> {
+ let ParserMatchClass = MatrixTileVectorAsmOperand<!cast<string>(RC), EltSize,
+ IsVertical>;
+ let DecoderMethod = "DecodeMatrixTile<" # NumBitsForTile # ">";
+ let PrintMethod = "printMatrixTileVector<" # IsVertical # ">";
+}
+
+def TileVectorOpH8 : MatrixTileVectorOperand< 8, 0, MPR8, 0>;
+def TileVectorOpH16 : MatrixTileVectorOperand< 16, 1, MPR16, 0>;
+def TileVectorOpH32 : MatrixTileVectorOperand< 32, 2, MPR32, 0>;
+def TileVectorOpH64 : MatrixTileVectorOperand< 64, 3, MPR64, 0>;
+def TileVectorOpH128 : MatrixTileVectorOperand<128, 4, MPR128, 0>;
+
+def TileVectorOpV8 : MatrixTileVectorOperand< 8, 0, MPR8, 1>;
+def TileVectorOpV16 : MatrixTileVectorOperand< 16, 1, MPR16, 1>;
+def TileVectorOpV32 : MatrixTileVectorOperand< 32, 2, MPR32, 1>;
+def TileVectorOpV64 : MatrixTileVectorOperand< 64, 3, MPR64, 1>;
+def TileVectorOpV128 : MatrixTileVectorOperand<128, 4, MPR128, 1>;
+
+//
+// Accumulator matrix
+//
+
+class MatrixAsmOperand<string RC, int EltSize> : AsmOperandClass {
+ let Name = "Matrix";
+ let DiagnosticType = "Invalid" # Name;
+ let ParserMethod = "tryParseMatrixRegister";
+ let RenderMethod = "addMatrixOperands";
+ let PredicateMethod = "isMatrixRegOperand<"
+ # "MatrixKind::Array" # ", "
+ # EltSize # ", AArch64::" # RC # "RegClassID>";
+}
+
+class MatrixOperand<RegisterClass RC, int EltSize> : RegisterOperand<RC> {
+ let ParserMatchClass = MatrixAsmOperand<!cast<string>(RC), EltSize>;
+ let PrintMethod = "printMatrix<" # EltSize # ">";
+}
+
+def MatrixOp : MatrixOperand<MPR, 0>;
+
+class MatrixTileListAsmOperand : AsmOperandClass {
+ let Name = "MatrixTileList";
+ let ParserMethod = "tryParseMatrixTileList";
+ let RenderMethod = "addMatrixTileListOperands";
+ let PredicateMethod = "isMatrixTileList";
+}
+
+class MatrixTileListOperand : Operand<i8> {
+ let ParserMatchClass = MatrixTileListAsmOperand<>;
+ let DecoderMethod = "DecodeMatrixTileListRegisterClass";
+ let EncoderMethod = "EncodeMatrixTileListRegisterClass";
+ let PrintMethod = "printMatrixTileList";
+}
+
+def MatrixTileList : MatrixTileListOperand<>;
+
+def MatrixIndexGPR32_12_15 : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 12, 15)> {
+ let DiagnosticType = "InvalidMatrixIndexGPR32_12_15";
+}
+def MatrixIndexGPR32Op12_15 : RegisterOperand<MatrixIndexGPR32_12_15> {
+ let EncoderMethod = "encodeMatrixIndexGPR32";
+}
+
+def SVCROperand : AsmOperandClass {
+ let Name = "SVCR";
+ let ParserMethod = "tryParseSVCR";
+ let DiagnosticType = "Invalid" # Name;
+}
+
+def svcr_op : Operand<i32> {
+ let ParserMatchClass = SVCROperand;
+ let PrintMethod = "printSVCROp";
+ let DecoderMethod = "DecodeSVCROp";
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ return AArch64SVCR::lookupSVCRByEncoding(MCOp.getImm()) != nullptr;
+ }];
+}
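
To make the tile hierarchy defined above concrete, here is a small standalone sketch (not part of the patch) that enumerates the tile names; the tile counts mirror the register definitions: za0.b; za0.h-za1.h; za0.s-za3.s; za0.d-za7.d; za0.q-za15.q.

#include <cstdio>

int main() {
  // One .b tile, two .h tiles, four .s tiles, eight .d tiles and sixteen .q
  // tiles: the tile count equals the element size in bytes.
  struct { char Suffix; unsigned Tiles; } Kinds[] = {
      {'b', 1}, {'h', 2}, {'s', 4}, {'d', 8}, {'q', 16}};
  for (const auto &K : Kinds)
    for (unsigned I = 0; I < K.Tiles; ++I)
      std::printf("za%u.%c\n", I, K.Suffix);
  return 0;
}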
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
index cb4dc84..c4965e7 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
@@ -186,12 +186,16 @@
struct SLSBLRThunkInserter : ThunkInserter<SLSBLRThunkInserter> {
const char *getThunkPrefix() { return SLSBLRNamePrefix; }
bool mayUseThunk(const MachineFunction &MF) {
+ ComdatThunks &= !MF.getSubtarget<AArch64Subtarget>().hardenSlsNoComdat();
// FIXME: This could also check if there are any BLRs in the function
// to more accurately reflect if a thunk will be needed.
return MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr();
}
void insertThunks(MachineModuleInfo &MMI);
void populateThunk(MachineFunction &MF);
+
+private:
+ bool ComdatThunks = true;
};
} // namespace
@@ -200,7 +204,7 @@
// based on which registers are actually used in BLR instructions in this
// function. But would that be a worthwhile optimization?
for (auto T : SLSBLRThunks)
- createThunkFunction(MMI, T.Name);
+ createThunkFunction(MMI, T.Name, ComdatThunks);
}
void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) {
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
new file mode 100644
index 0000000..6a0fa2f
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -0,0 +1,143 @@
+//=- AArch64SMEInstrInfo.td - AArch64 SME Instructions -*- tablegen -*-----=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Scalable Matrix Extension (SME) Instruction definitions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Add vector elements horizontally or vertically to ZA tile.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSME] in {
+def ADDHA_MPPZ_S : sme_add_vector_to_tile_u32<0b0, "addha">;
+def ADDVA_MPPZ_S : sme_add_vector_to_tile_u32<0b1, "addva">;
+}
+
+let Predicates = [HasSMEI64] in {
+def ADDHA_MPPZ_D : sme_add_vector_to_tile_u64<0b0, "addha">;
+def ADDVA_MPPZ_D : sme_add_vector_to_tile_u64<0b1, "addva">;
+}
+
+let Predicates = [HasSME] in {
+//===----------------------------------------------------------------------===//
+// Outer products
+//===----------------------------------------------------------------------===//
+
+defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b0, "bfmopa">;
+defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b1, "bfmops">;
+
+def FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa">;
+def FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops">;
+}
+
+let Predicates = [HasSMEF64] in {
+def FMOPA_MPPZZ_D : sme_outer_product_fp64<0b0, "fmopa">;
+def FMOPS_MPPZZ_D : sme_outer_product_fp64<0b1, "fmops">;
+}
+
+let Predicates = [HasSME] in {
+defm FMOPAL_MPPZZ : sme_f16_outer_product<0b0, "fmopa">;
+defm FMOPSL_MPPZZ : sme_f16_outer_product<0b1, "fmops">;
+
+def SMOPA_MPPZZ_S : sme_int_outer_product_i32<0b000, "smopa">;
+def SMOPS_MPPZZ_S : sme_int_outer_product_i32<0b001, "smops">;
+def UMOPA_MPPZZ_S : sme_int_outer_product_i32<0b110, "umopa">;
+def UMOPS_MPPZZ_S : sme_int_outer_product_i32<0b111, "umops">;
+def SUMOPA_MPPZZ_S : sme_int_outer_product_i32<0b010, "sumopa">;
+def SUMOPS_MPPZZ_S : sme_int_outer_product_i32<0b011, "sumops">;
+def USMOPA_MPPZZ_S : sme_int_outer_product_i32<0b100, "usmopa">;
+def USMOPS_MPPZZ_S : sme_int_outer_product_i32<0b101, "usmops">;
+}
+
+let Predicates = [HasSMEI64] in {
+def SMOPA_MPPZZ_D : sme_int_outer_product_i64<0b000, "smopa">;
+def SMOPS_MPPZZ_D : sme_int_outer_product_i64<0b001, "smops">;
+def UMOPA_MPPZZ_D : sme_int_outer_product_i64<0b110, "umopa">;
+def UMOPS_MPPZZ_D : sme_int_outer_product_i64<0b111, "umops">;
+def SUMOPA_MPPZZ_D : sme_int_outer_product_i64<0b010, "sumopa">;
+def SUMOPS_MPPZZ_D : sme_int_outer_product_i64<0b011, "sumops">;
+def USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa">;
+def USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops">;
+}
+
+let Predicates = [HasSME] in {
+//===----------------------------------------------------------------------===//
+// Loads and stores
+//===----------------------------------------------------------------------===//
+
+defm LD1_MXIPXX : sme_mem_ld_ss<"ld1">;
+defm ST1_MXIPXX : sme_mem_st_ss<"st1">;
+
+//===----------------------------------------------------------------------===//
+// Spill + fill
+//===----------------------------------------------------------------------===//
+
+defm LDR_ZA : sme_fill<"ldr">;
+defm STR_ZA : sme_spill<"str">;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+defm INSERT_MXIPZ : sme_vector_to_tile<"mova">;
+defm EXTRACT_ZPMXI : sme_tile_to_vector<"mova">;
+
+//===----------------------------------------------------------------------===//
+// Zero instruction
+//===----------------------------------------------------------------------===//
+
+defm ZERO_M : sme_zero<"zero">;
+
+//===----------------------------------------------------------------------===//
+// Mode selection and state access instructions
+//===----------------------------------------------------------------------===//
+
+// SME defines three pstate fields to set or clear PSTATE.SM, PSTATE.ZA, or
+// both fields:
+//
+// MSR SVCRSM, #<imm1>
+// MSR SVCRZA, #<imm1>
+// MSR SVCRSMZA, #<imm1>
+//
+// It's tricky to use the existing pstate operand defined in
+// AArch64SystemOperands.td since it only encodes 5 bits including op1;op2,
+// whereas these fields are also encoded in CRm[3:1].
+class MSRpstatesvcrImm0_1
+ : PstateWriteSimple<(ins svcr_op:$pstatefield, imm0_1:$imm), "msr",
+ "\t$pstatefield, $imm">,
+ Sched<[WriteSys]> {
+ bits<3> pstatefield;
+ bit imm;
+ let Inst{18-16} = 0b011; // op1
+ let Inst{11-9} = pstatefield;
+ let Inst{8} = imm;
+ let Inst{7-5} = 0b011; // op2
+}
+
+def MSRpstatesvcrImm1 : MSRpstatesvcrImm0_1;
+def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>;
+def : InstAlias<"smstart sm", (MSRpstatesvcrImm1 0b001, 0b1)>;
+def : InstAlias<"smstart za", (MSRpstatesvcrImm1 0b010, 0b1)>;
+
+def : InstAlias<"smstop", (MSRpstatesvcrImm1 0b011, 0b0)>;
+def : InstAlias<"smstop sm", (MSRpstatesvcrImm1 0b001, 0b0)>;
+def : InstAlias<"smstop za", (MSRpstatesvcrImm1 0b010, 0b0)>;
+
+//===----------------------------------------------------------------------===//
+// SVE2 instructions
+//===----------------------------------------------------------------------===//
+
+def REVD_ZPmZ : sve2_int_perm_revd<"revd">;
+
+defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0>;
+defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1>;
+
+defm DUP_PPzPRI : sve2_int_perm_dup_p<"dup">;
+
+} // End let Predicates = [HasSME]
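
The SMSTART/SMSTOP aliases defined above reduce to a small table. The field values in this sketch are read off the InstAlias definitions; the struct and array names are illustrative only.

// Each alias is "MSR <svcr>, #<imm>": the 3-bit pstatefield selects SVCRSM
// (0b001), SVCRZA (0b010) or SVCRSMZA (0b011); imm=1 starts, imm=0 stops.
struct SVCRAlias {
  const char *Asm;
  unsigned PStateField;
  unsigned Imm;
};
static const SVCRAlias SMEModeAliases[] = {
    {"smstart", 0b011, 1},    {"smstart sm", 0b001, 1},
    {"smstart za", 0b010, 1}, {"smstop", 0b011, 0},
    {"smstop sm", 0b001, 0},  {"smstop za", 0b010, 0},
};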
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index e09b840..91c3aec 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -182,6 +182,8 @@
def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>;
def AArch64fmaxnm_p : SDNode<"AArch64ISD::FMAXNM_PRED", SDT_AArch64Arith>;
def AArch64fminnm_p : SDNode<"AArch64ISD::FMINNM_PRED", SDT_AArch64Arith>;
+def AArch64fmax_p : SDNode<"AArch64ISD::FMAX_PRED", SDT_AArch64Arith>;
+def AArch64fmin_p : SDNode<"AArch64ISD::FMIN_PRED", SDT_AArch64Arith>;
def AArch64fmul_p : SDNode<"AArch64ISD::FMUL_PRED", SDT_AArch64Arith>;
def AArch64fsub_p : SDNode<"AArch64ISD::FSUB_PRED", SDT_AArch64Arith>;
def AArch64lsl_p : SDNode<"AArch64ISD::SHL_PRED", SDT_AArch64Arith>;
@@ -190,10 +192,12 @@
def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>;
def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>;
+def AArch64smulh_p : SDNode<"AArch64ISD::MULHS_PRED", SDT_AArch64Arith>;
def AArch64sub_p : SDNode<"AArch64ISD::SUB_PRED", SDT_AArch64Arith>;
def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>;
def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>;
+def AArch64umulh_p : SDNode<"AArch64ISD::MULHU_PRED", SDT_AArch64Arith>;
def SDT_AArch64IntExtend : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3, OtherVT>, SDTCisVec<4>,
@@ -205,8 +209,8 @@
def AArch64cnt_mt : SDNode<"AArch64ISD::CTPOP_MERGE_PASSTHRU", SDT_AArch64Arith>;
def AArch64fneg_mt : SDNode<"AArch64ISD::FNEG_MERGE_PASSTHRU", SDT_AArch64Arith>;
def AArch64fabs_mt : SDNode<"AArch64ISD::FABS_MERGE_PASSTHRU", SDT_AArch64Arith>;
-def AArch64abs_mt : SDNode<"AArch64ISD::ABS_MERGE_PASSTHRU", SDT_AArch64Arith>;
-def AArch64neg_mt : SDNode<"AArch64ISD::NEG_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64abs_mt : SDNode<"AArch64ISD::ABS_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64neg_mt : SDNode<"AArch64ISD::NEG_MERGE_PASSTHRU", SDT_AArch64Arith>;
def AArch64sxt_mt : SDNode<"AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>;
def AArch64uxt_mt : SDNode<"AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>;
def AArch64frintp_mt : SDNode<"AArch64ISD::FCEIL_MERGE_PASSTHRU", SDT_AArch64Arith>;
@@ -249,37 +253,32 @@
def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>;
def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>;
-def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
-def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>;
-
def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>]>;
def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED>;
-def SDT_IndexVector : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<2>]>;
-def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>;
+def AArch64splice : SDNode<"AArch64ISD::SPLICE", SDT_AArch64Arith>;
+
+def step_vector_oneuse : PatFrag<(ops node:$idx),
+ (step_vector node:$idx), [{
+ return N->hasOneUse();
+}]>;
def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>;
-def setoge_or_setge : PatFrags<(ops node:$lhs, node:$rhs),
- [(setoge node:$lhs, node:$rhs),
- (setge node:$lhs, node:$rhs)]>;
-def setogt_or_setgt : PatFrags<(ops node:$lhs, node:$rhs),
- [(setogt node:$lhs, node:$rhs),
- (setgt node:$lhs, node:$rhs)]>;
-def setoeq_or_seteq : PatFrags<(ops node:$lhs, node:$rhs),
- [(setoeq node:$lhs, node:$rhs),
- (seteq node:$lhs, node:$rhs)]>;
-def setone_or_setne : PatFrags<(ops node:$lhs, node:$rhs),
- [(setone node:$lhs, node:$rhs),
- (setne node:$lhs, node:$rhs)]>;
def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2),
(AArch64mul_p node:$pred, node:$src1, node:$src2), [{
return N->hasOneUse();
}]>;
+def SDT_AArch64Arith_Unpred : SDTypeProfile<1, 2, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>,
+ SDTCisSameAs<0,1>, SDTCisSameAs<1,2>
+]>;
+
+def AArch64bic : SDNode<"AArch64ISD::BIC", SDT_AArch64Arith_Unpred>;
let Predicates = [HasSVE] in {
defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
@@ -288,17 +287,17 @@
def SETFFR : sve_int_setffr<"setffr", int_aarch64_sve_setffr>;
def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>;
- defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add, null_frag>;
- defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub, null_frag>;
- defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>;
- defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>;
- defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>;
- defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>;
+ defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>;
+ defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>;
+ defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>;
+ defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat>;
+ defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat>;
+ defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat>;
defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>;
defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>;
defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>;
- defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", null_frag>;
+ defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", AArch64bic>;
defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>;
defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">;
@@ -318,13 +317,13 @@
defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>;
defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>;
- defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add, null_frag>;
- defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub, null_frag>;
+ defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>;
+ defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>;
defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>;
- defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>;
- defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>;
- defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>;
- defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>;
+ defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat>;
+ defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>;
+ defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat>;
+ defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat>;
defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>;
defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>;
@@ -338,13 +337,14 @@
defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_p>;
defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_p>;
defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_p>;
- defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_p>;
- defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_p>;
- defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_p>;
+ defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_p>;
+ defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_p>;
+ defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_p>;
defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or>;
defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>;
defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>;
+ defm BIC_ZI : sve_int_log_imm_bic<AArch64bic>;
defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_p>;
defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_p>;
@@ -357,6 +357,8 @@
defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", "UMULH_ZPZZ", int_aarch64_sve_umulh, DestructiveBinaryComm>;
defm MUL_ZPZZ : sve_int_bin_pred_bhsd<AArch64mul_p>;
+ defm SMULH_ZPZZ : sve_int_bin_pred_bhsd<AArch64smulh_p>;
+ defm UMULH_ZPZZ : sve_int_bin_pred_bhsd<AArch64umulh_p>;
defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPZZ", int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev, "SDIVR_ZPmZ">;
defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", "UDIV_ZPZZ", int_aarch64_sve_udiv, DestructiveBinaryCommWithRev, "UDIVR_ZPmZ">;
@@ -366,8 +368,8 @@
defm SDIV_ZPZZ : sve_int_bin_pred_sd<AArch64sdiv_p>;
defm UDIV_ZPZZ : sve_int_bin_pred_sd<AArch64udiv_p>;
- defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>;
- defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>;
+ defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", AArch64sdot>;
+ defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", AArch64udot>;
defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;
@@ -396,10 +398,10 @@
defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", "SABD_ZPZZ", int_aarch64_sve_sabd, DestructiveBinaryComm>;
defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", "UABD_ZPZZ", int_aarch64_sve_uabd, DestructiveBinaryComm>;
- defm SMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64smax_p>;
- defm UMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64umax_p>;
- defm SMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64smin_p>;
- defm UMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64umin_p>;
+ defm SMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64smax_p>;
+ defm UMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64umax_p>;
+ defm SMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64smin_p>;
+ defm UMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64umin_p>;
defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", int_aarch64_sve_frecpe_x>;
defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", int_aarch64_sve_frsqrte_x>;
@@ -432,6 +434,8 @@
defm FMUL_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmul_p>;
defm FMAXNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmaxnm_p>;
defm FMINNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fminnm_p>;
+ defm FMAX_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmax_p>;
+ defm FMIN_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmin_p>;
defm FDIV_ZPZZ : sve_fp_bin_pred_hfd<AArch64fdiv_p>;
let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
@@ -461,31 +465,57 @@
defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>;
defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>;
- defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", int_aarch64_sve_fmla>;
- defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", int_aarch64_sve_fmls>;
- defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", int_aarch64_sve_fnmla>;
- defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", int_aarch64_sve_fnmls>;
+ defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", "FMLA_ZPZZZ", int_aarch64_sve_fmla, "FMAD_ZPmZZ">;
+ defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", "FMLS_ZPZZZ", int_aarch64_sve_fmls, "FMSB_ZPmZZ">;
+ defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", "FNMLA_ZPZZZ", int_aarch64_sve_fnmla, "FNMAD_ZPmZZ">;
+ defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", "FNMLS_ZPZZZ", int_aarch64_sve_fnmls, "FNMSB_ZPmZZ">;
- defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad>;
- defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb>;
- defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>;
- defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>;
+ defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad, "FMLA_ZPmZZ", /*isReverseInstr*/ 1>;
+ defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb, "FMLS_ZPmZZ", /*isReverseInstr*/ 1>;
+ defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad, "FNMLA_ZPmZZ", /*isReverseInstr*/ 1>;
+ defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb, "FNMLS_ZPmZZ", /*isReverseInstr*/ 1>;
- // Add patterns for FMA where disabled lanes are undef.
- // FIXME: Implement a pseudo so we can choose a better instruction after
- // regalloc.
- def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)),
- (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv4f16 (AArch64fma_p nxv4i1:$P, nxv4f16:$Op1, nxv4f16:$Op2, nxv4f16:$Op3)),
- (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv2f16 (AArch64fma_p nxv2i1:$P, nxv2f16:$Op1, nxv2f16:$Op2, nxv2f16:$Op3)),
- (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)),
- (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv2f32 (AArch64fma_p nxv2i1:$P, nxv2f32:$Op1, nxv2f32:$Op2, nxv2f32:$Op3)),
- (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)),
- (FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>;
+ defm FMLA_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fmla, int_aarch64_sve_fmad>;
+ defm FMLS_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fmls, int_aarch64_sve_fmsb>;
+ defm FNMLA_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fnmla, int_aarch64_sve_fnmad>;
+ defm FNMLS_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fnmls, int_aarch64_sve_fnmsb>;
+
+ multiclass fma<ValueType Ty, ValueType PredTy, string Suffix> {
+ // Zd = Za + Zn * Zm
+ def : Pat<(Ty (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za)),
+ (!cast<Instruction>("FMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zd = Za + -Zn * Zm
+ def : Pat<(Ty (AArch64fma_p PredTy:$P, (AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, Ty:$Za)),
+ (!cast<Instruction>("FMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zd = -Za + Zn * Zm
+ def : Pat<(Ty (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, (AArch64fneg_mt PredTy:$P, Ty:$Za, (Ty (undef))))),
+ (!cast<Instruction>("FNMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zd = -Za + -Zn * Zm
+ def : Pat<(Ty (AArch64fma_p PredTy:$P, (AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, (AArch64fneg_mt PredTy:$P, Ty:$Za, (Ty (undef))))),
+ (!cast<Instruction>("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zd = -(Za + Zn * Zm)
+ def : Pat<(AArch64fneg_mt PredTy:$P, (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za), (Ty (undef))),
+ (!cast<Instruction>("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zda = Zda + Zn * Zm
+ def : Pat<(vselect (PredTy PPR:$Pg), (Ty (AArch64fma_p (PredTy (AArch64ptrue 31)), ZPR:$Zn, ZPR:$Zm, ZPR:$Za)), ZPR:$Za),
+ (!cast<Instruction>("FMLA_ZPmZZ_"#Suffix) PPR:$Pg, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zda = Zda + -Zn * Zm
+ def : Pat<(vselect (PredTy PPR:$Pg), (Ty (AArch64fma_p (PredTy (AArch64ptrue 31)), (AArch64fneg_mt (PredTy (AArch64ptrue 31)), Ty:$Zn, (Ty (undef))), ZPR:$Zm, ZPR:$Za)), ZPR:$Za),
+ (!cast<Instruction>("FMLS_ZPmZZ_"#Suffix) PPR:$Pg, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+ }
+
+ defm : fma<nxv8f16, nxv8i1, "H">;
+ defm : fma<nxv4f16, nxv4i1, "H">;
+ defm : fma<nxv2f16, nxv2i1, "H">;
+ defm : fma<nxv4f32, nxv4i1, "S">;
+ defm : fma<nxv2f32, nxv2i1, "S">;
+ defm : fma<nxv2f64, nxv2i1, "D">;
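For reference, a quick check of the sign bookkeeping behind the FNMLA/FNMLS selections in the fma multiclass above, in the same notation as its comments (per-element, under the governing predicate):

    -Za + (-Zn) * Zm  =  -(Za + Zn * Zm)  =  -Za - Zn * Zm   // both forms select FNMLA
    -Za +   Zn  * Zm  =  -(Za - Zn * Zm)                     // selects FNMLS

so the product-negated and whole-expression-negated forms share the FNMLA pseudo, while the accumulator-only negation maps to FNMLS.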
defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;
@@ -504,8 +534,8 @@
defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", AArch64fminv_p>;
// Splat immediate (unpredicated)
- defm DUP_ZI : sve_int_dup_imm<"dup">;
- defm FDUP_ZI : sve_int_dup_fpimm<"fdup">;
+ defm DUP_ZI : sve_int_dup_imm<"dup">;
+ defm FDUP_ZI : sve_int_dup_fpimm<"fdup">;
defm DUPM_ZI : sve_int_dup_mask_imm<"dupm">;
// Splat immediate (predicated)
@@ -556,6 +586,14 @@
def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))),
(DUP_ZI_D $a, $b)>;
+ // Duplicate immediate FP into all vector elements.
+ def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))),
+ (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
+ def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))),
+ (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
+ def : Pat<(nxv2f64 (AArch64dup (f64 fpimm:$val))),
+ (DUP_ZR_D (MOVi64imm (bitcast_fpimm_to_i64 f64:$val)))>;
+
// Duplicate FP immediate into all vector elements
let AddedComplexity = 2 in {
def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)),
@@ -575,7 +613,7 @@
// Select elements from either vector (predicated)
defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;
- defm SPLICE_ZPZ : sve_int_perm_splice<"splice", int_aarch64_sve_splice>;
+ defm SPLICE_ZPZ : sve_int_perm_splice<"splice", AArch64splice>;
defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>;
defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>;
@@ -587,8 +625,8 @@
defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>;
- defm REV_PP : sve_int_perm_reverse_p<"rev", AArch64rev>;
- defm REV_ZZ : sve_int_perm_reverse_z<"rev", AArch64rev>;
+ defm REV_PP : sve_int_perm_reverse_p<"rev", vector_reverse>;
+ defm REV_ZZ : sve_int_perm_reverse_z<"rev", vector_reverse>;
defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>;
defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>;
@@ -931,7 +969,7 @@
// st1h z0.d, p0, [x0, z0.d, uxtw]
defm SST1B_D : sve_mem_64b_sst_sv_32_unscaled<0b000, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm SST1H_D : sve_mem_64b_sst_sv_32_unscaled<0b010, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
- defm SST1W_D : sve_mem_64b_sst_sv_32_unscaled<0b100, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8,nxv2i32>;
+ defm SST1W_D : sve_mem_64b_sst_sv_32_unscaled<0b100, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
defm SST1D : sve_mem_64b_sst_sv_32_unscaled<0b110, "st1d", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
// Scatters using packed, unscaled 32-bit offsets, e.g.
@@ -1188,6 +1226,20 @@
def : Pat<(nxv8bf16 (concat_vectors nxv4bf16:$v1, nxv4bf16:$v2)),
(UZP1_ZZZ_H $v1, $v2)>;
+ // Splice with lane equal to -1
+ def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 -1))),
+ (INSR_ZV_B ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+ (LASTB_VPZ_B (PTRUE_B 31), ZPR:$Z1), bsub))>;
+ def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 -1))),
+ (INSR_ZV_H ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+ (LASTB_VPZ_H (PTRUE_H 31), ZPR:$Z1), hsub))>;
+ def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 -1))),
+ (INSR_ZV_S ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+ (LASTB_VPZ_S (PTRUE_S 31), ZPR:$Z1), ssub))>;
+ def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 -1))),
+ (INSR_ZV_D ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+ (LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>;
+
defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;
@@ -1217,20 +1269,20 @@
defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>;
defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>;
- defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge_or_setge>;
- defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt_or_setgt>;
- defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq_or_seteq>;
- defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone_or_setne>;
- defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>;
+ defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", SETOGE, SETGE, SETOLE, SETLE>;
+ defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", SETOGT, SETGT, SETOLT, SETLT>;
+ defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>;
+ defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", SETONE, SETNE, SETONE, SETNE>;
+ defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", SETUO, SETUO, SETUO, SETUO>;
defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>;
defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>;
- defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">;
- defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">;
- defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt">;
- defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle">;
- defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">;
- defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">;
+ defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge", SETOGE, SETGE, SETOLE, SETLE>;
+ defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt", SETOGT, SETGT, SETOLT, SETLT>;
+ defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt", SETOLT, SETLT, SETOGT, SETGT>;
+ defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle", SETOLE, SETLE, SETOGE, SETGE>;
+ defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>;
+ defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne", SETONE, SETNE, SETONE, SETNE>;
defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>;
defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>;
@@ -1329,8 +1381,8 @@
defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp", int_aarch64_sve_sqdecp_n64>;
defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp", int_aarch64_sve_uqdecp_n32>;
defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp", int_aarch64_sve_uqdecp_n64>;
- defm INCP_XP : sve_int_count_r_x64<0b10000, "incp">;
- defm DECP_XP : sve_int_count_r_x64<0b10100, "decp">;
+ defm INCP_XP : sve_int_count_r_x64<0b10000, "incp", null_frag, add>;
+ defm DECP_XP : sve_int_count_r_x64<0b10100, "decp", null_frag, sub>;
defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp", int_aarch64_sve_sqincp>;
defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp", int_aarch64_sve_uqincp>;
@@ -1339,19 +1391,19 @@
defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
- defm INDEX_RR : sve_int_index_rr<"index", index_vector>;
- defm INDEX_IR : sve_int_index_ir<"index", index_vector>;
- defm INDEX_RI : sve_int_index_ri<"index", index_vector>;
- defm INDEX_II : sve_int_index_ii<"index", index_vector>;
+ defm INDEX_RR : sve_int_index_rr<"index", step_vector, step_vector_oneuse, AArch64mul_p_oneuse>;
+ defm INDEX_IR : sve_int_index_ir<"index", step_vector, step_vector_oneuse, AArch64mul_p, AArch64mul_p_oneuse>;
+ defm INDEX_RI : sve_int_index_ri<"index", step_vector, step_vector_oneuse>;
+ defm INDEX_II : sve_int_index_ii<"index", step_vector, step_vector_oneuse>;
// Unpredicated shifts
defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_p>;
defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_p>;
defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_p>;
- defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
- defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
- defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
+ defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr", int_aarch64_sve_asr_wide>;
+ defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr", int_aarch64_sve_lsr_wide>;
+ defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl", int_aarch64_sve_lsl_wide>;
// Predicated shifts
defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr", "ASR_ZPZI", int_aarch64_sve_asr>;
@@ -1364,10 +1416,10 @@
defm LSL_ZPZI : sve_int_shift_pred_bhsd<AArch64lsl_p, SVEShiftImmL8, SVEShiftImmL16, SVEShiftImmL32, SVEShiftImmL64>;
let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
- defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>;
- defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>;
- defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>;
- defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>;
+ defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>;
+ defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>;
+ defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>;
+ defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>;
}
defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ">;
@@ -1377,9 +1429,9 @@
defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", /*isReverseInstr*/ 1>;
defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", /*isReverseInstr*/ 1>;
- defm ASR_ZPZZ : sve_int_bin_pred_bhsd<AArch64asr_p>;
- defm LSR_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsr_p>;
- defm LSL_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsl_p>;
+ defm ASR_ZPZZ : sve_int_bin_pred_bhsd<AArch64asr_p>;
+ defm LSR_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsr_p>;
+ defm LSL_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsl_p>;
defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>;
defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>;
@@ -1605,6 +1657,45 @@
def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
+ let AddedComplexity = 1 in {
+ class LD1RPat<ValueType vt, SDPatternOperator operator,
+ Instruction load, Instruction ptrue, ValueType index_vt, ComplexPattern CP, Operand immtype> :
+ Pat<(vt (AArch64dup (index_vt (operator (CP GPR64:$base, immtype:$offset))))),
+ (load (ptrue 31), GPR64:$base, $offset)>;
+ }
+
+ // LD1R of 8-bit data
+ def : LD1RPat<nxv16i8, extloadi8, LD1RB_IMM, PTRUE_B, i32, am_indexed8_6b, uimm6s1>;
+ def : LD1RPat<nxv8i16, zextloadi8, LD1RB_H_IMM, PTRUE_H, i32, am_indexed8_6b, uimm6s1>;
+ def : LD1RPat<nxv4i32, zextloadi8, LD1RB_S_IMM, PTRUE_S, i32, am_indexed8_6b, uimm6s1>;
+ def : LD1RPat<nxv2i64, zextloadi8, LD1RB_D_IMM, PTRUE_D, i64, am_indexed8_6b, uimm6s1>;
+ def : LD1RPat<nxv8i16, sextloadi8, LD1RSB_H_IMM, PTRUE_H, i32, am_indexed8_6b, uimm6s1>;
+ def : LD1RPat<nxv4i32, sextloadi8, LD1RSB_S_IMM, PTRUE_S, i32, am_indexed8_6b, uimm6s1>;
+ def : LD1RPat<nxv2i64, sextloadi8, LD1RSB_D_IMM, PTRUE_D, i64, am_indexed8_6b, uimm6s1>;
+
+ // LD1R of 16-bit data
+ def : LD1RPat<nxv8i16, extloadi16, LD1RH_IMM, PTRUE_H, i32, am_indexed16_6b, uimm6s2>;
+ def : LD1RPat<nxv4i32, zextloadi16, LD1RH_S_IMM, PTRUE_S, i32, am_indexed16_6b, uimm6s2>;
+ def : LD1RPat<nxv2i64, zextloadi16, LD1RH_D_IMM, PTRUE_D, i64, am_indexed16_6b, uimm6s2>;
+ def : LD1RPat<nxv4i32, sextloadi16, LD1RSH_S_IMM, PTRUE_S, i32, am_indexed16_6b, uimm6s2>;
+ def : LD1RPat<nxv2i64, sextloadi16, LD1RSH_D_IMM, PTRUE_D, i64, am_indexed16_6b, uimm6s2>;
+
+ // LD1R of 32-bit data
+ def : LD1RPat<nxv4i32, load, LD1RW_IMM, PTRUE_S, i32, am_indexed32_6b, uimm6s4>;
+ def : LD1RPat<nxv2i64, zextloadi32, LD1RW_D_IMM, PTRUE_D, i64, am_indexed32_6b, uimm6s4>;
+ def : LD1RPat<nxv2i64, sextloadi32, LD1RSW_IMM, PTRUE_D, i64, am_indexed32_6b, uimm6s4>;
+
+ // LD1R of 64-bit data
+ def : LD1RPat<nxv2i64, load, LD1RD_IMM, PTRUE_D, i64, am_indexed64_6b, uimm6s8>;
+
+ // LD1R of FP data
+ def : LD1RPat<nxv8f16, load, LD1RH_IMM, PTRUE_H, f16, am_indexed16_6b, uimm6s2>;
+ def : LD1RPat<nxv4f16, load, LD1RH_S_IMM, PTRUE_S, f16, am_indexed16_6b, uimm6s2>;
+ def : LD1RPat<nxv2f16, load, LD1RH_D_IMM, PTRUE_D, f16, am_indexed16_6b, uimm6s2>;
+ def : LD1RPat<nxv4f32, load, LD1RW_IMM, PTRUE_S, f32, am_indexed32_6b, uimm6s4>;
+ def : LD1RPat<nxv2f32, load, LD1RW_D_IMM, PTRUE_D, f32, am_indexed32_6b, uimm6s4>;
+ def : LD1RPat<nxv2f64, load, LD1RD_IMM, PTRUE_D, f64, am_indexed64_6b, uimm6s8>;
+
// LD1R of 128-bit masked data
def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
(LD1RQ_B_IMM $gp, $base, (i64 0))>;
@@ -1624,12 +1715,12 @@
def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
(LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>;
- def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
- def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
- def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8), (SXTB_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
- def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (SXTH_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
- def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8), (SXTB_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
- def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8), (SXTB_ZPmZ_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>;
+ def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
+ def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
+ def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8), (SXTB_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
+ def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (SXTH_ZPmZ_UNDEF_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
+ def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8), (SXTB_ZPmZ_UNDEF_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
+ def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8), (SXTB_ZPmZ_UNDEF_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>;
// General case that we ideally never want to match.
def : Pat<(vscale GPR64:$scale), (MADDXrrr (UBFMXri (RDVLI_XI 1), 4, 63), $scale, XZR)>;
@@ -1760,12 +1851,10 @@
// Add more complex addressing modes here as required
multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load,
Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
- // reg + reg
let AddedComplexity = 1 in {
def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
(RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>;
}
- // reg + imm
let AddedComplexity = 2 in {
def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
(RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>;
@@ -1807,12 +1896,10 @@
multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
- // reg + reg
let AddedComplexity = 1 in {
def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)),
(RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>;
}
- // reg + imm
let AddedComplexity = 2 in {
def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)),
(RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>;
@@ -1856,13 +1943,18 @@
defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRR, STNT1W_ZRI, am_sve_regreg_lsl2>;
defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRR, STNT1D_ZRI, am_sve_regreg_lsl3>;
- multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegImmInst,
- Instruction PTrue> {
+ multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegRegInst,
+ Instruction RegImmInst, Instruction PTrue,
+ ComplexPattern AddrCP> {
let AddedComplexity = 1 in {
+ def _reg : Pat<(Store (Ty ZPR:$val), (AddrCP GPR64sp:$base, GPR64:$offset)),
+ (RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
+ }
+ let AddedComplexity = 2 in {
def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
(RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
- let AddedComplexity = 2 in {
+ let AddedComplexity = 3 in {
def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
(RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
@@ -1871,32 +1963,36 @@
(RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
}
- defm : unpred_store< store, nxv16i8, ST1B_IMM, PTRUE_B>;
- defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H_IMM, PTRUE_H>;
- defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S_IMM, PTRUE_S>;
- defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D_IMM, PTRUE_D>;
- defm : unpred_store< store, nxv8i16, ST1H_IMM, PTRUE_H>;
- defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S_IMM, PTRUE_S>;
- defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D_IMM, PTRUE_D>;
- defm : unpred_store< store, nxv4i32, ST1W_IMM, PTRUE_S>;
- defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D_IMM, PTRUE_D>;
- defm : unpred_store< store, nxv2i64, ST1D_IMM, PTRUE_D>;
- defm : unpred_store< store, nxv8f16, ST1H_IMM, PTRUE_H>;
- defm : unpred_store< store, nxv8bf16, ST1H_IMM, PTRUE_H>;
- defm : unpred_store< store, nxv4f16, ST1H_S_IMM, PTRUE_S>;
- defm : unpred_store< store, nxv2f16, ST1H_D_IMM, PTRUE_D>;
- defm : unpred_store< store, nxv4f32, ST1W_IMM, PTRUE_S>;
- defm : unpred_store< store, nxv2f32, ST1W_D_IMM, PTRUE_D>;
- defm : unpred_store< store, nxv2f64, ST1D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv16i8, ST1B, ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
+ defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+ defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+ defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D, ST1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+ defm : unpred_store< store, nxv8i16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+ defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+ defm : unpred_store< store, nxv4i32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+ defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+ defm : unpred_store< store, nxv2i64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+ defm : unpred_store< store, nxv8f16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_store< store, nxv8bf16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_store< store, nxv4f16, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+ defm : unpred_store< store, nxv2f16, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+ defm : unpred_store< store, nxv4f32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+ defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+ defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
- multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegImmInst,
- Instruction PTrue> {
+ multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegRegInst,
+ Instruction RegImmInst, Instruction PTrue,
+ ComplexPattern AddrCP> {
let AddedComplexity = 1 in {
+ def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))),
+ (RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
+ }
+ let AddedComplexity = 2 in {
def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
(RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
-
- let AddedComplexity = 2 in {
+ let AddedComplexity = 3 in {
def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
(RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
@@ -1905,35 +2001,54 @@
(RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
}
- defm : unpred_load< load, nxv16i8, LD1B_IMM, PTRUE_B>;
- defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
- defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
- defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
- defm : unpred_load< extloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
- defm : unpred_load< extloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
- defm : unpred_load< extloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
- defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>;
- defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>;
- defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>;
- defm : unpred_load< load, nxv8i16, LD1H_IMM, PTRUE_H>;
- defm : unpred_load<zextloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
- defm : unpred_load<zextloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
- defm : unpred_load< extloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
- defm : unpred_load< extloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
- defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S_IMM, PTRUE_S>;
- defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D_IMM, PTRUE_D>;
- defm : unpred_load< load, nxv4i32, LD1W_IMM, PTRUE_S>;
- defm : unpred_load<zextloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
- defm : unpred_load< extloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
- defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D_IMM, PTRUE_D>;
- defm : unpred_load< load, nxv2i64, LD1D_IMM, PTRUE_D>;
- defm : unpred_load< load, nxv8f16, LD1H_IMM, PTRUE_H>;
- defm : unpred_load< load, nxv8bf16, LD1H_IMM, PTRUE_H>;
- defm : unpred_load< load, nxv4f16, LD1H_S_IMM, PTRUE_S>;
- defm : unpred_load< load, nxv2f16, LD1H_D_IMM, PTRUE_D>;
- defm : unpred_load< load, nxv4f32, LD1W_IMM, PTRUE_S>;
- defm : unpred_load< load, nxv2f32, LD1W_D_IMM, PTRUE_D>;
- defm : unpred_load< load, nxv2f64, LD1D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv16i8, LD1B, LD1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
+ defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+ defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+ defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D, LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+ defm : unpred_load< extloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+ defm : unpred_load< extloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+ defm : unpred_load< extloadvi8, nxv2i64, LD1B_D, LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+ defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H, LD1SB_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+ defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S, LD1SB_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+ defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D, LD1SB_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+ defm : unpred_load< load, nxv8i16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_load<zextloadvi16, nxv4i32, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+ defm : unpred_load<zextloadvi16, nxv2i64, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+ defm : unpred_load< extloadvi16, nxv4i32, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+ defm : unpred_load< extloadvi16, nxv2i64, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+ defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S, LD1SH_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+ defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D, LD1SH_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+ defm : unpred_load< load, nxv4i32, LD1W, LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+ defm : unpred_load<zextloadvi32, nxv2i64, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+ defm : unpred_load< extloadvi32, nxv2i64, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+ defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D, LD1SW_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+ defm : unpred_load< load, nxv2i64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+ defm : unpred_load< load, nxv8f16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_load< load, nxv8bf16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_load< load, nxv4f16, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+ defm : unpred_load< load, nxv2f16, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+ defm : unpred_load< load, nxv4f32, LD1W, LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+ defm : unpred_load< load, nxv2f32, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+ defm : unpred_load< load, nxv2f64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+
+ // Allow using the reg+reg form of ld1b/st1b for memory accesses with the
+ // same width as nxv16i8. This saves an add in cases where we would
+ // otherwise compute the address separately.
+ multiclass unpred_loadstore_bitcast<ValueType Ty> {
+ let Predicates = [IsLE] in {
+ def : Pat<(Ty (load (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset))),
+ (LD1B (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>;
+ def : Pat<(store (Ty ZPR:$val), (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset)),
+ (ST1B ZPR:$val, (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>;
+ }
+ }
+ defm : unpred_loadstore_bitcast<nxv8i16>;
+ defm : unpred_loadstore_bitcast<nxv8f16>;
+ defm : unpred_loadstore_bitcast<nxv8bf16>;
+ defm : unpred_loadstore_bitcast<nxv4f32>;
+ defm : unpred_loadstore_bitcast<nxv4i32>;
+ defm : unpred_loadstore_bitcast<nxv2i64>;
+ defm : unpred_loadstore_bitcast<nxv2f64>;
multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
@@ -2192,12 +2307,30 @@
GPR64:$src)>;
// Insert FP scalar into vector with scalar index
+ def : Pat<(nxv2f16 (vector_insert (nxv2f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
+ (CPY_ZPmV_H ZPR:$vec,
+ (CMPEQ_PPzZZ_D (PTRUE_D 31),
+ (INDEX_II_D 0, 1),
+ (DUP_ZR_D GPR64:$index)),
+ $src)>;
+ def : Pat<(nxv4f16 (vector_insert (nxv4f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
+ (CPY_ZPmV_H ZPR:$vec,
+ (CMPEQ_PPzZZ_S (PTRUE_S 31),
+ (INDEX_II_S 0, 1),
+ (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+ $src)>;
def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
(CPY_ZPmV_H ZPR:$vec,
(CMPEQ_PPzZZ_H (PTRUE_H 31),
(INDEX_II_H 0, 1),
(DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
$src)>;
+ def : Pat<(nxv2f32 (vector_insert (nxv2f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)),
+ (CPY_ZPmV_S ZPR:$vec,
+ (CMPEQ_PPzZZ_D (PTRUE_D 31),
+ (INDEX_II_D 0, 1),
+ (DUP_ZR_D GPR64:$index)),
+ $src)>;
def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)),
(CPY_ZPmV_S ZPR:$vec,
(CMPEQ_PPzZZ_S (PTRUE_S 31),
@@ -2244,8 +2377,6 @@
(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
(EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
- def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
- (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
(EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), hsub)>;
def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
@@ -2292,6 +2423,16 @@
def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
(f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
}
+
+ // Splice with lane greater than or equal to 0
+ def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 (sve_ext_imm_0_15 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_15:$index)>;
+ def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 (sve_ext_imm_0_7 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_7:$index)>;
+ def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 (sve_ext_imm_0_3 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_3:$index)>;
+ def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 (sve_ext_imm_0_1 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_1:$index)>;
}
let Predicates = [HasSVE, HasMatMulInt8] in {
@@ -2351,28 +2492,10 @@
// SVE2 integer multiply vectors (unpredicated)
defm MUL_ZZZ : sve2_int_mul<0b000, "mul", null_frag, AArch64mul_p>;
- defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag>;
- defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag>;
+ defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag, AArch64smulh_p>;
+ defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag, AArch64umulh_p>;
defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;
- // Add patterns for unpredicated version of smulh and umulh.
- def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
- (SMULH_ZZZ_B $Op1, $Op2)>;
- def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
- (SMULH_ZZZ_H $Op1, $Op2)>;
- def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
- (SMULH_ZZZ_S $Op1, $Op2)>;
- def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
- (SMULH_ZZZ_D $Op1, $Op2)>;
- def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
- (UMULH_ZZZ_B $Op1, $Op2)>;
- def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
- (UMULH_ZZZ_H $Op1, $Op2)>;
- def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
- (UMULH_ZZZ_S $Op1, $Op2)>;
- def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
- (UMULH_ZZZ_D $Op1, $Op2)>;
-
// SVE2 complex integer dot product (indexed)
defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>;
@@ -2472,18 +2595,25 @@
defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr", int_aarch64_sve_uqsubr>;
// SVE2 saturating/rounding bitwise shift left (predicated)
- defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl", int_aarch64_sve_srshl>;
- defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl", int_aarch64_sve_urshl>;
- defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr", null_frag>;
- defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr", null_frag>;
- defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl", int_aarch64_sve_sqshl>;
- defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl", int_aarch64_sve_uqshl>;
- defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl", int_aarch64_sve_sqrshl>;
- defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl", int_aarch64_sve_uqrshl>;
- defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr", null_frag>;
- defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr", null_frag>;
- defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag>;
- defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag>;
+ defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl", int_aarch64_sve_srshl, "SRSHL_ZPZZ", DestructiveBinaryCommWithRev, "SRSHLR_ZPmZ">;
+ defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl", int_aarch64_sve_urshl, "URSHL_ZPZZ", DestructiveBinaryCommWithRev, "URSHLR_ZPmZ">;
+ defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr", null_frag, "SRSHLR_ZPZZ", DestructiveBinaryCommWithRev, "SRSHL_ZPmZ", /*isReverseInstr*/ 1>;
+ defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr", null_frag, "URSHLR_ZPZZ", DestructiveBinaryCommWithRev, "URSHL_ZPmZ", /*isReverseInstr*/ 1>;
+ defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl", int_aarch64_sve_sqshl, "SQSHL_ZPZZ", DestructiveBinaryCommWithRev, "SQSHLR_ZPmZ">;
+ defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl", int_aarch64_sve_uqshl, "UQSHL_ZPZZ", DestructiveBinaryCommWithRev, "UQSHLR_ZPmZ">;
+ defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl", int_aarch64_sve_sqrshl, "SQRSHL_ZPZZ", DestructiveBinaryCommWithRev, "SQRSHLR_ZPmZ">;
+ defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl", int_aarch64_sve_uqrshl, "UQRSHL_ZPZZ", DestructiveBinaryCommWithRev, "UQRSHLR_ZPmZ">;
+ defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr", null_frag, "SQSHLR_ZPZZ", DestructiveBinaryCommWithRev, "SQSHL_ZPmZ", /*isReverseInstr*/ 1>;
+ defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr", null_frag, "UQSHLR_ZPZZ", DestructiveBinaryCommWithRev, "UQSHL_ZPmZ", /*isReverseInstr*/ 1>;
+ defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag, "SQRSHLR_ZPZZ", DestructiveBinaryCommWithRev, "SQRSHL_ZPmZ", /*isReverseInstr*/ 1>;
+ defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag, "UQRSHLR_ZPZZ", DestructiveBinaryCommWithRev, "UQRSHL_ZPmZ", /*isReverseInstr*/ 1>;
+
+ defm SRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_srshl>;
+ defm URSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_urshl>;
+ defm SQSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_sqshl>;
+ defm UQSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_uqshl>;
+ defm SQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_sqrshl>;
+ defm UQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_uqrshl>;
let Predicates = [HasSVE2, UseExperimentalZeroingPseudos] in {
defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
@@ -2494,11 +2624,11 @@
}
// SVE2 predicated shifts
- defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">;
- defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">;
- defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>;
- defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>;
- defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>;
+ defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0110, "sqshl", "SQSHL_ZPZI", int_aarch64_sve_sqshl>;
+ defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0111, "uqshl", "UQSHL_ZPZI", int_aarch64_sve_uqshl>;
+ defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right< 0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>;
+ defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right< 0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>;
+ defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>;
// SVE2 integer add/subtract long
defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA53.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA53.td
index c5ff1fc..65c84b1 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA53.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -27,7 +27,8 @@
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- PAUnsupported.F);
+ PAUnsupported.F,
+ SMEUnsupported.F);
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA55.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA55.td
index 50911fd..0e68007 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA55.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA55.td
@@ -90,6 +90,7 @@
def : WriteRes<WriteVLD, [CortexA55UnitLd]> { let Latency = 6;
let ResourceCycles = [3]; }
def CortexA55WriteVLD1 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 4; }
+def CortexA55WriteVLD1SI : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 4; let SingleIssue = 1; }
def CortexA55WriteVLD2 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 5;
let ResourceCycles = [2]; }
def CortexA55WriteVLD3 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 6;
@@ -105,13 +106,19 @@
def CortexA55WriteVLD8 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 11;
let ResourceCycles = [8]; }
+def CortexA55WriteLDP1 : SchedWriteRes<[]> { let Latency = 4; }
+def CortexA55WriteLDP2 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 5; }
+def CortexA55WriteLDP4 : SchedWriteRes<[CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd]> { let Latency = 6; }
+
// Pre/Post Indexing - Performed as part of address generation
def : WriteRes<WriteAdr, []> { let Latency = 0; }
// Store
-def : WriteRes<WriteST, [CortexA55UnitSt]> { let Latency = 4; }
-def : WriteRes<WriteSTP, [CortexA55UnitSt]> { let Latency = 4; }
-def : WriteRes<WriteSTIdx, [CortexA55UnitSt]> { let Latency = 4; }
+let RetireOOO = 1 in {
+def : WriteRes<WriteST, [CortexA55UnitSt]> { let Latency = 1; }
+def : WriteRes<WriteSTP, [CortexA55UnitSt]> { let Latency = 1; }
+def : WriteRes<WriteSTIdx, [CortexA55UnitSt]> { let Latency = 1; }
+}
def : WriteRes<WriteSTX, [CortexA55UnitSt]> { let Latency = 4; }
// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
@@ -151,6 +158,8 @@
// FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
def : WriteRes<WriteFMul, [CortexA55UnitFPMAC]> { let Latency = 4; }
+
+let RetireOOO = 1 in {
def : WriteRes<WriteFDiv, [CortexA55UnitFPDIV]> { let Latency = 22;
let ResourceCycles = [29]; }
def CortexA55WriteFMAC : SchedWriteRes<[CortexA55UnitFPMAC]> { let Latency = 4; }
@@ -166,7 +175,7 @@
let ResourceCycles = [9]; }
def CortexA55WriteFSqrtDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22;
let ResourceCycles = [19]; }
-
+}
//===----------------------------------------------------------------------===//
// Subtarget-specific SchedRead types.
@@ -223,7 +232,10 @@
//---
// Miscellaneous
//---
-def : InstRW<[CortexA55WriteVLD2,CortexA55WriteVLD1], (instregex "LDP.*")>;
+def : InstRW<[CortexA55WriteVLD1SI,CortexA55WriteLDP1], (instregex "LDPS?W")>;
+def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP1], (instregex "LDPS[^W]")>;
+def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP2], (instregex "LDP(X|D)")>;
+def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP4], (instregex "LDPQ")>;
def : InstRW<[WriteI], (instrs COPY)>;
//---
// Vector Loads - 64-bit per cycle
@@ -336,4 +348,5 @@
def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57.td
index aa5bec8..c1eacca 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -32,7 +32,8 @@
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- PAUnsupported.F);
+ PAUnsupported.F,
+ SMEUnsupported.F);
}
//===----------------------------------------------------------------------===//
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
index 8abcb80..11df304 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -19,7 +19,8 @@
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- PAUnsupported.F);
+ PAUnsupported.F,
+ SMEUnsupported.F);
}
//===----------------------------------------------------------------------===//
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
index 8413a06..6a33258 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -25,7 +25,8 @@
let CompleteModel = 1; // Use the default model otherwise.
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- PAUnsupported.F);
+ PAUnsupported.F,
+ SMEUnsupported.F);
}
//===----------------------------------------------------------------------===//
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
index 34e8beb..db066a1 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
@@ -25,7 +25,8 @@
let CompleteModel = 1; // Use the default model otherwise.
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- PAUnsupported.F);
+ PAUnsupported.F,
+ SMEUnsupported.F);
}
//===----------------------------------------------------------------------===//
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
index 403aac8..0429b6a 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
@@ -25,7 +25,8 @@
let CompleteModel = 1; // Use the default model otherwise.
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- PAUnsupported.F);
+ PAUnsupported.F,
+ SMEUnsupported.F);
}
//===----------------------------------------------------------------------===//
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
index a17ab36..8bb95e4 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -24,7 +24,8 @@
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- PAUnsupported.F);
+ PAUnsupported.F,
+ SMEUnsupported.F);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedKryo.td
index ba14bf1..45964e1 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedKryo.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedKryo.td
@@ -28,7 +28,8 @@
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- PAUnsupported.F);
+ PAUnsupported.F,
+ SMEUnsupported.F);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
index 9c50f97..125eb28 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -26,7 +26,8 @@
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- PAUnsupported.F);
+ PAUnsupported.F,
+ SMEUnsupported.F);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index 95c29dd2..8d8675b 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -26,7 +26,8 @@
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- PAUnsupported.F);
+ PAUnsupported.F,
+ SMEUnsupported.F);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index a5bc366..3eb4c04 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -52,10 +52,6 @@
}
return SDValue();
}
-bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner(
- CodeGenOpt::Level OptLevel) const {
- return OptLevel >= CodeGenOpt::Aggressive;
-}
static const int kSetTagLoopThreshold = 176;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index d94fd84..7d53bd4 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -28,7 +28,6 @@
SDValue Chain, SDValue Op1, SDValue Op2,
MachinePointerInfo DstPtrInfo,
bool ZeroData) const override;
- bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override;
};
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index ab49e0c..f37fedd 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -42,6 +42,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -52,6 +53,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <iterator>
@@ -238,7 +240,7 @@
<< ") zero\n");
Value *Ptr = BasePtr;
if (Offset)
- Ptr = IRB.CreateConstGEP1_32(Ptr, Offset);
+ Ptr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), Ptr, Offset);
IRB.CreateCall(SetTagZeroFn,
{Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)});
}
@@ -248,7 +250,7 @@
<< ") undef\n");
Value *Ptr = BasePtr;
if (Offset)
- Ptr = IRB.CreateConstGEP1_32(Ptr, Offset);
+ Ptr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), Ptr, Offset);
IRB.CreateCall(SetTagFn, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)});
}
@@ -257,7 +259,7 @@
LLVM_DEBUG(dbgs() << " " << *A << "\n " << *B << "\n");
Value *Ptr = BasePtr;
if (Offset)
- Ptr = IRB.CreateConstGEP1_32(Ptr, Offset);
+ Ptr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), Ptr, Offset);
IRB.CreateCall(StgpFn, {Ptr, A, B});
}
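The CreateConstGEP1_32 updates in this and the two preceding hunks all pass the GEP element type explicitly, as the newer IRBuilder overload requires. A minimal illustrative sketch of that call shape (the helper name addByteOffset is hypothetical, standing in for the surrounding pass code):

  #include "llvm/IR/IRBuilder.h"

  // Advance BasePtr by Offset bytes; the element type (i8) is given
  // explicitly rather than being inferred from the pointee type.
  static llvm::Value *addByteOffset(llvm::IRBuilder<> &IRB,
                                    llvm::Value *BasePtr, unsigned Offset) {
    if (!Offset)
      return BasePtr;
    return IRB.CreateConstGEP1_32(IRB.getInt8Ty(), BasePtr, Offset);
  }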
@@ -284,6 +286,7 @@
class AArch64StackTagging : public FunctionPass {
struct AllocaInfo {
AllocaInst *AI;
+ TrackingVH<Instruction> OldAI; // Track through RAUW to replace debug uses.
SmallVector<IntrinsicInst *, 2> LifetimeStart;
SmallVector<IntrinsicInst *, 2> LifetimeEnd;
SmallVector<DbgVariableIntrinsic *, 2> DbgVariableIntrinsics;
@@ -518,24 +521,6 @@
Info.AI = NewAI;
}
-// Helper function to check for post-dominance.
-static bool postDominates(const PostDominatorTree *PDT, const IntrinsicInst *A,
- const IntrinsicInst *B) {
- const BasicBlock *ABB = A->getParent();
- const BasicBlock *BBB = B->getParent();
-
- if (ABB != BBB)
- return PDT->dominates(ABB, BBB);
-
- for (const Instruction &I : *ABB) {
- if (&I == B)
- return true;
- if (&I == A)
- return false;
- }
- llvm_unreachable("Corrupt instruction list");
-}
-
// FIXME: check for MTE extension
bool AArch64StackTagging::runOnFunction(Function &Fn) {
if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag))
@@ -557,14 +542,16 @@
Instruction *I = &*IT;
if (auto *AI = dyn_cast<AllocaInst>(I)) {
Allocas[AI].AI = AI;
+ Allocas[AI].OldAI = AI;
continue;
}
if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(I)) {
- if (auto *AI =
- dyn_cast_or_null<AllocaInst>(DVI->getVariableLocation())) {
- Allocas[AI].DbgVariableIntrinsics.push_back(DVI);
- }
+ for (Value *V : DVI->location_ops())
+ if (auto *AI = dyn_cast_or_null<AllocaInst>(V))
+ if (Allocas[AI].DbgVariableIntrinsics.empty() ||
+ Allocas[AI].DbgVariableIntrinsics.back() != DVI)
+ Allocas[AI].DbgVariableIntrinsics.push_back(DVI);
continue;
}
@@ -662,32 +649,11 @@
cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue();
Size = alignTo(Size, kTagGranuleSize);
tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size);
- // We need to ensure that if we tag some object, we certainly untag it
- // before the function exits.
- if (PDT != nullptr && postDominates(PDT, End, Start)) {
- untagAlloca(AI, End, Size);
- } else {
- SmallVector<Instruction *, 8> ReachableRetVec;
- unsigned NumCoveredExits = 0;
- for (auto &RI : RetVec) {
- if (!isPotentiallyReachable(Start, RI, nullptr, DT))
- continue;
- ReachableRetVec.push_back(RI);
- if (DT != nullptr && DT->dominates(End, RI))
- ++NumCoveredExits;
- }
- // If there's a mix of covered and non-covered exits, just put the untag
- // on exits, so we avoid the redundancy of untagging twice.
- if (NumCoveredExits == ReachableRetVec.size()) {
- untagAlloca(AI, End, Size);
- } else {
- for (auto &RI : ReachableRetVec)
- untagAlloca(AI, RI, Size);
- // We may have inserted untag outside of the lifetime interval.
- // Remove the lifetime end call for this alloca.
- End->eraseFromParent();
- }
- }
+
+ auto TagEnd = [&](Instruction *Node) { untagAlloca(AI, Node, Size); };
+ if (!DT || !PDT ||
+ !forAllReachableExits(*DT, *PDT, Start, End, RetVec, TagEnd))
+ End->eraseFromParent();
} else {
uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8;
Value *Ptr = IRB.CreatePointerCast(TagPCall, IRB.getInt8PtrTy());
@@ -705,9 +671,7 @@
// Fixup debug intrinsics to point to the new alloca.
for (auto DVI : Info.DbgVariableIntrinsics)
- DVI->setArgOperand(
- 0,
- MetadataAsValue::get(F->getContext(), LocalAsMetadata::get(Info.AI)));
+ DVI->replaceVariableLocationOp(Info.OldAI, Info.AI);
}
// If we have instrumented at least one alloca, all unrecognized lifetime
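A note on the debug-info fixup in this file: the new AllocaInfo::OldAI handle is a TrackingVH, so it keeps following the original alloca even after the pass replaces it (RAUW), and the dbg.value/dbg.declare users are then retargeted with a single replaceVariableLocationOp call. A minimal sketch of that pattern, assuming the LLVM headers used by this pass are available; the helper name below is invented for illustration and is not part of the patch:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/Instructions.h"    // AllocaInst
    #include "llvm/IR/IntrinsicInst.h"   // DbgVariableIntrinsic
    #include "llvm/IR/ValueHandle.h"     // TrackingVH

    // Hypothetical helper: point every debug intrinsic that described OldAI
    // at NewAI instead. TrackingVH follows OldAI through replaceAllUsesWith,
    // so the handle still names the original value after it was replaced.
    static void retargetDebugUses(llvm::TrackingVH<llvm::Instruction> OldAI,
                                  llvm::AllocaInst *NewAI,
                                  llvm::ArrayRef<llvm::DbgVariableIntrinsic *> DbgUsers) {
      for (llvm::DbgVariableIntrinsic *DVI : DbgUsers)
        DVI->replaceVariableLocationOp(OldAI, NewAI);
    }
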
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
index 41096a9..076ed9b 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
@@ -203,6 +203,7 @@
}
}
+namespace {
struct SlotWithTag {
int FI;
int Tag;
@@ -213,6 +214,7 @@
return FI == Other.FI && Tag == Other.Tag;
}
};
+} // namespace
namespace llvm {
template <> struct DenseMapInfo<SlotWithTag> {
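The change to AArch64StackTaggingPreRA.cpp above only adjusts linkage: SlotWithTag gains internal linkage by moving into an anonymous namespace, while its DenseMapInfo specialization has to stay inside namespace llvm. A self-contained sketch of that idiom with an invented key type (ExampleKey is not part of the patch):

    #include <climits>
    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/Hashing.h"

    namespace {
    // Internal-linkage key type, analogous to SlotWithTag above.
    struct ExampleKey {
      int FI;
      int Tag;
      bool operator==(const ExampleKey &O) const {
        return FI == O.FI && Tag == O.Tag;
      }
    };
    } // namespace

    namespace llvm {
    // Specializations of llvm::DenseMapInfo must live in namespace llvm,
    // which is why only the key type itself moves into the anonymous one.
    template <> struct DenseMapInfo<ExampleKey> {
      static ExampleKey getEmptyKey() { return {INT_MIN, INT_MIN}; }
      static ExampleKey getTombstoneKey() { return {INT_MIN + 1, INT_MIN}; }
      static unsigned getHashValue(const ExampleKey &K) {
        return hash_combine(K.FI, K.Tag);
      }
      static bool isEqual(const ExampleKey &A, const ExampleKey &B) {
        return A == B;
      }
    };
    } // namespace llvm
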
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 71b2bb1..b22eb3b 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -47,17 +47,8 @@
cl::desc("Call nonlazybind functions via direct GOT load"),
cl::init(false), cl::Hidden);
-static cl::opt<unsigned> SVEVectorBitsMax(
- "aarch64-sve-vector-bits-max",
- cl::desc("Assume SVE vector registers are at most this big, "
- "with zero meaning no maximum size is assumed."),
- cl::init(0), cl::Hidden);
-
-static cl::opt<unsigned> SVEVectorBitsMin(
- "aarch64-sve-vector-bits-min",
- cl::desc("Assume SVE vector registers are at least this big, "
- "with zero meaning no minimum size is assumed."),
- cl::init(0), cl::Hidden);
+static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
+ cl::desc("Enable the use of AA during codegen."));
AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
@@ -86,9 +77,8 @@
case CortexA35:
break;
case CortexA53:
- PrefFunctionLogAlignment = 3;
- break;
case CortexA55:
+ PrefFunctionLogAlignment = 4;
break;
case CortexA57:
MaxInterleaveFactor = 4;
@@ -208,14 +198,17 @@
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
- const TargetMachine &TM, bool LittleEndian)
+ const TargetMachine &TM, bool LittleEndian,
+ unsigned MinSVEVectorSizeInBitsOverride,
+ unsigned MaxSVEVectorSizeInBitsOverride)
: AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
IsLittle(LittleEndian),
- TargetTriple(TT), FrameLowering(),
- InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
- TLInfo(TM, *this) {
+ MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
+ MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
+ FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)),
+ TSInfo(), TLInfo(TM, *this) {
if (AArch64::isX18ReservedByDefault(TT))
ReserveXRegister.set(18);
@@ -354,29 +347,9 @@
MFI.computeMaxCallFrameSize(MF);
}
-unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const {
- assert(HasSVE && "Tried to get SVE vector length without SVE support!");
- assert(SVEVectorBitsMax % 128 == 0 &&
- "SVE requires vector length in multiples of 128!");
- assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
- "Minimum SVE vector size should not be larger than its maximum!");
- if (SVEVectorBitsMax == 0)
- return 0;
- return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
-}
-
-unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
- assert(HasSVE && "Tried to get SVE vector length without SVE support!");
- assert(SVEVectorBitsMin % 128 == 0 &&
- "SVE requires vector length in multiples of 128!");
- assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
- "Minimum SVE vector size should not be larger than its maximum!");
- if (SVEVectorBitsMax == 0)
- return (SVEVectorBitsMin / 128) * 128;
- return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
-}
-
bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
// Prefer NEON unless larger SVE registers are available.
return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
}
+
+bool AArch64Subtarget::useAA() const { return UseAA; }
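Two details of the Subtarget changes above are worth spelling out: the use-aa subtarget feature becomes an -aarch64-use-aa command-line option that defaults to on, and Cortex-A53/A55 now share PrefFunctionLogAlignment = 4. The latter is a log2 value, so preferred function alignment moves to 16 bytes for both cores (Cortex-A53 previously used 3, i.e. 8 bytes). A trivial standalone check of that arithmetic:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // PrefFunctionLogAlignment is a log2 value; the patch sets it to 4 for
      // Cortex-A53 and Cortex-A55 (it was 3 for Cortex-A53 before).
      unsigned OldLogAlign = 3, NewLogAlign = 4;
      std::printf("old: %llu bytes, new: %llu bytes\n",
                  (unsigned long long)(UINT64_C(1) << OldLogAlign),   // 8
                  (unsigned long long)(UINT64_C(1) << NewLogAlign));  // 16
      return 0;
    }
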
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.h b/src/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 8fe2f12..e0ef8df 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -183,6 +183,14 @@
bool HasSVE2SHA3 = false;
bool HasSVE2BitPerm = false;
+ // Armv9-A Extensions
+ bool HasRME = false;
+
+ // Arm Scalable Matrix Extension (SME)
+ bool HasSME = false;
+ bool HasSMEF64 = false;
+ bool HasSMEI64 = false;
+
// Future architecture extensions.
bool HasETE = false;
bool HasTRBE = false;
@@ -196,9 +204,14 @@
// HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
bool HasZeroCycleZeroing = false;
bool HasZeroCycleZeroingGP = false;
- bool HasZeroCycleZeroingFP = false;
bool HasZeroCycleZeroingFPWorkaround = false;
+ // It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0",
+ // as movi is more efficient across all cores. Newer cores can eliminate
+ // fmovs early and there is no difference with movi, but this is not true
+ // for all implementations.

+ bool HasZeroCycleZeroingFP = true;
+
// StrictAlign - Disallow unaligned memory accesses.
bool StrictAlign = false;
@@ -209,7 +222,6 @@
unsigned MinVectorRegisterBitWidth = 64;
bool OutlineAtomics = false;
- bool UseAA = false;
bool PredictableSelectIsExpensive = false;
bool BalanceFPOps = false;
bool CustomAsCheapAsMove = false;
@@ -237,6 +249,7 @@
bool AllowTaggedGlobals = false;
bool HardenSlsRetBr = false;
bool HardenSlsBlr = false;
+ bool HardenSlsNoComdat = false;
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 3;
uint16_t CacheLineSize = 0;
@@ -256,6 +269,9 @@
bool IsLittle;
+ unsigned MinSVEVectorSizeInBits;
+ unsigned MaxSVEVectorSizeInBits;
+
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
@@ -286,7 +302,9 @@
/// of the specified triple.
AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM,
- bool LittleEndian);
+ bool LittleEndian,
+ unsigned MinSVEVectorSizeInBitsOverride = 0,
+ unsigned MaxSVEVectorSizeInBitsOverride = 0);
const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
@@ -395,6 +413,7 @@
bool hardenSlsRetBr() const { return HardenSlsRetBr; }
bool hardenSlsBlr() const { return HardenSlsBlr; }
+ bool hardenSlsNoComdat() const { return HardenSlsNoComdat; }
bool useEL1ForTP() const { return UseEL1ForTP; }
bool useEL2ForTP() const { return UseEL2ForTP; }
@@ -471,6 +490,11 @@
return HasEnhancedCounterVirtualization;
}
+ // Arm Scalable Matrix Extension (SME)
+ bool hasSME() const { return HasSME; }
+ bool hasSMEF64() const { return HasSMEF64; }
+ bool hasSMEI64() const { return HasSMEI64; }
+
bool isLittleEndian() const { return IsLittle; }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
@@ -489,7 +513,7 @@
TargetTriple.getEnvironment() == Triple::GNUILP32;
}
- bool useAA() const override { return UseAA; }
+ bool useAA() const override;
bool outlineAtomics() const { return OutlineAtomics; }
@@ -557,7 +581,7 @@
bool enableEarlyIfConversion() const override;
- bool enableAdvancedRASplitCost() const override { return true; }
+ bool enableAdvancedRASplitCost() const override { return false; }
std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override;
@@ -579,8 +603,16 @@
// Return the known range for the bit length of SVE data registers. A value
// of 0 means nothing is known about that particular limit beyond what's
// implied by the architecture.
- unsigned getMaxSVEVectorSizeInBits() const;
- unsigned getMinSVEVectorSizeInBits() const;
+ unsigned getMaxSVEVectorSizeInBits() const {
+ assert(HasSVE && "Tried to get SVE vector length without SVE support!");
+ return MaxSVEVectorSizeInBits;
+ }
+
+ unsigned getMinSVEVectorSizeInBits() const {
+ assert(HasSVE && "Tried to get SVE vector length without SVE support!");
+ return MinSVEVectorSizeInBits;
+ }
+
bool useSVEForFixedLengthVectors() const;
};
} // End llvm namespace
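The Subtarget header now stores the SVE size bounds directly: MinSVEVectorSizeInBits/MaxSVEVectorSizeInBits arrive through the constructor instead of being computed from cl::opts, and the useSVEForFixedLengthVectors() implementation kept in AArch64Subtarget.cpp only prefers SVE for fixed-width vectors once at least 256-bit registers are guaranteed. A plain-C++ restatement of that decision, using an invented struct rather than the real subtarget class:

    #include <cassert>

    // Not the LLVM class; just the decision logic from the header above.
    struct SveBounds {
      bool HasSVE = false;
      unsigned MinSVEVectorSizeInBits = 0; // 0 means "no known lower bound"

      unsigned getMinSVEVectorSizeInBits() const {
        assert(HasSVE && "Tried to get SVE vector length without SVE support!");
        return MinSVEVectorSizeInBits;
      }

      // Prefer NEON unless SVE registers are known to be at least 256 bits.
      bool useSVEForFixedLengthVectors() const {
        return HasSVE && getMinSVEVectorSizeInBits() >= 256;
      }
    };
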
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index 01ac52b..f400916 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -369,6 +369,26 @@
def : PState<"TCO", 0b11100>;
//===----------------------------------------------------------------------===//
+// SVCR instruction options.
+//===----------------------------------------------------------------------===//
+
+class SVCR<string name, bits<3> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<3> Encoding;
+ let Encoding = encoding;
+ code Requires = [{ {} }];
+}
+
+let Requires = [{ {AArch64::FeatureSME} }] in {
+def : SVCR<"SVCRSM", 0b001>;
+def : SVCR<"SVCRZA", 0b010>;
+def : SVCR<"SVCRSMZA", 0b011>;
+}
+
+//===----------------------------------------------------------------------===//
// PSB instruction options.
//===----------------------------------------------------------------------===//
@@ -387,18 +407,18 @@
// BTI instruction options.
//===----------------------------------------------------------------------===//
-class BTI<string name, bits<2> encoding> : SearchableTable {
+class BTI<string name, bits<3> encoding> : SearchableTable {
let SearchableFields = ["Name", "Encoding"];
let EnumValueField = "Encoding";
string Name = name;
- bits<2> Encoding;
+ bits<3> Encoding;
let Encoding = encoding;
}
-def : BTI<"c", 0b01>;
-def : BTI<"j", 0b10>;
-def : BTI<"jc", 0b11>;
+def : BTI<"c", 0b010>;
+def : BTI<"j", 0b100>;
+def : BTI<"jc", 0b110>;
//===----------------------------------------------------------------------===//
// TLBI (translation lookaside buffer invalidate) instruction options.
@@ -531,6 +551,14 @@
defm : TLBI<"RVALE3OS", 0b110, 0b1000, 0b0101, 0b101>;
} //FeatureTLB_RMI
+// Armv9-A Realm Management Extension TLBI Instructions
+let Requires = ["AArch64::FeatureRME"] in {
+defm : TLBI<"RPAOS", 0b110, 0b1000, 0b0100, 0b011>;
+defm : TLBI<"RPALOS", 0b110, 0b1000, 0b0100, 0b111>;
+defm : TLBI<"PAALLOS", 0b110, 0b1000, 0b0001, 0b100, 0>;
+defm : TLBI<"PAALL", 0b110, 0b1000, 0b0111, 0b100, 0>;
+}
+
// Armv8.5-A Prediction Restriction by Context instruction options:
class PRCTX<string name, bits<4> crm> : SearchableTable {
let SearchableFields = ["Name", "Encoding"];
@@ -743,6 +771,19 @@
def : RWSysReg<"SCXTNUM_EL12", 0b11, 0b101, 0b1101, 0b0000, 0b111>;
}
+// v9a Realm Management Extension registers
+let Requires = [{ {AArch64::FeatureRME} }] in {
+def : RWSysReg<"MFAR_EL3", 0b11, 0b110, 0b0110, 0b0000, 0b101>;
+def : RWSysReg<"GPCCR_EL3", 0b11, 0b110, 0b0010, 0b0001, 0b110>;
+def : RWSysReg<"GPTBR_EL3", 0b11, 0b110, 0b0010, 0b0001, 0b100>;
+}
+
+// v9-a Scalable Matrix Extension (SME) registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureSME} }] in {
+def : ROSysReg<"ID_AA64SMFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b101>;
+}
+
//===----------------------
// Write-only regs
//===----------------------
@@ -1601,3 +1642,23 @@
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::ProcAppleA7} }] in
def : RWSysReg<"CPM_IOACC_CTL_EL3", 0b11, 0b111, 0b1111, 0b0010, 0b000>;
+
+// Scalable Matrix Extension (SME)
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureSME} }] in {
+def : RWSysReg<"SMCR_EL1", 0b11, 0b000, 0b0001, 0b0010, 0b110>;
+def : RWSysReg<"SMCR_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b110>;
+def : RWSysReg<"SMCR_EL3", 0b11, 0b110, 0b0001, 0b0010, 0b110>;
+def : RWSysReg<"SMCR_EL12", 0b11, 0b101, 0b0001, 0b0010, 0b110>;
+def : RWSysReg<"SVCR", 0b11, 0b011, 0b0100, 0b0010, 0b010>;
+def : RWSysReg<"SMPRI_EL1", 0b11, 0b000, 0b0001, 0b0010, 0b100>;
+def : RWSysReg<"SMPRIMAP_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b101>;
+def : ROSysReg<"SMIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b110>;
+def : RWSysReg<"TPIDR2_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b101>;
+} // HasSME
+
+// v8.4a MPAM and SME registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureMPAM, AArch64::FeatureSME} }] in {
+def : RWSysReg<"MPAMSM_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b011>;
+} // HasMPAM, HasSME
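The Requires fields in this TableGen file gate the new SVCR, SME, and RME operands on subtarget features; the C++ predicates for the SME side (hasSME, hasSMEF64, hasSMEI64) were added to AArch64Subtarget.h earlier in the patch. A hedged sketch of how such predicates are typically consumed, with a hypothetical helper and a templated subtarget parameter so the snippet stands alone:

    // Illustrative only: hasSME()/hasSMEF64()/hasSMEI64() are the accessors
    // this patch adds to AArch64Subtarget; describeSMESupport is invented.
    template <typename SubtargetT>
    const char *describeSMESupport(const SubtargetT &ST) {
      if (!ST.hasSME())
        return "SME disabled: SMCR_EL*/SVCR/TPIDR2_EL0 are not accepted";
      if (ST.hasSMEF64() && ST.hasSMEI64())
        return "SME with the 64-bit FP and integer extensions (SMEF64/SMEI64)";
      return "base SME";
    }
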
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index bec1758..99bcb2f 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -158,9 +158,23 @@
static cl::opt<bool>
EnableBranchTargets("aarch64-enable-branch-targets", cl::Hidden,
- cl::desc("Enable the AAcrh64 branch target pass"),
+ cl::desc("Enable the AArch64 branch target pass"),
cl::init(true));
+static cl::opt<unsigned> SVEVectorBitsMaxOpt(
+ "aarch64-sve-vector-bits-max",
+ cl::desc("Assume SVE vector registers are at most this big, "
+ "with zero meaning no maximum size is assumed."),
+ cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> SVEVectorBitsMinOpt(
+ "aarch64-sve-vector-bits-min",
+ cl::desc("Assume SVE vector registers are at least this big, "
+ "with zero meaning no minimum size is assumed."),
+ cl::init(0), cl::Hidden);
+
+extern cl::opt<bool> EnableHomogeneousPrologEpilog;
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -182,6 +196,7 @@
initializeAArch64ExpandPseudoPass(*PR);
initializeAArch64LoadStoreOptPass(*PR);
initializeAArch64SIMDInstrOptPass(*PR);
+ initializeAArch64O0PreLegalizerCombinerPass(*PR);
initializeAArch64PreLegalizerCombinerPass(*PR);
initializeAArch64PostLegalizerCombinerPass(*PR);
initializeAArch64PostLegalizerLoweringPass(*PR);
@@ -197,6 +212,7 @@
initializeAArch64SLSHardeningPass(*PR);
initializeAArch64StackTaggingPass(*PR);
initializeAArch64StackTaggingPreRAPass(*PR);
+ initializeAArch64LowerHomogeneousPrologEpilogPass(*PR);
}
//===----------------------------------------------------------------------===//
@@ -345,14 +361,54 @@
std::string FS =
FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
- auto &I = SubtargetMap[CPU + FS];
+ SmallString<512> Key;
+
+ unsigned MinSVEVectorSize = 0;
+ unsigned MaxSVEVectorSize = 0;
+ Attribute VScaleRangeAttr = F.getFnAttribute(Attribute::VScaleRange);
+ if (VScaleRangeAttr.isValid()) {
+ std::tie(MinSVEVectorSize, MaxSVEVectorSize) =
+ VScaleRangeAttr.getVScaleRangeArgs();
+ MinSVEVectorSize *= 128;
+ MaxSVEVectorSize *= 128;
+ } else {
+ MinSVEVectorSize = SVEVectorBitsMinOpt;
+ MaxSVEVectorSize = SVEVectorBitsMaxOpt;
+ }
+
+ assert(MinSVEVectorSize % 128 == 0 &&
+ "SVE requires vector length in multiples of 128!");
+ assert(MaxSVEVectorSize % 128 == 0 &&
+ "SVE requires vector length in multiples of 128!");
+ assert((MaxSVEVectorSize >= MinSVEVectorSize || MaxSVEVectorSize == 0) &&
+ "Minimum SVE vector size should not be larger than its maximum!");
+
+ // Sanitize user input in case of no asserts
+ if (MaxSVEVectorSize == 0)
+ MinSVEVectorSize = (MinSVEVectorSize / 128) * 128;
+ else {
+ MinSVEVectorSize =
+ (std::min(MinSVEVectorSize, MaxSVEVectorSize) / 128) * 128;
+ MaxSVEVectorSize =
+ (std::max(MinSVEVectorSize, MaxSVEVectorSize) / 128) * 128;
+ }
+
+ Key += "SVEMin";
+ Key += std::to_string(MinSVEVectorSize);
+ Key += "SVEMax";
+ Key += std::to_string(MaxSVEVectorSize);
+ Key += CPU;
+ Key += FS;
+
+ auto &I = SubtargetMap[Key];
if (!I) {
// This needs to be done before we create a new subtarget since any
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
I = std::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
- isLittle);
+ isLittle, MinSVEVectorSize,
+ MaxSVEVectorSize);
}
return I.get();
}
@@ -428,6 +484,7 @@
void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
+ void addPreEmitPass2() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
@@ -558,8 +615,10 @@
}
void AArch64PassConfig::addPreLegalizeMachineIR() {
- bool IsOptNone = getOptLevel() == CodeGenOpt::None;
- addPass(createAArch64PreLegalizerCombiner(IsOptNone));
+ if (getOptLevel() == CodeGenOpt::None)
+ addPass(createAArch64O0PreLegalizerCombiner());
+ else
+ addPass(createAArch64PreLegalizerCombiner());
}
bool AArch64PassConfig::addLegalizeMachineIR() {
@@ -584,7 +643,7 @@
}
bool AArch64PassConfig::addGlobalInstructionSelect() {
- addPass(new InstructionSelect());
+ addPass(new InstructionSelect(getOptLevel()));
if (getOptLevel() != CodeGenOpt::None)
addPass(createAArch64PostSelectOptimize());
return false;
@@ -634,6 +693,9 @@
}
void AArch64PassConfig::addPreSched2() {
+ // Lower homogeneous frame instructions
+ if (EnableHomogeneousPrologEpilog)
+ addPass(createAArch64LowerHomogeneousPrologEpilogPass());
// Expand some pseudo instructions to allow proper scheduling.
addPass(createAArch64ExpandPseudoPass());
// Use load/store pair instructions when possible.
@@ -676,9 +738,12 @@
if (BranchRelaxation)
addPass(&BranchRelaxationPassID);
- // Identify valid longjmp targets for Windows Control Flow Guard.
- if (TM->getTargetTriple().isOSWindows())
+ if (TM->getTargetTriple().isOSWindows()) {
+ // Identify valid longjmp targets for Windows Control Flow Guard.
addPass(createCFGuardLongjmpPass());
+ // Identify valid eh continuation targets for Windows EHCont Guard.
+ addPass(createEHContGuardCatchretPass());
+ }
if (TM->getOptLevel() != CodeGenOpt::None && EnableCompressJumpTables)
addPass(createAArch64CompressJumpTablesPass());
@@ -686,8 +751,11 @@
if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
TM->getTargetTriple().isOSBinFormatMachO())
addPass(createAArch64CollectLOHPass());
+}
- // SVE bundles move prefixes with destructive operations.
+void AArch64PassConfig::addPreEmitPass2() {
+ // SVE bundles move prefixes with destructive operations. BLR_RVMARKER pseudo
+ // instructions are lowered to bundles as well.
addPass(createUnpackMachineBundles(nullptr));
}
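To summarise the getSubtargetImpl change just above: the SVE min/max sizes now come either from the function's vscale_range attribute (scaled by 128) or from the relocated -aarch64-sve-vector-bits-{min,max} options, are rounded down to multiples of 128, and become part of the subtarget cache key. A standalone restatement of the sanitising step, with a couple of worked values in comments:

    #include <algorithm>
    #include <utility>

    // Mirrors the clamping above: sizes are in bits, 0 means "no bound", and
    // both values are rounded down to a multiple of 128 (the SVE granule).
    static std::pair<unsigned, unsigned> sanitizeSVESizes(unsigned MinBits,
                                                          unsigned MaxBits) {
      if (MaxBits == 0) {
        MinBits = (MinBits / 128) * 128;
      } else {
        MinBits = (std::min(MinBits, MaxBits) / 128) * 128;
        MaxBits = (std::max(MinBits, MaxBits) / 128) * 128;
      }
      return {MinBits, MaxBits};
    }

    // sanitizeSVESizes(200, 0)   == {128, 0}    -- min rounded down, no max
    // sanitizeSVESizes(384, 256) == {256, 256}  -- min clamped to max
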
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7fda6b8..01236aa 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -44,7 +45,7 @@
/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
-int AArch64TTIImpl::getIntImmCost(int64_t Val) {
+InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
// Check if the immediate can be encoded within an instruction.
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
return 0;
@@ -59,8 +60,8 @@
}
/// Calculate the cost of materializing the given constant.
-int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -74,20 +75,20 @@
// Split the constant into 64-bit chunks and calculate the cost for each
// chunk.
- int Cost = 0;
+ InstructionCost Cost = 0;
for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
int64_t Val = Tmp.getSExtValue();
Cost += getIntImmCost(Val);
}
// We need at least one instruction to materialize the constant.
- return std::max(1, Cost);
+ return std::max<InstructionCost>(1, Cost);
}
-int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind,
- Instruction *Inst) {
+InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -144,7 +145,7 @@
if (Idx == ImmIdx) {
int NumConstants = (BitSize + 63) / 64;
- int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
+ InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
@@ -152,9 +153,10 @@
return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -180,7 +182,7 @@
case Intrinsic::umul_with_overflow:
if (Idx == 1) {
int NumConstants = (BitSize + 63) / 64;
- int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
+ InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
@@ -212,7 +214,7 @@
return TTI::PSK_Software;
}
-unsigned
+InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
auto *RetTy = ICA.getReturnType();
@@ -235,12 +237,605 @@
return LT.first;
break;
}
+ case Intrinsic::sadd_sat:
+ case Intrinsic::ssub_sat:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::usub_sat: {
+ static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
+ MVT::v8i16, MVT::v2i32, MVT::v4i32,
+ MVT::v2i64};
+ auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
+ // need to extend the type, as it uses shr(qadd(shl, shl)).
+ unsigned Instrs =
+ LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
+ if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
+ return LT.first * Instrs;
+ break;
+ }
+ case Intrinsic::abs: {
+ static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
+ MVT::v8i16, MVT::v2i32, MVT::v4i32,
+ MVT::v2i64};
+ auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
+ return LT.first;
+ break;
+ }
+ case Intrinsic::experimental_stepvector: {
+ InstructionCost Cost = 1; // Cost of the `index' instruction
+ auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ // Legalisation of illegal vectors involves an `index' instruction plus
+ // (LT.first - 1) vector adds.
+ if (LT.first > 1) {
+ Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
+ InstructionCost AddCost =
+ getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
+ Cost += AddCost * (LT.first - 1);
+ }
+ return Cost;
+ }
+ case Intrinsic::bitreverse: {
+ static const CostTblEntry BitreverseTbl[] = {
+ {Intrinsic::bitreverse, MVT::i32, 1},
+ {Intrinsic::bitreverse, MVT::i64, 1},
+ {Intrinsic::bitreverse, MVT::v8i8, 1},
+ {Intrinsic::bitreverse, MVT::v16i8, 1},
+ {Intrinsic::bitreverse, MVT::v4i16, 2},
+ {Intrinsic::bitreverse, MVT::v8i16, 2},
+ {Intrinsic::bitreverse, MVT::v2i32, 2},
+ {Intrinsic::bitreverse, MVT::v4i32, 2},
+ {Intrinsic::bitreverse, MVT::v1i64, 2},
+ {Intrinsic::bitreverse, MVT::v2i64, 2},
+ };
+ const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
+ const auto *Entry =
+ CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
+ // The cost model uses the legal type (i32) that i8 and i16 will be
+ // converted to, plus 1 so that we match the actual lowering cost.
+ if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
+ TLI->getValueType(DL, RetTy, true) == MVT::i16)
+ return LegalisationCost.first * Entry->Cost + 1;
+ if (Entry)
+ return LegalisationCost.first * Entry->Cost;
+ break;
+ }
+ case Intrinsic::ctpop: {
+ static const CostTblEntry CtpopCostTbl[] = {
+ {ISD::CTPOP, MVT::v2i64, 4},
+ {ISD::CTPOP, MVT::v4i32, 3},
+ {ISD::CTPOP, MVT::v8i16, 2},
+ {ISD::CTPOP, MVT::v16i8, 1},
+ {ISD::CTPOP, MVT::i64, 4},
+ {ISD::CTPOP, MVT::v2i32, 3},
+ {ISD::CTPOP, MVT::v4i16, 2},
+ {ISD::CTPOP, MVT::v8i8, 1},
+ {ISD::CTPOP, MVT::i32, 5},
+ };
+ auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ MVT MTy = LT.second;
+ if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
+ // Extra cost of +1 when illegal vector types are legalized by promoting
+ // the integer type.
+ int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
+ RetTy->getScalarSizeInBits()
+ ? 1
+ : 0;
+ return LT.first * Entry->Cost + ExtraCost;
+ }
+ break;
+ }
default:
break;
}
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
+/// The function will remove redundant reinterpret casts in the presence
+/// of control flow.
+static Optional<Instruction *> processPhiNode(InstCombiner &IC,
+ IntrinsicInst &II) {
+ SmallVector<Instruction *, 32> Worklist;
+ auto RequiredType = II.getType();
+
+ auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
+ assert(PN && "Expected Phi Node!");
+
+ // Don't create a new Phi unless we can remove the old one.
+ if (!PN->hasOneUse())
+ return None;
+
+ for (Value *IncValPhi : PN->incoming_values()) {
+ auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
+ if (!Reinterpret ||
+ Reinterpret->getIntrinsicID() !=
+ Intrinsic::aarch64_sve_convert_to_svbool ||
+ RequiredType != Reinterpret->getArgOperand(0)->getType())
+ return None;
+ }
+
+ // Create the new Phi
+ LLVMContext &Ctx = PN->getContext();
+ IRBuilder<> Builder(Ctx);
+ Builder.SetInsertPoint(PN);
+ PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
+ Worklist.push_back(PN);
+
+ for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
+ auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
+ NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
+ Worklist.push_back(Reinterpret);
+ }
+
+ // Cleanup Phi Node and reinterprets
+ return IC.replaceInstUsesWith(II, NPN);
+}
+
+static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
+ IntrinsicInst &II) {
+ // If the reinterpret instruction operand is a PHI Node
+ if (isa<PHINode>(II.getArgOperand(0)))
+ return processPhiNode(IC, II);
+
+ SmallVector<Instruction *, 32> CandidatesForRemoval;
+ Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
+
+ const auto *IVTy = cast<VectorType>(II.getType());
+
+ // Walk the chain of conversions.
+ while (Cursor) {
+ // If the type of the cursor has fewer lanes than the final result, zeroing
+ // must take place, which breaks the equivalence chain.
+ const auto *CursorVTy = cast<VectorType>(Cursor->getType());
+ if (CursorVTy->getElementCount().getKnownMinValue() <
+ IVTy->getElementCount().getKnownMinValue())
+ break;
+
+ // If the cursor has the same type as I, it is a viable replacement.
+ if (Cursor->getType() == IVTy)
+ EarliestReplacement = Cursor;
+
+ auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
+
+ // If this is not an SVE conversion intrinsic, this is the end of the chain.
+ if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
+ Intrinsic::aarch64_sve_convert_to_svbool ||
+ IntrinsicCursor->getIntrinsicID() ==
+ Intrinsic::aarch64_sve_convert_from_svbool))
+ break;
+
+ CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
+ Cursor = IntrinsicCursor->getOperand(0);
+ }
+
+ // If no viable replacement in the conversion chain was found, there is
+ // nothing to do.
+ if (!EarliestReplacement)
+ return None;
+
+ return IC.replaceInstUsesWith(II, EarliestReplacement);
+}
+
+static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
+ IntrinsicInst &II) {
+ IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
+ if (!Pg)
+ return None;
+
+ if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+ return None;
+
+ const auto PTruePattern =
+ cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
+ if (PTruePattern != AArch64SVEPredPattern::vl1)
+ return None;
+
+ // The intrinsic is inserting into lane zero so use an insert instead.
+ auto *IdxTy = Type::getInt64Ty(II.getContext());
+ auto *Insert = InsertElementInst::Create(
+ II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
+ Insert->insertBefore(&II);
+ Insert->takeName(&II);
+
+ return IC.replaceInstUsesWith(II, Insert);
+}
+
+static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
+ IntrinsicInst &II) {
+ LLVMContext &Ctx = II.getContext();
+ IRBuilder<> Builder(Ctx);
+ Builder.SetInsertPoint(&II);
+
+ // Check that the predicate is all active
+ auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
+ if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+ return None;
+
+ const auto PTruePattern =
+ cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
+ if (PTruePattern != AArch64SVEPredPattern::all)
+ return None;
+
+ // Check that we have a compare of zero..
+ auto *DupX = dyn_cast<IntrinsicInst>(II.getArgOperand(2));
+ if (!DupX || DupX->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+ return None;
+
+ auto *DupXArg = dyn_cast<ConstantInt>(DupX->getArgOperand(0));
+ if (!DupXArg || !DupXArg->isZero())
+ return None;
+
+ // ..against a dupq
+ auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
+ if (!DupQLane ||
+ DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
+ return None;
+
+ // Where the dupq is a lane 0 replicate of a vector insert
+ if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
+ return None;
+
+ auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
+ if (!VecIns ||
+ VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
+ return None;
+
+ // Where the vector insert is a fixed constant vector insert into undef at
+ // index zero
+ if (!isa<UndefValue>(VecIns->getArgOperand(0)))
+ return None;
+
+ if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
+ return None;
+
+ auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
+ if (!ConstVec)
+ return None;
+
+ auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
+ auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
+ if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
+ return None;
+
+ unsigned NumElts = VecTy->getNumElements();
+ unsigned PredicateBits = 0;
+
+ // Expand intrinsic operands to a 16-bit byte level predicate
+ for (unsigned I = 0; I < NumElts; ++I) {
+ auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
+ if (!Arg)
+ return None;
+ if (!Arg->isZero())
+ PredicateBits |= 1 << (I * (16 / NumElts));
+ }
+
+ // If all bits are zero bail early with an empty predicate
+ if (PredicateBits == 0) {
+ auto *PFalse = Constant::getNullValue(II.getType());
+ PFalse->takeName(&II);
+ return IC.replaceInstUsesWith(II, PFalse);
+ }
+
+ // Calculate largest predicate type used (where byte predicate is largest)
+ unsigned Mask = 8;
+ for (unsigned I = 0; I < 16; ++I)
+ if ((PredicateBits & (1 << I)) != 0)
+ Mask |= (I % 8);
+
+ unsigned PredSize = Mask & -Mask;
+ auto *PredType = ScalableVectorType::get(
+ Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
+
+ // Ensure all relevant bits are set
+ for (unsigned I = 0; I < 16; I += PredSize)
+ if ((PredicateBits & (1 << I)) == 0)
+ return None;
+
+ auto *PTruePat =
+ ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
+ auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
+ {PredType}, {PTruePat});
+ auto *ConvertToSVBool = Builder.CreateIntrinsic(
+ Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
+ auto *ConvertFromSVBool =
+ Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
+ {II.getType()}, {ConvertToSVBool});
+
+ ConvertFromSVBool->takeName(&II);
+ return IC.replaceInstUsesWith(II, ConvertFromSVBool);
+}
+
+static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *Pg = II.getArgOperand(0);
+ Value *Vec = II.getArgOperand(1);
+ bool IsAfter = II.getIntrinsicID() == Intrinsic::aarch64_sve_lasta;
+
+ // lastX(splat(X)) --> X
+ if (auto *SplatVal = getSplatValue(Vec))
+ return IC.replaceInstUsesWith(II, SplatVal);
+
+ auto *C = dyn_cast<Constant>(Pg);
+ if (IsAfter && C && C->isNullValue()) {
+ // The intrinsic is extracting lane 0 so use an extract instead.
+ auto *IdxTy = Type::getInt64Ty(II.getContext());
+ auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
+ Extract->insertBefore(&II);
+ Extract->takeName(&II);
+ return IC.replaceInstUsesWith(II, Extract);
+ }
+
+ auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
+ if (!IntrPG)
+ return None;
+
+ if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+ return None;
+
+ const auto PTruePattern =
+ cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
+
+ // Can the intrinsic's predicate be converted to a known constant index?
+ unsigned Idx;
+ switch (PTruePattern) {
+ default:
+ return None;
+ case AArch64SVEPredPattern::vl1:
+ Idx = 0;
+ break;
+ case AArch64SVEPredPattern::vl2:
+ Idx = 1;
+ break;
+ case AArch64SVEPredPattern::vl3:
+ Idx = 2;
+ break;
+ case AArch64SVEPredPattern::vl4:
+ Idx = 3;
+ break;
+ case AArch64SVEPredPattern::vl5:
+ Idx = 4;
+ break;
+ case AArch64SVEPredPattern::vl6:
+ Idx = 5;
+ break;
+ case AArch64SVEPredPattern::vl7:
+ Idx = 6;
+ break;
+ case AArch64SVEPredPattern::vl8:
+ Idx = 7;
+ break;
+ case AArch64SVEPredPattern::vl16:
+ Idx = 15;
+ break;
+ }
+
+ // Increment the index if extracting the element after the last active
+ // predicate element.
+ if (IsAfter)
+ ++Idx;
+
+ // Ignore extracts whose index is larger than the known minimum vector
+ // length. NOTE: This is an artificial constraint where we prefer to
+ // maintain what the user asked for until an alternative is proven faster.
+ auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
+ if (Idx >= PgVTy->getMinNumElements())
+ return None;
+
+ // The intrinsic is extracting a fixed lane so use an extract instead.
+ auto *IdxTy = Type::getInt64Ty(II.getContext());
+ auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
+ Extract->insertBefore(&II);
+ Extract->takeName(&II);
+ return IC.replaceInstUsesWith(II, Extract);
+}
+
+static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
+ IntrinsicInst &II) {
+ LLVMContext &Ctx = II.getContext();
+ IRBuilder<> Builder(Ctx);
+ Builder.SetInsertPoint(&II);
+ // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
+ // can work with RDFFR_PP for ptest elimination.
+ auto *AllPat =
+ ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
+ auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
+ {II.getType()}, {AllPat});
+ auto *RDFFR =
+ Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
+ RDFFR->takeName(&II);
+ return IC.replaceInstUsesWith(II, RDFFR);
+}
+
+static Optional<Instruction *>
+instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
+ const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
+
+ if (Pattern == AArch64SVEPredPattern::all) {
+ LLVMContext &Ctx = II.getContext();
+ IRBuilder<> Builder(Ctx);
+ Builder.SetInsertPoint(&II);
+
+ Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
+ auto *VScale = Builder.CreateVScale(StepVal);
+ VScale->takeName(&II);
+ return IC.replaceInstUsesWith(II, VScale);
+ }
+
+ unsigned MinNumElts = 0;
+ switch (Pattern) {
+ default:
+ return None;
+ case AArch64SVEPredPattern::vl1:
+ case AArch64SVEPredPattern::vl2:
+ case AArch64SVEPredPattern::vl3:
+ case AArch64SVEPredPattern::vl4:
+ case AArch64SVEPredPattern::vl5:
+ case AArch64SVEPredPattern::vl6:
+ case AArch64SVEPredPattern::vl7:
+ case AArch64SVEPredPattern::vl8:
+ MinNumElts = Pattern;
+ break;
+ case AArch64SVEPredPattern::vl16:
+ MinNumElts = 16;
+ break;
+ }
+
+ return NumElts >= MinNumElts
+ ? Optional<Instruction *>(IC.replaceInstUsesWith(
+ II, ConstantInt::get(II.getType(), MinNumElts)))
+ : None;
+}
+
+static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
+ IntrinsicInst &II) {
+ IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
+ IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
+
+ if (Op1 && Op2 &&
+ Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+ Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+ Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
+
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+
+ Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
+ Type *Tys[] = {Op1->getArgOperand(0)->getType()};
+
+ auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
+
+ PTest->takeName(&II);
+ return IC.replaceInstUsesWith(II, PTest);
+ }
+
+ return None;
+}
+
+static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
+ IntrinsicInst &II) {
+ auto *OpPredicate = II.getOperand(0);
+ auto *OpMultiplicand = II.getOperand(1);
+ auto *OpMultiplier = II.getOperand(2);
+
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+
+ // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call
+ // with a unit splat value, false otherwise.
+ auto IsUnitDupX = [](auto *I) {
+ auto *IntrI = dyn_cast<IntrinsicInst>(I);
+ if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+ return false;
+
+ auto *SplatValue = IntrI->getOperand(0);
+ return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
+ };
+
+ // Return true if a given instruction is an aarch64_sve_dup intrinsic call
+ // with a unit splat value, false otherwise.
+ auto IsUnitDup = [](auto *I) {
+ auto *IntrI = dyn_cast<IntrinsicInst>(I);
+ if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
+ return false;
+
+ auto *SplatValue = IntrI->getOperand(2);
+ return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
+ };
+
+ // The OpMultiplier variable should always point to the dup (if any), so
+ // swap if necessary.
+ if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand))
+ std::swap(OpMultiplier, OpMultiplicand);
+
+ if (IsUnitDupX(OpMultiplier)) {
+ // [f]mul pg (dupx 1) %n => %n
+ OpMultiplicand->takeName(&II);
+ return IC.replaceInstUsesWith(II, OpMultiplicand);
+ } else if (IsUnitDup(OpMultiplier)) {
+ // [f]mul pg (dup pg 1) %n => %n
+ auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
+ auto *DupPg = DupInst->getOperand(1);
+ // TODO: this is naive. The optimization is still valid if DupPg
+ // 'encompasses' OpPredicate, not only if they're the same predicate.
+ if (OpPredicate == DupPg) {
+ OpMultiplicand->takeName(&II);
+ return IC.replaceInstUsesWith(II, OpMultiplicand);
+ }
+ }
+
+ return None;
+}
+
+static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
+ IntrinsicInst &II) {
+ auto *OpVal = II.getOperand(0);
+ auto *OpIndices = II.getOperand(1);
+ VectorType *VTy = cast<VectorType>(II.getType());
+
+ // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with
+ // constant splat value < minimal element count of result.
+ auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices);
+ if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+ return None;
+
+ auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0));
+ if (!SplatValue ||
+ SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
+ return None;
+
+ // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
+ // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+ auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
+ auto *VectorSplat =
+ Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
+
+ VectorSplat->takeName(&II);
+ return IC.replaceInstUsesWith(II, VectorSplat);
+}
+
+Optional<Instruction *>
+AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const {
+ Intrinsic::ID IID = II.getIntrinsicID();
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::aarch64_sve_convert_from_svbool:
+ return instCombineConvertFromSVBool(IC, II);
+ case Intrinsic::aarch64_sve_dup:
+ return instCombineSVEDup(IC, II);
+ case Intrinsic::aarch64_sve_cmpne:
+ case Intrinsic::aarch64_sve_cmpne_wide:
+ return instCombineSVECmpNE(IC, II);
+ case Intrinsic::aarch64_sve_rdffr:
+ return instCombineRDFFR(IC, II);
+ case Intrinsic::aarch64_sve_lasta:
+ case Intrinsic::aarch64_sve_lastb:
+ return instCombineSVELast(IC, II);
+ case Intrinsic::aarch64_sve_cntd:
+ return instCombineSVECntElts(IC, II, 2);
+ case Intrinsic::aarch64_sve_cntw:
+ return instCombineSVECntElts(IC, II, 4);
+ case Intrinsic::aarch64_sve_cnth:
+ return instCombineSVECntElts(IC, II, 8);
+ case Intrinsic::aarch64_sve_cntb:
+ return instCombineSVECntElts(IC, II, 16);
+ case Intrinsic::aarch64_sve_ptest_any:
+ case Intrinsic::aarch64_sve_ptest_first:
+ case Intrinsic::aarch64_sve_ptest_last:
+ return instCombineSVEPTest(IC, II);
+ case Intrinsic::aarch64_sve_mul:
+ case Intrinsic::aarch64_sve_fmul:
+ return instCombineSVEVectorMul(IC, II);
+ case Intrinsic::aarch64_sve_tbl:
+ return instCombineSVETBL(IC, II);
+ }
+
+ return None;
+}
+
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
ArrayRef<const Value *> Args) {
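One of the new combines above that is easy to misread is instCombineSVECntElts: for svcnt{b,h,w,d} with a vlN pattern it folds to a constant only when the requested count fits within the minimum vector length for that element width, while the "all" pattern becomes a vscale-based expression rather than a constant. A standalone restatement of the constant-folding part (the numeric pattern values are assumed to match AArch64SVEPredPattern, where vl1..vl8 encode their own count and vl16 follows them):

    #include <optional>

    // NumElts is the per-128-bit element count of the intrinsic (16 for cntb,
    // 8 for cnth, 4 for cntw, 2 for cntd), matching the dispatch above.
    static std::optional<unsigned> foldCntEltsToConstant(unsigned Pattern,
                                                         unsigned NumElts) {
      unsigned MinNumElts = 0;
      if (Pattern >= 1 && Pattern <= 8)   // vl1..vl8: the pattern is the count
        MinNumElts = Pattern;
      else if (Pattern == 9)              // assumed encoding of vl16
        MinNumElts = 16;
      else
        return std::nullopt;              // pow2, vl32+, mul3/mul4, "all", ...
      return NumElts >= MinNumElts ? std::optional<unsigned>(MinNumElts)
                                   : std::nullopt;
    }
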
@@ -297,18 +892,21 @@
return false;
// Get the total number of vector elements in the legalized types.
- unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorMinNumElements();
- unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
+ InstructionCost NumDstEls =
+ DstTyL.first * DstTyL.second.getVectorMinNumElements();
+ InstructionCost NumSrcEls =
+ SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
// Return true if the legalized types have the same number of vector elements
// and the destination element type size is twice that of the source type.
return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
}
-int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -333,7 +931,7 @@
}
// TODO: Allow non-throughput costs that aren't binary.
- auto AdjustCost = [&CostKind](int Cost) {
+ auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
if (CostKind != TTI::TCK_RecipThroughput)
return Cost == 0 ? 0 : 1;
return Cost;
@@ -353,6 +951,24 @@
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
+ // Truncations on nxvmiN
+ { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
+ { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
+ { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
+ { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
+ { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
+ { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
+ { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
+ { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
+ { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
+ { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
+ { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
+ { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
+ { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
+ { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
+ { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
+ { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
+
// The number of shll instructions for the extension.
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
@@ -434,6 +1050,16 @@
{ ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
{ ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
+ // Complex, from nxv2f32.
+ { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
+
// Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
@@ -441,6 +1067,107 @@
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
+
+ // Complex, from nxv2f64.
+ { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
+
+ // Complex, from nxv4f32.
+ { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
+ { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
+ { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
+
+ // Complex, from nxv8f64. Illegal -> illegal conversions not required.
+ { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
+ { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
+ { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
+ { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
+
+ // Complex, from nxv4f64. Illegal -> illegal conversions not required.
+ { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
+ { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
+ { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
+ { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
+ { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
+ { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
+
+ // Complex, from nxv8f32. Illegal -> illegal conversions not required.
+ { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
+ { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
+ { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
+ { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
+
+ // Complex, from nxv8f16.
+ { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
+ { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
+ { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
+ { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
+ { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
+
+ // Complex, from nxv4f16.
+ { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
+ { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
+ { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
+
+ // Complex, from nxv2f16.
+ { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
+ { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
+ { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
+
+ // Truncate from nxvmf32 to nxvmf16.
+ { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
+ { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
+ { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
+
+ // Truncate from nxvmf64 to nxvmf16.
+ { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
+ { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
+ { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
+
+ // Truncate from nxvmf64 to nxvmf32.
+ { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
+ { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
+ { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
+
+ // Extend from nxvmf16 to nxvmf32.
+ { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
+ { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
+ { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
+
+ // Extend from nxvmf16 to nxvmf64.
+ { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
+ { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
+ { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
+
+ // Extend from nxvmf32 to nxvmf64.
+ { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
+ { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
+ { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
+
};
if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
@@ -452,9 +1179,10 @@
BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
-int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
- VectorType *VecTy,
- unsigned Index) {
+InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
+ Type *Dst,
+ VectorType *VecTy,
+ unsigned Index) {
// Make sure we were given a valid extend opcode.
assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
@@ -469,7 +1197,8 @@
// Get the cost for the extract. We compute the cost (if any) for the extend
// below.
- auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
+ InstructionCost Cost =
+ getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
// Legalize the types.
auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
@@ -511,8 +1240,9 @@
CostKind);
}
-unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
- TTI::TargetCostKind CostKind) {
+InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (CostKind != TTI::TCK_RecipThroughput)
return Opcode == Instruction::PHI ? 0 : 1;
assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
@@ -520,13 +1250,13 @@
return 0;
}
-int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) {
+InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) {
assert(Val->isVectorTy() && "This must be a vector type");
if (Index != -1U) {
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
// This type is legalized to a scalar type.
if (!LT.second.isVector())
@@ -545,10 +1275,10 @@
return ST->getVectorInsertExtractBaseCost();
}
-int AArch64TTIImpl::getArithmeticInstrCost(
+InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
- TTI::OperandValueKind Opd1Info,
- TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
// TODO: Handle more cost kinds.
@@ -558,7 +1288,7 @@
Opd2PropInfo, Args, CxtI);
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
// If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
// add in the widening overhead specified by the sub-target. Since the
@@ -566,7 +1296,7 @@
// aren't present in the generated code and have a zero cost. By adding a
// widening overhead here, we attach the total cost of the combined operation
// to the widening instruction.
- int Cost = 0;
+ InstructionCost Cost = 0;
if (isWideningInstruction(Ty, Opcode, Args))
Cost += ST->getWideningBaseCost();
@@ -610,18 +1340,15 @@
// Vector signed division by constant are expanded to the
// sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
// to MULHS + SUB + SRL + ADD + SRL.
- int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
- Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
- Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
- Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ InstructionCost MulCost = getArithmeticInstrCost(
+ Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ InstructionCost AddCost = getArithmeticInstrCost(
+ Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ InstructionCost ShrCost = getArithmeticInstrCost(
+ Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
}
}
@@ -677,8 +1404,9 @@
}
}
-int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
- const SCEV *Ptr) {
+InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
+ ScalarEvolution *SE,
+ const SCEV *Ptr) {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -695,10 +1423,11 @@
return 1;
}
-int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy, CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
@@ -772,7 +1501,28 @@
return Options;
}
-unsigned AArch64TTIImpl::getGatherScatterOpCost(
+InstructionCost
+AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
+ if (!isa<ScalableVectorType>(Src))
+ return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ auto LT = TLI->getTypeLegalizationCost(DL, Src);
+ if (!LT.first.isValid())
+ return InstructionCost::getInvalid();
+
+ // The code-generator is currently not able to handle scalable vectors
+ // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
+ // it. This change will be removed when code-generation for these types is
+ // sufficiently reliable.
+ if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
+ return InstructionCost::getInvalid();
+
+ return LT.first * 2;
+}
+
+InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
@@ -781,35 +1531,56 @@
Alignment, CostKind, I);
auto *VT = cast<VectorType>(DataTy);
auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
- ElementCount LegalVF = LT.second.getVectorElementCount();
- Optional<unsigned> MaxNumVScale = getMaxVScale();
- assert(MaxNumVScale && "Expected valid max vscale value");
+ if (!LT.first.isValid())
+ return InstructionCost::getInvalid();
- unsigned MemOpCost =
+ // The code-generator is currently not able to handle scalable vectors
+ // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
+ // it. This change will be removed when code-generation for these types is
+ // sufficiently reliable.
+ if (cast<VectorType>(DataTy)->getElementCount() ==
+ ElementCount::getScalable(1))
+ return InstructionCost::getInvalid();
+
+ ElementCount LegalVF = LT.second.getVectorElementCount();
+ InstructionCost MemOpCost =
getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
- unsigned MaxNumElementsPerGather =
- MaxNumVScale.getValue() * LegalVF.getKnownMinValue();
- return LT.first * MaxNumElementsPerGather * MemOpCost;
+ return LT.first * MemOpCost * getMaxNumElements(LegalVF);
}
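For reference, an illustrative sketch (not taken from the patch) of the scalable gather/scatter formula above: with the pessimistic vscale bound used by getMaxNumElements, a gather of nxv4i32 is costed as LT.first * MemOpCost * (max vscale * known minimum lanes). MaxVScale and MemOpCost below are assumed example values.

// Sketch of the scalable-vector gather/scatter cost formula used above.
#include <iostream>

int main() {
  unsigned LTFirst = 1;    // legalization steps (assume the type is legal)
  unsigned MemOpCost = 1;  // assumed cost of one scalar element access
  unsigned MaxVScale = 16; // assumed pessimistic upper bound for vscale
  unsigned KnownMinVF = 4; // <vscale x 4 x i32> has a known minimum of 4 lanes
  unsigned Cost = LTFirst * MemOpCost * (MaxVScale * KnownMinVF);
  std::cout << "gather cost upper bound ~= " << Cost << '\n'; // 64
}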
bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
}
-int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
- MaybeAlign Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
- // TODO: Handle other cost kinds.
- if (CostKind != TTI::TCK_RecipThroughput)
- return 1;
-
+InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
+ MaybeAlign Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ EVT VT = TLI->getValueType(DL, Ty, true);
// Type legalization can't handle structs
- if (TLI->getValueType(DL, Ty, true) == MVT::Other)
+ if (VT == MVT::Other)
return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
CostKind);
auto LT = TLI->getTypeLegalizationCost(DL, Ty);
+ if (!LT.first.isValid())
+ return InstructionCost::getInvalid();
+
+ // The code-generator is currently not able to handle scalable vectors
+ // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
+ // it. This change will be removed when code-generation for these types is
+ // sufficiently reliable.
+ if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
+ if (VTy->getElementCount() == ElementCount::getScalable(1))
+ return InstructionCost::getInvalid();
+
+ // TODO: consider latency as well for TCK_SizeAndLatency.
+ if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
+ return LT.first;
+
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return 1;
if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
@@ -823,29 +1594,20 @@
return LT.first * 2 * AmortizationCost;
}
+ // Check truncating stores and extending loads.
if (useNeonVector(Ty) &&
- cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) {
- unsigned ProfitableNumElements;
- if (Opcode == Instruction::Store)
- // We use a custom trunc store lowering so v.4b should be profitable.
- ProfitableNumElements = 4;
- else
- // We scalarize the loads because there is not v.4b register and we
- // have to promote the elements to v.2.
- ProfitableNumElements = 8;
-
- if (cast<FixedVectorType>(Ty)->getNumElements() < ProfitableNumElements) {
- unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
- unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
- // We generate 2 instructions per vector element.
- return NumVectorizableInstsToAmortize * NumVecElts * 2;
- }
+ Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
+ // v4i8 types are lowered to a scalar load/store and sshll/xtn.
+ if (VT == MVT::v4i8)
+ return 2;
+ // Otherwise we need to scalarize.
+ return cast<FixedVectorType>(Ty)->getNumElements() * 2;
}
return LT.first;
}
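A quick illustration of the truncating-store/extending-load rule above (example types only, not from the patch): a v4i8 access is costed at 2 (scalar load/store plus sshll/xtn), while any other access whose element size differs from the legalized type is scalarized at two instructions per element.

// Sketch of the NEON truncating-store / extending-load costing shown above.
#include <iostream>

unsigned neonNarrowingCost(unsigned NumElts, bool IsV4I8) {
  if (IsV4I8)
    return 2;         // scalar load/store + sshll/xtn
  return NumElts * 2; // scalarized: two instructions per element
}

int main() {
  std::cout << neonNarrowingCost(4, true) << '\n';  // v4i8 -> 2
  std::cout << neonNarrowingCost(8, false) << '\n'; // v8i8 -> 16
}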
-int AArch64TTIImpl::getInterleavedMemoryOpCost(
+InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) {
@@ -871,8 +1633,9 @@
UseMaskForCond, UseMaskForGaps);
}
-int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
- int Cost = 0;
+InstructionCost
+AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
+ InstructionCost Cost = 0;
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
for (auto *I : Tys) {
if (!I->isVectorTy())
@@ -958,6 +1721,41 @@
if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
EnableFalkorHWPFUnrollFix)
getFalkorUnrollingPreferences(L, SE, UP);
+
+ // Scan the loop: don't unroll loops with calls as this could prevent
+ // inlining. Don't unroll vector loops either, as they don't benefit much from
+ // unrolling.
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ // Don't unroll vectorised loops.
+ if (I.getType()->isVectorTy())
+ return;
+
+ if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+ if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
+ if (!isLoweredToCall(F))
+ continue;
+ }
+ return;
+ }
+ }
+ }
+
+ // Enable runtime unrolling for in-order models.
+ // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
+ // checking for that case, we can ensure that the default behaviour is
+ // unchanged.
+ if (ST->getProcFamily() != AArch64Subtarget::Others &&
+ !ST->getSchedModel().isOutOfOrder()) {
+ UP.Runtime = true;
+ UP.Partial = true;
+ UP.UpperBound = true;
+ UP.UnrollRemainder = true;
+ UP.DefaultUnrollRuntimeCount = 4;
+
+ UP.UnrollAndJam = true;
+ UP.UnrollAndJamInnerLoopThreshold = 60;
+ }
}
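A compact sketch of the unrolling heuristic added above, with the loop reduced to two booleans (does it contain a lowered call, does it contain a vector instruction) and the subtarget reduced to an in-order flag; these simplifications are mine, not the patch's.

// Sketch of the runtime-unrolling decision added above. The inputs stand in
// for the real loop scan and scheduling-model query.
#include <iostream>

struct UnrollPrefs { bool Runtime = false, Partial = false, UnrollAndJam = false; };

UnrollPrefs decide(bool HasLoweredCall, bool HasVectorInst, bool KnownInOrderCPU) {
  UnrollPrefs UP;
  // Calls would block inlining and vector loops gain little: leave them alone.
  if (HasLoweredCall || HasVectorInst)
    return UP;
  // Only opt in for in-order cores selected via -mcpu (not the generic model).
  if (KnownInOrderCPU)
    UP.Runtime = UP.Partial = UP.UnrollAndJam = true;
  return UP;
}

int main() {
  std::cout << decide(false, false, true).Runtime << '\n'; // 1: unroll
  std::cout << decide(true, false, true).Runtime << '\n';  // 0: has a call
}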
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
@@ -1073,42 +1871,44 @@
return Considerable;
}
-bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
- TTI::ReductionFlags Flags) const {
- auto *VTy = cast<VectorType>(Ty);
- unsigned ScalarBits = Ty->getScalarSizeInBits();
- switch (Opcode) {
- case Instruction::FAdd:
- case Instruction::FMul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Mul:
+bool AArch64TTIImpl::isLegalToVectorizeReduction(
+ const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
+ if (!VF.isScalable())
+ return true;
+
+ Type *Ty = RdxDesc.getRecurrenceType();
+ if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
return false;
- case Instruction::Add:
- return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128;
- case Instruction::ICmp:
- return (ScalarBits < 64) &&
- (ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128);
- case Instruction::FCmp:
- return Flags.NoNaN;
+
+ switch (RdxDesc.getRecurrenceKind()) {
+ case RecurKind::Add:
+ case RecurKind::FAdd:
+ case RecurKind::And:
+ case RecurKind::Or:
+ case RecurKind::Xor:
+ case RecurKind::SMin:
+ case RecurKind::SMax:
+ case RecurKind::UMin:
+ case RecurKind::UMax:
+ case RecurKind::FMin:
+ case RecurKind::FMax:
+ return true;
default:
- llvm_unreachable("Unhandled reduction opcode");
+ return false;
}
- return false;
}
-int AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
- bool IsPairwise, bool IsUnsigned,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
if (!isa<ScalableVectorType>(Ty))
- return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
- CostKind);
+ return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) &&
"Both vector needs to be scalable");
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
- int LegalizationCost = 0;
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ InstructionCost LegalizationCost = 0;
if (LT.first > 1) {
Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
unsigned CmpOpcode =
@@ -1124,13 +1924,10 @@
return LegalizationCost + /*Cost of horizontal reduction*/ 2;
}
-int AArch64TTIImpl::getArithmeticReductionCostSVE(
- unsigned Opcode, VectorType *ValTy, bool IsPairwise,
- TTI::TargetCostKind CostKind) {
- assert(!IsPairwise && "Cannot be pair wise to continue");
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
- int LegalizationCost = 0;
+InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
+ unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ InstructionCost LegalizationCost = 0;
if (LT.first > 1) {
Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
@@ -1148,51 +1945,162 @@
case ISD::FADD:
return LegalizationCost + 2;
default:
- // TODO: Replace for invalid when InstructionCost is used
- // cases not supported by SVE
- return 16;
+ return InstructionCost::getInvalid();
}
}
-int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode,
- VectorType *ValTy,
- bool IsPairwiseForm,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind) {
+ if (TTI::requiresOrderedReduction(FMF)) {
+ if (!isa<ScalableVectorType>(ValTy))
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
+
+ if (Opcode != Instruction::FAdd)
+ return InstructionCost::getInvalid();
+
+ auto *VTy = cast<ScalableVectorType>(ValTy);
+ InstructionCost Cost =
+ getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
+ Cost *= getMaxNumElements(VTy->getElementCount());
+ return Cost;
+ }
if (isa<ScalableVectorType>(ValTy))
- return getArithmeticReductionCostSVE(Opcode, ValTy, IsPairwiseForm,
- CostKind);
- if (IsPairwiseForm)
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
- CostKind);
+ return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
// Horizontal adds can use the 'addv' instruction. We model the cost of these
- // instructions as normal vector adds. This is the only arithmetic vector
- // reduction operation for which we have an instruction.
+ // instructions as twice a normal vector add, plus 1 for each legalization
+ // step (LT.first). This is the only arithmetic vector reduction operation for
+ // which we have an instruction.
+ // OR, XOR and AND costs should match the codegen from:
+ // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
+ // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
+ // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
static const CostTblEntry CostTblNoPairwise[]{
- {ISD::ADD, MVT::v8i8, 1},
- {ISD::ADD, MVT::v16i8, 1},
- {ISD::ADD, MVT::v4i16, 1},
- {ISD::ADD, MVT::v8i16, 1},
- {ISD::ADD, MVT::v4i32, 1},
+ {ISD::ADD, MVT::v8i8, 2},
+ {ISD::ADD, MVT::v16i8, 2},
+ {ISD::ADD, MVT::v4i16, 2},
+ {ISD::ADD, MVT::v8i16, 2},
+ {ISD::ADD, MVT::v4i32, 2},
+ {ISD::OR, MVT::v8i8, 15},
+ {ISD::OR, MVT::v16i8, 17},
+ {ISD::OR, MVT::v4i16, 7},
+ {ISD::OR, MVT::v8i16, 9},
+ {ISD::OR, MVT::v2i32, 3},
+ {ISD::OR, MVT::v4i32, 5},
+ {ISD::OR, MVT::v2i64, 3},
+ {ISD::XOR, MVT::v8i8, 15},
+ {ISD::XOR, MVT::v16i8, 17},
+ {ISD::XOR, MVT::v4i16, 7},
+ {ISD::XOR, MVT::v8i16, 9},
+ {ISD::XOR, MVT::v2i32, 3},
+ {ISD::XOR, MVT::v4i32, 5},
+ {ISD::XOR, MVT::v2i64, 3},
+ {ISD::AND, MVT::v8i8, 15},
+ {ISD::AND, MVT::v16i8, 17},
+ {ISD::AND, MVT::v4i16, 7},
+ {ISD::AND, MVT::v8i16, 9},
+ {ISD::AND, MVT::v2i32, 3},
+ {ISD::AND, MVT::v4i32, 5},
+ {ISD::AND, MVT::v2i64, 3},
};
-
- if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
- return LT.first * Entry->Cost;
-
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
- CostKind);
+ switch (ISD) {
+ default:
+ break;
+ case ISD::ADD:
+ if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
+ return (LT.first - 1) + Entry->Cost;
+ break;
+ case ISD::XOR:
+ case ISD::AND:
+ case ISD::OR:
+ const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
+ if (!Entry)
+ break;
+ auto *ValVTy = cast<FixedVectorType>(ValTy);
+ if (!ValVTy->getElementType()->isIntegerTy(1) &&
+ MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
+ isPowerOf2_32(ValVTy->getNumElements())) {
+ InstructionCost ExtraCost = 0;
+ if (LT.first != 1) {
+ // Type needs to be split, so there is an extra cost of LT.first - 1
+ // arithmetic ops.
+ auto *Ty = FixedVectorType::get(ValTy->getElementType(),
+ MTy.getVectorNumElements());
+ ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
+ ExtraCost *= LT.first - 1;
+ }
+ return Entry->Cost + ExtraCost;
+ }
+ break;
+ }
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}
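To make the new table concrete, a worked example (mine, not part of the patch): a v16i8 OR reduction hits the {ISD::OR, v16i8, 17} entry directly, while a v32i8 OR reduction is first split once (LT.first == 2), adding one extra vector OR before the v16i8 entry is applied. The per-instruction OR cost of 1 below is an assumed placeholder.

// Worked example of the reduction-cost logic above. The table value is copied
// from the patch; the extra vector-op cost is assumed.
#include <iostream>

int main() {
  const int OrV16I8 = 17;     // table entry for a v16i8 OR reduction
  const int VectorOrCost = 1; // assumed cost of one v16i8 OR instruction

  int LTFirstV16 = 1; // v16i8 is already legal
  int LTFirstV32 = 2; // v32i8 is split into two v16i8 halves
  int CostV16 = OrV16I8 + (LTFirstV16 - 1) * VectorOrCost; // 17
  int CostV32 = OrV16I8 + (LTFirstV32 - 1) * VectorOrCost; // 18
  std::cout << CostV16 << ' ' << CostV32 << '\n';
}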
-int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
- int Index, VectorType *SubTp) {
+InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
+ static const CostTblEntry ShuffleTbl[] = {
+ { TTI::SK_Splice, MVT::nxv16i8, 1 },
+ { TTI::SK_Splice, MVT::nxv8i16, 1 },
+ { TTI::SK_Splice, MVT::nxv4i32, 1 },
+ { TTI::SK_Splice, MVT::nxv2i64, 1 },
+ { TTI::SK_Splice, MVT::nxv2f16, 1 },
+ { TTI::SK_Splice, MVT::nxv4f16, 1 },
+ { TTI::SK_Splice, MVT::nxv8f16, 1 },
+ { TTI::SK_Splice, MVT::nxv2bf16, 1 },
+ { TTI::SK_Splice, MVT::nxv4bf16, 1 },
+ { TTI::SK_Splice, MVT::nxv8bf16, 1 },
+ { TTI::SK_Splice, MVT::nxv2f32, 1 },
+ { TTI::SK_Splice, MVT::nxv4f32, 1 },
+ { TTI::SK_Splice, MVT::nxv2f64, 1 },
+ };
+
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ EVT PromotedVT = LT.second.getScalarType() == MVT::i1
+ ? TLI->getPromotedVTForPredicate(EVT(LT.second))
+ : LT.second;
+ Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
+ InstructionCost LegalizationCost = 0;
+ if (Index < 0) {
+ LegalizationCost =
+ getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind) +
+ getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ }
+
+ // Predicated splices are promoted when lowering. See AArch64ISelLowering.cpp
+ // The cost is computed on the promoted type.
+ if (LT.second.getScalarType() == MVT::i1) {
+ LegalizationCost +=
+ getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
+ TTI::CastContextHint::None, CostKind) +
+ getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
+ TTI::CastContextHint::None, CostKind);
+ }
+ const auto *Entry =
+ CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
+ assert(Entry && "Illegal Type for Splice");
+ LegalizationCost += Entry->Cost;
+ return LegalizationCost * LT.first;
+}
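Illustration with assumed component costs (not taken from the patch): for a predicate splice such as nxv16i1 with a negative index, the cost is composed of an icmp plus select on the promoted type, the zext/trunc pair needed because the splice runs on the promoted type, and the single table entry, all scaled by LT.first.

// Sketch of how the splice cost above is composed for an i1 predicate vector.
// All per-component costs are assumed example values.
#include <iostream>

int main() {
  int LTFirst = 1;                 // assume the promoted type is already legal
  int CmpCost = 1, SelCost = 1;    // compare + select (assumed)
  int ZExtCost = 1, TruncCost = 1; // promote i1 -> iN and back (assumed)
  int TableCost = 1;               // SK_Splice entry for the promoted type
  bool NegativeIndex = true;       // only negative indices need compare/select

  int Cost = (NegativeIndex ? CmpCost + SelCost : 0) + ZExtCost + TruncCost +
             TableCost;
  std::cout << "splice cost ~= " << Cost * LTFirst << '\n'; // 5
}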
+
+InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
+ VectorType *Tp,
+ ArrayRef<int> Mask, int Index,
+ VectorType *SubTp) {
+ Kind = improveShuffleKindFromMask(Kind, Mask);
if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
- Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) {
+ Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
+ Kind == TTI::SK_Reverse) {
static const CostTblEntry ShuffleTbl[] = {
// Broadcast shuffle kinds can be performed with 'dup'.
{ TTI::SK_Broadcast, MVT::v8i8, 1 },
@@ -1226,18 +2134,69 @@
{ TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
{ TTI::SK_Select, MVT::v2f64, 1 }, // mov.
// PermuteSingleSrc shuffle kinds.
- // TODO: handle vXi8/vXi16.
{ TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
{ TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
{ TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
{ TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
{ TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
{ TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
+ { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
+ { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
+ { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
+ { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
+ { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
+ { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
+ { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
+ { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
+ // Reverse can be lowered with `rev`.
+ { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
+ { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
+ { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
+ { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
+ { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
+ { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
+ // Broadcast shuffle kinds for scalable vectors
+ { TTI::SK_Broadcast, MVT::nxv16i8, 1 },
+ { TTI::SK_Broadcast, MVT::nxv8i16, 1 },
+ { TTI::SK_Broadcast, MVT::nxv4i32, 1 },
+ { TTI::SK_Broadcast, MVT::nxv2i64, 1 },
+ { TTI::SK_Broadcast, MVT::nxv2f16, 1 },
+ { TTI::SK_Broadcast, MVT::nxv4f16, 1 },
+ { TTI::SK_Broadcast, MVT::nxv8f16, 1 },
+ { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
+ { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
+ { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
+ { TTI::SK_Broadcast, MVT::nxv2f32, 1 },
+ { TTI::SK_Broadcast, MVT::nxv4f32, 1 },
+ { TTI::SK_Broadcast, MVT::nxv2f64, 1 },
+ { TTI::SK_Broadcast, MVT::nxv16i1, 1 },
+ { TTI::SK_Broadcast, MVT::nxv8i1, 1 },
+ { TTI::SK_Broadcast, MVT::nxv4i1, 1 },
+ { TTI::SK_Broadcast, MVT::nxv2i1, 1 },
+ // Handle the cases for vector.reverse with scalable vectors
+ { TTI::SK_Reverse, MVT::nxv16i8, 1 },
+ { TTI::SK_Reverse, MVT::nxv8i16, 1 },
+ { TTI::SK_Reverse, MVT::nxv4i32, 1 },
+ { TTI::SK_Reverse, MVT::nxv2i64, 1 },
+ { TTI::SK_Reverse, MVT::nxv2f16, 1 },
+ { TTI::SK_Reverse, MVT::nxv4f16, 1 },
+ { TTI::SK_Reverse, MVT::nxv8f16, 1 },
+ { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
+ { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
+ { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
+ { TTI::SK_Reverse, MVT::nxv2f32, 1 },
+ { TTI::SK_Reverse, MVT::nxv4f32, 1 },
+ { TTI::SK_Reverse, MVT::nxv2f64, 1 },
+ { TTI::SK_Reverse, MVT::nxv16i1, 1 },
+ { TTI::SK_Reverse, MVT::nxv8i1, 1 },
+ { TTI::SK_Reverse, MVT::nxv4i1, 1 },
+ { TTI::SK_Reverse, MVT::nxv2i1, 1 },
};
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
}
-
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
+ return getSpliceCost(Tp, Index);
+ return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 7c9360a..d55fd5b 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -71,13 +71,16 @@
/// @{
using BaseT::getIntImmCost;
- int getIntImmCost(int64_t Val);
- int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind,
- Instruction *Inst = nullptr);
- int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind);
+ InstructionCost getIntImmCost(int64_t Val);
+ InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
+ InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
/// @}
@@ -97,21 +100,28 @@
return 31;
}
- unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind);
+ InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
- unsigned getRegisterBitWidth(bool Vector) const {
- if (Vector) {
+ Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const;
+
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(64);
+ case TargetTransformInfo::RGK_FixedWidthVector:
if (ST->hasSVE())
- return std::max(ST->getMinSVEVectorSizeInBits(), 128u);
- if (ST->hasNEON())
- return 128;
- return 0;
+ return TypeSize::getFixed(
+ std::max(ST->getMinSVEVectorSizeInBits(), 128u));
+ return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
}
- return 64;
+ llvm_unreachable("Unsupported register kind");
}
- unsigned getMinVectorRegisterBitWidth() {
+ unsigned getMinVectorRegisterBitWidth() const {
return ST->getMinVectorRegisterBitWidth();
}
@@ -121,33 +131,55 @@
return BaseT::getMaxVScale();
}
+ /// Try to return an estimate cost factor that can be used as a multiplier
+ /// when scalarizing an operation for a vector with ElementCount \p VF.
+ /// For scalable vectors this currently takes the most pessimistic view based
+ /// upon the maximum possible value for vscale.
+ unsigned getMaxNumElements(ElementCount VF) const {
+ if (!VF.isScalable())
+ return VF.getFixedValue();
+ Optional<unsigned> MaxNumVScale = getMaxVScale();
+ assert(MaxNumVScale && "Expected valid max vscale value");
+ return *MaxNumVScale * VF.getKnownMinValue();
+ }
+
unsigned getMaxInterleaveFactor(unsigned VF);
- unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
- const Value *Ptr, bool VariableMask,
- Align Alignment, TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
+ InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
+ InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index);
+ InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+ InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
+ VectorType *VecTy, unsigned Index);
- int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
- bool IsPairwise, bool IsUnsigned,
- TTI::TargetCostKind CostKind);
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index);
- int getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy,
- bool IsPairwiseForm,
- TTI::TargetCostKind CostKind);
+ InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsUnsigned,
+ TTI::TargetCostKind CostKind);
- int getArithmeticInstrCost(
+ InstructionCost getArithmeticReductionCostSVE(unsigned Opcode,
+ VectorType *ValTy,
+ TTI::TargetCostKind CostKind);
+
+ InstructionCost getSpliceCost(VectorType *Tp, int Index);
+
+ InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
@@ -157,23 +189,24 @@
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
+ InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
+ const SCEV *Ptr);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
+ InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
bool useNeonVector(const Type *Ty) const;
- int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
+ InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
+ InstructionCost getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
@@ -186,25 +219,35 @@
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
- bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
- if (!isa<ScalableVectorType>(DataType) || !ST->hasSVE())
- return false;
-
- Type *Ty = cast<ScalableVectorType>(DataType)->getElementType();
+ bool isElementTypeLegalForScalableVector(Type *Ty) const {
if (Ty->isPointerTy())
return true;
- if (Ty->isBFloatTy() || Ty->isHalfTy() ||
- Ty->isFloatTy() || Ty->isDoubleTy())
+ if (Ty->isBFloatTy() && ST->hasBF16())
return true;
- if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
+ if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
+ return true;
+
+ if (Ty->isIntegerTy(1) || Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
Ty->isIntegerTy(32) || Ty->isIntegerTy(64))
return true;
return false;
}
+ bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
+ if (!ST->hasSVE())
+ return false;
+
+ // For fixed vectors, avoid scalarization if using SVE for them.
+ if (isa<FixedVectorType>(DataType) && !ST->useSVEForFixedLengthVectors())
+ return false; // Fall back to scalarization of masked operations.
+
+ return !DataType->getScalarType()->isIntegerTy(1) &&
+ isElementTypeLegalForScalableVector(DataType->getScalarType());
+ }
+
bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
return isLegalMaskedLoadStore(DataType, Alignment);
}
@@ -213,6 +256,27 @@
return isLegalMaskedLoadStore(DataType, Alignment);
}
+ bool isLegalMaskedGatherScatter(Type *DataType) const {
+ if (!ST->hasSVE())
+ return false;
+
+ // For fixed vectors, scalarize if not using SVE for them.
+ auto *DataTypeFVTy = dyn_cast<FixedVectorType>(DataType);
+ if (DataTypeFVTy && (!ST->useSVEForFixedLengthVectors() ||
+ DataTypeFVTy->getNumElements() < 2))
+ return false;
+
+ return !DataType->getScalarType()->isIntegerTy(1) &&
+ isElementTypeLegalForScalableVector(DataType->getScalarType());
+ }
+
+ bool isLegalMaskedGather(Type *DataType, Align Alignment) const {
+ return isLegalMaskedGatherScatter(DataType);
+ }
+ bool isLegalMaskedScatter(Type *DataType, Align Alignment) const {
+ return isLegalMaskedGatherScatter(DataType);
+ }
+
bool isLegalNTStore(Type *DataType, Align Alignment) {
// NOTE: The logic below is mostly geared towards LV, which calls it with
// vectors with 2 elements. We might want to improve that, if other
@@ -231,7 +295,7 @@
return BaseT::isLegalNTStore(DataType, Alignment);
}
- int getInterleavedMemoryOpCost(
+ InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
@@ -249,15 +313,16 @@
bool supportsScalableVectors() const { return ST->hasSVE(); }
- bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
- TTI::ReductionFlags Flags) const;
+ bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
+ ElementCount VF) const;
- int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
- bool IsPairwiseForm,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
+ InstructionCost getArithmeticReductionCost(
+ unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
- int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
- VectorType *SubTp);
+ InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ ArrayRef<int> Mask, int Index,
+ VectorType *SubTp);
/// @}
};
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/src/llvm-project/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 96c50ff..f27e9b2 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
@@ -66,9 +67,12 @@
Scalar,
NeonVector,
SVEDataVector,
- SVEPredicateVector
+ SVEPredicateVector,
+ Matrix
};
+enum class MatrixKind { Array, Tile, Row, Col };
+
enum RegConstraintEqualityTy {
EqualsReg,
EqualsSuperReg,
@@ -229,6 +233,8 @@
OperandMatchResultTy tryParseScalarRegister(unsigned &Reg);
OperandMatchResultTy tryParseVectorRegister(unsigned &Reg, StringRef &Kind,
RegKind MatchKind);
+ OperandMatchResultTy tryParseMatrixRegister(OperandVector &Operands);
+ OperandMatchResultTy tryParseSVCR(OperandVector &Operands);
OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands);
OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
OperandMatchResultTy tryParseBarriernXSOperand(OperandVector &Operands);
@@ -257,6 +263,7 @@
template <RegKind VectorKind>
OperandMatchResultTy tryParseVectorList(OperandVector &Operands,
bool ExpectMatch = false);
+ OperandMatchResultTy tryParseMatrixTileList(OperandVector &Operands);
OperandMatchResultTy tryParseSVEPattern(OperandVector &Operands);
OperandMatchResultTy tryParseGPR64x8(OperandVector &Operands);
@@ -316,6 +323,9 @@
k_ShiftedImm,
k_CondCode,
k_Register,
+ k_MatrixRegister,
+ k_MatrixTileList,
+ k_SVCR,
k_VectorList,
k_VectorIndex,
k_Token,
@@ -370,6 +380,16 @@
ShiftExtendOp ShiftExtend;
};
+ struct MatrixRegOp {
+ unsigned RegNum;
+ unsigned ElementWidth;
+ MatrixKind Kind;
+ };
+
+ struct MatrixTileListOp {
+ unsigned RegMask = 0;
+ };
+
struct VectorListOp {
unsigned RegNum;
unsigned Count;
@@ -379,7 +399,7 @@
};
struct VectorIndexOp {
- unsigned Val;
+ int Val;
};
struct ImmOp {
@@ -437,13 +457,17 @@
unsigned Val;
};
- struct ExtendOp {
- unsigned Val;
+ struct SVCROp {
+ const char *Data;
+ unsigned Length;
+ unsigned PStateField;
};
union {
struct TokOp Tok;
struct RegOp Reg;
+ struct MatrixRegOp MatrixReg;
+ struct MatrixTileListOp MatrixTileList;
struct VectorListOp VectorList;
struct VectorIndexOp VectorIndex;
struct ImmOp Imm;
@@ -457,6 +481,7 @@
struct PSBHintOp PSBHint;
struct BTIHintOp BTIHint;
struct ShiftExtendOp ShiftExtend;
+ struct SVCROp SVCR;
};
// Keep the MCContext around as the MCExprs may need manipulated during
@@ -492,6 +517,12 @@
case k_Register:
Reg = o.Reg;
break;
+ case k_MatrixRegister:
+ MatrixReg = o.MatrixReg;
+ break;
+ case k_MatrixTileList:
+ MatrixTileList = o.MatrixTileList;
+ break;
case k_VectorList:
VectorList = o.VectorList;
break;
@@ -516,6 +547,9 @@
case k_ShiftExtend:
ShiftExtend = o.ShiftExtend;
break;
+ case k_SVCR:
+ SVCR = o.SVCR;
+ break;
}
}
@@ -584,6 +618,26 @@
return Reg.RegNum;
}
+ unsigned getMatrixReg() const {
+ assert(Kind == k_MatrixRegister && "Invalid access!");
+ return MatrixReg.RegNum;
+ }
+
+ unsigned getMatrixElementWidth() const {
+ assert(Kind == k_MatrixRegister && "Invalid access!");
+ return MatrixReg.ElementWidth;
+ }
+
+ MatrixKind getMatrixKind() const {
+ assert(Kind == k_MatrixRegister && "Invalid access!");
+ return MatrixReg.Kind;
+ }
+
+ unsigned getMatrixTileListRegMask() const {
+ assert(isMatrixTileList() && "Invalid access!");
+ return MatrixTileList.RegMask;
+ }
+
RegConstraintEqualityTy getRegEqualityTy() const {
assert(Kind == k_Register && "Invalid access!");
return Reg.EqualityTy;
@@ -599,7 +653,7 @@
return VectorList.Count;
}
- unsigned getVectorIndex() const {
+ int getVectorIndex() const {
assert(Kind == k_VectorIndex && "Invalid access!");
return VectorIndex.Val;
}
@@ -639,6 +693,11 @@
return StringRef(BTIHint.Data, BTIHint.Length);
}
+ StringRef getSVCR() const {
+ assert(Kind == k_SVCR && "Invalid access!");
+ return StringRef(SVCR.Data, SVCR.Length);
+ }
+
StringRef getPrefetchName() const {
assert(Kind == k_Prefetch && "Invalid access!");
return StringRef(Prefetch.Data, Prefetch.Length);
@@ -1073,6 +1132,12 @@
return SysReg.PStateField != -1U;
}
+ bool isSVCR() const {
+ if (Kind != k_SVCR)
+ return false;
+ return SVCR.PStateField != -1U;
+ }
+
bool isReg() const override {
return Kind == k_Register;
}
@@ -1093,6 +1158,9 @@
Reg.RegNum));
}
+ bool isMatrix() const { return Kind == k_MatrixRegister; }
+ bool isMatrixTileList() const { return Kind == k_MatrixTileList; }
+
template <unsigned Class> bool isSVEVectorReg() const {
RegKind RK;
switch (Class) {
@@ -1474,6 +1542,17 @@
return true;
}
+ template <MatrixKind Kind, unsigned EltSize, unsigned RegClass>
+ DiagnosticPredicate isMatrixRegOperand() const {
+ if (!isMatrix())
+ return DiagnosticPredicateTy::NoMatch;
+ if (getMatrixKind() != Kind ||
+ !AArch64MCRegisterClasses[RegClass].contains(getMatrixReg()) ||
+ EltSize != getMatrixElementWidth())
+ return DiagnosticPredicateTy::NearMatch;
+ return DiagnosticPredicateTy::Match;
+ }
+
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
// Add as immediates when possible. Null MCExpr = 0.
if (!Expr)
@@ -1489,6 +1568,11 @@
Inst.addOperand(MCOperand::createReg(getReg()));
}
+ void addMatrixOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getMatrixReg()));
+ }
+
void addGPR32as64Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
assert(
@@ -1576,6 +1660,13 @@
FirstRegs[(unsigned)RegTy][0]));
}
+ void addMatrixTileListOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ unsigned RegMask = getMatrixTileListRegMask();
+ assert(RegMask <= 0xFF && "Invalid mask!");
+ Inst.addOperand(MCOperand::createImm(RegMask));
+ }
+
void addVectorIndexOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createImm(getVectorIndex()));
@@ -1765,6 +1856,12 @@
Inst.addOperand(MCOperand::createImm(SysReg.PStateField));
}
+ void addSVCROperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::createImm(SVCR.PStateField));
+ }
+
void addSystemPStateFieldWithImm0_15Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
@@ -1871,7 +1968,7 @@
void print(raw_ostream &OS) const override;
static std::unique_ptr<AArch64Operand>
- CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) {
+ CreateToken(StringRef Str, SMLoc S, MCContext &Ctx, bool IsSuffix = false) {
auto Op = std::make_unique<AArch64Operand>(k_Token, Ctx);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
@@ -1931,7 +2028,7 @@
}
static std::unique_ptr<AArch64Operand>
- CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
+ CreateVectorIndex(int Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
auto Op = std::make_unique<AArch64Operand>(k_VectorIndex, Ctx);
Op->VectorIndex.Val = Idx;
Op->StartLoc = S;
@@ -1939,6 +2036,45 @@
return Op;
}
+ static std::unique_ptr<AArch64Operand>
+ CreateMatrixTileList(unsigned RegMask, SMLoc S, SMLoc E, MCContext &Ctx) {
+ auto Op = std::make_unique<AArch64Operand>(k_MatrixTileList, Ctx);
+ Op->MatrixTileList.RegMask = RegMask;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static void ComputeRegsForAlias(unsigned Reg, SmallSet<unsigned, 8> &OutRegs,
+ const unsigned ElementWidth) {
+ static std::map<std::pair<unsigned, unsigned>, std::vector<unsigned>>
+ RegMap = {
+ {{0, AArch64::ZAB0},
+ {AArch64::ZAD0, AArch64::ZAD1, AArch64::ZAD2, AArch64::ZAD3,
+ AArch64::ZAD4, AArch64::ZAD5, AArch64::ZAD6, AArch64::ZAD7}},
+ {{8, AArch64::ZAB0},
+ {AArch64::ZAD0, AArch64::ZAD1, AArch64::ZAD2, AArch64::ZAD3,
+ AArch64::ZAD4, AArch64::ZAD5, AArch64::ZAD6, AArch64::ZAD7}},
+ {{16, AArch64::ZAH0},
+ {AArch64::ZAD0, AArch64::ZAD2, AArch64::ZAD4, AArch64::ZAD6}},
+ {{16, AArch64::ZAH1},
+ {AArch64::ZAD1, AArch64::ZAD3, AArch64::ZAD5, AArch64::ZAD7}},
+ {{32, AArch64::ZAS0}, {AArch64::ZAD0, AArch64::ZAD4}},
+ {{32, AArch64::ZAS1}, {AArch64::ZAD1, AArch64::ZAD5}},
+ {{32, AArch64::ZAS2}, {AArch64::ZAD2, AArch64::ZAD6}},
+ {{32, AArch64::ZAS3}, {AArch64::ZAD3, AArch64::ZAD7}},
+ };
+
+ if (ElementWidth == 64)
+ OutRegs.insert(Reg);
+ else {
+ std::vector<unsigned> Regs = RegMap[std::make_pair(ElementWidth, Reg)];
+ assert(!Regs.empty() && "Invalid tile or element width!");
+ for (auto OutReg : Regs)
+ OutRegs.insert(OutReg);
+ }
+ }
+
static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S,
SMLoc E, MCContext &Ctx) {
auto Op = std::make_unique<AArch64Operand>(k_Immediate, Ctx);
@@ -2050,7 +2186,7 @@
SMLoc S,
MCContext &Ctx) {
auto Op = std::make_unique<AArch64Operand>(k_BTIHint, Ctx);
- Op->BTIHint.Val = Val << 1 | 32;
+ Op->BTIHint.Val = Val | 32;
Op->BTIHint.Data = Str.data();
Op->BTIHint.Length = Str.size();
Op->StartLoc = S;
@@ -2059,6 +2195,29 @@
}
static std::unique_ptr<AArch64Operand>
+ CreateMatrixRegister(unsigned RegNum, unsigned ElementWidth, MatrixKind Kind,
+ SMLoc S, SMLoc E, MCContext &Ctx) {
+ auto Op = std::make_unique<AArch64Operand>(k_MatrixRegister, Ctx);
+ Op->MatrixReg.RegNum = RegNum;
+ Op->MatrixReg.ElementWidth = ElementWidth;
+ Op->MatrixReg.Kind = Kind;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static std::unique_ptr<AArch64Operand>
+ CreateSVCR(uint32_t PStateField, StringRef Str, SMLoc S, MCContext &Ctx) {
+ auto Op = std::make_unique<AArch64Operand>(k_SVCR, Ctx);
+ Op->SVCR.PStateField = PStateField;
+ Op->SVCR.Data = Str.data();
+ Op->SVCR.Length = Str.size();
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static std::unique_ptr<AArch64Operand>
CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val,
bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) {
auto Op = std::make_unique<AArch64Operand>(k_ShiftExtend, Ctx);
@@ -2136,6 +2295,22 @@
case k_BTIHint:
OS << getBTIHintName();
break;
+ case k_MatrixRegister:
+ OS << "<matrix " << getMatrixReg() << ">";
+ break;
+ case k_MatrixTileList: {
+ OS << "<matrixlist ";
+ unsigned RegMask = getMatrixTileListRegMask();
+ unsigned MaxBits = 8;
+ for (unsigned I = MaxBits; I > 0; --I)
+ OS << ((RegMask & (1 << (I - 1))) >> (I - 1));
+ OS << '>';
+ break;
+ }
+ case k_SVCR: {
+ OS << getSVCR();
+ break;
+ }
case k_Register:
OS << "<register " << getReg() << ">";
if (!getShiftExtendAmount() && !hasShiftExtendAmount())
@@ -2233,6 +2408,7 @@
break;
case RegKind::SVEPredicateVector:
case RegKind::SVEDataVector:
+ case RegKind::Matrix:
Res = StringSwitch<std::pair<int, int>>(Suffix.lower())
.Case("", {0, 0})
.Case(".b", {0, 8})
@@ -2314,6 +2490,125 @@
.Default(0);
}
+static unsigned matchMatrixTileListRegName(StringRef Name) {
+ return StringSwitch<unsigned>(Name.lower())
+ .Case("za0.d", AArch64::ZAD0)
+ .Case("za1.d", AArch64::ZAD1)
+ .Case("za2.d", AArch64::ZAD2)
+ .Case("za3.d", AArch64::ZAD3)
+ .Case("za4.d", AArch64::ZAD4)
+ .Case("za5.d", AArch64::ZAD5)
+ .Case("za6.d", AArch64::ZAD6)
+ .Case("za7.d", AArch64::ZAD7)
+ .Case("za0.s", AArch64::ZAS0)
+ .Case("za1.s", AArch64::ZAS1)
+ .Case("za2.s", AArch64::ZAS2)
+ .Case("za3.s", AArch64::ZAS3)
+ .Case("za0.h", AArch64::ZAH0)
+ .Case("za1.h", AArch64::ZAH1)
+ .Case("za0.b", AArch64::ZAB0)
+ .Default(0);
+}
+
+static unsigned matchMatrixRegName(StringRef Name) {
+ return StringSwitch<unsigned>(Name.lower())
+ .Case("za", AArch64::ZA)
+ .Case("za0.q", AArch64::ZAQ0)
+ .Case("za1.q", AArch64::ZAQ1)
+ .Case("za2.q", AArch64::ZAQ2)
+ .Case("za3.q", AArch64::ZAQ3)
+ .Case("za4.q", AArch64::ZAQ4)
+ .Case("za5.q", AArch64::ZAQ5)
+ .Case("za6.q", AArch64::ZAQ6)
+ .Case("za7.q", AArch64::ZAQ7)
+ .Case("za8.q", AArch64::ZAQ8)
+ .Case("za9.q", AArch64::ZAQ9)
+ .Case("za10.q", AArch64::ZAQ10)
+ .Case("za11.q", AArch64::ZAQ11)
+ .Case("za12.q", AArch64::ZAQ12)
+ .Case("za13.q", AArch64::ZAQ13)
+ .Case("za14.q", AArch64::ZAQ14)
+ .Case("za15.q", AArch64::ZAQ15)
+ .Case("za0.d", AArch64::ZAD0)
+ .Case("za1.d", AArch64::ZAD1)
+ .Case("za2.d", AArch64::ZAD2)
+ .Case("za3.d", AArch64::ZAD3)
+ .Case("za4.d", AArch64::ZAD4)
+ .Case("za5.d", AArch64::ZAD5)
+ .Case("za6.d", AArch64::ZAD6)
+ .Case("za7.d", AArch64::ZAD7)
+ .Case("za0.s", AArch64::ZAS0)
+ .Case("za1.s", AArch64::ZAS1)
+ .Case("za2.s", AArch64::ZAS2)
+ .Case("za3.s", AArch64::ZAS3)
+ .Case("za0.h", AArch64::ZAH0)
+ .Case("za1.h", AArch64::ZAH1)
+ .Case("za0.b", AArch64::ZAB0)
+ .Case("za0h.q", AArch64::ZAQ0)
+ .Case("za1h.q", AArch64::ZAQ1)
+ .Case("za2h.q", AArch64::ZAQ2)
+ .Case("za3h.q", AArch64::ZAQ3)
+ .Case("za4h.q", AArch64::ZAQ4)
+ .Case("za5h.q", AArch64::ZAQ5)
+ .Case("za6h.q", AArch64::ZAQ6)
+ .Case("za7h.q", AArch64::ZAQ7)
+ .Case("za8h.q", AArch64::ZAQ8)
+ .Case("za9h.q", AArch64::ZAQ9)
+ .Case("za10h.q", AArch64::ZAQ10)
+ .Case("za11h.q", AArch64::ZAQ11)
+ .Case("za12h.q", AArch64::ZAQ12)
+ .Case("za13h.q", AArch64::ZAQ13)
+ .Case("za14h.q", AArch64::ZAQ14)
+ .Case("za15h.q", AArch64::ZAQ15)
+ .Case("za0h.d", AArch64::ZAD0)
+ .Case("za1h.d", AArch64::ZAD1)
+ .Case("za2h.d", AArch64::ZAD2)
+ .Case("za3h.d", AArch64::ZAD3)
+ .Case("za4h.d", AArch64::ZAD4)
+ .Case("za5h.d", AArch64::ZAD5)
+ .Case("za6h.d", AArch64::ZAD6)
+ .Case("za7h.d", AArch64::ZAD7)
+ .Case("za0h.s", AArch64::ZAS0)
+ .Case("za1h.s", AArch64::ZAS1)
+ .Case("za2h.s", AArch64::ZAS2)
+ .Case("za3h.s", AArch64::ZAS3)
+ .Case("za0h.h", AArch64::ZAH0)
+ .Case("za1h.h", AArch64::ZAH1)
+ .Case("za0h.b", AArch64::ZAB0)
+ .Case("za0v.q", AArch64::ZAQ0)
+ .Case("za1v.q", AArch64::ZAQ1)
+ .Case("za2v.q", AArch64::ZAQ2)
+ .Case("za3v.q", AArch64::ZAQ3)
+ .Case("za4v.q", AArch64::ZAQ4)
+ .Case("za5v.q", AArch64::ZAQ5)
+ .Case("za6v.q", AArch64::ZAQ6)
+ .Case("za7v.q", AArch64::ZAQ7)
+ .Case("za8v.q", AArch64::ZAQ8)
+ .Case("za9v.q", AArch64::ZAQ9)
+ .Case("za10v.q", AArch64::ZAQ10)
+ .Case("za11v.q", AArch64::ZAQ11)
+ .Case("za12v.q", AArch64::ZAQ12)
+ .Case("za13v.q", AArch64::ZAQ13)
+ .Case("za14v.q", AArch64::ZAQ14)
+ .Case("za15v.q", AArch64::ZAQ15)
+ .Case("za0v.d", AArch64::ZAD0)
+ .Case("za1v.d", AArch64::ZAD1)
+ .Case("za2v.d", AArch64::ZAD2)
+ .Case("za3v.d", AArch64::ZAD3)
+ .Case("za4v.d", AArch64::ZAD4)
+ .Case("za5v.d", AArch64::ZAD5)
+ .Case("za6v.d", AArch64::ZAD6)
+ .Case("za7v.d", AArch64::ZAD7)
+ .Case("za0v.s", AArch64::ZAS0)
+ .Case("za1v.s", AArch64::ZAS1)
+ .Case("za2v.s", AArch64::ZAS2)
+ .Case("za3v.s", AArch64::ZAS3)
+ .Case("za0v.h", AArch64::ZAH0)
+ .Case("za1v.h", AArch64::ZAH1)
+ .Case("za0v.b", AArch64::ZAB0)
+ .Default(0);
+}
+
bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
return tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success;
@@ -2341,6 +2636,9 @@
if ((RegNum = MatchNeonVectorRegName(Name)))
return Kind == RegKind::NeonVector ? RegNum : 0;
+ if ((RegNum = matchMatrixRegName(Name)))
+ return Kind == RegKind::Matrix ? RegNum : 0;
+
// The parsed register must be of RegKind Scalar
if ((RegNum = MatchRegisterName(Name)))
return Kind == RegKind::Scalar ? RegNum : 0;
@@ -2669,10 +2967,8 @@
RealVal.changeSign();
if (AddFPZeroAsLiteral && RealVal.isPosZero()) {
- Operands.push_back(
- AArch64Operand::CreateToken("#0", false, S, getContext()));
- Operands.push_back(
- AArch64Operand::CreateToken(".0", false, S, getContext()));
+ Operands.push_back(AArch64Operand::CreateToken("#0", S, getContext()));
+ Operands.push_back(AArch64Operand::CreateToken(".0", S, getContext()));
} else
Operands.push_back(AArch64Operand::CreateFPImm(
RealVal, *StatusOrErr == APFloat::opOK, S, getContext()));
@@ -2700,9 +2996,8 @@
if (parseSymbolicImmVal(Imm))
return MatchOperand_ParseFail;
else if (Parser.getTok().isNot(AsmToken::Comma)) {
- SMLoc E = Parser.getTok().getLoc();
Operands.push_back(
- AArch64Operand::CreateImm(Imm, S, E, getContext()));
+ AArch64Operand::CreateImm(Imm, S, getLoc(), getContext()));
return MatchOperand_Success;
}
@@ -2711,8 +3006,8 @@
// The optional operand must be "lsl #N" where N is non-negative.
if (!Parser.getTok().is(AsmToken::Identifier) ||
- !Parser.getTok().getIdentifier().equals_lower("lsl")) {
- Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+ !Parser.getTok().getIdentifier().equals_insensitive("lsl")) {
+ Error(getLoc(), "only 'lsl #+N' valid after immediate");
return MatchOperand_ParseFail;
}
@@ -2722,28 +3017,27 @@
parseOptionalToken(AsmToken::Hash);
if (Parser.getTok().isNot(AsmToken::Integer)) {
- Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+ Error(getLoc(), "only 'lsl #+N' valid after immediate");
return MatchOperand_ParseFail;
}
int64_t ShiftAmount = Parser.getTok().getIntVal();
if (ShiftAmount < 0) {
- Error(Parser.getTok().getLoc(), "positive shift amount required");
+ Error(getLoc(), "positive shift amount required");
return MatchOperand_ParseFail;
}
Parser.Lex(); // Eat the number
// Just in case the optional lsl #0 is used for immediates other than zero.
if (ShiftAmount == 0 && Imm != nullptr) {
- SMLoc E = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateImm(Imm, S, E, getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateImm(Imm, S, getLoc(), getContext()));
return MatchOperand_Success;
}
- SMLoc E = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount,
- S, E, getContext()));
+ Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S,
+ getLoc(), getContext()));
return MatchOperand_Success;
}
@@ -2813,6 +3107,89 @@
return false;
}
+OperandMatchResultTy
+AArch64AsmParser::tryParseSVCR(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc S = getLoc();
+
+ if (Tok.isNot(AsmToken::Identifier)) {
+ TokError("invalid operand for instruction");
+ return MatchOperand_ParseFail;
+ }
+
+ unsigned PStateImm = -1;
+ const auto *SVCR = AArch64SVCR::lookupSVCRByName(Tok.getString());
+ if (SVCR && SVCR->haveFeatures(getSTI().getFeatureBits()))
+ PStateImm = SVCR->Encoding;
+
+ Operands.push_back(
+ AArch64Operand::CreateSVCR(PStateImm, Tok.getString(), S, getContext()));
+ Parser.Lex(); // Eat identifier token.
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseMatrixRegister(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc S = getLoc();
+
+ StringRef Name = Tok.getString();
+
+ if (Name.equals_insensitive("za")) {
+ Parser.Lex(); // eat "za"
+ Operands.push_back(AArch64Operand::CreateMatrixRegister(
+ AArch64::ZA, /*ElementWidth=*/0, MatrixKind::Array, S, getLoc(),
+ getContext()));
+ if (getLexer().is(AsmToken::LBrac)) {
+ // There's no comma after matrix operand, so we can parse the next operand
+ // immediately.
+ if (parseOperand(Operands, false, false))
+ return MatchOperand_NoMatch;
+ }
+ return MatchOperand_Success;
+ }
+
+ // Try to parse matrix register.
+ unsigned Reg = matchRegisterNameAlias(Name, RegKind::Matrix);
+ if (!Reg)
+ return MatchOperand_NoMatch;
+
+ size_t DotPosition = Name.find('.');
+ assert(DotPosition != StringRef::npos && "Unexpected register");
+
+ StringRef Head = Name.take_front(DotPosition);
+ StringRef Tail = Name.drop_front(DotPosition);
+ StringRef RowOrColumn = Head.take_back();
+
+ MatrixKind Kind = StringSwitch<MatrixKind>(RowOrColumn)
+ .Case("h", MatrixKind::Row)
+ .Case("v", MatrixKind::Col)
+ .Default(MatrixKind::Tile);
+
+ // Next up, parsing the suffix
+ const auto &KindRes = parseVectorKind(Tail, RegKind::Matrix);
+ if (!KindRes) {
+ TokError("Expected the register to be followed by element width suffix");
+ return MatchOperand_ParseFail;
+ }
+ unsigned ElementWidth = KindRes->second;
+
+ Parser.Lex();
+
+ Operands.push_back(AArch64Operand::CreateMatrixRegister(
+ Reg, ElementWidth, Kind, S, getLoc(), getContext()));
+
+ if (getLexer().is(AsmToken::LBrac)) {
+ // There's no comma after matrix operand, so we can parse the next operand
+ // immediately.
+ if (parseOperand(Operands, false, false))
+ return MatchOperand_NoMatch;
+ }
+ return MatchOperand_Success;
+}
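For clarity, a simplified standalone sketch (not the parser itself): a name such as "za3h.s" is split at the '.', the character just before the dot selects row ('h') versus column ('v') versus whole tile, and the suffix selects the element width.

// Standalone sketch of the SME matrix-register name splitting done above.
// It only mirrors the string handling; real register matching is omitted.
#include <iostream>
#include <string>

enum class MatrixKind { Array, Tile, Row, Col };

void classify(const std::string &Name) {
  size_t Dot = Name.find('.');
  std::string Head = Name.substr(0, Dot); // e.g. "za3h"
  std::string Tail = Name.substr(Dot);    // e.g. ".s"
  char RowOrCol = Head.back();            // 'h' => row, 'v' => column
  MatrixKind Kind = RowOrCol == 'h'   ? MatrixKind::Row
                    : RowOrCol == 'v' ? MatrixKind::Col
                                      : MatrixKind::Tile;
  std::cout << Name << ": kind=" << static_cast<int>(Kind)
            << " suffix=" << Tail << '\n';
}

int main() {
  classify("za3h.s"); // row slice, 32-bit elements
  classify("za1v.d"); // column slice, 64-bit elements
  classify("za0.b");  // whole tile, 8-bit elements
}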
+
/// tryParseOptionalShift - Some operands take an optional shift argument. Parse
/// them if present.
OperandMatchResultTy
@@ -2863,7 +3240,7 @@
// Make sure we do actually have a number, identifier or a parenthesized
// expression.
- SMLoc E = Parser.getTok().getLoc();
+ SMLoc E = getLoc();
if (!Parser.getTok().is(AsmToken::Integer) &&
!Parser.getTok().is(AsmToken::LParen) &&
!Parser.getTok().is(AsmToken::Identifier)) {
@@ -2906,9 +3283,11 @@
{"mte", {AArch64::FeatureMTE}},
{"memtag", {AArch64::FeatureMTE}},
{"tlb-rmi", {AArch64::FeatureTLB_RMI}},
+ {"pan", {AArch64::FeaturePAN}},
{"pan-rwv", {AArch64::FeaturePAN_RWV}},
{"ccpp", {AArch64::FeatureCCPP}},
{"rcpc", {AArch64::FeatureRCPC}},
+ {"rng", {AArch64::FeatureRandGen}},
{"sve", {AArch64::FeatureSVE}},
{"sve2", {AArch64::FeatureSVE2}},
{"sve2-aes", {AArch64::FeatureSVE2AES}},
@@ -2919,8 +3298,11 @@
{"xs", {AArch64::FeatureXS}},
{"pauth", {AArch64::FeaturePAuth}},
{"flagm", {AArch64::FeatureFlagM}},
+ {"rme", {AArch64::FeatureRME}},
+ {"sme", {AArch64::FeatureSME}},
+ {"sme-f64", {AArch64::FeatureSMEF64}},
+ {"sme-i64", {AArch64::FeatureSMEI64}},
// FIXME: Unsupported extensions
- {"pan", {}},
{"lor", {}},
{"rdma", {}},
{"profile", {}},
@@ -2980,8 +3362,7 @@
return TokError("invalid operand");
Mnemonic = Name;
- Operands.push_back(
- AArch64Operand::CreateToken("sys", false, NameLoc, getContext()));
+ Operands.push_back(AArch64Operand::CreateToken("sys", NameLoc, getContext()));
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
@@ -3206,6 +3587,9 @@
if (Tok.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
+ if (AArch64SVCR::lookupSVCRByName(Tok.getString()))
+ return MatchOperand_NoMatch;
+
int MRSReg, MSRReg;
auto SysReg = AArch64SysReg::lookupSysRegByName(Tok.getString());
if (SysReg && SysReg->haveFeatures(getSTI().getFeatureBits())) {
@@ -3254,8 +3638,7 @@
// If there was an explicit qualifier, that goes on as a literal text
// operand.
if (!Kind.empty())
- Operands.push_back(
- AArch64Operand::CreateToken(Kind, false, S, getContext()));
+ Operands.push_back(AArch64Operand::CreateToken(Kind, S, getContext()));
return tryParseVectorIndex(Operands) == MatchOperand_ParseFail;
}
@@ -3342,6 +3725,13 @@
RegNum, RegKind::SVEPredicateVector, ElementWidth, S,
getLoc(), getContext()));
+ if (getLexer().is(AsmToken::LBrac)) {
+ // Indexed predicate, there's no comma so try parse the next operand
+ // immediately.
+ if (parseOperand(Operands, false, false))
+ return MatchOperand_NoMatch;
+ }
+
// Not all predicates are followed by a '/m' or '/z'.
MCAsmParser &Parser = getParser();
if (Parser.getTok().isNot(AsmToken::Slash))
@@ -3354,8 +3744,7 @@
}
// Add a literal slash as operand
- Operands.push_back(
- AArch64Operand::CreateToken("/" , false, getLoc(), getContext()));
+ Operands.push_back(AArch64Operand::CreateToken("/", getLoc(), getContext()));
Parser.Lex(); // Eat the slash.
@@ -3368,8 +3757,7 @@
// Add zero/merge token.
const char *ZM = Pred == "z" ? "z" : "m";
- Operands.push_back(
- AArch64Operand::CreateToken(ZM, false, getLoc(), getContext()));
+ Operands.push_back(AArch64Operand::CreateToken(ZM, getLoc(), getContext()));
Parser.Lex(); // Eat zero/merge token.
return MatchOperand_Success;
@@ -3467,6 +3855,120 @@
return false;
}
+OperandMatchResultTy
+AArch64AsmParser::tryParseMatrixTileList(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+
+ if (Parser.getTok().isNot(AsmToken::LCurly))
+ return MatchOperand_NoMatch;
+
+ auto ParseMatrixTile = [this, &Parser](unsigned &Reg,
+ unsigned &ElementWidth) {
+ StringRef Name = Parser.getTok().getString();
+ size_t DotPosition = Name.find('.');
+ if (DotPosition == StringRef::npos)
+ return MatchOperand_NoMatch;
+
+ unsigned RegNum = matchMatrixTileListRegName(Name);
+ if (!RegNum)
+ return MatchOperand_NoMatch;
+
+ StringRef Tail = Name.drop_front(DotPosition);
+ const Optional<std::pair<int, int>> &KindRes =
+ parseVectorKind(Tail, RegKind::Matrix);
+ if (!KindRes) {
+ TokError("Expected the register to be followed by element width suffix");
+ return MatchOperand_ParseFail;
+ }
+ ElementWidth = KindRes->second;
+ Reg = RegNum;
+ Parser.Lex(); // Eat the register.
+ return MatchOperand_Success;
+ };
+
+ SMLoc S = getLoc();
+ auto LCurly = Parser.getTok();
+ Parser.Lex(); // Eat left bracket token.
+
+ // Empty matrix list
+ if (parseOptionalToken(AsmToken::RCurly)) {
+ Operands.push_back(AArch64Operand::CreateMatrixTileList(
+ /*RegMask=*/0, S, getLoc(), getContext()));
+ return MatchOperand_Success;
+ }
+
+ // Try to parse the {za} alias early.
+ if (Parser.getTok().getString().equals_insensitive("za")) {
+ Parser.Lex(); // Eat 'za'
+
+ if (parseToken(AsmToken::RCurly, "'}' expected"))
+ return MatchOperand_ParseFail;
+
+ Operands.push_back(AArch64Operand::CreateMatrixTileList(
+ /*RegMask=*/0xFF, S, getLoc(), getContext()));
+ return MatchOperand_Success;
+ }
+
+ SMLoc TileLoc = getLoc();
+
+ unsigned FirstReg, ElementWidth;
+ auto ParseRes = ParseMatrixTile(FirstReg, ElementWidth);
+ if (ParseRes != MatchOperand_Success) {
+ Parser.getLexer().UnLex(LCurly);
+ return ParseRes;
+ }
+
+ const MCRegisterInfo *RI = getContext().getRegisterInfo();
+
+ unsigned PrevReg = FirstReg;
+ unsigned Count = 1;
+
+ SmallSet<unsigned, 8> DRegs;
+ AArch64Operand::ComputeRegsForAlias(FirstReg, DRegs, ElementWidth);
+
+ SmallSet<unsigned, 8> SeenRegs;
+ SeenRegs.insert(FirstReg);
+
+ while (parseOptionalToken(AsmToken::Comma)) {
+ TileLoc = getLoc();
+ unsigned Reg, NextElementWidth;
+ ParseRes = ParseMatrixTile(Reg, NextElementWidth);
+ if (ParseRes != MatchOperand_Success)
+ return ParseRes;
+
+ // Element size must match on all regs in the list.
+ if (ElementWidth != NextElementWidth) {
+ Error(TileLoc, "mismatched register size suffix");
+ return MatchOperand_ParseFail;
+ }
+
+ if (RI->getEncodingValue(Reg) <= (RI->getEncodingValue(PrevReg)))
+ Warning(TileLoc, "tile list not in ascending order");
+
+ if (SeenRegs.contains(Reg))
+ Warning(TileLoc, "duplicate tile in list");
+ else {
+ SeenRegs.insert(Reg);
+ AArch64Operand::ComputeRegsForAlias(Reg, DRegs, ElementWidth);
+ }
+
+ PrevReg = Reg;
+ ++Count;
+ }
+
+ if (parseToken(AsmToken::RCurly, "'}' expected"))
+ return MatchOperand_ParseFail;
+
+ unsigned RegMask = 0;
+ for (auto Reg : DRegs)
+ RegMask |= 0x1 << (RI->getEncodingValue(Reg) -
+ RI->getEncodingValue(AArch64::ZAD0));
+ Operands.push_back(
+ AArch64Operand::CreateMatrixTileList(RegMask, S, getLoc(), getContext()));
+
+ return MatchOperand_Success;
+}
+
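For reference, a minimal standalone sketch of the mask computation at the end of tryParseMatrixTileList above: each parsed tile is first expanded to the 64-bit (.d) tiles it overlaps, and every such tile contributes one bit, indexed by its distance from ZAD0. The plain 0–7 encodings and the helper name below are illustrative stand-ins, not LLVM's types.

#include <cstdint>
#include <initializer_list>
#include <set>

// Encodings 0..7 stand in for AArch64::ZAD0..ZAD7 (illustrative only).
static uint8_t buildTileListMask(std::initializer_list<unsigned> DTiles) {
  std::set<unsigned> DRegs(DTiles); // duplicates collapse, as in the parser
  uint8_t RegMask = 0;
  for (unsigned Enc : DRegs)
    RegMask |= uint8_t(1u << Enc); // bit i set <=> za<i>.d is covered
  return RegMask;
}

// Example: "{ za0.d, za2.d, za7.d }" -> 0x85; "{ za }" -> 0xFF.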
template <RegKind VectorKind>
OperandMatchResultTy
AArch64AsmParser::tryParseVectorList(OperandVector &Operands,
@@ -3488,7 +3990,8 @@
if (RegTok.isNot(AsmToken::Identifier) ||
ParseRes == MatchOperand_ParseFail ||
- (ParseRes == MatchOperand_NoMatch && NoMatchIsError)) {
+ (ParseRes == MatchOperand_NoMatch && NoMatchIsError &&
+ !RegTok.getString().startswith_insensitive("za"))) {
Error(Loc, "vector register expected");
return MatchOperand_ParseFail;
}
@@ -3672,19 +4175,20 @@
// Some SVE instructions have a decoration after the immediate, i.e.
// "mul vl". We parse them here and add tokens, which must be present in the
// asm string in the tablegen instruction.
- bool NextIsVL = Parser.getLexer().peekTok().getString().equals_lower("vl");
+ bool NextIsVL =
+ Parser.getLexer().peekTok().getString().equals_insensitive("vl");
bool NextIsHash = Parser.getLexer().peekTok().is(AsmToken::Hash);
- if (!Parser.getTok().getString().equals_lower("mul") ||
+ if (!Parser.getTok().getString().equals_insensitive("mul") ||
!(NextIsVL || NextIsHash))
return true;
Operands.push_back(
- AArch64Operand::CreateToken("mul", false, getLoc(), getContext()));
+ AArch64Operand::CreateToken("mul", getLoc(), getContext()));
Parser.Lex(); // Eat the "mul"
if (NextIsVL) {
Operands.push_back(
- AArch64Operand::CreateToken("vl", false, getLoc(), getContext()));
+ AArch64Operand::CreateToken("vl", getLoc(), getContext()));
Parser.Lex(); // Eat the "vl"
return false;
}
@@ -3712,8 +4216,15 @@
auto Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier))
return true;
- Operands.push_back(AArch64Operand::CreateToken(Tok.getString(), false,
- Tok.getLoc(), getContext()));
+
+ auto Keyword = Tok.getString();
+ Keyword = StringSwitch<StringRef>(Keyword.lower())
+ .Case("sm", "sm")
+ .Case("za", "za")
+ .Default(Keyword);
+ Operands.push_back(
+ AArch64Operand::CreateToken(Keyword, Tok.getLoc(), getContext()));
+
Parser.Lex();
return false;
}
@@ -3751,17 +4262,26 @@
return false;
}
case AsmToken::LBrac: {
- SMLoc Loc = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateToken("[", false, Loc,
- getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateToken("[", getLoc(), getContext()));
Parser.Lex(); // Eat '['
// There's no comma after a '[', so we can parse the next operand
// immediately.
return parseOperand(Operands, false, false);
}
- case AsmToken::LCurly:
- return parseNeonVectorList(Operands);
+ case AsmToken::LCurly: {
+ if (!parseNeonVectorList(Operands))
+ return false;
+
+ Operands.push_back(
+ AArch64Operand::CreateToken("{", getLoc(), getContext()));
+ Parser.Lex(); // Eat '{'
+
+ // There's no comma after a '{', so we can parse the next operand
+ // immediately.
+ return parseOperand(Operands, false, false);
+ }
case AsmToken::Identifier: {
// If we're expecting a Condition Code operand, then just parse that.
if (isCondCode)
@@ -3776,6 +4296,11 @@
if (!parseOptionalMulOperand(Operands))
return false;
+ // If this is an "smstart" or "smstop" instruction, parse its special
+ // keyword operand as an identifier.
+ if (Mnemonic == "smstart" || Mnemonic == "smstop")
+ return parseKeywordOperand(Operands);
+
// This could be an optional "shift" or "extend" operand.
OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands);
// We can only continue if no tokens were eaten.
@@ -3830,10 +4355,8 @@
return TokError("expected floating-point constant #0.0");
Parser.Lex(); // Eat the token.
- Operands.push_back(
- AArch64Operand::CreateToken("#0", false, S, getContext()));
- Operands.push_back(
- AArch64Operand::CreateToken(".0", false, S, getContext()));
+ Operands.push_back(AArch64Operand::CreateToken("#0", S, getContext()));
+ Operands.push_back(AArch64Operand::CreateToken(".0", S, getContext()));
return false;
}
@@ -3873,9 +4396,9 @@
Imm >>= 16;
}
if (ShiftAmt <= MaxShiftAmt && Imm <= 0xFFFF) {
- Operands[0] = AArch64Operand::CreateToken("movz", false, Loc, Ctx);
- Operands.push_back(AArch64Operand::CreateImm(
- MCConstantExpr::create(Imm, Ctx), S, E, Ctx));
+ Operands[0] = AArch64Operand::CreateToken("movz", Loc, Ctx);
+ Operands.push_back(AArch64Operand::CreateImm(
+ MCConstantExpr::create(Imm, Ctx), S, E, Ctx));
if (ShiftAmt)
Operands.push_back(AArch64Operand::CreateShiftExtend(AArch64_AM::LSL,
ShiftAmt, true, S, E, Ctx));
@@ -4025,8 +4548,7 @@
Head == "cfp" || Head == "dvp" || Head == "cpp")
return parseSysAlias(Head, NameLoc, Operands);
- Operands.push_back(
- AArch64Operand::CreateToken(Head, false, NameLoc, getContext()));
+ Operands.push_back(AArch64Operand::CreateToken(Head, NameLoc, getContext()));
Mnemonic = Head;
// Handle condition codes for a branch mnemonic
@@ -4040,8 +4562,8 @@
AArch64CC::CondCode CC = parseCondCodeString(Head);
if (CC == AArch64CC::Invalid)
return Error(SuffixLoc, "invalid condition code");
- Operands.push_back(
- AArch64Operand::CreateToken(".", true, SuffixLoc, getContext()));
+ Operands.push_back(AArch64Operand::CreateToken(".", SuffixLoc, getContext(),
+ /*IsSuffix=*/true));
Operands.push_back(
AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext()));
}
@@ -4053,8 +4575,8 @@
Head = Name.slice(Start, Next);
SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
(Head.data() - Name.data()) + 1);
- Operands.push_back(
- AArch64Operand::CreateToken(Head, true, SuffixLoc, getContext()));
+ Operands.push_back(AArch64Operand::CreateToken(
+ Head, SuffixLoc, getContext(), /*IsSuffix=*/true));
}
// Conditional compare instructions have a Condition Code operand, which needs
@@ -4087,23 +4609,29 @@
return true;
}
- // After successfully parsing some operands there are two special cases to
- // consider (i.e. notional operands not separated by commas). Both are due
- // to memory specifiers:
+ // After successfully parsing some operands there are three special cases
+ // to consider (i.e. notional operands not separated by commas). Two are
+ // due to memory specifiers:
// + An RBrac will end an address for load/store/prefetch
// + An '!' will indicate a pre-indexed operation.
//
+ // And a further case is '}', which ends a group of tokens specifying the
+ // SME accumulator array 'ZA' or tile vector, i.e.
+ //
+ // '{ ZA }' or '{ <ZAt><HV>.<BHSDQ>[<Wv>, #<imm>] }'
+ //
// It's someone else's responsibility to make sure these tokens are sane
// in the given context!
- SMLoc RLoc = Parser.getTok().getLoc();
if (parseOptionalToken(AsmToken::RBrac))
Operands.push_back(
- AArch64Operand::CreateToken("]", false, RLoc, getContext()));
- SMLoc ELoc = Parser.getTok().getLoc();
+ AArch64Operand::CreateToken("]", getLoc(), getContext()));
if (parseOptionalToken(AsmToken::Exclaim))
Operands.push_back(
- AArch64Operand::CreateToken("!", false, ELoc, getContext()));
+ AArch64Operand::CreateToken("!", getLoc(), getContext()));
+ if (parseOptionalToken(AsmToken::RCurly))
+ Operands.push_back(
+ AArch64Operand::CreateToken("}", getLoc(), getContext()));
++N;
} while (parseOptionalToken(AsmToken::Comma));
@@ -4558,6 +5086,8 @@
return Error(Loc, "index must be a multiple of 16 in range [0, 65520].");
case Match_InvalidImm0_1:
return Error(Loc, "immediate must be an integer in range [0, 1].");
+ case Match_InvalidImm0_3:
+ return Error(Loc, "immediate must be an integer in range [0, 3].");
case Match_InvalidImm0_7:
return Error(Loc, "immediate must be an integer in range [0, 7].");
case Match_InvalidImm0_15:
@@ -4623,6 +5153,7 @@
case Match_MRS:
return Error(Loc, "expected readable system register");
case Match_MSR:
+ case Match_InvalidSVCR:
return Error(Loc, "expected writable system register or pstate");
case Match_InvalidComplexRotationEven:
return Error(Loc, "complex rotation must be 0, 90, 180 or 270.");
@@ -4642,6 +5173,9 @@
return Error(Loc, "register must be x0..x30 or xzr, with required shift 'lsl #2'");
case Match_InvalidGPR64shifted64:
return Error(Loc, "register must be x0..x30 or xzr, with required shift 'lsl #3'");
+ case Match_InvalidGPR64shifted128:
+ return Error(
+ Loc, "register must be x0..x30 or xzr, with required shift 'lsl #4'");
case Match_InvalidGPR64NoXZRshifted8:
return Error(Loc, "register must be x0..x30 without shift");
case Match_InvalidGPR64NoXZRshifted16:
@@ -4650,6 +5184,8 @@
return Error(Loc, "register must be x0..x30 with required shift 'lsl #2'");
case Match_InvalidGPR64NoXZRshifted64:
return Error(Loc, "register must be x0..x30 with required shift 'lsl #3'");
+ case Match_InvalidGPR64NoXZRshifted128:
+ return Error(Loc, "register must be x0..x30 with required shift 'lsl #4'");
case Match_InvalidZPR32UXTW8:
case Match_InvalidZPR32SXTW8:
return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw)'");
@@ -4734,6 +5270,33 @@
return Error(Loc, "Invalid floating point constant, expected 0.5 or 2.0.");
case Match_InvalidSVEExactFPImmOperandZeroOne:
return Error(Loc, "Invalid floating point constant, expected 0.0 or 1.0.");
+ case Match_InvalidMatrixTileVectorH8:
+ case Match_InvalidMatrixTileVectorV8:
+ return Error(Loc, "invalid matrix operand, expected za0h.b or za0v.b");
+ case Match_InvalidMatrixTileVectorH16:
+ case Match_InvalidMatrixTileVectorV16:
+ return Error(Loc,
+ "invalid matrix operand, expected za[0-1]h.h or za[0-1]v.h");
+ case Match_InvalidMatrixTileVectorH32:
+ case Match_InvalidMatrixTileVectorV32:
+ return Error(Loc,
+ "invalid matrix operand, expected za[0-3]h.s or za[0-3]v.s");
+ case Match_InvalidMatrixTileVectorH64:
+ case Match_InvalidMatrixTileVectorV64:
+ return Error(Loc,
+ "invalid matrix operand, expected za[0-7]h.d or za[0-7]v.d");
+ case Match_InvalidMatrixTileVectorH128:
+ case Match_InvalidMatrixTileVectorV128:
+ return Error(Loc,
+ "invalid matrix operand, expected za[0-15]h.q or za[0-15]v.q");
+ case Match_InvalidMatrixTile32:
+ return Error(Loc, "invalid matrix operand, expected za[0-3].s");
+ case Match_InvalidMatrixTile64:
+ return Error(Loc, "invalid matrix operand, expected za[0-7].d");
+ case Match_InvalidMatrix:
+ return Error(Loc, "invalid matrix operand, expected za");
+ case Match_InvalidMatrixIndexGPR32_12_15:
+ return Error(Loc, "operand must be a register in range [w12, w15]");
default:
llvm_unreachable("unexpected error code!");
}
@@ -4774,8 +5337,8 @@
const MCExpr *NewOp3 = MCConstantExpr::create(NewOp3Val, getContext());
const MCExpr *NewOp4 = MCConstantExpr::create(NewOp4Val, getContext());
- Operands[0] = AArch64Operand::CreateToken(
- "ubfm", false, Op.getStartLoc(), getContext());
+ Operands[0] =
+ AArch64Operand::CreateToken("ubfm", Op.getStartLoc(), getContext());
Operands.push_back(AArch64Operand::CreateImm(
NewOp4, Op3.getStartLoc(), Op3.getEndLoc(), getContext()));
Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3.getStartLoc(),
@@ -4824,8 +5387,8 @@
const MCExpr *ImmRExpr = MCConstantExpr::create(ImmR, getContext());
const MCExpr *ImmSExpr = MCConstantExpr::create(ImmS, getContext());
- Operands[0] = AArch64Operand::CreateToken(
- "bfm", false, Op.getStartLoc(), getContext());
+ Operands[0] =
+ AArch64Operand::CreateToken("bfm", Op.getStartLoc(), getContext());
Operands[2] = AArch64Operand::CreateReg(
RegWidth == 32 ? AArch64::WZR : AArch64::XZR, RegKind::Scalar,
SMLoc(), SMLoc(), getContext());
@@ -4887,14 +5450,14 @@
Operands[4] = AArch64Operand::CreateImm(
NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
if (Tok == "bfi")
- Operands[0] = AArch64Operand::CreateToken(
- "bfm", false, Op.getStartLoc(), getContext());
+ Operands[0] = AArch64Operand::CreateToken("bfm", Op.getStartLoc(),
+ getContext());
else if (Tok == "sbfiz")
- Operands[0] = AArch64Operand::CreateToken(
- "sbfm", false, Op.getStartLoc(), getContext());
+ Operands[0] = AArch64Operand::CreateToken("sbfm", Op.getStartLoc(),
+ getContext());
else if (Tok == "ubfiz")
- Operands[0] = AArch64Operand::CreateToken(
- "ubfm", false, Op.getStartLoc(), getContext());
+ Operands[0] = AArch64Operand::CreateToken("ubfm", Op.getStartLoc(),
+ getContext());
else
llvm_unreachable("No valid mnemonic for alias?");
}
@@ -4941,14 +5504,14 @@
Operands[4] = AArch64Operand::CreateImm(
NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
if (Tok == "bfxil")
- Operands[0] = AArch64Operand::CreateToken(
- "bfm", false, Op.getStartLoc(), getContext());
+ Operands[0] = AArch64Operand::CreateToken("bfm", Op.getStartLoc(),
+ getContext());
else if (Tok == "sbfx")
- Operands[0] = AArch64Operand::CreateToken(
- "sbfm", false, Op.getStartLoc(), getContext());
+ Operands[0] = AArch64Operand::CreateToken("sbfm", Op.getStartLoc(),
+ getContext());
else if (Tok == "ubfx")
- Operands[0] = AArch64Operand::CreateToken(
- "ubfm", false, Op.getStartLoc(), getContext());
+ Operands[0] = AArch64Operand::CreateToken("ubfm", Op.getStartLoc(),
+ getContext());
else
llvm_unreachable("No valid mnemonic for alias?");
}
@@ -4974,8 +5537,8 @@
" correctly on this CPU, converting to equivalent movi.16b");
// Switch the suffix to .16b.
unsigned Idx = Op1.isToken() ? 1 : 2;
- Operands[Idx] = AArch64Operand::CreateToken(".16b", false, IDLoc,
- getContext());
+ Operands[Idx] =
+ AArch64Operand::CreateToken(".16b", IDLoc, getContext());
}
}
}
@@ -5162,6 +5725,7 @@
case Match_InvalidMemoryIndexed16SImm9:
case Match_InvalidMemoryIndexed8SImm10:
case Match_InvalidImm0_1:
+ case Match_InvalidImm0_3:
case Match_InvalidImm0_7:
case Match_InvalidImm0_15:
case Match_InvalidImm0_31:
@@ -5198,10 +5762,12 @@
case Match_InvalidGPR64shifted16:
case Match_InvalidGPR64shifted32:
case Match_InvalidGPR64shifted64:
+ case Match_InvalidGPR64shifted128:
case Match_InvalidGPR64NoXZRshifted8:
case Match_InvalidGPR64NoXZRshifted16:
case Match_InvalidGPR64NoXZRshifted32:
case Match_InvalidGPR64NoXZRshifted64:
+ case Match_InvalidGPR64NoXZRshifted128:
case Match_InvalidZPR32UXTW8:
case Match_InvalidZPR32UXTW16:
case Match_InvalidZPR32UXTW32:
@@ -5252,6 +5818,21 @@
case Match_InvalidSVEExactFPImmOperandHalfOne:
case Match_InvalidSVEExactFPImmOperandHalfTwo:
case Match_InvalidSVEExactFPImmOperandZeroOne:
+ case Match_InvalidMatrixTile32:
+ case Match_InvalidMatrixTile64:
+ case Match_InvalidMatrix:
+ case Match_InvalidMatrixTileVectorH8:
+ case Match_InvalidMatrixTileVectorH16:
+ case Match_InvalidMatrixTileVectorH32:
+ case Match_InvalidMatrixTileVectorH64:
+ case Match_InvalidMatrixTileVectorH128:
+ case Match_InvalidMatrixTileVectorV8:
+ case Match_InvalidMatrixTileVectorV16:
+ case Match_InvalidMatrixTileVectorV32:
+ case Match_InvalidMatrixTileVectorV64:
+ case Match_InvalidMatrixTileVectorV128:
+ case Match_InvalidSVCR:
+ case Match_InvalidMatrixIndexGPR32_12_15:
case Match_MSR:
case Match_MRS: {
if (ErrorInfo >= Operands.size())
@@ -5270,10 +5851,9 @@
/// ParseDirective parses the arm specific directives
bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
- const MCObjectFileInfo::Environment Format =
- getContext().getObjectFileInfo()->getObjectFileType();
- bool IsMachO = Format == MCObjectFileInfo::IsMachO;
- bool IsCOFF = Format == MCObjectFileInfo::IsCOFF;
+ const MCContext::Environment Format = getContext().getObjectFileType();
+ bool IsMachO = Format == MCContext::IsMachO;
+ bool IsCOFF = Format == MCContext::IsCOFF;
auto IDVal = DirectiveID.getIdentifier().lower();
SMLoc Loc = DirectiveID.getLoc();
@@ -5446,7 +6026,7 @@
for (auto Name : RequestedExtensions) {
bool EnableFeature = true;
- if (Name.startswith_lower("no")) {
+ if (Name.startswith_insensitive("no")) {
EnableFeature = false;
Name = Name.substr(2);
}
@@ -5482,7 +6062,7 @@
return true;
bool EnableFeature = true;
- if (Name.startswith_lower("no")) {
+ if (Name.startswith_insensitive("no")) {
EnableFeature = false;
Name = Name.substr(2);
}
@@ -5548,7 +6128,7 @@
bool EnableFeature = true;
- if (Name.startswith_lower("no")) {
+ if (Name.startswith_insensitive("no")) {
EnableFeature = false;
Name = Name.substr(2);
}
@@ -5598,9 +6178,7 @@
return false;
};
- if (parseMany(parseOp))
- return addErrorSuffix(" in '.inst' directive");
- return false;
+ return parseMany(parseOp);
}
// parseDirectiveTLSDescCall:
@@ -5756,9 +6334,7 @@
return TokError("unexpected input in .unreq directive.");
RegisterReqs.erase(Parser.getTok().getIdentifier().lower());
Parser.Lex(); // Eat the identifier.
- if (parseToken(AsmToken::EndOfStatement))
- return addErrorSuffix("in '.unreq' directive");
- return false;
+ return parseToken(AsmToken::EndOfStatement);
}
bool AArch64AsmParser::parseDirectiveCFINegateRAState() {
@@ -5791,16 +6367,13 @@
MCSymbol *Sym = getContext().lookupSymbol(SymbolName);
if (!Sym)
- return TokError("unknown symbol in '.variant_pcs' directive");
+ return TokError("unknown symbol");
Parser.Lex(); // Eat the symbol
- // Shouldn't be any more tokens
- if (parseToken(AsmToken::EndOfStatement))
- return addErrorSuffix(" in '.variant_pcs' directive");
-
+ if (parseEOL())
+ return true;
getTargetStreamer().emitDirectiveVariantPCS(Sym);
-
return false;
}
@@ -5810,14 +6383,14 @@
int64_t Size;
if (parseImmExpr(Size))
return true;
- getTargetStreamer().EmitARM64WinCFIAllocStack(Size);
+ getTargetStreamer().emitARM64WinCFIAllocStack(Size);
return false;
}
/// parseDirectiveSEHPrologEnd
/// ::= .seh_endprologue
bool AArch64AsmParser::parseDirectiveSEHPrologEnd(SMLoc L) {
- getTargetStreamer().EmitARM64WinCFIPrologEnd();
+ getTargetStreamer().emitARM64WinCFIPrologEnd();
return false;
}
@@ -5827,7 +6400,7 @@
int64_t Offset;
if (parseImmExpr(Offset))
return true;
- getTargetStreamer().EmitARM64WinCFISaveR19R20X(Offset);
+ getTargetStreamer().emitARM64WinCFISaveR19R20X(Offset);
return false;
}
@@ -5837,7 +6410,7 @@
int64_t Offset;
if (parseImmExpr(Offset))
return true;
- getTargetStreamer().EmitARM64WinCFISaveFPLR(Offset);
+ getTargetStreamer().emitARM64WinCFISaveFPLR(Offset);
return false;
}
@@ -5847,7 +6420,7 @@
int64_t Offset;
if (parseImmExpr(Offset))
return true;
- getTargetStreamer().EmitARM64WinCFISaveFPLRX(Offset);
+ getTargetStreamer().emitARM64WinCFISaveFPLRX(Offset);
return false;
}
@@ -5859,7 +6432,7 @@
if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) ||
parseComma() || parseImmExpr(Offset))
return true;
- getTargetStreamer().EmitARM64WinCFISaveReg(Reg, Offset);
+ getTargetStreamer().emitARM64WinCFISaveReg(Reg, Offset);
return false;
}
@@ -5871,7 +6444,7 @@
if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) ||
parseComma() || parseImmExpr(Offset))
return true;
- getTargetStreamer().EmitARM64WinCFISaveRegX(Reg, Offset);
+ getTargetStreamer().emitARM64WinCFISaveRegX(Reg, Offset);
return false;
}
@@ -5883,7 +6456,7 @@
if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::FP) ||
parseComma() || parseImmExpr(Offset))
return true;
- getTargetStreamer().EmitARM64WinCFISaveRegP(Reg, Offset);
+ getTargetStreamer().emitARM64WinCFISaveRegP(Reg, Offset);
return false;
}
@@ -5895,7 +6468,7 @@
if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::FP) ||
parseComma() || parseImmExpr(Offset))
return true;
- getTargetStreamer().EmitARM64WinCFISaveRegPX(Reg, Offset);
+ getTargetStreamer().emitARM64WinCFISaveRegPX(Reg, Offset);
return false;
}
@@ -5911,7 +6484,7 @@
if (check(((Reg - 19) % 2 != 0), L,
"expected register with even offset from x19"))
return true;
- getTargetStreamer().EmitARM64WinCFISaveLRPair(Reg, Offset);
+ getTargetStreamer().emitARM64WinCFISaveLRPair(Reg, Offset);
return false;
}
@@ -5923,7 +6496,7 @@
if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D15) ||
parseComma() || parseImmExpr(Offset))
return true;
- getTargetStreamer().EmitARM64WinCFISaveFReg(Reg, Offset);
+ getTargetStreamer().emitARM64WinCFISaveFReg(Reg, Offset);
return false;
}
@@ -5935,7 +6508,7 @@
if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D15) ||
parseComma() || parseImmExpr(Offset))
return true;
- getTargetStreamer().EmitARM64WinCFISaveFRegX(Reg, Offset);
+ getTargetStreamer().emitARM64WinCFISaveFRegX(Reg, Offset);
return false;
}
@@ -5947,7 +6520,7 @@
if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D14) ||
parseComma() || parseImmExpr(Offset))
return true;
- getTargetStreamer().EmitARM64WinCFISaveFRegP(Reg, Offset);
+ getTargetStreamer().emitARM64WinCFISaveFRegP(Reg, Offset);
return false;
}
@@ -5959,14 +6532,14 @@
if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D14) ||
parseComma() || parseImmExpr(Offset))
return true;
- getTargetStreamer().EmitARM64WinCFISaveFRegPX(Reg, Offset);
+ getTargetStreamer().emitARM64WinCFISaveFRegPX(Reg, Offset);
return false;
}
/// parseDirectiveSEHSetFP
/// ::= .seh_set_fp
bool AArch64AsmParser::parseDirectiveSEHSetFP(SMLoc L) {
- getTargetStreamer().EmitARM64WinCFISetFP();
+ getTargetStreamer().emitARM64WinCFISetFP();
return false;
}
@@ -5976,63 +6549,63 @@
int64_t Size;
if (parseImmExpr(Size))
return true;
- getTargetStreamer().EmitARM64WinCFIAddFP(Size);
+ getTargetStreamer().emitARM64WinCFIAddFP(Size);
return false;
}
/// parseDirectiveSEHNop
/// ::= .seh_nop
bool AArch64AsmParser::parseDirectiveSEHNop(SMLoc L) {
- getTargetStreamer().EmitARM64WinCFINop();
+ getTargetStreamer().emitARM64WinCFINop();
return false;
}
/// parseDirectiveSEHSaveNext
/// ::= .seh_save_next
bool AArch64AsmParser::parseDirectiveSEHSaveNext(SMLoc L) {
- getTargetStreamer().EmitARM64WinCFISaveNext();
+ getTargetStreamer().emitARM64WinCFISaveNext();
return false;
}
/// parseDirectiveSEHEpilogStart
/// ::= .seh_startepilogue
bool AArch64AsmParser::parseDirectiveSEHEpilogStart(SMLoc L) {
- getTargetStreamer().EmitARM64WinCFIEpilogStart();
+ getTargetStreamer().emitARM64WinCFIEpilogStart();
return false;
}
/// parseDirectiveSEHEpilogEnd
/// ::= .seh_endepilogue
bool AArch64AsmParser::parseDirectiveSEHEpilogEnd(SMLoc L) {
- getTargetStreamer().EmitARM64WinCFIEpilogEnd();
+ getTargetStreamer().emitARM64WinCFIEpilogEnd();
return false;
}
/// parseDirectiveSEHTrapFrame
/// ::= .seh_trap_frame
bool AArch64AsmParser::parseDirectiveSEHTrapFrame(SMLoc L) {
- getTargetStreamer().EmitARM64WinCFITrapFrame();
+ getTargetStreamer().emitARM64WinCFITrapFrame();
return false;
}
/// parseDirectiveSEHMachineFrame
/// ::= .seh_pushframe
bool AArch64AsmParser::parseDirectiveSEHMachineFrame(SMLoc L) {
- getTargetStreamer().EmitARM64WinCFIMachineFrame();
+ getTargetStreamer().emitARM64WinCFIMachineFrame();
return false;
}
/// parseDirectiveSEHContext
/// ::= .seh_context
bool AArch64AsmParser::parseDirectiveSEHContext(SMLoc L) {
- getTargetStreamer().EmitARM64WinCFIContext();
+ getTargetStreamer().emitARM64WinCFIContext();
return false;
}
/// parseDirectiveSEHClearUnwoundToCall
/// ::= .seh_clear_unwound_to_call
bool AArch64AsmParser::parseDirectiveSEHClearUnwoundToCall(SMLoc L) {
- getTargetStreamer().EmitARM64WinCFIClearUnwoundToCall();
+ getTargetStreamer().emitARM64WinCFIClearUnwoundToCall();
return false;
}
@@ -6144,6 +6717,14 @@
case MCK__HASH_8:
ExpectedVal = 8;
break;
+ case MCK_MPR:
+ // If the Kind is a token for the MPR register class which has the "za"
+ // register (SME accumulator array), check if the asm is a literal "za"
+ // token. This is for the "smstart za" alias that defines the register
+ // as a literal token.
+ if (Op.isTokenEqual("za"))
+ return Match_Success;
+ break;
}
if (!Op.isImm())
return Match_InvalidOperand;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/AArch64/CMakeLists.txt
index 0e9503b..a77a66b 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -10,6 +10,8 @@
tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel)
tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel)
+tablegen(LLVM AArch64GenO0PreLegalizeGICombiner.inc -gen-global-isel-combiner
+ -combiners="AArch64O0PreLegalizerCombinerHelper")
tablegen(LLVM AArch64GenPreLegalizeGICombiner.inc -gen-global-isel-combiner
-combiners="AArch64PreLegalizerCombinerHelper")
tablegen(LLVM AArch64GenPostLegalizeGICombiner.inc -gen-global-isel-combiner
@@ -29,8 +31,10 @@
add_llvm_target(AArch64CodeGen
GISel/AArch64CallLowering.cpp
+ GISel/AArch64GlobalISelUtils.cpp
GISel/AArch64InstructionSelector.cpp
GISel/AArch64LegalizerInfo.cpp
+ GISel/AArch64O0PreLegalizerCombiner.cpp
GISel/AArch64PreLegalizerCombiner.cpp
GISel/AArch64PostLegalizerCombiner.cpp
GISel/AArch64PostLegalizerLowering.cpp
@@ -59,6 +63,7 @@
AArch64ISelLowering.cpp
AArch64InstrInfo.cpp
AArch64LoadStoreOptimizer.cpp
+ AArch64LowerHomogeneousPrologEpilog.cpp
AArch64MachineFunctionInfo.cpp
AArch64MacroFusion.cpp
AArch64MCInstLower.cpp
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/src/llvm-project/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index dca76f8..1ed8a80 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -69,6 +69,10 @@
static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst,
unsigned RegNo, uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
@@ -111,6 +115,13 @@
static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+template <unsigned NumBitsForTile>
+static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMatrixTileListRegisterClass(MCInst &Inst,
+ unsigned RegMask,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
@@ -226,6 +237,8 @@
uint64_t Addr, const void *Decoder);
static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address,
+ const void *Decoder);
static bool Check(DecodeStatus &Out, DecodeStatus In) {
switch (In) {
@@ -276,6 +289,43 @@
for (auto Table : Tables) {
DecodeStatus Result =
decodeInstruction(Table, MI, Insn, Address, this, STI);
+
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ // For Scalable Matrix Extension (SME) instructions that have an implicit
+ // operand for the accumulator (ZA) which isn't encoded, manually insert
+ // operand.
+ case AArch64::LDR_ZA:
+ case AArch64::STR_ZA: {
+ MI.insert(MI.begin(), MCOperand::createReg(AArch64::ZA));
+ // Spill and fill instructions have a single immediate used for both the
+ // vector select offset and optional memory offset. Replicate the decoded
+ // immediate.
+ const MCOperand &Imm4Op = MI.getOperand(2);
+ assert(Imm4Op.isImm() && "Unexpected operand type!");
+ MI.addOperand(Imm4Op);
+ break;
+ }
+ case AArch64::LD1_MXIPXX_H_B:
+ case AArch64::LD1_MXIPXX_V_B:
+ case AArch64::ST1_MXIPXX_H_B:
+ case AArch64::ST1_MXIPXX_V_B:
+ case AArch64::INSERT_MXIPZ_H_B:
+ case AArch64::INSERT_MXIPZ_V_B:
+ // e.g.
+ // MOVA ZA0<HV>.B[<Ws>, <imm>], <Pg>/M, <Zn>.B
+ // ^ insert implicit 8-bit element tile
+ MI.insert(MI.begin(), MCOperand::createReg(AArch64::ZAB0));
+ break;
+ case AArch64::EXTRACT_ZPMXI_H_B:
+ case AArch64::EXTRACT_ZPMXI_V_B:
+ // MOVA <Zd>.B, <Pg>/M, ZA0<HV>.B[<Ws>, <imm>]
+ // ^ insert implicit 8-bit element tile
+ MI.insert(MI.begin()+2, MCOperand::createReg(AArch64::ZAB0));
+ break;
+ }
+
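A standalone sketch of what the post-decode fixup above does for LDR_ZA/STR_ZA: the un-encoded ZA accumulator operand is prepended, and the single decoded 4-bit immediate is duplicated so one value serves as both the vector-select offset and the memory offset. The tiny operand model below is an assumption for illustration, not MCInst/MCOperand.

#include <cassert>
#include <string>
#include <vector>

struct FakeOperand {
  bool IsReg;
  std::string Reg; // register name when IsReg, empty otherwise
  int Imm;         // immediate value when !IsReg
};

static void fixupZASpillFill(std::vector<FakeOperand> &Ops) {
  // Prepend the implicit, un-encoded ZA accumulator operand.
  Ops.insert(Ops.begin(), FakeOperand{true, "ZA", 0});
  // After the insert the decoded 4-bit immediate sits at index 2; append a
  // copy so it can act as both the vector-select and the memory offset.
  assert(Ops.size() > 2 && !Ops[2].IsReg && "expected immediate operand");
  Ops.push_back(Ops[2]);
}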
if (Result != MCDisassembler::Fail)
return Result;
}
@@ -502,6 +552,22 @@
return Success;
}
+static const unsigned MatrixIndexGPR32_12_15DecoderTable[] = {
+ AArch64::W12, AArch64::W13, AArch64::W14, AArch64::W15
+};
+
+static DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 3)
+ return Fail;
+
+ unsigned Register = MatrixIndexGPR32_12_15DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return Success;
+}
+
static const unsigned GPR32DecoderTable[] = {
AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4,
AArch64::W5, AArch64::W6, AArch64::W7, AArch64::W8, AArch64::W9,
@@ -642,6 +708,39 @@
return Success;
}
+static DecodeStatus DecodeMatrixTileListRegisterClass(MCInst &Inst,
+ unsigned RegMask,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegMask > 0xFF)
+ return Fail;
+ Inst.addOperand(MCOperand::createImm(RegMask));
+ return Success;
+}
+
+static const SmallVector<SmallVector<unsigned, 16>, 5>
+ MatrixZATileDecoderTable = {
+ {AArch64::ZAB0},
+ {AArch64::ZAH0, AArch64::ZAH1},
+ {AArch64::ZAS0, AArch64::ZAS1, AArch64::ZAS2, AArch64::ZAS3},
+ {AArch64::ZAD0, AArch64::ZAD1, AArch64::ZAD2, AArch64::ZAD3,
+ AArch64::ZAD4, AArch64::ZAD5, AArch64::ZAD6, AArch64::ZAD7},
+ {AArch64::ZAQ0, AArch64::ZAQ1, AArch64::ZAQ2, AArch64::ZAQ3,
+ AArch64::ZAQ4, AArch64::ZAQ5, AArch64::ZAQ6, AArch64::ZAQ7,
+ AArch64::ZAQ8, AArch64::ZAQ9, AArch64::ZAQ10, AArch64::ZAQ11,
+ AArch64::ZAQ12, AArch64::ZAQ13, AArch64::ZAQ14, AArch64::ZAQ15}};
+
+template <unsigned NumBitsForTile>
+static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ unsigned LastReg = (1 << NumBitsForTile) - 1;
+ if (RegNo > LastReg)
+ return Fail;
+ Inst.addOperand(
+ MCOperand::createReg(MatrixZATileDecoderTable[NumBitsForTile][RegNo]));
+ return Success;
+}
+
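A standalone illustration of the ZA tile lookup above: the template parameter (number of encoding bits) selects the row of the table, and RegNo selects the tile within that row. The string names below are an illustrative stand-in for the register enums.

static const char *const ZATileNames[5][16] = {
    {"za0.b"},
    {"za0.h", "za1.h"},
    {"za0.s", "za1.s", "za2.s", "za3.s"},
    {"za0.d", "za1.d", "za2.d", "za3.d", "za4.d", "za5.d", "za6.d", "za7.d"},
    {"za0.q", "za1.q", "za2.q", "za3.q", "za4.q", "za5.q", "za6.q", "za7.q",
     "za8.q", "za9.q", "za10.q", "za11.q", "za12.q", "za13.q", "za14.q",
     "za15.q"}};

template <unsigned NumBitsForTile>
static const char *decodeTileName(unsigned RegNo) {
  unsigned LastReg = (1u << NumBitsForTile) - 1;
  return RegNo <= LastReg ? ZATileNames[NumBitsForTile][RegNo] : nullptr;
}

// e.g. decodeTileName<2>(3) == "za3.s"; decodeTileName<0>(1) == nullptr.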
static const unsigned PPRDecoderTable[] = {
AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3,
AArch64::P4, AArch64::P5, AArch64::P6, AArch64::P7,
@@ -1931,3 +2030,12 @@
Inst.addOperand(MCOperand::createImm(Imm + 1));
return Success;
}
+
+static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address,
+ const void *Decoder) {
+ if (AArch64SVCR::lookupSVCRByEncoding(Imm)) {
+ Inst.addOperand(MCOperand::createImm(Imm));
+ return Success;
+ }
+ return Fail;
+}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 0f8b1d6..28b234b 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -51,49 +51,135 @@
AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
: CallLowering(&TLI) {}
+static void applyStackPassedSmallTypeDAGHack(EVT OrigVT, MVT &ValVT,
+ MVT &LocVT) {
+ // If ValVT is i1/i8/i16, we should set LocVT to i8/i8/i16. This is a legacy
+ // hack because the DAG calls the assignment function with pre-legalized
+ // register typed values, not the raw type.
+ //
+ // This hack is not applied to return values which are not passed on the
+ // stack.
+ if (OrigVT == MVT::i1 || OrigVT == MVT::i8)
+ ValVT = LocVT = MVT::i8;
+ else if (OrigVT == MVT::i16)
+ ValVT = LocVT = MVT::i16;
+}
+
+// Account for i1/i8/i16 stack passed value hack
+static LLT getStackValueStoreTypeHack(const CCValAssign &VA) {
+ const MVT ValVT = VA.getValVT();
+ return (ValVT == MVT::i8 || ValVT == MVT::i16) ? LLT(ValVT)
+ : LLT(VA.getLocVT());
+}
+
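A standalone restatement of the i1/i8/i16 stack-slot hack above, using a toy value-type enum (not LLVM's MVT) to make the effect concrete: an i1 or i8 value passed on the stack is assigned a full i8 slot, i16 keeps its i16 slot, and wider types are left to the normal assignment path.

enum class ToyVT { i1, i8, i16, i32, i64 };

// Mirrors applyStackPassedSmallTypeDAGHack for the toy types.
static void applySmallTypeHack(ToyVT OrigVT, ToyVT &ValVT, ToyVT &LocVT) {
  if (OrigVT == ToyVT::i1 || OrigVT == ToyVT::i8)
    ValVT = LocVT = ToyVT::i8; // i1/i8 occupy a byte-sized stack slot
  else if (OrigVT == ToyVT::i16)
    ValVT = LocVT = ToyVT::i16; // i16 keeps its 2-byte slot
  // wider types fall through unchanged
}

// e.g. an i1 argument: ValVT/LocVT become i8, so GISel loads/stores a byte.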
namespace {
+
+struct AArch64IncomingValueAssigner
+ : public CallLowering::IncomingValueAssigner {
+ AArch64IncomingValueAssigner(CCAssignFn *AssignFn_,
+ CCAssignFn *AssignFnVarArg_)
+ : IncomingValueAssigner(AssignFn_, AssignFnVarArg_) {}
+
+ bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
+ CCState &State) override {
+ applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT);
+ return IncomingValueAssigner::assignArg(ValNo, OrigVT, ValVT, LocVT,
+ LocInfo, Info, Flags, State);
+ }
+};
+
+struct AArch64OutgoingValueAssigner
+ : public CallLowering::OutgoingValueAssigner {
+ const AArch64Subtarget &Subtarget;
+
+ /// Track if this is used for a return instead of function argument
+ /// passing. We apply a hack to i1/i8/i16 stack passed values, but do not use
+ /// stack passed returns for them and cannot apply the type adjustment.
+ bool IsReturn;
+
+ AArch64OutgoingValueAssigner(CCAssignFn *AssignFn_,
+ CCAssignFn *AssignFnVarArg_,
+ const AArch64Subtarget &Subtarget_,
+ bool IsReturn)
+ : OutgoingValueAssigner(AssignFn_, AssignFnVarArg_),
+ Subtarget(Subtarget_), IsReturn(IsReturn) {}
+
+ bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
+ CCState &State) override {
+ bool IsCalleeWin = Subtarget.isCallingConvWin64(State.getCallingConv());
+ bool UseVarArgsCCForFixed = IsCalleeWin && State.isVarArg();
+
+ if (!State.isVarArg() && !UseVarArgsCCForFixed && !IsReturn)
+ applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT);
+
+ bool Res;
+ if (Info.IsFixed && !UseVarArgsCCForFixed)
+ Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
+ else
+ Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State);
+
+ StackOffset = State.getNextStackOffset();
+ return Res;
+ }
+};
+
struct IncomingArgHandler : public CallLowering::IncomingValueHandler {
- IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- CCAssignFn *AssignFn)
- : IncomingValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {}
+ IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+ : IncomingValueHandler(MIRBuilder, MRI) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
- MachinePointerInfo &MPO) override {
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
auto &MFI = MIRBuilder.getMF().getFrameInfo();
- int FI = MFI.CreateFixedObject(Size, Offset, true);
+
+ // Byval is assumed to be writable memory, but other stack passed arguments
+ // are not.
+ const bool IsImmutable = !Flags.isByVal();
+
+ int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
auto AddrReg = MIRBuilder.buildFrameIndex(LLT::pointer(0, 64), FI);
- StackUsed = std::max(StackUsed, Size + Offset);
return AddrReg.getReg(0);
}
+ LLT getStackValueStoreType(const DataLayout &DL, const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const override {
+ // For pointers, we just need to fixup the integer types reported in the
+ // CCValAssign.
+ if (Flags.isPointer())
+ return CallLowering::ValueHandler::getStackValueStoreType(DL, VA, Flags);
+ return getStackValueStoreTypeHack(VA);
+ }
+
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
- switch (VA.getLocInfo()) {
- default:
- MIRBuilder.buildCopy(ValVReg, PhysReg);
- break;
- case CCValAssign::LocInfo::SExt:
- case CCValAssign::LocInfo::ZExt:
- case CCValAssign::LocInfo::AExt: {
- auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
- MIRBuilder.buildTrunc(ValVReg, Copy);
- break;
- }
- }
+ IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
}
- void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
- // The reported memory location may be wider than the value.
- const LLT RegTy = MRI.getType(ValVReg);
- MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);
+ LLT ValTy(VA.getValVT());
+ LLT LocTy(VA.getLocVT());
+
+ // Fixup the types for the DAG compatibility hack.
+ if (VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16)
+ std::swap(ValTy, LocTy);
+ else {
+ // The calling code knows if this is a pointer or not, we're only touching
+ // the LocTy for the i8/i16 hack.
+ assert(LocTy.getSizeInBits() == MemTy.getSizeInBits());
+ LocTy = MemTy;
+ }
auto MMO = MF.getMachineMemOperand(
- MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize,
+ MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, LocTy,
inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
@@ -102,14 +188,11 @@
/// parameters (it's a basic-block live-in), and a call instruction
/// (it's an implicit-def of the BL).
virtual void markPhysRegUsed(MCRegister PhysReg) = 0;
-
- uint64_t StackUsed;
};
struct FormalArgHandler : public IncomingArgHandler {
- FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- CCAssignFn *AssignFn)
- : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {}
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+ : IncomingArgHandler(MIRBuilder, MRI) {}
void markPhysRegUsed(MCRegister PhysReg) override {
MIRBuilder.getMRI()->addLiveIn(PhysReg);
@@ -119,8 +202,8 @@
struct CallReturnHandler : public IncomingArgHandler {
CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB, CCAssignFn *AssignFn)
- : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+ MachineInstrBuilder MIB)
+ : IncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
void markPhysRegUsed(MCRegister PhysReg) override {
MIB.addDef(PhysReg, RegState::Implicit);
@@ -129,22 +212,34 @@
MachineInstrBuilder MIB;
};
+/// A special return arg handler for "returned" attribute arg calls.
+struct ReturnedArgCallReturnHandler : public CallReturnHandler {
+ ReturnedArgCallReturnHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI,
+ MachineInstrBuilder MIB)
+ : CallReturnHandler(MIRBuilder, MRI, MIB) {}
+
+ void markPhysRegUsed(MCRegister PhysReg) override {}
+};
+
struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB, CCAssignFn *AssignFn,
- CCAssignFn *AssignFnVarArg, bool IsTailCall = false,
+ MachineInstrBuilder MIB, bool IsTailCall = false,
int FPDiff = 0)
- : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
- AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff),
- StackSize(0), SPReg(0) {}
+ : OutgoingValueHandler(MIRBuilder, MRI), MIB(MIB), IsTailCall(IsTailCall),
+ FPDiff(FPDiff),
+ Subtarget(MIRBuilder.getMF().getSubtarget<AArch64Subtarget>()) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
- MachinePointerInfo &MPO) override {
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
MachineFunction &MF = MIRBuilder.getMF();
LLT p0 = LLT::pointer(0, 64);
LLT s64 = LLT::scalar(64);
if (IsTailCall) {
+ assert(!Flags.isByVal() && "byval unhandled with tail calls");
+
Offset += FPDiff;
int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
auto FIReg = MIRBuilder.buildFrameIndex(p0, FI);
@@ -163,6 +258,17 @@
return AddrReg.getReg(0);
}
+ /// We need to fixup the reported store size for certain value types because
+ /// we invert the interpretation of ValVT and LocVT in certain cases. This is
+ /// for compatability with the DAG call lowering implementation, which we're
+ /// for compatibility with the DAG call lowering implementation, which we're
+ /// currently building on top of.
+ LLT getStackValueStoreType(const DataLayout &DL, const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const override {
+ if (Flags.isPointer())
+ return CallLowering::ValueHandler::getStackValueStoreType(DL, VA, Flags);
+ return getStackValueStoreTypeHack(VA);
+ }
+
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
@@ -170,105 +276,60 @@
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
- void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
- auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, Size,
+ auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, MemTy,
inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildStore(ValVReg, Addr, *MMO);
}
- void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
- uint64_t Size, MachinePointerInfo &MPO,
+ void assignValueToAddress(const CallLowering::ArgInfo &Arg, unsigned RegIndex,
+ Register Addr, LLT MemTy, MachinePointerInfo &MPO,
CCValAssign &VA) override {
- unsigned MaxSize = Size * 8;
+ unsigned MaxSize = MemTy.getSizeInBytes() * 8;
// For varargs, we always want to extend them to 8 bytes, in which case
// we disable setting a max.
if (!Arg.IsFixed)
MaxSize = 0;
- assert(Arg.Regs.size() == 1);
+ Register ValVReg = Arg.Regs[RegIndex];
+ if (VA.getLocInfo() != CCValAssign::LocInfo::FPExt) {
+ MVT LocVT = VA.getLocVT();
+ MVT ValVT = VA.getValVT();
- Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
- ? extendRegister(Arg.Regs[0], VA, MaxSize)
- : Arg.Regs[0];
+ if (VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16) {
+ std::swap(ValVT, LocVT);
+ MemTy = LLT(VA.getValVT());
+ }
- // If we extended we might need to adjust the MMO's Size.
- const LLT RegTy = MRI.getType(ValVReg);
- if (RegTy.getSizeInBytes() > Size)
- Size = RegTy.getSizeInBytes();
+ ValVReg = extendRegister(ValVReg, VA, MaxSize);
+ } else {
+ // The store does not cover the full allocated stack slot.
+ MemTy = LLT(VA.getValVT());
+ }
- assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
- }
-
- bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info,
- ISD::ArgFlagsTy Flags,
- CCState &State) override {
- bool Res;
- if (Info.IsFixed)
- Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
- else
- Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State);
-
- StackSize = State.getNextStackOffset();
- return Res;
+ assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
}
MachineInstrBuilder MIB;
- CCAssignFn *AssignFnVarArg;
+
bool IsTailCall;
/// For tail calls, the byte offset of the call's argument area from the
/// callee's. Unused elsewhere.
int FPDiff;
- uint64_t StackSize;
// Cache the SP register vreg if we need it more than once in this call site.
Register SPReg;
+
+ const AArch64Subtarget &Subtarget;
};
} // namespace
static bool doesCalleeRestoreStack(CallingConv::ID CallConv, bool TailCallOpt) {
- return CallConv == CallingConv::Fast && TailCallOpt;
-}
-
-void AArch64CallLowering::splitToValueTypes(
- const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const {
- const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
- LLVMContext &Ctx = OrigArg.Ty->getContext();
-
- SmallVector<EVT, 4> SplitVTs;
- SmallVector<uint64_t, 4> Offsets;
- ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
-
- if (SplitVTs.size() == 0)
- return;
-
- if (SplitVTs.size() == 1) {
- // No splitting to do, but we want to replace the original type (e.g. [1 x
- // double] -> double).
- SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
- OrigArg.Flags[0], OrigArg.IsFixed);
- return;
- }
-
- // Create one ArgInfo for each virtual register in the original ArgInfo.
- assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch");
-
- bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
- OrigArg.Ty, CallConv, false);
- for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
- Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
- SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0],
- OrigArg.IsFixed);
- if (NeedsRegBlock)
- SplitArgs.back().Flags[0].setInConsecutiveRegs();
- }
-
- SplitArgs.back().Flags[0].setInConsecutiveRegsLast();
+ return (CallConv == CallingConv::Fast && TailCallOpt) ||
+ CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
}
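A standalone restatement of the rule above: the callee pops its own argument area for fastcc only under -tailcallopt, and always for the tail / swifttail conventions. The enum below is an illustrative stand-in, not LLVM's CallingConv.

enum class CC { C, Fast, Tail, SwiftTail };

static bool calleeRestoresStack(CC CallConv, bool GuaranteedTailCallOpt) {
  return (CallConv == CC::Fast && GuaranteedTailCallOpt) ||
         CallConv == CC::Tail || CallConv == CC::SwiftTail;
}

// e.g. fastcc without -tailcallopt -> false; swifttailcc -> always true.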
bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -284,6 +345,7 @@
if (!VRegs.empty()) {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
@@ -300,20 +362,16 @@
CallingConv::ID CC = F.getCallingConv();
for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
- if (TLI.getNumRegistersForCallingConv(Ctx, CC, SplitEVTs[i]) > 1) {
- LLVM_DEBUG(dbgs() << "Can't handle extended arg types which need split");
- return false;
- }
-
Register CurVReg = VRegs[i];
- ArgInfo CurArgInfo = ArgInfo{CurVReg, SplitEVTs[i].getTypeForEVT(Ctx)};
+ ArgInfo CurArgInfo = ArgInfo{CurVReg, SplitEVTs[i].getTypeForEVT(Ctx), 0};
setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
// i1 is a special case because SDAG i1 true is naturally zero extended
// when widened using ANYEXT. We need to do it explicitly here.
if (MRI.getType(CurVReg).getSizeInBits() == 1) {
CurVReg = MIRBuilder.buildZExt(LLT::scalar(8), CurVReg).getReg(0);
- } else {
+ } else if (TLI.getNumRegistersForCallingConv(Ctx, CC, SplitEVTs[i]) ==
+ 1) {
// Some types will need extending as specified by the CC.
MVT NewVT = TLI.getRegisterTypeForCallingConv(Ctx, CC, SplitEVTs[i]);
if (EVT(NewVT) != SplitEVTs[i]) {
@@ -357,13 +415,18 @@
.buildBuildVector({NewLLT}, {CurVReg, Undef.getReg(0)})
.getReg(0);
} else {
- LLVM_DEBUG(dbgs() << "Could not handle ret ty");
+ LLVM_DEBUG(dbgs() << "Could not handle ret ty\n");
return false;
}
} else {
- // A scalar extend.
- CurVReg =
- MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg}).getReg(0);
+ // If the split EVT was a <1 x T> vector, and NewVT is T, then we
+ // don't have to do anything since we don't distinguish between the
+ // two.
+ if (NewLLT != MRI.getType(CurVReg)) {
+ // A scalar extend.
+ CurVReg = MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg})
+ .getReg(0);
+ }
}
}
}
@@ -372,11 +435,14 @@
// Reset the arg flags after modifying CurVReg.
setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
}
- splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI, CC);
+ splitToValueTypes(CurArgInfo, SplitArgs, DL, CC);
}
- OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn);
- Success = handleAssignments(MIRBuilder, SplitArgs, Handler);
+ AArch64OutgoingValueAssigner Assigner(AssignFn, AssignFn, Subtarget,
+ /*IsReturn*/ true);
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
+ Success = determineAndHandleAssignments(Handler, Assigner, SplitArgs,
+ MIRBuilder, CC, F.isVarArg());
}
if (SwiftErrorVReg) {
@@ -431,12 +497,20 @@
}
}
-bool AArch64CallLowering::fallBackToDAGISel(const Function &F) const {
+bool AArch64CallLowering::fallBackToDAGISel(const MachineFunction &MF) const {
+ auto &F = MF.getFunction();
if (isa<ScalableVectorType>(F.getReturnType()))
return true;
- return llvm::any_of(F.args(), [](const Argument &A) {
- return isa<ScalableVectorType>(A.getType());
- });
+ if (llvm::any_of(F.args(), [](const Argument &A) {
+ return isa<ScalableVectorType>(A.getType());
+ }))
+ return true;
+ const auto &ST = MF.getSubtarget<AArch64Subtarget>();
+ if (!ST.hasNEON() || !ST.hasFPARMv8()) {
+ LLVM_DEBUG(dbgs() << "Falling back to SDAG because we don't support no-NEON\n");
+ return true;
+ }
+ return false;
}
bool AArch64CallLowering::lowerFormalArguments(
@@ -453,10 +527,13 @@
if (DL.getTypeStoreSize(Arg.getType()).isZero())
continue;
- ArgInfo OrigArg{VRegs[i], Arg.getType()};
+ ArgInfo OrigArg{VRegs[i], Arg, i};
setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F);
- splitToValueTypes(OrigArg, SplitArgs, DL, MRI, F.getCallingConv());
+ if (Arg.hasAttribute(Attribute::SwiftAsync))
+ MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
+
+ splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv());
++i;
}
@@ -467,12 +544,14 @@
CCAssignFn *AssignFn =
TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
- FormalArgHandler Handler(MIRBuilder, MRI, AssignFn);
- if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ AArch64IncomingValueAssigner Assigner(AssignFn, AssignFn);
+ FormalArgHandler Handler(MIRBuilder, MRI);
+ if (!determineAndHandleAssignments(Handler, Assigner, SplitArgs, MIRBuilder,
+ F.getCallingConv(), F.isVarArg()))
return false;
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
- uint64_t StackOffset = Handler.StackUsed;
+ uint64_t StackOffset = Assigner.StackOffset;
if (F.isVarArg()) {
auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
if (!Subtarget.isTargetDarwin()) {
@@ -482,7 +561,8 @@
}
// We currently pass all varargs at 8-byte alignment, or 4 in ILP32.
- StackOffset = alignTo(Handler.StackUsed, Subtarget.isTargetILP32() ? 4 : 8);
+ StackOffset =
+ alignTo(Assigner.StackOffset, Subtarget.isTargetILP32() ? 4 : 8);
auto &MFI = MIRBuilder.getMF().getFrameInfo();
FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
@@ -521,8 +601,9 @@
}
/// Return true if the calling convention is one that we can guarantee TCO for.
-static bool canGuaranteeTCO(CallingConv::ID CC) {
- return CC == CallingConv::Fast;
+static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
+ return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
+ CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
}
/// Return true if we might ever do TCO for calls with this calling convention.
@@ -531,9 +612,12 @@
case CallingConv::C:
case CallingConv::PreserveMost:
case CallingConv::Swift:
+ case CallingConv::SwiftTail:
+ case CallingConv::Tail:
+ case CallingConv::Fast:
return true;
default:
- return canGuaranteeTCO(CC);
+ return false;
}
}
@@ -567,9 +651,12 @@
std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
getAssignFnsForCC(CallerCC, TLI);
- if (!resultsCompatible(Info, MF, InArgs, *CalleeAssignFnFixed,
- *CalleeAssignFnVarArg, *CallerAssignFnFixed,
- *CallerAssignFnVarArg))
+ AArch64IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
+ CalleeAssignFnVarArg);
+ AArch64IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
+ CallerAssignFnVarArg);
+
+ if (!resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner))
return false;
// Make sure that the caller and callee preserve all of the same registers.
@@ -592,9 +679,11 @@
return true;
const Function &CallerF = MF.getFunction();
+ LLVMContext &Ctx = CallerF.getContext();
CallingConv::ID CalleeCC = Info.CallConv;
CallingConv::ID CallerCC = CallerF.getCallingConv();
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
CCAssignFn *AssignFnFixed;
CCAssignFn *AssignFnVarArg;
@@ -602,9 +691,11 @@
// We have outgoing arguments. Make sure that we can tail call with them.
SmallVector<CCValAssign, 16> OutLocs;
- CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
+ CCState OutInfo(CalleeCC, false, MF, OutLocs, Ctx);
- if (!analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg)) {
+ AArch64OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg,
+ Subtarget, /*IsReturn*/ false);
+ if (!determineAssignments(CalleeAssigner, OutArgs, OutInfo)) {
LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
return false;
}
@@ -718,8 +809,8 @@
}
// If we have -tailcallopt, then we're done.
- if (MF.getTarget().Options.GuaranteedTailCallOpt)
- return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();
+ if (canGuaranteeTCO(CalleeCC, MF.getTarget().Options.GuaranteedTailCallOpt))
+ return CalleeCC == CallerF.getCallingConv();
// We don't have -tailcallopt, so we're allowed to change the ABI (sibcall).
// Try to find cases where we can do that.
@@ -762,6 +853,24 @@
return AArch64::TCRETURNri;
}
+static const uint32_t *
+getMaskForArgs(SmallVectorImpl<AArch64CallLowering::ArgInfo> &OutArgs,
+ AArch64CallLowering::CallLoweringInfo &Info,
+ const AArch64RegisterInfo &TRI, MachineFunction &MF) {
+ const uint32_t *Mask;
+ if (!OutArgs.empty() && OutArgs[0].Flags[0].isReturned()) {
+ // For 'this' returns, use the X0-preserving mask if applicable
+ Mask = TRI.getThisReturnPreservedMask(MF, Info.CallConv);
+ if (!Mask) {
+ OutArgs[0].Flags[0].setReturned(false);
+ Mask = TRI.getCallPreservedMask(MF, Info.CallConv);
+ }
+ } else {
+ Mask = TRI.getCallPreservedMask(MF, Info.CallConv);
+ }
+ return Mask;
+}
+
bool AArch64CallLowering::lowerTailCall(
MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
SmallVectorImpl<ArgInfo> &OutArgs) const {
@@ -772,7 +881,9 @@
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
// True when we're tail calling, but without -tailcallopt.
- bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
+ bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt &&
+ Info.CallConv != CallingConv::Tail &&
+ Info.CallConv != CallingConv::SwiftTail;
// TODO: Right now, regbankselect doesn't know how to handle the rtcGPR64
// register class. Until we can do that, we should fall back here.
@@ -801,9 +912,10 @@
MIB.addImm(0);
// Tell the call which registers are clobbered.
- auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ auto TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
- if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv())
+ if (Subtarget.hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(MF, &Mask);
MIB.addRegMask(Mask);
@@ -828,7 +940,11 @@
unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
SmallVector<CCValAssign, 16> OutLocs;
CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
- analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg);
+
+ AArch64OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg,
+ Subtarget, /*IsReturn*/ false);
+ if (!determineAssignments(CalleeAssigner, OutArgs, OutInfo))
+ return false;
// The callee will pop the argument stack as a tail call. Thus, we must
// keep it 16-byte aligned.
@@ -839,6 +955,11 @@
// actually shrink the stack.
FPDiff = NumReusableBytes - NumBytes;
+ // Update the required reserved area if this is the tail call requiring the
+ // most argument stack space.
+ if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
+ FuncInfo->setTailCallReservedStack(-FPDiff);
+
// The stack pointer must be 16-byte aligned at all times it's used for a
// memory operation, which in practice means at *all* times and in
// particular across call boundaries. Therefore our own arguments started at
@@ -849,12 +970,18 @@
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
+ AArch64OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg,
+ Subtarget, /*IsReturn*/ false);
+
// Do the actual argument marshalling.
- OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
- AssignFnVarArg, true, FPDiff);
- if (!handleAssignments(MIRBuilder, OutArgs, Handler))
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB,
+ /*IsTailCall*/ true, FPDiff);
+ if (!determineAndHandleAssignments(Handler, Assigner, OutArgs, MIRBuilder,
+ CalleeCC, Info.IsVarArg))
return false;
+ Mask = getMaskForArgs(OutArgs, Info, *TRI, MF);
+
if (Info.IsVarArg && Info.IsMustTailCall) {
// Now we know what's being passed to the function. Add uses to the call for
// the forwarded registers that we *aren't* passing as parameters. This will
@@ -880,12 +1007,12 @@
// sequence start and end here.
if (!IsSibCall) {
MIB->getOperand(1).setImm(FPDiff);
- CallSeqStart.addImm(NumBytes).addImm(0);
+ CallSeqStart.addImm(0).addImm(0);
// End the call sequence *before* emitting the call. Normally, we would
// tidy the frame up after the call. However, here, we've laid out the
// parameters so that when SP is reset, they will be in the correct
// location.
- MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP).addImm(NumBytes).addImm(0);
+ MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP).addImm(0).addImm(0);
}
// Now we can add the actual call instruction to the correct basic block.
@@ -913,7 +1040,7 @@
SmallVector<ArgInfo, 8> OutArgs;
for (auto &OrigArg : Info.OrigArgs) {
- splitToValueTypes(OrigArg, OutArgs, DL, MRI, Info.CallConv);
+ splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);
// AAPCS requires that we zero-extend i1 to 8 bits by the caller.
if (OrigArg.Ty->isIntegerTy(1))
OutArgs.back().Flags[0].setZExt();
@@ -921,7 +1048,7 @@
SmallVector<ArgInfo, 8> InArgs;
if (!Info.OrigRet.Ty->isVoidTy())
- splitToValueTypes(Info.OrigRet, InArgs, DL, MRI, F.getCallingConv());
+ splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);
// If we can lower as a tail call, do that instead.
bool CanTailCallOpt =
@@ -956,8 +1083,20 @@
MIB.add(Info.Callee);
// Tell the call which registers are clobbered.
- auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
- const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
+ const uint32_t *Mask;
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const auto *TRI = Subtarget.getRegisterInfo();
+
+ AArch64OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg,
+ Subtarget, /*IsReturn*/ false);
+ // Do the actual argument marshalling.
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, /*IsReturn*/ false);
+ if (!determineAndHandleAssignments(Handler, Assigner, OutArgs, MIRBuilder,
+ Info.CallConv, Info.IsVarArg))
+ return false;
+
+ Mask = getMaskForArgs(OutArgs, Info, *TRI, MF);
+
if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(MF, &Mask);
MIB.addRegMask(Mask);
@@ -965,12 +1104,6 @@
if (TRI->isAnyArgRegReserved(MF))
TRI->emitReservedArgRegCallError(MF);
- // Do the actual argument marshalling.
- OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
- AssignFnVarArg, false);
- if (!handleAssignments(MIRBuilder, OutArgs, Handler))
- return false;
-
// Now we can add the actual call instruction to the correct basic block.
MIRBuilder.insertInstr(MIB);
@@ -978,17 +1111,26 @@
// instruction, it must have a register class matching the
// constraint of that instruction.
if (Info.Callee.isReg())
- constrainOperandRegClass(MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
- *MF.getSubtarget().getRegBankInfo(), *MIB,
- MIB->getDesc(), Info.Callee, 0);
+ constrainOperandRegClass(MF, *TRI, MRI, *Subtarget.getInstrInfo(),
+ *Subtarget.getRegBankInfo(), *MIB, MIB->getDesc(),
+ Info.Callee, 0);
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
// implicit-define of the call instruction.
if (!Info.OrigRet.Ty->isVoidTy()) {
CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv);
- CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
- if (!handleAssignments(MIRBuilder, InArgs, Handler))
+ CallReturnHandler Handler(MIRBuilder, MRI, MIB);
+ bool UsingReturnedArg =
+ !OutArgs.empty() && OutArgs[0].Flags[0].isReturned();
+
+ AArch64OutgoingValueAssigner Assigner(RetAssignFn, RetAssignFn, Subtarget,
+ /*IsReturn*/ false);
+ ReturnedArgCallReturnHandler ReturnedArgHandler(MIRBuilder, MRI, MIB);
+ if (!determineAndHandleAssignments(
+ UsingReturnedArg ? ReturnedArgHandler : Handler, Assigner, InArgs,
+ MIRBuilder, Info.CallConv, Info.IsVarArg,
+ UsingReturnedArg ? OutArgs[0].Regs[0] : Register()))
return false;
}
@@ -1000,13 +1142,17 @@
uint64_t CalleePopBytes =
doesCalleeRestoreStack(Info.CallConv,
MF.getTarget().Options.GuaranteedTailCallOpt)
- ? alignTo(Handler.StackSize, 16)
+ ? alignTo(Assigner.StackOffset, 16)
: 0;
- CallSeqStart.addImm(Handler.StackSize).addImm(0);
+ CallSeqStart.addImm(Assigner.StackOffset).addImm(0);
MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP)
- .addImm(Handler.StackSize)
+ .addImm(Assigner.StackOffset)
.addImm(CalleePopBytes);
return true;
}
+
+bool AArch64CallLowering::isTypeIsValidForThisReturn(EVT Ty) const {
+ return Ty.getSizeInBits() == 64;
+}
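With the handler no longer tracking stack size, the call-sequence immediates above come from the value assigner, and the callee-pop amount is that stack offset rounded up to 16 bytes only when the convention restores the stack. A minimal sketch of that arithmetic (illustrative helpers, not the LLVM API):

#include <cstdint>
#include <iostream>

// Round Value up to a multiple of 16, as llvm::alignTo(Value, 16) would.
uint64_t alignTo16(uint64_t Value) { return (Value + 15) & ~uint64_t(15); }

// The callee only pops its argument area for conventions that restore the
// stack (e.g. with GuaranteedTailCallOpt); otherwise the caller cleans up.
uint64_t calleePopBytes(uint64_t AssignerStackOffset, bool CalleeRestoresStack) {
  return CalleeRestoresStack ? alignTo16(AssignerStackOffset) : 0;
}

int main() {
  std::cout << calleePopBytes(20, true) << '\n';  // 32
  std::cout << calleePopBytes(20, false) << '\n'; // 0
}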
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
index 1f45c9e..add0342 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
@@ -37,7 +37,7 @@
ArrayRef<Register> VRegs, FunctionLoweringInfo &FLI,
Register SwiftErrorVReg) const override;
- bool fallBackToDAGISel(const Function &F) const override;
+ bool fallBackToDAGISel(const MachineFunction &MF) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs,
@@ -55,6 +55,8 @@
bool supportSwiftError() const override { return true; }
+ bool isTypeIsValidForThisReturn(EVT Ty) const override;
+
private:
using RegHandler = std::function<void(MachineIRBuilder &, Type *, unsigned,
CCValAssign &)>;
@@ -62,11 +64,6 @@
using MemHandler =
std::function<void(MachineIRBuilder &, int, CCValAssign &)>;
- void splitToValueTypes(const ArgInfo &OrigArgInfo,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, MachineRegisterInfo &MRI,
- CallingConv::ID CallConv) const;
-
bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
SmallVectorImpl<ArgInfo> &OutArgs) const;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
new file mode 100644
index 0000000..08d1c98
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
@@ -0,0 +1,180 @@
+//===- AArch64GlobalISelUtils.cpp --------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file Implementations of AArch64-specific helper functions used in the
+/// GlobalISel pipeline.
+//===----------------------------------------------------------------------===//
+#include "AArch64GlobalISelUtils.h"
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+Optional<RegOrConstant>
+AArch64GISelUtils::getAArch64VectorSplat(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ if (auto Splat = getVectorSplat(MI, MRI))
+ return Splat;
+ if (MI.getOpcode() != AArch64::G_DUP)
+ return None;
+ Register Src = MI.getOperand(1).getReg();
+ if (auto ValAndVReg =
+ getConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI))
+ return RegOrConstant(ValAndVReg->Value.getSExtValue());
+ return RegOrConstant(Src);
+}
+
+Optional<int64_t>
+AArch64GISelUtils::getAArch64VectorSplatScalar(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ auto Splat = getAArch64VectorSplat(MI, MRI);
+ if (!Splat || Splat->isReg())
+ return None;
+ return Splat->getCst();
+}
+
+bool AArch64GISelUtils::isCMN(const MachineInstr *MaybeSub,
+ const CmpInst::Predicate &Pred,
+ const MachineRegisterInfo &MRI) {
+ // Match:
+ //
+ // %sub = G_SUB 0, %y
+ // %cmp = G_ICMP eq/ne, %sub, %z
+ //
+ // Or
+ //
+ // %sub = G_SUB 0, %y
+ // %cmp = G_ICMP eq/ne, %z, %sub
+ if (!MaybeSub || MaybeSub->getOpcode() != TargetOpcode::G_SUB ||
+ !CmpInst::isEquality(Pred))
+ return false;
+ auto MaybeZero =
+ getConstantVRegValWithLookThrough(MaybeSub->getOperand(1).getReg(), MRI);
+ return MaybeZero && MaybeZero->Value.getZExtValue() == 0;
+}
+
+bool AArch64GISelUtils::tryEmitBZero(MachineInstr &MI,
+ MachineIRBuilder &MIRBuilder,
+ bool MinSize) {
+ assert(MI.getOpcode() == TargetOpcode::G_MEMSET);
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+ auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
+ if (!TLI.getLibcallName(RTLIB::BZERO))
+ return false;
+ auto Zero = getConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI);
+ if (!Zero || Zero->Value.getSExtValue() != 0)
+ return false;
+
+ // It's not faster to use bzero rather than memset for sizes <= 256.
+ // However, it *does* save us a mov from wzr, so if we're going for
+ // minsize, use bzero even if it's slower.
+ if (!MinSize) {
+ // If the size is known, check it. If it is not known, assume using bzero is
+ // better.
+ if (auto Size =
+ getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI)) {
+ if (Size->Value.getSExtValue() <= 256)
+ return false;
+ }
+ }
+
+ MIRBuilder.setInstrAndDebugLoc(MI);
+ MIRBuilder
+ .buildInstr(TargetOpcode::G_BZERO, {},
+ {MI.getOperand(0), MI.getOperand(2)})
+ .addImm(MI.getOperand(3).getImm())
+ .addMemOperand(*MI.memoperands_begin());
+ MI.eraseFromParent();
+ return true;
+}
+
+void AArch64GISelUtils::changeFCMPPredToAArch64CC(
+ const CmpInst::Predicate P, AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2) {
+ CondCode2 = AArch64CC::AL;
+ switch (P) {
+ default:
+ llvm_unreachable("Unknown FP condition!");
+ case CmpInst::FCMP_OEQ:
+ CondCode = AArch64CC::EQ;
+ break;
+ case CmpInst::FCMP_OGT:
+ CondCode = AArch64CC::GT;
+ break;
+ case CmpInst::FCMP_OGE:
+ CondCode = AArch64CC::GE;
+ break;
+ case CmpInst::FCMP_OLT:
+ CondCode = AArch64CC::MI;
+ break;
+ case CmpInst::FCMP_OLE:
+ CondCode = AArch64CC::LS;
+ break;
+ case CmpInst::FCMP_ONE:
+ CondCode = AArch64CC::MI;
+ CondCode2 = AArch64CC::GT;
+ break;
+ case CmpInst::FCMP_ORD:
+ CondCode = AArch64CC::VC;
+ break;
+ case CmpInst::FCMP_UNO:
+ CondCode = AArch64CC::VS;
+ break;
+ case CmpInst::FCMP_UEQ:
+ CondCode = AArch64CC::EQ;
+ CondCode2 = AArch64CC::VS;
+ break;
+ case CmpInst::FCMP_UGT:
+ CondCode = AArch64CC::HI;
+ break;
+ case CmpInst::FCMP_UGE:
+ CondCode = AArch64CC::PL;
+ break;
+ case CmpInst::FCMP_ULT:
+ CondCode = AArch64CC::LT;
+ break;
+ case CmpInst::FCMP_ULE:
+ CondCode = AArch64CC::LE;
+ break;
+ case CmpInst::FCMP_UNE:
+ CondCode = AArch64CC::NE;
+ break;
+ }
+}
+
+void AArch64GISelUtils::changeVectorFCMPPredToAArch64CC(
+ const CmpInst::Predicate P, AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2, bool &Invert) {
+ Invert = false;
+ switch (P) {
+ default:
+ // Mostly the scalar mappings work fine.
+ changeFCMPPredToAArch64CC(P, CondCode, CondCode2);
+ break;
+ case CmpInst::FCMP_UNO:
+ Invert = true;
+ LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_ORD:
+ CondCode = AArch64CC::MI;
+ CondCode2 = AArch64CC::GE;
+ break;
+ case CmpInst::FCMP_UEQ:
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_ULE:
+ case CmpInst::FCMP_UGT:
+ case CmpInst::FCMP_UGE:
+ // All of the compare-mask comparisons are ordered, but we can switch
+ // between the two by a double inversion. E.g. ULE == !OGT.
+ Invert = true;
+ changeFCMPPredToAArch64CC(CmpInst::getInversePredicate(P), CondCode,
+ CondCode2);
+ break;
+ }
+}
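changeFCMPPredToAArch64CC above is a pure table lookup: every IR floating-point predicate maps to one AArch64 condition code, with ONE and UEQ needing a second code, and the vector variant handles the unordered predicates by inverting them and setting an Invert flag. A standalone model of the scalar table, using local enums as simplified stand-ins for CmpInst::Predicate and AArch64CC::CondCode:

#include <cassert>
#include <iostream>
#include <utility>

// Local stand-ins for CmpInst::Predicate and AArch64CC::CondCode.
enum class FPred { OEQ, OGT, OGE, OLT, OLE, ONE, ORD, UNO,
                   UEQ, UGT, UGE, ULT, ULE, UNE };
enum class CC { EQ, NE, GT, GE, LT, LE, MI, LS, HI, PL, VS, VC, AL };

// Scalar mapping: most predicates need one condition code; ONE and UEQ need two.
std::pair<CC, CC> fcmpToCC(FPred P) {
  switch (P) {
  case FPred::OEQ: return {CC::EQ, CC::AL};
  case FPred::OGT: return {CC::GT, CC::AL};
  case FPred::OGE: return {CC::GE, CC::AL};
  case FPred::OLT: return {CC::MI, CC::AL};
  case FPred::OLE: return {CC::LS, CC::AL};
  case FPred::ONE: return {CC::MI, CC::GT}; // true if less-than OR greater-than
  case FPred::ORD: return {CC::VC, CC::AL};
  case FPred::UNO: return {CC::VS, CC::AL};
  case FPred::UEQ: return {CC::EQ, CC::VS}; // equal OR unordered
  case FPred::UGT: return {CC::HI, CC::AL};
  case FPred::UGE: return {CC::PL, CC::AL};
  case FPred::ULT: return {CC::LT, CC::AL};
  case FPred::ULE: return {CC::LE, CC::AL};
  case FPred::UNE: return {CC::NE, CC::AL};
  }
  assert(false && "unknown predicate");
  return {CC::AL, CC::AL};
}

int main() {
  auto [c1, c2] = fcmpToCC(FPred::UEQ);
  std::cout << (c1 == CC::EQ && c2 == CC::VS) << '\n'; // 1
}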
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h
index bed1136..f9a1ee1 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h
@@ -11,10 +11,17 @@
#ifndef LLVM_LIB_TARGET_AARCH64_GISEL_AARCH64GLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AARCH64_GISEL_AARCH64GLOBALISELUTILS_H
-
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/IR/InstrTypes.h"
#include <cstdint>
namespace llvm {
+
namespace AArch64GISelUtils {
/// \returns true if \p C is a legal immediate operand for an arithmetic
@@ -23,6 +30,51 @@
return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
}
+/// \returns A value when \p MI is a vector splat of a Register or constant.
+/// Checks for generic opcodes and AArch64-specific generic opcodes.
+Optional<RegOrConstant> getAArch64VectorSplat(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI);
+
+/// \returns A value when \p MI is a constant vector splat.
+/// Checks for generic opcodes and AArch64-specific generic opcodes.
+Optional<int64_t> getAArch64VectorSplatScalar(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI);
+
+/// \returns true if \p MaybeSub and \p Pred are part of a CMN tree for an
+/// integer compare.
+bool isCMN(const MachineInstr *MaybeSub, const CmpInst::Predicate &Pred,
+ const MachineRegisterInfo &MRI);
+
+/// Replace a G_MEMSET with a value of 0 with a G_BZERO instruction if it is
+/// supported and beneficial to do so.
+///
+/// \note This only applies on Darwin.
+///
+/// \returns true if \p MI was replaced with a G_BZERO.
+bool tryEmitBZero(MachineInstr &MI, MachineIRBuilder &MIRBuilder, bool MinSize);
+
+/// Find the AArch64 condition codes necessary to represent \p P for a scalar
+/// floating point comparison.
+///
+/// \param [out] CondCode is the first condition code.
+/// \param [out] CondCode2 is the second condition code if necessary.
+/// AArch64CC::AL otherwise.
+void changeFCMPPredToAArch64CC(const CmpInst::Predicate P,
+ AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2);
+
+/// Find the AArch64 condition codes necessary to represent \p P for a vector
+/// floating point comparison.
+///
+/// \param [out] CondCode - The first condition code.
+/// \param [out] CondCode2 - The second condition code if necessary.
+/// AArch64CC::AL otherwise.
+/// \param [out] Invert - True if the comparison must be inverted with a NOT.
+void changeVectorFCMPPredToAArch64CC(const CmpInst::Predicate P,
+ AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2,
+ bool &Invert);
+
} // namespace AArch64GISelUtils
} // namespace llvm
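tryEmitBZero, declared above, only fires when the target provides a bzero libcall (in practice Darwin), the stored value is a known zero, and either we are optimizing for size or the length is unknown or larger than 256 bytes. A minimal standalone model of that size heuristic (std::optional stands in for the constant lookups; illustrative only):

#include <cstdint>
#include <iostream>
#include <optional>

// Should a zero-valued memset be turned into a bzero call?
//   - Never, if the stored value is not a known zero.
//   - Always at minsize (saves materializing the zero operand from wzr).
//   - Otherwise only when the length is unknown or > 256 bytes, where bzero
//     is not expected to be slower than memset.
bool shouldUseBZero(std::optional<int64_t> StoredValue,
                    std::optional<int64_t> Length, bool MinSize) {
  if (!StoredValue || *StoredValue != 0)
    return false;
  if (MinSize)
    return true;
  return !Length || *Length > 256;
}

int main() {
  std::cout << shouldUseBZero(0, 128, false) << '\n';          // 0: small known size
  std::cout << shouldUseBZero(0, 128, true) << '\n';           // 1: minsize wins
  std::cout << shouldUseBZero(0, std::nullopt, false) << '\n'; // 1: unknown size
}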
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index fc5ef02..a982484 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -11,12 +11,14 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
+#include "AArch64GlobalISelUtils.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
+#include "AArch64GlobalISelUtils.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/Optional.h"
@@ -24,16 +26,17 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
-#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
@@ -46,6 +49,12 @@
using namespace llvm;
using namespace MIPatternMatch;
+using namespace AArch64GISelUtils;
+
+namespace llvm {
+class BlockFrequencyInfo;
+class ProfileSummaryInfo;
+}
namespace {
@@ -62,9 +71,11 @@
bool select(MachineInstr &I) override;
static const char *getName() { return DEBUG_TYPE; }
- void setupMF(MachineFunction &MF, GISelKnownBits &KB,
- CodeGenCoverage &CoverageInfo) override {
- InstructionSelector::setupMF(MF, KB, CoverageInfo);
+ void setupMF(MachineFunction &MF, GISelKnownBits *KB,
+ CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) override {
+ InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
+ MIB.setMF(MF);
// hasFnAttribute() is expensive to call on every BRCOND selection, so
// cache it here for each run of the selector.
@@ -85,12 +96,12 @@
bool preISelLower(MachineInstr &I);
// An early selection function that runs before the selectImpl() call.
- bool earlySelect(MachineInstr &I) const;
+ bool earlySelect(MachineInstr &I);
// Do some preprocessing of G_PHIs before we begin selection.
void processPHIs(MachineFunction &MF);
- bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
/// Eliminate same-sized cross-bank copies into stores before selectImpl().
bool contractCrossBankCopyIntoStore(MachineInstr &I,
@@ -117,10 +128,10 @@
///@}
bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
- MachineRegisterInfo &MRI) const;
+ MachineRegisterInfo &MRI);
- bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
+ bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
// Helper to generate an equivalent of scalar_to_vector into a new register,
// returned via 'Dst'.
@@ -139,28 +150,37 @@
Register EltReg, unsigned LaneIdx,
const RegisterBank &RB,
MachineIRBuilder &MIRBuilder) const;
- bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
- MachineRegisterInfo &MRI) const;
- bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool selectSplitVectorUnmerge(MachineInstr &I,
- MachineRegisterInfo &MRI) const;
+ /// Emit a sequence of instructions representing a constant \p CV for a
+ /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
+ ///
+ /// \returns the last instruction in the sequence on success, and nullptr
+ /// otherwise.
+ MachineInstr *emitConstantVector(Register Dst, Constant *CV,
+ MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI);
+
+ bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
+ bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
+ MachineRegisterInfo &MRI);
+ bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
+ bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
+ bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
+
+ bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
+ bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
+ bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
+ bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectIntrinsicWithSideEffects(MachineInstr &I,
- MachineRegisterInfo &MRI) const;
+ MachineRegisterInfo &MRI);
bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
- bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
+ bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
+ bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
+ bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
unsigned emitConstantPoolEntry(const Constant *CPVal,
MachineFunction &MF) const;
@@ -244,17 +264,12 @@
Register VecReg, unsigned LaneIdx,
MachineIRBuilder &MIRBuilder) const;
- /// Helper function for selecting G_FCONSTANT. If the G_FCONSTANT can be
- /// materialized using a FMOV instruction, then update MI and return it.
- /// Otherwise, do nothing and return a nullptr.
- MachineInstr *emitFMovForFConstant(MachineInstr &MI,
- MachineRegisterInfo &MRI) const;
-
/// Emit a CSet for an integer compare.
///
- /// \p DefReg is expected to be a 32-bit scalar register.
+ /// \p DefReg and \p SrcReg are expected to be 32-bit scalar registers.
MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &MIRBuilder,
+ Register SrcReg = AArch64::WZR) const;
/// Emit a CSet for a FP compare.
///
/// \p Dst is expected to be a 32-bit scalar register.
@@ -392,13 +407,18 @@
int OpIdx = -1) const;
void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
int OpIdx = -1) const;
+ void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx = -1) const;
+ void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx = -1) const;
+ void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx = -1) const;
// Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
- void materializeLargeCMVal(MachineInstr &I, const Value *V,
- unsigned OpFlags) const;
+ void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
// Optimization methods.
- bool tryOptSelect(MachineInstr &MI) const;
+ bool tryOptSelect(MachineInstr &MI);
MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const;
@@ -424,6 +444,8 @@
// clobbered by calls.
Register MFReturnAddr;
+ MachineIRBuilder MIB;
+
#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL
@@ -468,6 +490,8 @@
if (Ty.getSizeInBits() == 64)
return GetAllRegSet ? &AArch64::GPR64allRegClass
: &AArch64::GPR64RegClass;
+ if (Ty.getSizeInBits() == 128)
+ return &AArch64::XSeqPairsClassRegClass;
return nullptr;
}
@@ -500,6 +524,8 @@
if (SizeInBits == 64)
return GetAllRegSet ? &AArch64::GPR64allRegClass
: &AArch64::GPR64RegClass;
+ if (SizeInBits == 128)
+ return &AArch64::XSeqPairsClassRegClass;
}
if (RegBankID == AArch64::FPRRegBankID) {
@@ -562,6 +588,58 @@
}
}
+/// Create a REG_SEQUENCE instruction using the registers in \p Regs.
+/// Helper function for functions like createDTuple and createQTuple.
+///
+/// \p RegClassIDs - The list of register class IDs available for some tuple of
+/// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
+/// expected to contain between 2 and 4 tuple classes.
+///
+/// \p SubRegs - The list of subregister classes associated with each register
+/// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
+/// subregister class. The index of each subregister class is expected to
+/// correspond with the index of each register class.
+///
+/// \returns Either the destination register of the REG_SEQUENCE instruction
+/// that was created, or the 0th element of \p Regs if \p Regs contains a single
+/// element.
+static Register createTuple(ArrayRef<Register> Regs,
+ const unsigned RegClassIDs[],
+ const unsigned SubRegs[], MachineIRBuilder &MIB) {
+ unsigned NumRegs = Regs.size();
+ if (NumRegs == 1)
+ return Regs[0];
+ assert(NumRegs >= 2 && NumRegs <= 4 &&
+ "Only support between two and 4 registers in a tuple!");
+ const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
+ auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
+ auto RegSequence =
+ MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
+ for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
+ RegSequence.addUse(Regs[I]);
+ RegSequence.addImm(SubRegs[I]);
+ }
+ return RegSequence.getReg(0);
+}
+
+/// Create a tuple of D-registers using the registers in \p Regs.
+static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
+ static const unsigned RegClassIDs[] = {
+ AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
+ static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
+ AArch64::dsub2, AArch64::dsub3};
+ return createTuple(Regs, RegClassIDs, SubRegs, MIB);
+}
+
+/// Create a tuple of Q-registers using the registers in \p Regs.
+static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
+ static const unsigned RegClassIDs[] = {
+ AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
+ static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
+ AArch64::qsub2, AArch64::qsub3};
+ return createTuple(Regs, RegClassIDs, SubRegs, MIB);
+}
+
static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
auto &MI = *Root.getParent();
auto &MBB = *MI.getParent();
@@ -865,8 +943,8 @@
#ifndef NDEBUG
ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
assert(ValidCopy && "Invalid copy.");
- (void)KnownValid;
#endif
+ (void)KnownValid;
return ValidCopy;
};
@@ -932,6 +1010,15 @@
<< " operand\n");
return false;
}
+
+ // If this is a GPR ZEXT, we want to just reduce it down into a copy.
+ // The sizes will be mismatched with the source < 32b but that's ok.
+ if (I.getOpcode() == TargetOpcode::G_ZEXT) {
+ I.setDesc(TII.get(AArch64::COPY));
+ assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
+ return selectCopy(I, TII, MRI, TRI, RBI);
+ }
+
I.setDesc(TII.get(AArch64::COPY));
return CheckCopy();
}
@@ -1085,7 +1172,9 @@
//
// Into:
// %select = CSINC %reg, %x, cc
- if (mi_match(Reg, MRI, m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)))) {
+ if (mi_match(Reg, MRI,
+ m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
+ m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
Reg = MatchReg;
if (Invert) {
@@ -1208,60 +1297,6 @@
}
}
-static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
- AArch64CC::CondCode &CondCode,
- AArch64CC::CondCode &CondCode2) {
- CondCode2 = AArch64CC::AL;
- switch (P) {
- default:
- llvm_unreachable("Unknown FP condition!");
- case CmpInst::FCMP_OEQ:
- CondCode = AArch64CC::EQ;
- break;
- case CmpInst::FCMP_OGT:
- CondCode = AArch64CC::GT;
- break;
- case CmpInst::FCMP_OGE:
- CondCode = AArch64CC::GE;
- break;
- case CmpInst::FCMP_OLT:
- CondCode = AArch64CC::MI;
- break;
- case CmpInst::FCMP_OLE:
- CondCode = AArch64CC::LS;
- break;
- case CmpInst::FCMP_ONE:
- CondCode = AArch64CC::MI;
- CondCode2 = AArch64CC::GT;
- break;
- case CmpInst::FCMP_ORD:
- CondCode = AArch64CC::VC;
- break;
- case CmpInst::FCMP_UNO:
- CondCode = AArch64CC::VS;
- break;
- case CmpInst::FCMP_UEQ:
- CondCode = AArch64CC::EQ;
- CondCode2 = AArch64CC::VS;
- break;
- case CmpInst::FCMP_UGT:
- CondCode = AArch64CC::HI;
- break;
- case CmpInst::FCMP_UGE:
- CondCode = AArch64CC::PL;
- break;
- case CmpInst::FCMP_ULT:
- CondCode = AArch64CC::LT;
- break;
- case CmpInst::FCMP_ULE:
- CondCode = AArch64CC::LE;
- break;
- case CmpInst::FCMP_UNE:
- CondCode = AArch64CC::NE;
- break;
- }
-}
-
/// Return a register which can be used as a bit to test in a TB(N)Z.
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
MachineRegisterInfo &MRI) {
@@ -1605,7 +1640,7 @@
}
bool AArch64InstructionSelector::selectCompareBranch(
- MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+ MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
Register CondReg = I.getOperand(0).getReg();
MachineInstr *CCMI = MRI.getVRegDef(CondReg);
if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) {
@@ -1615,7 +1650,6 @@
// Try to select the G_BRCOND using whatever is feeding the condition if
// possible.
- MachineIRBuilder MIB(I);
unsigned CCMIOpc = CCMI->getOpcode();
if (CCMIOpc == TargetOpcode::G_FCMP)
return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
@@ -1650,23 +1684,7 @@
assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
MachineInstr *OpMI = MRI.getVRegDef(Reg);
assert(OpMI && "Expected to find a vreg def for vector shift operand");
- if (OpMI->getOpcode() != TargetOpcode::G_BUILD_VECTOR)
- return None;
-
- // Check all operands are identical immediates.
- int64_t ImmVal = 0;
- for (unsigned Idx = 1; Idx < OpMI->getNumOperands(); ++Idx) {
- auto VRegAndVal = getConstantVRegValWithLookThrough(OpMI->getOperand(Idx).getReg(), MRI);
- if (!VRegAndVal)
- return None;
-
- if (Idx == 1)
- ImmVal = VRegAndVal->Value.getSExtValue();
- if (ImmVal != VRegAndVal->Value.getSExtValue())
- return None;
- }
-
- return ImmVal;
+ return getAArch64VectorSplatScalar(*OpMI, MRI);
}
/// Matches and returns the shift immediate value for a SHL instruction given
@@ -1703,8 +1721,8 @@
return Imm;
}
-bool AArch64InstructionSelector::selectVectorSHL(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
+ MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_SHL);
Register DstReg = I.getOperand(0).getReg();
const LLT Ty = MRI.getType(DstReg);
@@ -1719,26 +1737,25 @@
Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
unsigned Opc = 0;
- if (Ty == LLT::vector(2, 64)) {
+ if (Ty == LLT::fixed_vector(2, 64)) {
Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
- } else if (Ty == LLT::vector(4, 32)) {
+ } else if (Ty == LLT::fixed_vector(4, 32)) {
Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
- } else if (Ty == LLT::vector(2, 32)) {
+ } else if (Ty == LLT::fixed_vector(2, 32)) {
Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
- } else if (Ty == LLT::vector(4, 16)) {
+ } else if (Ty == LLT::fixed_vector(4, 16)) {
Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
- } else if (Ty == LLT::vector(8, 16)) {
+ } else if (Ty == LLT::fixed_vector(8, 16)) {
Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
- } else if (Ty == LLT::vector(16, 8)) {
+ } else if (Ty == LLT::fixed_vector(16, 8)) {
Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
- } else if (Ty == LLT::vector(8, 8)) {
+ } else if (Ty == LLT::fixed_vector(8, 8)) {
Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
} else {
LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
return false;
}
- MachineIRBuilder MIB(I);
auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
if (ImmVal)
Shl.addImm(*ImmVal);
@@ -1750,7 +1767,7 @@
}
bool AArch64InstructionSelector::selectVectorAshrLshr(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+ MachineInstr &I, MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_ASHR ||
I.getOpcode() == TargetOpcode::G_LSHR);
Register DstReg = I.getOperand(0).getReg();
@@ -1774,25 +1791,25 @@
unsigned NegOpc = 0;
const TargetRegisterClass *RC =
getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
- if (Ty == LLT::vector(2, 64)) {
+ if (Ty == LLT::fixed_vector(2, 64)) {
Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
NegOpc = AArch64::NEGv2i64;
- } else if (Ty == LLT::vector(4, 32)) {
+ } else if (Ty == LLT::fixed_vector(4, 32)) {
Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
NegOpc = AArch64::NEGv4i32;
- } else if (Ty == LLT::vector(2, 32)) {
+ } else if (Ty == LLT::fixed_vector(2, 32)) {
Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
NegOpc = AArch64::NEGv2i32;
- } else if (Ty == LLT::vector(4, 16)) {
+ } else if (Ty == LLT::fixed_vector(4, 16)) {
Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
NegOpc = AArch64::NEGv4i16;
- } else if (Ty == LLT::vector(8, 16)) {
+ } else if (Ty == LLT::fixed_vector(8, 16)) {
Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
NegOpc = AArch64::NEGv8i16;
- } else if (Ty == LLT::vector(16, 8)) {
+ } else if (Ty == LLT::fixed_vector(16, 8)) {
Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
NegOpc = AArch64::NEGv16i8;
- } else if (Ty == LLT::vector(8, 8)) {
+ } else if (Ty == LLT::fixed_vector(8, 8)) {
Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
NegOpc = AArch64::NEGv8i8;
} else {
@@ -1800,7 +1817,6 @@
return false;
}
- MachineIRBuilder MIB(I);
auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
@@ -1842,11 +1858,10 @@
}
void AArch64InstructionSelector::materializeLargeCMVal(
- MachineInstr &I, const Value *V, unsigned OpFlags) const {
+ MachineInstr &I, const Value *V, unsigned OpFlags) {
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
- MachineIRBuilder MIB(I);
auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
MovZ->addOperand(MF, I.getOperand(1));
@@ -1907,7 +1922,6 @@
assert(AmtMI && "could not find a vreg definition for shift amount");
if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) {
// Insert a subregister copy to implement a 64->32 trunc
- MachineIRBuilder MIB(I);
auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
.addReg(ShiftReg, 0, AArch64::sub_32);
MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
@@ -1915,8 +1929,21 @@
}
return true;
}
- case TargetOpcode::G_STORE:
- return contractCrossBankCopyIntoStore(I, MRI);
+ case TargetOpcode::G_STORE: {
+ bool Changed = contractCrossBankCopyIntoStore(I, MRI);
+ MachineOperand &SrcOp = I.getOperand(0);
+ if (MRI.getType(SrcOp.getReg()).isPointer()) {
+ // Allow matching with imported patterns for stores of pointers. Unlike
+ // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
+ // and constrain.
+ auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
+ Register NewSrc = Copy.getReg(0);
+ SrcOp.setReg(NewSrc);
+ RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
+ Changed = true;
+ }
+ return Changed;
+ }
case TargetOpcode::G_PTR_ADD:
return convertPtrAddToAdd(I, MRI);
case TargetOpcode::G_LOAD: {
@@ -1936,11 +1963,10 @@
LLT DstTy = MRI.getType(I.getOperand(0).getReg());
if (!DstTy.getElementType().isPointer())
return false;
- MachineIRBuilder MIB(I);
auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
MRI.setType(I.getOperand(0).getReg(),
DstTy.changeElementType(LLT::scalar(64)));
- MRI.setRegBank(NewSrc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
+ MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
I.getOperand(1).setReg(NewSrc.getReg(0));
return true;
}
@@ -1987,8 +2013,8 @@
if (PtrTy.getAddressSpace() != 0)
return false;
- MachineIRBuilder MIB(I);
- const LLT CastPtrTy = PtrTy.isVector() ? LLT::vector(2, 64) : LLT::scalar(64);
+ const LLT CastPtrTy =
+ PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
// Set regbanks on the registers.
if (PtrTy.isVector())
@@ -2016,8 +2042,8 @@
return true;
}
-bool AArch64InstructionSelector::earlySelectSHL(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
+ MachineRegisterInfo &MRI) {
// We try to match the immediate variant of LSL, which is actually an alias
// for a special case of UBFM. Otherwise, we fall back to the imported
// selector which will match the register variant.
@@ -2033,7 +2059,6 @@
bool Is64Bit = DstTy.getSizeInBits() == 64;
auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
- MachineIRBuilder MIB(I);
if (!Imm1Fn || !Imm2Fn)
return false;
@@ -2093,7 +2118,7 @@
return true;
}
-bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
+bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -2102,6 +2127,24 @@
MachineRegisterInfo &MRI = MF.getRegInfo();
switch (I.getOpcode()) {
+ case AArch64::G_DUP: {
+ // Before selecting a DUP instruction, check if it is better selected as a
+ // MOV or load from a constant pool.
+ Register Src = I.getOperand(1).getReg();
+ auto ValAndVReg = getConstantVRegValWithLookThrough(Src, MRI);
+ if (!ValAndVReg)
+ return false;
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ Register Dst = I.getOperand(0).getReg();
+ auto *CV = ConstantDataVector::getSplat(
+ MRI.getType(Dst).getNumElements(),
+ ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
+ ValAndVReg->Value));
+ if (!emitConstantVector(Dst, CV, MIB, MRI))
+ return false;
+ I.eraseFromParent();
+ return true;
+ }
case TargetOpcode::G_BR: {
// If the branch jumps to the fallthrough block, don't bother emitting it.
// Only do this for -O0 for a good code size improvement, because when
@@ -2139,6 +2182,74 @@
I.setDesc(TII.get(TargetOpcode::COPY));
return true;
}
+
+ case TargetOpcode::G_ADD: {
+ // Check if this is being fed by a G_ICMP on either side.
+ //
+ // (cmp pred, x, y) + z
+ //
+ // In the above case, when the cmp is true, we increment z by 1. So, we can
+ // fold the add into the cset for the cmp by using cinc.
+ //
+ // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
+ Register X = I.getOperand(1).getReg();
+
+ // Only handle scalars. Scalar G_ICMP is only legal for s32, so bail out
+ // early if we see it.
+ LLT Ty = MRI.getType(X);
+ if (Ty.isVector() || Ty.getSizeInBits() != 32)
+ return false;
+
+ Register CmpReg = I.getOperand(2).getReg();
+ MachineInstr *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
+ if (!Cmp) {
+ std::swap(X, CmpReg);
+ Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
+ if (!Cmp)
+ return false;
+ }
+ auto Pred =
+ static_cast<CmpInst::Predicate>(Cmp->getOperand(1).getPredicate());
+ emitIntegerCompare(Cmp->getOperand(2), Cmp->getOperand(3),
+ Cmp->getOperand(1), MIB);
+ emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB, X);
+ I.eraseFromParent();
+ return true;
+ }
+ case TargetOpcode::G_OR: {
+ // Look for operations that take the lower `Width=Size-ShiftImm` bits of
+ // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
+ // shifting and masking that we can replace with a BFI (encoded as a BFM).
+ Register Dst = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+
+ if (!Ty.isScalar())
+ return false;
+
+ unsigned Size = Ty.getSizeInBits();
+ if (Size != 32 && Size != 64)
+ return false;
+
+ Register ShiftSrc;
+ int64_t ShiftImm;
+ Register MaskSrc;
+ int64_t MaskImm;
+ if (!mi_match(
+ Dst, MRI,
+ m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
+ m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
+ return false;
+
+ if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
+ return false;
+
+ int64_t Immr = Size - ShiftImm;
+ int64_t Imms = Size - ShiftImm - 1;
+ unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
+ emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
+ I.eraseFromParent();
+ return true;
+ }
default:
return false;
}
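The G_OR case above recognizes (ShiftSrc << ShiftImm) | (MaskSrc & MaskImm) where MaskImm is exactly the low-ShiftImm-bit mask, and encodes it as a BFM with immr = Size - ShiftImm and imms = Size - ShiftImm - 1. A standalone check of that encoding math (illustrative helper; the real code emits BFMWri/BFMXri MIR, and its bounds check is slightly more permissive):

#include <cstdint>
#include <iostream>
#include <optional>

struct BFMImms { unsigned Immr, Imms; };

// For (ShiftSrc << ShiftImm) | (MaskSrc & MaskImm) with MaskImm being exactly
// the low-ShiftImm-bit mask, a BFM encodes the bitfield insert with
// immr = Size - ShiftImm and imms = Size - ShiftImm - 1.
std::optional<BFMImms> bfiImms(unsigned Size, unsigned ShiftImm, uint64_t MaskImm) {
  if ((Size != 32 && Size != 64) || ShiftImm == 0 || ShiftImm >= Size)
    return std::nullopt;
  if (((1ULL << ShiftImm) - 1ULL) != MaskImm)
    return std::nullopt;
  return BFMImms{Size - ShiftImm, Size - ShiftImm - 1};
}

int main() {
  if (auto I = bfiImms(32, 8, 0xFF)) // insert bits of x above y's low 8 bits
    std::cout << I->Immr << ' ' << I->Imms << '\n'; // 24 23
}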
@@ -2160,6 +2271,8 @@
return false;
}
+ MIB.setInstrAndDebugLoc(I);
+
unsigned Opcode = I.getOpcode();
// G_PHI requires same handling as PHI
if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
@@ -2229,9 +2342,30 @@
LLT Ty =
I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
- MachineIRBuilder MIB(I);
-
switch (Opcode) {
+ case TargetOpcode::G_SBFX:
+ case TargetOpcode::G_UBFX: {
+ static const unsigned OpcTable[2][2] = {
+ {AArch64::UBFMWri, AArch64::UBFMXri},
+ {AArch64::SBFMWri, AArch64::SBFMXri}};
+ bool IsSigned = Opcode == TargetOpcode::G_SBFX;
+ unsigned Size = Ty.getSizeInBits();
+ unsigned Opc = OpcTable[IsSigned][Size == 64];
+ auto Cst1 =
+ getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
+ assert(Cst1 && "Should have gotten a constant for src 1?");
+ auto Cst2 =
+ getConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
+ assert(Cst2 && "Should have gotten a constant for src 2?");
+ auto LSB = Cst1->Value.getZExtValue();
+ auto Width = Cst2->Value.getZExtValue();
+ auto BitfieldInst =
+ MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
+ .addImm(LSB)
+ .addImm(LSB + Width - 1);
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
+ }
case TargetOpcode::G_BRCOND:
return selectCompareBranch(I, MF, MRI);
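The G_SBFX/G_UBFX case above asserts that both the lsb and the width are known constants and encodes the extract as UBFM/SBFM with immediates lsb and lsb + width - 1. A reference model of what those extracts compute (illustrative, operating on plain integers rather than MIR):

#include <cstdint>
#include <iostream>

// Unsigned bitfield extract: take Width bits of Src starting at LSB and
// zero-extend them, i.e. what UBFM(lsb, lsb + Width - 1) produces.
uint64_t ubfx(uint64_t Src, unsigned LSB, unsigned Width) {
  uint64_t Mask = (Width == 64) ? ~0ULL : ((1ULL << Width) - 1);
  return (Src >> LSB) & Mask;
}

// Signed variant (SBFM): same field, sign-extended from its top bit.
int64_t sbfx(uint64_t Src, unsigned LSB, unsigned Width) {
  uint64_t Field = ubfx(Src, LSB, Width);
  uint64_t SignBit = 1ULL << (Width - 1);
  return int64_t((Field ^ SignBit) - SignBit);
}

int main() {
  std::cout << ubfx(0xABCD, 4, 8) << '\n'; // 0xBC = 188
  std::cout << sbfx(0xABCD, 4, 8) << '\n'; // 0xBC sign-extended = -68
}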
@@ -2256,7 +2390,6 @@
}
assert(TM.getCodeModel() == CodeModel::Small &&
"Expected small code model");
- MachineIRBuilder MIB(I);
auto Op1 = BaseMI->getOperand(1);
auto Op2 = I.getOperand(2);
auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
@@ -2373,14 +2506,11 @@
: (DefSize == 64 ? AArch64::FPR64RegClass
: AArch64::FPR128RegClass);
- // Can we use a FMOV instruction to represent the immediate?
- if (emitFMovForFConstant(I, MRI))
- return true;
-
// For 64b values, emit a constant pool load instead.
- if (DefSize == 64 || DefSize == 128) {
+ // For s32, use a cp load if we have optsize/minsize.
+ if (DefSize == 64 || DefSize == 128 ||
+ (DefSize == 32 && shouldOptForSize(&MF))) {
auto *FPImm = I.getOperand(1).getFPImm();
- MachineIRBuilder MIB(I);
auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
if (!LoadMI) {
LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
@@ -2435,21 +2565,25 @@
if (DstTy.getSizeInBits() != 64)
return false;
- const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
- const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
- // Check we have the right regbank always.
- assert(SrcRB.getID() == AArch64::FPRRegBankID &&
- DstRB.getID() == AArch64::FPRRegBankID &&
- "Wrong extract regbank!");
- (void)SrcRB;
-
- // Emit the same code as a vector extract.
- // Offset must be a multiple of 64.
unsigned Offset = I.getOperand(2).getImm();
if (Offset % 64 != 0)
return false;
+
+ // Check we have the right regbank always.
+ const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
+
+ if (SrcRB.getID() == AArch64::GPRRegBankID) {
+ MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
+ .addUse(SrcReg, 0, Offset == 0 ? AArch64::sube64 : AArch64::subo64);
+ I.eraseFromParent();
+ return true;
+ }
+
+ // Emit the same code as a vector extract.
+ // Offset must be a multiple of 64.
unsigned LaneIdx = Offset / 64;
- MachineIRBuilder MIB(I);
MachineInstr *Extract = emitExtractVectorElt(
DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
if (!Extract)
@@ -2560,8 +2694,6 @@
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE: {
bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
- MachineIRBuilder MIB(I);
-
LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
if (PtrTy != LLT::pointer(0, 64)) {
@@ -2572,18 +2704,29 @@
auto &MemOp = **I.memoperands_begin();
uint64_t MemSizeInBytes = MemOp.getSize();
- if (MemOp.isAtomic()) {
- // For now we just support s8 acquire loads to be able to compile stack
- // protector code.
- if (MemOp.getOrdering() == AtomicOrdering::Acquire &&
- MemSizeInBytes == 1) {
- I.setDesc(TII.get(AArch64::LDARB));
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
- }
- LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n");
- return false;
- }
unsigned MemSizeInBits = MemSizeInBytes * 8;
+ AtomicOrdering Order = MemOp.getSuccessOrdering();
+
+ // Need special instructions for atomics that affect ordering.
+ if (Order != AtomicOrdering::NotAtomic &&
+ Order != AtomicOrdering::Unordered &&
+ Order != AtomicOrdering::Monotonic) {
+ assert(I.getOpcode() != TargetOpcode::G_ZEXTLOAD);
+ if (MemSizeInBytes > 64)
+ return false;
+
+ if (I.getOpcode() == TargetOpcode::G_LOAD) {
+ static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH,
+ AArch64::LDARW, AArch64::LDARX};
+ I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
+ } else {
+ static unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
+ AArch64::STLRW, AArch64::STLRX};
+ I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
+ }
+ constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ return true;
+ }
#ifndef NDEBUG
const Register PtrReg = I.getOperand(1).getReg();
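The atomic path above indexes a four-entry opcode table with Log2_32 of the access size, so 1/2/4/8-byte acquire/release loads select LDARB/LDARH/LDARW/LDARX and the matching stores select STLRB/STLRH/STLRW/STLRX. A standalone model of that indexing (strings stand in for the real opcode enums, and only the four table sizes are accepted):

#include <iostream>
#include <string>

// Same indexing trick as above: Log2_32(1/2/4/8) = 0/1/2/3.
static unsigned log2u(unsigned V) {
  unsigned R = 0;
  while (V >>= 1) ++R;
  return R;
}

std::string orderedAtomicOpcode(bool IsLoad, unsigned SizeInBytes) {
  static const char *LoadOpcodes[] = {"LDARB", "LDARH", "LDARW", "LDARX"};
  static const char *StoreOpcodes[] = {"STLRB", "STLRH", "STLRW", "STLRX"};
  if (SizeInBytes != 1 && SizeInBytes != 2 && SizeInBytes != 4 && SizeInBytes != 8)
    return "unsupported";
  return (IsLoad ? LoadOpcodes : StoreOpcodes)[log2u(SizeInBytes)];
}

int main() {
  std::cout << orderedAtomicOpcode(true, 4) << '\n';  // LDARW
  std::cout << orderedAtomicOpcode(false, 8) << '\n'; // STLRX
}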
@@ -2737,9 +2880,7 @@
}
case TargetOpcode::G_PTR_ADD: {
- MachineIRBuilder MIRBuilder(I);
- emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2),
- MIRBuilder);
+ emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
I.eraseFromParent();
return true;
}
@@ -2748,18 +2889,16 @@
case TargetOpcode::G_SSUBO:
case TargetOpcode::G_USUBO: {
// Emit the operation and get the correct condition code.
- MachineIRBuilder MIRBuilder(I);
auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(),
- I.getOperand(2), I.getOperand(3), MIRBuilder);
+ I.getOperand(2), I.getOperand(3), MIB);
// Now, put the overflow result in the register given by the first operand
// to the overflow op. CSINC increments the result when the predicate is
// false, so to get the increment when it's true, we need to use the
// inverse. In this case, we want to increment when carry is set.
Register ZReg = AArch64::WZR;
- auto CsetMI = MIRBuilder
- .buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
- {ZReg, ZReg})
+ auto CsetMI = MIB.buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
+ {ZReg, ZReg})
.addImm(getInvertedCondCode(OpAndCC.second));
constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI);
I.eraseFromParent();
@@ -2832,14 +2971,14 @@
I.setDesc(TII.get(TargetOpcode::COPY));
return true;
} else if (DstRB.getID() == AArch64::FPRRegBankID) {
- if (DstTy == LLT::vector(4, 16) && SrcTy == LLT::vector(4, 32)) {
+ if (DstTy == LLT::fixed_vector(4, 16) &&
+ SrcTy == LLT::fixed_vector(4, 32)) {
I.setDesc(TII.get(AArch64::XTNv4i16));
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return true;
}
if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
- MachineIRBuilder MIB(I);
MachineInstr *Extract = emitExtractVectorElt(
DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
if (!Extract)
@@ -2927,7 +3066,6 @@
AArch64::GPRRegBankID &&
"Unexpected ext regbank");
- MachineIRBuilder MIB(I);
MachineInstr *ExtI;
// First check if we're extending the result of a load which has a dest type
@@ -2947,34 +3085,46 @@
return selectCopy(I, TII, MRI, TRI, RBI);
}
+ // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
+ // + SUBREG_TO_REG.
+ //
// If we are zero extending from 32 bits to 64 bits, it's possible that
// the instruction implicitly does the zero extend for us. In that case,
- // we can just emit a SUBREG_TO_REG.
+ // we only need the SUBREG_TO_REG.
if (IsGPR && SrcSize == 32 && DstSize == 64) {
// Unlike with the G_LOAD case, we don't want to look through copies
- // here.
+ // here. (See isDef32.)
MachineInstr *Def = MRI.getVRegDef(SrcReg);
- if (Def && isDef32(*Def)) {
- MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
- .addImm(0)
- .addUse(SrcReg)
- .addImm(AArch64::sub_32);
+ Register SubregToRegSrc = SrcReg;
- if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
- MRI)) {
- LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
- return false;
- }
-
- if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
- MRI)) {
- LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
- return false;
- }
-
- I.eraseFromParent();
- return true;
+ // Does the instruction implicitly zero extend?
+ if (!Def || !isDef32(*Def)) {
+ // No. Zero out using an OR.
+ Register OrDst = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ const Register ZReg = AArch64::WZR;
+ MIB.buildInstr(AArch64::ORRWrs, {OrDst}, {ZReg, SrcReg}).addImm(0);
+ SubregToRegSrc = OrDst;
}
+
+ MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
+ .addImm(0)
+ .addUse(SubregToRegSrc)
+ .addImm(AArch64::sub_32);
+
+ if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
+ MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
+ return false;
+ }
+
+ if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
+ MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
+ return false;
+ }
+
+ I.eraseFromParent();
+ return true;
}
}
@@ -3061,7 +3211,6 @@
// Make sure to use an unused vreg instead of wzr, so that the peephole
// optimizations will be able to optimize these.
- MachineIRBuilder MIB(I);
Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
.addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
@@ -3081,22 +3230,20 @@
return false;
}
- MachineIRBuilder MIRBuilder(I);
auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
- MIRBuilder);
- emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIRBuilder);
+ MIB);
+ emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_FCMP: {
- MachineIRBuilder MIRBuilder(I);
CmpInst::Predicate Pred =
static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
- if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(),
- MIRBuilder, Pred) ||
- !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIRBuilder))
+ if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
+ Pred) ||
+ !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
return false;
I.eraseFromParent();
return true;
@@ -3142,14 +3289,18 @@
// difficult because at RBS we may end up pessimizing the fpr case if we
// decided to add an anyextend to fix this. Manual selection is the most
// robust solution for now.
- Register SrcReg = I.getOperand(1).getReg();
- if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::GPRRegBankID)
+ if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
+ AArch64::GPRRegBankID)
return false; // We expect the fpr regbank case to be imported.
- LLT SrcTy = MRI.getType(SrcReg);
- if (SrcTy.getSizeInBits() == 16)
- I.setDesc(TII.get(AArch64::DUPv8i16gpr));
- else if (SrcTy.getSizeInBits() == 8)
+ LLT VecTy = MRI.getType(I.getOperand(0).getReg());
+ if (VecTy == LLT::fixed_vector(8, 8))
+ I.setDesc(TII.get(AArch64::DUPv8i8gpr));
+ else if (VecTy == LLT::fixed_vector(16, 8))
I.setDesc(TII.get(AArch64::DUPv16i8gpr));
+ else if (VecTy == LLT::fixed_vector(4, 16))
+ I.setDesc(TII.get(AArch64::DUPv4i16gpr));
+ else if (VecTy == LLT::fixed_vector(8, 16))
+ I.setDesc(TII.get(AArch64::DUPv8i16gpr));
else
return false;
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
@@ -3182,19 +3333,33 @@
return false;
}
-bool AArch64InstructionSelector::selectReduction(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
+ MachineRegisterInfo &MRI) {
Register VecReg = I.getOperand(1).getReg();
LLT VecTy = MRI.getType(VecReg);
if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
+ // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit
+ // a subregister copy afterwards.
+ if (VecTy == LLT::fixed_vector(2, 32)) {
+ Register DstReg = I.getOperand(0).getReg();
+ auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass},
+ {VecReg, VecReg});
+ auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
+ .addReg(AddP.getReg(0), 0, AArch64::ssub)
+ .getReg(0);
+ RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI);
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI);
+ }
+
unsigned Opc = 0;
- if (VecTy == LLT::vector(16, 8))
+ if (VecTy == LLT::fixed_vector(16, 8))
Opc = AArch64::ADDVv16i8v;
- else if (VecTy == LLT::vector(8, 16))
+ else if (VecTy == LLT::fixed_vector(8, 16))
Opc = AArch64::ADDVv8i16v;
- else if (VecTy == LLT::vector(4, 32))
+ else if (VecTy == LLT::fixed_vector(4, 32))
Opc = AArch64::ADDVv4i32v;
- else if (VecTy == LLT::vector(2, 64))
+ else if (VecTy == LLT::fixed_vector(2, 64))
Opc = AArch64::ADDPv2i64p;
else {
LLVM_DEBUG(dbgs() << "Unhandled type for add reduction");
@@ -3206,9 +3371,9 @@
if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) {
unsigned Opc = 0;
- if (VecTy == LLT::vector(2, 32))
+ if (VecTy == LLT::fixed_vector(2, 32))
Opc = AArch64::FADDPv2i32p;
- else if (VecTy == LLT::vector(2, 64))
+ else if (VecTy == LLT::fixed_vector(2, 64))
Opc = AArch64::FADDPv2i64p;
else {
LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction");
@@ -3221,12 +3386,11 @@
}
bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
- MachineRegisterInfo &MRI) const {
+ MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
Register JTAddr = I.getOperand(0).getReg();
unsigned JTI = I.getOperand(1).getIndex();
Register Index = I.getOperand(2).getReg();
- MachineIRBuilder MIB(I);
Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
@@ -3241,15 +3405,14 @@
return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
}
-bool AArch64InstructionSelector::selectJumpTable(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
+ MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
Register DstReg = I.getOperand(0).getReg();
unsigned JTI = I.getOperand(1).getIndex();
// We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
- MachineIRBuilder MIB(I);
auto MovMI =
MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
.addJumpTableIndex(JTI, AArch64II::MO_PAGE)
@@ -3259,14 +3422,16 @@
}
bool AArch64InstructionSelector::selectTLSGlobalValue(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+ MachineInstr &I, MachineRegisterInfo &MRI) {
if (!STI.isTargetMachO())
return false;
MachineFunction &MF = *I.getParent()->getParent();
MF.getFrameInfo().setAdjustsStack(true);
- const GlobalValue &GV = *I.getOperand(1).getGlobal();
- MachineIRBuilder MIB(I);
+ const auto &GlobalOp = I.getOperand(1);
+ assert(GlobalOp.getOffset() == 0 &&
+ "Shouldn't have an offset on TLS globals!");
+ const GlobalValue &GV = *GlobalOp.getGlobal();
auto LoadGOT =
MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
@@ -3403,7 +3568,7 @@
}
bool AArch64InstructionSelector::selectVectorICmp(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+ MachineInstr &I, MachineRegisterInfo &MRI) {
Register DstReg = I.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
Register SrcReg = I.getOperand(2).getReg();
@@ -3558,7 +3723,6 @@
if (SwapOperands)
std::swap(SrcReg, Src2Reg);
- MachineIRBuilder MIB(I);
auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
@@ -3602,7 +3766,7 @@
}
bool AArch64InstructionSelector::selectMergeValues(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+ MachineInstr &I, MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
@@ -3616,7 +3780,6 @@
if (DstTy == LLT::scalar(128)) {
if (SrcTy.getSizeInBits() != 64)
return false;
- MachineIRBuilder MIB(I);
Register DstReg = I.getOperand(0).getReg();
Register Src1Reg = I.getOperand(1).getReg();
Register Src2Reg = I.getOperand(2).getReg();
@@ -3757,7 +3920,7 @@
}
bool AArch64InstructionSelector::selectExtractElt(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+ MachineInstr &I, MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
"unexpected opcode!");
Register DstReg = I.getOperand(0).getReg();
@@ -3784,11 +3947,10 @@
return false;
unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
- MachineIRBuilder MIRBuilder(I);
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
- LaneIdx, MIRBuilder);
+ LaneIdx, MIB);
if (!Extract)
return false;
@@ -3797,7 +3959,7 @@
}
bool AArch64InstructionSelector::selectSplitVectorUnmerge(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+ MachineInstr &I, MachineRegisterInfo &MRI) {
unsigned NumElts = I.getNumOperands() - 1;
Register SrcReg = I.getOperand(NumElts).getReg();
const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
@@ -3809,8 +3971,6 @@
return false;
}
- MachineIRBuilder MIB(I);
-
// We implement a split vector operation by treating the sub-vectors as
// scalars and extracting them.
const RegisterBank &DstRB =
@@ -3826,8 +3986,8 @@
return true;
}
-bool AArch64InstructionSelector::selectUnmergeValues(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
+ MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
"unexpected opcode");
@@ -3856,8 +4016,6 @@
if (!NarrowTy.isScalar())
return selectSplitVectorUnmerge(I, MRI);
- MachineIRBuilder MIB(I);
-
// Choose a lane copy opcode and subregister based off of the size of the
// vector's elements.
unsigned CopyOpc = 0;
@@ -3882,6 +4040,13 @@
} else {
// No. We have to perform subregister inserts. For each insert, create an
// implicit def and a subregister insert, and save the register we create.
+ const TargetRegisterClass *RC =
+ getMinClassForRegBank(*RBI.getRegBank(SrcReg, MRI, TRI),
+ WideTy.getScalarSizeInBits() * NumElts);
+ unsigned SubReg = 0;
+ bool Found = getSubRegForClass(RC, TRI, SubReg);
+ (void)Found;
+ assert(Found && "expected to find last operand's subreg idx");
for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
MachineInstr &ImpDefMI =
@@ -3895,7 +4060,7 @@
TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
.addUse(ImpDefReg)
.addUse(SrcReg)
- .addImm(AArch64::dsub);
+ .addImm(SubReg);
constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
@@ -3942,14 +4107,13 @@
}
bool AArch64InstructionSelector::selectConcatVectors(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+ MachineInstr &I, MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
"Unexpected opcode");
Register Dst = I.getOperand(0).getReg();
Register Op1 = I.getOperand(1).getReg();
Register Op2 = I.getOperand(2).getReg();
- MachineIRBuilder MIRBuilder(I);
- MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder);
+ MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
if (!ConcatMI)
return false;
I.eraseFromParent();
@@ -3968,14 +4132,17 @@
MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
- unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF());
+ auto &MF = MIRBuilder.getMF();
+ unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
auto Adrp =
MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
.addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
MachineInstr *LoadMI = nullptr;
- switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) {
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
+ unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
+ switch (Size) {
case 16:
LoadMI =
&*MIRBuilder
@@ -3984,16 +4151,27 @@
AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
break;
case 8:
- LoadMI = &*MIRBuilder
- .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
- .addConstantPoolIndex(
- CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ LoadMI =
+ &*MIRBuilder
+ .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
+ .addConstantPoolIndex(CPIdx, 0,
+ AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ break;
+ case 4:
+ LoadMI =
+ &*MIRBuilder
+ .buildInstr(AArch64::LDRSui, {&AArch64::FPR32RegClass}, {Adrp})
+ .addConstantPoolIndex(CPIdx, 0,
+ AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
break;
default:
LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
<< *CPVal->getType());
return nullptr;
}
+ LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
+ MachineMemOperand::MOLoad,
+ Size, Align(Size)));
constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
return LoadMI;
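// Illustrative sketch of the output (the constant-pool label name is a
// placeholder): a 4-byte constant handled by the new case above becomes an
// ADRP/LDRSui pair carrying a constant-pool memory operand, printing roughly as
//   adrp x8, .LCPI0_0                ; MO_PAGE
//   ldr  s0, [x8, :lo12:.LCPI0_0]    ; MO_PAGEOFF | MO_NC, 4-byte MMO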
@@ -4316,49 +4494,15 @@
return &*InsElt;
}
-MachineInstr *AArch64InstructionSelector::emitFMovForFConstant(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
- assert(I.getOpcode() == TargetOpcode::G_FCONSTANT &&
- "Expected a G_FCONSTANT!");
- MachineOperand &ImmOp = I.getOperand(1);
- unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits();
-
- // Only handle 32 and 64 bit defs for now.
- if (DefSize != 32 && DefSize != 64)
- return nullptr;
-
- // Don't handle null values using FMOV.
- if (ImmOp.getFPImm()->isNullValue())
- return nullptr;
-
- // Get the immediate representation for the FMOV.
- const APFloat &ImmValAPF = ImmOp.getFPImm()->getValueAPF();
- int Imm = DefSize == 32 ? AArch64_AM::getFP32Imm(ImmValAPF)
- : AArch64_AM::getFP64Imm(ImmValAPF);
-
- // If this is -1, it means the immediate can't be represented as the requested
- // floating point value. Bail.
- if (Imm == -1)
- return nullptr;
-
- // Update MI to represent the new FMOV instruction, constrain it, and return.
- ImmOp.ChangeToImmediate(Imm);
- unsigned MovOpc = DefSize == 32 ? AArch64::FMOVSi : AArch64::FMOVDi;
- I.setDesc(TII.get(MovOpc));
- constrainSelectedInstRegOperands(I, TII, TRI, RBI);
- return &I;
-}
-
MachineInstr *
AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred,
- MachineIRBuilder &MIRBuilder) const {
+ MachineIRBuilder &MIRBuilder,
+ Register SrcReg) const {
// CSINC increments the result when the predicate is false. Invert it.
const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
CmpInst::getInversePredicate((CmpInst::Predicate)Pred));
- auto I =
- MIRBuilder
- .buildInstr(AArch64::CSINCWr, {DefReg}, {Register(AArch64::WZR), Register(AArch64::WZR)})
- .addImm(InvCC);
+ auto I = MIRBuilder.buildInstr(AArch64::CSINCWr, {DefReg}, {SrcReg, SrcReg})
+ .addImm(InvCC);
constrainSelectedInstRegOperands(*I, TII, TRI, RBI);
return &*I;
}
@@ -4382,8 +4526,7 @@
}
}
-bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
- MachineIRBuilder MIB(I);
+bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) {
MachineRegisterInfo &MRI = *MIB.getMRI();
// We want to recognize this pattern:
//
@@ -4489,37 +4632,10 @@
//
// cmn z, y
- // Helper lambda to detect the subtract followed by the compare.
- // Takes in the def of the LHS or RHS, and checks if it's a subtract from 0.
- auto IsCMN = [&](MachineInstr *DefMI, const AArch64CC::CondCode &CC) {
- if (!DefMI || DefMI->getOpcode() != TargetOpcode::G_SUB)
- return false;
-
- // Need to make sure NZCV is the same at the end of the transformation.
- if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
- return false;
-
- // We want to match against SUBs.
- if (DefMI->getOpcode() != TargetOpcode::G_SUB)
- return false;
-
- // Make sure that we're getting
- // x = G_SUB 0, y
- auto ValAndVReg =
- getConstantVRegValWithLookThrough(DefMI->getOperand(1).getReg(), MRI);
- if (!ValAndVReg || ValAndVReg->Value != 0)
- return false;
-
- // This can safely be represented as a CMN.
- return true;
- };
-
// Check if the RHS or LHS of the G_ICMP is defined by a SUB
MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
- CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate();
- const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(P);
-
+ auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
// Given this:
//
// x = G_SUB 0, y
@@ -4528,7 +4644,7 @@
// Produce this:
//
// cmn y, z
- if (IsCMN(LHSDef, CC))
+ if (isCMN(LHSDef, P, MRI))
return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
// Same idea here, but with the RHS of the compare instead:
@@ -4541,7 +4657,7 @@
// Produce this:
//
// cmn z, y
- if (IsCMN(RHSDef, CC))
+ if (isCMN(RHSDef, P, MRI))
return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
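// Worked reasoning for why isCMN only fires on equality predicates (a sketch,
// not taken from the surrounding code): cmp x, (0 - y) and cmn x, y both
// compute x + y modulo 2^N, so the Z flag (and hence EQ/NE) agrees, but the
// carry/overflow flags can differ (e.g. y == INT_MIN makes 0 - y wrap), so
// signed and unsigned orderings must not be folded this way.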
// Given this:
@@ -4567,7 +4683,7 @@
}
bool AArch64InstructionSelector::selectShuffleVector(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+ MachineInstr &I, MachineRegisterInfo &MRI) {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
Register Src1Reg = I.getOperand(1).getReg();
const LLT Src1Ty = MRI.getType(Src1Reg);
@@ -4600,11 +4716,9 @@
}
}
- MachineIRBuilder MIRBuilder(I);
-
// Use a constant pool to load the index vector for TBL.
Constant *CPVal = ConstantVector::get(CstIdxs);
- MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
+ MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
if (!IndexLoad) {
LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
return false;
@@ -4613,25 +4727,23 @@
if (DstTy.getSizeInBits() != 128) {
assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
// This case can be done with TBL1.
- MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIRBuilder);
+ MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIB);
if (!Concat) {
LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
return false;
}
// The constant pool load will be 64 bits, so need to convert to FPR128 reg.
- IndexLoad =
- emitScalarToVector(64, &AArch64::FPR128RegClass,
- IndexLoad->getOperand(0).getReg(), MIRBuilder);
+ IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
+ IndexLoad->getOperand(0).getReg(), MIB);
- auto TBL1 = MIRBuilder.buildInstr(
+ auto TBL1 = MIB.buildInstr(
AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
{Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
auto Copy =
- MIRBuilder
- .buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
+ MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
.addReg(TBL1.getReg(0), 0, AArch64::dsub);
RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
I.eraseFromParent();
@@ -4640,16 +4752,10 @@
// For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
// Q registers for regalloc.
- auto RegSeq = MIRBuilder
- .buildInstr(TargetOpcode::REG_SEQUENCE,
- {&AArch64::QQRegClass}, {Src1Reg})
- .addImm(AArch64::qsub0)
- .addUse(Src2Reg)
- .addImm(AArch64::qsub1);
-
- auto TBL2 = MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
- {RegSeq, IndexLoad->getOperand(0)});
- constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI);
+ SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
+ auto RegSeq = createQTuple(Regs, MIB);
+ auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
+ {RegSeq, IndexLoad->getOperand(0)});
constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
I.eraseFromParent();
return true;
@@ -4686,8 +4792,8 @@
return InsElt;
}
-bool AArch64InstructionSelector::selectInsertElt(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
+ MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
// Get information on the destination.
@@ -4713,13 +4819,12 @@
// Perform the lane insert.
Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
- MachineIRBuilder MIRBuilder(I);
if (VecSize < 128) {
// If the vector we're inserting into is smaller than 128 bits, widen it
// to 128 to do the insert.
- MachineInstr *ScalarToVec = emitScalarToVector(
- VecSize, &AArch64::FPR128RegClass, SrcReg, MIRBuilder);
+ MachineInstr *ScalarToVec =
+ emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
if (!ScalarToVec)
return false;
SrcReg = ScalarToVec->getOperand(0).getReg();
@@ -4729,7 +4834,7 @@
// Note that if our vector is already 128 bits, we end up emitting an extra
// register.
MachineInstr *InsMI =
- emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIRBuilder);
+ emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIB);
if (VecSize < 128) {
// If we had to widen to perform the insert, then we have to demote back to
@@ -4749,7 +4854,7 @@
<< "\n");
return false;
}
- MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {})
+ MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
.addReg(DemoteVec, 0, SubReg);
RBI.constrainGenericRegister(DstReg, *RC, MRI);
} else {
@@ -4762,8 +4867,46 @@
return true;
}
+MachineInstr *
+AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
+ MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI) {
+ LLT DstTy = MRI.getType(Dst);
+ unsigned DstSize = DstTy.getSizeInBits();
+ if (CV->isNullValue()) {
+ if (DstSize == 128) {
+ auto Mov =
+ MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
+ constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
+ return &*Mov;
+ }
+
+ if (DstSize == 64) {
+ auto Mov =
+ MIRBuilder
+ .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
+ .addImm(0);
+ auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
+ .addReg(Mov.getReg(0), 0, AArch64::dsub);
+ RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
+ return &*Copy;
+ }
+ }
+
+ auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
+ if (!CPLoad) {
+ LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
+ return nullptr;
+ }
+
+ auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
+ RBI.constrainGenericRegister(
+ Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
+ return &*Copy;
+}
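// Illustrative sketch of what emitConstantVector produces (register numbers
// are arbitrary): an all-zeros v4s32 destination becomes "movi v0.2d, #0",
// an all-zeros 64-bit vector uses the same MOVI on an FPR128 followed by a
// dsub subregister copy, and any other constant falls back to the
// constant-pool load path above.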
+
bool AArch64InstructionSelector::tryOptConstantBuildVec(
- MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) const {
+ MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
unsigned DstSize = DstTy.getSizeInBits();
assert(DstSize <= 128 && "Unexpected build_vec type!");
@@ -4787,40 +4930,14 @@
return false;
}
Constant *CV = ConstantVector::get(Csts);
- MachineIRBuilder MIB(I);
- if (CV->isNullValue()) {
- // Until the importer can support immAllZerosV in pattern leaf nodes,
- // select a zero move manually here.
- Register DstReg = I.getOperand(0).getReg();
- if (DstSize == 128) {
- auto Mov = MIB.buildInstr(AArch64::MOVIv2d_ns, {DstReg}, {}).addImm(0);
- I.eraseFromParent();
- return constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
- } else if (DstSize == 64) {
- auto Mov =
- MIB.buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
- .addImm(0);
- MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
- .addReg(Mov.getReg(0), 0, AArch64::dsub);
- I.eraseFromParent();
- return RBI.constrainGenericRegister(DstReg, AArch64::FPR64RegClass, MRI);
- }
- }
- auto *CPLoad = emitLoadFromConstantPool(CV, MIB);
- if (!CPLoad) {
- LLVM_DEBUG(dbgs() << "Could not generate cp load for build_vector");
+ if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
return false;
- }
- MIB.buildCopy(I.getOperand(0), CPLoad->getOperand(0));
- RBI.constrainGenericRegister(I.getOperand(0).getReg(),
- *MRI.getRegClass(CPLoad->getOperand(0).getReg()),
- MRI);
I.eraseFromParent();
return true;
}
-bool AArch64InstructionSelector::selectBuildVector(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
+ MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
// Until we port more of the optimized selections, for now just use a vector
// insert sequence.
@@ -4833,12 +4950,11 @@
if (EltSize < 16 || EltSize > 64)
return false; // Don't support all element types yet.
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
- MachineIRBuilder MIRBuilder(I);
const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
MachineInstr *ScalarToVec =
emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
- I.getOperand(1).getReg(), MIRBuilder);
+ I.getOperand(1).getReg(), MIB);
if (!ScalarToVec)
return false;
@@ -4852,7 +4968,7 @@
// Note that if we don't do a subregister copy, we can end up making an
// extra register.
PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB,
- MIRBuilder);
+ MIB);
DstVec = PrevMI->getOperand(0).getReg();
}
@@ -4881,8 +4997,7 @@
Register Reg = MRI.createVirtualRegister(RC);
Register DstReg = I.getOperand(0).getReg();
- MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {})
- .addReg(DstVec, 0, SubReg);
+ MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
MachineOperand &RegOp = I.getOperand(1);
RegOp.setReg(Reg);
RBI.constrainGenericRegister(DstReg, *RC, MRI);
@@ -4910,27 +5025,73 @@
}
bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+ MachineInstr &I, MachineRegisterInfo &MRI) {
// Find the intrinsic ID.
unsigned IntrinID = findIntrinsicID(I);
if (!IntrinID)
return false;
- MachineIRBuilder MIRBuilder(I);
// Select the instruction.
switch (IntrinID) {
default:
return false;
+ case Intrinsic::aarch64_ldxp:
+ case Intrinsic::aarch64_ldaxp: {
+ auto NewI = MIB.buildInstr(
+ IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
+ {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
+ {I.getOperand(3)});
+ NewI.cloneMemRefs(I);
+ constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
+ break;
+ }
case Intrinsic::trap:
- MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(1);
+ MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1);
break;
case Intrinsic::debugtrap:
- MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
+ MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
break;
case Intrinsic::ubsantrap:
- MIRBuilder.buildInstr(AArch64::BRK, {}, {})
+ MIB.buildInstr(AArch64::BRK, {}, {})
.addImm(I.getOperand(1).getImm() | ('U' << 8));
break;
+ case Intrinsic::aarch64_neon_st2: {
+ Register Src1 = I.getOperand(1).getReg();
+ Register Src2 = I.getOperand(2).getReg();
+ Register Ptr = I.getOperand(3).getReg();
+ LLT Ty = MRI.getType(Src1);
+ const LLT S8 = LLT::scalar(8);
+ const LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+ const LLT P0 = LLT::pointer(0, 64);
+ unsigned Opc;
+ if (Ty == LLT::fixed_vector(8, S8))
+ Opc = AArch64::ST2Twov8b;
+ else if (Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::ST2Twov16b;
+ else if (Ty == LLT::fixed_vector(4, S16))
+ Opc = AArch64::ST2Twov4h;
+ else if (Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::ST2Twov8h;
+ else if (Ty == LLT::fixed_vector(2, S32))
+ Opc = AArch64::ST2Twov2s;
+ else if (Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::ST2Twov4s;
+ else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+ Opc = AArch64::ST2Twov2d;
+ else if (Ty == S64 || Ty == P0)
+ Opc = AArch64::ST1Twov1d;
+ else
+ llvm_unreachable("Unexpected type for st2!");
+ SmallVector<Register, 2> Regs = {Src1, Src2};
+ Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
+ : createDTuple(Regs, MIB);
+ auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
+ Store.cloneMemRefs(I);
+ constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
+ break;
+ }
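// Illustrative sketch (IR types and registers are hypothetical): a call like
//   @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %a, <4 x i32> %b, i8* %p)
// arrives here with two v4s32 sources, selects ST2Twov4s, packs %a/%b into a
// Q-register tuple via createQTuple, and is emitted as
//   st2 { v0.4s, v1.4s }, [x0]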
}
I.eraseFromParent();
@@ -4942,7 +5103,6 @@
unsigned IntrinID = findIntrinsicID(I);
if (!IntrinID)
return false;
- MachineIRBuilder MIRBuilder(I);
switch (IntrinID) {
default:
@@ -4960,7 +5120,7 @@
// the source and destination if they are on GPRs.
if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
- MIRBuilder.buildCopy({SrcReg}, {I.getOperand(2)});
+ MIB.buildCopy({SrcReg}, {I.getOperand(2)});
// Make sure the copy ends up getting constrained properly.
RBI.constrainGenericRegister(I.getOperand(2).getReg(),
@@ -4971,14 +5131,14 @@
DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
// Actually insert the instruction.
- auto SHA1Inst = MIRBuilder.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
+ auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
// Did we create a new register for the destination?
if (DstReg != I.getOperand(0).getReg()) {
// Yep. Copy the result of the instruction back into the original
// destination.
- MIRBuilder.buildCopy({I.getOperand(0)}, {DstReg});
+ MIB.buildCopy({I.getOperand(0)}, {DstReg});
RBI.constrainGenericRegister(I.getOperand(0).getReg(),
AArch64::GPR32RegClass, MRI);
}
@@ -5005,11 +5165,11 @@
}
if (STI.hasPAuth()) {
- MIRBuilder.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
+ MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
} else {
- MIRBuilder.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
- MIRBuilder.buildInstr(AArch64::XPACLRI);
- MIRBuilder.buildCopy({DstReg}, {Register(AArch64::LR)});
+ MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
+ MIB.buildInstr(AArch64::XPACLRI);
+ MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
}
I.eraseFromParent();
@@ -5021,31 +5181,42 @@
while (Depth--) {
Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
auto Ldr =
- MIRBuilder.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr})
- .addImm(0);
+ MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
FrameAddr = NextFrame;
}
if (IntrinID == Intrinsic::frameaddress)
- MIRBuilder.buildCopy({DstReg}, {FrameAddr});
+ MIB.buildCopy({DstReg}, {FrameAddr});
else {
MFI.setReturnAddressIsTaken(true);
if (STI.hasPAuth()) {
Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
- MIRBuilder.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
- MIRBuilder.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
+ MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
+ MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
} else {
- MIRBuilder.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}).addImm(1);
- MIRBuilder.buildInstr(AArch64::XPACLRI);
- MIRBuilder.buildCopy({DstReg}, {Register(AArch64::LR)});
+ MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
+ .addImm(1);
+ MIB.buildInstr(AArch64::XPACLRI);
+ MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
}
}
I.eraseFromParent();
return true;
}
+ case Intrinsic::swift_async_context_addr:
+ auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
+ {Register(AArch64::FP)})
+ .addImm(8)
+ .addImm(0);
+ constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
+
+ MF->getFrameInfo().setFrameAddressIsTaken(true);
+ MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
+ I.eraseFromParent();
+ return true;
}
return false;
}
@@ -5168,7 +5339,7 @@
// Always fold if there is one use, or if we're optimizing for size.
Register DefReg = MI.getOperand(0).getReg();
if (MRI.hasOneNonDBGUse(DefReg) ||
- MI.getParent()->getParent()->getFunction().hasMinSize())
+ MI.getParent()->getParent()->getFunction().hasOptSize())
return true;
// It's better to avoid folding and recomputing shifts when we don't have a
@@ -5577,8 +5748,10 @@
return None;
// TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
- // TODO: Need to check GV's offset % size if doing offset folding into globals.
- assert(Adrp.getOperand(1).getOffset() == 0 && "Unexpected offset in global");
+ auto Offset = Adrp.getOperand(1).getOffset();
+ if (Offset % Size != 0)
+ return None;
+
auto GV = Adrp.getOperand(1).getGlobal();
if (GV->isThreadLocal())
return None;
@@ -5592,7 +5765,7 @@
Register AdrpReg = Adrp.getOperand(0).getReg();
return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
[=](MachineInstrBuilder &MIB) {
- MIB.addGlobalAddress(GV, /* Offset */ 0,
+ MIB.addGlobalAddress(GV, Offset,
OpFlags | AArch64II::MO_PAGEOFF |
AArch64II::MO_NC);
}}};
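// Sketch of the intended behaviour (offsets are hypothetical): with the check
// above, a G_GLOBAL_VALUE of @g + 8 feeding a 4-byte access can fold the
// offset into the :lo12: page-offset operand because 8 % 4 == 0, while
// @g + 6 returns None and keeps the separate G_ADD_LOW addition.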
@@ -5736,9 +5909,9 @@
assert(Size != 64 && "Extend from 64 bits?");
switch (Size) {
case 8:
- return AArch64_AM::SXTB;
+ return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
case 16:
- return AArch64_AM::SXTH;
+ return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
case 32:
return AArch64_AM::SXTW;
default:
@@ -5751,9 +5924,9 @@
assert(Size != 64 && "Extend from 64 bits?");
switch (Size) {
case 8:
- return AArch64_AM::UXTB;
+ return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
case 16:
- return AArch64_AM::UXTH;
+ return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
case 32:
return AArch64_AM::UXTW;
default:
@@ -5895,6 +6068,33 @@
MIB.addImm(Enc);
}
+void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
+ "Expected G_FCONSTANT");
+ MIB.addImm(
+ AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
+}
+
+void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
+ "Expected G_FCONSTANT");
+ MIB.addImm(
+ AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
+}
+
+void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
+ "Expected G_FCONSTANT");
+ MIB.addImm(
+ AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
+}
+
bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
const MachineInstr &MI, unsigned NumBytes) const {
if (!MI.mayLoadOrStore())
@@ -5946,7 +6146,14 @@
// Insert a cross-bank copy.
auto *OpDef = MRI.getVRegDef(OpReg);
const LLT &Ty = MRI.getType(OpReg);
- MIB.setInsertPt(*OpDef->getParent(), std::next(OpDef->getIterator()));
+ MachineBasicBlock &OpDefBB = *OpDef->getParent();
+
+ // Any instruction we insert must appear after all PHIs in the block
+ // for the block to be valid MIR.
+ MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
+ if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
+ InsertPt = OpDefBB.getFirstNonPHI();
+ MIB.setInsertPt(*OpDef->getParent(), InsertPt);
auto Copy = MIB.buildCopy(Ty, OpReg);
MRI.setRegBank(Copy.getReg(0), *DstRB);
MO.setReg(Copy.getReg(0));
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 5a6c904..08e4a11 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64LegalizerInfo.h"
+#include "AArch64RegisterBankInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
@@ -22,9 +23,10 @@
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
-#include <initializer_list>
#include "llvm/Support/MathExtras.h"
+#include <initializer_list>
#define DEBUG_TYPE "aarch64-legalinfo"
@@ -44,17 +46,16 @@
const LLT s64 = LLT::scalar(64);
const LLT s128 = LLT::scalar(128);
const LLT s256 = LLT::scalar(256);
- const LLT s512 = LLT::scalar(512);
- const LLT v16s8 = LLT::vector(16, 8);
- const LLT v8s8 = LLT::vector(8, 8);
- const LLT v4s8 = LLT::vector(4, 8);
- const LLT v8s16 = LLT::vector(8, 16);
- const LLT v4s16 = LLT::vector(4, 16);
- const LLT v2s16 = LLT::vector(2, 16);
- const LLT v2s32 = LLT::vector(2, 32);
- const LLT v4s32 = LLT::vector(4, 32);
- const LLT v2s64 = LLT::vector(2, 64);
- const LLT v2p0 = LLT::vector(2, p0);
+ const LLT v16s8 = LLT::fixed_vector(16, 8);
+ const LLT v8s8 = LLT::fixed_vector(8, 8);
+ const LLT v4s8 = LLT::fixed_vector(4, 8);
+ const LLT v8s16 = LLT::fixed_vector(8, 16);
+ const LLT v4s16 = LLT::fixed_vector(4, 16);
+ const LLT v2s16 = LLT::fixed_vector(2, 16);
+ const LLT v2s32 = LLT::fixed_vector(2, 32);
+ const LLT v4s32 = LLT::fixed_vector(4, 32);
+ const LLT v2s64 = LLT::fixed_vector(2, 64);
+ const LLT v2p0 = LLT::fixed_vector(2, p0);
std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
v16s8, v8s16, v4s32,
@@ -67,7 +68,7 @@
// FIXME: support subtargets which have neon/fp-armv8 disabled.
if (!ST.hasNEON() || !ST.hasFPARMv8()) {
- computeTables();
+ getLegacyLegalizerInfo().computeTables();
return;
}
@@ -79,7 +80,7 @@
getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
.legalFor({p0, s1, s8, s16, s32, s64})
.legalFor(PackedVectorAllTypeList)
- .clampScalar(0, s1, s64)
+ .clampScalar(0, s8, s64)
.widenScalarToNextPow2(0, 8)
.fewerElementsIf(
[=](const LegalityQuery &Query) {
@@ -90,7 +91,7 @@
[=](const LegalityQuery &Query) {
LLT EltTy = Query.Types[0].getElementType();
if (EltTy == s64)
- return std::make_pair(0, LLT::vector(2, 64));
+ return std::make_pair(0, LLT::fixed_vector(2, 64));
return std::make_pair(0, EltTy);
});
@@ -102,7 +103,8 @@
getActionDefinitionsBuilder(G_BSWAP)
.legalFor({s32, s64, v4s32, v2s32, v2s64})
.clampScalar(0, s32, s64)
- .widenScalarToNextPow2(0);
+ .widenScalarToNextPow2(0)
+ .customIf(typeIs(0, v2s16)); // custom lower as G_REV32 + G_LSHR
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
.legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
@@ -137,9 +139,9 @@
{v4s32, v4s32},
{v2s64, v2s64},
})
+ .widenScalarToNextPow2(0)
.clampScalar(1, s32, s64)
.clampScalar(0, s32, s64)
- .widenScalarToNextPow2(0)
.clampNumElements(0, v2s32, v4s32)
.clampNumElements(0, v2s64, v2s64)
.moreElementsToNextPow2(0)
@@ -158,17 +160,28 @@
.widenScalarToNextPow2(0)
.scalarize(0);
- getActionDefinitionsBuilder({G_SREM, G_UREM})
+ getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
.lowerFor({s1, s8, s16, s32, s64});
getActionDefinitionsBuilder({G_SMULO, G_UMULO}).lowerFor({{s64, s1}});
getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64});
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
+ .clampNumElements(0, v8s8, v16s8)
+ .clampNumElements(0, v4s16, v8s16)
+ .clampNumElements(0, v2s32, v4s32)
+ // FIXME: This shouldn't be needed as v2s64 types are going to
+ // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
+ .clampNumElements(0, v2s64, v2s64)
+ .lower();
+
getActionDefinitionsBuilder(
- {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
+ {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
.legalFor({{s32, s1}, {s64, s1}})
- .minScalar(0, s32);
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0);
getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
.legalFor({s32, s64, v2s64, v4s32, v2s32})
@@ -245,15 +258,16 @@
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
- .legalForTypesWithMemDesc({{s32, p0, 8, 8},
- {s32, p0, 16, 8},
- {s32, p0, 32, 8},
- {s64, p0, 8, 2},
- {s64, p0, 16, 2},
- {s64, p0, 32, 4},
- {s64, p0, 64, 8},
- {p0, p0, 64, 8},
- {v2s32, p0, 64, 8}})
+ .lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
+ .legalForTypesWithMemDesc({{s32, p0, s8, 8},
+ {s32, p0, s16, 8},
+ {s32, p0, s32, 8},
+ {s64, p0, s8, 2},
+ {s64, p0, s16, 2},
+ {s64, p0, s32, 4},
+ {s64, p0, s64, 8},
+ {p0, p0, s64, 8},
+ {v2s32, p0, s64, 8}})
.clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
// TODO: We could support sum-of-pow2's but the lowering code doesn't know
@@ -271,57 +285,77 @@
};
getActionDefinitionsBuilder(G_LOAD)
- .legalForTypesWithMemDesc({{s8, p0, 8, 8},
- {s16, p0, 16, 8},
- {s32, p0, 32, 8},
- {s64, p0, 64, 8},
- {p0, p0, 64, 8},
- {s128, p0, 128, 8},
- {v8s8, p0, 64, 8},
- {v16s8, p0, 128, 8},
- {v4s16, p0, 64, 8},
- {v8s16, p0, 128, 8},
- {v2s32, p0, 64, 8},
- {v4s32, p0, 128, 8},
- {v2s64, p0, 128, 8}})
+ .legalForTypesWithMemDesc({{s8, p0, s8, 8},
+ {s16, p0, s16, 8},
+ {s32, p0, s32, 8},
+ {s64, p0, s64, 8},
+ {p0, p0, s64, 8},
+ {s128, p0, s128, 8},
+ {v8s8, p0, s64, 8},
+ {v16s8, p0, s128, 8},
+ {v4s16, p0, s64, 8},
+ {v8s16, p0, s128, 8},
+ {v2s32, p0, s64, 8},
+ {v4s32, p0, s128, 8},
+ {v2s64, p0, s128, 8}})
// These extends are also legal
- .legalForTypesWithMemDesc({{s32, p0, 8, 8}, {s32, p0, 16, 8}})
+ .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
.clampScalar(0, s8, s64)
.lowerIfMemSizeNotPow2()
+ .widenScalarToNextPow2(0)
+ .narrowScalarIf([=](const LegalityQuery &Query) {
+ // Clamp extending load results to 32-bits.
+ return Query.Types[0].isScalar() &&
+ Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
+ Query.Types[0].getSizeInBits() > 32;
+ },
+ changeTo(0, s32))
// Lower any any-extending loads left into G_ANYEXT and G_LOAD
.lowerIf([=](const LegalityQuery &Query) {
- return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
+ return Query.Types[0] != Query.MMODescrs[0].MemoryTy;
})
- .widenScalarToNextPow2(0)
- .clampMaxNumElements(0, s32, 2)
- .clampMaxNumElements(0, s64, 1)
- .customIf(IsPtrVecPred);
+ .clampMaxNumElements(0, s8, 16)
+ .clampMaxNumElements(0, s16, 8)
+ .clampMaxNumElements(0, s32, 4)
+ .clampMaxNumElements(0, s64, 2)
+ .customIf(IsPtrVecPred)
+ .scalarizeIf(typeIs(0, v2s16), 0);
getActionDefinitionsBuilder(G_STORE)
- .legalForTypesWithMemDesc({{s8, p0, 8, 8},
- {s16, p0, 16, 8},
- {s32, p0, 8, 8},
- {s32, p0, 16, 8},
- {s32, p0, 32, 8},
- {s64, p0, 64, 8},
- {p0, p0, 64, 8},
- {s128, p0, 128, 8},
- {v16s8, p0, 128, 8},
- {v8s8, p0, 64, 8},
- {v4s16, p0, 64, 8},
- {v8s16, p0, 128, 8},
- {v2s32, p0, 64, 8},
- {v4s32, p0, 128, 8},
- {v2s64, p0, 128, 8}})
+ .legalForTypesWithMemDesc({{s8, p0, s8, 8},
+ {s16, p0, s8, 8}, // truncstorei8 from s16
+ {s32, p0, s8, 8}, // truncstorei8 from s32
+ {s64, p0, s8, 8}, // truncstorei8 from s64
+ {s16, p0, s16, 8},
+ {s32, p0, s16, 8}, // truncstorei16 from s32
+ {s64, p0, s16, 8}, // truncstorei16 from s64
+ {s32, p0, s8, 8},
+ {s32, p0, s16, 8},
+ {s32, p0, s32, 8},
+ {s64, p0, s64, 8},
+ {s64, p0, s32, 8}, // truncstorei32 from s64
+ {p0, p0, s64, 8},
+ {s128, p0, s128, 8},
+ {v16s8, p0, s128, 8},
+ {v8s8, p0, s64, 8},
+ {v4s16, p0, s64, 8},
+ {v8s16, p0, s128, 8},
+ {v2s32, p0, s64, 8},
+ {v4s32, p0, s128, 8},
+ {v2s64, p0, s128, 8}})
.clampScalar(0, s8, s64)
- .lowerIfMemSizeNotPow2()
.lowerIf([=](const LegalityQuery &Query) {
return Query.Types[0].isScalar() &&
- Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
+ Query.Types[0] != Query.MMODescrs[0].MemoryTy;
})
- .clampMaxNumElements(0, s32, 2)
- .clampMaxNumElements(0, s64, 1)
- .customIf(IsPtrVecPred);
+ // Maximum: sN * k = 128
+ .clampMaxNumElements(0, s8, 16)
+ .clampMaxNumElements(0, s16, 8)
+ .clampMaxNumElements(0, s32, 4)
+ .clampMaxNumElements(0, s64, 2)
+ .lowerIfMemSizeNotPow2()
+ .customIf(IsPtrVecPred)
+ .scalarizeIf(typeIs(0, v2s16), 0);
// Constants
getActionDefinitionsBuilder(G_CONSTANT)
@@ -465,7 +499,7 @@
.unsupportedIf([&](const LegalityQuery &Query) {
return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
})
- .legalFor({{p0, s64}});
+ .legalFor({{p0, s64}, {v2p0, v2s64}});
// Casts for 32 and 64-bit width type are just copies.
// Same for 128-bit width type, except they are on the FPR bank.
@@ -486,20 +520,23 @@
.clampScalar(0, s8, s64)
.widenScalarToNextPow2(0, /*Min*/ 8);
- if (ST.hasLSE()) {
- getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
- .lowerIf(all(
- typeInSet(0, {s8, s16, s32, s64}), typeIs(1, s1), typeIs(2, p0),
- atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic)));
+ getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
+ .lowerIf(
+ all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, s1), typeIs(2, p0)));
- getActionDefinitionsBuilder(
- {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
- G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
- G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX, G_ATOMIC_CMPXCHG})
- .legalIf(all(
- typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0),
- atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic)));
- }
+ getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
+ .customIf([](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() == 128;
+ })
+ .clampScalar(0, s32, s64)
+ .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
+
+ getActionDefinitionsBuilder(
+ {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
+ G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
+ G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
+ .clampScalar(0, s32, s64)
+ .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
@@ -536,9 +573,8 @@
.fewerElementsIf(
[=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
scalarize(1))
- // Clamp the big scalar to s8-s512 and make it either a power of 2, 192,
- // or 384.
- .clampScalar(BigTyIdx, s8, s512)
+ // Clamp the big scalar to s8-s128 and make it a power of 2.
+ .clampScalar(BigTyIdx, s8, s128)
.widenScalarIf(
[=](const LegalityQuery &Query) {
const LLT &Ty = Query.Types[BigTyIdx];
@@ -614,7 +650,11 @@
return Query.Types[1].getNumElements() <= 16;
},
0, s8)
- .minScalarOrElt(0, s8); // Worst case, we need at least s8.
+ .minScalarOrElt(0, s8) // Worst case, we need at least s8.
+ .clampMaxNumElements(1, s64, 2)
+ .clampMaxNumElements(1, s32, 4)
+ .clampMaxNumElements(1, s16, 8)
+ .clampMaxNumElements(1, p0, 2);
getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
.legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64}));
@@ -622,6 +662,7 @@
getActionDefinitionsBuilder(G_BUILD_VECTOR)
.legalFor({{v8s8, s8},
{v16s8, s8},
+ {v2s16, s16},
{v4s16, s16},
{v8s16, s16},
{v2s32, s32},
@@ -630,18 +671,26 @@
{v2s64, s64}})
.clampNumElements(0, v4s32, v4s32)
.clampNumElements(0, v2s64, v2s64)
-
- // Deal with larger scalar types, which will be implicitly truncated.
- .legalIf([=](const LegalityQuery &Query) {
- return Query.Types[0].getScalarSizeInBits() <
- Query.Types[1].getSizeInBits();
- })
.minScalarSameAs(1, 0);
+ getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
+
getActionDefinitionsBuilder(G_CTLZ)
.legalForCartesianProduct(
{s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
.scalarize(1);
+ getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
+
+ // TODO: Custom lowering for v2s32, v4s32, v2s64.
+ getActionDefinitionsBuilder(G_BITREVERSE).legalFor({s32, s64, v8s8, v16s8});
+
+ getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
+
+ // TODO: Handle vector types.
+ getActionDefinitionsBuilder(G_CTTZ)
+ .clampScalar(0, s32, s64)
+ .scalarSameSizeAs(1, 0)
+ .customFor({s32, s64});
getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
.legalIf([=](const LegalityQuery &Query) {
@@ -662,6 +711,7 @@
.lowerIf([=](const LegalityQuery &Query) {
return !Query.Types[1].isVector();
})
+ .moreElementsToNextPow2(0)
.clampNumElements(0, v4s32, v4s32)
.clampNumElements(0, v2s64, v2s64);
@@ -676,21 +726,62 @@
getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();
- getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();
+ getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
+ .libcall();
- getActionDefinitionsBuilder(G_ABS).lowerIf(
- [=](const LegalityQuery &Query) { return Query.Types[0].isScalar(); });
+ // FIXME: Legal types are only legal with NEON.
+ getActionDefinitionsBuilder(G_ABS)
+ .lowerIf(isScalar(0))
+ .legalFor(PackedVectorAllTypeList);
getActionDefinitionsBuilder(G_VECREDUCE_FADD)
// We only have FADDP to do reduction-like operations. Lower the rest.
.legalFor({{s32, v2s32}, {s64, v2s64}})
+ .clampMaxNumElements(1, s64, 2)
+ .clampMaxNumElements(1, s32, 2)
.lower();
getActionDefinitionsBuilder(G_VECREDUCE_ADD)
- .legalFor({{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s64, v2s64}})
+ .legalFor(
+ {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
+ .clampMaxNumElements(1, s64, 2)
+ .clampMaxNumElements(1, s32, 4)
.lower();
- computeTables();
+ getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
+ .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });
+
+ getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower();
+
+ getActionDefinitionsBuilder(G_ROTR)
+ .legalFor({{s32, s64}, {s64, s64}})
+ .customIf([=](const LegalityQuery &Q) {
+ return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
+ })
+ .lower();
+ getActionDefinitionsBuilder(G_ROTL).lower();
+
+ getActionDefinitionsBuilder({G_SBFX, G_UBFX})
+ .customFor({{s32, s32}, {s64, s64}});
+
+ // TODO: Custom legalization for s128
+ // TODO: Use generic lowering when custom lowering is not possible.
+ auto always = [=](const LegalityQuery &Q) { return true; };
+ getActionDefinitionsBuilder(G_CTPOP)
+ .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
+ .clampScalar(0, s32, s128)
+ .widenScalarToNextPow2(0)
+ .minScalarEltSameAsIf(always, 1, 0)
+ .maxScalarEltSameAsIf(always, 1, 0)
+ .customFor({{s32, s32},
+ {s64, s64},
+ {v2s64, v2s64},
+ {v2s32, v2s32},
+ {v4s32, v4s32},
+ {v4s16, v4s16},
+ {v8s16, v8s16}});
+
+ getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
@@ -708,6 +799,8 @@
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE:
return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
+ case TargetOpcode::G_BSWAP:
+ return legalizeBSwap(MI, MRI, MIRBuilder);
case TargetOpcode::G_SHL:
case TargetOpcode::G_ASHR:
case TargetOpcode::G_LSHR:
@@ -716,11 +809,39 @@
return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
case TargetOpcode::G_TRUNC:
return legalizeVectorTrunc(MI, Helper);
+ case TargetOpcode::G_SBFX:
+ case TargetOpcode::G_UBFX:
+ return legalizeBitfieldExtract(MI, MRI, Helper);
+ case TargetOpcode::G_ROTR:
+ return legalizeRotate(MI, MRI, Helper);
+ case TargetOpcode::G_CTPOP:
+ return legalizeCTPOP(MI, MRI, Helper);
+ case TargetOpcode::G_ATOMIC_CMPXCHG:
+ return legalizeAtomicCmpxchg128(MI, MRI, Helper);
+ case TargetOpcode::G_CTTZ:
+ return legalizeCTTZ(MI, Helper);
}
llvm_unreachable("expected switch to return");
}
+bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ LegalizerHelper &Helper) const {
+ // To allow for imported patterns to match, we ensure that the rotate amount
+ // is 64b with an extension.
+ Register AmtReg = MI.getOperand(2).getReg();
+ LLT AmtTy = MRI.getType(AmtReg);
+ (void)AmtTy;
+ assert(AmtTy.isScalar() && "Expected a scalar rotate");
+ assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
+ auto NewAmt = Helper.MIRBuilder.buildSExt(LLT::scalar(64), AmtReg);
+ Helper.Observer.changingInstr(MI);
+ MI.getOperand(2).setReg(NewAmt.getReg(0));
+ Helper.Observer.changedInstr(MI);
+ return true;
+}
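// Illustrative sketch of the rewrite above (virtual register names are
// arbitrary):
//   %r:_(s32) = G_ROTR %x:_(s32), %amt:_(s32)
// becomes
//   %amt64:_(s64) = G_SEXT %amt
//   %r:_(s32) = G_ROTR %x, %amt64
// which matches the {s32, s64} / {s64, s64} legal set declared earlier.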
+
static void extractParts(Register Reg, MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
SmallVectorImpl<Register> &VRegs) {
@@ -749,7 +870,8 @@
isPowerOf2_32(SrcTy.getSizeInBits()));
// Split input type.
- LLT SplitSrcTy = SrcTy.changeNumElements(SrcTy.getNumElements() / 2);
+ LLT SplitSrcTy =
+ SrcTy.changeElementCount(SrcTy.getElementCount().divideCoefficientBy(2));
// First, split the source into two smaller vectors.
SmallVector<Register, 2> SplitSrcs;
extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);
@@ -776,7 +898,8 @@
// G_ADD_LOW instructions.
// By splitting this here, we can optimize accesses in the small code model by
// folding in the G_ADD_LOW into the load/store offset.
- auto GV = MI.getOperand(1).getGlobal();
+ auto &GlobalOp = MI.getOperand(1);
+ const auto* GV = GlobalOp.getGlobal();
if (GV->isThreadLocal())
return true; // Don't want to modify TLS vars.
@@ -786,9 +909,10 @@
if (OpFlags & AArch64II::MO_GOT)
return true;
+ auto Offset = GlobalOp.getOffset();
Register DstReg = MI.getOperand(0).getReg();
auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
- .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
+ .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
// Set the regclass on the dest reg too.
MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
@@ -806,6 +930,8 @@
// binary must also be loaded into address range [0, 2^48). Both of these
// properties need to be ensured at runtime when using tagged addresses.
if (OpFlags & AArch64II::MO_TAGGED) {
+ assert(!Offset &&
+ "Should not have folded in an offset for a tagged global!");
ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
.addGlobalAddress(GV, 0x100000000,
AArch64II::MO_PREL | AArch64II::MO_G3)
@@ -814,7 +940,7 @@
}
MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
- .addGlobalAddress(GV, 0,
+ .addGlobalAddress(GV, Offset,
OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
MI.eraseFromParent();
return true;
@@ -848,6 +974,8 @@
return true;
}
+// FIXME: This should be removed and replaced with the generic bitcast legalize
+// action.
bool AArch64LegalizerInfo::legalizeLoadStore(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer) const {
@@ -872,8 +1000,10 @@
}
unsigned PtrSize = ValTy.getElementType().getSizeInBits();
- const LLT NewTy = LLT::vector(ValTy.getNumElements(), PtrSize);
+ const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
auto &MMO = **MI.memoperands_begin();
+ MMO.setType(NewTy);
+
if (MI.getOpcode() == TargetOpcode::G_STORE) {
auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
@@ -885,6 +1015,46 @@
return true;
}
+bool AArch64LegalizerInfo::legalizeBSwap(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ assert(MI.getOpcode() == TargetOpcode::G_BSWAP);
+
+ // The <2 x half> case needs special lowering because there isn't an
+ // instruction that does that directly. Instead, we widen to <8 x i8>
+ // and emit a G_REV32 followed by a G_LSHR knowing that instruction selection
+ // will later match them as:
+ //
+ // rev32.8b v0, v0
+ // ushr.2s v0, v0, #16
+ //
+ // We could emit those here directly, but it seems better to keep things as
+ // generic as possible through legalization, and avoid committing layering
+ // violations by legalizing & selecting here at the same time.
+
+ Register ValReg = MI.getOperand(1).getReg();
+ assert(LLT::fixed_vector(2, 16) == MRI.getType(ValReg));
+ const LLT v2s32 = LLT::fixed_vector(2, 32);
+ const LLT v8s8 = LLT::fixed_vector(8, 8);
+ const LLT s32 = LLT::scalar(32);
+
+ auto Undef = MIRBuilder.buildUndef(v8s8);
+ auto Insert =
+ MIRBuilder
+ .buildInstr(TargetOpcode::INSERT_SUBREG, {v8s8}, {Undef, ValReg})
+ .addImm(AArch64::ssub);
+ auto Rev32 = MIRBuilder.buildInstr(AArch64::G_REV32, {v8s8}, {Insert});
+ auto Bitcast = MIRBuilder.buildBitcast(v2s32, Rev32);
+ auto Amt = MIRBuilder.buildConstant(v2s32, 16);
+ auto UShr =
+ MIRBuilder.buildInstr(TargetOpcode::G_LSHR, {v2s32}, {Bitcast, Amt});
+ auto Zero = MIRBuilder.buildConstant(s32, 0);
+ auto Extract = MIRBuilder.buildExtractVectorElement(s32, UShr, Zero);
+ MIRBuilder.buildBitcast({MI.getOperand(0).getReg()}, Extract);
+ MI.eraseFromParent();
+ return true;
+}
+
bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const {
@@ -901,7 +1071,7 @@
auto List = MIRBuilder.buildLoad(
PtrTy, ListPtr,
*MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
- PtrSize, PtrAlign));
+ PtrTy, PtrAlign));
MachineInstrBuilder DstPtr;
if (Alignment > PtrAlign) {
@@ -913,11 +1083,12 @@
} else
DstPtr = List;
- uint64_t ValSize = MRI.getType(Dst).getSizeInBits() / 8;
+ LLT ValTy = MRI.getType(Dst);
+ uint64_t ValSize = ValTy.getSizeInBits() / 8;
MIRBuilder.buildLoad(
Dst, DstPtr,
*MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
- ValSize, std::max(Alignment, PtrAlign)));
+ ValTy, std::max(Alignment, PtrAlign)));
auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
@@ -926,8 +1097,219 @@
MIRBuilder.buildStore(NewList, ListPtr,
*MF.getMachineMemOperand(MachinePointerInfo(),
MachineMemOperand::MOStore,
- PtrSize, PtrAlign));
+ PtrTy, PtrAlign));
MI.eraseFromParent();
return true;
}
+
+bool AArch64LegalizerInfo::legalizeBitfieldExtract(
+ MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
+ // Only legal if we can select immediate forms.
+ // TODO: Lower this otherwise.
+ return getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
+ getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
+}
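// Sketch of the constraint above (names are hypothetical): an instruction like
//   %r:_(s32) = G_UBFX %x, %lsb, %width
// is only accepted when %lsb and %width resolve (looking through copies) to
// G_CONSTANT defs, so selection can use the immediate bitfield-extract forms;
// otherwise legalization fails for now, as noted in the TODO.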
+
+bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ LegalizerHelper &Helper) const {
+ // While there is no integer popcount instruction, it can
+ // be more efficiently lowered to the following sequence that uses
+ // AdvSIMD registers/instructions as long as the copies to/from
+ // the AdvSIMD registers are cheap.
+ // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
+ // CNT V0.8B, V0.8B // 8xbyte pop-counts
+ // ADDV B0, V0.8B // sum 8xbyte pop-counts
+ // UMOV X0, V0.B[0] // copy byte result back to integer reg
+ //
+ // For 128 bit vector popcounts, we lower to the following sequence:
+ // cnt.16b v0, v0 // v8s16, v4s32, v2s64
+ // uaddlp.8h v0, v0 // v8s16, v4s32, v2s64
+ // uaddlp.4s v0, v0 // v4s32, v2s64
+ // uaddlp.2d v0, v0 // v2s64
+ //
+ // For 64 bit vector popcounts, we lower to the following sequence:
+ // cnt.8b v0, v0 // v4s16, v2s32
+ // uaddlp.4h v0, v0 // v4s16, v2s32
+ // uaddlp.2s v0, v0 // v2s32
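// Worked example for the scalar path (a sketch, input value chosen
// arbitrarily): ctpop(s32 0xF0F00F0F) is zero-extended to s64, bitcast to
// v8s8 with byte lanes {0x0F,0x0F,0xF0,0xF0,0,0,0,0}, CNT yields per-lane
// counts {4,4,4,4,0,0,0,0}, and uaddlv sums them to 16, the expected result.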
+
+ if (!ST->hasNEON() ||
+ MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Val = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(Val);
+
+ assert(Ty == MRI.getType(Dst) &&
+ "Expected src and dst to have the same type!");
+ unsigned Size = Ty.getSizeInBits();
+
+ // Pre-conditioning: widen Val up to the nearest vector type.
+ // s32,s64,v4s16,v2s32 -> v8i8
+ // v8s16,v4s32,v2s64 -> v16i8
+ LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
+ if (Ty.isScalar()) {
+ // TODO: Handle s128.
+ assert((Size == 32 || Size == 64) && "Expected only 32 or 64 bit scalars!");
+ if (Size == 32) {
+ Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
+ }
+ }
+ Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);
+
+ // Count bits in each byte-sized lane.
+ auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
+
+ // Sum across lanes.
+ Register HSum = CTPOP.getReg(0);
+ unsigned Opc;
+ SmallVector<LLT> HAddTys;
+ if (Ty.isScalar()) {
+ Opc = Intrinsic::aarch64_neon_uaddlv;
+ HAddTys.push_back(LLT::scalar(32));
+ } else if (Ty == LLT::fixed_vector(8, 16)) {
+ Opc = Intrinsic::aarch64_neon_uaddlp;
+ HAddTys.push_back(LLT::fixed_vector(8, 16));
+ } else if (Ty == LLT::fixed_vector(4, 32)) {
+ Opc = Intrinsic::aarch64_neon_uaddlp;
+ HAddTys.push_back(LLT::fixed_vector(8, 16));
+ HAddTys.push_back(LLT::fixed_vector(4, 32));
+ } else if (Ty == LLT::fixed_vector(2, 64)) {
+ Opc = Intrinsic::aarch64_neon_uaddlp;
+ HAddTys.push_back(LLT::fixed_vector(8, 16));
+ HAddTys.push_back(LLT::fixed_vector(4, 32));
+ HAddTys.push_back(LLT::fixed_vector(2, 64));
+ } else if (Ty == LLT::fixed_vector(4, 16)) {
+ Opc = Intrinsic::aarch64_neon_uaddlp;
+ HAddTys.push_back(LLT::fixed_vector(4, 16));
+ } else if (Ty == LLT::fixed_vector(2, 32)) {
+ Opc = Intrinsic::aarch64_neon_uaddlp;
+ HAddTys.push_back(LLT::fixed_vector(4, 16));
+ HAddTys.push_back(LLT::fixed_vector(2, 32));
+ } else
+ llvm_unreachable("unexpected vector shape");
+ MachineInstrBuilder UADD;
+ for (LLT HTy : HAddTys) {
+ UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /*HasSideEffects =*/false)
+ .addUse(HSum);
+ HSum = UADD.getReg(0);
+ }
+
+ // Post-conditioning.
+ if (Ty.isScalar() && Size == 64)
+ MIRBuilder.buildZExt(Dst, UADD);
+ else
+ UADD->getOperand(0).setReg(Dst);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
+ MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+ LLT s64 = LLT::scalar(64);
+ auto Addr = MI.getOperand(1).getReg();
+ auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
+ auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
+ auto DstLo = MRI.createGenericVirtualRegister(s64);
+ auto DstHi = MRI.createGenericVirtualRegister(s64);
+
+ MachineInstrBuilder CAS;
+ if (ST->hasLSE()) {
+ // We have 128-bit CASP instructions taking XSeqPair registers, which are
+ // s128. We need the merge/unmerge to bracket the expansion and pair up with
+ // the rest of the MIR so we must reassemble the extracted registers into a
+ // 128-bit known-regclass one with code like this:
+ //
+ // %in1 = REG_SEQUENCE Lo, Hi ; One for each input
+ // %out = CASP %in1, ...
+ // %OldLo = G_EXTRACT %out, 0
+ // %OldHi = G_EXTRACT %out, 64
+ auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
+ unsigned Opcode;
+ switch (Ordering) {
+ case AtomicOrdering::Acquire:
+ Opcode = AArch64::CASPAX;
+ break;
+ case AtomicOrdering::Release:
+ Opcode = AArch64::CASPLX;
+ break;
+ case AtomicOrdering::AcquireRelease:
+ case AtomicOrdering::SequentiallyConsistent:
+ Opcode = AArch64::CASPALX;
+ break;
+ default:
+ Opcode = AArch64::CASPX;
+ break;
+ }
+
+ LLT s128 = LLT::scalar(128);
+ auto CASDst = MRI.createGenericVirtualRegister(s128);
+ auto CASDesired = MRI.createGenericVirtualRegister(s128);
+ auto CASNew = MRI.createGenericVirtualRegister(s128);
+ MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
+ .addUse(DesiredI->getOperand(0).getReg())
+ .addImm(AArch64::sube64)
+ .addUse(DesiredI->getOperand(1).getReg())
+ .addImm(AArch64::subo64);
+ MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
+ .addUse(NewI->getOperand(0).getReg())
+ .addImm(AArch64::sube64)
+ .addUse(NewI->getOperand(1).getReg())
+ .addImm(AArch64::subo64);
+
+ CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});
+
+ MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
+ MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
+ } else {
+ // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
+ // can take arbitrary registers so it just has the normal GPR64 operands the
+ // rest of AArch64 is expecting.
+ auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
+ unsigned Opcode;
+ switch (Ordering) {
+ case AtomicOrdering::Acquire:
+ Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
+ break;
+ case AtomicOrdering::Release:
+ Opcode = AArch64::CMP_SWAP_128_RELEASE;
+ break;
+ case AtomicOrdering::AcquireRelease:
+ case AtomicOrdering::SequentiallyConsistent:
+ Opcode = AArch64::CMP_SWAP_128;
+ break;
+ default:
+ Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
+ break;
+ }
+
+ auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
+ {Addr, DesiredI->getOperand(0),
+ DesiredI->getOperand(1), NewI->getOperand(0),
+ NewI->getOperand(1)});
+ }
+
+ CAS.cloneMemRefs(MI);
+ constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
+ *MRI.getTargetRegisterInfo(),
+ *ST->getRegBankInfo());
+
+ MIRBuilder.buildMerge(MI.getOperand(0), {DstLo, DstHi});
+ MI.eraseFromParent();
+ return true;
+}
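// Illustrative sketch (register assignment is arbitrary): with LSE, a
// sequentially consistent 128-bit cmpxchg takes the CASPALX path above and
// eventually prints as something like
//   caspal x0, x1, x2, x3, [x4]
// where x0/x1 and x2/x3 are the low/high halves of the expected and new
// values packed into sequential register pairs by the REG_SEQUENCEs.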
+
+bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
+ LegalizerHelper &Helper) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+ LLT Ty = MRI.getType(MI.getOperand(1).getReg());
+ auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
+ MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
+ MI.eraseFromParent();
+ return true;
+}
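// Worked example for the identity used above, cttz(x) == ctlz(bitreverse(x)):
// for a 32-bit x = 0x00000008, bitreverse(x) = 0x10000000, whose count of
// leading zeros is 3, matching the 3 trailing zeros of 8.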
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 8217e37..78fc245 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -17,6 +17,7 @@
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
@@ -34,6 +35,8 @@
MachineInstr &MI) const override;
private:
+ bool legalizeBSwap(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const;
bool legalizeLoadStore(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -47,6 +50,15 @@
MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer) const;
bool legalizeVectorTrunc(MachineInstr &MI, LegalizerHelper &Helper) const;
+ bool legalizeBitfieldExtract(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LegalizerHelper &Helper) const;
+ bool legalizeRotate(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LegalizerHelper &Helper) const;
+ bool legalizeCTPOP(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LegalizerHelper &Helper) const;
+ bool legalizeAtomicCmpxchg128(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LegalizerHelper &Helper) const;
+ bool legalizeCTTZ(MachineInstr &MI, LegalizerHelper &Helper) const;
const AArch64Subtarget *ST;
};
} // End llvm namespace.
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
new file mode 100644
index 0000000..04bc913
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
@@ -0,0 +1,173 @@
+//=== lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does combining of machine instructions at the generic MI level,
+// before the legalizer.
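+// This is the -O0 variant of the AArch64 pre-legalizer combiner: it runs a
+// reduced set of combines and does not depend on the dominator tree or CSE
+// info.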
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64GlobalISelUtils.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "aarch64-O0-prelegalizer-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+class AArch64O0PreLegalizerCombinerHelperState {
+protected:
+ CombinerHelper &Helper;
+
+public:
+ AArch64O0PreLegalizerCombinerHelperState(CombinerHelper &Helper)
+ : Helper(Helper) {}
+};
+
+#define AARCH64O0PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AArch64GenO0PreLegalizeGICombiner.inc"
+#undef AARCH64O0PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AARCH64O0PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AArch64GenO0PreLegalizeGICombiner.inc"
+#undef AARCH64O0PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+
+class AArch64O0PreLegalizerCombinerInfo : public CombinerInfo {
+ GISelKnownBits *KB;
+ MachineDominatorTree *MDT;
+ AArch64GenO0PreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
+
+public:
+ AArch64O0PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+ GISelKnownBits *KB,
+ MachineDominatorTree *MDT)
+ : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
+ /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
+ KB(KB), MDT(MDT) {
+ if (!GeneratedRuleCfg.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
+ virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+ MachineIRBuilder &B) const override;
+};
+
+bool AArch64O0PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ CombinerHelper Helper(Observer, B, KB, MDT);
+ AArch64GenO0PreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper);
+
+ if (Generated.tryCombineAll(Observer, MI, B))
+ return true;
+
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ case TargetOpcode::G_CONCAT_VECTORS:
+ return Helper.tryCombineConcatVectors(MI);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return Helper.tryCombineShuffleVector(MI);
+ case TargetOpcode::G_MEMCPY_INLINE:
+ return Helper.tryEmitMemcpyInline(MI);
+ case TargetOpcode::G_MEMCPY:
+ case TargetOpcode::G_MEMMOVE:
+ case TargetOpcode::G_MEMSET: {
+ // At -O0, only inline memcpy-type calls whose length is at most 32 bytes.
+ unsigned MaxLen = 32;
+ // Try to inline the call; for G_MEMSET, fall back to emitting a bzero
+ // library call below where possible.
+ if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
+ return true;
+ if (Opc == TargetOpcode::G_MEMSET)
+ return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, EnableMinSize);
+ return false;
+ }
+ }
+
+ return false;
+}
+
+#define AARCH64O0PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AArch64GenO0PreLegalizeGICombiner.inc"
+#undef AARCH64O0PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
+// Pass boilerplate
+// ================
+
+class AArch64O0PreLegalizerCombiner : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AArch64O0PreLegalizerCombiner();
+
+ StringRef getPassName() const override {
+ return "AArch64O0PreLegalizerCombiner";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+} // end anonymous namespace
+
+void AArch64O0PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AArch64O0PreLegalizerCombiner::AArch64O0PreLegalizerCombiner()
+ : MachineFunctionPass(ID) {
+ initializeAArch64O0PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
+}
+
+bool AArch64O0PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ auto &TPC = getAnalysis<TargetPassConfig>();
+
+ const Function &F = MF.getFunction();
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ AArch64O0PreLegalizerCombinerInfo PCInfo(
+ /*EnableOpt*/ false, F.hasOptSize(), F.hasMinSize(), KB, nullptr /* MDT */);
+ Combiner C(PCInfo, &TPC);
+ return C.combineMachineInstrs(MF, nullptr /* CSEInfo */);
+}
+
+char AArch64O0PreLegalizerCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AArch64O0PreLegalizerCombiner, DEBUG_TYPE,
+ "Combine AArch64 machine instrs before legalization",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
+INITIALIZE_PASS_END(AArch64O0PreLegalizerCombiner, DEBUG_TYPE,
+ "Combine AArch64 machine instrs before legalization", false,
+ false)
+
+namespace llvm {
+FunctionPass *createAArch64O0PreLegalizerCombiner() {
+ return new AArch64O0PreLegalizerCombiner();
+}
+} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index fdd04cb..b700c37 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -23,7 +23,9 @@
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -36,6 +38,7 @@
#define DEBUG_TYPE "aarch64-postlegalizer-combiner"
using namespace llvm;
+using namespace MIPatternMatch;
/// This combine tries do what performExtractVectorEltCombine does in SDAG.
/// Rewrite for pairwise fadd pattern
@@ -155,8 +158,9 @@
// folded into madd or msub.
if (MRI.hasOneNonDBGUse(Dst)) {
MachineInstr &UseMI = *MRI.use_instr_begin(Dst);
- if (UseMI.getOpcode() == TargetOpcode::G_ADD ||
- UseMI.getOpcode() == TargetOpcode::G_SUB)
+ unsigned UseOpc = UseMI.getOpcode();
+ if (UseOpc == TargetOpcode::G_ADD || UseOpc == TargetOpcode::G_PTR_ADD ||
+ UseOpc == TargetOpcode::G_SUB)
return false;
}
}
@@ -237,6 +241,27 @@
return true;
}
+/// Try to fold a G_MERGE_VALUES of 2 s32 sources, where the second source
+/// is a zero, into a G_ZEXT of the first.
+bool matchFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ auto &Merge = cast<GMerge>(MI);
+ LLT SrcTy = MRI.getType(Merge.getSourceReg(0));
+ if (SrcTy != LLT::scalar(32) || Merge.getNumSources() != 2)
+ return false;
+ return mi_match(Merge.getSourceReg(1), MRI, m_SpecificICst(0));
+}
+
+void applyFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, GISelChangeObserver &Observer) {
+ // Mutate %d(s64) = G_MERGE_VALUES %a(s32), 0(s32)
+ // ->
+ // %d(s64) = G_ZEXT %a(s32)
+ Observer.changingInstr(MI);
+ MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
+ MI.RemoveOperand(2);
+ Observer.changedInstr(MI);
+}
+
#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
@@ -308,6 +333,8 @@
if (!IsOptNone) {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<GISelCSEAnalysisWrapperPass>();
+ AU.addPreserved<GISelCSEAnalysisWrapperPass>();
}
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -333,8 +360,11 @@
IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
AArch64PostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
F.hasMinSize(), KB, MDT);
+ GISelCSEAnalysisWrapper &Wrapper =
+ getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
+ auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
Combiner C(PCInfo, TPC);
- return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+ return C.combineMachineInstrs(MF, CSEInfo);
}
char AArch64PostLegalizerCombiner::ID = 0;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index a06ff4b..84ecb4b 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -19,9 +19,13 @@
///
//===----------------------------------------------------------------------===//
-#include "AArch64TargetMachine.h"
#include "AArch64GlobalISelUtils.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "GISel/AArch64LegalizerInfo.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "TargetInfo/AArch64TargetInfo.h"
+#include "Utils/AArch64BaseInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -33,8 +37,10 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#define DEBUG_TYPE "aarch64-postlegalizer-lowering"
@@ -176,6 +182,37 @@
return true;
}
+/// Helper function for matchINS.
+///
+/// \returns a value when \p M is an ins mask for \p NumInputElements.
+///
+/// First element of the returned pair is true when the produced
+/// G_INSERT_VECTOR_ELT destination should be the LHS of the G_SHUFFLE_VECTOR.
+///
+/// Second element is the destination lane for the G_INSERT_VECTOR_ELT.
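+///
+/// e.g. with 4 input elements, the mask <0, 1, 6, 3> keeps every LHS lane in
+/// place except lane 2, so this returns (true, 2): insert into lane 2 of the
+/// LHS (the inserted value being RHS element 2, i.e. mask index 6).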
+static Optional<std::pair<bool, int>> isINSMask(ArrayRef<int> M,
+ int NumInputElements) {
+ if (M.size() != static_cast<size_t>(NumInputElements))
+ return None;
+ int NumLHSMatch = 0, NumRHSMatch = 0;
+ int LastLHSMismatch = -1, LastRHSMismatch = -1;
+ for (int Idx = 0; Idx < NumInputElements; ++Idx) {
+ if (M[Idx] == -1) {
+ ++NumLHSMatch;
+ ++NumRHSMatch;
+ continue;
+ }
+ M[Idx] == Idx ? ++NumLHSMatch : LastLHSMismatch = Idx;
+ M[Idx] == Idx + NumInputElements ? ++NumRHSMatch : LastRHSMismatch = Idx;
+ }
+ const int NumNeededToMatch = NumInputElements - 1;
+ if (NumLHSMatch == NumNeededToMatch)
+ return std::make_pair(true, LastLHSMismatch);
+ if (NumRHSMatch == NumNeededToMatch)
+ return std::make_pair(false, LastRHSMismatch);
+ return None;
+}
+
/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with a
/// G_REV instruction. Returns the appropriate G_REV opcode in \p Opc.
static bool matchREV(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -378,6 +415,61 @@
return true;
}
+/// Match a G_SHUFFLE_VECTOR with a mask which corresponds to a
+/// G_INSERT_VECTOR_ELT and G_EXTRACT_VECTOR_ELT pair.
+///
+/// e.g.
+/// %shuf = G_SHUFFLE_VECTOR %left, %right, shufflemask(0, 0)
+///
+/// Can be represented as
+///
+/// %extract = G_EXTRACT_VECTOR_ELT %left, 0
+/// %ins = G_INSERT_VECTOR_ELT %left, %extract, 1
+///
+static bool matchINS(MachineInstr &MI, MachineRegisterInfo &MRI,
+ std::tuple<Register, int, Register, int> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ int NumElts = MRI.getType(Dst).getNumElements();
+ auto DstIsLeftAndDstLane = isINSMask(ShuffleMask, NumElts);
+ if (!DstIsLeftAndDstLane)
+ return false;
+ bool DstIsLeft;
+ int DstLane;
+ std::tie(DstIsLeft, DstLane) = *DstIsLeftAndDstLane;
+ Register Left = MI.getOperand(1).getReg();
+ Register Right = MI.getOperand(2).getReg();
+ Register DstVec = DstIsLeft ? Left : Right;
+ Register SrcVec = Left;
+
+ int SrcLane = ShuffleMask[DstLane];
+ if (SrcLane >= NumElts) {
+ SrcVec = Right;
+ SrcLane -= NumElts;
+ }
+
+ MatchInfo = std::make_tuple(DstVec, DstLane, SrcVec, SrcLane);
+ return true;
+}
+
+static bool applyINS(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &Builder,
+ std::tuple<Register, int, Register, int> &MatchInfo) {
+ Builder.setInstrAndDebugLoc(MI);
+ Register Dst = MI.getOperand(0).getReg();
+ auto ScalarTy = MRI.getType(Dst).getElementType();
+ Register DstVec, SrcVec;
+ int DstLane, SrcLane;
+ std::tie(DstVec, DstLane, SrcVec, SrcLane) = MatchInfo;
+ auto SrcCst = Builder.buildConstant(LLT::scalar(64), SrcLane);
+ auto Extract = Builder.buildExtractVectorElement(ScalarTy, SrcVec, SrcCst);
+ auto DstCst = Builder.buildConstant(LLT::scalar(64), DstLane);
+ Builder.buildInsertVectorElement(Dst, DstVec, Extract, DstCst);
+ MI.eraseFromParent();
+ return true;
+}
+
/// isVShiftRImm - Check if this is a valid vector for the immediate
/// operand of a vector shift right operation. The value must be in the range:
/// 1 <= Value <= ElementBits for a right shift.
@@ -385,7 +477,7 @@
int64_t &Cnt) {
assert(Ty.isVector() && "vector shift count is not a vector type");
MachineInstr *MI = MRI.getVRegDef(Reg);
- auto Cst = getBuildVectorConstantSplat(*MI, MRI);
+ auto Cst = getAArch64VectorSplatScalar(*MI, MRI);
if (!Cst)
return false;
Cnt = *Cst;
@@ -575,6 +667,8 @@
case 2:
if (ScalarSize == 64)
Opc = AArch64::G_DUPLANE64;
+ else if (ScalarSize == 32)
+ Opc = AArch64::G_DUPLANE32;
break;
case 4:
if (ScalarSize == 32)
@@ -602,14 +696,286 @@
bool applyDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, std::pair<unsigned, int> &MatchInfo) {
assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ Register Src1Reg = MI.getOperand(1).getReg();
+ const LLT SrcTy = MRI.getType(Src1Reg);
+
B.setInstrAndDebugLoc(MI);
auto Lane = B.buildConstant(LLT::scalar(64), MatchInfo.second);
- B.buildInstr(MatchInfo.first, {MI.getOperand(0).getReg()},
- {MI.getOperand(1).getReg(), Lane});
+
+ Register DupSrc = MI.getOperand(1).getReg();
+ // For types like <2 x s32>, we can use G_DUPLANE32, with a <4 x s32> source.
+ // To do this, we can use a G_CONCAT_VECTORS to do the widening.
+ if (SrcTy == LLT::fixed_vector(2, LLT::scalar(32))) {
+ assert(MRI.getType(MI.getOperand(0).getReg()).getNumElements() == 2 &&
+ "Unexpected dest elements");
+ auto Undef = B.buildUndef(SrcTy);
+ DupSrc = B.buildConcatVectors(
+ SrcTy.changeElementCount(ElementCount::getFixed(4)),
+ {Src1Reg, Undef.getReg(0)})
+ .getReg(0);
+ }
+ B.buildInstr(MatchInfo.first, {MI.getOperand(0).getReg()}, {DupSrc, Lane});
MI.eraseFromParent();
return true;
}
+static bool matchBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
+ auto Splat = getAArch64VectorSplat(MI, MRI);
+ if (!Splat)
+ return false;
+ if (Splat->isReg())
+ return true;
+ // Later, during selection, we'll try to match imported patterns using
+ // immAllOnesV and immAllZerosV. These require G_BUILD_VECTOR. Don't lower
+ // G_BUILD_VECTORs which could match those patterns.
+ int64_t Cst = Splat->getCst();
+ return (Cst != 0 && Cst != -1);
+}
+
+static bool applyBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(MI);
+ B.buildInstr(AArch64::G_DUP, {MI.getOperand(0).getReg()},
+ {MI.getOperand(1).getReg()});
+ MI.eraseFromParent();
+ return true;
+}
+
+/// \returns how many instructions would be saved by folding a G_ICMP's shift
+/// and/or extension operations.
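+///
+/// e.g. a compare operand defined by (and x, 0xff) can be folded into the
+/// compare as a uxtb extended-register operand (saving one instruction); an
+/// extend feeding a shift-by-4-or-less can be folded as an extend plus shift
+/// (saving two).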
+static unsigned getCmpOperandFoldingProfit(Register CmpOp,
+ const MachineRegisterInfo &MRI) {
+ // No instructions to save if there's more than one use or no uses.
+ if (!MRI.hasOneNonDBGUse(CmpOp))
+ return 0;
+
+ // FIXME: This is duplicated with the selector. (See: selectShiftedRegister)
+ auto IsSupportedExtend = [&](const MachineInstr &MI) {
+ if (MI.getOpcode() == TargetOpcode::G_SEXT_INREG)
+ return true;
+ if (MI.getOpcode() != TargetOpcode::G_AND)
+ return false;
+ auto ValAndVReg =
+ getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ if (!ValAndVReg)
+ return false;
+ uint64_t Mask = ValAndVReg->Value.getZExtValue();
+ return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
+ };
+
+ MachineInstr *Def = getDefIgnoringCopies(CmpOp, MRI);
+ if (IsSupportedExtend(*Def))
+ return 1;
+
+ unsigned Opc = Def->getOpcode();
+ if (Opc != TargetOpcode::G_SHL && Opc != TargetOpcode::G_ASHR &&
+ Opc != TargetOpcode::G_LSHR)
+ return 0;
+
+ auto MaybeShiftAmt =
+ getConstantVRegValWithLookThrough(Def->getOperand(2).getReg(), MRI);
+ if (!MaybeShiftAmt)
+ return 0;
+ uint64_t ShiftAmt = MaybeShiftAmt->Value.getZExtValue();
+ MachineInstr *ShiftLHS =
+ getDefIgnoringCopies(Def->getOperand(1).getReg(), MRI);
+
+ // Check if we can fold an extend and a shift.
+ // FIXME: This is duplicated with the selector. (See:
+ // selectArithExtendedRegister)
+ if (IsSupportedExtend(*ShiftLHS))
+ return (ShiftAmt <= 4) ? 2 : 1;
+
+ LLT Ty = MRI.getType(Def->getOperand(0).getReg());
+ if (Ty.isVector())
+ return 0;
+ unsigned ShiftSize = Ty.getSizeInBits();
+ if ((ShiftSize == 32 && ShiftAmt <= 31) ||
+ (ShiftSize == 64 && ShiftAmt <= 63))
+ return 1;
+ return 0;
+}
+
+/// \returns true if it would be profitable to swap the LHS and RHS of a G_ICMP
+/// instruction \p MI.
+static bool trySwapICmpOperands(MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ assert(MI.getOpcode() == TargetOpcode::G_ICMP);
+ // Swap the operands if it would introduce a profitable folding opportunity.
+ // (e.g. a shift + extend).
+ //
+ // For example:
+ // lsl w13, w11, #1
+ // cmp w13, w12
+ // can be turned into:
+ // cmp w12, w11, lsl #1
+
+ // Don't swap if there's a constant on the RHS, because we know we can fold
+ // that.
+ Register RHS = MI.getOperand(3).getReg();
+ auto RHSCst = getConstantVRegValWithLookThrough(RHS, MRI);
+ if (RHSCst && isLegalArithImmed(RHSCst->Value.getSExtValue()))
+ return false;
+
+ Register LHS = MI.getOperand(2).getReg();
+ auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ auto GetRegForProfit = [&](Register Reg) {
+ MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ return isCMN(Def, Pred, MRI) ? Def->getOperand(2).getReg() : Reg;
+ };
+
+ // Don't have a constant on the RHS. If we swap the LHS and RHS of the
+ // compare, would we be able to fold more instructions?
+ Register TheLHS = GetRegForProfit(LHS);
+ Register TheRHS = GetRegForProfit(RHS);
+
+ // If the LHS is more likely to give us a folding opportunity, then swap the
+ // LHS and RHS.
+ return (getCmpOperandFoldingProfit(TheLHS, MRI) >
+ getCmpOperandFoldingProfit(TheRHS, MRI));
+}
+
+static bool applySwapICmpOperands(MachineInstr &MI,
+ GISelChangeObserver &Observer) {
+ auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ Register LHS = MI.getOperand(2).getReg();
+ Register RHS = MI.getOperand(3).getReg();
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setPredicate(CmpInst::getSwappedPredicate(Pred));
+ MI.getOperand(2).setReg(RHS);
+ MI.getOperand(3).setReg(LHS);
+ Observer.changedInstr(MI);
+ return true;
+}
+
+/// \returns a function which builds a vector floating point compare instruction
+/// for a condition code \p CC.
+/// \param [in] IsZero - True if the comparison is against 0.
+/// \param [in] NoNans - True if the target has NoNansFPMath.
+static std::function<Register(MachineIRBuilder &)>
+getVectorFCMP(AArch64CC::CondCode CC, Register LHS, Register RHS, bool IsZero,
+ bool NoNans, MachineRegisterInfo &MRI) {
+ LLT DstTy = MRI.getType(LHS);
+ assert(DstTy.isVector() && "Expected vector types only?");
+ assert(DstTy == MRI.getType(RHS) && "Src and Dst types must match!");
+ switch (CC) {
+ default:
+ llvm_unreachable("Unexpected condition code!");
+ case AArch64CC::NE:
+ return [LHS, RHS, IsZero, DstTy](MachineIRBuilder &MIB) {
+ auto FCmp = IsZero
+ ? MIB.buildInstr(AArch64::G_FCMEQZ, {DstTy}, {LHS})
+ : MIB.buildInstr(AArch64::G_FCMEQ, {DstTy}, {LHS, RHS});
+ return MIB.buildNot(DstTy, FCmp).getReg(0);
+ };
+ case AArch64CC::EQ:
+ return [LHS, RHS, IsZero, DstTy](MachineIRBuilder &MIB) {
+ return IsZero
+ ? MIB.buildInstr(AArch64::G_FCMEQZ, {DstTy}, {LHS}).getReg(0)
+ : MIB.buildInstr(AArch64::G_FCMEQ, {DstTy}, {LHS, RHS})
+ .getReg(0);
+ };
+ case AArch64CC::GE:
+ return [LHS, RHS, IsZero, DstTy](MachineIRBuilder &MIB) {
+ return IsZero
+ ? MIB.buildInstr(AArch64::G_FCMGEZ, {DstTy}, {LHS}).getReg(0)
+ : MIB.buildInstr(AArch64::G_FCMGE, {DstTy}, {LHS, RHS})
+ .getReg(0);
+ };
+ case AArch64CC::GT:
+ return [LHS, RHS, IsZero, DstTy](MachineIRBuilder &MIB) {
+ return IsZero
+ ? MIB.buildInstr(AArch64::G_FCMGTZ, {DstTy}, {LHS}).getReg(0)
+ : MIB.buildInstr(AArch64::G_FCMGT, {DstTy}, {LHS, RHS})
+ .getReg(0);
+ };
+ case AArch64CC::LS:
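+ // There is no register-register FCMLE/FCMLT (only the compare-against-zero
+ // forms G_FCMLEZ/G_FCMLTZ); LS and MI are handled by swapping the operands
+ // of FCMGE/FCMGT.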
+ return [LHS, RHS, IsZero, DstTy](MachineIRBuilder &MIB) {
+ return IsZero
+ ? MIB.buildInstr(AArch64::G_FCMLEZ, {DstTy}, {LHS}).getReg(0)
+ : MIB.buildInstr(AArch64::G_FCMGE, {DstTy}, {RHS, LHS})
+ .getReg(0);
+ };
+ case AArch64CC::MI:
+ return [LHS, RHS, IsZero, DstTy](MachineIRBuilder &MIB) {
+ return IsZero
+ ? MIB.buildInstr(AArch64::G_FCMLTZ, {DstTy}, {LHS}).getReg(0)
+ : MIB.buildInstr(AArch64::G_FCMGT, {DstTy}, {RHS, LHS})
+ .getReg(0);
+ };
+ }
+}
+
+/// Try to lower a vector G_FCMP \p MI into an AArch64-specific pseudo.
+static bool lowerVectorFCMP(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIB) {
+ assert(MI.getOpcode() == TargetOpcode::G_FCMP);
+ const auto &ST = MI.getMF()->getSubtarget<AArch64Subtarget>();
+ Register Dst = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ if (!DstTy.isVector() || !ST.hasNEON())
+ return false;
+ const auto Pred =
+ static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ Register LHS = MI.getOperand(2).getReg();
+ // TODO: Handle v4s16 case.
+ unsigned EltSize = MRI.getType(LHS).getScalarSizeInBits();
+ if (EltSize != 32 && EltSize != 64)
+ return false;
+ Register RHS = MI.getOperand(3).getReg();
+ auto Splat = getAArch64VectorSplat(*MRI.getVRegDef(RHS), MRI);
+
+ // Compares against 0 have special target-specific pseudos.
+ bool IsZero = Splat && Splat->isCst() && Splat->getCst() == 0;
+ bool Invert;
+ AArch64CC::CondCode CC, CC2;
+ changeVectorFCMPPredToAArch64CC(Pred, CC, CC2, Invert);
+ bool NoNans = ST.getTargetLowering()->getTargetMachine().Options.NoNaNsFPMath;
+
+ // Instead of having an apply function, just build here to simplify things.
+ MIB.setInstrAndDebugLoc(MI);
+ auto Cmp = getVectorFCMP(CC, LHS, RHS, IsZero, NoNans, MRI);
+ Register CmpRes;
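+ // Predicates that decompose into two condition codes (CC2 != AL) are
+ // lowered as two compares ORed together.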
+ if (CC2 == AArch64CC::AL)
+ CmpRes = Cmp(MIB);
+ else {
+ auto Cmp2 = getVectorFCMP(CC2, LHS, RHS, IsZero, NoNans, MRI);
+ auto Cmp2Dst = Cmp2(MIB);
+ auto Cmp1Dst = Cmp(MIB);
+ CmpRes = MIB.buildOr(DstTy, Cmp1Dst, Cmp2Dst).getReg(0);
+ }
+ if (Invert)
+ CmpRes = MIB.buildNot(DstTy, CmpRes).getReg(0);
+ MRI.replaceRegWith(Dst, CmpRes);
+ MI.eraseFromParent();
+ return false;
+}
+
+static bool matchFormTruncstore(MachineInstr &MI, MachineRegisterInfo &MRI,
+ Register &SrcReg) {
+ assert(MI.getOpcode() == TargetOpcode::G_STORE);
+ Register DstReg = MI.getOperand(0).getReg();
+ if (MRI.getType(DstReg).isVector())
+ return false;
+ // Match a store of a truncate.
+ if (!mi_match(DstReg, MRI, m_GTrunc(m_Reg(SrcReg))))
+ return false;
+ // Only form truncstores for value types of max 64b.
+ return MRI.getType(SrcReg).getSizeInBits() <= 64;
+}
+
+static bool applyFormTruncstore(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ GISelChangeObserver &Observer,
+ Register &SrcReg) {
+ assert(MI.getOpcode() == TargetOpcode::G_STORE);
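+ // Store the wider source register directly; the MMO keeps the narrow memory
+ // size, so this becomes a truncating store.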
+ Observer.changingInstr(MI);
+ MI.getOperand(0).setReg(SrcReg);
+ Observer.changedInstr(MI);
+ return true;
+}
+
#define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS
#include "AArch64GenPostLegalizeGILowering.inc"
#undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp
index 2f882ec..cc45c66 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp
@@ -14,6 +14,7 @@
#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -93,7 +94,12 @@
// Our solution here is to try to convert flag setting operations between
// a interval of identical FCMPs, so that CSE will be able to eliminate one.
bool Changed = false;
- const auto *TII = MBB.getParent()->getSubtarget().getInstrInfo();
+ auto &MF = *MBB.getParent();
+ auto &Subtarget = MF.getSubtarget();
+ const auto &TII = Subtarget.getInstrInfo();
+ auto TRI = Subtarget.getRegisterInfo();
+ auto RBI = Subtarget.getRegBankInfo();
+ auto &MRI = MF.getRegInfo();
// The first step is to find the first and last FCMPs. If we have found
// at least two, then set the limit of the bottom-up walk to the first FCMP
@@ -144,6 +150,11 @@
<< II);
II.setDesc(TII->get(NewOpc));
II.RemoveOperand(DeadNZCVIdx);
+ // Changing the opcode can result in differing regclass requirements,
+ // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
+ // Constrain the regclasses, possibly introducing a copy.
+ constrainOperandRegClass(MF, *TRI, MRI, *TII, *RBI, II, II.getDesc(),
+ II.getOperand(0), 0);
Changed |= true;
} else {
// Otherwise, we just set the nzcv imp-def operand to be dead, so the
@@ -169,7 +180,7 @@
bool Changed = false;
for (auto &BB : MF)
Changed |= optimizeNZCVDefs(BB);
- return true;
+ return Changed;
}
char AArch64PostSelectOptimize::ID = 0;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 5f9b64e..9efbcbb 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -11,15 +11,20 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "aarch64-prelegalizer-combiner"
@@ -53,6 +58,168 @@
MI.eraseFromParent();
}
+/// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
+/// are sign bits. In this case, we can transform the G_ICMP to directly compare
+/// the wide value with a zero.
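+///
+/// e.g. if %wide:s64 has at least 33 sign bits, then
+/// (G_ICMP eq (G_TRUNC %wide):s32, 0) can compare %wide against 0 directly.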
+static bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
+ GISelKnownBits *KB, Register &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_ICMP && KB);
+
+ auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
+ if (!ICmpInst::isEquality(Pred))
+ return false;
+
+ Register LHS = MI.getOperand(2).getReg();
+ LLT LHSTy = MRI.getType(LHS);
+ if (!LHSTy.isScalar())
+ return false;
+
+ Register RHS = MI.getOperand(3).getReg();
+ Register WideReg;
+
+ if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
+ !mi_match(RHS, MRI, m_SpecificICst(0)))
+ return false;
+
+ LLT WideTy = MRI.getType(WideReg);
+ if (KB->computeNumSignBits(WideReg) <=
+ WideTy.getSizeInBits() - LHSTy.getSizeInBits())
+ return false;
+
+ MatchInfo = WideReg;
+ return true;
+}
+
+static bool applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &Builder,
+ GISelChangeObserver &Observer,
+ Register &WideReg) {
+ assert(MI.getOpcode() == TargetOpcode::G_ICMP);
+
+ LLT WideTy = MRI.getType(WideReg);
+ // We're going to directly use the wide register as the LHS, and then use an
+ // equivalent size zero for RHS.
+ Builder.setInstrAndDebugLoc(MI);
+ auto WideZero = Builder.buildConstant(WideTy, 0);
+ Observer.changingInstr(MI);
+ MI.getOperand(2).setReg(WideReg);
+ MI.getOperand(3).setReg(WideZero.getReg(0));
+ Observer.changedInstr(MI);
+ return true;
+}
+
+/// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
+///
+/// e.g.
+///
+/// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
+static bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
+ std::pair<uint64_t, uint64_t> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
+ MachineFunction &MF = *MI.getMF();
+ auto &GlobalOp = MI.getOperand(1);
+ auto *GV = GlobalOp.getGlobal();
+ if (GV->isThreadLocal())
+ return false;
+
+ // Don't allow anything that could represent offsets etc.
+ if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
+ GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
+ return false;
+
+ // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
+ //
+ // %g = G_GLOBAL_VALUE @x
+ // %ptr1 = G_PTR_ADD %g, cst1
+ // %ptr2 = G_PTR_ADD %g, cst2
+ // ...
+ // %ptrN = G_PTR_ADD %g, cstN
+ //
+ // Identify the *smallest* constant. We want to be able to form this:
+ //
+ // %offset_g = G_GLOBAL_VALUE @x + min_cst
+ // %g = G_PTR_ADD %offset_g, -min_cst
+ // %ptr1 = G_PTR_ADD %g, cst1
+ // ...
+ Register Dst = MI.getOperand(0).getReg();
+ uint64_t MinOffset = -1ull;
+ for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
+ if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
+ return false;
+ auto Cst =
+ getConstantVRegValWithLookThrough(UseInstr.getOperand(2).getReg(), MRI);
+ if (!Cst)
+ return false;
+ MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
+ }
+
+ // Require that the new offset is larger than the existing one to avoid
+ // infinite loops.
+ uint64_t CurrOffset = GlobalOp.getOffset();
+ uint64_t NewOffset = MinOffset + CurrOffset;
+ if (NewOffset <= CurrOffset)
+ return false;
+
+ // Check whether folding this offset is legal. It must not go out of bounds of
+ // the referenced object to avoid violating the code model, and must be
+ // smaller than 2^21 because this is the largest offset expressible in all
+ // object formats.
+ //
+ // This check also prevents us from folding negative offsets, which will end
+ // up being treated in the same way as large positive ones. They could also
+ // cause code model violations, and aren't really common enough to matter.
+ if (NewOffset >= (1 << 21))
+ return false;
+
+ Type *T = GV->getValueType();
+ if (!T->isSized() ||
+ NewOffset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
+ return false;
+ MatchInfo = std::make_pair(NewOffset, MinOffset);
+ return true;
+}
+
+static bool applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ GISelChangeObserver &Observer,
+ std::pair<uint64_t, uint64_t> &MatchInfo) {
+ // Change:
+ //
+ // %g = G_GLOBAL_VALUE @x
+ // %ptr1 = G_PTR_ADD %g, cst1
+ // %ptr2 = G_PTR_ADD %g, cst2
+ // ...
+ // %ptrN = G_PTR_ADD %g, cstN
+ //
+ // To:
+ //
+ // %offset_g = G_GLOBAL_VALUE @x + min_cst
+ // %g = G_PTR_ADD %offset_g, -min_cst
+ // %ptr1 = G_PTR_ADD %g, cst1
+ // ...
+ // %ptrN = G_PTR_ADD %g, cstN
+ //
+ // Then, the original G_PTR_ADDs should be folded later on so that they look
+ // like this:
+ //
+ // %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
+ uint64_t Offset, MinOffset;
+ std::tie(Offset, MinOffset) = MatchInfo;
+ B.setInstrAndDebugLoc(MI);
+ Observer.changingInstr(MI);
+ auto &GlobalOp = MI.getOperand(1);
+ auto *GV = GlobalOp.getGlobal();
+ GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
+ Register Dst = MI.getOperand(0).getReg();
+ Register NewGVDst = MRI.cloneVirtualRegister(Dst);
+ MI.getOperand(0).setReg(NewGVDst);
+ Observer.changedInstr(MI);
+ B.buildPtrAdd(
+ Dst, NewGVDst,
+ B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
+ return true;
+}
+
class AArch64PreLegalizerCombinerHelperState {
protected:
CombinerHelper &Helper;
@@ -99,11 +266,14 @@
if (Generated.tryCombineAll(Observer, MI, B))
return true;
- switch (MI.getOpcode()) {
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
case TargetOpcode::G_CONCAT_VECTORS:
return Helper.tryCombineConcatVectors(MI);
case TargetOpcode::G_SHUFFLE_VECTOR:
return Helper.tryCombineShuffleVector(MI);
+ case TargetOpcode::G_MEMCPY_INLINE:
+ return Helper.tryEmitMemcpyInline(MI);
case TargetOpcode::G_MEMCPY:
case TargetOpcode::G_MEMMOVE:
case TargetOpcode::G_MEMSET: {
@@ -111,7 +281,11 @@
// heuristics decide.
unsigned MaxLen = EnableOpt ? 0 : 32;
// Try to inline memcpy type calls if optimizations are enabled.
- return !EnableMinSize ? Helper.tryCombineMemCpyFamily(MI, MaxLen) : false;
+ if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
+ return true;
+ if (Opc == TargetOpcode::G_MEMSET)
+ return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, EnableMinSize);
+ return false;
}
}
@@ -129,15 +303,13 @@
public:
static char ID;
- AArch64PreLegalizerCombiner(bool IsOptNone = false);
+ AArch64PreLegalizerCombiner();
StringRef getPassName() const override { return "AArch64PreLegalizerCombiner"; }
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
-private:
- bool IsOptNone;
};
} // end anonymous namespace
@@ -147,15 +319,15 @@
getSelectionDAGFallbackAnalysisUsage(AU);
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
- if (!IsOptNone) {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
- }
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<GISelCSEAnalysisWrapperPass>();
+ AU.addPreserved<GISelCSEAnalysisWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
-AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner(bool IsOptNone)
- : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
+ : MachineFunctionPass(ID) {
initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
@@ -163,17 +335,22 @@
if (MF.getProperties().hasProperty(
MachineFunctionProperties::Property::FailedISel))
return false;
- auto *TPC = &getAnalysis<TargetPassConfig>();
+ auto &TPC = getAnalysis<TargetPassConfig>();
+
+ // Enable CSE.
+ GISelCSEAnalysisWrapper &Wrapper =
+ getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
+ auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());
+
const Function &F = MF.getFunction();
bool EnableOpt =
MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
- MachineDominatorTree *MDT =
- IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
F.hasMinSize(), KB, MDT);
- Combiner C(PCInfo, TPC);
- return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+ Combiner C(PCInfo, &TPC);
+ return C.combineMachineInstrs(MF, CSEInfo);
}
char AArch64PreLegalizerCombiner::ID = 0;
@@ -182,13 +359,14 @@
false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
"Combine AArch64 machine instrs before legalization", false,
false)
namespace llvm {
-FunctionPass *createAArch64PreLegalizerCombiner(bool IsOptNone) {
- return new AArch64PreLegalizerCombiner(IsOptNone);
+FunctionPass *createAArch64PreLegalizerCombiner() {
+ return new AArch64PreLegalizerCombiner();
}
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index c76c433..8c34027 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -25,6 +26,7 @@
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
#include <cassert>
@@ -67,7 +69,7 @@
// GR64all + its subclasses.
assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) &&
"Subclass not added?");
- assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+ assert(RBGPR.getSize() == 128 && "GPRs should hold up to 128-bit");
// The FPR register bank is fully defined by all the registers in
// GR64all + its subclasses.
@@ -85,7 +87,7 @@
// Check that the TableGen'ed like file is in sync we our expectations.
// First, the Idx.
assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR,
- {PMI_GPR32, PMI_GPR64}) &&
+ {PMI_GPR32, PMI_GPR64, PMI_GPR128}) &&
"PartialMappingIdx's are incorrectly ordered");
assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR,
{PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128,
@@ -102,6 +104,7 @@
CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
+ CHECK_PARTIALMAP(PMI_GPR128, 0, 128, RBGPR);
CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR);
CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR);
CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR);
@@ -122,6 +125,7 @@
CHECK_VALUEMAP(GPR, 32);
CHECK_VALUEMAP(GPR, 64);
+ CHECK_VALUEMAP(GPR, 128);
CHECK_VALUEMAP(FPR, 16);
CHECK_VALUEMAP(FPR, 32);
CHECK_VALUEMAP(FPR, 64);
@@ -140,6 +144,7 @@
CHECK_VALUEMAP_3OPS(GPR, 32);
CHECK_VALUEMAP_3OPS(GPR, 64);
+ CHECK_VALUEMAP_3OPS(GPR, 128);
CHECK_VALUEMAP_3OPS(FPR, 32);
CHECK_VALUEMAP_3OPS(FPR, 64);
CHECK_VALUEMAP_3OPS(FPR, 128);
@@ -265,6 +270,7 @@
case AArch64::rtcGPR64RegClassID:
case AArch64::WSeqPairsClassRegClassID:
case AArch64::XSeqPairsClassRegClassID:
+ case AArch64::MatrixIndexGPR32_12_15RegClassID:
return getRegBank(AArch64::GPRRegBankID);
case AArch64::CCRRegClassID:
return getRegBank(AArch64::CCRegBankID);
@@ -466,11 +472,24 @@
getValueMapping(RBIdx, Size), NumOperands);
}
+/// \returns true if a given intrinsic \p ID only uses and defines FPRs.
+static bool isFPIntrinsic(unsigned ID) {
+ // TODO: Add more intrinsics.
+ switch (ID) {
+ default:
+ return false;
+ case Intrinsic::aarch64_neon_uaddlv:
+ return true;
+ }
+}
+
bool AArch64RegisterBankInfo::hasFPConstraints(const MachineInstr &MI,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
unsigned Depth) const {
unsigned Op = MI.getOpcode();
+ if (Op == TargetOpcode::G_INTRINSIC && isFPIntrinsic(MI.getIntrinsicID()))
+ return true;
// Do we have an explicit floating point instruction?
if (isPreISelGenericFloatingPointOpcode(Op))
@@ -478,7 +497,8 @@
// No. Check if we have a copy-like instruction. If we do, then we could
// still be fed by floating point instructions.
- if (Op != TargetOpcode::COPY && !MI.isPHI())
+ if (Op != TargetOpcode::COPY && !MI.isPHI() &&
+ !isPreISelGenericOptimizationHint(Op))
return false;
// Check if we already know the register bank.
@@ -526,6 +546,8 @@
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
case TargetOpcode::G_INSERT_VECTOR_ELT:
+ case TargetOpcode::G_BUILD_VECTOR:
+ case TargetOpcode::G_BUILD_VECTOR_TRUNC:
return true;
default:
break;
@@ -665,9 +687,12 @@
switch (Opc) {
case AArch64::G_DUP: {
Register ScalarReg = MI.getOperand(1).getReg();
+ LLT ScalarTy = MRI.getType(ScalarReg);
auto ScalarDef = MRI.getVRegDef(ScalarReg);
- if (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
- onlyDefinesFP(*ScalarDef, MRI, TRI))
+ // s8 is an exception for G_DUP, which we always want on gpr.
+ if (ScalarTy.getSizeInBits() != 8 &&
+ (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
+ onlyDefinesFP(*ScalarDef, MRI, TRI)))
OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
else
OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
@@ -698,10 +723,15 @@
break;
OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};
break;
- case TargetOpcode::G_FCMP:
- OpRegBankIdx = {PMI_FirstGPR,
+ case TargetOpcode::G_FCMP: {
+ // If the result is a vector, it must use a FPR.
+ AArch64GenRegisterBankInfo::PartialMappingIdx Idx0 =
+ MRI.getType(MI.getOperand(0).getReg()).isVector() ? PMI_FirstFPR
+ : PMI_FirstGPR;
+ OpRegBankIdx = {Idx0,
/* Predicate */ PMI_None, PMI_FirstFPR, PMI_FirstFPR};
break;
+ }
case TargetOpcode::G_BITCAST:
// This is going to be a cross register bank copy and this is expensive.
if (OpRegBankIdx[0] != OpRegBankIdx[1])
@@ -845,12 +875,16 @@
OpRegBankIdx[3] = PMI_FirstGPR;
break;
case TargetOpcode::G_EXTRACT: {
- // For s128 sources we have to use fpr.
+ // For s128 sources we have to use fpr unless we know otherwise.
+ auto Src = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
- if (SrcTy.getSizeInBits() == 128) {
- OpRegBankIdx[0] = PMI_FirstFPR;
- OpRegBankIdx[1] = PMI_FirstFPR;
- }
+ if (SrcTy.getSizeInBits() != 128)
+ break;
+ auto Idx = MRI.getRegClassOrNull(Src) == &AArch64::XSeqPairsClassRegClass
+ ? PMI_FirstGPR
+ : PMI_FirstFPR;
+ OpRegBankIdx[0] = Idx;
+ OpRegBankIdx[1] = Idx;
break;
}
case TargetOpcode::G_BUILD_VECTOR: {
@@ -876,7 +910,8 @@
}))
break;
if (isPreISelGenericFloatingPointOpcode(DefOpc) ||
- SrcTy.getSizeInBits() < 32) {
+ SrcTy.getSizeInBits() < 32 ||
+ getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank) {
// Have a floating point op.
// Make sure every operand gets mapped to a FPR register class.
unsigned NumOperands = MI.getNumOperands();
@@ -908,6 +943,20 @@
// Assign them FPR for now.
OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR, PMI_FirstFPR};
break;
+ case TargetOpcode::G_INTRINSIC: {
+ // Check if we know that the intrinsic has any constraints on its register
+ // banks. If it does, then update the mapping accordingly.
+ unsigned ID = MI.getIntrinsicID();
+ unsigned Idx = 0;
+ if (!isFPIntrinsic(ID))
+ break;
+ for (const auto &Op : MI.explicit_operands()) {
+ if (Op.isReg())
+ OpRegBankIdx[Idx] = PMI_FirstFPR;
+ ++Idx;
+ }
+ break;
+ }
}
// Finally construct the computed mapping.
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
index 019017b..2d76e48 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
@@ -34,8 +34,9 @@
PMI_FPR512,
PMI_GPR32,
PMI_GPR64,
+ PMI_GPR128,
PMI_FirstGPR = PMI_GPR32,
- PMI_LastGPR = PMI_GPR64,
+ PMI_LastGPR = PMI_GPR128,
PMI_FirstFPR = PMI_FPR16,
PMI_LastFPR = PMI_FPR512,
PMI_Min = PMI_FirstFPR,
@@ -48,16 +49,16 @@
enum ValueMappingIdx {
InvalidIdx = 0,
First3OpsIdx = 1,
- Last3OpsIdx = 22,
+ Last3OpsIdx = 25,
DistanceBetweenRegBanks = 3,
- FirstCrossRegCpyIdx = 25,
- LastCrossRegCpyIdx = 39,
+ FirstCrossRegCpyIdx = 28,
+ LastCrossRegCpyIdx = 42,
DistanceBetweenCrossRegCpy = 2,
- FPExt16To32Idx = 41,
- FPExt16To64Idx = 43,
- FPExt32To64Idx = 45,
- FPExt64To128Idx = 47,
- Shift64Imm = 49
+ FPExt16To32Idx = 44,
+ FPExt16To64Idx = 46,
+ FPExt32To64Idx = 48,
+ FPExt64To128Idx = 50,
+ Shift64Imm = 52,
};
static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx,
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 2cbe831..c3e7475 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -760,17 +760,24 @@
/// Returns true if Imm is valid for CPY/DUP.
template <typename T>
static inline bool isSVECpyImm(int64_t Imm) {
- bool IsImm8 = int8_t(Imm) == Imm;
- bool IsImm16 = int16_t(Imm & ~0xff) == Imm;
+ // Imm is interpreted as a signed value, which means top bits must be all ones
+ // (sign bits if the immediate value is negative and passed in a larger
+ // container), or all zeroes.
+ int64_t Mask = ~int64_t(std::numeric_limits<std::make_unsigned_t<T>>::max());
+ if ((Imm & Mask) != 0 && (Imm & Mask) != Mask)
+ return false;
- if (std::is_same<int8_t, std::make_signed_t<T>>::value ||
- std::is_same<int8_t, T>::value)
- return IsImm8 || uint8_t(Imm) == Imm;
+ // Imm is a signed 8-bit value.
+ // Top bits must be zeroes or sign bits.
+ if (Imm & 0xff)
+ return int8_t(Imm) == T(Imm);
- if (std::is_same<int16_t, std::make_signed_t<T>>::value)
- return IsImm8 || IsImm16 || uint16_t(Imm & ~0xff) == Imm;
+ // Imm is a signed 16-bit value and a multiple of 256.
+ // Top bits must be zeroes or sign bits.
+ if (Imm & 0xff00)
+ return int16_t(Imm) == T(Imm);
- return IsImm8 || IsImm16;
+ return Imm == 0;
}
/// Returns true if Imm is valid for ADD/SUB.
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 75a9f2f..290fe88 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -67,8 +67,7 @@
{"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal},
{"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal},
{"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal},
- {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal},
- {"fixup_aarch64_tlsdesc_call", 0, 0, 0}};
+ {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}};
// Fixup kinds from .reloc directive are like R_AARCH64_NONE. They do not
// require any extra processing.
@@ -95,10 +94,6 @@
const MCSubtargetInfo &STI) const override;
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
- void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
-
- unsigned getPointerSize() const { return 8; }
-
unsigned getFixupKindContainereSizeInBytes(unsigned Kind) const;
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
@@ -113,9 +108,6 @@
default:
llvm_unreachable("Unknown fixup kind!");
- case AArch64::fixup_aarch64_tlsdesc_call:
- return 0;
-
case FK_Data_1:
return 1;
@@ -342,6 +334,10 @@
#define ELF_RELOC(X, Y) .Case(#X, Y)
#include "llvm/BinaryFormat/ELFRelocs/AArch64.def"
#undef ELF_RELOC
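+ // Also accept the generic BFD_RELOC_* names used with .reloc.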
+ .Case("BFD_RELOC_NONE", ELF::R_AARCH64_NONE)
+ .Case("BFD_RELOC_16", ELF::R_AARCH64_ABS16)
+ .Case("BFD_RELOC_32", ELF::R_AARCH64_ABS32)
+ .Case("BFD_RELOC_64", ELF::R_AARCH64_ABS64)
.Default(-1u);
if (Type == -1u)
return None;
@@ -367,7 +363,6 @@
case FK_Data_8:
return 8;
- case AArch64::fixup_aarch64_tlsdesc_call:
case AArch64::fixup_aarch64_movw:
case AArch64::fixup_aarch64_pcrel_branch14:
case AArch64::fixup_aarch64_add_imm12:
@@ -496,13 +491,6 @@
if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21)
return true;
- AArch64MCExpr::VariantKind RefKind =
- static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
- AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
- // LDR GOT relocations need a relocation
- if (Kind == AArch64::fixup_aarch64_ldr_pcrel_imm19 &&
- SymLoc == AArch64MCExpr::VK_GOT)
- return true;
return false;
}
@@ -579,6 +567,7 @@
unsigned StackSize = 0;
uint32_t CompactUnwindEncoding = 0;
+ int CurOffset = 0;
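+ // Track the offset of the most recently recorded register save so we can
+ // check that the saves form adjacent 8-byte stack slots, as required by the
+ // compact unwind encoding.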
for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
const MCCFIInstruction &Inst = Instrs[i];
@@ -608,6 +597,9 @@
assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
"Frame pointer not pushed!");
+ assert(FPPush.getOffset() + 8 == LRPush.getOffset());
+ CurOffset = FPPush.getOffset();
+
unsigned LRReg = *MRI.getLLVMRegNum(LRPush.getRegister(), true);
unsigned FPReg = *MRI.getLLVMRegNum(FPPush.getRegister(), true);
@@ -634,11 +626,19 @@
if (i + 1 == e)
return CU::UNWIND_ARM64_MODE_DWARF;
+ if (CurOffset != 0 && Inst.getOffset() != CurOffset - 8)
+ return CU::UNWIND_ARM64_MODE_DWARF;
+ CurOffset = Inst.getOffset();
+
const MCCFIInstruction &Inst2 = Instrs[++i];
if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
return CU::UNWIND_ARM64_MODE_DWARF;
unsigned Reg2 = *MRI.getLLVMRegNum(Inst2.getRegister(), true);
+ if (Inst2.getOffset() != CurOffset - 8)
+ return CU::UNWIND_ARM64_MODE_DWARF;
+ CurOffset = Inst2.getOffset();
+
// N.B. The encodings must be in register number order, and the X
// registers before the D registers.
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index fcf67bd..2f9c172 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -444,8 +444,6 @@
Ctx.reportError(Fixup.getLoc(),
"invalid fixup for movz/movk instruction");
return ELF::R_AARCH64_NONE;
- case AArch64::fixup_aarch64_tlsdesc_call:
- return R_CLS(TLSDESC_CALL);
default:
Ctx.reportError(Fixup.getLoc(), "Unknown ELF relocation type");
return ELF::R_AARCH64_NONE;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index ec97e1c..f2a4708 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -12,6 +12,8 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64ELFStreamer.h"
+#include "AArch64MCTargetDesc.h"
#include "AArch64TargetStreamer.h"
#include "AArch64WinCOFFStreamer.h"
#include "llvm/ADT/DenseMap.h"
@@ -48,61 +50,61 @@
void emitInst(uint32_t Inst) override;
void emitDirectiveVariantPCS(MCSymbol *Symbol) override {
- OS << "\t.variant_pcs " << Symbol->getName() << "\n";
+ OS << "\t.variant_pcs\t" << Symbol->getName() << "\n";
}
- void EmitARM64WinCFIAllocStack(unsigned Size) override {
- OS << "\t.seh_stackalloc " << Size << "\n";
+ void emitARM64WinCFIAllocStack(unsigned Size) override {
+ OS << "\t.seh_stackalloc\t" << Size << "\n";
}
- void EmitARM64WinCFISaveR19R20X(int Offset) override {
- OS << "\t.seh_save_r19r20_x " << Offset << "\n";
+ void emitARM64WinCFISaveR19R20X(int Offset) override {
+ OS << "\t.seh_save_r19r20_x\t" << Offset << "\n";
}
- void EmitARM64WinCFISaveFPLR(int Offset) override {
- OS << "\t.seh_save_fplr " << Offset << "\n";
+ void emitARM64WinCFISaveFPLR(int Offset) override {
+ OS << "\t.seh_save_fplr\t" << Offset << "\n";
}
- void EmitARM64WinCFISaveFPLRX(int Offset) override {
- OS << "\t.seh_save_fplr_x " << Offset << "\n";
+ void emitARM64WinCFISaveFPLRX(int Offset) override {
+ OS << "\t.seh_save_fplr_x\t" << Offset << "\n";
}
- void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) override {
- OS << "\t.seh_save_reg x" << Reg << ", " << Offset << "\n";
+ void emitARM64WinCFISaveReg(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_reg\tx" << Reg << ", " << Offset << "\n";
}
- void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) override {
- OS << "\t.seh_save_reg_x x" << Reg << ", " << Offset << "\n";
+ void emitARM64WinCFISaveRegX(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_reg_x\tx" << Reg << ", " << Offset << "\n";
}
- void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) override {
- OS << "\t.seh_save_regp x" << Reg << ", " << Offset << "\n";
+ void emitARM64WinCFISaveRegP(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_regp\tx" << Reg << ", " << Offset << "\n";
}
- void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) override {
- OS << "\t.seh_save_regp_x x" << Reg << ", " << Offset << "\n";
+ void emitARM64WinCFISaveRegPX(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_regp_x\tx" << Reg << ", " << Offset << "\n";
}
- void EmitARM64WinCFISaveLRPair(unsigned Reg, int Offset) override {
- OS << "\t.seh_save_lrpair x" << Reg << ", " << Offset << "\n";
+ void emitARM64WinCFISaveLRPair(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_lrpair\tx" << Reg << ", " << Offset << "\n";
}
- void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) override {
- OS << "\t.seh_save_freg d" << Reg << ", " << Offset << "\n";
+ void emitARM64WinCFISaveFReg(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_freg\td" << Reg << ", " << Offset << "\n";
}
- void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) override {
- OS << "\t.seh_save_freg_x d" << Reg << ", " << Offset << "\n";
+ void emitARM64WinCFISaveFRegX(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_freg_x\td" << Reg << ", " << Offset << "\n";
}
- void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) override {
- OS << "\t.seh_save_fregp d" << Reg << ", " << Offset << "\n";
+ void emitARM64WinCFISaveFRegP(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_fregp\td" << Reg << ", " << Offset << "\n";
}
- void EmitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) override {
- OS << "\t.seh_save_fregp_x d" << Reg << ", " << Offset << "\n";
+ void emitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_fregp_x\td" << Reg << ", " << Offset << "\n";
}
- void EmitARM64WinCFISetFP() override { OS << "\t.seh_set_fp\n"; }
- void EmitARM64WinCFIAddFP(unsigned Size) override {
- OS << "\t.seh_add_fp " << Size << "\n";
+ void emitARM64WinCFISetFP() override { OS << "\t.seh_set_fp\n"; }
+ void emitARM64WinCFIAddFP(unsigned Size) override {
+ OS << "\t.seh_add_fp\t" << Size << "\n";
}
- void EmitARM64WinCFINop() override { OS << "\t.seh_nop\n"; }
- void EmitARM64WinCFISaveNext() override { OS << "\t.seh_save_next\n"; }
- void EmitARM64WinCFIPrologEnd() override { OS << "\t.seh_endprologue\n"; }
- void EmitARM64WinCFIEpilogStart() override { OS << "\t.seh_startepilogue\n"; }
- void EmitARM64WinCFIEpilogEnd() override { OS << "\t.seh_endepilogue\n"; }
- void EmitARM64WinCFITrapFrame() override { OS << "\t.seh_trap_frame\n"; }
- void EmitARM64WinCFIMachineFrame() override { OS << "\t.seh_pushframe\n"; }
- void EmitARM64WinCFIContext() override { OS << "\t.seh_context\n"; }
- void EmitARM64WinCFIClearUnwoundToCall() override {
+ void emitARM64WinCFINop() override { OS << "\t.seh_nop\n"; }
+ void emitARM64WinCFISaveNext() override { OS << "\t.seh_save_next\n"; }
+ void emitARM64WinCFIPrologEnd() override { OS << "\t.seh_endprologue\n"; }
+ void emitARM64WinCFIEpilogStart() override { OS << "\t.seh_startepilogue\n"; }
+ void emitARM64WinCFIEpilogEnd() override { OS << "\t.seh_endepilogue\n"; }
+ void emitARM64WinCFITrapFrame() override { OS << "\t.seh_trap_frame\n"; }
+ void emitARM64WinCFIMachineFrame() override { OS << "\t.seh_pushframe\n"; }
+ void emitARM64WinCFIContext() override { OS << "\t.seh_context\n"; }
+ void emitARM64WinCFIClearUnwoundToCall() override {
OS << "\t.seh_clear_unwound_to_call\n";
}
@@ -163,7 +165,7 @@
/// necessary.
void emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) override {
- EmitA64MappingSymbol();
+ emitA64MappingSymbol();
MCELFStreamer::emitInstruction(Inst, STI);
}
@@ -180,7 +182,7 @@
Inst >>= 8;
}
- EmitA64MappingSymbol();
+ emitA64MappingSymbol();
MCELFStreamer::emitBytes(StringRef(Buffer, 4));
}
@@ -215,18 +217,18 @@
void emitDataMappingSymbol() {
if (LastEMS == EMS_Data)
return;
- EmitMappingSymbol("$d");
+ emitMappingSymbol("$d");
LastEMS = EMS_Data;
}
- void EmitA64MappingSymbol() {
+ void emitA64MappingSymbol() {
if (LastEMS == EMS_A64)
return;
- EmitMappingSymbol("$x");
+ emitMappingSymbol("$x");
LastEMS = EMS_A64;
}
- void EmitMappingSymbol(StringRef Name) {
+ void emitMappingSymbol(StringRef Name) {
auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
Name + "." + Twine(MappingSymbolCounter++)));
emitLabel(Symbol);
@@ -243,8 +245,6 @@
} // end anonymous namespace
-namespace llvm {
-
AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
return static_cast<AArch64ELFStreamer &>(Streamer);
}
@@ -257,23 +257,20 @@
cast<MCSymbolELF>(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS);
}
-MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
- formatted_raw_ostream &OS,
- MCInstPrinter *InstPrint,
- bool isVerboseAsm) {
+MCTargetStreamer *
+llvm::createAArch64AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm) {
return new AArch64TargetAsmStreamer(S, OS);
}
-MCELFStreamer *createAArch64ELFStreamer(MCContext &Context,
- std::unique_ptr<MCAsmBackend> TAB,
- std::unique_ptr<MCObjectWriter> OW,
- std::unique_ptr<MCCodeEmitter> Emitter,
- bool RelaxAll) {
+MCELFStreamer *llvm::createAArch64ELFStreamer(
+ MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
+ bool RelaxAll) {
AArch64ELFStreamer *S = new AArch64ELFStreamer(
Context, std::move(TAB), std::move(OW), std::move(Emitter));
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
return S;
}
-
-} // end namespace llvm
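As context for the Emit-to-emit renames in this streamer, the mapping-symbol logic itself is small: a "$x" or "$d" label is only emitted when the stream switches between code and data, tracked by LastEMS. A minimal standalone sketch of that state machine (plain C++, with printf standing in for symbol creation and emitLabel; not LLVM code) could look like:

#include <cstdio>
#include <string>

enum ElfMappingSymbol { EMS_None, EMS_A64, EMS_Data };

struct MappingState {
  ElfMappingSymbol LastEMS = EMS_None;
  unsigned Counter = 0;

  void emitSymbol(const std::string &Name) {
    // Stands in for getOrCreateSymbol(Name + "." + counter) plus emitLabel().
    std::printf("%s.%u:\n", Name.c_str(), Counter++);
  }
  void beforeInstruction() {          // mirrors emitA64MappingSymbol()
    if (LastEMS == EMS_A64) return;   // already in a code region, no new label
    emitSymbol("$x");
    LastEMS = EMS_A64;
  }
  void beforeData() {                 // mirrors emitDataMappingSymbol()
    if (LastEMS == EMS_Data) return;
    emitSymbol("$d");
    LastEMS = EMS_Data;
  }
};

int main() {
  MappingState S;
  S.beforeInstruction(); // prints $x.0
  S.beforeInstruction(); // suppressed, still in a code region
  S.beforeData();        // prints $d.1
  S.beforeInstruction(); // prints $x.2
}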
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
index fe8043f..767dd88 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -55,9 +55,6 @@
// branch26 only on ELF.
fixup_aarch64_pcrel_call26,
- // zero-space placeholder for the ELF R_AARCH64_TLSDESC_CALL relocation.
- fixup_aarch64_tlsdesc_call,
-
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 340120d..cd1bfed 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -51,6 +51,14 @@
const MCRegisterInfo &MRI)
: AArch64InstPrinter(MAI, MII, MRI) {}
+bool AArch64InstPrinter::applyTargetSpecificCLOption(StringRef Opt) {
+ if (Opt == "no-aliases") {
+ PrintAliases = false;
+ return true;
+ }
+ return false;
+}
+
void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
// This is for .cfi directives.
OS << getRegisterName(RegNo);
@@ -296,7 +304,7 @@
return;
}
- if (!printAliasInstr(MI, Address, STI, O))
+ if (!PrintAliases || !printAliasInstr(MI, Address, STI, O))
printInstruction(MI, Address, STI, O);
printAnnotation(O, Annot);
@@ -872,6 +880,70 @@
return true;
}
+template <int EltSize>
+void AArch64InstPrinter::printMatrix(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &RegOp = MI->getOperand(OpNum);
+ assert(RegOp.isReg() && "Unexpected operand type!");
+
+ O << getRegisterName(RegOp.getReg());
+ switch (EltSize) {
+ case 0:
+ break;
+ case 8:
+ O << ".b";
+ break;
+ case 16:
+ O << ".h";
+ break;
+ case 32:
+ O << ".s";
+ break;
+ case 64:
+ O << ".d";
+ break;
+ case 128:
+ O << ".q";
+ break;
+ default:
+ llvm_unreachable("Unsupported element size");
+ }
+}
+
+template <bool IsVertical>
+void AArch64InstPrinter::printMatrixTileVector(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &RegOp = MI->getOperand(OpNum);
+ assert(RegOp.isReg() && "Unexpected operand type!");
+ StringRef RegName = getRegisterName(RegOp.getReg());
+
+ // Insert the horizontal/vertical flag before the suffix.
+ StringRef Base, Suffix;
+ std::tie(Base, Suffix) = RegName.split('.');
+ O << Base << (IsVertical ? "v" : "h") << '.' << Suffix;
+}
+
+void AArch64InstPrinter::printMatrixTile(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &RegOp = MI->getOperand(OpNum);
+ assert(RegOp.isReg() && "Unexpected operand type!");
+ O << getRegisterName(RegOp.getReg());
+}
+
+void AArch64InstPrinter::printSVCROp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ assert(MO.isImm() && "Unexpected operand type!");
+ unsigned svcrop = MO.getImm();
+ const auto *SVCR = AArch64SVCR::lookupSVCRByEncoding(svcrop);
+ assert(SVCR && "Unexpected SVCR operand!");
+ O << SVCR->Name;
+}
+
void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1152,7 +1224,7 @@
void AArch64InstPrinter::printBTIHintOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- unsigned btihintop = (MI->getOperand(OpNum).getImm() ^ 32) >> 1;
+ unsigned btihintop = MI->getOperand(OpNum).getImm() ^ 32;
auto BTI = AArch64BTIHint::lookupBTIByEncoding(btihintop);
if (BTI)
O << BTI->Name;
@@ -1164,8 +1236,8 @@
const MCSubtargetInfo &STI,
raw_ostream &O) {
const MCOperand &MO = MI->getOperand(OpNum);
- float FPImm =
- MO.isFPImm() ? MO.getFPImm() : AArch64_AM::getFPImmFloat(MO.getImm());
+ float FPImm = MO.isDFPImm() ? bit_cast<double>(MO.getDFPImm())
+ : AArch64_AM::getFPImmFloat(MO.getImm());
// 8 decimal places are enough to perfectly represent permitted floats.
O << format("#%.8f", FPImm);
@@ -1268,6 +1340,36 @@
O << getRegisterName(Even) << ", " << getRegisterName(Odd);
}
+static const unsigned MatrixZADRegisterTable[] = {
+ AArch64::ZAD0, AArch64::ZAD1, AArch64::ZAD2, AArch64::ZAD3,
+ AArch64::ZAD4, AArch64::ZAD5, AArch64::ZAD6, AArch64::ZAD7
+};
+
+void AArch64InstPrinter::printMatrixTileList(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned MaxRegs = 8;
+ unsigned RegMask = MI->getOperand(OpNum).getImm();
+
+ unsigned NumRegs = 0;
+ for (unsigned I = 0; I < MaxRegs; ++I)
+ if ((RegMask & (1 << I)) != 0)
+ ++NumRegs;
+
+ O << "{";
+ unsigned Printed = 0;
+ for (unsigned I = 0; I < MaxRegs; ++I) {
+ unsigned Reg = RegMask & (1 << I);
+ if (Reg == 0)
+ continue;
+ O << getRegisterName(MatrixZADRegisterTable[I]);
+ if (Printed + 1 != NumRegs)
+ O << ", ";
+ ++Printed;
+ }
+ O << "}";
+}
+
void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O,
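The new printMatrixTileList walks an 8-bit immediate, one bit per ZAD0..ZAD7 tile, and prints the set tiles as a brace-enclosed, comma-separated list. A standalone sketch of that decode, using literal name strings where the real code goes through MatrixZADRegisterTable and getRegisterName, might be:

#include <cstdio>

// Prints the ZA tile list encoded by an 8-bit mask, one bit per ZAD0..ZAD7.
static void printTileList(unsigned RegMask) {
  static const char *ZAD[] = {"za0.d", "za1.d", "za2.d", "za3.d",
                              "za4.d", "za5.d", "za6.d", "za7.d"};
  std::printf("{");
  bool First = true;
  for (unsigned I = 0; I < 8; ++I) {
    if (!(RegMask & (1u << I)))
      continue;
    std::printf("%s%s", First ? "" : ", ", ZAD[I]);
    First = false;
  }
  std::printf("}\n");
}

int main() {
  printTileList(0x03); // {za0.d, za1.d}
  printTileList(0xFF); // all eight tiles
}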
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
index 4be885e..9ec74a1 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
@@ -25,6 +25,8 @@
AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI);
+ bool applyTargetSpecificCLOption(StringRef Opt) override;
+
void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
const MCSubtargetInfo &STI, raw_ostream &O) override;
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
@@ -144,6 +146,9 @@
const MCSubtargetInfo &STI, raw_ostream &O,
StringRef LayoutSuffix);
+ void printMatrixTileList(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
/// Print a list of vector registers where the type suffix is implicit
/// (i.e. attached to the instruction rather than the registers).
void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum,
@@ -185,6 +190,17 @@
const MCSubtargetInfo &STI, raw_ostream &O);
void printSVEPattern(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+
+ template <bool IsVertical>
+ void printMatrixTileVector(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMatrixTile(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <int EltSize>
+ void printMatrix(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printSVCROp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+ raw_ostream &O);
template <char = 0>
void printSVERegOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index da8f511..ad97071 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -186,6 +186,13 @@
unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue,
const MCSubtargetInfo &STI) const;
+ uint32_t EncodeMatrixTileListRegisterClass(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t encodeMatrixIndexGPR32(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
private:
FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
void
@@ -516,6 +523,24 @@
return MO.getImm() - 8;
}
+uint32_t AArch64MCCodeEmitter::EncodeMatrixTileListRegisterClass(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned RegMask = MI.getOperand(OpIdx).getImm();
+ assert(RegMask <= 0xFF && "Invalid register mask!");
+ return RegMask;
+}
+
+uint32_t
+AArch64MCCodeEmitter::encodeMatrixIndexGPR32(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ auto RegOpnd = MI.getOperand(OpIdx).getReg();
+ assert(RegOpnd >= AArch64::W12 && RegOpnd <= AArch64::W15 &&
+ "Expected register in the range w12-w15!");
+ return RegOpnd - AArch64::W12;
+}
+
uint32_t
AArch64MCCodeEmitter::getImm8OptLsl(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
@@ -599,8 +624,12 @@
// This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
// following (BLR) instruction. It doesn't emit any code itself so it
// doesn't go through the normal TableGenerated channels.
- MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call);
- Fixups.push_back(MCFixup::create(0, MI.getOperand(0).getExpr(), Fixup));
+ auto Reloc = STI.getTargetTriple().getEnvironment() == Triple::GNUILP32
+ ? ELF::R_AARCH64_P32_TLSDESC_CALL
+ : ELF::R_AARCH64_TLSDESC_CALL;
+ Fixups.push_back(
+ MCFixup::create(0, MI.getOperand(0).getExpr(),
+ MCFixupKind(FirstLiteralRelocationKind + Reloc)));
return;
}
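With fixup_aarch64_tlsdesc_call removed from AArch64FixupKinds.h, the emitter above now attaches the ELF relocation directly (via FirstLiteralRelocationKind), choosing the ILP32 variant when the target environment is GNUILP32. A minimal sketch of just that selection, with the relocation represented by its name rather than a real MCFixup, could be:

#include <cstdio>

// The TLSDESCCALL pseudo emits no bytes, only a relocation on the following
// BLR; the relocation type depends on whether the target is ILP32.
static const char *tlsdescCallReloc(bool IsGnuIlp32) {
  return IsGnuIlp32 ? "R_AARCH64_P32_TLSDESC_CALL" : "R_AARCH64_TLSDESC_CALL";
}

int main() {
  std::printf("%s\n", tlsdescCallReloc(false)); // R_AARCH64_TLSDESC_CALL
  std::printf("%s\n", tlsdescCallReloc(true));  // R_AARCH64_P32_TLSDESC_CALL
}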
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index f32a8f1..557603c 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -107,10 +107,9 @@
getStreamer().emitBytes(StringRef(Buffer, 4));
}
-namespace llvm {
-
MCTargetStreamer *
-createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+llvm::createAArch64ObjectTargetStreamer(MCStreamer &S,
+ const MCSubtargetInfo &STI) {
const Triple &TT = STI.getTargetTriple();
if (TT.isOSBinFormatELF())
return new AArch64TargetELFStreamer(S);
@@ -118,5 +117,3 @@
return new AArch64TargetWinCOFFStreamer(S);
return nullptr;
}
-
-} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index 73dc1e5..9b03077 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -42,30 +42,30 @@
/// Callback used to implement the .variant_pcs directive.
virtual void emitDirectiveVariantPCS(MCSymbol *Symbol) {};
- virtual void EmitARM64WinCFIAllocStack(unsigned Size) {}
- virtual void EmitARM64WinCFISaveR19R20X(int Offset) {}
- virtual void EmitARM64WinCFISaveFPLR(int Offset) {}
- virtual void EmitARM64WinCFISaveFPLRX(int Offset) {}
- virtual void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) {}
- virtual void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) {}
- virtual void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) {}
- virtual void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) {}
- virtual void EmitARM64WinCFISaveLRPair(unsigned Reg, int Offset) {}
- virtual void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) {}
- virtual void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) {}
- virtual void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) {}
- virtual void EmitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) {}
- virtual void EmitARM64WinCFISetFP() {}
- virtual void EmitARM64WinCFIAddFP(unsigned Size) {}
- virtual void EmitARM64WinCFINop() {}
- virtual void EmitARM64WinCFISaveNext() {}
- virtual void EmitARM64WinCFIPrologEnd() {}
- virtual void EmitARM64WinCFIEpilogStart() {}
- virtual void EmitARM64WinCFIEpilogEnd() {}
- virtual void EmitARM64WinCFITrapFrame() {}
- virtual void EmitARM64WinCFIMachineFrame() {}
- virtual void EmitARM64WinCFIContext() {}
- virtual void EmitARM64WinCFIClearUnwoundToCall() {}
+ virtual void emitARM64WinCFIAllocStack(unsigned Size) {}
+ virtual void emitARM64WinCFISaveR19R20X(int Offset) {}
+ virtual void emitARM64WinCFISaveFPLR(int Offset) {}
+ virtual void emitARM64WinCFISaveFPLRX(int Offset) {}
+ virtual void emitARM64WinCFISaveReg(unsigned Reg, int Offset) {}
+ virtual void emitARM64WinCFISaveRegX(unsigned Reg, int Offset) {}
+ virtual void emitARM64WinCFISaveRegP(unsigned Reg, int Offset) {}
+ virtual void emitARM64WinCFISaveRegPX(unsigned Reg, int Offset) {}
+ virtual void emitARM64WinCFISaveLRPair(unsigned Reg, int Offset) {}
+ virtual void emitARM64WinCFISaveFReg(unsigned Reg, int Offset) {}
+ virtual void emitARM64WinCFISaveFRegX(unsigned Reg, int Offset) {}
+ virtual void emitARM64WinCFISaveFRegP(unsigned Reg, int Offset) {}
+ virtual void emitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) {}
+ virtual void emitARM64WinCFISetFP() {}
+ virtual void emitARM64WinCFIAddFP(unsigned Size) {}
+ virtual void emitARM64WinCFINop() {}
+ virtual void emitARM64WinCFISaveNext() {}
+ virtual void emitARM64WinCFIPrologEnd() {}
+ virtual void emitARM64WinCFIEpilogStart() {}
+ virtual void emitARM64WinCFIEpilogEnd() {}
+ virtual void emitARM64WinCFITrapFrame() {}
+ virtual void emitARM64WinCFIMachineFrame() {}
+ virtual void emitARM64WinCFIContext() {}
+ virtual void emitARM64WinCFIClearUnwoundToCall() {}
private:
std::unique_ptr<AssemblerConstantPools> ConstantPools;
@@ -95,33 +95,33 @@
// The unwind codes on ARM64 Windows are documented at
// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
- void EmitARM64WinCFIAllocStack(unsigned Size) override;
- void EmitARM64WinCFISaveR19R20X(int Offset) override;
- void EmitARM64WinCFISaveFPLR(int Offset) override;
- void EmitARM64WinCFISaveFPLRX(int Offset) override;
- void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) override;
- void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) override;
- void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) override;
- void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) override;
- void EmitARM64WinCFISaveLRPair(unsigned Reg, int Offset) override;
- void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) override;
- void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) override;
- void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) override;
- void EmitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) override;
- void EmitARM64WinCFISetFP() override;
- void EmitARM64WinCFIAddFP(unsigned Size) override;
- void EmitARM64WinCFINop() override;
- void EmitARM64WinCFISaveNext() override;
- void EmitARM64WinCFIPrologEnd() override;
- void EmitARM64WinCFIEpilogStart() override;
- void EmitARM64WinCFIEpilogEnd() override;
- void EmitARM64WinCFITrapFrame() override;
- void EmitARM64WinCFIMachineFrame() override;
- void EmitARM64WinCFIContext() override;
- void EmitARM64WinCFIClearUnwoundToCall() override;
+ void emitARM64WinCFIAllocStack(unsigned Size) override;
+ void emitARM64WinCFISaveR19R20X(int Offset) override;
+ void emitARM64WinCFISaveFPLR(int Offset) override;
+ void emitARM64WinCFISaveFPLRX(int Offset) override;
+ void emitARM64WinCFISaveReg(unsigned Reg, int Offset) override;
+ void emitARM64WinCFISaveRegX(unsigned Reg, int Offset) override;
+ void emitARM64WinCFISaveRegP(unsigned Reg, int Offset) override;
+ void emitARM64WinCFISaveRegPX(unsigned Reg, int Offset) override;
+ void emitARM64WinCFISaveLRPair(unsigned Reg, int Offset) override;
+ void emitARM64WinCFISaveFReg(unsigned Reg, int Offset) override;
+ void emitARM64WinCFISaveFRegX(unsigned Reg, int Offset) override;
+ void emitARM64WinCFISaveFRegP(unsigned Reg, int Offset) override;
+ void emitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) override;
+ void emitARM64WinCFISetFP() override;
+ void emitARM64WinCFIAddFP(unsigned Size) override;
+ void emitARM64WinCFINop() override;
+ void emitARM64WinCFISaveNext() override;
+ void emitARM64WinCFIPrologEnd() override;
+ void emitARM64WinCFIEpilogStart() override;
+ void emitARM64WinCFIEpilogEnd() override;
+ void emitARM64WinCFITrapFrame() override;
+ void emitARM64WinCFIMachineFrame() override;
+ void emitARM64WinCFIContext() override;
+ void emitARM64WinCFIClearUnwoundToCall() override;
private:
- void EmitARM64WinUnwindCode(unsigned UnwindCode, int Reg, int Offset);
+ void emitARM64WinUnwindCode(unsigned UnwindCode, int Reg, int Offset);
};
MCTargetStreamer *
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index aaadc8d..0072af4 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -6,6 +6,7 @@
//
//===---------------------------------------------------------------------===//
+#include "AArch64MCTargetDesc.h"
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "llvm/ADT/Twine.h"
@@ -45,6 +46,19 @@
unsigned AArch64WinCOFFObjectWriter::getRelocType(
MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup,
bool IsCrossSection, const MCAsmBackend &MAB) const {
+ unsigned FixupKind = Fixup.getKind();
+ if (IsCrossSection) {
+ // IMAGE_REL_ARM64_REL64 does not exist. We treat FK_Data_8 as FK_PCRel_4 so
+ // that .xword a-b can lower to IMAGE_REL_ARM64_REL32. This allows generic
+ // instrumentation to not bother with the COFF limitation. A negative value
+ // needs attention.
+ if (FixupKind != FK_Data_4 && FixupKind != FK_Data_8) {
+ Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression");
+ return COFF::IMAGE_REL_ARM64_ADDR32;
+ }
+ FixupKind = FK_PCRel_4;
+ }
+
auto Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None
: Target.getSymA()->getKind();
const MCExpr *Expr = Fixup.getValue();
@@ -64,7 +78,7 @@
}
}
- switch (static_cast<unsigned>(Fixup.getKind())) {
+ switch (FixupKind) {
default: {
if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
Ctx.reportError(Fixup.getLoc(), "relocation type " +
@@ -78,6 +92,9 @@
return COFF::IMAGE_REL_ARM64_ABSOLUTE; // Dummy return value
}
+ case FK_PCRel_4:
+ return COFF::IMAGE_REL_ARM64_REL32;
+
case FK_Data_4:
switch (Modifier) {
default:
@@ -141,10 +158,6 @@
return true;
}
-namespace llvm {
-
-std::unique_ptr<MCObjectTargetWriter> createAArch64WinCOFFObjectWriter() {
+std::unique_ptr<MCObjectTargetWriter> llvm::createAArch64WinCOFFObjectWriter() {
return std::make_unique<AArch64WinCOFFObjectWriter>();
}
-
-} // end namespace llvm
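The cross-section path added to getRelocType treats FK_Data_4 and FK_Data_8 as a 32-bit PC-relative fixup so they can lower to IMAGE_REL_ARM64_REL32, and rejects every other fixup kind. A standalone sketch of that decision, with strings standing in for the MCContext error report and the COFF relocation enum, might look like:

#include <cstdio>

enum FixupKind { FK_Data_4, FK_Data_8, FK_PCRel_4, FK_Other };

// Mirrors the IsCrossSection branch: only 4- and 8-byte data fixups are
// representable, and both are downgraded to a 32-bit PC-relative relocation.
static const char *classifyCrossSection(FixupKind &Kind) {
  if (Kind != FK_Data_4 && Kind != FK_Data_8)
    return "error: Cannot represent this expression";
  Kind = FK_PCRel_4; // later mapped to IMAGE_REL_ARM64_REL32
  return "IMAGE_REL_ARM64_REL32";
}

int main() {
  FixupKind K = FK_Data_8;
  std::printf("%s\n", classifyCrossSection(K)); // IMAGE_REL_ARM64_REL32
  FixupKind Bad = FK_Other;
  std::printf("%s\n", classifyCrossSection(Bad)); // error case
}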
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index 1c50706..b688165 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -59,17 +59,14 @@
}
} // end anonymous namespace
-namespace llvm {
-
// Helper function to common out unwind code setup for those codes that can
// belong to both prolog and epilog.
// There are three types of Windows ARM64 SEH codes. They can
// 1) take no operands: SEH_Nop, SEH_PrologEnd, SEH_EpilogStart, SEH_EpilogEnd
// 2) take an offset: SEH_StackAlloc, SEH_SaveFPLR, SEH_SaveFPLR_X
// 3) take a register and an offset/size: all others
-void AArch64TargetWinCOFFStreamer::EmitARM64WinUnwindCode(unsigned UnwindCode,
- int Reg,
- int Offset) {
+void AArch64TargetWinCOFFStreamer::emitARM64WinUnwindCode(unsigned UnwindCode,
+ int Reg, int Offset) {
auto &S = getStreamer();
WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
if (!CurFrame)
@@ -82,96 +79,96 @@
CurFrame->Instructions.push_back(Inst);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIAllocStack(unsigned Size) {
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFIAllocStack(unsigned Size) {
unsigned Op = Win64EH::UOP_AllocSmall;
if (Size >= 16384)
Op = Win64EH::UOP_AllocLarge;
else if (Size >= 512)
Op = Win64EH::UOP_AllocMedium;
- EmitARM64WinUnwindCode(Op, -1, Size);
+ emitARM64WinUnwindCode(Op, -1, Size);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveR19R20X(int Offset) {
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveR19R20X, -1, Offset);
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveR19R20X(int Offset) {
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveR19R20X, -1, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFPLR(int Offset) {
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveFPLR, -1, Offset);
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveFPLR(int Offset) {
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveFPLR, -1, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFPLRX(int Offset) {
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveFPLRX, -1, Offset);
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveFPLRX(int Offset) {
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveFPLRX, -1, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveReg(unsigned Reg,
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveReg(unsigned Reg,
int Offset) {
assert(Offset >= 0 && Offset <= 504 &&
"Offset for save reg should be >= 0 && <= 504");
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveReg, Reg, Offset);
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveReg, Reg, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveRegX(unsigned Reg,
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveRegX(unsigned Reg,
int Offset) {
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveRegX, Reg, Offset);
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveRegX, Reg, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveRegP(unsigned Reg,
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveRegP(unsigned Reg,
int Offset) {
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveRegP, Reg, Offset);
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveRegP, Reg, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveRegPX(unsigned Reg,
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveRegPX(unsigned Reg,
int Offset) {
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveRegPX, Reg, Offset);
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveRegPX, Reg, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveLRPair(unsigned Reg,
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveLRPair(unsigned Reg,
int Offset) {
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveLRPair, Reg, Offset);
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveLRPair, Reg, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFReg(unsigned Reg,
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveFReg(unsigned Reg,
int Offset) {
assert(Offset >= 0 && Offset <= 504 &&
"Offset for save reg should be >= 0 && <= 504");
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveFReg, Reg, Offset);
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveFReg, Reg, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFRegX(unsigned Reg,
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveFRegX(unsigned Reg,
int Offset) {
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveFRegX, Reg, Offset);
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveFRegX, Reg, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFRegP(unsigned Reg,
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveFRegP(unsigned Reg,
int Offset) {
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveFRegP, Reg, Offset);
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveFRegP, Reg, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFRegPX(unsigned Reg,
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveFRegPX(unsigned Reg,
int Offset) {
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveFRegPX, Reg, Offset);
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveFRegPX, Reg, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISetFP() {
- EmitARM64WinUnwindCode(Win64EH::UOP_SetFP, -1, 0);
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISetFP() {
+ emitARM64WinUnwindCode(Win64EH::UOP_SetFP, -1, 0);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIAddFP(unsigned Offset) {
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFIAddFP(unsigned Offset) {
assert(Offset <= 2040 && "UOP_AddFP must have offset <= 2040");
- EmitARM64WinUnwindCode(Win64EH::UOP_AddFP, -1, Offset);
+ emitARM64WinUnwindCode(Win64EH::UOP_AddFP, -1, Offset);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFINop() {
- EmitARM64WinUnwindCode(Win64EH::UOP_Nop, -1, 0);
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFINop() {
+ emitARM64WinUnwindCode(Win64EH::UOP_Nop, -1, 0);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveNext() {
- EmitARM64WinUnwindCode(Win64EH::UOP_SaveNext, -1, 0);
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveNext() {
+ emitARM64WinUnwindCode(Win64EH::UOP_SaveNext, -1, 0);
}
// The functions below handle opcodes that can end up in either a prolog or
// an epilog, but not both.
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIPrologEnd() {
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFIPrologEnd() {
auto &S = getStreamer();
WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
if (!CurFrame)
@@ -184,7 +181,7 @@
CurFrame->Instructions.insert(it, Inst);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogStart() {
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFIEpilogStart() {
auto &S = getStreamer();
WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
if (!CurFrame)
@@ -194,7 +191,7 @@
CurrentEpilog = S.emitCFILabel();
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() {
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFIEpilogEnd() {
auto &S = getStreamer();
WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
if (!CurFrame)
@@ -207,23 +204,23 @@
CurrentEpilog = nullptr;
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFITrapFrame() {
- EmitARM64WinUnwindCode(Win64EH::UOP_TrapFrame, -1, 0);
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFITrapFrame() {
+ emitARM64WinUnwindCode(Win64EH::UOP_TrapFrame, -1, 0);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIMachineFrame() {
- EmitARM64WinUnwindCode(Win64EH::UOP_PushMachFrame, -1, 0);
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFIMachineFrame() {
+ emitARM64WinUnwindCode(Win64EH::UOP_PushMachFrame, -1, 0);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIContext() {
- EmitARM64WinUnwindCode(Win64EH::UOP_Context, -1, 0);
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFIContext() {
+ emitARM64WinUnwindCode(Win64EH::UOP_Context, -1, 0);
}
-void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIClearUnwoundToCall() {
- EmitARM64WinUnwindCode(Win64EH::UOP_ClearUnwoundToCall, -1, 0);
+void AArch64TargetWinCOFFStreamer::emitARM64WinCFIClearUnwoundToCall() {
+ emitARM64WinUnwindCode(Win64EH::UOP_ClearUnwoundToCall, -1, 0);
}
-MCWinCOFFStreamer *createAArch64WinCOFFStreamer(
+MCWinCOFFStreamer *llvm::createAArch64WinCOFFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll, bool IncrementalLinkerCompatible) {
@@ -232,5 +229,3 @@
S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
return S;
}
-
-} // end llvm namespace
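emitARM64WinCFIAllocStack picks the SEH unwind opcode purely from the allocation size, with thresholds at 512 and 16384 bytes. A minimal sketch of that bucketing, detached from the streamer plumbing:

#include <cstdio>

enum AllocOp { AllocSmall, AllocMedium, AllocLarge };

// Same size classification as emitARM64WinCFIAllocStack above.
static AllocOp classifyAlloc(unsigned Size) {
  if (Size >= 16384)
    return AllocLarge;
  if (Size >= 512)
    return AllocMedium;
  return AllocSmall;
}

int main() {
  std::printf("%d %d %d\n", classifyAlloc(256),    // AllocSmall
                            classifyAlloc(4096),   // AllocMedium
                            classifyAlloc(65536)); // AllocLarge
}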
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/SMEInstrFormats.td b/src/llvm-project/llvm/lib/Target/AArch64/SMEInstrFormats.td
new file mode 100644
index 0000000..6208916
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -0,0 +1,792 @@
+//=-- SMEInstrFormats.td - AArch64 SME Instruction classes -*- tablegen -*--=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Scalable Matrix Extension (SME) Instruction Class Definitions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SME Outer Products
+//===----------------------------------------------------------------------===//
+
+class sme_fp_outer_product_inst<bit S, bit sz, MatrixTileOperand za_ty,
+ ZPRRegOp zpr_ty, string mnemonic>
+ : I<(outs za_ty:$ZAda),
+ (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm),
+ mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm",
+ "", []>,
+ Sched<[]> {
+ bits<5> Zm;
+ bits<3> Pm;
+ bits<3> Pn;
+ bits<5> Zn;
+ let Inst{31-23} = 0b100000001;
+ let Inst{22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = Pm;
+ let Inst{12-10} = Pn;
+ let Inst{9-5} = Zn;
+ let Inst{4} = S;
+ let Inst{3} = 0b0;
+}
+
+class sme_outer_product_fp32<bit S, string mnemonic>
+ : sme_fp_outer_product_inst<S, 0b0, TileOp32, ZPR32, mnemonic> {
+ bits<2> ZAda;
+ let Inst{1-0} = ZAda;
+ let Inst{2} = 0b0;
+}
+
+class sme_outer_product_fp64<bit S, string mnemonic>
+ : sme_fp_outer_product_inst<S, 0b1, TileOp64, ZPR64, mnemonic> {
+ bits<3> ZAda;
+ let Inst{2-0} = ZAda;
+}
+
+class sme_int_outer_product_inst<bit u0, bit u1, bit S, bit sz,
+ MatrixTileOperand za_ty, ZPRRegOp zpr_ty,
+ string mnemonic>
+ : I<(outs za_ty:$ZAda),
+ (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm),
+ mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm",
+ "", []>,
+ Sched<[]> {
+ bits<5> Zm;
+ bits<3> Pm;
+ bits<3> Pn;
+ bits<5> Zn;
+ let Inst{31-25} = 0b1010000;
+ let Inst{24} = u0;
+ let Inst{23} = 0b1;
+ let Inst{22} = sz;
+ let Inst{21} = u1;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = Pm;
+ let Inst{12-10} = Pn;
+ let Inst{9-5} = Zn;
+ let Inst{4} = S;
+ let Inst{3} = 0b0;
+}
+
+class sme_int_outer_product_i32<bits<3> opc, string mnemonic>
+ : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b0, TileOp32, ZPR8,
+ mnemonic> {
+ bits<2> ZAda;
+ let Inst{1-0} = ZAda;
+ let Inst{2} = 0b0;
+}
+
+class sme_int_outer_product_i64<bits<3> opc, string mnemonic>
+ : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b1, TileOp64, ZPR16,
+ mnemonic> {
+ bits<3> ZAda;
+ let Inst{2-0} = ZAda;
+}
+
+class sme_outer_product_widening_inst<bit op, bit S, string mnemonic>
+ : I<(outs TileOp32:$ZAda),
+ (ins PPR3bAny:$Pn, PPR3bAny:$Pm, ZPR16:$Zn, ZPR16:$Zm),
+ mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm",
+ "", []>,
+ Sched<[]> {
+ bits<5> Zm;
+ bits<3> Pm;
+ bits<3> Pn;
+ bits<5> Zn;
+ bits<2> ZAda;
+ let Inst{31-22} = 0b1000000110;
+ let Inst{21} = op;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = Pm;
+ let Inst{12-10} = Pn;
+ let Inst{9-5} = Zn;
+ let Inst{4} = S;
+ let Inst{3-2} = 0b00;
+ let Inst{1-0} = ZAda;
+}
+
+multiclass sme_bf16_outer_product<bit S, string mnemonic> {
+ def : sme_outer_product_widening_inst<0b0, S, mnemonic>;
+}
+
+multiclass sme_f16_outer_product<bit S, string mnemonic> {
+ def : sme_outer_product_widening_inst<0b1, S, mnemonic>;
+}
+
+//===----------------------------------------------------------------------===//
+// SME Add Vector to Tile
+//===----------------------------------------------------------------------===//
+
+class sme_add_vector_to_tile_inst<bit op, bit V, MatrixTileOperand tile_ty,
+ ZPRRegOp zpr_ty, string mnemonic>
+ : I<(outs tile_ty:$ZAda),
+ (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn),
+ mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn",
+ "", []>, Sched<[]> {
+ bits<3> Pm;
+ bits<3> Pn;
+ bits<5> Zn;
+ let Inst{31-23} = 0b110000001;
+ let Inst{22} = op;
+ let Inst{21-17} = 0b01000;
+ let Inst{16} = V;
+ let Inst{15-13} = Pm;
+ let Inst{12-10} = Pn;
+ let Inst{9-5} = Zn;
+ let Inst{4-3} = 0b00;
+}
+
+class sme_add_vector_to_tile_u32<bit V, string mnemonic>
+ : sme_add_vector_to_tile_inst<0b0, V, TileOp32, ZPR32, mnemonic> {
+ bits<2> ZAda;
+ let Inst{2} = 0b0;
+ let Inst{1-0} = ZAda;
+}
+
+class sme_add_vector_to_tile_u64<bit V, string mnemonic>
+ : sme_add_vector_to_tile_inst<0b1, V, TileOp64, ZPR64, mnemonic> {
+ bits<3> ZAda;
+ let Inst{2-0} = ZAda;
+}
+
+//===----------------------------------------------------------------------===//
+// SME Contiguous Loads
+//===----------------------------------------------------------------------===//
+
+class sme_mem_ld_ss_base<bit Q, bit V, bits<2> msz, dag outs, dag ins,
+ string mnemonic, string argstr>
+ : I<outs, ins, mnemonic, argstr, "", []>, Sched<[]> {
+ bits<5> Rm;
+ bits<2> Rv;
+ bits<3> Pg;
+ bits<5> Rn;
+ let Inst{31-25} = 0b1110000;
+ let Inst{24} = Q;
+ let Inst{23-22} = msz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Rm;
+ let Inst{15} = V;
+ let Inst{14-13} = Rv;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+
+ let mayLoad = 1;
+}
+
+class sme_mem_ld_ss_inst_BHSD<bits<2> msz, string mnemonic,
+ MatrixTileVectorOperand tile_ty, bit is_col,
+ Operand imm_ty, RegisterOperand gpr_ty>
+ : sme_mem_ld_ss_base<
+ 0b0, is_col, msz, (outs tile_ty:$ZAt),
+ (ins MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, GPR64sp:$Rn,
+ gpr_ty:$Rm),
+ mnemonic, "\t\\{$ZAt[$Rv, $imm]\\}, $Pg/z, [$Rn, $Rm]">;
+
+class sme_mem_ld_ss_inst_Q<string mnemonic, MatrixTileVectorOperand tile_ty,
+ bit is_col>
+ : sme_mem_ld_ss_base<
+ 0b1, is_col, 0b11, (outs tile_ty:$ZAt),
+ (ins MatrixIndexGPR32Op12_15:$Rv, PPR3bAny:$Pg, GPR64sp:$Rn,
+ GPR64shifted128:$Rm),
+ mnemonic, "\t\\{$ZAt[$Rv]\\}, $Pg/z, [$Rn, $Rm]">;
+
+multiclass sme_mem_ss_aliases_BHSD<string mnemonic, Instruction inst,
+ MatrixTileVectorOperand tile_ty, Operand imm_ty,
+ RegisterOperand gpr_ty,
+ string pg_suffix=""> {
+ def : InstAlias<mnemonic # "\t$ZAt[$Rv, $imm], $Pg" # pg_suffix # ", [$Rn, $Rm]",
+ (inst tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, GPR64sp:$Rn, gpr_ty:$Rm), 0>;
+ // Default XZR offset aliases
+ def : InstAlias<mnemonic # "\t\\{$ZAt[$Rv, $imm]\\}, $Pg" # pg_suffix # ", [$Rn]",
+ (inst tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 1>;
+ def : InstAlias<mnemonic # "\t$ZAt[$Rv, $imm], $Pg" # pg_suffix # ", [$Rn]",
+ (inst tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
+}
+
+multiclass sme_mem_ss_aliases_Q<string mnemonic, Instruction inst,
+ MatrixTileVectorOperand tile_ty,
+ string pg_suffix=""> {
+ def : InstAlias<mnemonic # "\t$ZAt[$Rv], $Pg" # pg_suffix # ", [$Rn, $Rm]",
+ (inst tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, PPR3bAny:$Pg, GPR64sp:$Rn, GPR64shifted128:$Rm), 0>;
+ // Default XZR offset aliases
+ def : InstAlias<mnemonic # "\t\\{$ZAt[$Rv]\\}, $Pg" # pg_suffix # ", [$Rn]",
+ (inst tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 2>;
+ def : InstAlias<mnemonic # "\t$ZAt[$Rv], $Pg" # pg_suffix # ", [$Rn]",
+ (inst tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
+}
+
+multiclass sme_mem_ss_aliases<string mnemonic, string inst, bit is_col,
+ string pg_suffix=""> {
+ defm : sme_mem_ss_aliases_BHSD<mnemonic # "b", !cast<Instruction>(inst # _B),
+ !if(is_col, TileVectorOpV8, TileVectorOpH8),
+ imm0_15, GPR64shifted8, pg_suffix>;
+ defm : sme_mem_ss_aliases_BHSD<mnemonic # "h", !cast<Instruction>(inst # _H),
+ !if(is_col, TileVectorOpV16, TileVectorOpH16),
+ imm0_7, GPR64shifted16, pg_suffix>;
+ defm : sme_mem_ss_aliases_BHSD<mnemonic # "w", !cast<Instruction>(inst # _S),
+ !if(is_col, TileVectorOpV32, TileVectorOpH32),
+ imm0_3, GPR64shifted32, pg_suffix>;
+ defm : sme_mem_ss_aliases_BHSD<mnemonic # "d", !cast<Instruction>(inst # _D),
+ !if(is_col, TileVectorOpV64, TileVectorOpH64),
+ imm0_1, GPR64shifted64, pg_suffix>;
+ defm : sme_mem_ss_aliases_Q <mnemonic # "q", !cast<Instruction>(inst # _Q),
+ !if(is_col, TileVectorOpV128, TileVectorOpH128),
+ pg_suffix>;
+}
+
+multiclass sme_mem_ld_ss_aliases<string inst, bit is_col> {
+ defm NAME : sme_mem_ss_aliases<"ld1", inst, is_col, "/z">;
+}
+
+multiclass sme_mem_ld_v_ss<string mnemonic, bit is_col> {
+ def _B : sme_mem_ld_ss_inst_BHSD<0b00, mnemonic # "b",
+ !if(is_col, TileVectorOpV8,
+ TileVectorOpH8),
+ is_col, imm0_15, GPR64shifted8> {
+ bits<4> imm;
+ let Inst{3-0} = imm;
+ }
+ def _H : sme_mem_ld_ss_inst_BHSD<0b01, mnemonic # "h",
+ !if(is_col, TileVectorOpV16,
+ TileVectorOpH16),
+ is_col, imm0_7, GPR64shifted16> {
+ bits<1> ZAt;
+ bits<3> imm;
+ let Inst{3} = ZAt;
+ let Inst{2-0} = imm;
+ }
+ def _S : sme_mem_ld_ss_inst_BHSD<0b10, mnemonic # "w",
+ !if(is_col, TileVectorOpV32,
+ TileVectorOpH32),
+ is_col, imm0_3, GPR64shifted32> {
+ bits<2> ZAt;
+ bits<2> imm;
+ let Inst{3-2} = ZAt;
+ let Inst{1-0} = imm;
+ }
+ def _D : sme_mem_ld_ss_inst_BHSD<0b11, mnemonic # "d",
+ !if(is_col, TileVectorOpV64,
+ TileVectorOpH64),
+ is_col, imm0_1, GPR64shifted64> {
+ bits<3> ZAt;
+ bits<1> imm;
+ let Inst{3-1} = ZAt;
+ let Inst{0} = imm;
+ }
+ def _Q : sme_mem_ld_ss_inst_Q<mnemonic # "q",
+ !if(is_col, TileVectorOpV128,
+ TileVectorOpH128),
+ is_col> {
+ bits<4> ZAt;
+ let Inst{3-0} = ZAt;
+ }
+
+ defm : sme_mem_ld_ss_aliases<NAME, is_col>;
+}
+
+multiclass sme_mem_ld_ss<string mnemonic> {
+ defm _H : sme_mem_ld_v_ss<mnemonic, /*is_col=*/0b0>;
+ defm _V : sme_mem_ld_v_ss<mnemonic, /*is_col=*/0b1>;
+}
+
+//===----------------------------------------------------------------------===//
+// SME Contiguous Stores
+//===----------------------------------------------------------------------===//
+
+class sme_mem_st_ss_base<bit Q, bit V, bits<2> msz, dag ins,
+ string mnemonic, string argstr>
+ : I<(outs), ins, mnemonic, argstr, "", []>, Sched<[]> {
+ bits<5> Rm;
+ bits<2> Rv;
+ bits<3> Pg;
+ bits<5> Rn;
+ let Inst{31-25} = 0b1110000;
+ let Inst{24} = Q;
+ let Inst{23-22} = msz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = V;
+ let Inst{14-13} = Rv;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+
+ let mayStore = 1;
+ let hasSideEffects = 1;
+}
+
+class sme_mem_st_ss_inst_BHSD<bits<2> msz, string mnemonic,
+ MatrixTileVectorOperand tile_ty, bit is_col,
+ Operand imm_ty, RegisterOperand gpr_ty>
+ : sme_mem_st_ss_base<
+ 0b0, is_col, msz,
+ (ins tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg,
+ GPR64sp:$Rn, gpr_ty:$Rm),
+ mnemonic, "\t\\{$ZAt[$Rv, $imm]\\}, $Pg, [$Rn, $Rm]">;
+
+class sme_mem_st_ss_inst_Q<string mnemonic, MatrixTileVectorOperand tile_ty,
+ bit is_col>
+ : sme_mem_st_ss_base<
+ 0b1, is_col, 0b11,
+ (ins tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, PPR3bAny:$Pg,
+ GPR64sp:$Rn, GPR64shifted128:$Rm),
+ mnemonic, "\t\\{$ZAt[$Rv]\\}, $Pg, [$Rn, $Rm]">;
+
+multiclass sme_mem_st_ss_aliases<string inst, bit is_col> {
+ defm NAME : sme_mem_ss_aliases<"st1", inst, is_col>;
+}
+
+multiclass sme_mem_st_v_ss<string mnemonic, bit is_col> {
+ def _B : sme_mem_st_ss_inst_BHSD<0b00, mnemonic # "b",
+ !if(is_col, TileVectorOpV8,
+ TileVectorOpH8),
+ is_col, imm0_15, GPR64shifted8> {
+ bits<4> imm;
+ let Inst{3-0} = imm;
+ }
+ def _H : sme_mem_st_ss_inst_BHSD<0b01, mnemonic # "h",
+ !if(is_col, TileVectorOpV16,
+ TileVectorOpH16),
+ is_col, imm0_7, GPR64shifted16> {
+ bits<1> ZAt;
+ bits<3> imm;
+ let Inst{3} = ZAt;
+ let Inst{2-0} = imm;
+ }
+ def _S : sme_mem_st_ss_inst_BHSD<0b10, mnemonic # "w",
+ !if(is_col, TileVectorOpV32,
+ TileVectorOpH32),
+ is_col, imm0_3, GPR64shifted32> {
+ bits<2> ZAt;
+ bits<2> imm;
+ let Inst{3-2} = ZAt;
+ let Inst{1-0} = imm;
+ }
+ def _D : sme_mem_st_ss_inst_BHSD<0b11, mnemonic # "d",
+ !if(is_col, TileVectorOpV64,
+ TileVectorOpH64),
+ is_col, imm0_1, GPR64shifted64> {
+ bits<3> ZAt;
+ bits<1> imm;
+ let Inst{3-1} = ZAt;
+ let Inst{0} = imm;
+ }
+ def _Q : sme_mem_st_ss_inst_Q<mnemonic # "q",
+ !if(is_col, TileVectorOpV128,
+ TileVectorOpH128),
+ is_col> {
+ bits<4> ZAt;
+ let Inst{3-0} = ZAt;
+ }
+
+ defm : sme_mem_st_ss_aliases<NAME, is_col>;
+}
+
+multiclass sme_mem_st_ss<string mnemonic> {
+ defm _H : sme_mem_st_v_ss<mnemonic, /*is_col=*/0b0>;
+ defm _V : sme_mem_st_v_ss<mnemonic, /*is_col=*/0b1>;
+}
+
+//===----------------------------------------------------------------------===//
+// SME Save and Restore Array
+//===----------------------------------------------------------------------===//
+
+class sme_spill_fill_inst<bit isStore, dag outs, dag ins, string opcodestr>
+ : I<outs, ins, opcodestr, "\t$ZAt[$Rv, $imm4], [$Rn, $offset, mul vl]", "",
+ []>,
+ Sched<[]> {
+ bits<2> Rv;
+ bits<5> Rn;
+ bits<4> imm4;
+ let Inst{31-22} = 0b1110000100;
+ let Inst{21} = isStore;
+ let Inst{20-15} = 0b000000;
+ let Inst{14-13} = Rv;
+ let Inst{12-10} = 0b000;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = imm4;
+
+ let mayLoad = !not(isStore);
+ let mayStore = isStore;
+}
+
+multiclass sme_spill_fill<bit isStore, dag outs, dag ins, string opcodestr> {
+ def NAME : sme_spill_fill_inst<isStore, outs, ins, opcodestr>;
+
+ def : InstAlias<opcodestr # "\t$ZAt[$Rv, $imm4], [$Rn]",
+ (!cast<Instruction>(NAME) MatrixOp:$ZAt,
+ MatrixIndexGPR32Op12_15:$Rv, imm0_15:$imm4, GPR64sp:$Rn, 0), 1>;
+}
+
+multiclass sme_spill<string opcodestr> {
+ defm NAME : sme_spill_fill<0b1, (outs),
+ (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv,
+ imm0_15:$imm4, GPR64sp:$Rn,
+ imm0_15:$offset),
+ opcodestr>;
+}
+
+multiclass sme_fill<string opcodestr> {
+ defm NAME : sme_spill_fill<0b0, (outs MatrixOp:$ZAt),
+ (ins MatrixIndexGPR32Op12_15:$Rv,
+ imm0_15:$imm4, GPR64sp:$Rn,
+ imm0_15:$offset),
+ opcodestr>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+class sme_vector_to_tile_base<bit Q, bit V, bits<2> sz, dag outs, dag ins,
+ string mnemonic, string argstr>
+ : I<outs, ins, mnemonic, argstr, "", []>, Sched<[]> {
+ bits<2> Rv;
+ bits<3> Pg;
+ bits<5> Zn;
+ let Inst{31-24} = 0b11000000;
+ let Inst{23-22} = sz;
+ let Inst{21-17} = 0b00000;
+ let Inst{16} = Q;
+ let Inst{15} = V;
+ let Inst{14-13} = Rv;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = 0b0;
+}
+
+class sme_vector_to_tile_inst<bits<2> sz, MatrixTileVectorOperand tile_ty,
+ bit is_col, Operand imm_ty, ZPRRegOp zpr_ty,
+ string mnemonic>
+ : sme_vector_to_tile_base<0b0, is_col, sz, (outs tile_ty:$ZAd),
+ (ins MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, zpr_ty:$Zn),
+ mnemonic, "\t$ZAd[$Rv, $imm], $Pg/m, $Zn">;
+
+class sme_vector_to_tile_inst_Q<MatrixTileVectorOperand tile_ty,
+ bit is_col, string mnemonic>
+ : sme_vector_to_tile_base<0b1, is_col, 0b11, (outs tile_ty:$ZAd),
+ (ins MatrixIndexGPR32Op12_15:$Rv, PPR3bAny:$Pg, ZPR128:$Zn),
+ mnemonic, "\t$ZAd[$Rv], $Pg/m, $Zn">;
+
+multiclass sme_vector_to_tile_aliases<Instruction inst,
+ MatrixTileVectorOperand tile_ty,
+ ZPRRegOp zpr_ty, Operand imm_ty> {
+ def : InstAlias<"mov\t$ZAd[$Rv, $imm], $Pg/m, $Zn",
+ (inst tile_ty:$ZAd, MatrixIndexGPR32Op12_15:$Rv, imm0_15:$imm, PPR3bAny:$Pg, zpr_ty:$Zn), 1>;
+}
+
+multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> {
+ def _B : sme_vector_to_tile_inst<0b00, !if(is_col, TileVectorOpV8,
+ TileVectorOpH8),
+ is_col, imm0_15, ZPR8, mnemonic> {
+ bits<4> imm;
+ let Inst{3-0} = imm;
+ }
+ def _H : sme_vector_to_tile_inst<0b01, !if(is_col, TileVectorOpV16,
+ TileVectorOpH16),
+ is_col, imm0_7, ZPR16, mnemonic> {
+ bits<1> ZAd;
+ bits<3> imm;
+ let Inst{3} = ZAd;
+ let Inst{2-0} = imm;
+ }
+ def _S : sme_vector_to_tile_inst<0b10, !if(is_col, TileVectorOpV32,
+ TileVectorOpH32),
+ is_col, imm0_3, ZPR32, mnemonic> {
+ bits<2> ZAd;
+ bits<2> imm;
+ let Inst{3-2} = ZAd;
+ let Inst{1-0} = imm;
+ }
+ def _D : sme_vector_to_tile_inst<0b11, !if(is_col, TileVectorOpV64,
+ TileVectorOpH64),
+ is_col, imm0_1, ZPR64, mnemonic> {
+ bits<3> ZAd;
+ bits<1> imm;
+ let Inst{3-1} = ZAd;
+ let Inst{0} = imm;
+ }
+ def _Q : sme_vector_to_tile_inst_Q<!if(is_col, TileVectorOpV128,
+ TileVectorOpH128),
+ is_col, mnemonic> {
+ bits<4> ZAd;
+ bits<1> imm;
+ let Inst{3-0} = ZAd;
+ }
+
+ defm : sme_vector_to_tile_aliases<!cast<Instruction>(NAME # _B),
+ !if(is_col, TileVectorOpV8,
+ TileVectorOpH8),
+ ZPR8, imm0_15>;
+ defm : sme_vector_to_tile_aliases<!cast<Instruction>(NAME # _H),
+ !if(is_col, TileVectorOpV16,
+ TileVectorOpH16),
+ ZPR16, imm0_7>;
+ defm : sme_vector_to_tile_aliases<!cast<Instruction>(NAME # _S),
+ !if(is_col, TileVectorOpV32,
+ TileVectorOpH32),
+ ZPR32, imm0_3>;
+ defm : sme_vector_to_tile_aliases<!cast<Instruction>(NAME # _D),
+ !if(is_col, TileVectorOpV64,
+ TileVectorOpH64),
+ ZPR64, imm0_1>;
+
+ def : InstAlias<"mov\t$ZAd[$Rv], $Pg/m, $Zn",
+ (!cast<Instruction>(NAME # _Q) !if(is_col,
+ TileVectorOpV128,
+ TileVectorOpH128):$ZAd,
+ MatrixIndexGPR32Op12_15:$Rv,
+ PPR3bAny:$Pg, ZPR128:$Zn), 1>;
+}
+
+multiclass sme_vector_to_tile<string mnemonic> {
+ defm _H : sme_vector_v_to_tile<mnemonic, /*is_col=*/0b0>;
+ defm _V : sme_vector_v_to_tile<mnemonic, /*is_col=*/0b1>;
+}
+
+class sme_tile_to_vector_base<bit Q, bit V, bits<2> sz, dag outs, dag ins,
+ string mnemonic, string argstr>
+ : I<outs, ins, mnemonic, argstr, "", []>, Sched<[]> {
+ bits<2> Rv;
+ bits<3> Pg;
+ bits<5> Zd;
+ let Inst{31-24} = 0b11000000;
+ let Inst{23-22} = sz;
+ let Inst{21-17} = 0b00001;
+ let Inst{16} = Q;
+ let Inst{15} = V;
+ let Inst{14-13} = Rv;
+ let Inst{12-10} = Pg;
+ let Inst{9} = 0b0;
+ let Inst{4-0} = Zd;
+}
+
+class sme_tile_to_vector_inst<bits<2> sz, ZPRRegOp zpr_ty,
+ MatrixTileVectorOperand tile_ty,
+ bit is_col, Operand imm_ty, string mnemonic>
+ : sme_tile_to_vector_base<0b0, is_col, sz, (outs zpr_ty:$Zd),
+ (ins PPR3bAny:$Pg, tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm),
+ mnemonic, "\t$Zd, $Pg/m, $ZAn[$Rv, $imm]">;
+
+class sme_tile_to_vector_inst_Q<MatrixTileVectorOperand tile_ty,
+ bit is_col, string mnemonic>
+ : sme_tile_to_vector_base<0b1, is_col, 0b11, (outs ZPR128:$Zd),
+ (ins PPR3bAny:$Pg, tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv),
+ mnemonic, "\t$Zd, $Pg/m, $ZAn[$Rv]">;
+
+multiclass sme_tile_to_vector_aliases<Instruction inst, ZPRRegOp zpr_ty,
+ MatrixTileVectorOperand tile_ty,
+ Operand imm_ty > {
+ def : InstAlias<"mov\t$Zd, $Pg/m, $ZAn[$Rv, $imm]",
+ (inst zpr_ty:$Zd, PPR3bAny:$Pg, tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm), 1>;
+}
+
+multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
+ def _B : sme_tile_to_vector_inst<0b00, ZPR8, !if(is_col, TileVectorOpV8,
+ TileVectorOpH8),
+ is_col, imm0_15, mnemonic> {
+ bits<4> imm;
+ let Inst{8-5} = imm;
+ }
+ def _H : sme_tile_to_vector_inst<0b01, ZPR16, !if(is_col, TileVectorOpV16,
+ TileVectorOpH16),
+ is_col, imm0_7, mnemonic> {
+ bits<1> ZAn;
+ bits<3> imm;
+ let Inst{8} = ZAn;
+ let Inst{7-5} = imm;
+ }
+ def _S : sme_tile_to_vector_inst<0b10, ZPR32, !if(is_col, TileVectorOpV32,
+ TileVectorOpH32),
+ is_col, imm0_3, mnemonic> {
+ bits<2> ZAn;
+ bits<2> imm;
+ let Inst{8-7} = ZAn;
+ let Inst{6-5} = imm;
+ }
+ def _D : sme_tile_to_vector_inst<0b11, ZPR64, !if(is_col, TileVectorOpV64,
+ TileVectorOpH64),
+ is_col, imm0_1, mnemonic> {
+ bits<3> ZAn;
+ bits<1> imm;
+ let Inst{8-6} = ZAn;
+ let Inst{5} = imm;
+ }
+ def _Q : sme_tile_to_vector_inst_Q<!if(is_col, TileVectorOpV128,
+ TileVectorOpH128),
+ is_col, mnemonic> {
+ bits<4> ZAn;
+ let Inst{8-5} = ZAn;
+ }
+
+ defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _B), ZPR8,
+ !if(is_col, TileVectorOpV8,
+ TileVectorOpH8), imm0_15>;
+ defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _H), ZPR16,
+ !if(is_col, TileVectorOpV16,
+ TileVectorOpH16), imm0_7>;
+ defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _S), ZPR32,
+ !if(is_col, TileVectorOpV32,
+ TileVectorOpH32), imm0_3>;
+ defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _D), ZPR64,
+ !if(is_col, TileVectorOpV64,
+ TileVectorOpH64), imm0_1>;
+
+ def : InstAlias<"mov\t$Zd, $Pg/m, $ZAn[$Rv]",
+ (!cast<Instruction>(NAME # _Q) ZPR128:$Zd, PPR3bAny:$Pg,
+ !if(is_col,
+ TileVectorOpV128,
+ TileVectorOpH128):$ZAn,
+ MatrixIndexGPR32Op12_15:$Rv), 1>;
+}
+
+multiclass sme_tile_to_vector<string mnemonic> {
+ defm _H : sme_tile_to_vector_v<mnemonic, /*is_col=*/0b0>;
+ defm _V : sme_tile_to_vector_v<mnemonic, /*is_col=*/0b1>;
+}
+
+//===----------------------------------------------------------------------===//
+// SME Zero
+//===----------------------------------------------------------------------===//
+
+class sme_zero_inst<string mnemonic>
+ : I<(outs MatrixTileList:$imm), (ins),
+ mnemonic, "\t$imm", "", []>, Sched<[]> {
+ bits<8> imm;
+ let Inst{31-8} = 0b110000000000100000000000;
+ let Inst{7-0} = imm;
+}
+
+multiclass sme_zero<string mnemonic> {
+ def NAME : sme_zero_inst<mnemonic>;
+
+ def : InstAlias<"zero\t\\{za\\}", (!cast<Instruction>(NAME) 0b11111111), 1>;
+ def : InstAlias<"zero\t\\{za0.h\\}", (!cast<Instruction>(NAME) 0b01010101), 1>;
+ def : InstAlias<"zero\t\\{za1.h\\}", (!cast<Instruction>(NAME) 0b10101010), 1>;
+ def : InstAlias<"zero\t\\{za0.s\\}", (!cast<Instruction>(NAME) 0b00010001), 1>;
+ def : InstAlias<"zero\t\\{za1.s\\}", (!cast<Instruction>(NAME) 0b00100010), 1>;
+ def : InstAlias<"zero\t\\{za2.s\\}", (!cast<Instruction>(NAME) 0b01000100), 1>;
+ def : InstAlias<"zero\t\\{za3.s\\}", (!cast<Instruction>(NAME) 0b10001000), 1>;
+ def : InstAlias<"zero\t\\{za0.s,za1.s\\}", (!cast<Instruction>(NAME) 0b00110011), 1>;
+ def : InstAlias<"zero\t\\{za0.s,za3.s\\}", (!cast<Instruction>(NAME) 0b10011001), 1>;
+ def : InstAlias<"zero\t\\{za1.s,za2.s\\}", (!cast<Instruction>(NAME) 0b01100110), 1>;
+ def : InstAlias<"zero\t\\{za2.s,za3.s\\}", (!cast<Instruction>(NAME) 0b11001100), 1>;
+ def : InstAlias<"zero\t\\{za0.s,za1.s,za2.s\\}", (!cast<Instruction>(NAME) 0b01110111), 1>;
+ def : InstAlias<"zero\t\\{za0.s,za1.s,za3.s\\}", (!cast<Instruction>(NAME) 0b10111011), 1>;
+ def : InstAlias<"zero\t\\{za0.s,za2.s,za3.s\\}", (!cast<Instruction>(NAME) 0b11011101), 1>;
+ def : InstAlias<"zero\t\\{za1.s,za2.s,za3.s\\}", (!cast<Instruction>(NAME) 0b11101110), 1>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Instructions
+//===----------------------------------------------------------------------===//
+
+class sve2_int_perm_revd<string asm>
+ : I<(outs ZPR128:$Zd), (ins ZPR128:$_Zd, PPR3bAny:$Pg, ZPR128:$Zn),
+ asm, "\t$Zd, $Pg/m, $Zn", "", []>,
+ Sched<[]> {
+ bits<5> Zd;
+ bits<3> Pg;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = 0b00; // size
+ let Inst{21-13} = 0b101110100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+ let DestructiveInstType = DestructiveUnary;
+ let ElementSize = ZPR128.ElementSize;
+}
+
+class sve2_clamp<string asm, bits<2> sz, bit U, ZPRRegOp zpr_ty>
+ : I<(outs zpr_ty:$Zd), (ins zpr_ty:$Zn, zpr_ty:$Zm, zpr_ty:$_Zd),
+ asm, "\t$Zd, $Zn, $Zm", "", []>,
+ Sched<[]> {
+ bits<5> Zm;
+ bits<5> Zn;
+ bits<5> Zd;
+ let Inst{31-24} = 0b01000100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-11} = 0b11000;
+ let Inst{10} = U;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = zpr_ty.ElementSize;
+}
+
+multiclass sve2_clamp<string asm, bit U> {
+ def _B : sve2_clamp<asm, 0b00, U, ZPR8>;
+ def _H : sve2_clamp<asm, 0b01, U, ZPR16>;
+ def _S : sve2_clamp<asm, 0b10, U, ZPR32>;
+ def _D : sve2_clamp<asm, 0b11, U, ZPR64>;
+}
+
+class sve2_int_perm_dup_p<string asm, PPRRegOp ppr_ty, Operand imm_ty>
+ : I<(outs ppr_ty:$Pd), (ins PPRAny:$Pg, ppr_ty:$Pn,
+ MatrixIndexGPR32Op12_15:$Rm, imm_ty:$imm),
+ asm, "\t$Pd, $Pg/z, $Pn[$Rm, $imm]", "", []>,
+ Sched<[]> {
+ bits<2> Rm;
+ bits<4> Pg;
+ bits<4> Pn;
+ bits<4> Pd;
+ let Inst{31-24} = 0b00100101;
+ let Inst{21} = 0b1;
+ let Inst{17-16} = Rm;
+ let Inst{15-14} = 0b01;
+ let Inst{13-10} = Pg;
+ let Inst{9} = 0b0;
+ let Inst{8-5} = Pn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Pd;
+}
+
+multiclass sve2_int_perm_dup_p<string asm> {
+ def _B : sve2_int_perm_dup_p<asm, PPR8, imm0_15> {
+ bits<4> imm;
+ let Inst{23-22} = imm{3-2};
+ let Inst{20-19} = imm{1-0};
+ let Inst{18} = 0b1;
+ }
+ def _H : sve2_int_perm_dup_p<asm, PPR16, imm0_7> {
+ bits<3> imm;
+ let Inst{23-22} = imm{2-1};
+ let Inst{20} = imm{0};
+ let Inst{19-18} = 0b10;
+ }
+ def _S : sve2_int_perm_dup_p<asm, PPR32, imm0_3> {
+ bits<2> imm;
+ let Inst{23-22} = imm{1-0};
+ let Inst{20-18} = 0b100;
+ }
+ def _D : sve2_int_perm_dup_p<asm, PPR64, imm0_1> {
+ bits<1> imm;
+ let Inst{23} = imm;
+ let Inst{22} = 0b1;
+ let Inst{20-18} = 0b000;
+ }
+
+ def : InstAlias<"dup\t$Pd, $Pg/z, $Pn[$Rm]",
+ (!cast<Instruction>(NAME # _B) PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, MatrixIndexGPR32Op12_15:$Rm, 0), 1>;
+ def : InstAlias<"dup\t$Pd, $Pg/z, $Pn[$Rm]",
+ (!cast<Instruction>(NAME # _H) PPR16:$Pd, PPRAny:$Pg, PPR16:$Pn, MatrixIndexGPR32Op12_15:$Rm, 0), 1>;
+ def : InstAlias<"dup\t$Pd, $Pg/z, $Pn[$Rm]",
+ (!cast<Instruction>(NAME # _S) PPR32:$Pd, PPRAny:$Pg, PPR32:$Pn, MatrixIndexGPR32Op12_15:$Rm, 0), 1>;
+ def : InstAlias<"dup\t$Pd, $Pg/z, $Pn[$Rm]",
+ (!cast<Instruction>(NAME # _D) PPR64:$Pd, PPRAny:$Pg, PPR64:$Pn, MatrixIndexGPR32Op12_15:$Rm, 0), 1>;
+}
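The zero {...} aliases in sme_zero encode each named tile as a fixed 8-bit mask over the 64-bit tiles ZAD0..ZAD7: a tile of a wider element size simply covers a strided subset of those eight. A small sketch (in C++ rather than TableGen, with the per-size tile count passed explicitly) that reproduces the masks in the alias list:

#include <cstdio>

// TilesForSize: 1 for .b, 2 for .h, 4 for .s, 8 for .d. Tile N of that size
// covers ZAD bits N, N+TilesForSize, N+2*TilesForSize, ...
static unsigned tileMask(unsigned Tile, unsigned TilesForSize) {
  unsigned Mask = 0;
  for (unsigned Bit = Tile; Bit < 8; Bit += TilesForSize)
    Mask |= 1u << Bit;
  return Mask;
}

int main() {
  std::printf("za0.h -> 0x%02x\n", tileMask(0, 2)); // 0x55 (0b01010101)
  std::printf("za1.s -> 0x%02x\n", tileMask(1, 4)); // 0x22 (0b00100010)
  std::printf("za0.b -> 0x%02x\n", tileMask(0, 1)); // 0xff (whole of ZA)
}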
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td b/src/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 4eecf72..02d3a76 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/src/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -204,6 +204,11 @@
def SVELogicalImm32Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i32>", []>;
def SVELogicalImm64Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64>", []>;
+def SVELogicalImm8NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i8, true>", []>;
+def SVELogicalImm16NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i16, true>", []>;
+def SVELogicalImm32NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i32, true>", []>;
+def SVELogicalImm64NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64, true>", []>;
+
def SVE8BitLslImm : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>;
def SVEArithUImm8Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i8>", []>;
@@ -221,6 +226,8 @@
def SVEShiftImmR32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 32, true>", []>;
def SVEShiftImmR64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 64, true>", []>;
+def SVEAllActive : ComplexPattern<untyped, 0, "SelectAllActivePredicate", []>;
+
class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
let Name = "SVEExactFPImmOperand" # Suffix;
let DiagnosticType = "Invalid" # Name;
@@ -256,6 +263,17 @@
def sve_cnt_mul_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, false>">;
def sve_cnt_shl_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, true>">;
+
+def sve_ext_imm_0_1 : ComplexPattern<i32, 1, "SelectEXTImm<1, 8>">;
+def sve_ext_imm_0_3 : ComplexPattern<i32, 1, "SelectEXTImm<3, 4>">;
+def sve_ext_imm_0_7 : ComplexPattern<i32, 1, "SelectEXTImm<7, 2>">;
+def sve_ext_imm_0_15 : ComplexPattern<i32, 1, "SelectEXTImm<15, 1>">;
+
+def int_aarch64_sve_cntp_oneuse : PatFrag<(ops node:$pred, node:$src2),
+ (int_aarch64_sve_cntp node:$pred, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+
//===----------------------------------------------------------------------===//
// SVE PTrue - These are used extensively throughout the pattern matching so
// it's important we define them first.
@@ -322,6 +340,15 @@
: Pat<(vtd (op pg:$Op1, vts:$Op2, vtd:$Op3)),
(inst $Op3, $Op1, $Op2)>;
+
+multiclass SVE_1_Op_PassthruUndef_Pat<ValueType vtd, SDPatternOperator op, ValueType pg,
+ ValueType vts, Instruction inst> {
+ def : Pat<(vtd (op pg:$Op1, vts:$Op2, (vtd undef))),
+ (inst (IMPLICIT_DEF), $Op1, $Op2)>;
+ def : Pat<(vtd (op (pg (SVEAllActive:$Op1)), vts:$Op2, vtd:$Op3)),
+ (inst $Op3, $Op1, $Op2)>;
+}
+
// Used to match FP_ROUND_MERGE_PASSTHRU, which has an additional flag for the
// type of rounding. This is matched by timm0_1 in pattern below and ignored.
class SVE_1_Op_Passthru_Round_Pat<ValueType vtd, SDPatternOperator op, ValueType pg,
@@ -339,9 +366,9 @@
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))),
(inst $Op1, i32:$imm, i32:$shift)>;
-class SVE_1_Op_Imm_Arith_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
+class SVE_1_Op_Imm_Arith_All_Active<ValueType vt, ValueType pt, SDPatternOperator op,
ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst>
- : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
+ : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
(inst $Op1, i32:$imm)>;
class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
@@ -357,14 +384,28 @@
class SVE_2_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op,
ValueType pt, ValueType vt1, ValueType vt2,
Instruction inst>
-: Pat<(vtd (op (pt (AArch64ptrue 31)), vt1:$Op1, vt2:$Op2)),
+: Pat<(vtd (op (pt (SVEAllActive)), vt1:$Op1, vt2:$Op2)),
(inst $Op1, $Op2)>;
+class SVE_2_Op_Pred_All_Active_Pt<ValueType vtd, SDPatternOperator op,
+ ValueType pt, ValueType vt1, ValueType vt2,
+ Instruction inst>
+: Pat<(vtd (op (pt (SVEAllActive:$Op1)), vt1:$Op2, vt2:$Op3)),
+ (inst $Op1, $Op2, $Op3)>;
+
class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, Instruction inst>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)),
(inst $Op1, $Op2, $Op3)>;
+multiclass SVE_3_Op_Undef_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+ ValueType vt2, ValueType vt3, Instruction inst> {
+ def : Pat<(vtd (op (vt1 undef), vt2:$Op1, vt3:$Op2)),
+ (inst (IMPLICIT_DEF), $Op1, $Op2)>;
+ def : Pat<(vtd (op vt1:$Op1, (vt2 (SVEAllActive:$Op2)), vt3:$Op3)),
+ (inst $Op1, $Op2, $Op3)>;
+}
+
class SVE_4_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, ValueType vt4,
Instruction inst>
@@ -423,6 +464,14 @@
: Pat<(vt (op pt:$Pg, vt:$Src, inreg_vt, vt:$PassThru)),
(inst $PassThru, $Pg, $Src)>;
+multiclass SVE_InReg_Extend_PassthruUndef<ValueType vt, SDPatternOperator op, ValueType pt,
+ ValueType inreg_vt, Instruction inst> {
+ def : Pat<(vt (op pt:$Pg, vt:$Src, inreg_vt, (vt undef))),
+ (inst (IMPLICIT_DEF), $Pg, $Src)>;
+ def : Pat<(vt (op (pt (SVEAllActive:$Pg)), vt:$Src, inreg_vt, vt:$PassThru)),
+ (inst $PassThru, $Pg, $Src)>;
+}
+
class SVE_Shift_DupImm_Pred_Pat<ValueType vt, SDPatternOperator op,
ValueType pt, ValueType it,
ComplexPattern cast, Instruction inst>
@@ -432,7 +481,7 @@
class SVE_Shift_DupImm_All_Active_Pat<ValueType vt, SDPatternOperator op,
ValueType pt, ValueType it,
ComplexPattern cast, Instruction inst>
-: Pat<(vt (op (pt (AArch64ptrue 31)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
+: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
(inst $Rn, i32:$imm)>;
//
@@ -491,6 +540,22 @@
Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> {
let FalseLanes = flags;
}
+
+ class PredThreeOpPseudo<string name, ZPRRegOp zprty,
+ FalseLanesEnum flags = FalseLanesNone>
+ : SVEPseudo2Instr<name, 0>,
+ Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zs3), []> {
+ let FalseLanes = flags;
+ }
+}
+
+//
+// Pseudos for passthru operands
+//
+let hasNoSchedulingInfo = 1 in {
+ class PredOneOpPassthruPseudo<string name, ZPRRegOp zprty>
+ : SVEPseudo2Instr<name, 0>,
+ Pseudo<(outs zprty:$Zd), (ins zprty:$Passthru, PPR3bAny:$Pg, zprty:$Zs), []>;
}
//===----------------------------------------------------------------------===//
@@ -650,7 +715,8 @@
}
multiclass sve_int_count_r_x64<bits<5> opc, string asm,
- SDPatternOperator op = null_frag> {
+ SDPatternOperator op,
+ SDPatternOperator combine_op = null_frag> {
def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64z>;
def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64z>;
def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64z>;
@@ -664,6 +730,16 @@
(!cast<Instruction>(NAME # _S) PPRAny:$Pg, $Rn)>;
def : Pat<(i64 (op GPR64:$Rn, (nxv2i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _D) PPRAny:$Pg, $Rn)>;
+
+ // Combine cntp with combine_op
+ def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv16i1 (SVEAllActive)), (nxv16i1 PPRAny:$pred)))),
+ (!cast<Instruction>(NAME # _B) PPRAny:$pred, $Rn)>;
+ def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv8i1 (SVEAllActive)), (nxv8i1 PPRAny:$pred)))),
+ (!cast<Instruction>(NAME # _H) PPRAny:$pred, $Rn)>;
+ def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv4i1 (SVEAllActive)), (nxv4i1 PPRAny:$pred)))),
+ (!cast<Instruction>(NAME # _S) PPRAny:$pred, $Rn)>;
+ def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv2i1 (SVEAllActive)), (nxv2i1 PPRAny:$pred)))),
+ (!cast<Instruction>(NAME # _D) PPRAny:$pred, $Rn)>;
}
class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
@@ -1012,6 +1088,30 @@
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, FPR64asZPR:$Dn, 0), 2>;
def : InstAlias<"mov $Zd, $Qn",
(!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>;
+
+ // Duplicate extracted element of vector into all vector elements
+ def : Pat<(nxv16i8 (AArch64dup (i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)))),
+ (!cast<Instruction>(NAME # _B) ZPR:$vec, sve_elm_idx_extdup_b:$index)>;
+ def : Pat<(nxv8i16 (AArch64dup (i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
+ (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>;
+ def : Pat<(nxv4i32 (AArch64dup (i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
+ (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>;
+ def : Pat<(nxv2i64 (AArch64dup (i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
+ (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
+ def : Pat<(nxv8f16 (AArch64dup (f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
+ (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>;
+ def : Pat<(nxv8bf16 (AArch64dup (bf16 (vector_extract (nxv8bf16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
+ (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>;
+ def : Pat<(nxv4f16 (AArch64dup (f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
+ (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>;
+ def : Pat<(nxv2f16 (AArch64dup (f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
+ (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
+ def : Pat<(nxv4f32 (AArch64dup (f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
+ (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>;
+ def : Pat<(nxv2f32 (AArch64dup (f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
+ (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
+ def : Pat<(nxv2f64 (AArch64dup (f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
+ (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
}
class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm, ZPRRegOp zprty,
@@ -1261,8 +1361,8 @@
}
class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty,
- RegisterClass srcRegType>
-: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Vm),
+ FPRasZPROperand srcOpType>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcOpType:$Vm),
asm, "\t$Zdn, $Vm",
"",
[]>, Sched<[]> {
@@ -1279,16 +1379,31 @@
}
multiclass sve_int_perm_insrv<string asm, SDPatternOperator op> {
- def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8>;
- def _H : sve_int_perm_insrv<0b01, asm, ZPR16, FPR16>;
- def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32>;
- def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64>;
+ def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8asZPR>;
+ def _H : sve_int_perm_insrv<0b01, asm, ZPR16, FPR16asZPR>;
+ def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32asZPR>;
+ def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64asZPR>;
- def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, f16, !cast<Instruction>(NAME # _H)>;
- def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, f32, !cast<Instruction>(NAME # _S)>;
- def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, f64, !cast<Instruction>(NAME # _D)>;
+ def : Pat<(nxv8f16 (op nxv8f16:$Zn, f16:$Vm)),
+ (!cast<Instruction>(NAME # _H) $Zn, (INSERT_SUBREG (IMPLICIT_DEF), $Vm, hsub))>;
+ def : Pat<(nxv4f32 (op nxv4f32:$Zn, f32:$Vm)),
+ (!cast<Instruction>(NAME # _S) $Zn, (INSERT_SUBREG (IMPLICIT_DEF), $Vm, ssub))>;
+ def : Pat<(nxv2f64 (op nxv2f64:$Zn, f64:$Vm)),
+ (!cast<Instruction>(NAME # _D) $Zn, (INSERT_SUBREG (IMPLICIT_DEF), $Vm, dsub))>;
- def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, bf16, !cast<Instruction>(NAME # _H)>;
+ def : Pat<(nxv8bf16 (op nxv8bf16:$Zn, bf16:$Vm)),
+ (!cast<Instruction>(NAME # _H) $Zn, (INSERT_SUBREG (IMPLICIT_DEF), $Vm, hsub))>;
+
+ // Keep integer insertions within the vector unit.
+ def : Pat<(nxv16i8 (op (nxv16i8 ZPR:$Zn), (i32 (vector_extract (nxv16i8 ZPR:$Vm), 0)))),
+ (!cast<Instruction>(NAME # _B) $Zn, ZPR:$Vm)>;
+ def : Pat<(nxv8i16 (op (nxv8i16 ZPR:$Zn), (i32 (vector_extract (nxv8i16 ZPR:$Vm), 0)))),
+ (!cast<Instruction>(NAME # _H) $Zn, ZPR:$Vm)>;
+ def : Pat<(nxv4i32 (op (nxv4i32 ZPR:$Zn), (i32 (vector_extract (nxv4i32 ZPR:$Vm), 0)))),
+            (!cast<Instruction>(NAME # _S) $Zn, ZPR:$Vm)>;
+ def : Pat<(nxv2i64 (op (nxv2i64 ZPR:$Zn), (i64 (vector_extract (nxv2i64 ZPR:$Vm), 0)))),
+ (!cast<Instruction>(NAME # _D) $Zn, ZPR:$Vm)>;
+
}
//===----------------------------------------------------------------------===//
@@ -1371,7 +1486,9 @@
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4f16, op, nxv4i1, nxv4f16, nxv4f16, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2f16, op, nxv2i1, nxv2f16, nxv2f16, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
@@ -1486,6 +1603,13 @@
(!cast<Instruction>(NAME) ZPR64:$Zdn, logical_imm64_not:$imm), 0>;
}
+multiclass sve_int_log_imm_bic<SDPatternOperator op> {
+ def : SVE_1_Op_Imm_Log_Pat<nxv16i8, op, ZPR8, i32, SVELogicalImm8NotPat, !cast<Instruction>("AND_ZI")>;
+ def : SVE_1_Op_Imm_Log_Pat<nxv8i16, op, ZPR16, i32, SVELogicalImm16NotPat, !cast<Instruction>("AND_ZI")>;
+ def : SVE_1_Op_Imm_Log_Pat<nxv4i32, op, ZPR32, i32, SVELogicalImm32NotPat, !cast<Instruction>("AND_ZI")>;
+ def : SVE_1_Op_Imm_Log_Pat<nxv2i64, op, ZPR64, i64, SVELogicalImm64NotPat, !cast<Instruction>("AND_ZI")>;
+}
+
class sve_int_dup_mask_imm<string asm>
: I<(outs ZPR64:$Zd), (ins logical_imm64:$imms),
asm, "\t$Zd, $imms",
@@ -1542,8 +1666,7 @@
let Inst{4-0} = Zd;
}
-multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm,
- SDPatternOperator op, SDPatternOperator int_op> {
+multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>;
def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>;
@@ -1553,12 +1676,6 @@
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
-
- // Intrinsic version
- def : SVE_2_Op_Pat<nxv16i8, int_op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
- def : SVE_2_Op_Pat<nxv8i16, int_op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
- def : SVE_2_Op_Pat<nxv4i32, int_op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_2_Op_Pat<nxv2i64, int_op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -1762,14 +1879,20 @@
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm, SDPatternOperator op> {
- def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>;
- def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>;
- def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>;
+multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm, string Ps,
+ SDPatternOperator op, string revname,
+ bit isReverseInstr=0> {
+ let DestructiveInstType = DestructiveTernaryCommWithRev in {
+ def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>,
+ SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+ def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ }
def : SVE_4_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
@@ -1801,16 +1924,26 @@
let ElementSize = zprty.ElementSize;
}
-multiclass sve_fp_3op_p_zds_b<bits<2> opc, string asm, SDPatternOperator op> {
- def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>;
- def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>;
- def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>;
+multiclass sve_fp_3op_p_zds_b<bits<2> opc, string asm, SDPatternOperator op,
+ string revname, bit isReverseInstr> {
+ def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>,
+ SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+ def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>,
+ SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>,
+ SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
def : SVE_4_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
+multiclass sve_fp_3op_p_zds_zx<SDPatternOperator op, SDPatternOperator rev_op> {
+ def _UNDEF_H : PredThreeOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _UNDEF_S : PredThreeOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _UNDEF_D : PredThreeOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+}
+
//===----------------------------------------------------------------------===//
// SVE Floating Point Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
@@ -2451,13 +2584,13 @@
string revname="", bit isReverseInstr=0> {
let DestructiveInstType = flags in {
def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>,
- SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>;
+ SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>;
def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>,
- SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+ SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>,
- SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>,
- SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
}
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
@@ -2471,13 +2604,13 @@
DestructiveInstTypeEnum flags> {
let DestructiveInstType = flags in {
def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>,
- SVEPseudo2Instr<Ps # _B, 1>;
+ SVEPseudo2Instr<Ps # _B, 1>;
def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>,
- SVEPseudo2Instr<Ps # _H, 1>;
+ SVEPseudo2Instr<Ps # _H, 1>;
def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>,
- SVEPseudo2Instr<Ps # _S, 1>;
+ SVEPseudo2Instr<Ps # _S, 1>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>,
- SVEPseudo2Instr<Ps # _D, 1>;
+ SVEPseudo2Instr<Ps # _D, 1>;
}
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
@@ -2491,13 +2624,13 @@
DestructiveInstTypeEnum flags> {
let DestructiveInstType = flags in {
def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>,
- SVEPseudo2Instr<Ps # _B, 1>;
+ SVEPseudo2Instr<Ps # _B, 1>;
def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>,
- SVEPseudo2Instr<Ps # _H, 1>;
+ SVEPseudo2Instr<Ps # _H, 1>;
def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>,
- SVEPseudo2Instr<Ps # _S, 1>;
+ SVEPseudo2Instr<Ps # _S, 1>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>,
- SVEPseudo2Instr<Ps # _D, 1>;
+ SVEPseudo2Instr<Ps # _D, 1>;
}
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
@@ -2513,9 +2646,9 @@
string revname="", bit isReverseInstr=0> {
let DestructiveInstType = flags in {
def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>,
- SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>,
- SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
}
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
@@ -2805,10 +2938,8 @@
let Inst{19-16} = Zm;
}
- def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b_timm:$idx))),
- (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>;
- def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b_timm:$idx))),
- (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>;
+ def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>;
+ def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv8i16, nxv8i16, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -3082,11 +3213,20 @@
let ElementSize = zprty.ElementSize;
}
-multiclass sve2_int_arith_pred<bits<6> opc, string asm, SDPatternOperator op> {
- def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>;
- def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>;
- def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>;
- def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>;
+multiclass sve2_int_arith_pred<bits<6> opc, string asm, SDPatternOperator op,
+ string Ps = "",
+ DestructiveInstTypeEnum flags=DestructiveOther,
+ string revname="", bit isReverseInstr=0> {
+ let DestructiveInstType = flags in {
+ def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>,
+ SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>;
+ def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>,
+ SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+ def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ }
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
@@ -3146,26 +3286,46 @@
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = DestructiveOther;
+ let DestructiveInstType = DestructiveUnaryPassthru;
let ElementSize = zprty.ElementSize;
}
multiclass sve2_int_un_pred_arit_s<bits<3> opc, string asm,
SDPatternOperator op> {
- def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
+ def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>,
+ SVEPseudo2Instr<NAME # _S, 1>;
+
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+
+ def _UNDEF_S : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
+
+ defm : SVE_3_Op_Undef_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
}
multiclass sve2_int_un_pred_arit<bits<3> opc, string asm, SDPatternOperator op> {
- def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>;
- def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>;
- def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
- def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>;
+ def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>,
+ SVEPseudo2Instr<NAME # _B, 1>;
+ def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>,
+ SVEPseudo2Instr<NAME # _H, 1>;
+ def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>,
+ SVEPseudo2Instr<NAME # _S, 1>;
+ def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>,
+ SVEPseudo2Instr<NAME # _D, 1>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ def _UNDEF_B : PredOneOpPassthruPseudo<NAME # _B, ZPR8>;
+ def _UNDEF_H : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
+ def _UNDEF_S : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
+ def _UNDEF_D : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+
+ defm : SVE_3_Op_Undef_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>;
+ defm : SVE_3_Op_Undef_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ defm : SVE_3_Op_Undef_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ defm : SVE_3_Op_Undef_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
//===----------------------------------------------------------------------===//
@@ -3766,67 +3926,122 @@
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = DestructiveOther;
+ let DestructiveInstType = DestructiveUnaryPassthru;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm,
SDPatternOperator op> {
- def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>;
- def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
- def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
- def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+ def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>,
+ SVEPseudo2Instr<NAME # _B, 1>;
+ def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>,
+ SVEPseudo2Instr<NAME # _H, 1>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>,
+ SVEPseudo2Instr<NAME # _S, 1>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>,
+ SVEPseudo2Instr<NAME # _D, 1>;
def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ def _UNDEF_B : PredOneOpPassthruPseudo<NAME # _B, ZPR8>;
+ def _UNDEF_H : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
+ def _UNDEF_S : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
+ def _UNDEF_D : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm,
SDPatternOperator op> {
- def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
- def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
- def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+ def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>,
+ SVEPseudo2Instr<NAME # _H, 1>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>,
+ SVEPseudo2Instr<NAME # _S, 1>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>,
+ SVEPseudo2Instr<NAME # _D, 1>;
def : SVE_InReg_Extend<nxv8i16, op, nxv8i1, nxv8i8, !cast<Instruction>(NAME # _H)>;
def : SVE_InReg_Extend<nxv4i32, op, nxv4i1, nxv4i8, !cast<Instruction>(NAME # _S)>;
def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i8, !cast<Instruction>(NAME # _D)>;
+
+ def _UNDEF_H : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
+ def _UNDEF_S : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
+ def _UNDEF_D : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+
+ defm : SVE_InReg_Extend_PassthruUndef<nxv8i16, op, nxv8i1, nxv8i8, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ defm : SVE_InReg_Extend_PassthruUndef<nxv4i32, op, nxv4i1, nxv4i8, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ defm : SVE_InReg_Extend_PassthruUndef<nxv2i64, op, nxv2i1, nxv2i8, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
multiclass sve_int_un_pred_arit_0_w<bits<3> opc, string asm,
SDPatternOperator op> {
- def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
- def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>,
+ SVEPseudo2Instr<NAME # _S, 1>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>,
+ SVEPseudo2Instr<NAME # _D, 1>;
def : SVE_InReg_Extend<nxv4i32, op, nxv4i1, nxv4i16, !cast<Instruction>(NAME # _S)>;
def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i16, !cast<Instruction>(NAME # _D)>;
+
+ def _UNDEF_S : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
+ def _UNDEF_D : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+
+ defm : SVE_InReg_Extend_PassthruUndef<nxv4i32, op, nxv4i1, nxv4i16, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ defm : SVE_InReg_Extend_PassthruUndef<nxv2i64, op, nxv2i1, nxv2i16, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm,
SDPatternOperator op> {
- def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>,
+ SVEPseudo2Instr<NAME # _D, 1>;
def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i32, !cast<Instruction>(NAME # _D)>;
+
+ def _UNDEF_D : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+
+ defm : SVE_InReg_Extend_PassthruUndef<nxv2i64, op, nxv2i1, nxv2i32, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm,
SDPatternOperator op> {
- def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>;
- def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
- def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
- def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
+ def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>,
+ SVEPseudo2Instr<NAME # _B, 1>;
+ def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>,
+ SVEPseudo2Instr<NAME # _H, 1>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>,
+ SVEPseudo2Instr<NAME # _S, 1>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>,
+ SVEPseudo2Instr<NAME # _D, 1>;
def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ def _UNDEF_B : PredOneOpPassthruPseudo<NAME # _B, ZPR8>;
+ def _UNDEF_H : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
+ def _UNDEF_S : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
+ def _UNDEF_D : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm, SDPatternOperator op> {
- def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
- def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
- def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
+ def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>,
+ SVEPseudo2Instr<NAME # _H, 1>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>,
+ SVEPseudo2Instr<NAME # _S, 1>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>,
+ SVEPseudo2Instr<NAME # _D, 1>;
def : SVE_1_Op_Passthru_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Passthru_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
@@ -3834,6 +4049,17 @@
def : SVE_1_Op_Passthru_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Passthru_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Passthru_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+ def _UNDEF_H : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
+ def _UNDEF_S : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
+ def _UNDEF_D : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
//===----------------------------------------------------------------------===//
@@ -3933,8 +4159,7 @@
let ElementSize = ElementSizeNone;
}
-multiclass sve_int_arith_imm0<bits<3> opc, string asm,
- SDPatternOperator op, SDPatternOperator int_op> {
+multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
@@ -3944,12 +4169,6 @@
def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
-
- // Intrinsic version
- def : SVE_1_Op_Imm_OptLsl_Pat<nxv16i8, int_op, ZPR8, i32, SVEAddSubImm8Pat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, int_op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, int_op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, int_op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm0_subr<bits<3> opc, string asm, SDPatternOperator op> {
@@ -3990,10 +4209,10 @@
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperator op> {
@@ -4002,10 +4221,10 @@
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImm8Pat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImm8Pat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> {
@@ -4014,10 +4233,10 @@
def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>;
def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -4294,13 +4513,21 @@
}
multiclass SVE_SETCC_Pat<CondCode cc, CondCode invcc, ValueType predvt,
- ValueType intvt, sve_int_cmp cmp> {
+ ValueType intvt, Instruction cmp> {
def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, cc)),
(cmp $Op1, $Op2, $Op3)>;
def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, invcc)),
(cmp $Op1, $Op3, $Op2)>;
}
+multiclass SVE_SETCC_Pat_With_Zero<CondCode cc, CondCode invcc, ValueType predvt,
+ ValueType intvt, Instruction cmp> {
+ def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, (SVEDup0), cc)),
+ (cmp $Op1, $Op2)>;
+ def : Pat<(predvt (AArch64setcc_z predvt:$Op1, (SVEDup0), intvt:$Op2, invcc)),
+ (cmp $Op1, $Op2)>;
+}
+
multiclass sve_int_cmp_0<bits<3> opc, string asm, CondCode cc, CondCode invcc> {
def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>;
def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>;
@@ -4661,21 +4888,26 @@
def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_fp_3op_p_pd_cc<bits<3> opc, string asm, SDPatternOperator op,
- SDPatternOperator op_nopred>
-: sve_fp_3op_p_pd<opc, asm, op> {
- def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8f16, nxv8f16,
- !cast<Instruction>(NAME # _H), PTRUE_H>;
- def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f16, nxv4f16,
- !cast<Instruction>(NAME # _H), PTRUE_S>;
- def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f16, nxv2f16,
- !cast<Instruction>(NAME # _H), PTRUE_D>;
- def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f32, nxv4f32,
- !cast<Instruction>(NAME # _S), PTRUE_S>;
- def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f32, nxv2f32,
- !cast<Instruction>(NAME # _S), PTRUE_D>;
- def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f64, nxv2f64,
- !cast<Instruction>(NAME # _D), PTRUE_D>;
+multiclass sve_fp_3op_p_pd_cc<bits<3> opc, string asm,
+ CondCode cc1, CondCode cc2,
+ CondCode invcc1, CondCode invcc2> {
+ def _H : sve_fp_3op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
+ def _S : sve_fp_3op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
+ def _D : sve_fp_3op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
+
+ defm : SVE_SETCC_Pat<cc1, invcc1, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat<cc1, invcc1, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat<cc1, invcc1, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat<cc1, invcc1, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Pat<cc1, invcc1, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Pat<cc1, invcc1, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+ defm : SVE_SETCC_Pat<cc2, invcc2, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat<cc2, invcc2, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat<cc2, invcc2, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat<cc2, invcc2, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Pat<cc2, invcc2, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Pat<cc2, invcc2, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -4702,10 +4934,26 @@
let Inst{3-0} = Pd;
}
-multiclass sve_fp_2op_p_pd<bits<3> opc, string asm> {
+multiclass sve_fp_2op_p_pd<bits<3> opc, string asm,
+ CondCode cc1, CondCode cc2,
+ CondCode invcc1, CondCode invcc2> {
def _H : sve_fp_2op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
def _S : sve_fp_2op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
def _D : sve_fp_2op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
+
+ defm : SVE_SETCC_Pat_With_Zero<cc1, invcc1, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat_With_Zero<cc1, invcc1, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat_With_Zero<cc1, invcc1, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat_With_Zero<cc1, invcc1, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Pat_With_Zero<cc1, invcc1, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Pat_With_Zero<cc1, invcc1, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+ defm : SVE_SETCC_Pat_With_Zero<cc2, invcc2, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat_With_Zero<cc2, invcc2, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat_With_Zero<cc2, invcc2, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat_With_Zero<cc2, invcc2, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Pat_With_Zero<cc2, invcc2, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Pat_With_Zero<cc2, invcc2, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
@@ -4713,6 +4961,14 @@
//SVE Index Generation Group
//===----------------------------------------------------------------------===//
+def simm5_8b_tgt : TImmLeaf<i8, [{ return (int8_t)Imm >= -16 && (int8_t)Imm < 16; }]>;
+def simm5_16b_tgt : TImmLeaf<i16, [{ return (int16_t)Imm >= -16 && (int16_t)Imm < 16; }]>;
+def simm5_32b_tgt : TImmLeaf<i32, [{ return (int32_t)Imm >= -16 && (int32_t)Imm < 16; }]>;
+def simm5_64b_tgt : TImmLeaf<i64, [{ return (int64_t)Imm >= -16 && (int64_t)Imm < 16; }]>;
+def i64imm_32bit_tgt : TImmLeaf<i64, [{
+ return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
+}]>;
+
class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
Operand imm_ty>
: I<(outs zprty:$Zd), (ins imm_ty:$imm5, imm_ty:$imm5b),
@@ -4730,19 +4986,29 @@
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_ii<string asm, SDPatternOperator op> {
+multiclass sve_int_index_ii<string asm, SDPatternOperator step_vector, SDPatternOperator step_vector_oneuse> {
def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_8b>;
def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_16b>;
def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>;
def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>;
- def : Pat<(nxv16i8 (op simm5_8b:$imm5, simm5_8b:$imm5b)),
- (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, simm5_8b:$imm5b)>;
- def : Pat<(nxv8i16 (op simm5_16b:$imm5, simm5_16b:$imm5b)),
- (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, simm5_16b:$imm5b)>;
- def : Pat<(nxv4i32 (op simm5_32b:$imm5, simm5_32b:$imm5b)),
+ def : Pat<(nxv16i8 (step_vector simm5_8b_tgt:$imm5b)),
+ (!cast<Instruction>(NAME # "_B") (i32 0), (!cast<SDNodeXForm>("trunc_imm") $imm5b))>;
+ def : Pat<(nxv8i16 (step_vector simm5_16b_tgt:$imm5b)),
+ (!cast<Instruction>(NAME # "_H") (i32 0), (!cast<SDNodeXForm>("trunc_imm") $imm5b))>;
+ def : Pat<(nxv4i32 (step_vector simm5_32b_tgt:$imm5b)),
+ (!cast<Instruction>(NAME # "_S") (i32 0), simm5_32b:$imm5b)>;
+ def : Pat<(nxv2i64 (step_vector simm5_64b_tgt:$imm5b)),
+ (!cast<Instruction>(NAME # "_D") (i64 0), simm5_64b:$imm5b)>;
+
+ // add(step_vector(step), dup(X)) -> index(X, step).
+ def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5b)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
+ (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, (!cast<SDNodeXForm>("trunc_imm") $imm5b))>;
+ def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5b)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
+ (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, (!cast<SDNodeXForm>("trunc_imm") $imm5b))>;
+ def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5b)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))),
(!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, simm5_32b:$imm5b)>;
- def : Pat<(nxv2i64 (op simm5_64b:$imm5, simm5_64b:$imm5b)),
+ def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5b)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
(!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, simm5_64b:$imm5b)>;
}
@@ -4763,19 +5029,53 @@
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_ir<string asm, SDPatternOperator op> {
+multiclass sve_int_index_ir<string asm, SDPatternOperator step_vector, SDPatternOperator step_vector_oneuse, SDPatternOperator mulop, SDPatternOperator muloneuseop> {
def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_8b>;
def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_16b>;
def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>;
def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>;
- def : Pat<(nxv16i8 (op simm5_8b:$imm5, GPR32:$Rm)),
+ def : Pat<(nxv16i8 (step_vector i8:$imm)),
+ (!cast<Instruction>(NAME # "_B") (i32 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
+ def : Pat<(nxv8i16 (step_vector i16:$imm)),
+ (!cast<Instruction>(NAME # "_H") (i32 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
+ def : Pat<(nxv4i32 (step_vector i32:$imm)),
+ (!cast<Instruction>(NAME # "_S") (i32 0), (!cast<Instruction>("MOVi32imm") $imm))>;
+ def : Pat<(nxv2i64 (step_vector i64:$imm)),
+ (!cast<Instruction>(NAME # "_D") (i64 0), (!cast<Instruction>("MOVi64imm") $imm))>;
+ def : Pat<(nxv2i64 (step_vector i64imm_32bit_tgt:$imm)),
+ (!cast<Instruction>(NAME # "_D") (i64 0), (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
+
+ // add(step_vector(step), dup(X)) -> index(X, step).
+ def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
+ (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
+ def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
+ (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
+ def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))),
+ (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, (!cast<Instruction>("MOVi32imm") $imm))>;
+ def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
+ (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, (!cast<Instruction>("MOVi64imm") $imm))>;
+ def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
+ (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
+
+ // mul(step_vector(1), dup(Y)) -> index(0, Y).
+ def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "_B") (i32 0), GPR32:$Rm)>;
+ def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "_H") (i32 0), GPR32:$Rm)>;
+ def : Pat<(mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "_S") (i32 0), GPR32:$Rm)>;
+ def : Pat<(mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))),
+ (!cast<Instruction>(NAME # "_D") (i64 0), GPR64:$Rm)>;
+
+ // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y).
+ def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
(!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>;
- def : Pat<(nxv8i16 (op simm5_16b:$imm5, GPR32:$Rm)),
+ def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
(!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>;
- def : Pat<(nxv4i32 (op simm5_32b:$imm5, GPR32:$Rm)),
+ def : Pat<(add (muloneuseop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))),
(!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>;
- def : Pat<(nxv2i64 (op simm5_64b:$imm5, GPR64:$Rm)),
+ def : Pat<(add (muloneuseop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
(!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>;
}
@@ -4796,19 +5096,20 @@
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_ri<string asm, SDPatternOperator op> {
+multiclass sve_int_index_ri<string asm, SDPatternOperator step_vector, SDPatternOperator step_vector_oneuse> {
def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_8b>;
def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_16b>;
def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>;
def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>;
- def : Pat<(nxv16i8 (op GPR32:$Rm, simm5_8b:$imm5)),
- (!cast<Instruction>(NAME # "_B") GPR32:$Rm, simm5_8b:$imm5)>;
- def : Pat<(nxv8i16 (op GPR32:$Rm, simm5_16b:$imm5)),
- (!cast<Instruction>(NAME # "_H") GPR32:$Rm, simm5_16b:$imm5)>;
- def : Pat<(nxv4i32 (op GPR32:$Rm, simm5_32b:$imm5)),
+ // add(step_vector(step), dup(X)) -> index(X, step).
+ def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5)), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "_B") GPR32:$Rm, (!cast<SDNodeXForm>("trunc_imm") $imm5))>;
+ def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5)), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "_H") GPR32:$Rm, (!cast<SDNodeXForm>("trunc_imm") $imm5))>;
+ def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5)), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),
(!cast<Instruction>(NAME # "_S") GPR32:$Rm, simm5_32b:$imm5)>;
- def : Pat<(nxv2i64 (op GPR64:$Rm, simm5_64b:$imm5)),
+ def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5)), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))),
(!cast<Instruction>(NAME # "_D") GPR64:$Rm, simm5_64b:$imm5)>;
}
@@ -4829,16 +5130,33 @@
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_rr<string asm, SDPatternOperator op> {
+multiclass sve_int_index_rr<string asm, SDPatternOperator step_vector, SDPatternOperator step_vector_oneuse, SDPatternOperator mulop> {
def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>;
def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>;
def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>;
def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>;
- def : SVE_2_Op_Pat<nxv16i8, op, i32, i32, !cast<Instruction>(NAME # _B)>;
- def : SVE_2_Op_Pat<nxv8i16, op, i32, i32, !cast<Instruction>(NAME # _H)>;
- def : SVE_2_Op_Pat<nxv4i32, op, i32, i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_2_Op_Pat<nxv2i64, op, i64, i64, !cast<Instruction>(NAME # _D)>;
+ // add(step_vector(step), dup(X)) -> index(X, step).
+ def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))),
+ (!cast<Instruction>(NAME # "_B") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
+ def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (AArch64dup(i32 GPR32:$Rn)))),
+ (!cast<Instruction>(NAME # "_H") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
+ def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (AArch64dup(i32 GPR32:$Rn)))),
+ (!cast<Instruction>(NAME # "_S") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") $imm))>;
+ def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))),
+ (!cast<Instruction>(NAME # "_D") GPR64:$Rn, (!cast<Instruction>("MOVi64imm") $imm))>;
+ def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))),
+ (!cast<Instruction>(NAME # "_D") GPR64:$Rn, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
+
+ // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y).
+ def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))),
+ (!cast<Instruction>(NAME # "_B") GPR32:$Rn, GPR32:$Rm)>;
+ def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),(nxv8i16 (AArch64dup(i32 GPR32:$Rn)))),
+ (!cast<Instruction>(NAME # "_H") GPR32:$Rn, GPR32:$Rm)>;
+ def : Pat<(add (mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),(nxv4i32 (AArch64dup(i32 GPR32:$Rn)))),
+ (!cast<Instruction>(NAME # "_S") GPR32:$Rn, GPR32:$Rm)>;
+ def : Pat<(add (mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))),(nxv2i64 (AArch64dup(i64 GPR64:$Rn)))),
+ (!cast<Instruction>(NAME # "_D") GPR64:$Rn, GPR64:$Rm)>;
}
//===----------------------------------------------------------------------===//
@@ -5049,10 +5367,14 @@
let Inst{4-0} = Zd;
}
-multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm> {
+multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm, SDPatternOperator op> {
def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>;
def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>;
+
+ def : SVE_2_Op_Pred_All_Active<nxv16i8, op, nxv16i1, nxv16i8, nxv2i64, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pred_All_Active<nxv8i16, op, nxv8i1, nxv8i16, nxv2i64, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pred_All_Active<nxv4i32, op, nxv4i1, nxv4i32, nxv2i64, !cast<Instruction>(NAME # _S)>;
}
class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
@@ -6705,8 +7027,8 @@
multiclass sve_mem_32b_prfm_sv_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
- PatFrag op_sxtw,
- PatFrag op_uxtw> {
+ SDPatternOperator op_sxtw,
+ SDPatternOperator op_uxtw> {
def _UXTW_SCALED : sve_mem_32b_prfm_sv<msz, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_32b_prfm_sv<msz, 1, asm, sxtw_opnd>;
@@ -7059,8 +7381,8 @@
multiclass sve_mem_64b_prfm_sv_ext_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
- PatFrag op_sxtw,
- PatFrag op_uxtw> {
+ SDPatternOperator op_sxtw,
+ SDPatternOperator op_uxtw> {
def _UXTW_SCALED : sve_mem_64b_prfm_sv<msz, 0, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_64b_prfm_sv<msz, 1, 0, asm, sxtw_opnd>;
@@ -7073,7 +7395,7 @@
}
multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm,
- RegisterOperand zprext, PatFrag frag> {
+ RegisterOperand zprext, SDPatternOperator frag> {
def NAME : sve_mem_64b_prfm_sv<msz, 1, 1, asm, zprext>;
def : Pat<(frag (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 zprext:$Zm), (i32 sve_prfop:$prfop)),
@@ -7466,6 +7788,7 @@
let Inst{3-0} = Pd;
let Defs = [NZCV];
+ let isPTestLike = 1;
}
multiclass sve2_char_match<bit opc, string asm, SDPatternOperator op> {
@@ -7998,3 +8321,15 @@
def : SVE_Shift_DupImm_Pred_Pat<nxv4i32, op, nxv4i1, i32, imm_s, !cast<Instruction>(NAME # _UNDEF_S)>;
def : SVE_Shift_DupImm_Pred_Pat<nxv2i64, op, nxv2i1, i64, imm_d, !cast<Instruction>(NAME # _UNDEF_D)>;
}
+
+multiclass sve_int_bin_pred_all_active_bhsd<SDPatternOperator op> {
+ def _UNDEF_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
+ def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+
+ def : SVE_2_Op_Pred_All_Active_Pt<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>;
+ def : SVE_2_Op_Pred_All_Active_Pt<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ def : SVE_2_Op_Pred_All_Active_Pt<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ def : SVE_2_Op_Pred_All_Active_Pt<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
+}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/src/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
index 9911f33..79dcca8 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -9,17 +9,19 @@
//
// Performs general IR level optimizations on SVE intrinsics.
//
-// The main goal of this pass is to remove unnecessary reinterpret
-// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
+// This pass performs the following optimizations:
//
-// %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
-// %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+// - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
+// %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+// %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+// ; (%1 can be replaced with a reinterpret of %2)
//
-// This pass also looks for ptest intrinsics & phi instructions where the
-// operands are being needlessly converted to and from svbool_t.
+// - optimizes ptest intrinsics where the operands are being needlessly
+// converted to and from svbool_t.
//
//===----------------------------------------------------------------------===//
+#include "AArch64.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
@@ -54,16 +56,13 @@
void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
- static IntrinsicInst *isReinterpretToSVBool(Value *V);
+ bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
+ SmallSetVector<IntrinsicInst *, 4> &PTrues);
+ bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
- static bool optimizeIntrinsic(Instruction *I);
-
+ /// Operates at function scope, i.e. optimizations are applied locally
+ /// within each function.
bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
-
- static bool optimizeConvertFromSVBool(IntrinsicInst *I);
- static bool optimizePTest(IntrinsicInst *I);
-
- static bool processPhiNode(IntrinsicInst *I);
};
} // end anonymous namespace
@@ -78,185 +77,211 @@
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
-namespace llvm {
-ModulePass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); }
-} // namespace llvm
-
-/// Returns V if it's a cast from <n x 16 x i1> (aka svbool_t), nullptr
-/// otherwise.
-IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
- IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
- if (!I)
- return nullptr;
-
- if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
- return nullptr;
-
- return I;
+ModulePass *llvm::createSVEIntrinsicOptsPass() {
+ return new SVEIntrinsicOpts();
}
-/// The function will remove redundant reinterprets casting in the presence
-/// of the control flow
-bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
+/// Checks if a ptrue intrinsic call is promoted. The act of promoting a
+/// ptrue will introduce zeroing. For example:
+///
+/// %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+/// %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+/// %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
+///
+/// %1 is promoted, because it is converted:
+///
+/// <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
+///
+/// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
+static bool isPTruePromoted(IntrinsicInst *PTrue) {
+ // Find all users of this intrinsic that are calls to convert-to-svbool
+ // reinterpret intrinsics.
+ SmallVector<IntrinsicInst *, 4> ConvertToUses;
+ for (User *User : PTrue->users()) {
+ if (match(User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) {
+ ConvertToUses.push_back(cast<IntrinsicInst>(User));
+ }
+ }
- SmallVector<Instruction *, 32> Worklist;
- auto RequiredType = X->getType();
-
- auto *PN = dyn_cast<PHINode>(X->getArgOperand(0));
- assert(PN && "Expected Phi Node!");
-
- // Don't create a new Phi unless we can remove the old one.
- if (!PN->hasOneUse())
+ // If no such calls were found, this ptrue is not promoted.
+ if (ConvertToUses.empty())
return false;
- for (Value *IncValPhi : PN->incoming_values()) {
- auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
- if (!Reinterpret ||
- RequiredType != Reinterpret->getArgOperand(0)->getType())
- return false;
+ // Otherwise, try to find users of the convert-to-svbool intrinsics that are
+ // calls to the convert-from-svbool intrinsic, and would result in some lanes
+ // being zeroed.
+ const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType());
+ for (IntrinsicInst *ConvertToUse : ConvertToUses) {
+ for (User *User : ConvertToUse->users()) {
+ auto *IntrUser = dyn_cast<IntrinsicInst>(User);
+ if (IntrUser && IntrUser->getIntrinsicID() ==
+ Intrinsic::aarch64_sve_convert_from_svbool) {
+ const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType());
+
+ // Would some lanes become zeroed by the conversion?
+ if (IntrUserVTy->getElementCount().getKnownMinValue() >
+ PTrueVTy->getElementCount().getKnownMinValue())
+ // This is a promoted ptrue.
+ return true;
+ }
+ }
}
- // Create the new Phi
- LLVMContext &Ctx = PN->getContext();
- IRBuilder<> Builder(Ctx);
- Builder.SetInsertPoint(PN);
- PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
- Worklist.push_back(PN);
-
- for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
- auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
- NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
- Worklist.push_back(Reinterpret);
- }
-
- // Cleanup Phi Node and reinterprets
- X->replaceAllUsesWith(NPN);
- X->eraseFromParent();
-
- for (auto &I : Worklist)
- if (I->use_empty())
- I->eraseFromParent();
-
- return true;
-}
-
-bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
- IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
- IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
-
- if (Op1 && Op2 &&
- Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
- Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
- Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
-
- Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
- Type *Tys[] = {Op1->getArgOperand(0)->getType()};
- Module *M = I->getParent()->getParent()->getParent();
-
- auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
- auto CI = CallInst::Create(Fn, Ops, I->getName(), I);
-
- I->replaceAllUsesWith(CI);
- I->eraseFromParent();
- if (Op1->use_empty())
- Op1->eraseFromParent();
- if (Op1 != Op2 && Op2->use_empty())
- Op2->eraseFromParent();
-
- return true;
- }
-
+ // If no matching calls were found, this is not a promoted ptrue.
return false;
}
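
A minimal standalone sketch of the zeroing rule that isPTruePromoted checks, assuming only the lane-count comparison above; the helper name introducesZeroing is illustrative and not part of the patch.

// Standalone sketch (not LLVM code): converting a predicate to svbool and back
// to a *wider* logical type zeroes the extra lanes, so the original ptrue
// counts as "promoted".
#include <cassert>

// Minimum known lane counts of the source ptrue and of the type it is
// reinterpreted back to via convert.{to,from}.svbool.
static bool introducesZeroing(unsigned SrcMinLanes, unsigned DstMinLanes) {
  return DstMinLanes > SrcMinLanes;
}

int main() {
  assert(introducesZeroing(4, 8));  // nxv4i1 -> svbool -> nxv8i1: lanes 4..7 zeroed
  assert(!introducesZeroing(4, 2)); // nxv4i1 -> svbool -> nxv2i1: no zeroing
  return 0;
}
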
-bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
- assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool &&
- "Unexpected opcode");
-
- // If the reinterpret instruction operand is a PHI Node
- if (isa<PHINode>(I->getArgOperand(0)))
- return processPhiNode(I);
-
- SmallVector<Instruction *, 32> CandidatesForRemoval;
- Value *Cursor = I->getOperand(0), *EarliestReplacement = nullptr;
-
- const auto *IVTy = cast<VectorType>(I->getType());
-
- // Walk the chain of conversions.
- while (Cursor) {
- // If the type of the cursor has fewer lanes than the final result, zeroing
- // must take place, which breaks the equivalence chain.
- const auto *CursorVTy = cast<VectorType>(Cursor->getType());
- if (CursorVTy->getElementCount().getKnownMinValue() <
- IVTy->getElementCount().getKnownMinValue())
- break;
-
- // If the cursor has the same type as I, it is a viable replacement.
- if (Cursor->getType() == IVTy)
- EarliestReplacement = Cursor;
-
- auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
-
- // If this is not an SVE conversion intrinsic, this is the end of the chain.
- if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
- Intrinsic::aarch64_sve_convert_to_svbool ||
- IntrinsicCursor->getIntrinsicID() ==
- Intrinsic::aarch64_sve_convert_from_svbool))
- break;
-
- CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
- Cursor = IntrinsicCursor->getOperand(0);
- }
-
- // If no viable replacement in the conversion chain was found, there is
- // nothing to do.
- if (!EarliestReplacement)
+/// Attempts to coalesce ptrues in a basic block.
+bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
+ BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) {
+ if (PTrues.size() <= 1)
return false;
- I->replaceAllUsesWith(EarliestReplacement);
- I->eraseFromParent();
+ // Find the ptrue with the most lanes.
+ auto *MostEncompassingPTrue = *std::max_element(
+ PTrues.begin(), PTrues.end(), [](auto *PTrue1, auto *PTrue2) {
+ auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType());
+ auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType());
+ return PTrue1VTy->getElementCount().getKnownMinValue() <
+ PTrue2VTy->getElementCount().getKnownMinValue();
+ });
- while (!CandidatesForRemoval.empty()) {
- Instruction *Candidate = CandidatesForRemoval.pop_back_val();
- if (Candidate->use_empty())
- Candidate->eraseFromParent();
+ // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving
+ // behind only the ptrues to be coalesced.
+ PTrues.remove(MostEncompassingPTrue);
+ PTrues.remove_if([](auto *PTrue) { return isPTruePromoted(PTrue); });
+
+ // Hoist MostEncompassingPTrue to the start of the basic block. It is always
+ // safe to do this, since ptrue intrinsic calls are guaranteed to have no
+ // predecessors.
+ MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt());
+
+ LLVMContext &Ctx = BB.getContext();
+ IRBuilder<> Builder(Ctx);
+ Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator());
+
+ auto *MostEncompassingPTrueVTy =
+ cast<VectorType>(MostEncompassingPTrue->getType());
+ auto *ConvertToSVBool = Builder.CreateIntrinsic(
+ Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
+ {MostEncompassingPTrue});
+
+ bool ConvertFromCreated = false;
+ for (auto *PTrue : PTrues) {
+ auto *PTrueVTy = cast<VectorType>(PTrue->getType());
+
+ // Only create the converts if the types are not already the same, otherwise
+ // just use the most encompassing ptrue.
+ if (MostEncompassingPTrueVTy != PTrueVTy) {
+ ConvertFromCreated = true;
+
+ Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
+ auto *ConvertFromSVBool =
+ Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
+ {PTrueVTy}, {ConvertToSVBool});
+ PTrue->replaceAllUsesWith(ConvertFromSVBool);
+ } else
+ PTrue->replaceAllUsesWith(MostEncompassingPTrue);
+
+ PTrue->eraseFromParent();
}
+
+ // We never used the ConvertTo, so remove it.
+ if (!ConvertFromCreated)
+ ConvertToSVBool->eraseFromParent();
+
return true;
}
-bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
- IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
- if (!IntrI)
- return false;
+/// The goal of this function is to remove redundant calls to the SVE ptrue
+/// intrinsic in each basic block within the given functions.
+///
+/// SVE ptrues have two representations in LLVM IR:
+/// - a logical representation -- an arbitrary-width scalable vector of i1s,
+/// i.e. <vscale x N x i1>.
+/// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
+/// scalable vector of i1s, i.e. <vscale x 16 x i1>.
+///
+/// The SVE ptrue intrinsic is used to create a logical representation of an SVE
+/// predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and P2. If
+/// P1 creates a logical SVE predicate that is at least as wide as the logical
+/// SVE predicate created by P2, then all of the bits that are true in the
+/// physical representation of P2 are necessarily also true in the physical
+/// representation of P1. P1 'encompasses' P2; therefore, the intrinsic call to
+/// P2 is redundant and can be replaced by an SVE reinterpret of P1 via
+/// convert.{to,from}.svbool.
+///
+/// Currently, this pass only coalesces calls to SVE ptrue intrinsics
+/// if they match the following conditions:
+///
+/// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
+/// SV_ALL indicates that all bits of the predicate vector are to be set to
+/// true. SV_POW2 indicates that all bits of the predicate vector up to the
+/// largest power-of-two are to be set to true.
+/// - the result of the call to the intrinsic is not promoted to a wider
+/// predicate. In this case, keeping the extra ptrue leads to better codegen
+/// -- coalescing here would create an irreducible chain of SVE reinterprets
+/// via convert.{to,from}.svbool.
+///
+/// EXAMPLE:
+///
+/// %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
+/// ; Logical: <1, 1, 1, 1, 1, 1, 1, 1>
+/// ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
+/// ...
+///
+/// %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
+/// ; Logical: <1, 1, 1, 1>
+/// ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
+/// ...
+///
+/// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
+///
+/// %1 = <vscale x 8 x i1> ptrue(i32 31)
+/// %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
+/// %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
+///
+bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
+ SmallSetVector<Function *, 4> &Functions) {
+ bool Changed = false;
- switch (IntrI->getIntrinsicID()) {
- case Intrinsic::aarch64_sve_convert_from_svbool:
- return optimizeConvertFromSVBool(IntrI);
- case Intrinsic::aarch64_sve_ptest_any:
- case Intrinsic::aarch64_sve_ptest_first:
- case Intrinsic::aarch64_sve_ptest_last:
- return optimizePTest(IntrI);
- default:
- return false;
+ for (auto *F : Functions) {
+ for (auto &BB : *F) {
+ SmallSetVector<IntrinsicInst *, 4> SVAllPTrues;
+ SmallSetVector<IntrinsicInst *, 4> SVPow2PTrues;
+
+ // For each basic block, collect the used ptrues and try to coalesce them.
+ for (Instruction &I : BB) {
+ if (I.use_empty())
+ continue;
+
+ auto *IntrI = dyn_cast<IntrinsicInst>(&I);
+ if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+ continue;
+
+ const auto PTruePattern =
+ cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue();
+
+ if (PTruePattern == AArch64SVEPredPattern::all)
+ SVAllPTrues.insert(IntrI);
+ if (PTruePattern == AArch64SVEPredPattern::pow2)
+ SVPow2PTrues.insert(IntrI);
+ }
+
+ Changed |= coalescePTrueIntrinsicCalls(BB, SVAllPTrues);
+ Changed |= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues);
+ }
}
- return true;
+ return Changed;
}
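
A minimal standalone sketch of the logical-to-physical layout described in the comment above, assuming the usual SVE packing where lane i of <vscale x N x i1> maps to physical bit i * (16/N) within one 128-bit granule; it shows why the SV_ALL ptrue with the most logical lanes covers every physical bit set by a narrower one. All names are illustrative, not part of the patch.

// Standalone sketch (not LLVM code): models one 128-bit granule of the
// physical (svbool) form of an SV_ALL ptrue.
#include <cassert>
#include <cstdint>

// Physical bit pattern of one granule for an SV_ALL ptrue with N logical lanes.
static uint16_t physicalPattern(unsigned LogicalLanes) {
  unsigned Stride = 16 / LogicalLanes; // 1 for nxv16i1, 2 for nxv8i1, ...
  uint16_t Bits = 0;
  for (unsigned I = 0; I < 16; I += Stride)
    Bits |= uint16_t(1) << I;
  return Bits;
}

int main() {
  uint16_t B8 = physicalPattern(8); // <vscale x 8 x i1>: 0b0101010101010101
  uint16_t B4 = physicalPattern(4); // <vscale x 4 x i1>: 0b0001000100010001
  // Every physical bit set by the nxv4i1 ptrue is also set by the nxv8i1
  // ptrue, which is why the narrower ptrue can be replaced by a reinterpret
  // of the most encompassing one.
  assert((B8 & B4) == B4);
  return 0;
}
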
bool SVEIntrinsicOpts::optimizeFunctions(
SmallSetVector<Function *, 4> &Functions) {
bool Changed = false;
- for (auto *F : Functions) {
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
- // Traverse the DT with an rpo walk so we see defs before uses, allowing
- // simplification to be done incrementally.
- BasicBlock *Root = DT->getRoot();
- ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
- for (auto *BB : RPOT)
- for (Instruction &I : make_early_inc_range(*BB))
- Changed |= optimizeIntrinsic(&I);
- }
+ Changed |= optimizePTrueIntrinsicCalls(Functions);
+
return Changed;
}
@@ -272,10 +297,7 @@
continue;
switch (F.getIntrinsicID()) {
- case Intrinsic::aarch64_sve_convert_from_svbool:
- case Intrinsic::aarch64_sve_ptest_any:
- case Intrinsic::aarch64_sve_ptest_first:
- case Intrinsic::aarch64_sve_ptest_last:
+ case Intrinsic::aarch64_sve_ptrue:
for (User *U : F.users())
Functions.insert(cast<Instruction>(U)->getFunction());
break;
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/src/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index ac59d73..20aec47 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -169,3 +169,10 @@
#include "AArch64GenSystemOperands.inc"
}
}
+
+namespace llvm {
+ namespace AArch64SVCR {
+#define GET_SVCR_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
diff --git a/src/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/src/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 1b13c94..d168c2a 100644
--- a/src/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -106,6 +106,25 @@
return Reg;
}
+inline static unsigned getXRegFromXRegTuple(unsigned RegTuple) {
+ switch (RegTuple) {
+ case AArch64::X0_X1_X2_X3_X4_X5_X6_X7: return AArch64::X0;
+ case AArch64::X2_X3_X4_X5_X6_X7_X8_X9: return AArch64::X2;
+ case AArch64::X4_X5_X6_X7_X8_X9_X10_X11: return AArch64::X4;
+ case AArch64::X6_X7_X8_X9_X10_X11_X12_X13: return AArch64::X6;
+ case AArch64::X8_X9_X10_X11_X12_X13_X14_X15: return AArch64::X8;
+ case AArch64::X10_X11_X12_X13_X14_X15_X16_X17: return AArch64::X10;
+ case AArch64::X12_X13_X14_X15_X16_X17_X18_X19: return AArch64::X12;
+ case AArch64::X14_X15_X16_X17_X18_X19_X20_X21: return AArch64::X14;
+ case AArch64::X16_X17_X18_X19_X20_X21_X22_X23: return AArch64::X16;
+ case AArch64::X18_X19_X20_X21_X22_X23_X24_X25: return AArch64::X18;
+ case AArch64::X20_X21_X22_X23_X24_X25_X26_X27: return AArch64::X20;
+ case AArch64::X22_X23_X24_X25_X26_X27_X28_FP: return AArch64::X22;
+ }
+ // For anything else, return it unchanged.
+ return RegTuple;
+}
+
static inline unsigned getBRegFromDReg(unsigned Reg) {
switch (Reg) {
case AArch64::D0: return AArch64::B0;
@@ -346,6 +365,14 @@
: SysAlias(N, E, F), ImmValue(I) {}
};
+namespace AArch64SVCR {
+ struct SVCR : SysAlias{
+ using SysAlias::SysAlias;
+ };
+ #define GET_SVCR_DECL
+ #include "AArch64GenSystemOperands.inc"
+}
+
namespace AArch64AT{
struct AT : SysAlias {
using SysAlias::SysAlias;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h
index 677c493..ca088e6 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -15,10 +15,10 @@
namespace llvm {
-class AMDGPUTargetMachine;
class FunctionPass;
class GCNTargetMachine;
class ImmutablePass;
+class MachineFunctionPass;
class ModulePass;
class Pass;
class Target;
@@ -51,12 +51,12 @@
FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
-FunctionPass *createSIAddIMGInitPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIOptimizeExecMaskingPreRAPass();
+FunctionPass *createSIOptimizeVGPRLiveRangePass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
@@ -72,7 +72,10 @@
FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
FunctionPass *createAMDGPURewriteOutArgumentsPass();
+ModulePass *createAMDGPUReplaceLDSUseWithPointerPass();
+ModulePass *createAMDGPULowerModuleLDSPass();
FunctionPass *createSIModeRegisterPass();
+FunctionPass *createGCNPreRAOptimizationsPass();
struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {}
@@ -94,6 +97,8 @@
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
Pass *createAMDGPUAnnotateKernelFeaturesPass();
+Pass *createAMDGPUAttributorPass();
+void initializeAMDGPUAttributorPass(PassRegistry &);
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
@@ -146,6 +151,21 @@
TargetMachine &TM;
};
+void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &);
+extern char &AMDGPUReplaceLDSUseWithPointerID;
+
+struct AMDGPUReplaceLDSUseWithPointerPass
+ : PassInfoMixin<AMDGPUReplaceLDSUseWithPointerPass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+void initializeAMDGPULowerModuleLDSPass(PassRegistry &);
+extern char &AMDGPULowerModuleLDSID;
+
+struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
@@ -197,14 +217,11 @@
void initializeSILowerControlFlowPass(PassRegistry &);
extern char &SILowerControlFlowID;
-void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
-extern char &SIRemoveShortExecBranchesID;
-
void initializeSIPreEmitPeepholePass(PassRegistry &);
extern char &SIPreEmitPeepholeID;
-void initializeSIInsertSkipsPass(PassRegistry &);
-extern char &SIInsertSkipsPassID;
+void initializeSILateBranchLoweringPass(PassRegistry &);
+extern char &SILateBranchLoweringPassID;
void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;
@@ -218,9 +235,6 @@
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
extern char &AMDGPUUseNativeCallsID;
-void initializeSIAddIMGInitPass(PassRegistry &);
-extern char &SIAddIMGInitID;
-
void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
extern char &AMDGPUPerfHintAnalysisID;
@@ -271,6 +285,9 @@
void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&);
extern char &AMDGPUPrintfRuntimeBindingID;
+void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
+extern char &AMDGPUResourceUsageAnalysisID;
+
struct AMDGPUPrintfRuntimeBindingPass
: PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
@@ -287,6 +304,9 @@
void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&);
extern char &SIOptimizeExecMaskingPreRAID;
+void initializeSIOptimizeVGPRLiveRangePass(PassRegistry &);
+extern char &SIOptimizeVGPRLiveRangeID;
+
void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
extern char &AMDGPUAnnotateUniformValuesPassID;
@@ -331,12 +351,12 @@
void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
-void initializeGCNRegBankReassignPass(PassRegistry &);
-extern char &GCNRegBankReassignID;
-
void initializeGCNNSAReassignPass(PassRegistry &);
extern char &GCNNSAReassignID;
+void initializeGCNPreRAOptimizationsPass(PassRegistry &);
+extern char &GCNPreRAOptimizationsID;
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
index c352c00..7991f3d 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -51,6 +51,12 @@
"Most fp64 instructions are half rate instead of quarter"
>;
+def FullRate64Ops : SubtargetFeature<"full-rate-64-ops",
+ "FullRate64Ops",
+ "true",
+ "Most fp64 instructions are full rate"
+>;
+
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"FlatAddressSpace",
"true",
@@ -148,6 +154,12 @@
"Enable XNACK support"
>;
+def FeatureTgSplit : SubtargetFeature<"tgsplit",
+ "EnableTgSplit",
+ "true",
+ "Enable threadgroup split execution"
+>;
+
def FeatureCuMode : SubtargetFeature<"cumode",
"EnableCuMode",
"true",
@@ -214,10 +226,28 @@
"MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero"
>;
+def FeatureNSAClauseBug : SubtargetFeature<"nsa-clause-bug",
+ "HasNSAClauseBug",
+ "true",
+ "MIMG-NSA in a hard clause has unpredictable results on GFX10.1"
+>;
+
def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug",
"HasFlatSegmentOffsetBug",
"true",
- "GFX10 bug, inst_offset ignored in flat segment"
+ "GFX10 bug where inst_offset is ignored when flat instructions access global memory"
+>;
+
+def FeatureNegativeScratchOffsetBug : SubtargetFeature<"negative-scratch-offset-bug",
+ "NegativeScratchOffsetBug",
+ "true",
+ "Negative immediate offsets in scratch instructions with an SGPR offset page fault on GFX9"
+>;
+
+def FeatureNegativeUnalignedScratchOffsetBug : SubtargetFeature<"negative-unaligned-scratch-offset-bug",
+ "NegativeUnalignedScratchOffsetBug",
+ "true",
+ "Scratch instructions with a VGPR offset and a negative immediate offset that is not a multiple of 4 read wrong memory on GFX10"
>;
def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",
@@ -272,6 +302,12 @@
"Additional instructions for GFX9+"
>;
+def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts",
+ "GFX90AInsts",
+ "true",
+ "Additional instructions for GFX90A+"
+>;
+
def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
"GFX10Insts",
"true",
@@ -387,6 +423,18 @@
"Support DPP8 (Data Parallel Primitives) extension"
>;
+def Feature64BitDPP : SubtargetFeature<"dpp-64bit",
+ "Has64BitDPP",
+ "true",
+ "Support DPP (Data Parallel Primitives) extension"
+>;
+
+def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops",
+ "HasPackedFP32Ops",
+ "true",
+ "Support packed fp32 instructions"
+>;
+
def FeatureR128A16 : SubtargetFeature<"r128-a16",
"HasR128A16",
"true",
@@ -411,6 +459,18 @@
"Support NSA encoding for image instructions"
>;
+def FeatureExtendedImageInsts : SubtargetFeature<"extended-image-insts",
+ "HasExtendedImageInsts",
+ "true",
+ "Support mips != 0, lod != 0, gather4, and get_lod"
+>;
+
+def FeatureGFX10_AEncoding : SubtargetFeature<"gfx10_a-encoding",
+ "GFX10_AEncoding",
+ "true",
+ "Has BVH ray tracing instructions"
+>;
+
def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding",
"GFX10_BEncoding",
"true",
@@ -444,7 +504,7 @@
def FeatureDot2Insts : SubtargetFeature<"dot2-insts",
"HasDot2Insts",
"true",
- "Has v_dot2_f32_f16, v_dot2_i32_i16, v_dot2_u32_u16, v_dot4_u32_u8, v_dot8_u32_u4 instructions"
+ "Has v_dot2_i32_i16, v_dot2_u32_u16 instructions"
>;
def FeatureDot3Insts : SubtargetFeature<"dot3-insts",
@@ -471,6 +531,12 @@
"Has v_dot4c_i32_i8 instruction"
>;
+def FeatureDot7Insts : SubtargetFeature<"dot7-insts",
+ "HasDot7Insts",
+ "true",
+ "Has v_dot2_f32_f16, v_dot4_u32_u8, v_dot8_u32_u4 instructions"
+>;
+
def FeatureMAIInsts : SubtargetFeature<"mai-insts",
"HasMAIInsts",
"true",
@@ -527,6 +593,12 @@
"Has s_memtime instruction"
>;
+def FeatureShaderCyclesRegister : SubtargetFeature<"shader-cycles-register",
+ "HasShaderCyclesRegister",
+ "true",
+ "Has SHADER_CYCLES hardware register"
+>;
+
def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts",
"HasMadMacF32Insts",
"true",
@@ -557,6 +629,16 @@
"Does not need SW waitstates"
>;
+class SubtargetFeatureNSAMaxSize <int Value> : SubtargetFeature <
+ "nsa-max-size-"#Value,
+ "NSAMaxSize",
+ !cast<string>(Value),
+ "The maximum non-sequential address size in VGPRs."
+>;
+
+def FeatureNSAMaxSize5 : SubtargetFeatureNSAMaxSize<5>;
+def FeatureNSAMaxSize13 : SubtargetFeatureNSAMaxSize<13>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -659,6 +741,18 @@
" supports it"
>;
+def FeaturePackedTID : SubtargetFeature<"packed-tid",
+ "HasPackedTID",
+ "true",
+ "Workitem IDs are packed into v0 at kernel launch"
+>;
+
+def FeatureArchitectedFlatScratch : SubtargetFeature<"architected-flat-scratch",
+ "HasArchitectedFlatScratch",
+ "true",
+ "Flat Scratch register is a readonly SPI initialized architected register"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
@@ -675,7 +769,8 @@
[FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
- FeatureTrigReducedRange]
+ FeatureTrigReducedRange, FeatureExtendedImageInsts
+ ]
>;
def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
@@ -684,7 +779,8 @@
FeatureWavefrontSize64, FeatureFlatAddressSpace,
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
- FeatureDsSrc2Insts, FeatureUnalignedBufferAccess]
+ FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess
+ ]
>;
def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
@@ -697,7 +793,9 @@
FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP,
FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
- FeatureDsSrc2Insts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess]
+ FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
+ FeatureUnalignedBufferAccess
+ ]
>;
def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
@@ -712,9 +810,10 @@
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
- FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts,
- FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
- FeatureSupportsXNACK]
+ FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
+ FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
+ FeatureNegativeScratchOffsetBug
+ ]
>;
def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
@@ -729,9 +828,9 @@
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts,
FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking,
- FeatureVOP3Literal, FeatureDPP8,
+ FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
- FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16,
+ FeatureGFX10A16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess
]
>;
@@ -816,17 +915,26 @@
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureImageGather4D16Bug]>;
def FeatureISAVersion9_0_2 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureImageGather4D16Bug]>;
def FeatureISAVersion9_0_4 : FeatureSet<
[FeatureGFX9,
FeatureLDSBankCount32,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureFmaMixInsts,
FeatureImageGather4D16Bug]>;
@@ -835,9 +943,13 @@
HalfRate64Ops,
FeatureFmaMixInsts,
FeatureLDSBankCount32,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureDLInsts,
FeatureDot1Insts,
FeatureDot2Insts,
+ FeatureDot7Insts,
FeatureSupportsSRAMECC,
FeatureImageGather4D16Bug]>;
@@ -846,6 +958,9 @@
HalfRate64Ops,
FeatureFmaMixInsts,
FeatureLDSBankCount32,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureDLInsts,
FeatureDot1Insts,
FeatureDot2Insts,
@@ -853,6 +968,7 @@
FeatureDot4Insts,
FeatureDot5Insts,
FeatureDot6Insts,
+ FeatureDot7Insts,
FeatureMAIInsts,
FeaturePkFmacF16Inst,
FeatureAtomicFaddInsts,
@@ -864,13 +980,41 @@
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureImageGather4D16Bug]>;
+def FeatureISAVersion9_0_A : FeatureSet<
+ [FeatureGFX9,
+ FeatureGFX90AInsts,
+ FeatureFmaMixInsts,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot3Insts,
+ FeatureDot4Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureDot7Insts,
+ Feature64BitDPP,
+ FeaturePackedFP32Ops,
+ FeatureMAIInsts,
+ FeaturePkFmacF16Inst,
+ FeatureAtomicFaddInsts,
+ FeatureMadMacF32Insts,
+ FeatureSupportsSRAMECC,
+ FeaturePackedTID,
+ FullRate64Ops]>;
+
def FeatureISAVersion9_0_C : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
- FeatureXNACK,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureImageGather4D16Bug]>;
// TODO: Organize more features into groups.
@@ -884,8 +1028,10 @@
FeatureVcmpxExecWARHazard,
FeatureLdsBranchVmemWARHazard,
FeatureNSAtoVMEMBug,
+ FeatureNSAClauseBug,
FeatureOffset3fBug,
- FeatureFlatSegmentOffsetBug
+ FeatureFlatSegmentOffsetBug,
+ FeatureNegativeUnalignedScratchOffsetBug
];
}
@@ -895,12 +1041,12 @@
FeatureLDSBankCount32,
FeatureDLInsts,
FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
FeatureWavefrontSize32,
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
FeatureGetWaveIdInst,
- FeatureSMemTimeInst,
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
@@ -915,13 +1061,14 @@
FeatureDot2Insts,
FeatureDot5Insts,
FeatureDot6Insts,
+ FeatureDot7Insts,
FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
FeatureWavefrontSize32,
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
FeatureGetWaveIdInst,
- FeatureSMemTimeInst,
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
@@ -936,13 +1083,32 @@
FeatureDot2Insts,
FeatureDot5Insts,
FeatureDot6Insts,
+ FeatureDot7Insts,
FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
FeatureWavefrontSize32,
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
FeatureGetWaveIdInst,
- FeatureSMemTimeInst,
+ FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts,
+ FeatureLdsMisalignedBug,
+ FeatureSupportsXNACK])>;
+
+def FeatureISAVersion10_1_3 : FeatureSet<
+ !listconcat(FeatureGroup.GFX10_1_Bugs,
+ [FeatureGFX10,
+ FeatureGFX10_AEncoding,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
+ FeatureWavefrontSize32,
+ FeatureScalarStores,
+ FeatureScalarAtomics,
+ FeatureScalarFlatScratchInsts,
+ FeatureGetWaveIdInst,
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
@@ -950,6 +1116,7 @@
def FeatureISAVersion10_3_0 : FeatureSet<
[FeatureGFX10,
+ FeatureGFX10_AEncoding,
FeatureGFX10_BEncoding,
FeatureGFX10_3Insts,
FeatureLDSBankCount32,
@@ -958,8 +1125,11 @@
FeatureDot2Insts,
FeatureDot5Insts,
FeatureDot6Insts,
+ FeatureDot7Insts,
FeatureNSAEncoding,
- FeatureWavefrontSize32]>;
+ FeatureNSAMaxSize13,
+ FeatureWavefrontSize32,
+ FeatureShaderCyclesRegister]>;
//===----------------------------------------------------------------------===//
@@ -1077,6 +1247,14 @@
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
AssemblerPredicate<(all_of (not FeatureGFX10Insts))>;
+def isGFX6GFX7GFX8GFX9NotGFX90A :
+ Predicate<"!Subtarget->hasGFX90AInsts() &&"
+ "(Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ " Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ " Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
+ AssemblerPredicate<(all_of (not FeatureGFX10Insts), (not FeatureGFX90AInsts))>;
+
def isGFX7Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate<(all_of FeatureCIInsts)>;
@@ -1097,6 +1275,32 @@
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts)>;
+def isGCN3ExcludingGFX90A :
+ Predicate<"Subtarget->isGCN3Encoding() && !Subtarget->hasGFX90AInsts()">,
+ AssemblerPredicate<(all_of FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
+
+def isGFX90APlus :
+ Predicate<"Subtarget->hasGFX90AInsts()">,
+ AssemblerPredicate<(all_of FeatureGFX90AInsts)>;
+
+def isNotGFX90APlus :
+ Predicate<"!Subtarget->hasGFX90AInsts()">,
+ AssemblerPredicate<(all_of (not FeatureGFX90AInsts))>;
+
+def isGFX8GFX9NotGFX90A :
+ Predicate<"!Subtarget->hasGFX90AInsts() &&"
+ "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
+ AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
+
+def isGFX90AOnly :
+ Predicate<"Subtarget->hasGFX90AInsts()">,
+ AssemblerPredicate<(all_of FeatureGFX90AInsts)>;
+
+def isGFX908orGFX90A :
+ Predicate<"Subtarget->hasMAIInsts()">,
+ AssemblerPredicate<(all_of FeatureMAIInsts)>;
+
def isGFX8GFX9 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
@@ -1126,6 +1330,9 @@
def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">,
AssemblerPredicate<(any_of FeatureGFX10_3Insts)>;
+def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">,
+ AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>;
+
def HasGFX10_BEncoding : Predicate<"Subtarget->hasGFX10_BEncoding()">,
AssemblerPredicate<(all_of FeatureGFX10_BEncoding)>;
@@ -1177,6 +1384,19 @@
def HasDPP8 : Predicate<"Subtarget->hasDPP8()">,
AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>;
+def Has64BitDPP : Predicate<"Subtarget->has64BitDPP()">,
+ AssemblerPredicate<(all_of Feature64BitDPP)>;
+
+def HasPackedFP32Ops : Predicate<"Subtarget->hasPackedFP32Ops()">,
+ AssemblerPredicate<(all_of FeaturePackedFP32Ops)>;
+
+def HasFmaakFmamkF32Insts :
+ Predicate<"Subtarget->hasFmaakFmamkF32Insts()">,
+ AssemblerPredicate<(any_of FeatureGFX10Insts)>;
+
+def HasExtendedImageInsts : Predicate<"Subtarget->hasExtendedImageInsts()">,
+ AssemblerPredicate<(all_of FeatureExtendedImageInsts)>;
+
def HasR128A16 : Predicate<"Subtarget->hasR128A16()">,
AssemblerPredicate<(all_of FeatureR128A16)>;
@@ -1238,6 +1458,9 @@
def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">,
AssemblerPredicate<(all_of FeatureDot6Insts)>;
+def HasDot7Insts : Predicate<"Subtarget->hasDot7Insts()">,
+ AssemblerPredicate<(all_of FeatureDot7Insts)>;
+
def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;
@@ -1250,7 +1473,8 @@
def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
AssemblerPredicate<(all_of FeatureSMemTimeInst)>;
-def HasNoSMemTimeInst : Predicate<"!Subtarget->hasSMemTimeInst()">;
+def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">,
+ AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>;
def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
@@ -1267,9 +1491,6 @@
def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
-def HasOffset3fBug : Predicate<"!Subtarget->hasOffset3fBug()">,
- AssemblerPredicate<(all_of FeatureOffset3fBug)>;
-
def EnableLateCFGStructurize : Predicate<
"EnableLateStructurizeCFG">;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index 0ed89e9..88b88a0 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -41,24 +41,28 @@
AU.setPreservesAll();
}
-// These arrays are indexed by address space value enum elements 0 ... to 7
-static const AliasResult ASAliasRules[8][8] = {
- /* Flat Global Region Group Constant Private Constant 32-bit Buffer Fat Ptr */
- /* Flat */ {MayAlias, MayAlias, NoAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
- /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias},
- /* Region */ {NoAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias , NoAlias, NoAlias},
- /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
- /* Constant */ {MayAlias, MayAlias, NoAlias, NoAlias , NoAlias , NoAlias , MayAlias, MayAlias},
- /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias , NoAlias},
- /* Constant 32-bit */ {MayAlias, MayAlias, NoAlias, NoAlias , MayAlias, NoAlias , NoAlias , MayAlias},
- /* Buffer Fat Ptr */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias}
-};
-
static AliasResult getAliasResult(unsigned AS1, unsigned AS2) {
static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 7, "Addr space out of range");
if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS)
- return MayAlias;
+ return AliasResult::MayAlias;
+
+#define ASMay AliasResult::MayAlias
+#define ASNo AliasResult::NoAlias
+ // This array is indexed by address space value enum elements 0 to 7.
+ static const AliasResult ASAliasRules[8][8] = {
+ /* Flat Global Region Group Constant Private Const32 Buf Fat Ptr */
+ /* Flat */ {ASMay, ASMay, ASNo, ASMay, ASMay, ASMay, ASMay, ASMay},
+ /* Global */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASMay, ASMay},
+ /* Region */ {ASNo, ASNo, ASMay, ASNo, ASNo, ASNo, ASNo, ASNo},
+ /* Group */ {ASMay, ASNo, ASNo, ASMay, ASNo, ASNo, ASNo, ASNo},
+ /* Constant */ {ASMay, ASMay, ASNo, ASNo, ASNo, ASNo, ASMay, ASMay},
+ /* Private */ {ASMay, ASNo, ASNo, ASNo, ASNo, ASMay, ASNo, ASNo},
+ /* Constant 32-bit */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASNo, ASMay},
+ /* Buffer Fat Ptr */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASMay, ASMay}
+ };
+#undef ASMay
+#undef ASNo
return ASAliasRules[AS1][AS2];
}
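
A standalone sketch of how the rule table above is consulted; the address-space numbering (0 = Flat through 7 = Buffer Fat Ptr) mirrors the row/column comments and is an assumption about the AMDGPUAS enum rather than a copy of it.

// Standalone sketch (not LLVM code): true == MayAlias, false == NoAlias,
// mirroring the 8x8 table above.
#include <cassert>

enum AS { Flat, Global, Region, Group, Constant, Private, Constant32, BufferFat };

static const bool MayAliasTable[8][8] = {
  /* Flat       */ {1, 1, 0, 1, 1, 1, 1, 1},
  /* Global     */ {1, 1, 0, 0, 1, 0, 1, 1},
  /* Region     */ {0, 0, 1, 0, 0, 0, 0, 0},
  /* Group      */ {1, 0, 0, 1, 0, 0, 0, 0},
  /* Constant   */ {1, 1, 0, 0, 0, 0, 1, 1},
  /* Private    */ {1, 0, 0, 0, 0, 1, 0, 0},
  /* Constant32 */ {1, 1, 0, 0, 1, 0, 0, 1},
  /* BufferFat  */ {1, 1, 0, 0, 1, 0, 1, 1},
};

int main() {
  // A global pointer and an LDS (group) pointer can never alias.
  assert(!MayAliasTable[Global][Group]);
  // The relation is symmetric, so the order of the two address spaces passed
  // to the lookup does not matter.
  for (int A = 0; A < 8; ++A)
    for (int B = 0; B < 8; ++B)
      assert(MayAliasTable[A][B] == MayAliasTable[B][A]);
  return 0;
}
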
@@ -70,7 +74,7 @@
unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace();
AliasResult Result = getAliasResult(asA, asB);
- if (Result == NoAlias)
+ if (Result == AliasResult::NoAlias)
return Result;
// In general, FLAT (generic) pointers could be aliased to LOCAL or PRIVATE
@@ -87,21 +91,21 @@
if (asA == AMDGPUAS::FLAT_ADDRESS &&
(asB == AMDGPUAS::LOCAL_ADDRESS || asB == AMDGPUAS::PRIVATE_ADDRESS)) {
const auto *ObjA =
- getUnderlyingObject(A.Ptr->stripPointerCastsAndInvariantGroups());
+ getUnderlyingObject(A.Ptr->stripPointerCastsForAliasAnalysis());
if (const LoadInst *LI = dyn_cast<LoadInst>(ObjA)) {
// If a generic pointer is loaded from the constant address space, it
// could only be a GLOBAL or CONSTANT one as that address space is soley
// prepared on the host side, where only GLOBAL or CONSTANT variables are
// visible. Note that this even holds for regular functions.
if (LI->getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
- return NoAlias;
+ return AliasResult::NoAlias;
} else if (const Argument *Arg = dyn_cast<Argument>(ObjA)) {
const Function *F = Arg->getParent();
switch (F->getCallingConv()) {
case CallingConv::AMDGPU_KERNEL:
// In the kernel function, kernel arguments won't alias to (local)
// variables in shared or private address space.
- return NoAlias;
+ return AliasResult::NoAlias;
default:
// TODO: In the regular function, if that local variable in the
// location B is not captured, that argument pointer won't alias to it
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 51af250..2af9fc9 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -120,10 +120,10 @@
for (GlobalVariable &GV : M.globals()) {
// TODO: Region address
unsigned AS = GV.getAddressSpace();
- if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
- continue;
-
- recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+ if ((AS == AMDGPUAS::REGION_ADDRESS) ||
+ (AS == AMDGPUAS::LOCAL_ADDRESS &&
+ !AMDGPUTargetMachine::EnableLowerModuleLDS))
+ recursivelyVisitUsers(GV, FuncsToAlwaysInline);
}
if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index a4e72f7..af6dfc0 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -25,6 +25,13 @@
using namespace llvm;
namespace {
+static constexpr StringLiteral ImplicitAttrNames[] = {
+ // X ids unnecessarily propagated to kernels.
+ "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
+ "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
+ "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
+ "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
+ "amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"};
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
@@ -194,18 +201,10 @@
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
bool &NeedQueuePtr) {
- // X ids unnecessarily propagated to kernels.
- static constexpr StringLiteral AttrNames[] = {
- "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
- "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
- "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
- "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
- "amdgpu-implicitarg-ptr"};
-
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;
- for (StringRef AttrName : AttrNames)
+ for (StringRef AttrName : ImplicitAttrNames)
handleAttr(Parent, Callee, AttrName);
}
@@ -268,7 +267,20 @@
bool Changed = false;
bool NeedQueuePtr = false;
bool HaveCall = false;
+ bool HasIndirectCall = false;
bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
+ CallingConv::ID CC = F.getCallingConv();
+ bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
+
+ // If this function hasAddressTaken() = true
+ // then add all attributes corresponding to the implicit args.
+ if (CallingConvSupportsAllImplicits &&
+ F.hasAddressTaken(nullptr, true, true, true)) {
+ for (StringRef AttrName : ImplicitAttrNames) {
+ F.addFnAttr(AttrName);
+ }
+ Changed = true;
+ }
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
@@ -281,10 +293,12 @@
const Function *Callee =
dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
- // TODO: Do something with indirect calls.
+ // Note the occurrence of an indirect call.
if (!Callee) {
- if (!CB->isInlineAsm())
+ if (!CB->isInlineAsm()) {
+ HasIndirectCall = true;
HaveCall = true;
+ }
continue;
}
@@ -351,6 +365,28 @@
Changed = true;
}
+ // This pass cannot copy attributes from callees to callers
+ // if there is an indirect call, and in such cases
+ // hasAddressTaken() would be false for kernels and functions
+ // making an indirect call (if they are themselves not indirectly called).
+ // We must tag all such kernels/functions with all implicit attributes
+ // for correctness.
+ // e.g.
+ // 1. Kernel K1 makes an indirect call to function F1.
+ // Without detecting an indirect call in K1, this pass will not
+ // add all implicit args to K1 (which is incorrect).
+ // 2. Kernel K1 makes direct call to F1 which makes indirect call to function
+ // F2.
+ // Without detecting an indirect call in F1 (whose hasAddressTaken() is
+ // false), the pass will not add all implicit args to F1 (which is
+ // essential for correctness).
+ if (CallingConvSupportsAllImplicits && HasIndirectCall) {
+ for (StringRef AttrName : ImplicitAttrNames) {
+ F.addFnAttr(AttrName);
+ }
+ Changed = true;
+ }
+
return Changed;
}
@@ -367,9 +403,11 @@
}
Function *F = I->getFunction();
- // Add feature attributes
- if (!F || F->isDeclaration())
+ // Ignore functions with graphics calling conventions; these are currently
+ // not allowed to have kernel arguments.
+ if (!F || F->isDeclaration() || AMDGPU::isGraphics(F->getCallingConv()))
continue;
+ // Add feature attributes
Changed |= addFeatureAttributes(*F);
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index c2a4d67..7d6845b 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -14,10 +14,8 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
@@ -30,8 +28,7 @@
class AMDGPUAnnotateUniformValues : public FunctionPass,
public InstVisitor<AMDGPUAnnotateUniformValues> {
LegacyDivergenceAnalysis *DA;
- MemoryDependenceResults *MDR;
- LoopInfo *LI;
+ MemorySSA *MSSA;
DenseMap<Value*, GetElementPtrInst*> noClobberClones;
bool isEntryFunc;
@@ -46,8 +43,7 @@
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LegacyDivergenceAnalysis>();
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
AU.setPreservesAll();
}
@@ -61,8 +57,7 @@
INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
@@ -75,49 +70,14 @@
I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
}
-static void DFS(BasicBlock *Root, SetVector<BasicBlock*> & Set) {
- for (auto I : predecessors(Root))
- if (Set.insert(I))
- DFS(I, Set);
-}
-
bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
- // 1. get Loop for the Load->getparent();
- // 2. if it exists, collect all the BBs from the most outer
- // loop and check for the writes. If NOT - start DFS over all preds.
- // 3. Start DFS over all preds from the most outer loop header.
- SetVector<BasicBlock *> Checklist;
- BasicBlock *Start = Load->getParent();
- Checklist.insert(Start);
- const Value *Ptr = Load->getPointerOperand();
- const Loop *L = LI->getLoopFor(Start);
- if (L) {
- const Loop *P = L;
- do {
- L = P;
- P = P->getParentLoop();
- } while (P);
- Checklist.insert(L->block_begin(), L->block_end());
- Start = L->getHeader();
- }
-
- DFS(Start, Checklist);
- for (auto &BB : Checklist) {
- BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ?
- BasicBlock::iterator(Load) : BB->end();
- auto Q = MDR->getPointerDependencyFrom(
- MemoryLocation::getBeforeOrAfter(Ptr), true, StartIt, BB, Load);
- if (Q.isClobber() || Q.isUnknown() ||
- // Store defines the load and thus clobbers it.
- (Q.isDef() && Q.getInst()->mayWriteToMemory()))
- return true;
- }
- return false;
+ const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(Load);
+ return !MSSA->isLiveOnEntryDef(MA);
}
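
A consolidated sketch of the MemorySSA query pattern that the hunks above switch this pass to, written as a generic legacy-PM function pass; ExamplePass and its overall structure are placeholders, and only the MemorySSAWrapperPass, getWalker(), getClobberingMemoryAccess() and isLiveOnEntryDef() calls are taken from the change itself.

// Illustrative sketch, compiled against LLVM headers; not part of the patch.
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Pass.h"

using namespace llvm;

namespace {
struct ExamplePass : public FunctionPass {
  static char ID;
  ExamplePass() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MemorySSAWrapperPass>(); // replaces MemoryDependence + LoopInfo
    AU.setPreservesAll();
  }

  bool runOnFunction(Function &F) override {
    MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
    for (BasicBlock &BB : F)
      for (Instruction &I : BB)
        if (auto *Load = dyn_cast<LoadInst>(&I)) {
          // Ask MemorySSA for the defining (clobbering) access of the load.
          // If it is the live-on-entry def, nothing in this function writes
          // the loaded memory before the load executes.
          const MemoryAccess *MA =
              MSSA.getWalker()->getClobberingMemoryAccess(Load);
          bool Clobbered = !MSSA.isLiveOnEntryDef(MA);
          (void)Clobbered; // a real pass would act on this result
        }
    return false;
  }
};
} // end anonymous namespace

char ExamplePass::ID = 0;
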
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
if (DA->isUniform(&I))
- setUniformMetadata(I.getParent()->getTerminator());
+ setUniformMetadata(&I);
}
void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
@@ -154,9 +114,9 @@
Value *Idx = Constant::getIntegerValue(
Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
// Insert GEP at the entry to make it dominate all uses
- PtrI = GetElementPtrInst::Create(
- Ptr->getType()->getPointerElementType(), Ptr,
- ArrayRef<Value*>(Idx), Twine(""), F->getEntryBlock().getFirstNonPHI());
+ PtrI = GetElementPtrInst::Create(I.getType(), Ptr,
+ ArrayRef<Value *>(Idx), Twine(""),
+ F->getEntryBlock().getFirstNonPHI());
}
I.replaceUsesOfWith(Ptr, PtrI);
}
@@ -177,9 +137,8 @@
if (skipFunction(F))
return false;
- DA = &getAnalysis<LegacyDivergenceAnalysis>();
- MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DA = &getAnalysis<LegacyDivergenceAnalysis>();
+ MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
visit(F);
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index fb273a1..aab76d2 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -92,7 +92,7 @@
case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER: {
return std::make_tuple(PrivateSegmentBuffer ? &PrivateSegmentBuffer
: nullptr,
- &AMDGPU::SGPR_128RegClass, LLT::vector(4, 32));
+ &AMDGPU::SGPR_128RegClass, LLT::fixed_vector(4, 32));
}
case AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR:
return std::make_tuple(ImplicitBufferPtr ? &ImplicitBufferPtr : nullptr,
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 139ac3b..e9ed45d 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -143,7 +143,8 @@
// Input registers for non-HSA ABI
ArgDescriptor ImplicitBufferPtr;
- // VGPRs inputs. These are always v0, v1 and v2 for entry functions.
+ // VGPR inputs. For entry functions these are either v0, v1, and v2, or packed
+ // into v0 (10 bits per dimension) if packed-tid is set.
ArgDescriptor WorkItemIDX;
ArgDescriptor WorkItemIDY;
ArgDescriptor WorkItemIDZ;
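
A standalone sketch of how the packed layout mentioned in the comment above would be decoded; the exact 10-bit fields (x in bits [9:0], y in [19:10], z in [29:20] of v0) are an assumption of this illustration rather than a quotation of the patch.

// Standalone sketch (not LLVM code): unpack the per-dimension work-item IDs
// from a single 32-bit value when the packed layout is in effect.
#include <cstdint>

struct WorkItemID { uint32_t X, Y, Z; };

static WorkItemID unpackWorkItemID(uint32_t V0) {
  return {V0 & 0x3ffu, (V0 >> 10) & 0x3ffu, (V0 >> 20) & 0x3ffu};
}
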
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c655e5e..cbc4ab2 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -18,6 +18,7 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
+#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDKernelCodeT.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
@@ -39,22 +40,6 @@
using namespace llvm;
using namespace llvm::AMDGPU;
-// We need to tell the runtime some amount ahead of time if we don't know the
-// true stack size. Assume a smaller number if this is only due to dynamic /
-// non-entry block allocas.
-static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
- "amdgpu-assume-external-call-stack-size",
- cl::desc("Assumed stack use of any external call (in bytes)"),
- cl::Hidden,
- cl::init(16384));
-
-static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
- "amdgpu-assume-dynamic-stack-object-size",
- cl::desc("Assumed extra stack use if there are any "
- "variable sized objects (in bytes)"),
- cl::Hidden,
- cl::init(4096));
-
// This should get the default rounding mode from the kernel. We just set the
// default here, but this could change if the OpenCL rounding mode pragmas are
// used.
@@ -97,12 +82,14 @@
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)) {
+ : AsmPrinter(TM, std::move(Streamer)) {
if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
if (isHsaAbiVersion2(getGlobalSTI())) {
HSAMetadataStream.reset(new HSAMD::MetadataStreamerV2());
- } else {
+ } else if (isHsaAbiVersion3(getGlobalSTI())) {
HSAMetadataStream.reset(new HSAMD::MetadataStreamerV3());
+ } else {
+ HSAMetadataStream.reset(new HSAMD::MetadataStreamerV4());
}
}
}
@@ -122,34 +109,34 @@
}
void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
- if (isHsaAbiVersion3(getGlobalSTI())) {
- std::string ExpectedTarget;
- raw_string_ostream ExpectedTargetOS(ExpectedTarget);
- IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS);
-
- getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
- }
+ // TODO: Which one is called first, emitStartOfAsmFile or
+ // emitFunctionBodyStart?
+ if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
+ initializeTargetID(M);
if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
TM.getTargetTriple().getOS() != Triple::AMDPAL)
return;
+ if (isHsaAbiVersion3Or4(getGlobalSTI()))
+ getTargetStreamer()->EmitDirectiveAMDGCNTarget();
+
if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
- HSAMetadataStream->begin(M);
+ HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
getTargetStreamer()->getPALMetadata()->readFromIR(M);
- if (isHsaAbiVersion3(getGlobalSTI()))
+ if (isHsaAbiVersion3Or4(getGlobalSTI()))
return;
- // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
+ // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2.
if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
- // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
+ // HSA and PAL emit NT_AMD_HSA_ISA_VERSION for code objects v2.
IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU());
- getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
+ getTargetStreamer()->EmitDirectiveHSACodeObjectISAV2(
Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}
@@ -159,15 +146,11 @@
return;
if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
- isHsaAbiVersion2(getGlobalSTI())) {
- // Emit ISA Version (NT_AMD_AMDGPU_ISA).
- std::string ISAVersionString;
- raw_string_ostream ISAVersionStream(ISAVersionString);
- IsaInfo::streamIsaVersion(getGlobalSTI(), ISAVersionStream);
- getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
- }
+ isHsaAbiVersion2(getGlobalSTI()))
+ getTargetStreamer()->EmitISAVersion();
- // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
+ // Emit HSA Metadata (NT_AMD_HSA_METADATA).
if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
HSAMetadataStream->end();
bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
@@ -192,11 +175,37 @@
void AMDGPUAsmPrinter::emitFunctionBodyStart() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
+ const Function &F = MF->getFunction();
+
+ // TODO: Which one is called first, emitStartOfAsmFile or
+ // emitFunctionBodyStart?
+ if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
+ initializeTargetID(*F.getParent());
+
+ const auto &FunctionTargetID = STM.getTargetID();
+ // Make sure function's xnack settings are compatible with module's
+ // xnack settings.
+ if (FunctionTargetID.isXnackSupported() &&
+ FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
+ FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
+ OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
+ "' function does not match module xnack setting");
+ return;
+ }
+ // Make sure function's sramecc settings are compatible with module's
+ // sramecc settings.
+ if (FunctionTargetID.isSramEccSupported() &&
+ FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
+ FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
+ OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
+ "' function does not match module sramecc setting");
+ return;
+ }
+
if (!MFI.isEntryFunction())
return;
- const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
- const Function &F = MF->getFunction();
if ((STM.isMesaKernel(F) || isHsaAbiVersion2(getGlobalSTI())) &&
(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
@@ -232,26 +241,25 @@
if (ReadOnlySection.getAlignment() < 64)
ReadOnlySection.setAlignment(Align(64));
- const MCSubtargetInfo &STI = MF->getSubtarget();
+ const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
SmallString<128> KernelName;
getNameWithPrefix(KernelName, &MF->getFunction());
getTargetStreamer()->EmitAmdhsaKernelDescriptor(
- STI, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
+ STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
CurrentProgramInfo.NumVGPRsForWavesPerEU,
CurrentProgramInfo.NumSGPRsForWavesPerEU -
- IsaInfo::getNumExtraSGPRs(&STI,
+ IsaInfo::getNumExtraSGPRs(&STM,
CurrentProgramInfo.VCCUsed,
CurrentProgramInfo.FlatUsed),
- CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
- hasXNACK(STI));
+ CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
Streamer.PopSection();
}
void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
if (TM.getTargetTriple().getOS() == Triple::AMDHSA &&
- isHsaAbiVersion3(getGlobalSTI())) {
+ isHsaAbiVersion3Or4(getGlobalSTI())) {
AsmPrinter::emitFunctionEntryLabel();
return;
}
@@ -322,17 +330,15 @@
}
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
- CallGraphResourceInfo.clear();
-
// Pad with s_code_end to help tools and guard against instruction prefetch
// causing stale data in caches. Arguably this should be done by the linker,
// which is why this isn't done for Mesa.
const MCSubtargetInfo &STI = *getGlobalSTI();
- if (AMDGPU::isGFX10Plus(STI) &&
+ if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
(STI.getTargetTriple().getOS() == Triple::AMDHSA ||
STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
- getTargetStreamer()->EmitCodeEnd();
+ getTargetStreamer()->EmitCodeEnd(STI);
}
return AsmPrinter::doFinalization(M);
@@ -400,6 +406,9 @@
amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
const MachineFunction &MF,
const SIProgramInfo &PI) const {
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ const Function &F = MF.getFunction();
+
amdhsa::kernel_descriptor_t KernelDescriptor;
memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
@@ -409,14 +418,24 @@
KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
+
+ Align MaxKernArgAlign;
+ KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
+
KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
+ assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
+ if (STM.hasGFX90AInsts())
+ KernelDescriptor.compute_pgm_rsrc3 =
+ CurrentProgramInfo.ComputePGMRSrc3GFX90A;
+
return KernelDescriptor;
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
CurrentProgramInfo = SIProgramInfo();
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
@@ -438,12 +457,6 @@
if (MFI->isModuleEntryFunction()) {
getSIProgramInfo(CurrentProgramInfo, MF);
- } else {
- auto I = CallGraphResourceInfo.insert(
- std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
- SIFunctionResourceInfo &Info = I.first->second;
- assert(I.second && "should only be called once per function");
- Info = analyzeResourceUsage(MF);
}
if (STM.isAmdPalOS()) {
@@ -480,7 +493,8 @@
if (!MFI->isEntryFunction()) {
OutStreamer->emitRawComment(" Function info:", false);
- SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
+ ResourceUsage->getResourceInfo(&MF.getFunction());
emitCommonFunctionComments(
Info.NumVGPR,
STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(),
@@ -521,6 +535,11 @@
" NumVGPRsForWavesPerEU: " +
Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
+ if (STM.hasGFX90AInsts())
+ OutStreamer->emitRawComment(
+ " AccumOffset: " +
+ Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
+
OutStreamer->emitRawComment(
" Occupancy: " +
Twine(CurrentProgramInfo.Occupancy), false);
@@ -550,6 +569,21 @@
" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
false);
+
+ assert(STM.hasGFX90AInsts() ||
+ CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
+ if (STM.hasGFX90AInsts()) {
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
+ Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
+ false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
+ Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
+ false);
+ }
}
if (DumpCodeInstEmitter) {
@@ -572,6 +606,36 @@
return false;
}
+// TODO: Fold this into emitFunctionBodyStart.
+void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
+ // In the beginning all features are either 'Any' or 'NotSupported',
+ // depending on global target features. This will cover empty modules.
+ getTargetStreamer()->initializeTargetID(
+ *getGlobalSTI(), getGlobalSTI()->getFeatureString());
+
+ // If module is empty, we are done.
+ if (M.empty())
+ return;
+
+ // If module is not empty, need to find first 'Off' or 'On' feature
+ // setting per feature from functions in module.
+ for (auto &F : M) {
+ auto &TSTargetID = getTargetStreamer()->getTargetID();
+ if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
+ (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
+ break;
+
+ const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
+ const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
+ if (TSTargetID->isXnackSupported())
+ if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
+ TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
+ if (TSTargetID->isSramEccSupported())
+ if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
+ TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
+ }
+}
+
uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = STM.getInstrInfo();
@@ -593,398 +657,17 @@
return CodeSize;
}
-static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
- const SIInstrInfo &TII,
- unsigned Reg) {
- for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
- if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
- return true;
- }
-
- return false;
-}
-
-int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
- const GCNSubtarget &ST) const {
- return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST,
- UsesVCC, UsesFlatScratch);
-}
-
-int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
- const GCNSubtarget &ST) const {
- return std::max(NumVGPR, NumAGPR);
-}
-
-static const Function *getCalleeFunction(const MachineOperand &Op) {
- if (Op.isImm()) {
- assert(Op.getImm() == 0);
- return nullptr;
- }
-
- return cast<Function>(Op.getGlobal());
-}
-
-AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
- const MachineFunction &MF) const {
- SIFunctionResourceInfo Info;
-
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
-
- Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
- MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
-
- // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
- // instructions aren't used to access the scratch buffer. Inline assembly may
- // need it though.
- //
- // If we only have implicit uses of flat_scr on flat instructions, it is not
- // really needed.
- if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
- (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
- !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
- !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
- Info.UsesFlatScratch = false;
- }
-
- Info.PrivateSegmentSize = FrameInfo.getStackSize();
-
- // Assume a big number if there are any unknown sized objects.
- Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
- if (Info.HasDynamicallySizedStack)
- Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
-
- if (MFI->isStackRealigned())
- Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
-
- Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
- MRI.isPhysRegUsed(AMDGPU::VCC_HI);
-
- // If there are no calls, MachineRegisterInfo can tell us the used register
- // count easily.
- // A tail call isn't considered a call for MachineFrameInfo's purposes.
- if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
- MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
- for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
- if (MRI.isPhysRegUsed(Reg)) {
- HighestVGPRReg = Reg;
- break;
- }
- }
-
- if (ST.hasMAIInsts()) {
- MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
- for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
- if (MRI.isPhysRegUsed(Reg)) {
- HighestAGPRReg = Reg;
- break;
- }
- }
- Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 :
- TRI.getHWRegIndex(HighestAGPRReg) + 1;
- }
-
- MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
- for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
- if (MRI.isPhysRegUsed(Reg)) {
- HighestSGPRReg = Reg;
- break;
- }
- }
-
- // We found the maximum register index. They start at 0, so add one to get the
- // number of registers.
- Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
- TRI.getHWRegIndex(HighestVGPRReg) + 1;
- Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
- TRI.getHWRegIndex(HighestSGPRReg) + 1;
-
- return Info;
- }
-
- int32_t MaxVGPR = -1;
- int32_t MaxAGPR = -1;
- int32_t MaxSGPR = -1;
- uint64_t CalleeFrameSize = 0;
-
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- // TODO: Check regmasks? Do they occur anywhere except calls?
- for (const MachineOperand &MO : MI.operands()) {
- unsigned Width = 0;
- bool IsSGPR = false;
- bool IsAGPR = false;
-
- if (!MO.isReg())
- continue;
-
- Register Reg = MO.getReg();
- switch (Reg) {
- case AMDGPU::EXEC:
- case AMDGPU::EXEC_LO:
- case AMDGPU::EXEC_HI:
- case AMDGPU::SCC:
- case AMDGPU::M0:
- case AMDGPU::SRC_SHARED_BASE:
- case AMDGPU::SRC_SHARED_LIMIT:
- case AMDGPU::SRC_PRIVATE_BASE:
- case AMDGPU::SRC_PRIVATE_LIMIT:
- case AMDGPU::SGPR_NULL:
- case AMDGPU::MODE:
- continue;
-
- case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
- llvm_unreachable("src_pops_exiting_wave_id should not be used");
-
- case AMDGPU::NoRegister:
- assert(MI.isDebugInstr() && "Instruction uses invalid noreg register");
- continue;
-
- case AMDGPU::VCC:
- case AMDGPU::VCC_LO:
- case AMDGPU::VCC_HI:
- case AMDGPU::VCC_LO_LO16:
- case AMDGPU::VCC_LO_HI16:
- case AMDGPU::VCC_HI_LO16:
- case AMDGPU::VCC_HI_HI16:
- Info.UsesVCC = true;
- continue;
-
- case AMDGPU::FLAT_SCR:
- case AMDGPU::FLAT_SCR_LO:
- case AMDGPU::FLAT_SCR_HI:
- continue;
-
- case AMDGPU::XNACK_MASK:
- case AMDGPU::XNACK_MASK_LO:
- case AMDGPU::XNACK_MASK_HI:
- llvm_unreachable("xnack_mask registers should not be used");
-
- case AMDGPU::LDS_DIRECT:
- llvm_unreachable("lds_direct register should not be used");
-
- case AMDGPU::TBA:
- case AMDGPU::TBA_LO:
- case AMDGPU::TBA_HI:
- case AMDGPU::TMA:
- case AMDGPU::TMA_LO:
- case AMDGPU::TMA_HI:
- llvm_unreachable("trap handler registers should not be used");
-
- case AMDGPU::SRC_VCCZ:
- llvm_unreachable("src_vccz register should not be used");
-
- case AMDGPU::SRC_EXECZ:
- llvm_unreachable("src_execz register should not be used");
-
- case AMDGPU::SRC_SCC:
- llvm_unreachable("src_scc register should not be used");
-
- default:
- break;
- }
-
- if (AMDGPU::SReg_32RegClass.contains(Reg) ||
- AMDGPU::SReg_LO16RegClass.contains(Reg) ||
- AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
- "trap handler registers should not be used");
- IsSGPR = true;
- Width = 1;
- } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
- AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
- AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 1;
- } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
- AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 1;
- } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
- "trap handler registers should not be used");
- IsSGPR = true;
- Width = 2;
- } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 2;
- } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 2;
- } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 3;
- } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 3;
- } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 3;
- } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
- "trap handler registers should not be used");
- IsSGPR = true;
- Width = 4;
- } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 4;
- } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 4;
- } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 5;
- } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 5;
- } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 5;
- } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 6;
- } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 6;
- } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 6;
- } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
- "trap handler registers should not be used");
- IsSGPR = true;
- Width = 8;
- } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 8;
- } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 8;
- } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
- "trap handler registers should not be used");
- IsSGPR = true;
- Width = 16;
- } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 16;
- } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 16;
- } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 32;
- } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 32;
- } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 32;
- } else {
- llvm_unreachable("Unknown register class");
- }
- unsigned HWReg = TRI.getHWRegIndex(Reg);
- int MaxUsed = HWReg + Width - 1;
- if (IsSGPR) {
- MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
- } else if (IsAGPR) {
- MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
- } else {
- MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
- }
- }
-
- if (MI.isCall()) {
- // Pseudo used just to encode the underlying global. Is there a better
- // way to track this?
-
- const MachineOperand *CalleeOp
- = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
-
- const Function *Callee = getCalleeFunction(*CalleeOp);
- DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
- CallGraphResourceInfo.end();
- bool IsExternal = !Callee || Callee->isDeclaration();
- if (!IsExternal)
- I = CallGraphResourceInfo.find(Callee);
-
- if (IsExternal || I == CallGraphResourceInfo.end()) {
- // Avoid crashing on undefined behavior with an illegal call to a
- // kernel. If a callsite's calling convention doesn't match the
- // function's, it's undefined behavior. If the callsite calling
- // convention does match, that would have errored earlier.
- // FIXME: The verifier shouldn't allow this.
- if (!IsExternal &&
- AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
- report_fatal_error("invalid call to entry function");
-
- // If this is a call to an external function, we can't do much. Make
- // conservative guesses.
-
- // 48 SGPRs - vcc, - flat_scr, -xnack
- int MaxSGPRGuess =
- 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
- MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
- MaxVGPR = std::max(MaxVGPR, 23);
- MaxAGPR = std::max(MaxAGPR, 23);
-
- CalleeFrameSize = std::max(CalleeFrameSize,
- static_cast<uint64_t>(AssumedStackSizeForExternalCall));
-
- Info.UsesVCC = true;
- Info.UsesFlatScratch = ST.hasFlatAddressSpace();
- Info.HasDynamicallySizedStack = true;
- } else {
- // We force CodeGen to run in SCC order, so the callee's register
- // usage etc. should be the cumulative usage of all callees.
-
- MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
- MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
- MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
- CalleeFrameSize
- = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
- Info.UsesVCC |= I->second.UsesVCC;
- Info.UsesFlatScratch |= I->second.UsesFlatScratch;
- Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
- Info.HasRecursion |= I->second.HasRecursion;
- }
-
- // FIXME: Call site could have norecurse on it
- if (!Callee || !Callee->doesNotRecurse())
- Info.HasRecursion = true;
- }
- }
- }
-
- Info.NumExplicitSGPR = MaxSGPR + 1;
- Info.NumVGPR = MaxVGPR + 1;
- Info.NumAGPR = MaxAGPR + 1;
- Info.PrivateSegmentSize += CalleeFrameSize;
-
- return Info;
-}
-
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) {
- SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
+ ResourceUsage->getResourceInfo(&MF.getFunction());
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
ProgInfo.NumArchVGPR = Info.NumVGPR;
ProgInfo.NumAccVGPR = Info.NumAGPR;
ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
+ ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
+ ProgInfo.TgSplit = STM.isTgSplitEnabled();
ProgInfo.NumSGPR = Info.NumExplicitSGPR;
ProgInfo.ScratchSize = Info.PrivateSegmentSize;
ProgInfo.VCCUsed = Info.UsesVCC;
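The AccumOffset value introduced just above is easy to sanity-check by hand. The following standalone C++ sketch is an editor's illustration, not part of the patch: it models the encode step from this hunk and the decode step used by the verbose "AccumOffset:" comment earlier in the file, assuming only that llvm::alignTo(V, 4) rounds up to a multiple of 4 (reproduced by a local helper).

// accum_offset_model.cpp -- standalone sketch of the AccumOffset arithmetic
// used above; compile with: c++ -std=c++17 accum_offset_model.cpp
#include <algorithm>
#include <cassert>
#include <cstdio>

// Same rounding as llvm::alignTo(Value, 4) for the small values used here.
static unsigned alignTo4(unsigned V) { return (V + 3) / 4 * 4; }

int main() {
  for (unsigned NumArchVGPR : {0u, 1u, 4u, 5u, 37u, 256u}) {
    // Encoded field: granules of 4 VGPRs, biased by -1 (as in getSIProgramInfo).
    unsigned AccumOffset = alignTo4(std::max(1u, NumArchVGPR)) / 4 - 1;
    // Decoded value printed by the verbose "AccumOffset:" comment.
    unsigned Decoded = (AccumOffset + 1) * 4;
    assert(Decoded >= std::max(1u, NumArchVGPR));
    std::printf("NumVGPR=%3u -> AccumOffset=%2u -> decoded %3u\n",
                NumArchVGPR, AccumOffset, Decoded);
  }
  return 0;
}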
@@ -1001,7 +684,7 @@
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
+ // The calculations related to SGPR/VGPR blocks are
// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
// unified.
unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
@@ -1163,6 +846,15 @@
S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
S_00B84C_EXCP_EN(0);
+ if (STM.hasGFX90AInsts()) {
+ AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
+ ProgInfo.AccumOffset);
+ AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+ ProgInfo.TgSplit);
+ }
+
ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU);
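For readers unfamiliar with the AMDHSA_BITS_SET / AMDHSA_BITS_GET macro pair used in the new GFX90A block above, the standalone sketch below shows the shift-and-mask pattern such macros generally follow. The field positions are invented for the example; the real SHIFT/WIDTH constants live with the kernel descriptor definitions and are not reproduced here.

// bitfield_model.cpp -- editor's standalone illustration of the shift-and-mask
// pattern behind AMDHSA_BITS_SET / AMDHSA_BITS_GET for fields such as
// COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET. Field layout here is hypothetical.
#include <cassert>
#include <cstdint>

struct Field { unsigned Shift; unsigned Width; };

static uint32_t setBits(uint32_t Dst, Field F, uint32_t Value) {
  uint32_t Mask = ((1u << F.Width) - 1u) << F.Shift;
  return (Dst & ~Mask) | ((Value << F.Shift) & Mask);
}

static uint32_t getBits(uint32_t Src, Field F) {
  return (Src >> F.Shift) & ((1u << F.Width) - 1u);
}

int main() {
  // Hypothetical 6-bit field at bit 0 and 1-bit field at bit 16.
  constexpr Field AccumOffset{0, 6}, TgSplit{16, 1};
  uint32_t Rsrc3 = 0;
  Rsrc3 = setBits(Rsrc3, AccumOffset, 9); // e.g. 40 VGPRs -> granule 10, minus 1
  Rsrc3 = setBits(Rsrc3, TgSplit, 1);
  assert(getBits(Rsrc3, AccumOffset) == 9 && getBits(Rsrc3, TgSplit) == 1);
  return 0;
}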
@@ -1262,10 +954,16 @@
auto *MD = getTargetStreamer()->getPALMetadata();
const MachineFrameInfo &MFI = MF.getFrameInfo();
MD->setFunctionScratchSize(MF, MFI.getStackSize());
+
// Set compute registers
MD->setRsrc1(CallingConv::AMDGPU_CS,
CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2);
+
+ // Set optional info
+ MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize);
+ MD->setFunctionNumUsedVgprs(MF, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+ MD->setFunctionNumUsedSgprs(MF, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}
// This is supposed to be log2(Size)
@@ -1383,3 +1081,9 @@
}
return true;
}
+
+void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AMDGPUResourceUsageAnalysis>();
+ AU.addPreserved<AMDGPUResourceUsageAnalysis>();
+ AsmPrinter::getAnalysisUsage(AU);
+}
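To make the restructuring in this file easier to follow (resource counts now come from a separate AMDGPUResourceUsageAnalysis pass, which the printer declares via addRequired and reads via getAnalysis), here is a small standalone model of the producer/consumer split. All types below are invented stand-ins, not LLVM classes.

// resource_analysis_model.cpp -- editor's standalone model of the split above:
// resource counts are produced once by a separate analysis object and the
// printer only queries the cached results. None of these types are LLVM's.
#include <cassert>
#include <map>
#include <string>
#include <utility>

struct ResourceInfo { int NumVGPR = 0; int NumSGPR = 0; };

// Stand-in for AMDGPUResourceUsageAnalysis: runs once, caches per-function info.
class ResourceUsageAnalysis {
  std::map<std::string, ResourceInfo> Infos;
public:
  void runOnModule(std::map<std::string, ResourceInfo> PerFunction) {
    Infos = std::move(PerFunction); // a real pass would walk MachineFunctions
  }
  const ResourceInfo &getResourceInfo(const std::string &Fn) const {
    return Infos.at(Fn);
  }
};

// Stand-in for the printer: it declares the dependency elsewhere (think
// getAnalysisUsage/addRequired) and only reads the cached results here.
class Printer {
  const ResourceUsageAnalysis &RU;
public:
  explicit Printer(const ResourceUsageAnalysis &RU) : RU(RU) {}
  int vgprsFor(const std::string &Fn) const {
    return RU.getResourceInfo(Fn).NumVGPR;
  }
};

int main() {
  ResourceUsageAnalysis RU;
  RU.runOnModule({{"kernel_a", {24, 16}}, {"helper_b", {8, 10}}});
  Printer P(RU);
  assert(P.vgprsFor("kernel_a") == 24 && P.vgprsFor("helper_b") == 8);
  return 0;
}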
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 9e1e26d..d3a555b 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -22,6 +22,7 @@
namespace llvm {
class AMDGPUMachineFunction;
+struct AMDGPUResourceUsageAnalysis;
class AMDGPUTargetStreamer;
class MCCodeEmitter;
class MCOperand;
@@ -39,32 +40,17 @@
class AMDGPUAsmPrinter final : public AsmPrinter {
private:
- // Track resource usage for callee functions.
- struct SIFunctionResourceInfo {
- // Track the number of explicitly used VGPRs. Special registers reserved at
- // the end are tracked separately.
- int32_t NumVGPR = 0;
- int32_t NumAGPR = 0;
- int32_t NumExplicitSGPR = 0;
- uint64_t PrivateSegmentSize = 0;
- bool UsesVCC = false;
- bool UsesFlatScratch = false;
- bool HasDynamicallySizedStack = false;
- bool HasRecursion = false;
+ void initializeTargetID(const Module &M);
- int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
- int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
- };
+ AMDGPUResourceUsageAnalysis *ResourceUsage;
SIProgramInfo CurrentProgramInfo;
- DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
- SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
@@ -146,6 +132,8 @@
const char *ExtraCode, raw_ostream &O) override;
protected:
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
std::vector<std::string> DisasmLines, HexLines;
size_t DisasmLineMaxLen;
};
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index aae2a54..3e9fdcb 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -48,6 +48,8 @@
const GCNSubtarget *ST;
bool IsPixelShader;
+ Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const;
Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
Value *const Identity) const;
Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
@@ -279,6 +281,45 @@
return B.CreateSelect(Cond, LHS, RHS);
}
+// Use the builder to create a reduction of V across the wavefront, with all
+// lanes active, returning the same result in all lanes.
+Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
+ AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const {
+ Type *const Ty = V->getType();
+ Module *M = B.GetInsertBlock()->getModule();
+ Function *UpdateDPP =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+
+ // Reduce within each row of 16 lanes.
+ for (unsigned Idx = 0; Idx < 4; Idx++) {
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
+ }
+
+ // Reduce within each pair of rows (i.e. 32 lanes).
+ assert(ST->hasPermLaneX16());
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateIntrinsic(
+ Intrinsic::amdgcn_permlanex16, {},
+ {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));
+
+ if (ST->isWave32())
+ return V;
+
+ // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
+ // combine them with a scalar operation.
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
+ Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
+ return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
+}
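As a reading aid for the buildReduction sequence above, here is a standalone scalar model of the same reduction shape: four XOR-butterfly steps inside each row of 16 lanes, one cross-row combine, then a final combine of the two 32-lane halves for wave64. It is an editor's sketch that models only the data movement, not the actual DPP/permlane encodings or exec-mask handling.

// wave_reduce_model.cpp -- standalone scalar model of the reduction shape
// built above, using plain arrays in place of wavefront lanes.
#include <array>
#include <cassert>
#include <cstdint>
#include <numeric>

constexpr unsigned WaveSize = 64;
using Wave = std::array<uint32_t, WaveSize>;

static Wave xorSwizzle(const Wave &V, unsigned Mask) {
  // Lane L reads lane L ^ Mask: this stays inside a row of 16 for Mask < 16
  // and crosses the two rows of each 32-lane half for Mask == 16.
  Wave R{};
  for (unsigned L = 0; L < WaveSize; ++L)
    R[L] = V[L ^ Mask];
  return R;
}

int main() {
  Wave V;
  std::iota(V.begin(), V.end(), 1); // lanes hold 1..64
  // Reduce within each row of 16: masks 1, 2, 4, 8 (the row_xmask steps).
  for (unsigned Mask : {1u, 2u, 4u, 8u}) {
    Wave S = xorSwizzle(V, Mask);
    for (unsigned L = 0; L < WaveSize; ++L) V[L] += S[L];
  }
  // Combine the two rows of each 32-lane half (the permlanex16 step).
  Wave S = xorSwizzle(V, 16);
  for (unsigned L = 0; L < WaveSize; ++L) V[L] += S[L];
  // Wave64 only: combine one lane from each 32-lane half (the readlanes).
  uint32_t Result = V[0] + V[32];
  assert(Result == 64u * 65u / 2u); // sum of 1..64
  return 0;
}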
+
// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
@@ -287,10 +328,6 @@
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
- Function *PermLaneX16 =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
- Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
for (unsigned Idx = 0; Idx < 4; Idx++) {
V = buildNonAtomicBinOp(
@@ -317,9 +354,10 @@
// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
- Value *const PermX =
- B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
- B.getFalse(), B.getFalse()});
+ assert(ST->hasPermLaneX16());
+ Value *const PermX = B.CreateIntrinsic(
+ Intrinsic::amdgcn_permlanex16, {},
+ {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(
B, Op, V,
B.CreateCall(UpdateDPP,
@@ -327,7 +365,8 @@
B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
if (!ST->isWave32()) {
// Combine lane 31 into lanes 32..63.
- Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
+ Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+ {V, B.getInt32(31)});
V = buildNonAtomicBinOp(
B, Op, V,
B.CreateCall(UpdateDPP,
@@ -346,10 +385,6 @@
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
- Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
- Function *WriteLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
if (ST->hasDPPWavefrontShifts()) {
// GFX9 has DPP wavefront shift operations.
@@ -357,6 +392,11 @@
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Function *WriteLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+
// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
Value *Old = V;
@@ -480,6 +520,8 @@
Value *ExclScan = nullptr;
Value *NewV = nullptr;
+ const bool NeedResult = !I.use_empty();
+
// If we have a divergent value in each lane, we need to combine the value
// using DPP.
if (ValDivergent) {
@@ -489,35 +531,27 @@
const AtomicRMWInst::BinOp ScanOp =
Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
- NewV = buildScan(B, ScanOp, NewV, Identity);
- ExclScan = buildShiftRight(B, NewV, Identity);
+ if (!NeedResult && ST->hasPermLaneX16()) {
+ // On GFX10 the permlanex16 instruction helps us build a reduction without
+ // too many readlanes and writelanes, which are generally bad for
+ // performance.
+ NewV = buildReduction(B, ScanOp, NewV, Identity);
+ } else {
+ NewV = buildScan(B, ScanOp, NewV, Identity);
+ if (NeedResult)
+ ExclScan = buildShiftRight(B, NewV, Identity);
- // Read the value from the last lane, which has accumlated the values of
- // each active lane in the wavefront. This will be our new value which we
- // will provide to the atomic operation.
- Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
- if (TyBitWidth == 64) {
- Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
- Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
- CallInst *const ReadLaneLo = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
- CallInst *const ReadLaneHi = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
- Value *const PartialInsert = B.CreateInsertElement(
- UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
- Value *const Insert =
- B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
- NewV = B.CreateBitCast(Insert, Ty);
- } else if (TyBitWidth == 32) {
+ // Read the value from the last lane, which has accumulated the values of
+ // each active lane in the wavefront. This will be our new value which we
+ // will provide to the atomic operation.
+ Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+ assert(TyBitWidth == 32);
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
{NewV, LastLaneIdx});
- } else {
- llvm_unreachable("Unhandled atomic bit width");
}
// Finally mark the readlanes in the WWM section.
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
} else {
switch (Op) {
default:
@@ -583,7 +617,6 @@
// original instruction.
B.SetInsertPoint(&I);
- const bool NeedResult = !I.use_empty();
if (NeedResult) {
// Create a PHI node to get our new atomic result into the exit block.
PHINode *const PHI = B.CreatePHI(Ty, 2);
@@ -621,7 +654,8 @@
// from the first lane, to get our lane's index into the atomic result.
Value *LaneOffset = nullptr;
if (ValDivergent) {
- LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
+ LaneOffset =
+ B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
} else {
switch (Op) {
default:
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
new file mode 100644
index 0000000..61b1d22
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -0,0 +1,528 @@
+//===- AMDGPUAttributor.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/IPO/Attributor.h"
+
+#define DEBUG_TYPE "amdgpu-attributor"
+
+using namespace llvm;
+
+static constexpr StringLiteral ImplicitAttrNames[] = {
+ // X ids unnecessarily propagated to kernels.
+ "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
+ "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
+ "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
+ "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
+ "amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"};
+
+// We do not need to note the x workitem or workgroup id because they are always
+// initialized.
+//
+// TODO: We should not add the attributes if the known compile time workgroup
+// size is 1 for y/z.
+static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &NonKernelOnly,
+ bool &IsQueuePtr) {
+ switch (ID) {
+ case Intrinsic::amdgcn_workitem_id_x:
+ NonKernelOnly = true;
+ return "amdgpu-work-item-id-x";
+ case Intrinsic::amdgcn_workgroup_id_x:
+ NonKernelOnly = true;
+ return "amdgpu-work-group-id-x";
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::r600_read_tidig_y:
+ return "amdgpu-work-item-id-y";
+ case Intrinsic::amdgcn_workitem_id_z:
+ case Intrinsic::r600_read_tidig_z:
+ return "amdgpu-work-item-id-z";
+ case Intrinsic::amdgcn_workgroup_id_y:
+ case Intrinsic::r600_read_tgid_y:
+ return "amdgpu-work-group-id-y";
+ case Intrinsic::amdgcn_workgroup_id_z:
+ case Intrinsic::r600_read_tgid_z:
+ return "amdgpu-work-group-id-z";
+ case Intrinsic::amdgcn_dispatch_ptr:
+ return "amdgpu-dispatch-ptr";
+ case Intrinsic::amdgcn_dispatch_id:
+ return "amdgpu-dispatch-id";
+ case Intrinsic::amdgcn_kernarg_segment_ptr:
+ return "amdgpu-kernarg-segment-ptr";
+ case Intrinsic::amdgcn_implicitarg_ptr:
+ return "amdgpu-implicitarg-ptr";
+ case Intrinsic::amdgcn_queue_ptr:
+ case Intrinsic::amdgcn_is_shared:
+ case Intrinsic::amdgcn_is_private:
+ // TODO: Does not require queue ptr on gfx9+
+ case Intrinsic::trap:
+ case Intrinsic::debugtrap:
+ IsQueuePtr = true;
+ return "amdgpu-queue-ptr";
+ default:
+ return "";
+ }
+}
+
+static bool castRequiresQueuePtr(unsigned SrcAS) {
+ return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+}
+
+static bool isDSAddress(const Constant *C) {
+ const GlobalValue *GV = dyn_cast<GlobalValue>(C);
+ if (!GV)
+ return false;
+ unsigned AS = GV->getAddressSpace();
+ return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
+}
+
+class AMDGPUInformationCache : public InformationCache {
+public:
+ AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
+ BumpPtrAllocator &Allocator,
+ SetVector<Function *> *CGSCC, TargetMachine &TM)
+ : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
+ TargetMachine &TM;
+
+ enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };
+
+ /// Check if the subtarget has aperture regs.
+ bool hasApertureRegs(Function &F) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ return ST.hasApertureRegs();
+ }
+
+private:
+ /// Check if the ConstantExpr \p CE requires queue ptr attribute.
+ static bool visitConstExpr(const ConstantExpr *CE) {
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
+ return castRequiresQueuePtr(SrcAS);
+ }
+ return false;
+ }
+
+ /// Get the constant access bitmap for \p C.
+ uint8_t getConstantAccess(const Constant *C) {
+ auto It = ConstantStatus.find(C);
+ if (It != ConstantStatus.end())
+ return It->second;
+
+ uint8_t Result = 0;
+ if (isDSAddress(C))
+ Result = DS_GLOBAL;
+
+ if (const auto *CE = dyn_cast<ConstantExpr>(C))
+ if (visitConstExpr(CE))
+ Result |= ADDR_SPACE_CAST;
+
+ for (const Use &U : C->operands()) {
+ const auto *OpC = dyn_cast<Constant>(U);
+ if (!OpC)
+ continue;
+
+ Result |= getConstantAccess(OpC);
+ }
+ return Result;
+ }
+
+public:
+ /// Returns true if \p Fn needs a queue ptr attribute because of \p C.
+ bool needsQueuePtr(const Constant *C, Function &Fn) {
+ bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
+ bool HasAperture = hasApertureRegs(Fn);
+
+ // No need to explore the constants.
+ if (!IsNonEntryFunc && HasAperture)
+ return false;
+
+ uint8_t Access = getConstantAccess(C);
+
+ // We need to trap on DS globals in non-entry functions.
+ if (IsNonEntryFunc && (Access & DS_GLOBAL))
+ return true;
+
+ return !HasAperture && (Access & ADDR_SPACE_CAST);
+ }
+
+private:
+ /// Used to determine if the Constant needs a queue ptr attribute.
+ DenseMap<const Constant *, uint8_t> ConstantStatus;
+};
+
+struct AAAMDAttributes : public StateWrapper<BooleanState, AbstractAttribute> {
+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
+ AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDAttributes &createForPosition(const IRPosition &IRP,
+ Attributor &A);
+
+ /// See AbstractAttribute::getName().
+ const std::string getName() const override { return "AAAMDAttributes"; }
+
+ /// See AbstractAttribute::getIdAddr().
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is
+ /// AAAMDAttributes.
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ virtual const DenseSet<StringRef> &getAttributes() const = 0;
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+const char AAAMDAttributes::ID = 0;
+
+struct AAAMDWorkGroupSize
+ : public StateWrapper<BooleanState, AbstractAttribute> {
+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
+ AAAMDWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDWorkGroupSize &createForPosition(const IRPosition &IRP,
+ Attributor &A);
+
+ /// See AbstractAttribute::getName().
+ const std::string getName() const override { return "AAAMDWorkGroupSize"; }
+
+ /// See AbstractAttribute::getIdAddr().
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is
+ /// AAAMDWorkGroupSize.
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+const char AAAMDWorkGroupSize::ID = 0;
+
+struct AAAMDWorkGroupSizeFunction : public AAAMDWorkGroupSize {
+ AAAMDWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
+ : AAAMDWorkGroupSize(IRP, A) {}
+
+ void initialize(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ CallingConv::ID CC = F->getCallingConv();
+
+ if (CC != CallingConv::AMDGPU_KERNEL)
+ return;
+
+ bool InitialValue = false;
+ if (F->hasFnAttribute("uniform-work-group-size"))
+ InitialValue = F->getFnAttribute("uniform-work-group-size")
+ .getValueAsString()
+ .equals("true");
+
+ if (InitialValue)
+ indicateOptimisticFixpoint();
+ else
+ indicatePessimisticFixpoint();
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+ auto CheckCallSite = [&](AbstractCallSite CS) {
+ Function *Caller = CS.getInstruction()->getFunction();
+ LLVM_DEBUG(dbgs() << "[AAAMDWorkGroupSize] Call " << Caller->getName()
+ << "->" << getAssociatedFunction()->getName() << "\n");
+
+ const auto &CallerInfo = A.getAAFor<AAAMDWorkGroupSize>(
+ *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+
+ Change = Change | clampStateAndIndicateChange(this->getState(),
+ CallerInfo.getState());
+
+ return true;
+ };
+
+ bool AllCallSitesKnown = true;
+ if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
+ indicatePessimisticFixpoint();
+
+ return Change;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ SmallVector<Attribute, 8> AttrList;
+ LLVMContext &Ctx = getAssociatedFunction()->getContext();
+
+ AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
+ getAssumed() ? "true" : "false"));
+ return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+ /* ForceReplace */ true);
+ }
+
+ bool isValidState() const override {
+ // This state is always valid, even when the state is false.
+ return true;
+ }
+
+ const std::string getAsStr() const override {
+ return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+AAAMDWorkGroupSize &AAAMDWorkGroupSize::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+ return *new (A.Allocator) AAAMDWorkGroupSizeFunction(IRP, A);
+ llvm_unreachable("AAAMDWorkGroupSize is only valid for function position");
+}
+
+struct AAAMDAttributesFunction : public AAAMDAttributes {
+ AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
+ : AAAMDAttributes(IRP, A) {}
+
+ void initialize(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ CallingConv::ID CC = F->getCallingConv();
+ bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
+
+ // Don't add attributes to intrinsics.
+ if (F->isIntrinsic()) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ // Ignore functions with graphics calling conventions, these are currently
+ // not allowed to have kernel arguments.
+ if (AMDGPU::isGraphics(F->getCallingConv())) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ for (StringRef Attr : ImplicitAttrNames) {
+ if (F->hasFnAttribute(Attr))
+ Attributes.insert(Attr);
+ }
+
+ // TODO: We shouldn't need this in the future.
+ if (CallingConvSupportsAllImplicits &&
+ F->hasAddressTaken(nullptr, true, true, true)) {
+ for (StringRef AttrName : ImplicitAttrNames) {
+ Attributes.insert(AttrName);
+ }
+ }
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+ bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
+ CallingConv::ID CC = F->getCallingConv();
+ bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+
+ auto AddAttribute = [&](StringRef AttrName) {
+ if (Attributes.insert(AttrName).second)
+ Change = ChangeStatus::CHANGED;
+ };
+
+ // Check for Intrinsics and propagate attributes.
+ const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
+ *this, this->getIRPosition(), DepClassTy::REQUIRED);
+
+ // We have to assume that we can reach a function with these attributes.
+ // We do not consider inline assembly as an unknown callee.
+ if (CallingConvSupportsAllImplicits && AAEdges.hasNonAsmUnknownCallee()) {
+ for (StringRef AttrName : ImplicitAttrNames) {
+ AddAttribute(AttrName);
+ }
+ }
+
+ bool NeedsQueuePtr = false;
+ bool HasCall = false;
+ for (Function *Callee : AAEdges.getOptimisticEdges()) {
+ Intrinsic::ID IID = Callee->getIntrinsicID();
+ if (IID != Intrinsic::not_intrinsic) {
+ if (!IsNonEntryFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
+ AddAttribute("amdgpu-kernarg-segment-ptr");
+ continue;
+ }
+
+ bool NonKernelOnly = false;
+ StringRef AttrName =
+ intrinsicToAttrName(IID, NonKernelOnly, NeedsQueuePtr);
+
+ if (!AttrName.empty() && (IsNonEntryFunc || !NonKernelOnly))
+ AddAttribute(AttrName);
+
+ continue;
+ }
+
+ HasCall = true;
+ const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
+ *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
+ const DenseSet<StringRef> &CalleeAttributes = AAAMD.getAttributes();
+ // Propagate implicit attributes from called function.
+ for (StringRef AttrName : ImplicitAttrNames)
+ if (CalleeAttributes.count(AttrName))
+ AddAttribute(AttrName);
+ }
+
+ HasCall |= AAEdges.hasUnknownCallee();
+ if (!IsNonEntryFunc && HasCall)
+ AddAttribute("amdgpu-calls");
+
+ // Check the function body.
+ auto CheckAlloca = [&](Instruction &I) {
+ AddAttribute("amdgpu-stack-objects");
+ return false;
+ };
+
+ bool UsedAssumedInformation = false;
+ A.checkForAllInstructions(CheckAlloca, *this, {Instruction::Alloca},
+ UsedAssumedInformation);
+
+ // If we found that we need amdgpu-queue-ptr, nothing else to do.
+ if (NeedsQueuePtr || Attributes.count("amdgpu-queue-ptr")) {
+ AddAttribute("amdgpu-queue-ptr");
+ return Change;
+ }
+
+ auto CheckAddrSpaceCasts = [&](Instruction &I) {
+ unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
+ if (castRequiresQueuePtr(SrcAS)) {
+ NeedsQueuePtr = true;
+ return false;
+ }
+ return true;
+ };
+
+ bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
+
+ // `checkForAllInstructions` is much cheaper than going through all
+ // instructions, so try it first.
+
+ // amdgpu-queue-ptr is not needed if aperture regs are present.
+ if (!HasApertureRegs)
+ A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
+ {Instruction::AddrSpaceCast},
+ UsedAssumedInformation);
+
+ // If we found that we need amdgpu-queue-ptr, nothing else to do.
+ if (NeedsQueuePtr) {
+ AddAttribute("amdgpu-queue-ptr");
+ return Change;
+ }
+
+ if (!IsNonEntryFunc && HasApertureRegs)
+ return Change;
+
+ for (BasicBlock &BB : *F) {
+ for (Instruction &I : BB) {
+ for (const Use &U : I.operands()) {
+ if (const auto *C = dyn_cast<Constant>(U)) {
+ if (InfoCache.needsQueuePtr(C, *F)) {
+ AddAttribute("amdgpu-queue-ptr");
+ return Change;
+ }
+ }
+ }
+ }
+ }
+
+ return Change;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ SmallVector<Attribute, 8> AttrList;
+ LLVMContext &Ctx = getAssociatedFunction()->getContext();
+
+ for (StringRef AttrName : Attributes)
+ AttrList.push_back(Attribute::get(Ctx, AttrName));
+
+ return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+ /* ForceReplace */ true);
+ }
+
+ const std::string getAsStr() const override {
+ return "AMDInfo[" + std::to_string(Attributes.size()) + "]";
+ }
+
+ const DenseSet<StringRef> &getAttributes() const override {
+ return Attributes;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+
+private:
+ DenseSet<StringRef> Attributes;
+};
+
+AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+ return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
+ llvm_unreachable("AAAMDAttributes is only valid for function position");
+}
+
+class AMDGPUAttributor : public ModulePass {
+public:
+ AMDGPUAttributor() : ModulePass(ID) {}
+
+ /// doInitialization - Virtual method overridden by subclasses to do
+ /// any necessary initialization before any pass is run.
+ bool doInitialization(Module &) override {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ report_fatal_error("TargetMachine is required");
+
+ TM = &TPC->getTM<TargetMachine>();
+ return false;
+ }
+
+ bool runOnModule(Module &M) override {
+ SetVector<Function *> Functions;
+ AnalysisGetter AG;
+ for (Function &F : M)
+ Functions.insert(&F);
+
+ CallGraphUpdater CGUpdater;
+ BumpPtrAllocator Allocator;
+ AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
+ Attributor A(Functions, InfoCache, CGUpdater);
+
+ for (Function &F : M) {
+ A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
+ A.getOrCreateAAFor<AAAMDWorkGroupSize>(IRPosition::function(F));
+ }
+
+ ChangeStatus Change = A.run();
+ return Change == ChangeStatus::CHANGED;
+ }
+
+ StringRef getPassName() const override { return "AMDGPU Attributor"; }
+ TargetMachine *TM;
+ static char ID;
+};
+
+char AMDGPUAttributor::ID = 0;
+
+Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
+INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false)
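The pass above leans on the generic Attributor framework; conceptually, implicit-argument attributes such as amdgpu-queue-ptr are seeded where an intrinsic or address-space cast requires them and then propagated through the call graph until a fixed point is reached. The standalone sketch below models that propagation with a plain repeated sweep; the call graph and the seeding are invented for the example, and the real pass additionally handles unknown callees, aperture registers, and uniform-work-group-size.

// attribute_fixpoint_model.cpp -- editor's simplified model of the idea behind
// the pass above: attributes flow from callees to callers to a fixed point.
#include <cassert>
#include <map>
#include <set>
#include <string>
#include <vector>

using Attrs = std::set<std::string>;

int main() {
  // caller -> callees (a tiny made-up module).
  std::map<std::string, std::vector<std::string>> Calls = {
      {"kernel", {"helper"}}, {"helper", {"leaf"}}, {"leaf", {}}};
  // Seed: only the leaf directly uses the queue pointer.
  std::map<std::string, Attrs> Attributes = {
      {"kernel", {}}, {"helper", {}}, {"leaf", {"amdgpu-queue-ptr"}}};

  // Iterate to a fixed point, merging callee attributes into callers (the
  // Attributor does this with dependency tracking instead of full sweeps).
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (auto &[Caller, Callees] : Calls)
      for (const std::string &Callee : Callees)
        for (const std::string &A : Attributes[Callee])
          Changed |= Attributes[Caller].insert(A).second;
  }

  assert(Attributes["kernel"].count("amdgpu-queue-ptr"));
  assert(Attributes["helper"].count("amdgpu-queue-ptr"));
  return 0;
}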
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 852a05b..b9faad4 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -29,44 +29,39 @@
namespace {
-struct AMDGPUValueHandler : public CallLowering::ValueHandler {
- AMDGPUValueHandler(bool IsIncoming, MachineIRBuilder &B,
- MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
- : ValueHandler(IsIncoming, B, MRI, AssignFn) {}
-
- /// Wrapper around extendRegister to ensure we extend to a full 32-bit
- /// register.
- Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
- if (VA.getLocVT().getSizeInBits() < 32) {
- // 16-bit types are reported as legal for 32-bit registers. We need to
- // extend and do a 32-bit copy to avoid the verifier complaining about it.
- return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
- }
-
- return extendRegister(ValVReg, VA);
+/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
+static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
+ Register ValVReg, CCValAssign &VA) {
+ if (VA.getLocVT().getSizeInBits() < 32) {
+ // 16-bit types are reported as legal for 32-bit registers. We need to
+ // extend and do a 32-bit copy to avoid the verifier complaining about it.
+ return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
}
-};
-struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler {
+ return Handler.extendRegister(ValVReg, VA);
+}
+
+struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB, CCAssignFn *AssignFn)
- : AMDGPUValueHandler(false, B, MRI, AssignFn), MIB(MIB) {}
+ MachineInstrBuilder MIB)
+ : OutgoingValueHandler(B, MRI), MIB(MIB) {}
MachineInstrBuilder MIB;
Register getStackAddress(uint64_t Size, int64_t Offset,
- MachinePointerInfo &MPO) override {
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
llvm_unreachable("not implemented");
}
- void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override {
llvm_unreachable("not implemented");
}
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
- Register ExtReg = extendRegisterMin32(ValVReg, VA);
+ Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
// If this is a scalar return, insert a readfirstlane just in case the value
// ends up in a VGPR.
@@ -83,27 +78,23 @@
MIRBuilder.buildCopy(PhysReg, ExtReg);
MIB.addUse(PhysReg, RegState::Implicit);
}
-
- bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info,
- ISD::ArgFlagsTy Flags,
- CCState &State) override {
- return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
- }
};
-struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler {
+struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
uint64_t StackUsed = 0;
- AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- CCAssignFn *AssignFn)
- : AMDGPUValueHandler(true, B, MRI, AssignFn) {}
+ AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
+ : IncomingValueHandler(B, MRI) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
- MachinePointerInfo &MPO) override {
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
auto &MFI = MIRBuilder.getMF().getFrameInfo();
- int FI = MFI.CreateFixedObject(Size, Offset, true);
+
+ // Byval is assumed to be writable memory, but other stack passed arguments
+ // are not.
+ const bool IsImmutable = !Flags.isByVal();
+ int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
auto AddrReg = MIRBuilder.buildFrameIndex(
LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
@@ -119,35 +110,24 @@
// 16-bit types are reported as legal for 32-bit registers. We need to do
// a 32-bit copy, and truncate to avoid the verifier complaining about it.
auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
- MIRBuilder.buildTrunc(ValVReg, Copy);
+
+ // If we have signext/zeroext, it applies to the whole 32-bit register
+ // before truncation.
+ auto Extended =
+ buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
+ MIRBuilder.buildTrunc(ValVReg, Extended);
return;
}
- switch (VA.getLocInfo()) {
- case CCValAssign::LocInfo::SExt:
- case CCValAssign::LocInfo::ZExt:
- case CCValAssign::LocInfo::AExt: {
- auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
- MIRBuilder.buildTrunc(ValVReg, Copy);
- break;
- }
- default:
- MIRBuilder.buildCopy(ValVReg, PhysReg);
- break;
- }
+ IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
}
- void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
- // The reported memory location may be wider than the value.
- const LLT RegTy = MRI.getType(ValVReg);
- MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);
-
- // FIXME: Get alignment
auto MMO = MF.getMachineMemOperand(
- MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize,
+ MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemTy,
inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
@@ -159,9 +139,8 @@
};
struct FormalArgHandler : public AMDGPUIncomingArgHandler {
- FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- CCAssignFn *AssignFn)
- : AMDGPUIncomingArgHandler(B, MRI, AssignFn) {}
+ FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
+ : AMDGPUIncomingArgHandler(B, MRI) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIRBuilder.getMBB().addLiveIn(PhysReg);
@@ -170,8 +149,8 @@
struct CallReturnHandler : public AMDGPUIncomingArgHandler {
CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB, CCAssignFn *AssignFn)
- : AMDGPUIncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+ MachineInstrBuilder MIB)
+ : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIB.addDef(PhysReg, RegState::Implicit);
@@ -180,10 +159,7 @@
MachineInstrBuilder MIB;
};
-struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler {
- MachineInstrBuilder MIB;
- CCAssignFn *AssignFnVarArg;
-
+struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
/// For tail calls, the byte offset of the call's argument area from the
/// callee's. Unused elsewhere.
int FPDiff;
@@ -195,20 +171,23 @@
AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
- CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg,
bool IsTailCall = false, int FPDiff = 0)
- : AMDGPUValueHandler(false, MIRBuilder, MRI, AssignFn), MIB(MIB),
- AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff), IsTailCall(IsTailCall) {
- }
+ : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff),
+ IsTailCall(IsTailCall) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
- MachinePointerInfo &MPO) override {
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
MachineFunction &MF = MIRBuilder.getMF();
const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
const LLT S32 = LLT::scalar(32);
if (IsTailCall) {
- llvm_unreachable("implement me");
+ Offset += FPDiff;
+ int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
+ auto FIReg = MIRBuilder.buildFrameIndex(PtrTy, FI);
+ MPO = MachinePointerInfo::getFixedStack(MF, FI);
+ return FIReg.getReg(0);
}
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -226,35 +205,29 @@
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
- Register ExtReg = extendRegisterMin32(ValVReg, VA);
+ Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
- void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
uint64_t LocMemOffset = VA.getLocMemOffset();
const auto &ST = MF.getSubtarget<GCNSubtarget>();
auto MMO = MF.getMachineMemOperand(
- MPO, MachineMemOperand::MOStore, Size,
- commonAlignment(ST.getStackAlignment(), LocMemOffset));
+ MPO, MachineMemOperand::MOStore, MemTy,
+ commonAlignment(ST.getStackAlignment(), LocMemOffset));
MIRBuilder.buildStore(ValVReg, Addr, *MMO);
}
- void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
- uint64_t MemSize, MachinePointerInfo &MPO,
- CCValAssign &VA) override {
+ void assignValueToAddress(const CallLowering::ArgInfo &Arg,
+ unsigned ValRegIndex, Register Addr, LLT MemTy,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
- ? extendRegister(Arg.Regs[0], VA)
- : Arg.Regs[0];
-
- // If we extended the value type we might need to adjust the MMO's
- // Size. This happens if ComputeValueVTs widened a small type value to a
- // legal register type (e.g. s8->s16)
- const LLT RegTy = MRI.getType(ValVReg);
- MemSize = std::min(MemSize, (uint64_t)RegTy.getSizeInBytes());
- assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA);
+ ? extendRegister(Arg.Regs[ValRegIndex], VA)
+ : Arg.Regs[ValRegIndex];
+ assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
}
};
}
@@ -277,149 +250,6 @@
}
}
-// FIXME: This should move to generic code.
-void AMDGPUCallLowering::splitToValueTypes(MachineIRBuilder &B,
- const ArgInfo &OrigArg,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL,
- CallingConv::ID CallConv) const {
- const SITargetLowering &TLI = *getTLI<SITargetLowering>();
- LLVMContext &Ctx = OrigArg.Ty->getContext();
-
- SmallVector<EVT, 4> SplitVTs;
- ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);
-
- assert(OrigArg.Regs.size() == SplitVTs.size());
-
- if (SplitVTs.size() == 0)
- return;
-
- if (SplitVTs.size() == 1) {
- // No splitting to do, but we want to replace the original type (e.g. [1 x
- // double] -> double).
- SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
- OrigArg.Flags[0], OrigArg.IsFixed);
- return;
- }
-
- // Create one ArgInfo for each virtual register in the original ArgInfo.
- assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch");
-
- bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
- OrigArg.Ty, CallConv, false);
- for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
- Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
- SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0],
- OrigArg.IsFixed);
- if (NeedsRegBlock)
- SplitArgs.back().Flags[0].setInConsecutiveRegs();
- }
-
- SplitArgs.back().Flags[0].setInConsecutiveRegsLast();
-}
-
-void AMDGPUCallLowering::processSplitArgs(
- MachineIRBuilder &B, const ArgInfo &OrigArg,
- const SmallVectorImpl<ArgInfo> &SplitArg,
- SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL,
- CallingConv::ID CallConv, bool IsOutgoing,
- SplitArgTy PerformArgSplit) const {
- LLVMContext &Ctx = OrigArg.Ty->getContext();
- const SITargetLowering &TLI = *getTLI<SITargetLowering>();
-
- // FIXME: This is mostly nasty pre-processing before handleAssignments. Most
- // of this should be performed by handleAssignments.
-
- for (int SplitIdx = 0, e = SplitArg.size(); SplitIdx != e; ++SplitIdx) {
- const ArgInfo &CurSplitArg = SplitArg[SplitIdx];
- Register Reg = OrigArg.Regs[SplitIdx];
- EVT VT = EVT::getEVT(CurSplitArg.Ty);
- LLT LLTy = getLLTForType(*CurSplitArg.Ty, DL);
-
- unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
- MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
-
- if (NumParts == 1) {
- // No splitting to do, but we want to replace the original type (e.g. [1 x
- // double] -> double).
- SplitArgs.emplace_back(Reg, CurSplitArg.Ty, OrigArg.Flags,
- OrigArg.IsFixed);
- continue;
- }
-
- SmallVector<Register, 8> SplitRegs;
- Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
- LLT PartLLT = getLLTForType(*PartTy, DL);
- MachineRegisterInfo &MRI = *B.getMRI();
-
- // FIXME: Should we be reporting all of the part registers for a single
- // argument, and let handleAssignments take care of the repacking?
- for (unsigned i = 0; i < NumParts; ++i) {
- Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
- SplitRegs.push_back(PartReg);
- SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
- }
-
- PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);
- }
-}
-
-// TODO: Move to generic code
-static void unpackRegsToOrigType(MachineIRBuilder &B,
- ArrayRef<Register> DstRegs,
- Register SrcReg,
- const CallLowering::ArgInfo &Info,
- LLT SrcTy,
- LLT PartTy) {
- assert(DstRegs.size() > 1 && "Nothing to unpack");
-
- const unsigned PartSize = PartTy.getSizeInBits();
-
- if (SrcTy.isVector() && !PartTy.isVector() &&
- PartSize > SrcTy.getElementType().getSizeInBits()) {
- // Vector was scalarized, and the elements extended.
- auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
- for (int i = 0, e = DstRegs.size(); i != e; ++i)
- B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
- return;
- }
-
- LLT GCDTy = getGCDType(SrcTy, PartTy);
- if (GCDTy == PartTy) {
- // If this already evenly divisible, we can create a simple unmerge.
- B.buildUnmerge(DstRegs, SrcReg);
- return;
- }
-
- MachineRegisterInfo &MRI = *B.getMRI();
- LLT DstTy = MRI.getType(DstRegs[0]);
- LLT LCMTy = getLCMType(SrcTy, PartTy);
-
- const unsigned LCMSize = LCMTy.getSizeInBits();
- const unsigned DstSize = DstTy.getSizeInBits();
- const unsigned SrcSize = SrcTy.getSizeInBits();
-
- Register UnmergeSrc = SrcReg;
- if (LCMSize != SrcSize) {
- // Widen to the common type.
- Register Undef = B.buildUndef(SrcTy).getReg(0);
- SmallVector<Register, 8> MergeParts(1, SrcReg);
- for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize)
- MergeParts.push_back(Undef);
-
- UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0);
- }
-
- // Unmerge to the original registers and pad with dead defs.
- SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end());
- for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize;
- Size += DstSize) {
- UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy));
- }
-
- B.buildUnmerge(UnmergeResults, UnmergeSrc);
-}
-
bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
CallingConv::ID CallConv,
SmallVectorImpl<BaseArgInfo> &Outs,
@@ -458,18 +288,12 @@
assert(VRegs.size() == SplitEVTs.size() &&
"For each split Type there should be exactly one VReg.");
- // We pre-process the return value decomposed into EVTs.
- SmallVector<ArgInfo, 8> PreSplitRetInfos;
-
- // Further processing is applied to split the arguments from PreSplitRetInfos
- // into 32-bit pieces in SplitRetInfos before passing off to
- // handleAssignments.
SmallVector<ArgInfo, 8> SplitRetInfos;
for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
EVT VT = SplitEVTs[i];
Register Reg = VRegs[i];
- ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx));
+ ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx), 0);
setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
if (VT.isScalarInteger()) {
@@ -497,23 +321,15 @@
setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
}
- splitToValueTypes(B, RetInfo, PreSplitRetInfos, DL, CC);
-
- // FIXME: This splitting should mostly be done by handleAssignments
- processSplitArgs(B, RetInfo,
- PreSplitRetInfos, SplitRetInfos, DL, CC, true,
- [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy,
- LLT PartLLT, int VTSplitIdx) {
- unpackRegsToOrigType(B, Regs, SrcReg,
- PreSplitRetInfos[VTSplitIdx], LLTy,
- PartLLT);
- });
- PreSplitRetInfos.clear();
+ splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
}
CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
- AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
- return handleAssignments(B, SplitRetInfos, RetHandler);
+
+ OutgoingValueAssigner Assigner(AssignFn);
+ AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
+ return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
+ CC, F.isVarArg());
}
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
@@ -568,7 +384,6 @@
}
void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
- Type *ParamTy,
uint64_t Offset) const {
MachineFunction &MF = B.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -582,26 +397,45 @@
B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}
-void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
- uint64_t Offset, Align Alignment,
- Register DstReg) const {
+void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
+ uint64_t Offset,
+ Align Alignment) const {
MachineFunction &MF = B.getMF();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
- unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
- Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
- lowerParameterPtr(PtrReg, B, ParamTy, Offset);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo,
- MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- TypeSize, Alignment);
+ SmallVector<ArgInfo, 32> SplitArgs;
+ SmallVector<uint64_t> FieldOffsets;
+ splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv(), &FieldOffsets);
- B.buildLoad(DstReg, PtrReg, *MMO);
+ unsigned Idx = 0;
+ for (ArgInfo &SplitArg : SplitArgs) {
+ Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
+ lowerParameterPtr(PtrReg, B, Offset + FieldOffsets[Idx]);
+
+ LLT ArgTy = getLLTForType(*SplitArg.Ty, DL);
+ if (SplitArg.Flags[0].isPointer()) {
+ // Compensate for losing pointeriness in splitValueTypes.
+ LLT PtrTy = LLT::pointer(SplitArg.Flags[0].getPointerAddrSpace(),
+ ArgTy.getScalarSizeInBits());
+ ArgTy = ArgTy.isVector() ? LLT::vector(ArgTy.getElementCount(), PtrTy)
+ : PtrTy;
+ }
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ ArgTy, commonAlignment(Alignment, FieldOffsets[Idx]));
+
+ assert(SplitArg.Regs.size() == 1);
+
+ B.buildLoad(SplitArg.Regs[0], PtrReg, *MMO);
+ ++Idx;
+ }
}
// Allocate special inputs passed in user SGPRs.
@@ -665,9 +499,10 @@
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
-
const DataLayout &DL = F.getParent()->getDataLayout();
+ Info->allocateModuleLDSGlobal(F.getParent());
+
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -706,24 +541,19 @@
assert(VRegs[i].size() == 1 &&
"expected only one register for byval pointers");
if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
- lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
+ lowerParameterPtr(VRegs[i][0], B, ArgOffset);
} else {
const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
- lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);
+ lowerParameterPtr(PtrReg, B, ArgOffset);
B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
}
} else {
- ArrayRef<Register> OrigArgRegs = VRegs[i];
- Register ArgReg =
- OrigArgRegs.size() == 1
- ? OrigArgRegs[0]
- : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
-
- lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
- if (OrigArgRegs.size() > 1)
- unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
+ ArgInfo OrigArg(VRegs[i], Arg, i);
+ const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
+ setArgFlags(OrigArg, OrigArgIdx, DL, F);
+ lowerParameter(B, OrigArg, ArgOffset, Alignment);
}
++i;
@@ -734,117 +564,6 @@
return true;
}
-/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
-static MachineInstrBuilder mergeVectorRegsToResultRegs(
- MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
- MachineRegisterInfo &MRI = *B.getMRI();
- LLT LLTy = MRI.getType(DstRegs[0]);
- LLT PartLLT = MRI.getType(SrcRegs[0]);
-
- // Deal with v3s16 split into v2s16
- LLT LCMTy = getLCMType(LLTy, PartLLT);
- if (LCMTy == LLTy) {
- // Common case where no padding is needed.
- assert(DstRegs.size() == 1);
- return B.buildConcatVectors(DstRegs[0], SrcRegs);
- }
-
- const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
- Register Undef = B.buildUndef(PartLLT).getReg(0);
-
- // Build vector of undefs.
- SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);
-
- // Replace the first sources with the real registers.
- std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
-
- auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
- int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();
-
- SmallVector<Register, 8> PadDstRegs(NumDst);
- std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());
-
- // Create the excess dead defs for the unmerge.
- for (int I = DstRegs.size(); I != NumDst; ++I)
- PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);
-
- return B.buildUnmerge(PadDstRegs, Widened);
-}
-
-// TODO: Move this to generic code
-static void packSplitRegsToOrigType(MachineIRBuilder &B,
- ArrayRef<Register> OrigRegs,
- ArrayRef<Register> Regs,
- LLT LLTy,
- LLT PartLLT) {
- MachineRegisterInfo &MRI = *B.getMRI();
-
- if (!LLTy.isVector() && !PartLLT.isVector()) {
- assert(OrigRegs.size() == 1);
- LLT OrigTy = MRI.getType(OrigRegs[0]);
-
- unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
- if (SrcSize == OrigTy.getSizeInBits())
- B.buildMerge(OrigRegs[0], Regs);
- else {
- auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
- B.buildTrunc(OrigRegs[0], Widened);
- }
-
- return;
- }
-
- if (LLTy.isVector() && PartLLT.isVector()) {
- assert(OrigRegs.size() == 1);
- assert(LLTy.getElementType() == PartLLT.getElementType());
- mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
- return;
- }
-
- assert(LLTy.isVector() && !PartLLT.isVector());
-
- LLT DstEltTy = LLTy.getElementType();
-
- // Pointer information was discarded. We'll need to coerce some register types
- // to avoid violating type constraints.
- LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();
-
- assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());
-
- if (DstEltTy == PartLLT) {
- // Vector was trivially scalarized.
-
- if (RealDstEltTy.isPointer()) {
- for (Register Reg : Regs)
- MRI.setType(Reg, RealDstEltTy);
- }
-
- B.buildBuildVector(OrigRegs[0], Regs);
- } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
- // Deal with vector with 64-bit elements decomposed to 32-bit
- // registers. Need to create intermediate 64-bit elements.
- SmallVector<Register, 8> EltMerges;
- int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();
-
- assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);
-
- for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
- auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
- // Fix the type in case this is really a vector of pointers.
- MRI.setType(Merge.getReg(0), RealDstEltTy);
- EltMerges.push_back(Merge.getReg(0));
- Regs = Regs.drop_front(PartsPerElt);
- }
-
- B.buildBuildVector(OrigRegs[0], EltMerges);
- } else {
- // Vector was split, and elements promoted to a wider type.
- LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
- auto BV = B.buildBuildVector(BVType, Regs);
- B.buildTrunc(OrigRegs[0], BV);
- }
-}
-
bool AMDGPUCallLowering::lowerFormalArguments(
MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
FunctionLoweringInfo &FLI) const {
@@ -867,6 +586,7 @@
const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();
+ Info->allocateModuleLDSGlobal(F.getParent());
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -885,7 +605,6 @@
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
- SmallVector<ArgInfo, 8> SplitArg;
SmallVector<ArgInfo, 32> SplitArgs;
unsigned Idx = 0;
unsigned PSInputNum = 0;
@@ -931,23 +650,11 @@
}
}
- ArgInfo OrigArg(VRegs[Idx], Arg.getType());
+ ArgInfo OrigArg(VRegs[Idx], Arg, Idx);
const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
setArgFlags(OrigArg, OrigArgIdx, DL, F);
- SplitArg.clear();
- splitToValueTypes(B, OrigArg, SplitArg, DL, CC);
-
- processSplitArgs(B, OrigArg, SplitArg, SplitArgs, DL, CC, false,
- // FIXME: We should probably be passing multiple registers
- // to handleAssignments to do this
- [&](ArrayRef<Register> Regs, Register DstReg, LLT LLTy,
- LLT PartLLT, int VTSplitIdx) {
- assert(DstReg == VRegs[Idx][VTSplitIdx]);
- packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
- LLTy, PartLLT);
- });
-
+ splitToValueTypes(OrigArg, SplitArgs, DL, CC);
++Idx;
}
@@ -1004,10 +711,16 @@
TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}
- FormalArgHandler Handler(B, MRI, AssignFn);
- if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
+ IncomingValueAssigner Assigner(AssignFn);
+ if (!determineAssignments(Assigner, SplitArgs, CCInfo))
return false;
+ FormalArgHandler Handler(B, MRI);
+ if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
+ return false;
+
+ uint64_t StackOffset = Assigner.StackOffset;
+
if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
// Special inputs come after user arguments.
TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
@@ -1022,6 +735,12 @@
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
+ // When we tail call, we need to check if the callee's arguments will fit on
+ // the caller's stack. So, whenever we lower formal arguments, we should keep
+ // track of this information, since we might lower a tail call in this
+ // function later.
+ Info->setBytesInStackArgArea(StackOffset);
+
// Move back to the end of the basic block.
B.setMBB(MBB);
@@ -1184,7 +903,7 @@
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
bool IsTailCall) {
- return AMDGPU::SI_CALL;
+ return IsTailCall ? AMDGPU::SI_TCRETURN : AMDGPU::SI_CALL;
}
// Add operands to call instruction to track the callee.
@@ -1208,6 +927,317 @@
return true;
}
+bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
+ CallLoweringInfo &Info, MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &InArgs) const {
+ const Function &CallerF = MF.getFunction();
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+
+ // If the calling conventions match, then everything must be the same.
+ if (CalleeCC == CallerCC)
+ return true;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+ // Make sure that the caller and callee preserve all of the same registers.
+ auto TRI = ST.getRegisterInfo();
+
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+ return false;
+
+ // Check if the caller and callee will handle arguments in the same way.
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+ CCAssignFn *CalleeAssignFnFixed;
+ CCAssignFn *CalleeAssignFnVarArg;
+ std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
+ getAssignFnsForCC(CalleeCC, TLI);
+
+ CCAssignFn *CallerAssignFnFixed;
+ CCAssignFn *CallerAssignFnVarArg;
+ std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
+ getAssignFnsForCC(CallerCC, TLI);
+
+ // FIXME: We are not accounting for potential differences in implicitly passed
+ // inputs, but only the fixed ABI is supported now anyway.
+ IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
+ CalleeAssignFnVarArg);
+ IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
+ CallerAssignFnVarArg);
+ return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
+}
+
+bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
+ CallLoweringInfo &Info, MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &OutArgs) const {
+ // If there are no outgoing arguments, then we are done.
+ if (OutArgs.empty())
+ return true;
+
+ const Function &CallerF = MF.getFunction();
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+
+ CCAssignFn *AssignFnFixed;
+ CCAssignFn *AssignFnVarArg;
+ std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
+
+ // We have outgoing arguments. Make sure that we can tail call with them.
+ SmallVector<CCValAssign, 16> OutLocs;
+ CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
+ OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
+
+ if (!determineAssignments(Assigner, OutArgs, OutInfo)) {
+ LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
+ return false;
+ }
+
+ // Make sure that they can fit on the caller's stack.
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) {
+ LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
+ return false;
+ }
+
+ // Verify that the parameters in callee-saved registers match.
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs);
+}
+
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+ return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::AMDGPU_Gfx:
+ return true;
+ default:
+ return canGuaranteeTCO(CC);
+ }
+}
+
+bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
+ MachineIRBuilder &B, CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
+ // Must pass all target-independent checks in order to tail call optimize.
+ if (!Info.IsTailCall)
+ return false;
+
+ MachineFunction &MF = B.getMF();
+ const Function &CallerF = MF.getFunction();
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ // Kernels aren't callable, and don't have a live-in return address, so it
+ // doesn't make sense to do a tail call with entry functions.
+ if (!CallerPreserved)
+ return false;
+
+ if (!mayTailCallThisCC(CalleeCC)) {
+ LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
+ return false;
+ }
+
+ if (any_of(CallerF.args(), [](const Argument &A) {
+ return A.hasByValAttr() || A.hasSwiftErrorAttr();
+ })) {
+ LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
+ "or swifterror arguments\n");
+ return false;
+ }
+
+ // If we have -tailcallopt, then we're done.
+ if (MF.getTarget().Options.GuaranteedTailCallOpt)
+ return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();
+
+ // Verify that the incoming and outgoing arguments from the callee are
+ // safe to tail call.
+ if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "... Caller and callee have incompatible calling conventions.\n");
+ return false;
+ }
+
+ if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
+ return true;
+}
+
+// Insert outgoing implicit arguments for a call, by inserting copies to the
+// implicit argument registers and adding the necessary implicit uses to the
+// call instruction.
+void AMDGPUCallLowering::handleImplicitCallArguments(
+ MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
+ const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
+ ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
+ if (!ST.enableFlatScratch()) {
+ // Insert copies for the SRD. In the HSA case, this should be an identity
+ // copy.
+ auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
+ FuncInfo.getScratchRSrcReg());
+ MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+ CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+ }
+
+ for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
+ MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
+ CallInst.addReg(ArgReg.first, RegState::Implicit);
+ }
+}
+
+bool AMDGPUCallLowering::lowerTailCall(
+ MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &OutArgs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ const Function &F = MF.getFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+
+ // True when we're tail calling, but without -tailcallopt.
+ bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
+
+ // Find out which ABI gets to decide where things go.
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CCAssignFn *AssignFnFixed;
+ CCAssignFn *AssignFnVarArg;
+ std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
+
+ MachineInstrBuilder CallSeqStart;
+ if (!IsSibCall)
+ CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
+
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true);
+ auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+ if (!addCallTargetOperands(MIB, MIRBuilder, Info))
+ return false;
+
+ // Byte offset for the tail call. When we are sibcalling, this will always
+ // be 0.
+ MIB.addImm(0);
+
+ // Tell the call which registers are clobbered.
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
+ MIB.addRegMask(Mask);
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0.
+ int FPDiff = 0;
+
+ // This will be 0 for sibcalls, potentially nonzero for tail calls produced
+ // by -tailcallopt. For sibcalls, the memory operands for the call are
+ // already available in the caller's incoming argument space.
+ unsigned NumBytes = 0;
+ if (!IsSibCall) {
+ // We aren't sibcalling, so we need to compute FPDiff. We need to do this
+ // before handling assignments, because FPDiff must be known for memory
+ // arguments.
+ unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+ SmallVector<CCValAssign, 16> OutLocs;
+ CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
+
+ // FIXME: Not accounting for callee implicit inputs
+ OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
+ if (!determineAssignments(CalleeAssigner, OutArgs, OutInfo))
+ return false;
+
+ // The callee will pop the argument stack as a tail call. Thus, we must
+ // keep it 16-byte aligned.
+ NumBytes = alignTo(OutInfo.getNextStackOffset(), ST.getStackAlignment());
+
+ // FPDiff will be negative if this tail call requires more space than we
+ // would automatically have in our incoming argument space. Positive if we
+ // actually shrink the stack.
+ FPDiff = NumReusableBytes - NumBytes;
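+ // For example, with 32 reusable bytes of incoming argument space and a
+ // callee needing 48 bytes of stack arguments, FPDiff is 32 - 48 = -16,
+ // i.e. the tail call needs 16 more bytes than the caller already has.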
+
+ // The stack pointer must be 16-byte aligned at all times it's used for a
+ // memory operation, which in practice means at *all* times and in
+ // particular across call boundaries. Therefore our own arguments started at
+ // a 16-byte aligned SP and the delta applied for the tail call should
+ // satisfy the same constraint.
+ assert(isAligned(ST.getStackAlignment(), FPDiff) &&
+ "unaligned stack on tail call");
+ }
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
+
+ // We could pass MIB and directly add the implicit uses to the call
+ // now. However, as an aesthetic choice, place implicit argument operands
+ // after the ordinary user argument registers.
+ SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
+
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ Info.CallConv != CallingConv::AMDGPU_Gfx) {
+ // With a fixed ABI, allocate fixed registers before user arguments.
+ if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
+ return false;
+ }
+
+ OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
+
+ if (!determineAssignments(Assigner, OutArgs, CCInfo))
+ return false;
+
+ // Do the actual argument marshalling.
+ AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
+ if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
+ return false;
+
+ handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
+
+ // If we have -tailcallopt, we need to adjust the stack. We'll do the call
+ // sequence start and end here.
+ if (!IsSibCall) {
+ MIB->getOperand(1).setImm(FPDiff);
+ CallSeqStart.addImm(NumBytes).addImm(0);
+ // End the call sequence *before* emitting the call. Normally, we would
+ // tidy the frame up after the call. However, here, we've laid out the
+ // parameters so that when SP is reset, they will be in the correct
+ // location.
+ MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
+ }
+
+ // Now we can add the actual call instruction to the correct basic block.
+ MIRBuilder.insertInstr(MIB);
+
+ // If the callee is a register, it is used by a target-specific
+ // instruction and must therefore have a register class matching the
+ // constraint of that instruction.
+
+ // FIXME: We should define regbankselectable call instructions to handle
+ // divergent call targets.
+ if (MIB->getOperand(0).isReg()) {
+ MIB->getOperand(0).setReg(constrainOperandRegClass(
+ MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
+ MIB->getDesc(), MIB->getOperand(0), 0));
+ }
+
+ MF.getFrameInfo().setHasTailCall();
+ Info.LoweredTailCall = true;
+ return true;
+}
+
bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
if (Info.IsVarArg) {
@@ -1223,39 +1253,24 @@
MachineRegisterInfo &MRI = MF.getRegInfo();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getParent()->getDataLayout();
- CallingConv::ID CallConv = F.getCallingConv();
if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
- CallConv != CallingConv::AMDGPU_Gfx) {
+ Info.CallConv != CallingConv::AMDGPU_Gfx) {
LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
return false;
}
- if (AMDGPU::isShader(CallConv)) {
- LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
- return false;
- }
-
SmallVector<ArgInfo, 8> OutArgs;
+ for (auto &OrigArg : Info.OrigArgs)
+ splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);
- SmallVector<ArgInfo, 8> SplitArg;
- for (auto &OrigArg : Info.OrigArgs) {
- splitToValueTypes(MIRBuilder, OrigArg, SplitArg, DL, Info.CallConv);
-
- processSplitArgs(
- MIRBuilder, OrigArg, SplitArg, OutArgs, DL, Info.CallConv, true,
- // FIXME: We should probably be passing multiple registers to
- // handleAssignments to do this
- [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
- int VTSplitIdx) {
- unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy, PartLLT);
- });
-
- SplitArg.clear();
- }
+ SmallVector<ArgInfo, 8> InArgs;
+ if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
+ splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);
// If we can lower as a tail call, do that instead.
- bool CanTailCallOpt = false;
+ bool CanTailCallOpt =
+ isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);
// We must emit a tail call if we have musttail.
if (Info.IsMustTailCall && !CanTailCallOpt) {
@@ -1263,6 +1278,9 @@
return false;
}
+ if (CanTailCallOpt)
+ return lowerTailCall(MIRBuilder, Info, OutArgs);
+
// Find out which ABI gets to decide where things go.
CCAssignFn *AssignFnFixed;
CCAssignFn *AssignFnVarArg;
@@ -1295,7 +1313,8 @@
// after the ordinary user argument registers.
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
- if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ Info.CallConv != CallingConv::AMDGPU_Gfx) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
return false;
@@ -1303,26 +1322,18 @@
// Do the actual argument marshalling.
SmallVector<Register, 8> PhysRegs;
- AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
- AssignFnVarArg, false);
- if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
+
+ OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
+ if (!determineAssignments(Assigner, OutArgs, CCInfo))
+ return false;
+
+ AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
+ if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
return false;
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- if (!ST.enableFlatScratch()) {
- // Insert copies for the SRD. In the HSA case, this should be an identity
- // copy.
- auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
- MFI->getScratchRSrcReg());
- MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
- MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
- }
-
- for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
- MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
- MIB.addReg(ArgReg.first, RegState::Implicit);
- }
+ handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -1340,55 +1351,32 @@
1));
}
- auto OrigInsertPt = MIRBuilder.getInsertPt();
-
// Now we can add the actual call instruction to the correct position.
MIRBuilder.insertInstr(MIB);
- // Insert this now to give us an anchor point for managing the insert point.
- MachineInstrBuilder CallSeqEnd =
- MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN);
-
- SmallVector<ArgInfo, 8> InArgs;
- if (!Info.CanLowerReturn) {
- insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
- Info.DemoteRegister, Info.DemoteStackIndex);
- } else if (!Info.OrigRet.Ty->isVoidTy()) {
- SmallVector<ArgInfo, 8> PreSplitRetInfos;
-
- splitToValueTypes(
- MIRBuilder, Info.OrigRet, PreSplitRetInfos/*InArgs*/, DL, Info.CallConv);
-
- processSplitArgs(MIRBuilder, Info.OrigRet,
- PreSplitRetInfos, InArgs/*SplitRetInfos*/, DL, Info.CallConv, false,
- [&](ArrayRef<Register> Regs, Register DstReg,
- LLT LLTy, LLT PartLLT, int VTSplitIdx) {
- assert(DstReg == Info.OrigRet.Regs[VTSplitIdx]);
- packSplitRegsToOrigType(MIRBuilder, Info.OrigRet.Regs[VTSplitIdx],
- Regs, LLTy, PartLLT);
- });
- }
-
- // Make sure the raw argument copies are inserted before the marshalling to
- // the original types.
- MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd);
-
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
// implicit-define of the call instruction.
if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
Info.IsVarArg);
- CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
- if (!handleAssignments(MIRBuilder, InArgs, Handler))
+ IncomingValueAssigner Assigner(RetAssignFn);
+ CallReturnHandler Handler(MIRBuilder, MRI, MIB);
+ if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
+ Info.CallConv, Info.IsVarArg))
return false;
}
uint64_t CalleePopBytes = NumBytes;
- CallSeqEnd.addImm(0)
+
+ MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
+ .addImm(0)
.addImm(CalleePopBytes);
- // Restore the insert point to after the call sequence.
- MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt);
+ if (!Info.CanLowerReturn) {
+ insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
+ Info.DemoteRegister, Info.DemoteStackIndex);
+ }
+
return true;
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 1312388..569c6d7 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -19,28 +19,16 @@
namespace llvm {
class AMDGPUTargetLowering;
+class GCNSubtarget;
class MachineInstrBuilder;
+class SIMachineFunctionInfo;
class AMDGPUCallLowering final : public CallLowering {
- void lowerParameterPtr(Register DstReg, MachineIRBuilder &B, Type *ParamTy,
+ void lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
uint64_t Offset) const;
- void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset,
- Align Alignment, Register DstReg) const;
-
- /// A function of this type is used to perform value split action.
- using SplitArgTy = std::function<void(ArrayRef<Register>, Register, LLT, LLT, int)>;
-
- void splitToValueTypes(MachineIRBuilder &B, const ArgInfo &OrigArgInfo,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, CallingConv::ID CallConv) const;
-
- void processSplitArgs(MachineIRBuilder &B, const ArgInfo &OrigArgInfo,
- const SmallVectorImpl<ArgInfo> &SplitArg,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, CallingConv::ID CallConv,
- bool IsOutgoing,
- SplitArgTy PerformArgSplit) const;
+ void lowerParameter(MachineIRBuilder &B, ArgInfo &AI, uint64_t Offset,
+ Align Alignment) const;
bool canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv,
SmallVectorImpl<BaseArgInfo> &Outs,
@@ -68,6 +56,29 @@
SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
CallLoweringInfo &Info) const;
+ bool
+ doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info,
+ MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &InArgs) const;
+
+ bool
+ areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &OutArgs) const;
+
+ /// Returns true if the call can be lowered as a tail call.
+ bool
+ isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &InArgs,
+ SmallVectorImpl<ArgInfo> &OutArgs) const;
+
+ void handleImplicitCallArguments(
+ MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
+ const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI,
+ ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const;
+
+ bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &OutArgs) const;
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 250c427..90b5239 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -34,16 +34,13 @@
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
]>>>,
- CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
- CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
- CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
- CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
- CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
- CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
- CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>
]>;
def RetCC_SI_Gfx : CallingConv<[
+ CCIfType<[i1], CCPromoteToType<i32>>,
+ CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+
// 0-3 are reserved for the stack buffer descriptor
// 32 is reserved for the stack pointer
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
@@ -74,14 +71,6 @@
VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
]>>>,
-
- CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
- CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
- CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
- CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
- CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
- CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
- CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
]>;
def CC_SI_SHADER : CallingConv<[
@@ -118,6 +107,7 @@
]>;
def RetCC_SI_Shader : CallingConv<[
+ CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
CCIfType<[i32, i16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
@@ -175,6 +165,10 @@
(sequence "VGPR%u", 248, 255))
>;
+def CSR_AMDGPU_AGPRs_32_255 : CalleeSavedRegs<
+ (sequence "AGPR%u", 32, 255)
+>;
+
def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
(sequence "SGPR%u", 32, 105)
>;
@@ -184,6 +178,13 @@
(sequence "VGPR%u", 0, 255)
>;
+def CSR_AMDGPU_AllAGPRs : CalleeSavedRegs<
+ (sequence "AGPR%u", 0, 255)
+>;
+def CSR_AMDGPU_AllVectorRegs : CalleeSavedRegs<
+ (add CSR_AMDGPU_AllVGPRs, CSR_AMDGPU_AllAGPRs)
+>;
+
// Just to get the regmask, not for calling convention purposes.
def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
(add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI)
@@ -193,6 +194,10 @@
(add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105)
>;
+def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs<
+ (add CSR_AMDGPU_HighRegs, CSR_AMDGPU_AGPRs_32_255)
+>;
+
def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
// Calling convention for leaf functions
@@ -205,13 +210,7 @@
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
- CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
- CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
- CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
- CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
- CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
- CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>
]>;
// Calling convention for leaf functions
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 2556996..60e79c2 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
@@ -200,6 +201,7 @@
AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
bool visitFDiv(BinaryOperator &I);
+ bool visitXor(BinaryOperator &I);
bool visitInstruction(Instruction &I) { return false; }
bool visitBinaryOperator(BinaryOperator &I);
@@ -807,9 +809,34 @@
return !!NewFDiv;
}
+bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
+ // Match the Xor instruction, its type and its operands
+ IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0));
+ ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1));
+ if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
+ return visitBinaryOperator(I);
+
+ // Check that the call is to the amdgcn_class intrinsic and that it has
+ // only one use.
+ if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
+ !IntrinsicCall->hasOneUse())
+ return visitBinaryOperator(I);
+
+ // "Not" the second argument of the intrinsic call
+ ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1));
+ if (!Arg)
+ return visitBinaryOperator(I);
+
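+ // llvm.amdgcn.class takes a 10-bit mask of floating-point classes, so
+ // inverting the mask within those 10 bits (XOR with 0x3ff) negates the
+ // test and absorbs the outer xor with -1.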
+ IntrinsicCall->setOperand(
+ 1, ConstantInt::get(Arg->getType(), Arg->getZExtValue() ^ 0x3ff));
+ I.replaceAllUsesWith(IntrinsicCall);
+ I.eraseFromParent();
+ return true;
+}
+
static bool hasUnsafeFPMath(const Function &F) {
Attribute Attr = F.getFnAttribute("unsafe-fp-math");
- return Attr.getValueAsString() == "true";
+ return Attr.getValueAsBool();
}
static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index a839917..c6273ad 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -37,25 +37,54 @@
[{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
(apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
+def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
+
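+// Recognize an i64 value that is clamped to the signed 16-bit range before a
+// G_TRUNC, so the clamp and truncation can be lowered to a single clamping
+// conversion.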
+def clamp_i64_to_i16 : GICombineRule<
+ (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo),
+ (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16,
+ [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]),
+ (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
+
+def med3_matchdata : GIDefMatchData<"AMDGPURegBankCombinerHelper::Med3MatchInfo">;
+
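+// Fold a signed or unsigned min/max clamp pattern, e.g. smax(smin(x, hi), lo),
+// into a single med3 instruction at the register-bank level.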
+def int_minmax_to_med3 : GICombineRule<
+ (defs root:$min_or_max, med3_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SMAX,
+ G_SMIN,
+ G_UMAX,
+ G_UMIN):$min_or_max,
+ [{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
+ (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>;
+
+def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">;
+
+def remove_fcanonicalize : GICombineRule<
+ (defs root:$fcanonicalize, remove_fcanonicalize_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_FCANONICALIZE):$fcanonicalize,
+ [{ return PostLegalizerHelper.matchRemoveFcanonicalize(*${fcanonicalize}, ${matchinfo}); }]),
+ (apply [{ Helper.replaceSingleDefInstWithReg(*${fcanonicalize}, ${matchinfo}); }])>;
+
// Combines which should only apply on SI/VI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
-
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
- "AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> {
+ "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> {
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
+ let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
}
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPostLegalizerCombinerHelper",
[all_combines, gfx6gfx7_combines,
- uchar_to_float, cvt_f32_ubyteN]> {
+ uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize]> {
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
let AdditionalArguments = [];
}
def AMDGPURegBankCombinerHelper : GICombinerHelper<
- "AMDGPUGenRegBankCombinerHelper", []> {
+ "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3]> {
let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
+ let StateClass = "AMDGPURegBankCombinerHelperState";
+ let AdditionalArguments = [];
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
index 041d6de..87b459f 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
@@ -6,6 +6,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUEXPORTCLUSTERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUEXPORTCLUSTERING_H
+
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include <memory>
@@ -14,3 +17,5 @@
std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation();
} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUEXPORTCLUSTERING_H
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index bba0373..521c8f2 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -70,10 +70,10 @@
def gi_flat_offset :
GIComplexOperandMatcher<s64, "selectFlatOffset">,
- GIComplexPatternEquiv<FLATOffset>;
-def gi_flat_offset_signed :
- GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">,
- GIComplexPatternEquiv<FLATOffsetSigned>;
+ GIComplexPatternEquiv<FlatOffset>;
+def gi_global_offset :
+ GIComplexOperandMatcher<s64, "selectGlobalOffset">,
+ GIComplexPatternEquiv<GlobalOffset>;
def gi_global_saddr :
GIComplexOperandMatcher<s64, "selectGlobalSAddr">,
GIComplexPatternEquiv<GlobalSAddr>;
@@ -86,7 +86,7 @@
GIComplexPatternEquiv<MUBUFScratchOffen>;
def gi_flat_scratch_offset :
- GIComplexOperandMatcher<s32, "selectFlatOffsetSigned">,
+ GIComplexOperandMatcher<s32, "selectScratchOffset">,
GIComplexPatternEquiv<ScratchOffset>;
def gi_flat_scratch_saddr :
@@ -113,14 +113,6 @@
GIComplexOperandMatcher<s64, "selectMUBUFOffset">,
GIComplexPatternEquiv<MUBUFOffset>;
-def gi_mubuf_addr64_atomic :
- GIComplexOperandMatcher<s64, "selectMUBUFAddr64Atomic">,
- GIComplexPatternEquiv<MUBUFAddr64Atomic>;
-
-def gi_mubuf_offset_atomic :
- GIComplexOperandMatcher<s64, "selectMUBUFOffsetAtomic">,
- GIComplexPatternEquiv<MUBUFOffsetAtomic>;
-
def gi_smrd_buffer_imm :
GIComplexOperandMatcher<s64, "selectSMRDBufferImm">,
GIComplexPatternEquiv<SMRDBufferImm>;
@@ -136,6 +128,8 @@
def : GINodeEquiv<G_LOAD, AMDGPUld_glue> {
let CheckMMOIsNonAtomic = 1;
+ let IfSignExtend = G_SEXTLOAD;
+ let IfZeroExtend = G_ZEXTLOAD;
}
def : GINodeEquiv<G_STORE, AMDGPUst_glue> {
@@ -174,6 +168,10 @@
def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>;
def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;
+def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>;
+def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>;
+def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>;
+
def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;
@@ -216,6 +214,8 @@
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
@@ -302,16 +302,16 @@
defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>;
}
-def gi_as_i32timm : GICustomOperandRenderer<"renderTruncTImm32">,
+def gi_as_i32timm : GICustomOperandRenderer<"renderTruncTImm">,
GISDNodeXFormEquiv<as_i32timm>;
-def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm16">,
+def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm">,
GISDNodeXFormEquiv<as_i16timm>;
-def gi_as_i8timm : GICustomOperandRenderer<"renderTruncTImm8">,
+def gi_as_i8timm : GICustomOperandRenderer<"renderTruncTImm">,
GISDNodeXFormEquiv<as_i8timm>;
-def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm1">,
+def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm">,
GISDNodeXFormEquiv<as_i1timm>;
def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">,
@@ -323,17 +323,14 @@
def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">,
GISDNodeXFormEquiv<IMMPopCount>;
-def gi_extract_glc : GICustomOperandRenderer<"renderExtractGLC">,
- GISDNodeXFormEquiv<extract_glc>;
-
-def gi_extract_slc : GICustomOperandRenderer<"renderExtractSLC">,
- GISDNodeXFormEquiv<extract_slc>;
-
-def gi_extract_dlc : GICustomOperandRenderer<"renderExtractDLC">,
- GISDNodeXFormEquiv<extract_dlc>;
+def gi_extract_cpol : GICustomOperandRenderer<"renderExtractCPol">,
+ GISDNodeXFormEquiv<extract_cpol>;
def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
GISDNodeXFormEquiv<extract_swz>;
+def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">,
+ GISDNodeXFormEquiv<set_glc>;
+
def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">,
GISDNodeXFormEquiv<frameindex_to_targetframeindex>;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index b3bafc5..cabdc69 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -41,6 +41,20 @@
return std::make_pair(Def->getOperand(1).getReg(), Offset);
}
+ // Handle G_PTRTOINT (G_PTR_ADD base, const) case
+ if (Def->getOpcode() == TargetOpcode::G_PTRTOINT) {
+ MachineInstr *Base;
+ if (mi_match(Def->getOperand(1).getReg(), MRI,
+ m_GPtrAdd(m_MInstr(Base), m_ICst(Offset)))) {
+ // If Base was int converted to pointer, simply return int and offset.
+ if (Base->getOpcode() == TargetOpcode::G_INTTOPTR)
+ return std::make_pair(Base->getOperand(1).getReg(), Offset);
+
+ // Register returned here will be of pointer type.
+ return std::make_pair(Base->getOperand(0).getReg(), Offset);
+ }
+ }
+
return std::make_pair(Reg, 0);
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 404e0fc..14d3a3f 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/Register.h"
#include <utility>
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 39f9092..8eeda7b 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -226,8 +226,8 @@
void MetadataStreamerV2::emitVersion() {
auto &Version = HSAMetadata.mVersion;
- Version.push_back(VersionMajor);
- Version.push_back(VersionMinor);
+ Version.push_back(VersionMajorV2);
+ Version.push_back(VersionMinorV2);
}
void MetadataStreamerV2::emitPrintf(const Module &Mod) {
@@ -435,7 +435,8 @@
return TargetStreamer.EmitHSAMetadata(getHSAMetadata());
}
-void MetadataStreamerV2::begin(const Module &Mod) {
+void MetadataStreamerV2::begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) {
emitVersion();
emitPrintf(Mod);
}
@@ -608,8 +609,8 @@
void MetadataStreamerV3::emitVersion() {
auto Version = HSAMetadataDoc->getArrayNode();
- Version.push_back(Version.getDocument()->getNode(VersionMajor));
- Version.push_back(Version.getDocument()->getNode(VersionMinor));
+ Version.push_back(Version.getDocument()->getNode(VersionMajorV3));
+ Version.push_back(Version.getDocument()->getNode(VersionMinorV3));
getRootMetadata("amdhsa.version") = Version;
}
@@ -881,7 +882,8 @@
return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true);
}
-void MetadataStreamerV3::begin(const Module &Mod) {
+void MetadataStreamerV3::begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) {
emitVersion();
emitPrintf(Mod);
getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
@@ -921,6 +923,30 @@
Kernels.push_back(Kern);
}
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV4
+//===----------------------------------------------------------------------===//
+
+void MetadataStreamerV4::emitVersion() {
+ auto Version = HSAMetadataDoc->getArrayNode();
+ Version.push_back(Version.getDocument()->getNode(VersionMajorV4));
+ Version.push_back(Version.getDocument()->getNode(VersionMinorV4));
+ getRootMetadata("amdhsa.version") = Version;
+}
+
+void MetadataStreamerV4::emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID) {
+ getRootMetadata("amdhsa.target") =
+ HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true);
+}
+
+void MetadataStreamerV4::begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) {
+ emitVersion();
+ emitTargetID(TargetID);
+ emitPrintf(Mod);
+ getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
+}
+
} // end namespace HSAMD
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 1c6db14..4824b4c 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/Alignment.h"
@@ -40,7 +41,8 @@
virtual bool emitTo(AMDGPUTargetStreamer &TargetStreamer) = 0;
- virtual void begin(const Module &Mod) = 0;
+ virtual void begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) = 0;
virtual void end() = 0;
@@ -48,8 +50,9 @@
const SIProgramInfo &ProgramInfo) = 0;
};
-class MetadataStreamerV3 final : public MetadataStreamer {
-private:
+// TODO: Rename MetadataStreamerV3 -> MetadataStreamerMsgPackV3.
+class MetadataStreamerV3 : public MetadataStreamer {
+protected:
std::unique_ptr<msgpack::Document> HSAMetadataDoc =
std::make_unique<msgpack::Document>();
@@ -108,7 +111,8 @@
bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
- void begin(const Module &Mod) override;
+ void begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) override;
void end() override;
@@ -116,6 +120,21 @@
const SIProgramInfo &ProgramInfo) override;
};
+// TODO: Rename MetadataStreamerV4 -> MetadataStreamerMsgPackV4.
+class MetadataStreamerV4 final : public MetadataStreamerV3 {
+ void emitVersion();
+
+ void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID);
+
+public:
+ MetadataStreamerV4() = default;
+ ~MetadataStreamerV4() = default;
+
+ void begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) override;
+};
+
+// TODO: Rename MetadataStreamerV2 -> MetadataStreamerYamlV2.
class MetadataStreamerV2 final : public MetadataStreamer {
private:
Metadata HSAMetadata;
@@ -172,7 +191,8 @@
bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
- void begin(const Module &Mod) override;
+ void begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) override;
void end() override;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 340f4ac..a3106de 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -107,6 +107,10 @@
bool EnableLateStructurizeCFG;
+ // Instructions that will be lowered with a final instruction that zeros the
+ // high result bits.
+ bool fp16SrcZerosHighBits(unsigned Opc) const;
+
public:
explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
@@ -188,15 +192,9 @@
SDValue &Offset1, unsigned Size) const;
bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
- SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
+ SDValue &Idxen, SDValue &Addr64) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
- SDValue &SOffset, SDValue &Offset, SDValue &GLC,
- SDValue &SLC, SDValue &TFE, SDValue &DLC,
- SDValue &SWZ) const;
- bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
- SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
- SDValue &SLC) const;
+ SDValue &SOffset, SDValue &Offset) const;
bool SelectMUBUFScratchOffen(SDNode *Parent,
SDValue Addr, SDValue &RSrc, SDValue &VAddr,
SDValue &SOffset, SDValue &ImmOffset) const;
@@ -204,17 +202,17 @@
SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
- bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
- SDValue &Offset, SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
- bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
- SDValue &Offset, SDValue &SLC) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
- template <bool IsSigned>
+ bool SelectFlatOffsetImpl(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset, uint64_t FlatVariant) const;
bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset) const;
+ bool SelectGlobalOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset) const;
+ bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
@@ -322,6 +320,16 @@
// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
In = stripBitcast(In);
+
+ if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
+ if (!Idx->isOne())
+ return false;
+ Out = In.getOperand(0);
+ return true;
+ }
+ }
+
if (In.getOpcode() != ISD::TRUNCATE)
return false;
@@ -341,6 +349,13 @@
// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
+ if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
+ if (Idx->isNullValue() && In.getValueSizeInBits() <= 32)
+ return In.getOperand(0);
+ }
+ }
+
if (In.getOpcode() == ISD::TRUNCATE) {
SDValue Src = In.getOperand(0);
if (Src.getValueType().getSizeInBits() == 32)
@@ -391,6 +406,68 @@
return SelectionDAGISel::runOnMachineFunction(MF);
}
+bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
+ // XXX - only need to list legal operations.
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FCANONICALIZE:
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::FABS:
+ // Fabs is lowered to a bit operation, but it's an and which will clear the
+ // high bits anyway.
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FPOWI:
+ case ISD::FPOW:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case AMDGPUISD::FRACT:
+ case AMDGPUISD::CLAMP:
+ case AMDGPUISD::COS_HW:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMIN3:
+ case AMDGPUISD::FMAX3:
+ case AMDGPUISD::FMED3:
+ case AMDGPUISD::FMAD_FTZ:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::RCP_IFLAG:
+ case AMDGPUISD::LDEXP:
+ // On gfx10, all 16-bit instructions preserve the high bits.
+ return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
+ case ISD::FP_ROUND:
+ // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
+ // high bits on gfx9.
+ // TODO: If we had the source node we could see if the source was fma/mad
+ return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ case ISD::FMA:
+ case ISD::FMAD:
+ case AMDGPUISD::DIV_FIXUP:
+ return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ default:
+ // fcopysign, select and others may be lowered to 32-bit bit operations
+ // which don't zero the high bits.
+ return false;
+ }
+}
+
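
For illustration of why fp16SrcZerosHighBits matters: when the 32-bit register holding an f16 result is reused as a wider value (for example when packing two halves), a masking AND can be skipped only if the producer is known to have cleared bits 31:16. A minimal standalone C++ sketch with hypothetical bit patterns (illustration only, not backend code):

#include <cstdint>
#include <cstdio>

// Pack two 16-bit halves into one 32-bit value. The unmasked form is only
// correct when the producer of Lo already zeroed bits 31:16; otherwise the
// extra AND in the masked form is required.
static uint32_t packUnmasked(uint32_t Lo, uint32_t Hi) {
  return (Hi << 16) | Lo;
}
static uint32_t packMasked(uint32_t Lo, uint32_t Hi) {
  return (Hi << 16) | (Lo & 0xffff);
}

int main() {
  uint32_t DirtyLo = 0xABCD3C00; // f16 1.0 (0x3c00) with garbage in bits 31:16
  uint32_t CleanHi = 0x00004000; // f16 2.0 (0x4000), high bits already zero
  std::printf("unmasked: 0x%08x\n", (unsigned)packUnmasked(DirtyLo, CleanHi)); // 0xebcd3c00 (wrong)
  std::printf("masked:   0x%08x\n", (unsigned)packMasked(DirtyLo, CleanHi));   // 0x40003c00 (right)
  return 0;
}
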
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
assert(Subtarget->d16PreservesUnusedBits());
MVT VT = N->getValueType(0).getSimpleVT();
@@ -1374,13 +1451,10 @@
return true;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
- SDValue &VAddr, SDValue &SOffset,
- SDValue &Offset, SDValue &Offen,
- SDValue &Idxen, SDValue &Addr64,
- SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC,
- SDValue &SWZ) const {
+bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
+ SDValue &SOffset, SDValue &Offset,
+ SDValue &Offen, SDValue &Idxen,
+ SDValue &Addr64) const {
// Subtarget prefers to use flat instruction
// FIXME: This should be a pattern predicate and not reach here
if (Subtarget->useFlatForGlobal())
@@ -1388,14 +1462,6 @@
SDLoc DL(Addr);
- if (!GLC.getNode())
- GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
- if (!SLC.getNode())
- SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
- TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
- DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
- SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1);
-
Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -1472,9 +1538,7 @@
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset,
- SDValue &Offset, SDValue &GLC,
- SDValue &SLC, SDValue &TFE,
- SDValue &DLC, SDValue &SWZ) const {
+ SDValue &Offset) const {
SDValue Ptr, Offen, Idxen, Addr64;
// addr64 bit was removed for volcanic islands.
@@ -1482,8 +1546,7 @@
if (!Subtarget->hasAddr64())
return false;
- if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE, DLC, SWZ))
+ if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
return false;
ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
@@ -1500,21 +1563,6 @@
return false;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
- SDValue &VAddr, SDValue &SOffset,
- SDValue &Offset,
- SDValue &SLC) const {
- SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
- SDValue GLC, TFE, DLC, SWZ;
-
- return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ);
-}
-
-static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
- auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
- return PSV && PSV->isStack();
-}
-
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
SDLoc DL(N);
@@ -1551,13 +1599,7 @@
AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
VAddr = SDValue(MovHighBits, 0);
- // In a call sequence, stores to the argument stack area are relative to the
- // stack pointer.
- const MachinePointerInfo &PtrInfo
- = cast<MemSDNode>(Parent)->getPointerInfo();
- SOffset = isStackPtrRelative(PtrInfo)
- ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
- : CurDAG->getTargetConstant(0, DL, MVT::i32);
+ SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
return true;
}
@@ -1600,44 +1642,65 @@
return true;
}
+static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
+ if (Val.getOpcode() != ISD::CopyFromReg)
+ return false;
+ auto RC =
+ TRI.getPhysRegClass(cast<RegisterSDNode>(Val.getOperand(1))->getReg());
+ return RC && TRI.isSGPRClass(RC);
+}
+
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
SDValue Addr,
SDValue &SRsrc,
SDValue &SOffset,
SDValue &Offset) const {
- ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
- if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
- return false;
-
- SDLoc DL(Addr);
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
MachineFunction &MF = CurDAG->getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ SDLoc DL(Addr);
+
+ // CopyFromReg <sgpr>
+ if (IsCopyFromSGPR(*TRI, Addr)) {
+ SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+ SOffset = Addr;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ return true;
+ }
+
+ ConstantSDNode *CAddr;
+ if (Addr.getOpcode() == ISD::ADD) {
+ // Add (CopyFromReg <sgpr>) <constant>
+ CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+ if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
+ return false;
+ if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
+ return false;
+
+ SOffset = Addr.getOperand(0);
+ } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
+ SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
+ // <constant>
+ SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ } else {
+ return false;
+ }
SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
- const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
-
- // FIXME: Get from MachinePointerInfo? We should only be using the frame
- // offset if we know this is in a call sequence.
- SOffset = isStackPtrRelative(PtrInfo)
- ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
- : CurDAG->getTargetConstant(0, DL, MVT::i32);
-
Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
- SDValue &SOffset, SDValue &Offset,
- SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC,
- SDValue &SWZ) const {
+ SDValue &SOffset, SDValue &Offset
+ ) const {
SDValue Ptr, VAddr, Offen, Idxen, Addr64;
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
- if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE, DLC, SWZ))
+ if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
return false;
if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
@@ -1656,21 +1719,6 @@
return false;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
- SDValue &Soffset, SDValue &Offset
- ) const {
- SDValue GLC, SLC, TFE, DLC, SWZ;
-
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
-}
-bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
- SDValue &Soffset, SDValue &Offset,
- SDValue &SLC) const {
- SDValue GLC, TFE, DLC, SWZ;
-
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
-}
-
// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode* findMemSDNode(SDNode *N) {
@@ -1685,24 +1733,25 @@
llvm_unreachable("cannot find MemSDNode in the pattern!");
}
-template <bool IsSigned>
-bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
- SDValue Addr,
- SDValue &VAddr,
- SDValue &Offset) const {
+bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
+ SDValue &VAddr, SDValue &Offset,
+ uint64_t FlatVariant) const {
int64_t OffsetVal = 0;
unsigned AS = findMemSDNode(N)->getAddressSpace();
- if (Subtarget->hasFlatInstOffsets() &&
- (!Subtarget->hasFlatSegmentOffsetBug() ||
- AS != AMDGPUAS::FLAT_ADDRESS)) {
+ bool CanHaveFlatSegmentOffsetBug =
+ Subtarget->hasFlatSegmentOffsetBug() &&
+ FlatVariant == SIInstrFlags::FLAT &&
+ (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
+
+ if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
SDValue N0, N1;
if (isBaseWithConstantOffset64(Addr, N0, N1)) {
- uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+ int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
+ if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
Addr = N0;
OffsetVal = COffsetVal;
} else {
@@ -1719,8 +1768,8 @@
SDLoc DL(N);
uint64_t RemainderOffset;
- std::tie(OffsetVal, RemainderOffset)
- = TII->splitFlatOffset(COffsetVal, AS, IsSigned);
+ std::tie(OffsetVal, RemainderOffset) =
+ TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
SDValue AddOffsetLo =
getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
@@ -1777,6 +1826,25 @@
return true;
}
+bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset) const {
+ return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
+}
+
+bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset) const {
+ return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
+}
+
+bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset) const {
+ return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
+ SIInstrFlags::FlatScratch);
+}
+
// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
if (Op.getOpcode() != ISD::ZERO_EXTEND)
@@ -1802,88 +1870,87 @@
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true)) {
+ if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal)) {
Addr = LHS;
ImmOffset = COffsetVal;
- } else if (!LHS->isDivergent() && COffsetVal > 0) {
- SDLoc SL(N);
- // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) +
- // (large_offset & MaxOffset);
- int64_t SplitImmOffset, RemainderOffset;
- std::tie(SplitImmOffset, RemainderOffset)
- = TII->splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true);
+ } else if (!LHS->isDivergent()) {
+ if (COffsetVal > 0) {
+ SDLoc SL(N);
+ // saddr + large_offset -> saddr +
+ // (voffset = large_offset & ~MaxOffset) +
+ // (large_offset & MaxOffset);
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
+ COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
- if (isUInt<32>(RemainderOffset)) {
- SDNode *VMov = CurDAG->getMachineNode(
- AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
- CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
- VOffset = SDValue(VMov, 0);
- SAddr = LHS;
- Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
- return true;
+ if (isUInt<32>(RemainderOffset)) {
+ SDNode *VMov = CurDAG->getMachineNode(
+ AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
+ CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
+ VOffset = SDValue(VMov, 0);
+ SAddr = LHS;
+ Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
+ return true;
+ }
}
+
+ // We are adding a 64-bit SGPR and a constant. If the constant bus limit
+ // is 1, we would need 1 or 2 extra moves for each half of the constant,
+ // so it is better to do a scalar add and then issue a single VALU
+ // instruction to materialize zero. Otherwise it takes fewer instructions
+ // to perform VALU adds with immediates or inline literals.
+ unsigned NumLiterals =
+ !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
+ !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
+ if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
+ return false;
}
}
// Match the variable offset.
- if (Addr.getOpcode() != ISD::ADD) {
- if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
- isa<ConstantSDNode>(Addr))
- return false;
+ if (Addr.getOpcode() == ISD::ADD) {
+ LHS = Addr.getOperand(0);
+ RHS = Addr.getOperand(1);
- // It's cheaper to materialize a single 32-bit zero for vaddr than the two
- // moves required to copy a 64-bit SGPR to VGPR.
- SAddr = Addr;
- SDNode *VMov = CurDAG->getMachineNode(
- AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
- CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
- VOffset = SDValue(VMov, 0);
- Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
- return true;
- }
+ if (!LHS->isDivergent()) {
+ // add (i64 sgpr), (zero_extend (i32 vgpr))
+ if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
+ SAddr = LHS;
+ VOffset = ZextRHS;
+ }
+ }
- LHS = Addr.getOperand(0);
- RHS = Addr.getOperand(1);
+ if (!SAddr && !RHS->isDivergent()) {
+ // add (zero_extend (i32 vgpr)), (i64 sgpr)
+ if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
+ SAddr = RHS;
+ VOffset = ZextLHS;
+ }
+ }
- if (!LHS->isDivergent()) {
- // add (i64 sgpr), (zero_extend (i32 vgpr))
- if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
- SAddr = LHS;
- VOffset = ZextRHS;
+ if (SAddr) {
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+ return true;
}
}
- if (!SAddr && !RHS->isDivergent()) {
- // add (zero_extend (i32 vgpr)), (i64 sgpr)
- if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
- SAddr = RHS;
- VOffset = ZextLHS;
- }
- }
-
- if (!SAddr)
+ if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
+ isa<ConstantSDNode>(Addr))
return false;
+ // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+ // moves required to copy a 64-bit SGPR to VGPR.
+ SAddr = Addr;
+ SDNode *VMov =
+ CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
+ CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
+ VOffset = SDValue(VMov, 0);
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
return true;
}
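
To make the literal-counting comment in SelectGlobalSAddr above concrete, here is a standalone sketch that counts how many 32-bit halves of a folded 64-bit offset would need a literal. It deliberately models only the integer inline-constant range [-16, 64]; the real SIInstrInfo::isInlineConstant also accepts certain floating-point bit patterns, so treat this as an approximation:

#include <cstdint>
#include <cstdio>

// Simplified stand-in for SIInstrInfo::isInlineConstant (integer range only).
static bool isInlineConstantInt32(int32_t V) { return V >= -16 && V <= 64; }

// One literal per 32-bit half of the offset that cannot be encoded inline,
// mirroring the NumLiterals computation above.
static unsigned countLiterals(int64_t COffsetVal) {
  int32_t Lo = static_cast<int32_t>(COffsetVal & 0xffffffff);
  int32_t Hi = static_cast<int32_t>(COffsetVal >> 32);
  return !isInlineConstantInt32(Lo) + !isInlineConstantInt32(Hi);
}

int main() {
  std::printf("%u\n", countLiterals(0x100000040LL));        // halves 1 and 64: 0 literals
  std::printf("%u\n", countLiterals(0x1234567800001000LL)); // neither half inline: 2 literals
  return 0;
}
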
-// Match (32-bit SGPR base) + sext(imm offset)
-bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
- SDValue Addr,
- SDValue &SAddr,
- SDValue &Offset) const {
- if (Addr->isDivergent())
- return false;
-
- SAddr = Addr;
- int64_t COffsetVal = 0;
-
- if (CurDAG->isBaseWithConstantOffset(Addr)) {
- COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
- SAddr = Addr.getOperand(0);
- }
-
+static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
} else if (SAddr.getOpcode() == ISD::ADD &&
@@ -1893,35 +1960,54 @@
auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
FI->getValueType(0));
- SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr),
+ SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
MVT::i32, TFI, SAddr.getOperand(1)),
0);
}
- const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ return SAddr;
+}
- if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
- int64_t RemainderOffset = COffsetVal;
- int64_t ImmField = 0;
- const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true);
- // Use signed division by a power of two to truncate towards 0.
- int64_t D = 1LL << (NumBits - 1);
- RemainderOffset = (COffsetVal / D) * D;
- ImmField = COffsetVal - RemainderOffset;
+// Match (32-bit SGPR base) + sext(imm offset)
+bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
+ SDValue &SAddr,
+ SDValue &Offset) const {
+ if (Addr->isDivergent())
+ return false;
- assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, true));
- assert(RemainderOffset + ImmField == COffsetVal);
+ SDLoc DL(Addr);
- COffsetVal = ImmField;
+ int64_t COffsetVal = 0;
- SDLoc DL(N);
- SDValue AddOffset =
- getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
- SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32,
- SAddr, AddOffset), 0);
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
+ SAddr = Addr.getOperand(0);
+ } else {
+ SAddr = Addr;
}
- Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16);
+ SAddr = SelectSAddrFI(CurDAG, SAddr);
+
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+
+ if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch)) {
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
+ COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
+
+ COffsetVal = SplitImmOffset;
+
+ SDValue AddOffset =
+ SAddr.getOpcode() == ISD::TargetFrameIndex
+ ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
+ : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
+ SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
+ SAddr, AddOffset),
+ 0);
+ }
+
+ Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
return true;
}
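
The SelectScratchSAddr rewrite above delegates offset splitting to TII->splitFlatOffset instead of the removed inline computation. As a reference for what that split does, here is a standalone sketch based on the removed lines (signed division by a power of two truncates toward zero); NumBits is the width of the signed immediate field and is a parameter here, not a claim about any particular subtarget, and the shared helper may differ in detail:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <utility>

// Split Offset so that Offset == Remainder + ImmField and ImmField fits in a
// signed NumBits-wide immediate. Truncating division keeps ImmField the same
// sign as Offset, matching the removed inline code.
static std::pair<int64_t, int64_t> splitOffset(int64_t Offset, unsigned NumBits) {
  int64_t D = int64_t(1) << (NumBits - 1);
  int64_t Remainder = (Offset / D) * D;
  int64_t ImmField = Offset - Remainder;
  assert(ImmField > -D && ImmField < D && "immediate must fit the field");
  return {ImmField, Remainder};
}

int main() {
  // With a hypothetical 13-bit signed field (D = 4096):
  auto [Imm, Rem] = splitOffset(10000, 13);
  std::printf("imm=%lld remainder=%lld\n", (long long)Imm, (long long)Rem);
  // Prints imm=1808 remainder=8192; the remainder is folded into SAddr with
  // S_ADD_I32 while the immediate stays in the instruction's offset field.
  return 0;
}
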
@@ -2364,35 +2450,32 @@
MachineSDNode *CmpSwap = nullptr;
if (Subtarget->hasAddr64()) {
- SDValue SRsrc, VAddr, SOffset, Offset, SLC;
+ SDValue SRsrc, VAddr, SOffset, Offset;
- if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) {
+ if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) {
unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
SDValue CmpVal = Mem->getOperand(2);
- SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1);
+ SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
// XXX - Do we care about glue operands?
- SDValue Ops[] = {
- CmpVal, VAddr, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain()
- };
+ SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol,
+ Mem->getChain()};
CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
}
}
if (!CmpSwap) {
- SDValue SRsrc, SOffset, Offset, SLC;
- if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
+ SDValue SRsrc, SOffset, Offset;
+ if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) {
unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;
SDValue CmpVal = Mem->getOperand(2);
- SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1);
- SDValue Ops[] = {
- CmpVal, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain()
- };
+ SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
+ SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()};
CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
}
@@ -2623,7 +2706,11 @@
Opcode = AMDGPU::SOFT_WQM;
break;
case Intrinsic::amdgcn_wwm:
- Opcode = AMDGPU::WWM;
+ case Intrinsic::amdgcn_strict_wwm:
+ Opcode = AMDGPU::STRICT_WWM;
+ break;
+ case Intrinsic::amdgcn_strict_wqm:
+ Opcode = AMDGPU::STRICT_WQM;
break;
case Intrinsic::amdgcn_interp_p1_f16:
SelectInterpP1F16(N);
@@ -2773,18 +2860,62 @@
if (isExtractHiElt(Hi, Hi))
Mods |= SISrcMods::OP_SEL_1;
+ unsigned VecSize = Src.getValueSizeInBits();
Lo = stripExtractLoElt(Lo);
Hi = stripExtractLoElt(Hi);
+ if (Lo.getValueSizeInBits() > VecSize) {
+ Lo = CurDAG->getTargetExtractSubreg(
+ (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+ MVT::getIntegerVT(VecSize), Lo);
+ }
+
+ if (Hi.getValueSizeInBits() > VecSize) {
+ Hi = CurDAG->getTargetExtractSubreg(
+ (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+ MVT::getIntegerVT(VecSize), Hi);
+ }
+
+ assert(Lo.getValueSizeInBits() <= VecSize &&
+ Hi.getValueSizeInBits() <= VecSize);
+
if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
// Really a scalar input. Just select from the low half of the register to
// avoid packing.
- Src = Lo;
+ if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
+ Src = Lo;
+ } else {
+ assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
+
+ SDLoc SL(In);
+ SDValue Undef = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
+ Lo.getValueType()), 0);
+ auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
+ : AMDGPU::SReg_64RegClassID;
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(RC, SL, MVT::i32),
+ Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+ Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
+
+ Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
+ Src.getValueType(), Ops), 0);
+ }
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
+ if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
+ uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
+ .bitcastToAPInt().getZExtValue();
+ if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
+ Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+ }
+ }
+
Mods = VecMods;
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0b4b477..d68488c 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -78,6 +78,12 @@
setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
+ setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
+
+ setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
+
setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
@@ -99,9 +105,15 @@
setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
+
setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
+ setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
+
setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
@@ -173,12 +185,14 @@
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
@@ -198,6 +212,12 @@
setOperationAction(ISD::STORE, MVT::v5f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
+ setOperationAction(ISD::STORE, MVT::v6f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
+
+ setOperationAction(ISD::STORE, MVT::v7f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
+
setOperationAction(ISD::STORE, MVT::v8f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
@@ -219,6 +239,12 @@
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v3i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
+
+ setOperationAction(ISD::STORE, MVT::v3f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
+
setOperationAction(ISD::STORE, MVT::v4i64, Promote);
AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
@@ -261,6 +287,11 @@
setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
+ setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
+ setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
+
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
@@ -325,8 +356,14 @@
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
@@ -335,6 +372,10 @@
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
@@ -343,6 +384,8 @@
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
@@ -412,8 +455,7 @@
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
static const MVT::SimpleValueType VectorIntTypes[] = {
- MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
- };
+ MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
for (MVT VT : VectorIntTypes) {
// Expand the following operations for the current type by default.
@@ -454,8 +496,7 @@
}
static const MVT::SimpleValueType FloatVectorTypes[] = {
- MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
- };
+ MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
for (MVT VT : FloatVectorTypes) {
setOperationAction(ISD::FABS, VT, Expand);
@@ -505,6 +546,12 @@
setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
+ setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
+
+ setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
+
// There are no libcalls of any kind.
for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
@@ -846,9 +893,9 @@
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
assert(VT.isFloatingPoint());
- return VT == MVT::f32 || VT == MVT::f64 ||
- (Subtarget->has16BitInsts() && VT == MVT::f16) ||
- (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
+ // Report this based on the end legalized type.
+ VT = VT.getScalarType();
+ return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
@@ -1257,8 +1304,9 @@
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
- case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
- case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return LowerFP_TO_INT(Op, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTLZ:
@@ -1304,7 +1352,8 @@
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
- if (!MFI->isModuleEntryFunction()) {
+ if (!MFI->isModuleEntryFunction() &&
+ !GV->getName().equals("llvm.amdgcn.module.lds")) {
SDLoc DL(Op);
const Function &Fn = DAG.getMachineFunction().getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
@@ -1368,6 +1417,14 @@
SmallVector<SDValue, 8> Args;
unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
EVT VT = Op.getValueType();
+ EVT SrcVT = Op.getOperand(0).getValueType();
+
+ // For these types, TableGen patterns cover the extraction except when the
+ // start index is 1.
+ if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
+ (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
+ Start != 1)
+ return Op;
+
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());
@@ -2579,33 +2636,77 @@
return LowerINT_TO_FP64(Op, DAG, true);
}
-SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
+SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
bool Signed) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
- SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+ assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
- SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
- MVT::f64);
- SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
- MVT::f64);
+ // The basic idea of converting a floating point number into a pair of 32-bit
+ // integers is illustrated as follows:
+ //
+ // tf := trunc(val);
+ // hif := floor(tf * 2^-32);
+ // lof := tf - hif * 2^32; // lof is always positive due to floor.
+ // hi := fptoi(hif);
+ // lo := fptoi(lof);
+ //
+ SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
+ SDValue Sign;
+ if (Signed && SrcVT == MVT::f32) {
+ // However, a 32-bit floating-point number has only a 23-bit mantissa,
+ // which is not enough to hold all the significant bits of `lof` if val is
+ // negative. To avoid the loss of precision, we take the absolute value
+ // after truncating and flip the result back based on the original
+ // signedness.
+ Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
+ DAG.getConstant(31, SL, MVT::i32));
+ Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
+ }
+
+ SDValue K0, K1;
+ if (SrcVT == MVT::f64) {
+ K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
+ SL, SrcVT);
+ K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
+ SL, SrcVT);
+ } else {
+ K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
+ SrcVT);
+ K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
+ SrcVT);
+ }
// TODO: Should this propagate fast-math-flags?
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
- SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
+ SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
+ SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
- SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
-
- SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
- MVT::i32, FloorMul);
+ SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
+ : ISD::FP_TO_UINT,
+ SL, MVT::i32, FloorMul);
SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
- SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
+ SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
+ DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
- return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
+ if (Signed && SrcVT == MVT::f32) {
+ assert(Sign);
+ // Flip the result based on the signedness, which is either all 0s or 1s.
+ Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
+ DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
+ // r := xor(r, sign) - sign;
+ Result =
+ DAG.getNode(ISD::SUB, SL, MVT::i64,
+ DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
+ }
+
+ return Result;
}
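
To see the conversion scheme from the comment block above in isolation, here is a host-side C++ sketch of the same arithmetic (illustration only, for values whose truncation is representable in 64 bits; the names are ad hoc and this is not the DAG lowering itself):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decompose trunc(Val) into two 32-bit halves exactly as the comment above
// describes: hi = floor(tf * 2^-32), lo = tf - hi * 2^32 (computed with fma
// so it stays exact), then reassemble. The signed-f32 path works on |Val| and
// flips the result with the xor/subtract trick at the end.
static uint64_t fpToUInt64(double Val) {
  double Tf = std::trunc(Val);
  double HiF = std::floor(Tf * 0x1p-32);   // K0 = 2^-32
  double LoF = std::fma(HiF, -0x1p32, Tf); // K1 = -2^32; LoF is always >= 0
  uint64_t Hi = static_cast<uint32_t>(HiF);
  uint64_t Lo = static_cast<uint32_t>(LoF);
  return (Hi << 32) | Lo;
}

static int64_t fpToSInt64FromF32(float Val) {
  // Sign is all ones if negative, all zeros otherwise, matching the SRA of
  // the bitcast sign bit above.
  int64_t Sign = (Val < 0.0f) ? -1 : 0;
  uint64_t Mag = fpToUInt64(std::fabs(static_cast<double>(Val)));
  return (static_cast<int64_t>(Mag) ^ Sign) - Sign; // r := xor(r, sign) - sign
}

int main() {
  std::printf("%llu\n", (unsigned long long)fpToUInt64(12345678901.75)); // 12345678901
  std::printf("%lld\n", (long long)fpToSInt64FromF32(-3.9f));            // -3
  return 0;
}
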
SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
@@ -2707,44 +2808,37 @@
return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
}
-SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
+ SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
-
- // TODO: Factor out code common with LowerFP_TO_UINT.
-
+ unsigned OpOpcode = Op.getOpcode();
EVT SrcVT = Src.getValueType();
+ EVT DestVT = Op.getValueType();
+
+ // Will be selected natively
+ if (SrcVT == MVT::f16 && DestVT == MVT::i16)
+ return Op;
+
+ // Promote i16 to i32
+ if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
+ SDLoc DL(Op);
+
+ SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
+ }
+
if (SrcVT == MVT::f16 ||
(SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
SDLoc DL(Op);
- SDValue FpToInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
- return DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, FpToInt32);
+ SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
+ unsigned Ext =
+ OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
}
- if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
- return LowerFP64_TO_INT(Op, DAG, true);
-
- return SDValue();
-}
-
-SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
- SelectionDAG &DAG) const {
- SDValue Src = Op.getOperand(0);
-
- // TODO: Factor out code common with LowerFP_TO_SINT.
-
- EVT SrcVT = Src.getValueType();
- if (SrcVT == MVT::f16 ||
- (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
- SDLoc DL(Op);
-
- SDValue FpToUInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
- return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, FpToUInt32);
- }
-
- if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
- return LowerFP64_TO_INT(Op, DAG, false);
+ if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
+ return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
return SDValue();
}
@@ -2787,8 +2881,8 @@
AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
}
-static SDValue simplifyI24(SDNode *Node24,
- TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue simplifyMul24(SDNode *Node24,
+ TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
@@ -2890,9 +2984,8 @@
// Expand unaligned loads earlier than legalization. Due to visitation order
// problems during legalization, the emitted instructions to pack and unpack
// the bytes again are not eliminated in the case of an unaligned copy.
- if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
- LN->getMemOperand()->getFlags(),
- &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(
+ VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
SDValue Ops[2];
if (VT.isVector())
@@ -2946,9 +3039,8 @@
// order problems during legalization, the emitted instructions to pack and
// unpack the bytes again are not eliminated in the case of an unaligned
// copy.
- if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
- SN->getMemOperand()->getFlags(),
- &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(
+ VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
if (VT.isVector())
return scalarizeVectorStore(SN, DAG);
@@ -3010,7 +3102,7 @@
switch (IID) {
case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_mul_u24:
- return simplifyI24(N, DCI);
+ return simplifyMul24(N, DCI);
case Intrinsic::amdgcn_fract:
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rcp_legacy:
@@ -3312,6 +3404,13 @@
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
+ // Don't generate 24-bit multiplies on values that are in SGPRs, since
+ // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
+ // unnecessarily). isDivergent() is used as an approximation of whether the
+ // value is in an SGPR.
+ if (!N->isDivergent())
+ return SDValue();
+
unsigned Size = VT.getSizeInBits();
if (VT.isVector() || Size > 64)
return SDValue();
@@ -3362,6 +3461,15 @@
if (!Subtarget->hasMulI24() || VT.isVector())
return SDValue();
+ // Don't generate 24-bit multiplies on values that are in SGPRs, since
+ // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
+ // unnecessarily). isDivergent() is used as an approximation of whether the
+ // value is in an SGPR.
+ // This doesn't apply if no s_mul_hi is available (since we'll end up with a
+ // valu op anyway)
+ if (Subtarget->hasSMulHi() && !N->isDivergent())
+ return SDValue();
+
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
@@ -3386,6 +3494,15 @@
if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
return SDValue();
+ // Don't generate 24-bit multiplies on values that are in SGPRs, since
+ // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
+ // unnecessarily). isDivergent() is used as an approximation of whether the
+ // value is in an SGPR.
+ // This doesn't apply if no s_mul_hi is available (since we'll end up with a
+ // valu op anyway)
+ if (Subtarget->hasSMulHi() && !N->isDivergent())
+ return SDValue();
+
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
@@ -3985,11 +4102,8 @@
case AMDGPUISD::MUL_I24:
case AMDGPUISD::MUL_U24:
case AMDGPUISD::MULHI_I24:
- case AMDGPUISD::MULHI_U24: {
- if (SDValue V = simplifyI24(N, DCI))
- return V;
- return SDValue();
- }
+ case AMDGPUISD::MULHI_U24:
+ return simplifyMul24(N, DCI);
case ISD::SELECT:
return performSelectCombine(N, DCI);
case ISD::FNEG:
@@ -4159,8 +4273,13 @@
int64_t Offset) const {
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
+ // Stores to the argument stack area are relative to the stack pointer.
+ SDValue SP =
+ DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
+ Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
MachineMemOperand::MODereferenceable);
return Store;
@@ -4297,7 +4416,6 @@
NODE_NAME_CASE(CVT_PK_I16_I32)
NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16)
- NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
@@ -4350,6 +4468,8 @@
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
+ NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
+ NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
@@ -4425,8 +4545,7 @@
break;
}
- case AMDGPUISD::FP_TO_FP16:
- case AMDGPUISD::FP16_ZEXT: {
+ case AMDGPUISD::FP_TO_FP16: {
unsigned BitWidth = Known.getBitWidth();
// High bits are zero.
@@ -4573,7 +4692,6 @@
case AMDGPUISD::BUFFER_LOAD_USHORT:
return 16;
case AMDGPUISD::FP_TO_FP16:
- case AMDGPUISD::FP16_ZEXT:
return 16;
default:
return 1;
@@ -4727,3 +4845,8 @@
return AtomicExpansionKind::None;
}
}
+
+bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtactLegal(
+ unsigned Opc, LLT Ty1, LLT Ty2) const {
+ return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
+}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index ce3618f..e61021d 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -64,10 +64,9 @@
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+ SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
@@ -328,6 +327,9 @@
}
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+
+ bool isConstantUnsignedBitfieldExtactLegal(unsigned Opc, LLT Ty1,
+ LLT Ty2) const override;
};
namespace AMDGPUISD {
@@ -458,9 +460,6 @@
// are known 0.
FP_TO_FP16,
- // Wrapper around fp16 results that are known to zero the high bits.
- FP16_ZEXT,
-
/// This node is for VLIW targets and it is used to represent a vector
/// that is stored in consecutive registers with the same channel.
/// For example:
@@ -523,6 +522,8 @@
BUFFER_ATOMIC_CMPSWAP,
BUFFER_ATOMIC_CSUB,
BUFFER_ATOMIC_FADD,
+ BUFFER_ATOMIC_FMIN,
+ BUFFER_ATOMIC_FMAX,
LAST_AMDGPU_ISD_NUMBER
};
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 894677e..0f9cb71 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -132,7 +132,6 @@
def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
-def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
@@ -213,6 +212,8 @@
def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
SDTIntToFPOp, []>;
+def AMDGPUcvt_pk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32",
+ AMDGPUIntPackOp, []>;
// urecip - This operation is a helper for integer division, it returns the
// result of 1 / a as a fractional unsigned integer.
@@ -311,7 +312,7 @@
SDTCisInt<4>]>,
[]>;
-def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
@@ -461,3 +462,7 @@
def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc),
[(int_amdgcn_div_fmas node:$src0, node:$src1, node:$src2, node:$vcc),
(AMDGPUdiv_fmas_impl node:$src0, node:$src1, node:$src2, node:$vcc)]>;
+
+def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+ [(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
+ (AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index bd577a6..323aaaf 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -18,6 +18,7 @@
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
@@ -59,11 +60,13 @@
const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
-void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
- CodeGenCoverage &CoverageInfo) {
+void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
+ CodeGenCoverage &CoverageInfo,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
MRI = &MF.getRegInfo();
Subtarget = &MF.getSubtarget<GCNSubtarget>();
- InstructionSelector::setupMF(MF, KB, CoverageInfo);
+ InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}
bool AMDGPUInstructionSelector::isVCC(Register Reg,
@@ -136,20 +139,29 @@
const TargetRegisterClass *SrcRC
= TRI.getConstrainedRegClassForOperand(Src, *MRI);
- Register MaskedReg = MRI->createVirtualRegister(SrcRC);
+ Optional<ValueAndVReg> ConstVal =
+ getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
+ if (ConstVal) {
+ unsigned MovOpc =
+ STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+ BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
+ .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
+ } else {
+ Register MaskedReg = MRI->createVirtualRegister(SrcRC);
- // We can't trust the high bits at this point, so clear them.
+ // We can't trust the high bits at this point, so clear them.
- // TODO: Skip masking high bits if def is known boolean.
+ // TODO: Skip masking high bits if def is known boolean.
- unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
- AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
- BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
- .addImm(1)
- .addReg(SrcReg);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
- .addImm(0)
- .addReg(MaskedReg);
+ unsigned AndOpc =
+ TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
+ BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
+ .addImm(1)
+ .addReg(SrcReg);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
+ .addImm(0)
+ .addReg(MaskedReg);
+ }
if (!MRI->getRegClassOrNull(SrcReg))
MRI->setRegClass(SrcReg, SrcRC);
@@ -578,7 +590,7 @@
return true;
const LLT S32 = LLT::scalar(32);
- const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
Register Dst = MI.getOperand(0).getReg();
if (MRI->getType(Dst) != V2S16)
@@ -743,6 +755,30 @@
return true;
}
+bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register OffsetReg = MI.getOperand(2).getReg();
+ Register WidthReg = MI.getOperand(3).getReg();
+
+ assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
+ "scalar BFX instructions are expanded in regbankselect");
+ assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
+ "64-bit vector BFX instructions are expanded in regbankselect");
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
+ unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
+ .addReg(SrcReg)
+ .addReg(OffsetReg)
+ .addReg(WidthReg);
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
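
For reference, the bitfield-extract semantics behind the V_BFE_U32_e64 / V_BFE_I32_e64 selection above can be sketched as standalone C++ (assuming 0 < width and offset + width <= 32; hardware details such as masking of the offset/width operands are not modelled):

#include <cstdint>
#include <cstdio>

// Unsigned extract: take Width bits starting at Offset, zero-extended.
static uint32_t ubfx(uint32_t Src, unsigned Offset, unsigned Width) {
  uint32_t Mask = (Width == 32) ? ~0u : ((1u << Width) - 1);
  return (Src >> Offset) & Mask;
}

// Signed extract: same field, sign-extended from bit Width-1.
static int32_t sbfx(uint32_t Src, unsigned Offset, unsigned Width) {
  uint32_t Field = ubfx(Src, Offset, Width);
  uint32_t SignBit = 1u << (Width - 1);
  return static_cast<int32_t>((Field ^ SignBit) - SignBit);
}

int main() {
  std::printf("0x%x\n", (unsigned)ubfx(0xDEADBEEFu, 8, 8)); // 0xbe
  std::printf("%d\n", (int)sbfx(0xDEADBEEFu, 8, 8));        // -66 (0xbe sign-extended)
  return 0;
}
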
bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
if (STI.getLDSBankCount() != 16)
return selectImpl(MI, *CoverageInfo);
@@ -916,8 +952,11 @@
return constrainCopyLikeIntrin(I, AMDGPU::WQM);
case Intrinsic::amdgcn_softwqm:
return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
+ case Intrinsic::amdgcn_strict_wwm:
case Intrinsic::amdgcn_wwm:
- return constrainCopyLikeIntrin(I, AMDGPU::WWM);
+ return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
+ case Intrinsic::amdgcn_strict_wqm:
+ return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
case Intrinsic::amdgcn_writelane:
return selectWritelane(I);
case Intrinsic::amdgcn_div_scale:
@@ -1375,7 +1414,24 @@
if (HasVSrc) {
Register VSrc = MI.getOperand(1).getReg();
- MIB.addReg(VSrc);
+
+ if (STI.needsAlignedVGPRs()) {
+ // Add implicit aligned super-reg to force alignment on the data operand.
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+ Register NewVR =
+ MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
+ BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
+ .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
+ .addImm(AMDGPU::sub0)
+ .addReg(Undef)
+ .addImm(AMDGPU::sub1);
+ MIB.addReg(NewVR, 0, AMDGPU::sub0);
+ MIB.addReg(NewVR, RegState::Implicit);
+ } else {
+ MIB.addReg(VSrc);
+ }
+
if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
return false;
}
@@ -1446,24 +1502,6 @@
return TexFailCtrl == 0;
}
-static bool parseCachePolicy(uint64_t Value,
- bool *GLC, bool *SLC, bool *DLC) {
- if (GLC) {
- *GLC = (Value & 0x1) ? 1 : 0;
- Value &= ~(uint64_t)0x1;
- }
- if (SLC) {
- *SLC = (Value & 0x2) ? 1 : 0;
- Value &= ~(uint64_t)0x2;
- }
- if (DLC) {
- *DLC = (Value & 0x4) ? 1 : 0;
- Value &= ~(uint64_t)0x4;
- }
-
- return Value == 0;
-}
-
bool AMDGPUInstructionSelector::selectImageIntrinsic(
MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
MachineBasicBlock *MBB = MI.getParent();
@@ -1504,8 +1542,8 @@
const bool IsA16 = (Flags & 1) != 0;
const bool IsG16 = (Flags & 2) != 0;
- // A16 implies 16 bit gradients
- if (IsA16 && !IsG16)
+ // A16 implies 16-bit gradients if the subtarget doesn't support G16
+ if (IsA16 && !STI.hasG16() && !IsG16)
return false;
unsigned DMask = 0;
@@ -1589,21 +1627,11 @@
// TODO: Check this in verifier.
assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
- bool GLC = false;
- bool SLC = false;
- bool DLC = false;
- if (BaseOpcode->Atomic) {
- GLC = true; // TODO no-return optimization
- if (!parseCachePolicy(
- MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr,
- &SLC, IsGFX10Plus ? &DLC : nullptr))
- return false;
- } else {
- if (!parseCachePolicy(
- MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC,
- &SLC, IsGFX10Plus ? &DLC : nullptr))
- return false;
- }
+ unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
+ if (BaseOpcode->Atomic)
+ CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+ if (CPol & ~AMDGPU::CPol::ALL)
+ return false;
int NumVAddrRegs = 0;
int NumVAddrDwords = 0;
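The removed parseCachePolicy helper shows the bit layout (GLC in bit 0, SLC in bit 1, DLC in bit 2) that the single cache-policy immediate now carries directly. A small sketch of the equivalent packing and validation, using hypothetical local constants in place of AMDGPU::CPol (whose real ALL mask may cover additional bits):

    #include <cstdint>

    // Hypothetical stand-ins mirroring the bit positions of the removed
    // parseCachePolicy helper; the real definitions live in AMDGPU::CPol.
    enum : uint64_t {
      CP_GLC = 0x1,
      CP_SLC = 0x2,
      CP_DLC = 0x4,
      CP_ALL = CP_GLC | CP_SLC | CP_DLC
    };

    // Mirrors the `CPol & ~AMDGPU::CPol::ALL` rejection above: any bit set
    // outside the known mask makes the policy immediate invalid.
    bool isValidCachePolicy(uint64_t CPol) { return (CPol & ~CP_ALL) == 0; }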
@@ -1661,8 +1689,10 @@
unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
MIB.addDef(TmpReg);
- BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
- .addReg(TmpReg, RegState::Kill, SubReg);
+ if (!MRI->use_empty(VDataOut)) {
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
+ .addReg(TmpReg, RegState::Kill, SubReg);
+ }
} else {
MIB.addDef(VDataOut); // vdata output
@@ -1689,11 +1719,8 @@
if (IsGFX10Plus)
MIB.addImm(DimInfo->Encoding);
MIB.addImm(Unorm);
- if (IsGFX10Plus)
- MIB.addImm(DLC);
- MIB.addImm(GLC);
- MIB.addImm(SLC);
+ MIB.addImm(CPol);
MIB.addImm(IsA16 && // a16 or r128
STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
if (IsGFX10Plus)
@@ -1706,6 +1733,38 @@
if (BaseOpcode->HasD16)
MIB.addImm(IsD16 ? -1 : 0);
+ if (IsTexFail) {
+ // An image load instruction with TFE/LWE only conditionally writes to its
+ // result registers. Initialize them to zero so that we always get well
+ // defined result values.
+ assert(VDataOut && !VDataIn);
+ Register Tied = MRI->cloneVirtualRegister(VDataOut);
+ Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
+ .addImm(0);
+ auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
+ if (STI.usePRTStrictNull()) {
+ // With enable-prt-strict-null enabled, initialize all result registers to
+ // zero.
+ auto RegSeq =
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
+ for (auto Sub : Parts)
+ RegSeq.addReg(Zero).addImm(Sub);
+ } else {
+ // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
+ // result register.
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+ auto RegSeq =
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
+ for (auto Sub : Parts.drop_back(1))
+ RegSeq.addReg(Undef).addImm(Sub);
+ RegSeq.addReg(Zero).addImm(Parts.back());
+ }
+ MIB.addReg(Tied, RegState::Implicit);
+ MIB->tieOperands(0, MIB->getNumOperands() - 1);
+ }
+
MI.eraseFromParent();
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
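Since a TFE/LWE image load only conditionally writes its result registers, the block above pre-initializes the tied value: every dword when enable-prt-strict-null is in effect, otherwise only the extra status dword. A toy sketch of that choice, using a plain vector of dwords in place of the REG_SEQUENCE and -1 for the IMPLICIT_DEF parts:

    #include <cstdint>
    #include <vector>

    // Sketch only: choose initial values for the result parts of a TFE/LWE
    // image load. -1 models an undefined (IMPLICIT_DEF) part.
    std::vector<int64_t> initTFEResult(unsigned NumDwords, bool StrictNull) {
      std::vector<int64_t> Parts(NumDwords, -1);
      if (StrictNull) {
        for (auto &P : Parts)   // zero every result dword
          P = 0;
      } else {
        Parts.back() = 0;       // only the trailing TFE/LWE status dword
      }
      return Parts;
    }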
@@ -1733,7 +1792,7 @@
case Intrinsic::amdgcn_s_barrier:
return selectSBarrier(I);
case Intrinsic::amdgcn_global_atomic_fadd:
- return selectGlobalAtomicFaddIntrinsic(I);
+ return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
default: {
return selectImpl(I, *CoverageInfo);
}
@@ -1848,7 +1907,7 @@
return false;
}
- if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
+ if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
@@ -2336,6 +2395,13 @@
bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
MachineInstr &I) const {
+ if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
+ const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
+ unsigned AS = PtrTy.getAddressSpace();
+ if (AS == AMDGPUAS::GLOBAL_ADDRESS)
+ return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
+ }
+
initM0(I);
return selectImpl(I, *CoverageInfo);
}
@@ -2386,8 +2452,7 @@
MIB.addImm(0);
MIB.addImm(Offset);
- MIB.addImm(1); // glc
- MIB.addImm(0); // slc
+ MIB.addImm(AMDGPU::CPol::GLC);
MIB.cloneMemRefs(MI);
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
@@ -2772,7 +2837,7 @@
Register Src1Reg = MI.getOperand(2).getReg();
ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
- const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
return false;
@@ -2895,6 +2960,8 @@
bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
MachineInstr &MI) const {
+ if (STI.hasGFX90AInsts())
+ return selectImpl(MI, *CoverageInfo);
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -2951,7 +3018,7 @@
if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
- Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
.addReg(VIndex.getReg())
.addImm(AMDGPU::sub0)
@@ -2968,7 +3035,7 @@
I.add(MI.getOperand(2)); // rsrc
I.add(SOffset);
I.addImm(Offset);
- renderExtractSLC(I, MI, 7);
+ I.addImm(MI.getOperand(7).getImm()); // cpol
I.cloneMemRefs(MI);
MI.eraseFromParent();
@@ -2976,8 +3043,14 @@
return true;
}
-bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
- MachineInstr &MI) const{
+bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
+ MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const {
+
+ if (STI.hasGFX90AInsts()) {
+ // gfx90a adds return versions of the global atomic fadd instructions so no
+ // special handling is required.
+ return selectImpl(MI, *CoverageInfo);
+ }
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -2994,16 +3067,16 @@
// FIXME: This is only needed because tablegen requires number of dst operands
// in match and replace pattern to be the same. Otherwise patterns can be
// exported from SDag path.
- auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2));
+ auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal);
- Register Data = MI.getOperand(3).getReg();
+ Register Data = DataOp.getReg();
const unsigned Opc = MRI->getType(Data).isVector() ?
AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
.addReg(Addr.first)
.addReg(Data)
.addImm(Addr.second)
- .addImm(0) // SLC
+ .addImm(0) // cpol
.cloneMemRefs(MI);
MI.eraseFromParent();
@@ -3140,6 +3213,9 @@
return selectBVHIntrinsic(I);
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
+ case AMDGPU::G_SBFX:
+ case AMDGPU::G_UBFX:
+ return selectG_SBFX_UBFX(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -3282,7 +3358,7 @@
if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
// It's possible to see an f32 fneg here, but unlikely.
// TODO: Treat f32 fneg as only high bit.
- MRI.getType(Src) == LLT::vector(2, 16)) {
+ MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
Src = MI->getOperand(1).getReg();
MI = MRI.getVRegDef(Src);
@@ -3408,9 +3484,9 @@
}};
}
-template <bool Signed>
std::pair<Register, int>
-AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
+ uint64_t FlatVariant) const {
MachineInstr *MI = Root.getParent();
auto Default = std::make_pair(Root.getReg(), 0);
@@ -3426,7 +3502,7 @@
return Default;
unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
- if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, Signed))
+ if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
return Default;
return std::make_pair(PtrBase, ConstOffset);
@@ -3434,7 +3510,7 @@
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
- auto PtrWithOffset = selectFlatOffsetImpl<false>(Root);
+ auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
@@ -3443,8 +3519,18 @@
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
- auto PtrWithOffset = selectFlatOffsetImpl<true>(Root);
+AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
+ auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
+ auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
@@ -3483,39 +3569,56 @@
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0) {
- if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
+ if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
- } else if (ConstOffset > 0) {
+ } else {
auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
if (!PtrBaseDef)
return None;
if (isSGPR(PtrBaseDef->Reg)) {
- // Offset is too large.
- //
- // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset)
- // + (large_offset & MaxOffset);
- int64_t SplitImmOffset, RemainderOffset;
- std::tie(SplitImmOffset, RemainderOffset)
- = TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true);
+ if (ConstOffset > 0) {
+ // Offset is too large.
+ //
+ // saddr + large_offset -> saddr +
+ // (voffset = large_offset & ~MaxOffset) +
+ // (large_offset & MaxOffset);
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
+ ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
- if (isUInt<32>(RemainderOffset)) {
- MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
- Register HighBits
- = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (isUInt<32>(RemainderOffset)) {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ Register HighBits =
+ MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
- HighBits)
- .addImm(RemainderOffset);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
+ HighBits)
+ .addImm(RemainderOffset);
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
- [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
- }};
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addReg(HighBits);
+ }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
+ }};
+ }
}
+
+ // We are adding a 64-bit SGPR and a constant. If the constant bus limit
+ // is 1 we would need to perform 1 or 2 extra moves for each half of
+ // the constant, and it is better to do a scalar add and then issue a
+ // single VALU instruction to materialize zero. Otherwise it takes fewer
+ // instructions to perform VALU adds with immediates or inline literals.
+ unsigned NumLiterals =
+ !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
+ !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
+ if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
+ return None;
}
}
}
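The split described in the comment above keeps the low part of a too-large positive offset in the instruction's immediate field and materializes the rest in the VGPR offset. A worked sketch with a hypothetical 0xFFF immediate mask (the real bounds come from TII.splitFlatOffset):

    #include <cstdint>
    #include <utility>

    // Hypothetical split of a large positive global-address offset:
    // imm = offset & MaxOffset stays in the immediate field,
    // remainder = offset & ~MaxOffset is moved into the voffset VGPR.
    std::pair<int64_t, int64_t> splitLargeOffset(int64_t Offset,
                                                 int64_t MaxOffset = 0xFFF) {
      int64_t Imm = Offset & MaxOffset;
      int64_t Remainder = Offset & ~MaxOffset;
      // e.g. Offset = 0x12345 -> Imm = 0x345, Remainder = 0x12000.
      return {Imm, Remainder};  // (SplitImmOffset, RemainderOffset)
    }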
@@ -3525,57 +3628,50 @@
return None;
// Match the variable offset.
- if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
- // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
- // drop this.
- if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
- AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
- return None;
+ if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
+ // Look through the SGPR->VGPR copy.
+ Register SAddr =
+ getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
- // It's cheaper to materialize a single 32-bit zero for vaddr than the two
- // moves required to copy a 64-bit SGPR to VGPR.
- const Register SAddr = AddrDef->Reg;
- if (!isSGPR(SAddr))
- return None;
+ if (SAddr && isSGPR(SAddr)) {
+ Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
- MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
- Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
- VOffset)
- .addImm(0);
-
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
- [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
- }};
+ // It's possible voffset is an SGPR here, but the copy to VGPR will be
+ // inserted later.
+ if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ return {{[=](MachineInstrBuilder &MIB) { // saddr
+ MIB.addReg(SAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // voffset
+ MIB.addReg(VOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(ImmOffset);
+ }}};
+ }
+ }
}
- // Look through the SGPR->VGPR copy.
- Register SAddr =
- getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
- if (!SAddr || !isSGPR(SAddr))
+ // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
+ // drop this.
+ if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
+ AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
return None;
- Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+ // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+ // moves required to copy a 64-bit SGPR to VGPR.
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // It's possible voffset is an SGPR here, but the copy to VGPR will be
- // inserted later.
- Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset);
- if (!VOffset)
- return None;
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
+ .addImm(0);
- return {{[=](MachineInstrBuilder &MIB) { // saddr
- MIB.addReg(SAddr);
- },
- [=](MachineInstrBuilder &MIB) { // voffset
- MIB.addReg(VOffset);
- },
- [=](MachineInstrBuilder &MIB) { // offset
- MIB.addImm(ImmOffset);
- }}};
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ }};
}
InstructionSelector::ComplexRendererFns
@@ -3590,7 +3686,8 @@
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0 &&
- TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+ TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
}
@@ -3624,9 +3721,9 @@
const DebugLoc &DL = I.getDebugLoc();
SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), SAddr)
- .addFrameIndex(FI)
- .addReg(RHSDef->Reg);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
+ .addFrameIndex(FI)
+ .addReg(RHSDef->Reg);
}
}
@@ -3639,11 +3736,6 @@
}};
}
-static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
- auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
- return PSV && PSV->isStack();
-}
-
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
@@ -3685,23 +3777,19 @@
Optional<int> FI;
Register VAddr = Root.getReg();
if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
- if (isBaseWithConstantOffset(Root, *MRI)) {
- const MachineOperand &LHS = RootDef->getOperand(1);
- const MachineOperand &RHS = RootDef->getOperand(2);
- const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
- const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
- if (LHSDef && RHSDef) {
- int64_t PossibleOffset =
- RHSDef->getOperand(1).getCImm()->getSExtValue();
- if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
- (!STI.privateMemoryResourceIsRangeChecked() ||
- KnownBits->signBitIsZero(LHS.getReg()))) {
- if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
- FI = LHSDef->getOperand(1).getIndex();
- else
- VAddr = LHS.getReg();
- Offset = PossibleOffset;
- }
+ Register PtrBase;
+ int64_t ConstOffset;
+ std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
+ if (ConstOffset != 0) {
+ if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
+ (!STI.privateMemoryResourceIsRangeChecked() ||
+ KnownBits->signBitIsZero(PtrBase))) {
+ const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
+ if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
+ FI = PtrBaseDef->getOperand(1).getIndex();
+ else
+ VAddr = PtrBase;
+ Offset = ConstOffset;
}
} else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
FI = RootDef->getOperand(1).getIndex();
@@ -3769,18 +3857,13 @@
const MachineFunction *MF = MBB->getParent();
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
- const MachineMemOperand *MMO = *MI->memoperands_begin();
- const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
return {{
[=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
},
[=](MachineInstrBuilder &MIB) { // soffset
- if (isStackPtrRelative(PtrInfo))
- MIB.addReg(Info->getStackPtrOffsetReg());
- else
- MIB.addImm(0);
+ MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
}};
@@ -4130,10 +4213,8 @@
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset);
},
- addZeroImm, // glc
- addZeroImm, // slc
+ addZeroImm, // cpol
addZeroImm, // tfe
- addZeroImm, // dlc
addZeroImm // swz
}};
}
@@ -4158,11 +4239,9 @@
MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
- addZeroImm, // glc
- addZeroImm, // slc
+ addZeroImm, // cpol
addZeroImm, // tfe
- addZeroImm, // dlc
- addZeroImm // swz
+ addZeroImm, // swz
}};
}
@@ -4194,7 +4273,9 @@
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset);
},
- addZeroImm // slc
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addImm(AMDGPU::CPol::GLC); // cpol
+ }
}};
}
@@ -4218,7 +4299,7 @@
MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
- addZeroImm // slc
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
}};
}
@@ -4308,25 +4389,11 @@
MIB.addImm(MI.getOperand(OpIdx).getImm());
}
-void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
- const MachineInstr &MI,
- int OpIdx) const {
+void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
-}
-
-void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
- const MachineInstr &MI,
- int OpIdx) const {
- assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
-}
-
-void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
- const MachineInstr &MI,
- int OpIdx) const {
- assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
+ MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
}
void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
@@ -4336,6 +4403,13 @@
MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
}
+void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
+}
+
void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index d70f180..cb05a1c 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -36,6 +36,8 @@
class AMDGPUInstrInfo;
class AMDGPURegisterBankInfo;
class AMDGPUTargetMachine;
+class BlockFrequencyInfo;
+class ProfileSummaryInfo;
class GCNSubtarget;
class MachineInstr;
class MachineIRBuilder;
@@ -45,6 +47,7 @@
class SIInstrInfo;
class SIMachineFunctionInfo;
class SIRegisterInfo;
+class TargetRegisterClass;
class AMDGPUInstructionSelector final : public InstructionSelector {
private:
@@ -59,8 +62,9 @@
bool select(MachineInstr &I) override;
static const char *getName();
- void setupMF(MachineFunction &MF, GISelKnownBits &KB,
- CodeGenCoverage &CoverageInfo) override;
+ void setupMF(MachineFunction &MF, GISelKnownBits *KB,
+ CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) override;
private:
struct GEPInfo {
@@ -105,6 +109,7 @@
bool selectG_PTR_ADD(MachineInstr &I) const;
bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
bool selectG_INSERT(MachineInstr &I) const;
+ bool selectG_SBFX_UBFX(MachineInstr &I) const;
bool selectInterpP1F16(MachineInstr &MI) const;
bool selectWritelane(MachineInstr &MI) const;
@@ -143,7 +148,8 @@
bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;
bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
- bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const;
+ bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
+ MachineOperand &DataOp) const;
bool selectBVHIntrinsic(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
@@ -187,14 +193,15 @@
InstructionSelector::ComplexRendererFns
selectSmrdSgpr(MachineOperand &Root) const;
- template <bool Signed>
- std::pair<Register, int>
- selectFlatOffsetImpl(MachineOperand &Root) const;
+ std::pair<Register, int> selectFlatOffsetImpl(MachineOperand &Root,
+ uint64_t FlatVariant) const;
InstructionSelector::ComplexRendererFns
selectFlatOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
- selectFlatOffsetSigned(MachineOperand &Root) const;
+ selectGlobalOffset(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectScratchOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddr(MachineOperand &Root) const;
@@ -274,26 +281,6 @@
void renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
- void renderTruncTImm1(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const {
- renderTruncTImm(MIB, MI, OpIdx);
- }
-
- void renderTruncTImm8(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const {
- renderTruncTImm(MIB, MI, OpIdx);
- }
-
- void renderTruncTImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const {
- renderTruncTImm(MIB, MI, OpIdx);
- }
-
- void renderTruncTImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const {
- renderTruncTImm(MIB, MI, OpIdx);
- }
-
void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
@@ -302,14 +289,13 @@
void renderPopcntImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
- void renderExtractGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const;
- void renderExtractSLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const;
- void renderExtractDLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const;
+ void renderExtractCPol(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+ void renderSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+
void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 8ef9c99..119c408 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -83,8 +83,7 @@
// Add a predicate to the list if it does not already exist, to deduplicate it.
class PredConcat<list<Predicate> lst, Predicate pred> {
list<Predicate> ret =
- !listconcat([pred], !filter(item, lst,
- !ne(!cast<string>(item), !cast<string>(pred))));
+ !listconcat([pred], !filter(item, lst, !ne(item, pred)));
}
class PredicateControl {
@@ -185,6 +184,28 @@
}];
}
+class is_canonicalized<SDPatternOperator op> : PatFrag<
+ (ops node:$src0, node:$src1),
+ (op $src0, $src1),
+ [{
+ const SITargetLowering &Lowering =
+ *static_cast<const SITargetLowering *>(getTargetLowering());
+
+ return Lowering.isCanonicalized(*CurDAG, N->getOperand(0)) &&
+ Lowering.isCanonicalized(*CurDAG, N->getOperand(1));
+ }]> {
+
+ // TODO: Improve the Legalizer for g_build_vector in Global Isel to match this class
+ let GISelPredicateCode = [{
+ const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
+ MF.getSubtarget().getTargetLowering());
+
+ return TLI->isCanonicalized(MI.getOperand(1).getReg(), const_cast<MachineFunction&>(MF)) &&
+ TLI->isCanonicalized(MI.getOperand(2).getReg(), const_cast<MachineFunction&>(MF));
+ }];
+}
+
+
let Properties = [SDNPCommutative, SDNPAssociative] in {
def smax_oneuse : HasOneUseBinOp<smax>;
def smin_oneuse : HasOneUseBinOp<smin>;
@@ -596,12 +617,6 @@
(vt rc:$addr)
>;
-// fshr pattern
-class FSHRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
- (fshr i32:$src0, i32:$src1, i32:$src2),
- (BIT_ALIGN $src0, $src1, $src2)
->;
-
// rotr pattern
class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
(rotr i32:$src0, i32:$src1),
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 8aea33c..4971b01 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -165,10 +165,12 @@
PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
auto *NewPtr = IRB.CreateBitCast(
- IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
- Offset - Adjust),
+ IRB.CreateConstGEP1_64(
+ IRB.getInt8Ty(),
+ IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
+ Offset - Adjust),
Int32PtrTy);
- LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
+ LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
NewLd->copyMetadata(LI);
NewLd->setMetadata(LLVMContext::MD_range, nullptr);
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 9f359c2..c1a9b30 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -18,7 +18,9 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -47,7 +49,7 @@
static LLT getPow2VectorType(LLT Ty) {
unsigned NElts = Ty.getNumElements();
unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
- return Ty.changeNumElements(Pow2NElts);
+ return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}
// Round the number of bits to the next power of two bits
@@ -93,7 +95,8 @@
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
const LLT EltTy = Ty.getElementType();
- return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
+ return std::make_pair(TypeIdx,
+ LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
};
}
@@ -104,7 +107,9 @@
unsigned Size = Ty.getSizeInBits();
unsigned Pieces = (Size + 63) / 64;
unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
- return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
+ return std::make_pair(
+ TypeIdx,
+ LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
};
}
@@ -122,7 +127,7 @@
assert(EltSize < 32);
const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
- return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
+ return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
};
}
@@ -136,7 +141,7 @@
return LLT::scalar(Size);
}
- return LLT::scalarOrVector(Size / 32, 32);
+ return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
@@ -151,7 +156,8 @@
const LLT Ty = Query.Types[TypeIdx];
unsigned Size = Ty.getSizeInBits();
assert(Size % 32 == 0);
- return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
+ return std::make_pair(
+ TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
};
}
@@ -220,11 +226,13 @@
};
}
-static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
+// If we have a truncating store or an extending load with a data size larger
+// than 32 bits, we need to reduce to a 32-bit type.
+static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
- Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
+ Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
};
}
@@ -257,15 +265,14 @@
}
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
- const LegalityQuery &Query,
- unsigned Opcode) {
+ const LegalityQuery &Query) {
const LLT Ty = Query.Types[0];
// Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
- const bool IsLoad = Opcode != AMDGPU::G_STORE;
+ const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
unsigned RegSize = Ty.getSizeInBits();
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
unsigned AS = Query.Types[1].getAddressSpace();
@@ -273,6 +280,10 @@
if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return false;
+ // Do not handle extending vector loads.
+ if (Ty.isVector() && MemSize != RegSize)
+ return false;
+
// TODO: We should be able to widen loads if the alignment is high enough, but
// we also need to modify the memory access size.
#if 0
@@ -341,33 +352,37 @@
return EltSize != 32 && EltSize != 64;
}
-static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
- unsigned Opcode) {
+static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
const LLT Ty = Query.Types[0];
- return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
+ return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
!loadStoreBitcastWorkaround(Ty);
}
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
- const unsigned MemSizeInBits) {
+ const LLT MemTy) {
+ const unsigned MemSizeInBits = MemTy.getSizeInBits();
const unsigned Size = Ty.getSizeInBits();
- if (Size != MemSizeInBits)
- return Size <= 32 && Ty.isVector();
+ if (Size != MemSizeInBits)
+ return Size <= 32 && Ty.isVector();
if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
return true;
- return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
+
+ // Don't try to handle bitcasting vector ext loads for now.
+ return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
+ (Size <= 32 || isRegisterSize(Size)) &&
!isRegisterVectorElementType(Ty.getElementType());
}
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
-static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
+static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
unsigned AlignInBits, unsigned AddrSpace,
unsigned Opcode) {
+ unsigned SizeInBits = MemoryTy.getSizeInBits();
// We don't want to widen cases that are naturally legal.
if (isPowerOf2_32(SizeInBits))
return false;
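As the comments note, widening is only attempted for sizes that are not already a power of two, and only when the access is aligned at least to the widened size so the extra bytes are known readable. A rough sketch of that core check (the real predicate also restricts the address space and the maximum widened size):

    #include <cstdint>

    // Sketch of the core widening test: e.g. a 96-bit load aligned to 128 bits
    // may be widened to 128 bits, while power-of-two sizes are left alone.
    bool mayWidenOddSizedLoad(unsigned SizeInBits, unsigned AlignInBits) {
      if (SizeInBits == 0 || (SizeInBits & (SizeInBits - 1)) == 0)
        return false;                 // already naturally legal
      unsigned Widened = 1;
      while (Widened < SizeInBits)
        Widened <<= 1;                // next power of two
      return AlignInBits >= Widened;  // alignment must cover the extra bytes
    }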
@@ -403,7 +418,7 @@
if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
return false;
- return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
+ return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
Query.MMODescrs[0].AlignInBits,
Query.Types[1].getAddressSpace(), Opcode);
}
@@ -427,35 +442,35 @@
const LLT S512 = LLT::scalar(512);
const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
- const LLT V2S8 = LLT::vector(2, 8);
- const LLT V2S16 = LLT::vector(2, 16);
- const LLT V4S16 = LLT::vector(4, 16);
+ const LLT V2S8 = LLT::fixed_vector(2, 8);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
+ const LLT V4S16 = LLT::fixed_vector(4, 16);
- const LLT V2S32 = LLT::vector(2, 32);
- const LLT V3S32 = LLT::vector(3, 32);
- const LLT V4S32 = LLT::vector(4, 32);
- const LLT V5S32 = LLT::vector(5, 32);
- const LLT V6S32 = LLT::vector(6, 32);
- const LLT V7S32 = LLT::vector(7, 32);
- const LLT V8S32 = LLT::vector(8, 32);
- const LLT V9S32 = LLT::vector(9, 32);
- const LLT V10S32 = LLT::vector(10, 32);
- const LLT V11S32 = LLT::vector(11, 32);
- const LLT V12S32 = LLT::vector(12, 32);
- const LLT V13S32 = LLT::vector(13, 32);
- const LLT V14S32 = LLT::vector(14, 32);
- const LLT V15S32 = LLT::vector(15, 32);
- const LLT V16S32 = LLT::vector(16, 32);
- const LLT V32S32 = LLT::vector(32, 32);
+ const LLT V2S32 = LLT::fixed_vector(2, 32);
+ const LLT V3S32 = LLT::fixed_vector(3, 32);
+ const LLT V4S32 = LLT::fixed_vector(4, 32);
+ const LLT V5S32 = LLT::fixed_vector(5, 32);
+ const LLT V6S32 = LLT::fixed_vector(6, 32);
+ const LLT V7S32 = LLT::fixed_vector(7, 32);
+ const LLT V8S32 = LLT::fixed_vector(8, 32);
+ const LLT V9S32 = LLT::fixed_vector(9, 32);
+ const LLT V10S32 = LLT::fixed_vector(10, 32);
+ const LLT V11S32 = LLT::fixed_vector(11, 32);
+ const LLT V12S32 = LLT::fixed_vector(12, 32);
+ const LLT V13S32 = LLT::fixed_vector(13, 32);
+ const LLT V14S32 = LLT::fixed_vector(14, 32);
+ const LLT V15S32 = LLT::fixed_vector(15, 32);
+ const LLT V16S32 = LLT::fixed_vector(16, 32);
+ const LLT V32S32 = LLT::fixed_vector(32, 32);
- const LLT V2S64 = LLT::vector(2, 64);
- const LLT V3S64 = LLT::vector(3, 64);
- const LLT V4S64 = LLT::vector(4, 64);
- const LLT V5S64 = LLT::vector(5, 64);
- const LLT V6S64 = LLT::vector(6, 64);
- const LLT V7S64 = LLT::vector(7, 64);
- const LLT V8S64 = LLT::vector(8, 64);
- const LLT V16S64 = LLT::vector(16, 64);
+ const LLT V2S64 = LLT::fixed_vector(2, 64);
+ const LLT V3S64 = LLT::fixed_vector(3, 64);
+ const LLT V4S64 = LLT::fixed_vector(4, 64);
+ const LLT V5S64 = LLT::fixed_vector(5, 64);
+ const LLT V6S64 = LLT::fixed_vector(6, 64);
+ const LLT V7S64 = LLT::fixed_vector(7, 64);
+ const LLT V8S64 = LLT::fixed_vector(8, 64);
+ const LLT V16S64 = LLT::fixed_vector(16, 64);
std::initializer_list<LLT> AllS32Vectors =
{V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
@@ -495,8 +510,8 @@
const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
- setAction({G_BRCOND, S1}, Legal); // VCC branches
- setAction({G_BRCOND, S32}, Legal); // SCC branches
+ // s1 for VCC branches, s32 for SCC branches.
+ getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
// TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
// elements for v3s16
@@ -579,11 +594,12 @@
.lower();
}
- getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
- .customFor({S32, S64})
- .clampScalar(0, S32, S64)
- .widenScalarToNextPow2(0, 32)
- .scalarize(0);
+ getActionDefinitionsBuilder(
+ {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
+ .customFor({S32, S64})
+ .clampScalar(0, S32, S64)
+ .widenScalarToNextPow2(0, 32)
+ .scalarize(0);
auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
.legalFor({S32})
@@ -643,7 +659,7 @@
.widenScalarToNextPow2(0, 32)
.clampMaxNumElements(0, S32, 16);
- setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
+ getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
// If the amount is divergent, we have to do a wave reduction to get the
// maximum value, so this is expanded during RegBankSelect.
@@ -653,7 +669,7 @@
getActionDefinitionsBuilder(G_GLOBAL_VALUE)
.customIf(typeIsNot(0, PrivatePtr));
- setAction({G_BLOCK_ADDR, CodePtr}, Legal);
+ getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
auto &FPOpActions = getActionDefinitionsBuilder(
{ G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
@@ -809,7 +825,7 @@
auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
.legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
- .customFor({{S64, S64}})
+ .customFor({{S64, S32}, {S64, S64}})
.narrowScalarFor({{S64, S16}}, changeTo(0, S32));
if (ST.has16BitInsts())
FPToI.legalFor({{S16, S16}});
@@ -817,6 +833,7 @@
FPToI.minScalar(1, S32);
FPToI.minScalar(0, S32)
+ .widenScalarToNextPow2(0, 32)
.scalarize(0)
.lower();
@@ -935,10 +952,13 @@
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
+ // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
+ // RegBankSelect.
getActionDefinitionsBuilder(G_BITREVERSE)
- .legalFor({S32})
- .clampScalar(0, S32, S32)
- .scalarize(0);
+ .legalFor({S32, S64})
+ .clampScalar(0, S32, S64)
+ .scalarize(0)
+ .widenScalarToNextPow2(0);
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder(G_BSWAP)
@@ -951,7 +971,7 @@
.scalarize(0);
if (ST.hasVOP3PInsts()) {
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16, V2S16})
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.clampMaxNumElements(0, S16, 2)
@@ -960,7 +980,7 @@
.scalarize(0)
.lower();
} else {
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16})
.widenScalarToNextPow2(0)
.minScalar(0, S16)
@@ -979,7 +999,7 @@
.scalarize(0)
.lower();
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32})
.minScalar(0, S32)
.widenScalarToNextPow2(0)
@@ -1029,7 +1049,7 @@
const LLT DstTy = Query.Types[0];
// Split vector extloads.
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
if (MemSize < DstTy.getSizeInBits())
@@ -1078,35 +1098,35 @@
auto &Actions = getActionDefinitionsBuilder(Op);
// Explicitly list some common cases.
// TODO: Does this help compile time at all?
- Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
- {V2S32, GlobalPtr, 64, GlobalAlign32},
- {V4S32, GlobalPtr, 128, GlobalAlign32},
- {S64, GlobalPtr, 64, GlobalAlign32},
- {V2S64, GlobalPtr, 128, GlobalAlign32},
- {V2S16, GlobalPtr, 32, GlobalAlign32},
- {S32, GlobalPtr, 8, GlobalAlign8},
- {S32, GlobalPtr, 16, GlobalAlign16},
+ Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
+ {V2S32, GlobalPtr, V2S32, GlobalAlign32},
+ {V4S32, GlobalPtr, V4S32, GlobalAlign32},
+ {S64, GlobalPtr, S64, GlobalAlign32},
+ {V2S64, GlobalPtr, V2S64, GlobalAlign32},
+ {V2S16, GlobalPtr, V2S16, GlobalAlign32},
+ {S32, GlobalPtr, S8, GlobalAlign8},
+ {S32, GlobalPtr, S16, GlobalAlign16},
- {S32, LocalPtr, 32, 32},
- {S64, LocalPtr, 64, 32},
- {V2S32, LocalPtr, 64, 32},
- {S32, LocalPtr, 8, 8},
- {S32, LocalPtr, 16, 16},
- {V2S16, LocalPtr, 32, 32},
+ {S32, LocalPtr, S32, 32},
+ {S64, LocalPtr, S64, 32},
+ {V2S32, LocalPtr, V2S32, 32},
+ {S32, LocalPtr, S8, 8},
+ {S32, LocalPtr, S16, 16},
+ {V2S16, LocalPtr, S32, 32},
- {S32, PrivatePtr, 32, 32},
- {S32, PrivatePtr, 8, 8},
- {S32, PrivatePtr, 16, 16},
- {V2S16, PrivatePtr, 32, 32},
+ {S32, PrivatePtr, S32, 32},
+ {S32, PrivatePtr, S8, 8},
+ {S32, PrivatePtr, S16, 16},
+ {V2S16, PrivatePtr, S32, 32},
- {S32, ConstantPtr, 32, GlobalAlign32},
- {V2S32, ConstantPtr, 64, GlobalAlign32},
- {V4S32, ConstantPtr, 128, GlobalAlign32},
- {S64, ConstantPtr, 64, GlobalAlign32},
- {V2S32, ConstantPtr, 32, GlobalAlign32}});
+ {S32, ConstantPtr, S32, GlobalAlign32},
+ {V2S32, ConstantPtr, V2S32, GlobalAlign32},
+ {V4S32, ConstantPtr, V4S32, GlobalAlign32},
+ {S64, ConstantPtr, S64, GlobalAlign32},
+ {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
Actions.legalIf(
[=](const LegalityQuery &Query) -> bool {
- return isLoadStoreLegal(ST, Query, Op);
+ return isLoadStoreLegal(ST, Query);
});
// Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
@@ -1125,7 +1145,7 @@
Actions.bitcastIf(
[=](const LegalityQuery &Query) -> bool {
return shouldBitcastLoadStoreType(ST, Query.Types[0],
- Query.MMODescrs[0].SizeInBits);
+ Query.MMODescrs[0].MemoryTy);
}, bitcastToRegisterType(0));
if (!IsStore) {
@@ -1148,7 +1168,7 @@
const LLT PtrTy = Query.Types[1];
const unsigned DstSize = DstTy.getSizeInBits();
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
// Split extloads.
if (DstSize > MemSize)
@@ -1196,16 +1216,18 @@
// FIXME: 3 element stores scalarized on SI
// Split if it's too large for the address space.
- if (Query.MMODescrs[0].SizeInBits > MaxSize) {
+ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
+ if (MemSize > MaxSize) {
unsigned NumElts = DstTy.getNumElements();
unsigned EltSize = EltTy.getSizeInBits();
if (MaxSize % EltSize == 0) {
return std::make_pair(
- 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
+ 0, LLT::scalarOrVector(
+ ElementCount::getFixed(MaxSize / EltSize), EltTy));
}
- unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
+ unsigned NumPieces = MemSize / MaxSize;
// FIXME: Refine when odd breakdowns handled
// The scalars will need to be re-legalized.
@@ -1213,12 +1235,11 @@
NumElts % NumPieces != 0)
return std::make_pair(0, EltTy);
- return std::make_pair(0,
- LLT::vector(NumElts / NumPieces, EltTy));
+ return std::make_pair(
+ 0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
}
// FIXME: We could probably handle weird extending loads better.
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
if (DstTy.getSizeInBits() > MemSize)
return std::make_pair(0, EltTy);
@@ -1230,48 +1251,58 @@
// should be OK, since the new parts will be further legalized.
unsigned FloorSize = PowerOf2Floor(DstSize);
return std::make_pair(
- 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
+ 0, LLT::scalarOrVector(
+ ElementCount::getFixed(FloorSize / EltSize), EltTy));
}
// Need to split because of alignment.
unsigned Align = Query.MMODescrs[0].AlignInBits;
if (EltSize > Align &&
(EltSize / Align < DstTy.getNumElements())) {
- return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
+ return std::make_pair(
+ 0, LLT::fixed_vector(EltSize / Align, EltTy));
}
// May need relegalization for the scalars.
return std::make_pair(0, EltTy);
})
.lowerIfMemSizeNotPow2()
- .minScalar(0, S32);
-
- if (IsStore)
- Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
-
- Actions
- .widenScalarToNextPow2(0)
- .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
- .lower();
+ .minScalar(0, S32)
+ .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
+ .widenScalarToNextPow2(0)
+ .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
+ .lower();
}
+ // FIXME: Unaligned accesses not lowered.
auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
- .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
- {S32, GlobalPtr, 16, 2 * 8},
- {S32, LocalPtr, 8, 8},
- {S32, LocalPtr, 16, 16},
- {S32, PrivatePtr, 8, 8},
- {S32, PrivatePtr, 16, 16},
- {S32, ConstantPtr, 8, 8},
- {S32, ConstantPtr, 16, 2 * 8}});
+ .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
+ {S32, GlobalPtr, S16, 2 * 8},
+ {S32, LocalPtr, S8, 8},
+ {S32, LocalPtr, S16, 16},
+ {S32, PrivatePtr, S8, 8},
+ {S32, PrivatePtr, S16, 16},
+ {S32, ConstantPtr, S8, 8},
+ {S32, ConstantPtr, S16, 2 * 8}})
+ .legalIf(
+ [=](const LegalityQuery &Query) -> bool {
+ return isLoadStoreLegal(ST, Query);
+ });
+
if (ST.hasFlatAddressSpace()) {
ExtLoads.legalForTypesWithMemDesc(
- {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
+ {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
}
+ // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
+ // 64-bits.
+ //
+ // TODO: Should generalize bitcast action into coerce, which will also cover
+ // inserting addrspacecasts.
+ ExtLoads.customIf(typeIs(1, Constant32Ptr));
+
ExtLoads.clampScalar(0, S32, S32)
.widenScalarToNextPow2(0)
- .unsupportedIfMemSizeNotPow2()
.lower();
auto &Atomics = getActionDefinitionsBuilder(
@@ -1286,10 +1317,14 @@
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
+ auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomics()) {
- getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
- .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
+ Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
+ if (ST.hasGFX90AInsts())
+ Atomic.legalFor({{S64, LocalPtr}});
}
+ if (ST.hasAtomicFaddInsts())
+ Atomic.legalFor({{S32, GlobalPtr}});
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
@@ -1302,19 +1337,21 @@
// Condition should be s32 for scalar, s1 for vector.
getActionDefinitionsBuilder(G_SELECT)
- .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
- GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
- LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
- .clampScalar(0, S16, S64)
- .scalarize(1)
- .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .fewerElementsIf(numElementsNotEven(0), scalarize(0))
- .clampMaxNumElements(0, S32, 2)
- .clampMaxNumElements(0, LocalPtr, 2)
- .clampMaxNumElements(0, PrivatePtr, 2)
- .scalarize(0)
- .widenScalarToNextPow2(0)
- .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
+ .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
+ LocalPtr, FlatPtr, PrivatePtr,
+ LLT::fixed_vector(2, LocalPtr),
+ LLT::fixed_vector(2, PrivatePtr)},
+ {S1, S32})
+ .clampScalar(0, S16, S64)
+ .scalarize(1)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .fewerElementsIf(numElementsNotEven(0), scalarize(0))
+ .clampMaxNumElements(0, S32, 2)
+ .clampMaxNumElements(0, LocalPtr, 2)
+ .clampMaxNumElements(0, PrivatePtr, 2)
+ .scalarize(0)
+ .widenScalarToNextPow2(0)
+ .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
// TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
// be more flexible with the shift amount type.
@@ -1393,7 +1430,8 @@
const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
return std::make_pair(
- VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
+ VecTypeIdx,
+ LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
})
.clampScalar(EltTypeIdx, S32, S64)
.clampScalar(VecTypeIdx, S32, S64)
@@ -1590,17 +1628,44 @@
.clampScalar(0, S32, S64)
.lower();
+ // TODO: Only try to form v2s16 with legal packed instructions
getActionDefinitionsBuilder(G_FSHR)
.legalFor({{S32, S32}})
+ .lowerFor({{V2S16, V2S16}})
+ .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
.scalarize(0)
.lower();
+ if (ST.hasVOP3PInsts()) {
+ getActionDefinitionsBuilder(G_FSHL)
+ .lowerFor({{V2S16, V2S16}})
+ .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
+ .scalarize(0)
+ .lower();
+ } else {
+ getActionDefinitionsBuilder(G_FSHL)
+ .scalarize(0)
+ .lower();
+ }
+
getActionDefinitionsBuilder(G_READCYCLECOUNTER)
.legalFor({S64});
getActionDefinitionsBuilder(G_FENCE)
.alwaysLegal();
+ getActionDefinitionsBuilder({G_SMULO, G_UMULO})
+ .scalarize(0)
+ .minScalar(0, S32)
+ .lower();
+
+ getActionDefinitionsBuilder({G_SBFX, G_UBFX})
+ .legalFor({{S32, S32}, {S64, S32}})
+ .clampScalar(1, S32, S32)
+ .clampScalar(0, S32, S64)
+ .widenScalarToNextPow2(0)
+ .scalarize(0);
+
getActionDefinitionsBuilder({
// TODO: Verify V_BFI_B32 is generated from expanded bit ops
G_FCOPYSIGN,
@@ -1614,16 +1679,14 @@
G_SADDO, G_SSUBO,
// TODO: Implement
- G_FMINIMUM, G_FMAXIMUM,
- G_FSHL
- }).lower();
+ G_FMINIMUM, G_FMAXIMUM}).lower();
getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
.unsupported();
- computeTables();
+ getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
@@ -1668,6 +1731,8 @@
case TargetOpcode::G_GLOBAL_VALUE:
return legalizeGlobalValue(MI, MRI, B);
case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_SEXTLOAD:
+ case TargetOpcode::G_ZEXTLOAD:
return legalizeLoad(Helper, MI);
case TargetOpcode::G_FMAD:
return legalizeFMad(MI, MRI, B);
@@ -1675,10 +1740,12 @@
return legalizeFDIV(MI, MRI, B);
case TargetOpcode::G_UDIV:
case TargetOpcode::G_UREM:
- return legalizeUDIV_UREM(MI, MRI, B);
+ case TargetOpcode::G_UDIVREM:
+ return legalizeUnsignedDIV_REM(MI, MRI, B);
case TargetOpcode::G_SDIV:
case TargetOpcode::G_SREM:
- return legalizeSDIV_SREM(MI, MRI, B);
+ case TargetOpcode::G_SDIVREM:
+ return legalizeSignedDIV_REM(MI, MRI, B);
case TargetOpcode::G_ATOMIC_CMPXCHG:
return legalizeAtomicCmpXChg(MI, MRI, B);
case TargetOpcode::G_FLOG:
@@ -1751,7 +1818,7 @@
PtrInfo,
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- 4, commonAlignment(Align(64), StructOffset));
+ LLT::scalar(32), commonAlignment(Align(64), StructOffset));
Register LoadAddr;
@@ -2021,9 +2088,10 @@
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
-bool AMDGPULegalizerInfo::legalizeFPTOI(
- MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B, bool Signed) const {
+bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ bool Signed) const {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
@@ -2031,24 +2099,57 @@
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
- assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
+ const LLT SrcLT = MRI.getType(Src);
+ assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
unsigned Flags = MI.getFlags();
- auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
- auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
- auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
+ // The basic idea of converting a floating point number into a pair of 32-bit
+ // integers is illustrated as follows:
+ //
+ // tf := trunc(val);
+ // hif := floor(tf * 2^-32);
+ // lof := tf - hif * 2^32; // lof is always positive due to floor.
+ // hi := fptoi(hif);
+ // lo := fptoi(lof);
+ //
+ auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
+ MachineInstrBuilder Sign;
+ if (Signed && SrcLT == S32) {
+ // However, a 32-bit floating point number has only a 23-bit mantissa, which
+ // is not enough to hold all the significant bits of `lof` if val is
+ // negative. To avoid the loss of precision, we need to take the absolute
+ // value after truncating and flip the result back based on the original
+ // signedness.
+ Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
+ Trunc = B.buildFAbs(S32, Trunc, Flags);
+ }
+ MachineInstrBuilder K0, K1;
+ if (SrcLT == S64) {
+ K0 = B.buildFConstant(S64,
+ BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
+ K1 = B.buildFConstant(S64,
+ BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
+ } else {
+ K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
+ K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
+ }
- auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
- auto FloorMul = B.buildFFloor(S64, Mul, Flags);
- auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
+ auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
+ auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
+ auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
- auto Hi = Signed ?
- B.buildFPTOSI(S32, FloorMul) :
- B.buildFPTOUI(S32, FloorMul);
+ auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
+ : B.buildFPTOUI(S32, FloorMul);
auto Lo = B.buildFPTOUI(S32, Fma);
- B.buildMerge(Dst, { Lo, Hi });
+ if (Signed && SrcLT == S32) {
+ // Flip the result based on the signedness, which is either all 0s or 1s.
+ Sign = B.buildMerge(S64, {Sign, Sign});
+ // r := xor({lo, hi}, sign) - sign;
+ B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
+ } else
+ B.buildMerge(Dst, {Lo, Hi});
MI.eraseFromParent();
return true;
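The pseudocode in the comment above maps directly onto scalar arithmetic: truncate, peel off the high 32 bits with a multiply by 2^-32 and a floor, then recover the always-non-negative low part with an FMA. A minimal C++ model of the unsigned f64 case (in-range inputs assumed):

    #include <cmath>
    #include <cstdint>

    // Scalar model of the generated sequence for an unsigned f64 -> i64
    // conversion; the signed path additionally converts the high half with a
    // signed fptoi (and, for f32 sources, applies the sign fixup shown above).
    uint64_t fptoui64Model(double Val) {
      double Tf  = std::trunc(Val);
      double Hif = std::floor(Tf * 0x1p-32);   // K0 = 2^-32
      double Lof = std::fma(Hif, -0x1p32, Tf); // K1 = -2^32; tf - hif * 2^32
      uint32_t Hi = (uint32_t)Hif;
      uint32_t Lo = (uint32_t)Lof;             // non-negative thanks to floor
      return ((uint64_t)Hi << 32) | Lo;
    }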
@@ -2141,7 +2242,7 @@
bool AMDGPULegalizerInfo::legalizeShuffleVector(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
Register Dst = MI.getOperand(0).getReg();
Register Src0 = MI.getOperand(1).getReg();
@@ -2258,7 +2359,8 @@
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- if (!MFI->isModuleEntryFunction()) {
+ if (!MFI->isModuleEntryFunction() &&
+ !GV->getName().equals("llvm.amdgcn.module.lds")) {
const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
@@ -2334,11 +2436,12 @@
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
+ LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
MachinePointerInfo::getGOT(MF),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- 8 /*Size*/, Align(8));
+ LoadTy, Align(8));
buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
@@ -2355,7 +2458,8 @@
static LLT widenToNextPowerOf2(LLT Ty) {
if (Ty.isVector())
- return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
+ return Ty.changeElementCount(
+ ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}
@@ -2378,17 +2482,21 @@
return true;
}
+ if (MI.getOpcode() != AMDGPU::G_LOAD)
+ return false;
+
Register ValReg = MI.getOperand(0).getReg();
LLT ValTy = MRI.getType(ValReg);
MachineMemOperand *MMO = *MI.memoperands_begin();
const unsigned ValSize = ValTy.getSizeInBits();
- const unsigned MemSize = 8 * MMO->getSize();
+ const LLT MemTy = MMO->getMemoryType();
const Align MemAlign = MMO->getAlign();
+ const unsigned MemSize = MemTy.getSizeInBits();
const unsigned AlignInBits = 8 * MemAlign.value();
// Widen non-power-of-2 loads to the alignment if needed
- if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
+ if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
const unsigned WideMemSize = PowerOf2Ceil(MemSize);
// This was already the correct extending load result type, so just adjust
@@ -2472,7 +2580,7 @@
"this should not have been custom lowered");
LLT ValTy = MRI.getType(CmpVal);
- LLT VecTy = LLT::vector(2, ValTy);
+ LLT VecTy = LLT::fixed_vector(2, ValTy);
Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
@@ -2624,7 +2732,7 @@
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
Register Dst = MI.getOperand(0).getReg();
const LLT S32 = LLT::scalar(32);
- assert(MRI.getType(Dst) == LLT::vector(2, 16));
+ assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
Register Src0 = MI.getOperand(1).getReg();
Register Src1 = MI.getOperand(2).getReg();
@@ -2762,11 +2870,11 @@
return false;
}
-void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
- Register DstReg,
- Register X,
- Register Y,
- bool IsDiv) const {
+void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
+ Register DstDivReg,
+ Register DstRemReg,
+ Register X,
+ Register Y) const {
const LLT S1 = LLT::scalar(1);
const LLT S32 = LLT::scalar(32);
@@ -2792,28 +2900,17 @@
// First quotient/remainder refinement.
auto One = B.buildConstant(S32, 1);
auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
- if (IsDiv)
+ if (DstDivReg)
Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
// Second quotient/remainder refinement.
Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
- if (IsDiv)
- B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
- else
- B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
-}
+ if (DstDivReg)
+ B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
-bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
- const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
- Register DstReg = MI.getOperand(0).getReg();
- Register Num = MI.getOperand(1).getReg();
- Register Den = MI.getOperand(2).getReg();
- legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
- MI.eraseFromParent();
- return true;
+ if (DstRemReg)
+ B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}
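The ICMP_UGE/select pairs above are the usual post-reciprocal correction: the estimated quotient can be short by at most two, so two conditional steps make quotient and remainder exact. A standalone sketch of just that refinement (illustrative only; the reciprocal-based estimate itself is not shown):

#include <cassert>
#include <cstdint>

// Q and R enter as an under-estimate with X/Y - Q <= 2 and R == X - Q * Y.
static void refine(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
  for (int Step = 0; Step < 2; ++Step) {
    bool Cond = R >= Y;   // same predicate as the ICMP_UGE above
    Q = Cond ? Q + 1 : Q; // quotient select
    R = Cond ? R - Y : R; // remainder select
  }
  assert(Q == X / Y && R == X % Y);
}

int main() {
  uint32_t Q = 12, R = 100 - 12 * 7; // estimate for 100 / 7, short by two
  refine(100, 7, Q, R);
  assert(Q == 14 && R == 2);
  return 0;
}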
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
@@ -2859,11 +2956,11 @@
return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
-void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
- Register DstReg,
- Register Numer,
- Register Denom,
- bool IsDiv) const {
+void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
+ Register DstDivReg,
+ Register DstRemReg,
+ Register Numer,
+ Register Denom) const {
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
const LLT S1 = LLT::scalar(1);
@@ -2959,57 +3056,74 @@
// endif C6
// endif C3
- if (IsDiv) {
+ if (DstDivReg) {
auto Sel1 = B.buildSelect(
S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
- B.buildSelect(DstReg,
- B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
- } else {
+ B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
+ Sel1, MulHi3);
+ }
+
+ if (DstRemReg) {
auto Sel2 = B.buildSelect(
S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
- B.buildSelect(DstReg,
- B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
+ B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
+ Sel2, Sub1);
}
}
-bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
+bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register DstDivReg, DstRemReg;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case AMDGPU::G_UDIV: {
+ DstDivReg = MI.getOperand(0).getReg();
+ break;
+ }
+ case AMDGPU::G_UREM: {
+ DstRemReg = MI.getOperand(0).getReg();
+ break;
+ }
+ case AMDGPU::G_UDIVREM: {
+ DstDivReg = MI.getOperand(0).getReg();
+ DstRemReg = MI.getOperand(1).getReg();
+ break;
+ }
+ }
+
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
- const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
- Register DstReg = MI.getOperand(0).getReg();
- Register Num = MI.getOperand(1).getReg();
- Register Den = MI.getOperand(2).getReg();
- LLT Ty = MRI.getType(DstReg);
+ const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
+ Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
+ Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
if (Ty == S32)
- legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
+ legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
else if (Ty == S64)
- legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
+ legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
else
return false;
MI.eraseFromParent();
return true;
-
}
-bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
+bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
- Register DstReg = MI.getOperand(0).getReg();
- const LLT Ty = MRI.getType(DstReg);
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
if (Ty != S32 && Ty != S64)
return false;
- const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
-
- Register LHS = MI.getOperand(1).getReg();
- Register RHS = MI.getOperand(2).getReg();
+ const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
+ Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
+ Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
@@ -3021,20 +3135,45 @@
LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
- Register UDivRem = MRI.createGenericVirtualRegister(Ty);
+ Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case AMDGPU::G_SDIV: {
+ DstDivReg = MI.getOperand(0).getReg();
+ TmpDivReg = MRI.createGenericVirtualRegister(Ty);
+ break;
+ }
+ case AMDGPU::G_SREM: {
+ DstRemReg = MI.getOperand(0).getReg();
+ TmpRemReg = MRI.createGenericVirtualRegister(Ty);
+ break;
+ }
+ case AMDGPU::G_SDIVREM: {
+ DstDivReg = MI.getOperand(0).getReg();
+ DstRemReg = MI.getOperand(1).getReg();
+ TmpDivReg = MRI.createGenericVirtualRegister(Ty);
+ TmpRemReg = MRI.createGenericVirtualRegister(Ty);
+ break;
+ }
+ }
+
if (Ty == S32)
- legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
+ legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
else
- legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
+ legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
- Register Sign;
- if (IsDiv)
- Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
- else
- Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
+ if (DstDivReg) {
+ auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
+ auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
+ B.buildSub(DstDivReg, SignXor, Sign);
+ }
- UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
- B.buildSub(DstReg, UDivRem, Sign);
+ if (DstRemReg) {
+ auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
+ auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
+ B.buildSub(DstRemReg, SignXor, Sign);
+ }
MI.eraseFromParent();
return true;
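The signed expansion above reduces to the unsigned one: both operands are made non-negative with the add/xor trick, and the signs are restored afterwards (the quotient sign is the XOR of the operand signs, the remainder takes the dividend's sign). A minimal sketch of that reduction, using plain / and % in place of the unsigned expansion:

#include <cassert>
#include <cstdint>

static void signedDivRem(int32_t LHS, int32_t RHS, int32_t &Div, int32_t &Rem) {
  int32_t LSign = LHS >> 31;           // 0 or -1 (G_ASHR by bit width - 1)
  int32_t RSign = RHS >> 31;
  uint32_t UL = (LHS + LSign) ^ LSign; // |LHS| (illustrative; INT_MIN aside)
  uint32_t UR = (RHS + RSign) ^ RSign; // |RHS|
  uint32_t UDiv = UL / UR;             // stands in for legalizeUnsignedDIV_REM*Impl
  uint32_t URem = UL % UR;
  int32_t DSign = LSign ^ RSign;       // quotient sign
  Div = (static_cast<int32_t>(UDiv) ^ DSign) - DSign;
  Rem = (static_cast<int32_t>(URem) ^ LSign) - LSign;
}

int main() {
  int32_t D, R;
  signedDivRem(-100, 7, D, R);
  assert(D == -14 && R == -2);         // matches C's truncating / and %
  return 0;
}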
@@ -3511,18 +3650,21 @@
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
-std::tuple<Register, unsigned, unsigned>
+std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
Register OrigOffset) const {
const unsigned MaxImm = 4095;
Register BaseReg;
- unsigned TotalConstOffset;
+ unsigned ImmOffset;
const LLT S32 = LLT::scalar(32);
+ MachineRegisterInfo &MRI = *B.getMRI();
- std::tie(BaseReg, TotalConstOffset) =
- AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
+ std::tie(BaseReg, ImmOffset) =
+ AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
- unsigned ImmOffset = TotalConstOffset;
+ // If BaseReg is a pointer, convert it to int.
+ if (MRI.getType(BaseReg).isPointer())
+ BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
// If the immediate value is too big for the immoffset field, put the value
// and -4096 into the immoffset field so that the value that is copied/added
@@ -3550,7 +3692,32 @@
if (!BaseReg)
BaseReg = B.buildConstant(S32, 0).getReg(0);
- return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
+ return std::make_pair(BaseReg, ImmOffset);
+}
+
+/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
+void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
+ Register VOffset, Register SOffset,
+ unsigned ImmOffset, Register VIndex,
+ MachineRegisterInfo &MRI) const {
+ Optional<ValueAndVReg> MaybeVOffsetVal =
+ getConstantVRegValWithLookThrough(VOffset, MRI);
+ Optional<ValueAndVReg> MaybeSOffsetVal =
+ getConstantVRegValWithLookThrough(SOffset, MRI);
+ Optional<ValueAndVReg> MaybeVIndexVal =
+ getConstantVRegValWithLookThrough(VIndex, MRI);
+ // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
+ // update the MMO with that offset. The stride is unknown so we can only do
+ // this if VIndex is constant 0.
+ if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
+ MaybeVIndexVal->Value == 0) {
+ uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
+ MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
+ MMO->setOffset(TotalOffset);
+ } else {
+ // We don't have a constant combined offset to use in the MMO. Give up.
+ MMO->setValue((Value *)nullptr);
+ }
}
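splitBufferOffsets now returns only a base register plus an immediate, and updateBufferMMO recomputes the MMO offset from whichever parts turn out to be constant. A simplified sketch of the split itself (illustrative only; the in-tree code also deals with the bounds-checking/swizzling caveat mentioned in the surrounding comments):

#include <cassert>
#include <cstdint>
#include <utility>

// Split a constant byte offset into {register part, immediate part}, where the
// immediate part must fit the 12-bit immoffset field (0..4095).
static std::pair<uint32_t, uint32_t> splitOffset(uint32_t TotalConstOffset) {
  const uint32_t MaxImm = 4095;
  uint32_t Imm = TotalConstOffset;
  uint32_t Reg = 0;
  if (Imm > MaxImm) {
    Reg = Imm & ~MaxImm; // overflow goes into the voffset/soffset register
    Imm = Imm & MaxImm;  // low bits stay in the immediate field
  }
  return {Reg, Imm};
}

int main() {
  auto Split = splitOffset(5000);
  assert(Split.first == 4096 && Split.second == 904);
  assert(Split.first + Split.second == 5000);
  return 0;
}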
/// Handle register layout difference for f16 images for some subtargets.
@@ -3572,7 +3739,8 @@
int NumElts = StoreVT.getNumElements();
- return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+ return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
+ .getReg(0);
}
if (ImageStore && ST.hasImageStoreD16Bug()) {
@@ -3581,7 +3749,8 @@
Reg = B.buildBitcast(S32, Reg).getReg(0);
PackedRegs.push_back(Reg);
PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
- return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
+ return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
+ .getReg(0);
}
if (StoreVT.getNumElements() == 3) {
@@ -3590,18 +3759,19 @@
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
PackedRegs.push_back(Unmerge.getReg(I));
PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
- Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
- return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
+ Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
+ return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
}
if (StoreVT.getNumElements() == 4) {
SmallVector<Register, 4> PackedRegs;
- Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
+ Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
auto Unmerge = B.buildUnmerge(S32, Reg);
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
PackedRegs.push_back(Unmerge.getReg(I));
PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
- return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
+ return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
+ .getReg(0);
}
llvm_unreachable("invalid data type");
@@ -3651,7 +3821,6 @@
const int MemSize = MMO->getSize();
unsigned ImmOffset;
- unsigned TotalOffset;
// The typed intrinsics add an immediate after the registers.
const unsigned NumVIndexOps = IsTyped ? 8 : 7;
@@ -3663,6 +3832,8 @@
if (HasVIndex) {
VIndex = MI.getOperand(3).getReg();
OpOffset = 1;
+ } else {
+ VIndex = B.buildConstant(S32, 0).getReg(0);
}
Register VOffset = MI.getOperand(3 + OpOffset).getReg();
@@ -3676,9 +3847,8 @@
unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
- std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
- if (TotalOffset != 0)
- MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
+ std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
+ updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
unsigned Opc;
if (IsTyped) {
@@ -3701,9 +3871,6 @@
}
}
- if (!VIndex)
- VIndex = B.buildConstant(S32, 0).getReg(0);
-
auto MIB = B.buildInstr(Opc)
.addUse(VData) // vdata
.addUse(RSrc) // rsrc
@@ -3730,7 +3897,7 @@
bool IsTyped) const {
// FIXME: Verifier should enforce 1 MMO for these intrinsics.
MachineMemOperand *MMO = *MI.memoperands_begin();
- const int MemSize = MMO->getSize();
+ const LLT MemTy = MMO->getMemoryType();
const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
@@ -3746,6 +3913,8 @@
if (HasVIndex) {
VIndex = MI.getOperand(3).getReg();
OpOffset = 1;
+ } else {
+ VIndex = B.buildConstant(S32, 0).getReg(0);
}
Register VOffset = MI.getOperand(3 + OpOffset).getReg();
@@ -3759,16 +3928,14 @@
unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
unsigned ImmOffset;
- unsigned TotalOffset;
LLT Ty = MRI.getType(Dst);
LLT EltTy = Ty.getScalarType();
const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
const bool Unpacked = ST.hasUnpackedD16VMem();
- std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
- if (TotalOffset != 0)
- MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
+ std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
+ updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
unsigned Opc;
@@ -3779,11 +3946,11 @@
Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
} else {
- switch (MemSize) {
- case 1:
+ switch (MemTy.getSizeInBits()) {
+ case 8:
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
break;
- case 2:
+ case 16:
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
break;
default:
@@ -3794,7 +3961,8 @@
Register LoadDstReg;
- bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
+ bool IsExtLoad =
+ (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
LLT UnpackedTy = Ty.changeElementSize(32);
if (IsExtLoad)
@@ -3804,9 +3972,6 @@
else
LoadDstReg = Dst;
- if (!VIndex)
- VIndex = B.buildConstant(S32, 0).getReg(0);
-
auto MIB = B.buildInstr(Opc)
.addDef(LoadDstReg) // vdata
.addUse(RSrc) // rsrc
@@ -3898,9 +4063,16 @@
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
+ case Intrinsic::amdgcn_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
default:
llvm_unreachable("unhandled atomic opcode");
}
@@ -3940,6 +4112,8 @@
if (HasVIndex) {
VIndex = MI.getOperand(4 + OpOffset).getReg();
++OpOffset;
+ } else {
+ VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
}
Register VOffset = MI.getOperand(4 + OpOffset).getReg();
@@ -3949,13 +4123,8 @@
MachineMemOperand *MMO = *MI.memoperands_begin();
unsigned ImmOffset;
- unsigned TotalOffset;
- std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
- if (TotalOffset != 0)
- MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
-
- if (!VIndex)
- VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
+ std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
+ updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
@@ -3980,14 +4149,16 @@
return true;
}
-/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
+/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
-static void packImageA16AddressToDwords(
- MachineIRBuilder &B, MachineInstr &MI,
- SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
- const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
+static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
+ SmallVectorImpl<Register> &PackedAddrs,
+ unsigned ArgOffset,
+ const AMDGPU::ImageDimIntrinsicInfo *Intr,
+ bool IsA16, bool IsG16) {
const LLT S16 = LLT::scalar(16);
- const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
+ auto EndIdx = Intr->VAddrEnd;
for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
@@ -3996,7 +4167,10 @@
Register AddrReg = SrcOp.getReg();
- if (I < Intr->GradientStart) {
+ if ((I < Intr->GradientStart) ||
+ (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
+ (I >= Intr->CoordStart && !IsA16)) {
+ // Handle any gradient or coordinate operands that should not be packed
AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
PackedAddrs.push_back(AddrReg);
} else {
@@ -4041,16 +4215,16 @@
int NumAddrRegs = AddrRegs.size();
if (NumAddrRegs != 1) {
- // Round up to 8 elements for v5-v7
- // FIXME: Missing intermediate sized register classes and instructions.
- if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
+ // Above 8 elements, round up to the next power of 2 (i.e. 16).
+ if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
auto Undef = B.buildUndef(S32);
AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
NumAddrRegs = RoundedNumRegs;
}
- auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
+ auto VAddr =
+ B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
}
@@ -4091,7 +4265,7 @@
MachineRegisterInfo *MRI = B.getMRI();
const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
- const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
unsigned DMask = 0;
@@ -4146,7 +4320,7 @@
if (BaseOpcode->AtomicX2) {
Register VData1 = MI.getOperand(3).getReg();
// The two values are packed in one register.
- LLT PackedTy = LLT::vector(2, Ty);
+ LLT PackedTy = LLT::fixed_vector(2, Ty);
auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
MI.getOperand(2).setReg(Concat.getReg(0));
MI.getOperand(3).setReg(AMDGPU::NoRegister);
@@ -4194,35 +4368,30 @@
}
// Rewrite the addressing register layout before doing anything else.
- if (IsA16 || IsG16) {
- if (IsA16) {
- // Target must support the feature and gradients need to be 16 bit too
- if (!ST.hasA16() || !IsG16)
- return false;
- } else if (!ST.hasG16())
- return false;
+ if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
+ // 16-bit gradients are supported, but they are tied to the A16 control,
+ // so both gradients and addresses must be 16 bit
+ return false;
+ }
+ if (IsA16 && !ST.hasA16()) {
+ // A16 not supported
+ return false;
+ }
+
+ if (IsA16 || IsG16) {
if (Intr->NumVAddrs > 1) {
SmallVector<Register, 4> PackedRegs;
- // Don't compress addresses for G16
- const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
- packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
- PackEndIdx);
- if (!IsA16) {
- // Add uncompressed address
- for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
- int AddrReg = MI.getOperand(ArgOffset + I).getReg();
- assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
- PackedRegs.push_back(AddrReg);
- }
- }
+ packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
+ IsG16);
// See also below in the non-a16 branch
- const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
+ const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
+ PackedRegs.size() <= ST.getNSAMaxSize();
if (!UseNSA && PackedRegs.size() > 1) {
- LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
+ LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
PackedRegs[0] = Concat.getReg(0);
PackedRegs.resize(1);
@@ -4256,7 +4425,8 @@
//
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
- const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
+ const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
+ CorrectedNumVAddrs <= ST.getNSAMaxSize();
if (!UseNSA && Intr->NumVAddrs > 1)
convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
@@ -4299,7 +4469,8 @@
return false;
const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
- const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
+ const LLT AdjustedTy =
+ Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
// The raw dword aligned data component of the load. The only legal cases
// where this matters should be when using the packed D16 format, for
@@ -4313,15 +4484,17 @@
LLT RegTy;
if (IsD16 && ST.hasUnpackedD16VMem()) {
- RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
- TFETy = LLT::vector(AdjustedNumElts + 1, 32);
+ RoundedTy =
+ LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
+ TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
RegTy = S32;
} else {
unsigned EltSize = EltTy.getSizeInBits();
unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
unsigned RoundedSize = 32 * RoundedElts;
- RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
- TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
+ RoundedTy = LLT::scalarOrVector(
+ ElementCount::getFixed(RoundedSize / EltSize), EltSize);
+ TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
}
@@ -4435,10 +4608,10 @@
const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
// Deal with the one annoying legal case.
- const LLT V3S16 = LLT::vector(3, 16);
+ const LLT V3S16 = LLT::fixed_vector(3, 16);
if (Ty == V3S16) {
padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
- auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
+ auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
return true;
}
@@ -4460,7 +4633,7 @@
Observer.changingInstr(MI);
- if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
+ if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
Ty = getBitcastRegisterType(Ty);
Helper.bitcastDst(MI, Ty, 0);
Dst = MI.getOperand(0).getReg();
@@ -4502,27 +4675,55 @@
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction
- if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
- !ST.isTrapHandlerEnabled()) {
- B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
- } else {
- // Pass queue pointer to trap handler as input, and insert trap instruction
- // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
- MachineRegisterInfo &MRI = *B.getMRI();
+ if (!ST.isTrapHandlerEnabled() ||
+ ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
+ return legalizeTrapEndpgm(MI, MRI, B);
- Register LiveIn =
- MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
- if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
- return false;
-
- Register SGPR01(AMDGPU::SGPR0_SGPR1);
- B.buildCopy(SGPR01, LiveIn);
- B.buildInstr(AMDGPU::S_TRAP)
- .addImm(GCNSubtarget::TrapIDLLVMTrap)
- .addReg(SGPR01, RegState::Implicit);
+ if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
+ switch (*HsaAbiVer) {
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ return legalizeTrapHsaQueuePtr(MI, MRI, B);
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ return ST.supportsGetDoorbellID() ?
+ legalizeTrapHsa(MI, MRI, B) :
+ legalizeTrapHsaQueuePtr(MI, MRI, B);
+ }
}
+ llvm_unreachable("Unknown trap handler");
+}
+
+bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+ B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+ // Pass queue pointer to trap handler as input, and insert trap instruction
+ // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
+ Register LiveIn =
+ MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+ if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
+ return false;
+
+ Register SGPR01(AMDGPU::SGPR0_SGPR1);
+ B.buildCopy(SGPR01, LiveIn);
+ B.buildInstr(AMDGPU::S_TRAP)
+ .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
+ .addReg(SGPR01, RegState::Implicit);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeTrapHsa(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+ B.buildInstr(AMDGPU::S_TRAP)
+ .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
MI.eraseFromParent();
return true;
}
@@ -4531,8 +4732,8 @@
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
// If this is a non-HSA path or the trap handler is disabled, report a
// warning accordingly
- if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
- !ST.isTrapHandlerEnabled()) {
+ if (!ST.isTrapHandlerEnabled() ||
+ ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
"debugtrap handler not supported",
MI.getDebugLoc(), DS_Warning);
@@ -4540,7 +4741,8 @@
Ctx.diagnose(NoTrap);
} else {
// Insert debug-trap instruction
- B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
+ B.buildInstr(AMDGPU::S_TRAP)
+ .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
}
MI.eraseFromParent();
@@ -4561,6 +4763,14 @@
Register RayInvDir = MI.getOperand(6).getReg();
Register TDescr = MI.getOperand(7).getReg();
+ if (!ST.hasGFX10_AEncoding()) {
+ DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
+ "intrinsic not supported on subtarget",
+ MI.getDebugLoc());
+ B.getMF().getFunction().getContext().diagnose(BadIntrin);
+ return false;
+ }
+
bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
@@ -4810,6 +5020,11 @@
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
return legalizeBufferAtomic(MI, B, IntrID);
case Intrinsic::amdgcn_atomic_inc:
return legalizeAtomicIncDec(MI, B, true);
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 87e8b21..d4fefd8 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -99,25 +99,19 @@
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
- bool legalizeUDIV_UREM(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
+ bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
- void legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
- Register DstReg, Register Num, Register Den,
- bool IsRem) const;
- bool legalizeUDIV_UREM32(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
- bool legalizeSDIV_SREM32(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
+ void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg,
+ Register DstRemReg, Register Num,
+ Register Den) const;
- void legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
- Register DstReg, Register Numer, Register Denom,
- bool IsDiv) const;
+ void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg,
+ Register DstRemReg, Register Numer,
+ Register Denom) const;
- bool legalizeUDIV_UREM64(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
- bool legalizeSDIV_SREM(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
+ bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
@@ -148,8 +142,11 @@
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, unsigned AddrSpace) const;
- std::tuple<Register, unsigned, unsigned>
- splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
+ std::pair<Register, unsigned> splitBufferOffsets(MachineIRBuilder &B,
+ Register OrigOffset) const;
+ void updateBufferMMO(MachineMemOperand *MMO, Register VOffset,
+ Register SOffset, unsigned ImmOffset, Register VIndex,
+ MachineRegisterInfo &MRI) const;
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg, bool ImageStore = false) const;
@@ -183,6 +180,12 @@
bool legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
bool legalizeDebugTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 6b7f572..1ee6933 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
@@ -476,7 +477,7 @@
return true;
const Function *F = CI->getParent()->getParent();
Attribute Attr = F->getFnAttribute("unsafe-fp-math");
- return Attr.getValueAsString() == "true";
+ return Attr.getValueAsBool();
}
bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
@@ -1369,9 +1370,9 @@
StringRef CPU = TM->getTargetCPU();
StringRef Features = TM->getTargetFeatureString();
- if ((CPU.empty() || CPU.equals_lower("generic")) &&
+ if ((CPU.empty() || CPU.equals_insensitive("generic")) &&
(Features.empty() ||
- Features.find_lower("wavefrontsize") == StringRef::npos))
+ Features.find_insensitive("wavefrontsize") == StringRef::npos))
return false;
Function *F = CI->getParent()->getParent();
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 646087c..32262ea 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -19,10 +19,16 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+static cl::opt<bool> EnableOCLManglingMismatchWA(
+ "amdgpu-enable-ocl-mangling-mismatch-workaround", cl::init(true),
+ cl::ReallyHidden,
+ cl::desc("Enable the workaround for OCL name mangling mismatch."));
+
namespace {
enum EManglingParam {
@@ -826,7 +832,8 @@
unsigned AS = UseAddrSpace
? AMDGPULibFuncBase::getAddrSpaceFromEPtrKind(p.PtrKind)
: 0;
- if (AS != 0) os << "U3AS" << AS;
+ if (EnableOCLManglingMismatchWA || AS != 0)
+ os << "U3AS" << AS;
Ptr = p;
p.PtrKind = 0;
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 8fb4f93..0f157e5 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -15,6 +15,7 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 9ab6a52..08a1b97 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -67,7 +67,7 @@
const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
const bool HasUniformWorkGroupSize =
- F->getFnAttribute("uniform-work-group-size").getValueAsString() == "true";
+ F->getFnAttribute("uniform-work-group-size").getValueAsBool();
if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
return false;
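Several hunks in this import replace the string comparison getValueAsString() == "true" with getValueAsBool(). A minimal sketch of why the two forms agree for string function attributes such as this one (assumption: behavior inferred from the usage in this patch; the helper name is made up for illustration):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static bool hasUniformWorkGroupSize(const Function &F) {
  Attribute A = F.getFnAttribute("uniform-work-group-size");
  // Old form: explicit string comparison against "true".
  bool OldWay = A.isStringAttribute() && A.getValueAsString() == "true";
  // New form used by this patch: false for a missing attribute, otherwise
  // a comparison of the string value against "true".
  bool NewWay = A.getValueAsBool();
  (void)OldWay; // the two forms evaluate identically for attributes like this
  return NewWay;
}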
@@ -249,9 +249,9 @@
}
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
- "AMDGPU IR optimizations", false, false)
-INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU IR optimizations",
- false, false)
+ "AMDGPU Kernel Attributes", false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
+ "AMDGPU Kernel Attributes", false, false)
char AMDGPULowerKernelAttributes::ID = 0;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
new file mode 100644
index 0000000..70ecea8
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -0,0 +1,400 @@
+//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates LDS uses from non-kernel functions.
+//
+// The strategy is to create a new struct with a field for each LDS variable
+// and allocate that struct at the same address for every kernel. Uses of the
+// original LDS variables are then replaced with compile time offsets from that
+// known address. AMDGPUMachineFunction allocates the LDS global.
+//
+// Local variables with constant annotation or non-undef initializer are passed
+// through unchanged for simplification or error diagnostics in later passes.
+//
+// To reduce the memory overhead, variables that are only used by kernels are
+// excluded from this transform. The analysis to determine whether a variable
+// is only used by a kernel is cheap and conservative, so this may allocate
+// a variable in every kernel when it was not strictly necessary to do so.
+//
+// A possible future refinement is to specialise the structure per-kernel, so
+// that fields can be elided based on more expensive analysis.
+//
+// NOTE: This pass directly packs LDS (which may be large) into a struct type,
+// which can require allocating a huge struct instance within every kernel.
+// Hence, before running this pass, it is advisable to run the
+// "amdgpu-replace-lds-use-with-pointer" pass, which replaces LDS uses within
+// non-kernel functions with pointers and thereby minimizes the unnecessary
+// per-kernel allocation of LDS memory.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPULDSUtils.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/OptimizedStructLayout.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <vector>
+
+#define DEBUG_TYPE "amdgpu-lower-module-lds"
+
+using namespace llvm;
+
+static cl::opt<bool> SuperAlignLDSGlobals(
+ "amdgpu-super-align-lds-globals",
+ cl::desc("Increase alignment of LDS if it is not on align boundary"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+
+class AMDGPULowerModuleLDS : public ModulePass {
+
+ static void removeFromUsedList(Module &M, StringRef Name,
+ SmallPtrSetImpl<Constant *> &ToRemove) {
+ GlobalVariable *GV = M.getNamedGlobal(Name);
+ if (!GV || ToRemove.empty()) {
+ return;
+ }
+
+ SmallVector<Constant *, 16> Init;
+ auto *CA = cast<ConstantArray>(GV->getInitializer());
+ for (auto &Op : CA->operands()) {
+ // ModuleUtils::appendToUsed only inserts Constants
+ Constant *C = cast<Constant>(Op);
+ if (!ToRemove.contains(C->stripPointerCasts())) {
+ Init.push_back(C);
+ }
+ }
+
+ if (Init.size() == CA->getNumOperands()) {
+ return; // none to remove
+ }
+
+ GV->eraseFromParent();
+
+ for (Constant *C : ToRemove) {
+ C->removeDeadConstantUsers();
+ }
+
+ if (!Init.empty()) {
+ ArrayType *ATy =
+ ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
+ GV =
+ new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
+ ConstantArray::get(ATy, Init), Name);
+ GV->setSection("llvm.metadata");
+ }
+ }
+
+ static void
+ removeFromUsedLists(Module &M,
+ const std::vector<GlobalVariable *> &LocalVars) {
+ SmallPtrSet<Constant *, 32> LocalVarsSet;
+ for (size_t I = 0; I < LocalVars.size(); I++) {
+ if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) {
+ LocalVarsSet.insert(C);
+ }
+ }
+ removeFromUsedList(M, "llvm.used", LocalVarsSet);
+ removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
+ }
+
+ static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
+ GlobalVariable *SGV) {
+ // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
+ // that might call a function which accesses a field within it. This is
+ // presently approximated to 'all kernels' if there are any such functions
+ // in the module. This implicit use is reified as an explicit use here so
+ // that later passes, specifically PromoteAlloca, account for the required
+ // memory without any knowledge of this transform.
+
+ // An operand bundle on llvm.donothing works because the call instruction
+ // survives until after the last pass that needs to account for LDS. It is
+ // better than inline asm as the latter survives until the end of codegen. A
+ // totally robust solution would be a function with the same semantics as
+ // llvm.donothing that takes a pointer to the instance and is lowered to a
+ // no-op after LDS is allocated, but that is not presently necessary.
+
+ LLVMContext &Ctx = Func->getContext();
+
+ Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
+
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});
+
+ Function *Decl =
+ Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
+
+ Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
+ SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};
+
+ Builder.CreateCall(FTy, Decl, {},
+ {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
+ "");
+ }
+
+private:
+ SmallPtrSet<GlobalValue *, 32> UsedList;
+
+public:
+ static char ID;
+
+ AMDGPULowerModuleLDS() : ModulePass(ID) {
+ initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ UsedList = AMDGPU::getUsedList(M);
+
+ bool Changed = processUsedLDS(M);
+
+ for (Function &F : M.functions()) {
+ // Only lower compute kernels' LDS.
+ if (!AMDGPU::isKernel(F.getCallingConv()))
+ continue;
+ Changed |= processUsedLDS(M, &F);
+ }
+
+ UsedList.clear();
+ return Changed;
+ }
+
+private:
+ bool processUsedLDS(Module &M, Function *F = nullptr) {
+ LLVMContext &Ctx = M.getContext();
+ const DataLayout &DL = M.getDataLayout();
+
+ // Find variables to move into new struct instance
+ std::vector<GlobalVariable *> FoundLocalVars =
+ AMDGPU::findVariablesToLower(M, F);
+
+ if (FoundLocalVars.empty()) {
+ // No variables to rewrite, no changes made.
+ return false;
+ }
+
+ // Increase the alignment of LDS globals if necessary to maximise the chance
+ // that we can use aligned LDS instructions to access them.
+ if (SuperAlignLDSGlobals) {
+ for (auto *GV : FoundLocalVars) {
+ Align Alignment = AMDGPU::getAlign(DL, GV);
+ TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());
+
+ if (GVSize > 8) {
+ // We might want to use a b96 or b128 load/store
+ Alignment = std::max(Alignment, Align(16));
+ } else if (GVSize > 4) {
+ // We might want to use a b64 load/store
+ Alignment = std::max(Alignment, Align(8));
+ } else if (GVSize > 2) {
+ // We might want to use a b32 load/store
+ Alignment = std::max(Alignment, Align(4));
+ } else if (GVSize > 1) {
+ // We might want to use a b16 load/store
+ Alignment = std::max(Alignment, Align(2));
+ }
+
+ GV->setAlignment(Alignment);
+ }
+ }
+
+ SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
+ LayoutFields.reserve(FoundLocalVars.size());
+ for (GlobalVariable *GV : FoundLocalVars) {
+ OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()),
+ AMDGPU::getAlign(DL, GV));
+ LayoutFields.emplace_back(F);
+ }
+
+ performOptimizedStructLayout(LayoutFields);
+
+ std::vector<GlobalVariable *> LocalVars;
+ LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
+ {
+ // This usually won't need to insert any padding, perhaps avoid the alloc
+ uint64_t CurrentOffset = 0;
+ for (size_t I = 0; I < LayoutFields.size(); I++) {
+ GlobalVariable *FGV = static_cast<GlobalVariable *>(
+ const_cast<void *>(LayoutFields[I].Id));
+ Align DataAlign = LayoutFields[I].Alignment;
+
+ uint64_t DataAlignV = DataAlign.value();
+ if (uint64_t Rem = CurrentOffset % DataAlignV) {
+ uint64_t Padding = DataAlignV - Rem;
+
+ // Append an array of padding bytes to meet the requested alignment.
+ // Note (o + (a - (o % a)) ) % a == 0
+ // (offset + Padding ) % align == 0
+
+ Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
+ LocalVars.push_back(new GlobalVariable(
+ M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
+ "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+ false));
+ CurrentOffset += Padding;
+ }
+
+ LocalVars.push_back(FGV);
+ CurrentOffset += LayoutFields[I].Size;
+ }
+ }
+
+ std::vector<Type *> LocalVarTypes;
+ LocalVarTypes.reserve(LocalVars.size());
+ std::transform(
+ LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
+ [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
+
+ std::string VarName(
+ F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
+ : "llvm.amdgcn.module.lds");
+ StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
+
+ Align StructAlign =
+ AMDGPU::getAlign(DL, LocalVars[0]);
+
+ GlobalVariable *SGV = new GlobalVariable(
+ M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
+ VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+ false);
+ SGV->setAlignment(StructAlign);
+ if (!F) {
+ appendToCompilerUsed(
+ M, {static_cast<GlobalValue *>(
+ ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+ }
+
+ // The verifier rejects used lists containing an inttoptr of a constant
+ // so remove the variables from these lists before replaceAllUsesWith
+ removeFromUsedLists(M, LocalVars);
+
+ // Replace uses of ith variable with a constantexpr to the ith field of the
+ // instance that will be allocated by AMDGPUMachineFunction
+ Type *I32 = Type::getInt32Ty(Ctx);
+ for (size_t I = 0; I < LocalVars.size(); I++) {
+ GlobalVariable *GV = LocalVars[I];
+ Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+ Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
+ if (F) {
+ // Replace all constant uses with instructions if they belong to the
+ // current kernel.
+ for (User *U : make_early_inc_range(GV->users())) {
+ if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
+ AMDGPU::replaceConstantUsesInFunction(C, F);
+ }
+
+ GV->removeDeadConstantUsers();
+
+ GV->replaceUsesWithIf(GEP, [F](Use &U) {
+ Instruction *I = dyn_cast<Instruction>(U.getUser());
+ return I && I->getFunction() == F;
+ });
+ } else {
+ GV->replaceAllUsesWith(GEP);
+ }
+ if (GV->use_empty()) {
+ UsedList.erase(GV);
+ GV->eraseFromParent();
+ }
+
+ uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
+ Align A = commonAlignment(StructAlign, Off);
+ refineUsesAlignment(GEP, A, DL);
+ }
+
+ // Mark kernels with a use that reads the address of the allocated structure.
+ // This is not necessary for lowering. This lets other passes, specifically
+ // PromoteAlloca, accurately calculate how much LDS will be used by the
+ // kernel after lowering.
+ if (!F) {
+ IRBuilder<> Builder(Ctx);
+ SmallPtrSet<Function *, 32> Kernels;
+ for (auto &I : M.functions()) {
+ Function *Func = &I;
+ if (AMDGPU::isKernelCC(Func) && !Kernels.contains(Func)) {
+ markUsedByKernel(Builder, Func, SGV);
+ Kernels.insert(Func);
+ }
+ }
+ }
+ return true;
+ }
+
+ void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL,
+ unsigned MaxDepth = 5) {
+ if (!MaxDepth || A == 1)
+ return;
+
+ for (User *U : Ptr->users()) {
+ if (auto *LI = dyn_cast<LoadInst>(U)) {
+ LI->setAlignment(std::max(A, LI->getAlign()));
+ continue;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getPointerOperand() == Ptr)
+ SI->setAlignment(std::max(A, SI->getAlign()));
+ continue;
+ }
+ if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
+ // None of the atomicrmw operations can currently work on pointers, but
+ // check anyway in case that changes or we are processing a ConstantExpr.
+ if (AI->getPointerOperand() == Ptr)
+ AI->setAlignment(std::max(A, AI->getAlign()));
+ continue;
+ }
+ if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
+ if (AI->getPointerOperand() == Ptr)
+ AI->setAlignment(std::max(A, AI->getAlign()));
+ continue;
+ }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
+ unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
+ APInt Off(BitWidth, 0);
+ if (GEP->getPointerOperand() == Ptr &&
+ GEP->accumulateConstantOffset(DL, Off)) {
+ Align GA = commonAlignment(A, Off.getLimitedValue());
+ refineUsesAlignment(GEP, GA, DL, MaxDepth - 1);
+ }
+ continue;
+ }
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ if (I->getOpcode() == Instruction::BitCast ||
+ I->getOpcode() == Instruction::AddrSpaceCast)
+ refineUsesAlignment(I, A, DL, MaxDepth - 1);
+ }
+ }
+ }
+};
+
+} // namespace
+char AMDGPULowerModuleLDS::ID = 0;
+
+char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
+
+INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
+ "Lower uses of LDS variables from non-kernel functions", false,
+ false)
+
+ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
+ return new AMDGPULowerModuleLDS();
+}
+
+PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
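The core of the new pass is the layout loop above: performOptimizedStructLayout orders the fields, then explicit padding globals are inserted so each field lands at a properly aligned offset. A standalone sketch of just that offset/padding arithmetic, illustrative only and independent of the LLVM APIs the pass uses:

#include <cassert>
#include <cstdint>
#include <vector>

struct Field {
  uint64_t Size;
  uint64_t Align;
  uint64_t Offset = 0;
};

// Walk the (already ordered) fields, inserting padding whenever the running
// offset is not a multiple of the next field's alignment; this mirrors the
// "(offset + Padding) % align == 0" note in the pass.
static uint64_t layoutFields(std::vector<Field> &Fields) {
  uint64_t CurrentOffset = 0;
  for (Field &F : Fields) {
    if (uint64_t Rem = CurrentOffset % F.Align)
      CurrentOffset += F.Align - Rem; // bytes the pass emits as an i8 array
    F.Offset = CurrentOffset;
    CurrentOffset += F.Size;
  }
  return CurrentOffset;               // total size of the packed LDS struct
}

int main() {
  std::vector<Field> Fields = {{4, 4}, {2, 2}, {8, 8}};
  uint64_t Total = layoutFields(Fields);
  assert(Fields[2].Offset == 8 && Total == 16); // two padding bytes before the 8-byte field
  return 0;
}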
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index a8cba3f..3dd27f1 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -41,9 +41,6 @@
const TargetSubtargetInfo &ST;
const AsmPrinter &AP;
- const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
- const MachineOperand &MO) const;
-
public:
AMDGPUMCInstLower(MCContext &ctx, const TargetSubtargetInfo &ST,
const AsmPrinter &AP);
@@ -95,54 +92,21 @@
}
}
-const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr(
- const MachineBasicBlock &SrcBB,
- const MachineOperand &MO) const {
- const MCExpr *DestBBSym
- = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx);
- const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx);
-
- // FIXME: The first half of this assert should be removed. This should
- // probably be PC relative instead of using the source block symbol, and
- // therefore the indirect branch expansion should use a bundle.
- assert(
- skipDebugInstructionsForward(SrcBB.begin(), SrcBB.end())->getOpcode() ==
- AMDGPU::S_GETPC_B64 &&
- ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4);
-
- // s_getpc_b64 returns the address of next instruction.
- const MCConstantExpr *One = MCConstantExpr::create(4, Ctx);
- SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx);
-
- if (MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_FORWARD)
- return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx);
-
- assert(MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_BACKWARD);
- return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx);
-}
-
bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
MCOperand &MCOp) const {
switch (MO.getType()) {
default:
- llvm_unreachable("unknown operand type");
+ break;
case MachineOperand::MO_Immediate:
MCOp = MCOperand::createImm(MO.getImm());
return true;
case MachineOperand::MO_Register:
MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST));
return true;
- case MachineOperand::MO_MachineBasicBlock: {
- if (MO.getTargetFlags() != 0) {
- MCOp = MCOperand::createExpr(
- getLongBranchBlockExpr(*MO.getParent()->getParent(), MO));
- } else {
- MCOp = MCOperand::createExpr(
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = MCOperand::createExpr(
MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
- }
-
return true;
- }
case MachineOperand::MO_GlobalAddress: {
const GlobalValue *GV = MO.getGlobal();
SmallString<128> SymbolName;
@@ -168,7 +132,15 @@
case MachineOperand::MO_RegisterMask:
// Regmasks are like implicit defs.
return false;
+ case MachineOperand::MO_MCSymbol:
+ if (MO.getTargetFlags() == SIInstrInfo::MO_FAR_BRANCH_OFFSET) {
+ MCSymbol *Sym = MO.getMCSymbol();
+ MCOp = MCOperand::createExpr(Sym->getVariableValue());
+ return true;
+ }
+ break;
}
+ llvm_unreachable("unknown operand type");
}
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
@@ -274,24 +246,9 @@
++I;
}
} else {
- // We don't want SI_MASK_BRANCH/SI_RETURN_TO_EPILOG encoded. They are
+ // We don't want these pseudo instructions encoded. They are
// placeholder terminator instructions and should only be printed as
// comments.
- if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
- if (isVerbose()) {
- SmallVector<char, 16> BBStr;
- raw_svector_ostream Str(BBStr);
-
- const MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
- const MCSymbolRefExpr *Expr
- = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
- Expr->print(Str, MAI);
- OutStreamer->emitRawComment(Twine(" mask branch ") + BBStr);
- }
-
- return;
- }
-
if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
if (isVerbose())
OutStreamer->emitRawComment(" return to shader part epilog");
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index b6a69b2..697513b 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -1419,11 +1419,7 @@
static bool isPHIRegionIndex(SmallVector<unsigned, 2> PHIRegionIndices,
unsigned Index) {
- for (auto i : PHIRegionIndices) {
- if (i == Index)
- return true;
- }
- return false;
+ return llvm::is_contained(PHIRegionIndices, Index);
}
bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 717145b..0c743a7 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -28,12 +28,10 @@
const Function &F = MF.getFunction();
Attribute MemBoundAttr = F.getFnAttribute("amdgpu-memory-bound");
- MemoryBound = MemBoundAttr.isStringAttribute() &&
- MemBoundAttr.getValueAsString() == "true";
+ MemoryBound = MemBoundAttr.getValueAsBool();
Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
- WaveLimiter = WaveLimitAttr.isStringAttribute() &&
- WaveLimitAttr.getValueAsString() == "true";
+ WaveLimiter = WaveLimitAttr.getValueAsBool();
CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
@@ -64,6 +62,18 @@
return Offset;
}
+void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
+ if (isModuleEntryFunction()) {
+ const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
+ if (GV) {
+ unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
+ (void)Offset;
+ assert(Offset == 0 &&
+ "Module LDS expected to be allocated before other LDS");
+ }
+ }
+}
+
void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
const GlobalVariable &GV) {
assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 07cac77..10ff500 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -94,6 +94,7 @@
}
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
+ void allocateModuleLDSGlobal(const Module *M);
Align getDynLDSAlign() const { return DynLDSAlign; }
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
index 82c6d75..ad198a3 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
@@ -6,6 +6,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACROFUSION_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACROFUSION_H
+
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include <memory>
@@ -17,3 +20,5 @@
std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation();
} // llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMACROFUSION_H
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
index 756bc94..8af7979 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -26,22 +26,6 @@
const char NoteNameV2[] = "AMD";
const char NoteNameV3[] = "AMDGPU";
-// TODO: Remove this file once we drop code object v2.
-enum NoteType{
- NT_AMDGPU_HSA_RESERVED_0 = 0,
- NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1,
- NT_AMDGPU_HSA_HSAIL = 2,
- NT_AMDGPU_HSA_ISA = 3,
- NT_AMDGPU_HSA_PRODUCER = 4,
- NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5,
- NT_AMDGPU_HSA_EXTENSION = 6,
- NT_AMDGPU_HSA_RESERVED_7 = 7,
- NT_AMDGPU_HSA_RESERVED_8 = 8,
- NT_AMDGPU_HSA_RESERVED_9 = 9,
- NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101,
- NT_AMDGPU_HSA_HLDEBUG_TARGET = 102
-};
-
} // End namespace ElfNote
} // End namespace AMDGPU
} // End namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 2f6220e4..2aa0229 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
@@ -208,19 +209,22 @@
for (auto &B : F) {
LastAccess = MemAccessInfo();
for (auto &I : B) {
- if (getMemoryInstrPtr(&I)) {
+ if (const Value *Ptr = getMemoryInstrPtr(&I)) {
+ unsigned Size = divideCeil(
+ Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+ 32);
if (isIndirectAccess(&I))
- ++FI.IAMInstCount;
+ FI.IAMInstCost += Size;
if (isLargeStride(&I))
- ++FI.LSMInstCount;
- ++FI.MemInstCount;
- ++FI.InstCount;
+ FI.LSMInstCost += Size;
+ FI.MemInstCost += Size;
+ FI.InstCost += Size;
continue;
}
if (auto *CB = dyn_cast<CallBase>(&I)) {
Function *Callee = CB->getCalledFunction();
if (!Callee || Callee->isDeclaration()) {
- ++FI.InstCount;
+ ++FI.InstCost;
continue;
}
if (&F == Callee) // Handle immediate recursion
@@ -230,10 +234,10 @@
if (Loc == FIM.end())
continue;
- FI.MemInstCount += Loc->second.MemInstCount;
- FI.InstCount += Loc->second.InstCount;
- FI.IAMInstCount += Loc->second.IAMInstCount;
- FI.LSMInstCount += Loc->second.LSMInstCount;
+ FI.MemInstCost += Loc->second.MemInstCost;
+ FI.InstCost += Loc->second.InstCost;
+ FI.IAMInstCost += Loc->second.IAMInstCost;
+ FI.LSMInstCost += Loc->second.LSMInstCost;
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
TargetLoweringBase::AddrMode AM;
auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
@@ -243,9 +247,9 @@
GEP->getPointerAddressSpace()))
// Offset will likely be folded into load or store
continue;
- ++FI.InstCount;
+ ++FI.InstCost;
} else {
- ++FI.InstCount;
+ ++FI.InstCost;
}
}
}
@@ -263,11 +267,11 @@
const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);
- LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
+ LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
<< '\n'
- << " IAMInst: " << Info->IAMInstCount << '\n'
- << " LSMInst: " << Info->LSMInstCount << '\n'
- << " TotalInst: " << Info->InstCount << '\n');
+ << " IAMInst cost: " << Info->IAMInstCost << '\n'
+ << " LSMInst cost: " << Info->LSMInstCost << '\n'
+ << " TotalInst cost: " << Info->InstCost << '\n');
if (isMemBound(*Info)) {
LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
@@ -285,13 +289,12 @@
}
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
- return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
+ return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}
bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
- return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
- FI.LSMInstCount * LSWeight) *
- 100 / FI.InstCount) > LimitWaveThresh;
+ return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
+ FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
}
bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
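
The switch from per-instruction counts to size-weighted costs above feeds the same two ratio tests. A standalone model of those tests (the threshold and weight values below are illustrative placeholders, not the pass's command-line defaults):

#include <cstdio>

struct FuncInfo {
  unsigned MemInstCost, InstCost, IAMInstCost, LSMInstCost;
};

// Mirrors AMDGPUPerfHint::isMemBound().
static bool isMemBound(const FuncInfo &FI, unsigned MemBoundThresh) {
  return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}

// Mirrors AMDGPUPerfHint::needLimitWave().
static bool needLimitWave(const FuncInfo &FI, unsigned LimitWaveThresh,
                          unsigned IAWeight, unsigned LSWeight) {
  return (FI.MemInstCost + FI.IAMInstCost * IAWeight +
          FI.LSMInstCost * LSWeight) * 100 / FI.InstCost > LimitWaveThresh;
}

int main() {
  // A single 128-bit memory access now contributes a cost of 4
  // (divideCeil(128, 32)) rather than 1, so wide traffic weighs more.
  FuncInfo FI = {/*MemInstCost=*/8, /*InstCost=*/12,
                 /*IAMInstCost=*/4, /*LSMInstCost=*/0};
  std::printf("mem-bound=%d limit-wave=%d\n", isMemBound(FI, 50),
              needLimitWave(FI, 50, 4, 3));
}
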
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index 99dbf50..31ff80f 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -37,12 +37,11 @@
bool needsWaveLimiter(const Function *F) const;
struct FuncInfo {
- unsigned MemInstCount;
- unsigned InstCount;
- unsigned IAMInstCount; // Indirect access memory instruction count
- unsigned LSMInstCount; // Large stride memory instruction count
- FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0),
- LSMInstCount(0) {}
+ unsigned MemInstCost;
+ unsigned InstCost;
+ unsigned IAMInstCost; // Indirect access memory instruction count
+ unsigned LSMInstCost; // Large stride memory instruction count
+ FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
};
typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 09e2c76..728be81 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -66,6 +66,8 @@
bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
void applyCvtF32UByteN(MachineInstr &MI,
const CvtF32UByteMatchInfo &MatchInfo);
+
+ bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
};
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
@@ -245,6 +247,14 @@
MI.eraseFromParent();
}
+bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
+ MachineInstr &MI, Register &Reg) {
+ const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
+ MF.getSubtarget().getTargetLowering());
+ Reg = MI.getOperand(1).getReg();
+ return TLI->isCanonicalized(Reg, MF);
+}
+
class AMDGPUPostLegalizerCombinerHelperState {
protected:
CombinerHelper &Helper;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index e4b628b..13f09ab 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -12,6 +12,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPULegalizerInfo.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -26,6 +29,141 @@
using namespace llvm;
using namespace MIPatternMatch;
+class AMDGPUPreLegalizerCombinerHelper {
+protected:
+ MachineIRBuilder &B;
+ MachineFunction &MF;
+ MachineRegisterInfo &MRI;
+ CombinerHelper &Helper;
+
+public:
+ AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+ : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
+
+ struct ClampI64ToI16MatchInfo {
+ int64_t Cmp1 = 0;
+ int64_t Cmp2 = 0;
+ Register Origin;
+ };
+
+ bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineFunction &MF,
+ ClampI64ToI16MatchInfo &MatchInfo);
+
+ void applyClampI64ToI16(MachineInstr &MI,
+ const ClampI64ToI16MatchInfo &MatchInfo);
+};
+
+bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
+ ClampI64ToI16MatchInfo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
+
+ // Try to find a pattern where an i64 value should get clamped to short.
+ const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
+ if (SrcType != LLT::scalar(64))
+ return false;
+
+ const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+ if (DstType != LLT::scalar(16))
+ return false;
+
+ Register Base;
+
+ auto IsApplicableForCombine = [&MatchInfo]() -> bool {
+ const auto Cmp1 = MatchInfo.Cmp1;
+ const auto Cmp2 = MatchInfo.Cmp2;
+ const auto Diff = std::abs(Cmp2 - Cmp1);
+
+ // If the difference between both comparison values is 0 or 1, there is no
+ // need to clamp.
+ if (Diff == 0 || Diff == 1)
+ return false;
+
+ const int64_t Min = std::numeric_limits<int16_t>::min();
+ const int64_t Max = std::numeric_limits<int16_t>::max();
+
+ // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
+ return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
+ (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
+ };
+
+ // Try to match a combination of min / max MIR opcodes.
+ if (mi_match(MI.getOperand(1).getReg(), MRI,
+ m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
+ if (mi_match(Base, MRI,
+ m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
+ return IsApplicableForCombine();
+ }
+ }
+
+ if (mi_match(MI.getOperand(1).getReg(), MRI,
+ m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
+ if (mi_match(Base, MRI,
+ m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
+ return IsApplicableForCombine();
+ }
+ }
+
+ return false;
+}
+
+// We want to find the combination of instructions that
+// gets generated when an i64 gets clamped to i16.
+// The corresponding pattern is:
+// G_SMIN / G_SMAX of an i64, feeding a G_TRUNC to i16.
+// This can be efficiently written as follows:
+// v_cvt_pk_i16_i32 v0, v0, v1
+// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
+void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
+ MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
+
+ Register Src = MatchInfo.Origin;
+ assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
+ LLT::scalar(64));
+ const LLT S32 = LLT::scalar(32);
+
+ B.setMBB(*MI.getParent());
+ B.setInstrAndDebugLoc(MI);
+
+ auto Unmerge = B.buildUnmerge(S32, Src);
+
+ assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
+
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
+ auto CvtPk =
+ B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
+ {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());
+
+ auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
+ auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
+ auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
+ auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
+
+ auto Bitcast = B.buildBitcast({S32}, CvtPk);
+
+ auto Med3 = B.buildInstr(
+ AMDGPU::G_AMDGPU_SMED3, {S32},
+ {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
+ MI.getFlags());
+
+ B.buildTrunc(MI.getOperand(0).getReg(), Med3);
+
+ MI.eraseFromParent();
+}
+
+class AMDGPUPreLegalizerCombinerHelperState {
+protected:
+ CombinerHelper &Helper;
+ AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
+
+public:
+ AMDGPUPreLegalizerCombinerHelperState(
+ CombinerHelper &Helper,
+ AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
+ : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
+};
+
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
@@ -59,12 +197,16 @@
MachineInstr &MI,
MachineIRBuilder &B) const {
CombinerHelper Helper(Observer, B, KB, MDT);
- AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+ AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
+ AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
+ PreLegalizerHelper);
if (Generated.tryCombineAll(Observer, MI, B, Helper))
return true;
switch (MI.getOpcode()) {
+ case TargetOpcode::G_MEMCPY_INLINE:
+ return Helper.tryEmitMemcpyInline(MI);
case TargetOpcode::G_CONCAT_VECTORS:
return Helper.tryCombineConcatVectors(MI);
case TargetOpcode::G_SHUFFLE_VECTOR:
@@ -109,6 +251,9 @@
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
}
+
+ AU.addRequired<GISelCSEAnalysisWrapperPass>();
+ AU.addPreserved<GISelCSEAnalysisWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -130,8 +275,13 @@
IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
F.hasMinSize(), KB, MDT);
+ // Enable CSE.
+ GISelCSEAnalysisWrapper &Wrapper =
+ getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
+ auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
+
Combiner C(PCInfo, TPC);
- return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+ return C.combineMachineInstrs(MF, CSEInfo);
}
char AMDGPUPreLegalizerCombiner::ID = 0;
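
A scalar reference for the new clamp combine, under the conditions checked in matchClampI64ToI16 (both constants inside the i16 range and more than 1 apart): the matched smin/smax pair is a saturating truncation, which is what the emitted v_cvt_pk_i16_i32 + v_med3_i32 sequence is meant to implement. Sketch only; the matcher also accepts the min/max nesting in either order.

#include <algorithm>
#include <cassert>
#include <cstdint>

// clamp-then-trunc, the behavior the combine preserves.
static int16_t clampI64ToI16(int64_t V, int64_t Lo, int64_t Hi) {
  return static_cast<int16_t>(std::min(std::max(V, Lo), Hi));
}

int main() {
  const int64_t Lo = INT16_MIN, Hi = INT16_MAX;
  assert(clampI64ToI16(1000000, Lo, Hi) == INT16_MAX);
  assert(clampI64ToI16(-1000000, Lo, Hi) == INT16_MIN);
  assert(clampI64ToI16(1234, Lo, Hi) == 1234);
  return 0;
}
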
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index c8bd9b9..7b6959b 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -323,7 +323,8 @@
Type *SizetTy = Type::getInt32Ty(Ctx);
Type *Tys_alloc[1] = {SizetTy};
- Type *I8Ptr = PointerType::get(Type::getInt8Ty(Ctx), 1);
+ Type *I8Ty = Type::getInt8Ty(Ctx);
+ Type *I8Ptr = PointerType::get(I8Ty, 1);
FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false);
FunctionCallee PrintfAllocFn =
M.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, Attr);
@@ -355,9 +356,8 @@
// basicblock splits after buffer overflow check
//
ConstantPointerNull *zeroIntPtr =
- ConstantPointerNull::get(PointerType::get(Type::getInt8Ty(Ctx), 1));
- ICmpInst *cmp =
- dyn_cast<ICmpInst>(Builder.CreateICmpNE(pcall, zeroIntPtr, ""));
+ ConstantPointerNull::get(PointerType::get(I8Ty, 1));
+ auto *cmp = cast<ICmpInst>(Builder.CreateICmpNE(pcall, zeroIntPtr, ""));
if (!CI->use_empty()) {
Value *result =
Builder.CreateSExt(Builder.CreateNot(cmp), I32Ty, "printf_res");
@@ -371,13 +371,9 @@
// store unique printf id in the buffer
//
- SmallVector<Value *, 1> ZeroIdxList;
- ConstantInt *zeroInt =
- ConstantInt::get(Ctx, APInt(32, StringRef("0"), 10));
- ZeroIdxList.push_back(zeroInt);
-
GetElementPtrInst *BufferIdx = GetElementPtrInst::Create(
- nullptr, pcall, ZeroIdxList, "PrintBuffID", Brnch);
+ I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 0)), "PrintBuffID",
+ Brnch);
Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS);
Value *id_gep_cast =
@@ -385,14 +381,11 @@
new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast, Brnch);
- SmallVector<Value *, 2> FourthIdxList;
- ConstantInt *fourInt =
- ConstantInt::get(Ctx, APInt(32, StringRef("4"), 10));
-
- FourthIdxList.push_back(fourInt); // 1st 4 bytes hold the printf_id
+ // 1st 4 bytes hold the printf_id
// the following GEP is the buffer pointer
- BufferIdx = GetElementPtrInst::Create(nullptr, pcall, FourthIdxList,
- "PrintBuffGep", Brnch);
+ BufferIdx = GetElementPtrInst::Create(
+ I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 4)), "PrintBuffGep",
+ Brnch);
Type *Int32Ty = Type::getInt32Ty(Ctx);
Type *Int64Ty = Type::getInt64Ty(Ctx);
@@ -533,7 +526,7 @@
(void)StBuff;
if (I + 1 == E && ArgCount + 1 == CI->getNumArgOperands())
break;
- BufferIdx = GetElementPtrInst::Create(nullptr, BufferIdx, BuffOffset,
+ BufferIdx = GetElementPtrInst::Create(I8Ty, BufferIdx, BuffOffset,
"PrintBuffNextPtr", Brnch);
LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n"
<< *BufferIdx << '\n');
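
The rewritten GEPs above make the buffer layout explicit: the printf id is stored in the first 4 bytes of the __printf_alloc result and the packed arguments follow from byte offset 4. A host-side sketch of that layout (offsets only; the argument encoding itself depends on the call site):

#include <cstdint>
#include <cstring>

static void writePrintfHeader(uint8_t *Buffer, uint32_t UniqID) {
  std::memcpy(Buffer, &UniqID, sizeof(UniqID)); // "PrintBuffID": bytes [0, 4)
  // "PrintBuffGep": the first argument slot starts at Buffer + 4.
}

int main() {
  uint8_t Buf[64] = {};
  writePrintfHeader(Buf, 7);
  uint32_t Id = 0;
  std::memcpy(&Id, Buf, sizeof(Id));
  return Id == 7 ? 0 : 1;
}
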
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 2a6ea83..3f1f21a 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -126,8 +126,13 @@
char AMDGPUPromoteAlloca::ID = 0;
char AMDGPUPromoteAllocaToVector::ID = 0;
-INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
- "AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
+ "AMDGPU promote alloca to vector or LDS", false, false)
+// Move LDS uses from functions to kernels before promote alloca for an
+// accurate estimate of the LDS available.
+INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
+INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
+ "AMDGPU promote alloca to vector or LDS", false, false)
INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
"AMDGPU promote alloca to vector", false, false)
@@ -656,6 +661,11 @@
continue;
}
+ // Do not promote vector/aggregate type instructions. It is hard to track
+ // their users.
+ if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User))
+ return false;
+
if (!User->getType()->isPointerTy())
continue;
@@ -943,13 +953,15 @@
I.replaceAllUsesWith(Offset);
I.eraseFromParent();
+ SmallVector<IntrinsicInst *> DeferredIntrs;
+
for (Value *V : WorkList) {
CallInst *Call = dyn_cast<CallInst>(V);
if (!Call) {
if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
Value *Src0 = CI->getOperand(0);
- Type *EltTy = Src0->getType()->getPointerElementType();
- PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+ PointerType *NewTy = PointerType::getWithSamePointeeType(
+ cast<PointerType>(Src0->getType()), AMDGPUAS::LOCAL_ADDRESS);
if (isa<ConstantPointerNull>(CI->getOperand(0)))
CI->setOperand(0, ConstantPointerNull::get(NewTy));
@@ -965,8 +977,8 @@
if (isa<AddrSpaceCastInst>(V))
continue;
- Type *EltTy = V->getType()->getPointerElementType();
- PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+ PointerType *NewTy = PointerType::getWithSamePointeeType(
+ cast<PointerType>(V->getType()), AMDGPUAS::LOCAL_ADDRESS);
// FIXME: It doesn't really make sense to try to do this for all
// instructions.
@@ -997,22 +1009,13 @@
// These intrinsics are for address space 0 only
Intr->eraseFromParent();
continue;
- case Intrinsic::memcpy: {
- MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
- Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlign(),
- MemCpy->getRawSource(), MemCpy->getSourceAlign(),
- MemCpy->getLength(), MemCpy->isVolatile());
- Intr->eraseFromParent();
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+      // These have 2 pointer operands. In case the second pointer also needs
+      // to be replaced, we defer processing of these intrinsics until all
+      // other values are processed.
+ DeferredIntrs.push_back(Intr);
continue;
- }
- case Intrinsic::memmove: {
- MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
- Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlign(),
- MemMove->getRawSource(), MemMove->getSourceAlign(),
- MemMove->getLength(), MemMove->isVolatile());
- Intr->eraseFromParent();
- continue;
- }
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
Builder.CreateMemSet(
@@ -1032,11 +1035,11 @@
continue;
case Intrinsic::objectsize: {
Value *Src = Intr->getOperand(0);
- Type *SrcTy = Src->getType()->getPointerElementType();
- Function *ObjectSize = Intrinsic::getDeclaration(Mod,
- Intrinsic::objectsize,
- { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
- );
+ Function *ObjectSize = Intrinsic::getDeclaration(
+ Mod, Intrinsic::objectsize,
+ {Intr->getType(),
+ PointerType::getWithSamePointeeType(
+ cast<PointerType>(Src->getType()), AMDGPUAS::LOCAL_ADDRESS)});
CallInst *NewCall = Builder.CreateCall(
ObjectSize,
@@ -1050,6 +1053,27 @@
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
}
}
+
+ for (IntrinsicInst *Intr : DeferredIntrs) {
+ Builder.SetInsertPoint(Intr);
+ Intrinsic::ID ID = Intr->getIntrinsicID();
+ assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
+
+ MemTransferInst *MI = cast<MemTransferInst>(Intr);
+ auto *B =
+ Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
+ MI->getRawSource(), MI->getSourceAlign(),
+ MI->getLength(), MI->isVolatile());
+
+ for (unsigned I = 1; I != 3; ++I) {
+ if (uint64_t Bytes = Intr->getDereferenceableBytes(I)) {
+ B->addDereferenceableAttr(I, Bytes);
+ }
+ }
+
+ Intr->eraseFromParent();
+ }
+
return true;
}
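
The DeferredIntrs list above exists because memcpy/memmove name two pointers, either of which may still be waiting to be rewritten into the LDS address space; single-pointer users are handled first, the two-pointer intrinsics afterwards. A generic sketch of that two-phase rewrite (toy data structures, not the pass's types):

#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // Maps an original value name to its rewritten (LDS address space) form.
  std::unordered_map<std::string, std::string> Rewritten;
  // A deferred copy records both of its pointer operands.
  std::vector<std::pair<std::string, std::string>> DeferredCopies = {
      {"dst", "src"}};

  // Phase 1: rewrite every single-pointer user, filling in the mapping.
  Rewritten["dst"] = "dst.lds";
  Rewritten["src"] = "src.lds";

  // Phase 2: only now can both operands of a deferred copy be remapped.
  for (auto &Copy : DeferredCopies) {
    Copy.first = Rewritten[Copy.first];
    Copy.second = Rewritten[Copy.second];
  }
  return DeferredCopies[0].second == "src.lds" ? 0 : 1;
}
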
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
index cd71c7a..0e4c261 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -249,7 +249,11 @@
if (!I)
continue;
CallBase *CI = dyn_cast<CallBase>(I);
- if (!CI)
+ // Only propagate attributes if F is the called function. Specifically,
+ // do not propagate attributes if F is passed as an argument.
+ // FIXME: handle bitcasted callee, e.g.
+ // %retval = call i8* bitcast (i32* ()* @f to i8* ()*)()
+ if (!CI || CI->getCalledOperand() != &F)
continue;
Function *Caller = CI->getCaller();
if (!Caller || !Visited.insert(CI).second)
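
The new getCalledOperand() check draws a line between a call to F and a mere mention of F; feature attributes should only flow along real call edges. A plain C++ analogue of the two kinds of uses:

#include <cstdio>

static void f() { std::puts("f called"); }
static void runCallback(void (*Cb)()) { Cb(); }

int main() {
  f();            // Direct call: f is the called operand, so propagate.
  runCallback(f); // f only appears as an argument here, so do not propagate
                  // from this call site (the callee is runCallback).
  return 0;
}
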
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index d644c03..4e12e5c 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -13,7 +13,9 @@
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
+#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -27,6 +29,126 @@
using namespace llvm;
using namespace MIPatternMatch;
+class AMDGPURegBankCombinerHelper {
+protected:
+ MachineIRBuilder &B;
+ MachineFunction &MF;
+ MachineRegisterInfo &MRI;
+ const RegisterBankInfo &RBI;
+ const TargetRegisterInfo &TRI;
+ CombinerHelper &Helper;
+
+public:
+ AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+ : B(B), MF(B.getMF()), MRI(*B.getMRI()),
+ RBI(*MF.getSubtarget().getRegBankInfo()),
+ TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){};
+
+ bool isVgprRegBank(Register Reg);
+
+ struct MinMaxMedOpc {
+ unsigned Min, Max, Med;
+ };
+
+ struct Med3MatchInfo {
+ unsigned Opc;
+ Register Val0, Val1, Val2;
+ };
+
+ MinMaxMedOpc getMinMaxPair(unsigned Opc);
+
+ template <class m_Cst>
+ bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc,
+ Register &Val, Register &K0, Register &K1);
+
+ bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
+ void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
+};
+
+bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) {
+ return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID;
+}
+
+AMDGPURegBankCombinerHelper::MinMaxMedOpc
+AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unsupported opcode");
+ case AMDGPU::G_SMAX:
+ case AMDGPU::G_SMIN:
+ return {AMDGPU::G_SMIN, AMDGPU::G_SMAX, AMDGPU::G_AMDGPU_SMED3};
+ case AMDGPU::G_UMAX:
+ case AMDGPU::G_UMIN:
+ return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3};
+ }
+}
+
+template <class m_Cst>
+bool AMDGPURegBankCombinerHelper::matchMed(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MinMaxMedOpc MMMOpc, Register &Val,
+ Register &K0, Register &K1) {
+ // 4 operand commutes of: min(max(Val, K0), K1).
+ // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)).
+ // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0).
+ // 4 operand commutes of: max(min(Val, K1), K0).
+ // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)).
+ // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1).
+ return mi_match(
+ MI, MRI,
+ m_any_of(
+ m_CommutativeBinOp(
+ MMMOpc.Min, m_CommutativeBinOp(MMMOpc.Max, m_Reg(Val), m_Cst(K0)),
+ m_Cst(K1)),
+ m_CommutativeBinOp(
+ MMMOpc.Max, m_CommutativeBinOp(MMMOpc.Min, m_Reg(Val), m_Cst(K1)),
+ m_Cst(K0))));
+}
+
+bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3(
+ MachineInstr &MI, Med3MatchInfo &MatchInfo) {
+ Register Dst = MI.getOperand(0).getReg();
+ if (!isVgprRegBank(Dst))
+ return false;
+
+ if (MRI.getType(Dst).isVector())
+ return false;
+
+ MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode());
+ Register Val, K0, K1;
+ // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
+ if (!matchMed<ICstRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
+ return false;
+
+ const APInt &K0_Imm = getConstantIntVRegVal(K0, MRI)->getValue();
+ const APInt &K1_Imm = getConstantIntVRegVal(K1, MRI)->getValue();
+ if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0_Imm.sgt(K1_Imm))
+ return false;
+ if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0_Imm.ugt(K1_Imm))
+ return false;
+
+ MatchInfo = {OpcodeTriple.Med, Val, K0, K1};
+ return true;
+}
+
+void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI,
+ Med3MatchInfo &MatchInfo) {
+ B.setInstrAndDebugLoc(MI);
+ B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)},
+ {MatchInfo.Val0, MatchInfo.Val1, MatchInfo.Val2}, MI.getFlags());
+ MI.eraseFromParent();
+}
+
+class AMDGPURegBankCombinerHelperState {
+protected:
+ CombinerHelper &Helper;
+ AMDGPURegBankCombinerHelper &RegBankHelper;
+
+public:
+ AMDGPURegBankCombinerHelperState(CombinerHelper &Helper,
+ AMDGPURegBankCombinerHelper &RegBankHelper)
+ : Helper(Helper), RegBankHelper(RegBankHelper) {}
+};
#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenRegBankGICombiner.inc"
@@ -62,9 +184,11 @@
MachineInstr &MI,
MachineIRBuilder &B) const {
CombinerHelper Helper(Observer, B, KB, MDT);
- AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg);
+ AMDGPURegBankCombinerHelper RegBankHelper(B, Helper);
+ AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper,
+ RegBankHelper);
- if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ if (Generated.tryCombineAll(Observer, MI, B))
return true;
return false;
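
The regbank combine above rewrites min(max(x, K0), K1), in any of its commutations, into a single med3 with operands {K0, x, K1}; matchIntMinMaxToMed3 only fires when K0 <= K1 in the matching signedness, since that is when the two forms agree. A quick exhaustive check of that identity for one constant pair:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Median of three, the scalar semantics of G_AMDGPU_SMED3.
static int32_t med3(int32_t A, int32_t B, int32_t C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

int main() {
  const int32_t K0 = -8, K1 = 42; // K0 <= K1, so the combine is allowed
  for (int32_t X = -128; X <= 128; ++X)
    assert(std::min(std::max(X, K0), K1) == med3(K0, X, K1));
  return 0;
}
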
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 502356d..0e40056 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -443,9 +443,8 @@
const unsigned AS = MMO->getAddrSpace();
const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
-
- // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
- return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
+ // Require 4-byte alignment.
+ return MMO->getAlign() >= Align(4) &&
// Can't do a scalar atomic load.
!MMO->isAtomic() &&
// Don't use scalar loads for volatile accesses to non-constant address
@@ -591,21 +590,6 @@
return AltMappings;
}
- case TargetOpcode::G_SMIN:
- case TargetOpcode::G_SMAX:
- case TargetOpcode::G_UMIN:
- case TargetOpcode::G_UMAX: {
- static const OpRegBankEntry<3> Table[2] = {
- { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
-
- // Scalar requires cmp+select, and extends if 16-bit.
- // FIXME: Should there be separate costs for 32 and 16-bit
- { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
- };
-
- const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
- return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
- }
case TargetOpcode::G_UADDE:
case TargetOpcode::G_USUBE:
case TargetOpcode::G_SADDE:
@@ -691,12 +675,13 @@
static LLT getHalfSizedType(LLT Ty) {
if (Ty.isVector()) {
- assert(Ty.getNumElements() % 2 == 0);
- return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
+ assert(Ty.getElementCount().isKnownMultipleOf(2));
+ return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
+ Ty.getElementType());
}
- assert(Ty.getSizeInBits() % 2 == 0);
- return LLT::scalar(Ty.getSizeInBits() / 2);
+ assert(Ty.getScalarSizeInBits() % 2 == 0);
+ return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}
/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
@@ -1139,8 +1124,8 @@
unsigned FirstPartNumElts = FirstSize / EltSize;
unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
- return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
- LLT::scalarOrVector(RemainderElts, EltTy)};
+ return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
+ LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}
static LLT widen96To128(LLT Ty) {
@@ -1149,7 +1134,7 @@
LLT EltTy = Ty.getElementType();
assert(128 % EltTy.getSizeInBits() == 0);
- return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
+ return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}
bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
@@ -1160,34 +1145,61 @@
unsigned LoadSize = LoadTy.getSizeInBits();
const unsigned MaxNonSmrdLoadSize = 128;
- const RegisterBank *PtrBank =
- OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
- if (PtrBank == &AMDGPU::SGPRRegBank) {
- // If the pointer is an SGPR, we ordinarily have nothing to do.
- if (LoadSize != 96)
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank == &AMDGPU::SGPRRegBank) {
+  // There are some special cases that we need to look at for 32-bit and
+  // 96-bit SGPR loads; otherwise we have nothing to do.
+ if (LoadSize != 32 && LoadSize != 96)
return false;
MachineMemOperand *MMO = *MI.memoperands_begin();
+ const unsigned MemSize = 8 * MMO->getSize();
+  // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
+  // 32 bit. Check to see if we need to widen the memory access: 8- or 16-bit
+  // scalar loads should have a load size of 32 but a memory access size of
+  // less than 32.
+ if (LoadSize == 32 &&
+ (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
+ return false;
+
Register PtrReg = MI.getOperand(1).getReg();
- // 96-bit loads are only available for vector loads. We need to split this
- // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
MachineIRBuilder B(MI, O);
- if (MMO->getAlign() < Align(16)) {
- LLT Part64, Part32;
- std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
- auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
- auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
-
- auto Undef = B.buildUndef(LoadTy);
- auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
- B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
+ if (LoadSize == 32) {
+ // This is an extending load from a sub-dword size. Widen the memory
+ // access size to 4 bytes and clear the extra high bits appropriately
+ const LLT S32 = LLT::scalar(32);
+ if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
+ // Must extend the sign bit into higher bits for a G_SEXTLOAD
+ auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
+ B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
+ } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
+ // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
+ auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
+ B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
+ } else
+ // We do not need to touch the higher bits for regular loads.
+ B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
} else {
- LLT WiderTy = widen96To128(LoadTy);
- auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
- B.buildExtract(MI.getOperand(0), WideLoad, 0);
+ // 96-bit loads are only available for vector loads. We need to split this
+ // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
+ if (MMO->getAlign() < Align(16)) {
+ LLT Part64, Part32;
+ std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
+ auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
+ auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
+
+ auto Undef = B.buildUndef(LoadTy);
+ auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
+ B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
+ } else {
+ LLT WiderTy = widen96To128(LoadTy);
+ auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
+ B.buildExtract(MI.getOperand(0), WideLoad, 0);
+ }
}
MI.eraseFromParent();
@@ -1345,8 +1357,8 @@
AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
uint32_t SOffset, ImmOffset;
- if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
- &RBI.Subtarget, Alignment)) {
+ if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
+ &RBI.Subtarget, Alignment)) {
if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
VOffsetReg = Base;
SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
@@ -1366,7 +1378,8 @@
}
// Handle the variable sgpr + vgpr case.
- if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
+ MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
+ if (Add && (int)Offset >= 0) {
Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
@@ -1519,8 +1532,8 @@
return true;
}
-bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
- const OperandsMapper &OpdMapper, bool Signed) const {
+bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
+ bool Signed) const {
MachineInstr &MI = OpdMapper.getMI();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
@@ -1532,19 +1545,69 @@
const LLT S32 = LLT::scalar(32);
+ unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
+ Register SrcReg = MI.getOperand(FirstOpnd).getReg();
+ Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
+ Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
+
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::VGPRRegBank) {
if (Ty == S32)
return true;
- // TODO: 64-bit version is scalar only, so we need to expand this.
- return false;
- }
+    // There is no 64-bit vgpr bitfield extract instruction, so the operation
+    // is expanded to a sequence of instructions that implement it.
+ ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
+ MachineIRBuilder B(MI, ApplyBank);
- Register SrcReg = MI.getOperand(2).getReg();
- Register OffsetReg = MI.getOperand(3).getReg();
- Register WidthReg = MI.getOperand(4).getReg();
+ const LLT S64 = LLT::scalar(64);
+ // Shift the source operand so that extracted bits start at bit 0.
+ auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
+ : B.buildLShr(S64, SrcReg, OffsetReg);
+ auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
+
+ // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
+ // if the width is a constant.
+ if (auto ConstWidth = getConstantVRegValWithLookThrough(WidthReg, MRI)) {
+ // Use the 32-bit bitfield extract instruction if the width is a constant.
+ // Depending on the width size, use either the low or high 32-bits.
+ auto Zero = B.buildConstant(S32, 0);
+ auto WidthImm = ConstWidth->Value.getZExtValue();
+ if (WidthImm <= 32) {
+ // Use bitfield extract on the lower 32-bit source, and then sign-extend
+ // or clear the upper 32-bits.
+ auto Extract =
+ Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
+ : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
+ auto Extend =
+ Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
+ B.buildMerge(DstReg, {Extract, Extend});
+ } else {
+ // Use bitfield extract on upper 32-bit source, and combine with lower
+ // 32-bit source.
+ auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
+ auto Extract =
+ Signed
+ ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
+ : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
+ B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
+ }
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
+ // operations.
+ auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
+ auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
+ if (Signed)
+ B.buildAShr(S64, SignBit, ExtShift);
+ else
+ B.buildLShr(S64, SignBit, ExtShift);
+ MI.eraseFromParent();
+ return true;
+ }
// The scalar form packs the offset and width in a single operand.
@@ -1576,32 +1639,19 @@
return true;
}
-// FIXME: Duplicated from LegalizerHelper
-static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
+// Return a suitable opcode for extending the operands of Opc when widening.
+static unsigned getExtendOp(unsigned Opc) {
switch (Opc) {
- case TargetOpcode::G_SMIN:
- return CmpInst::ICMP_SLT;
- case TargetOpcode::G_SMAX:
- return CmpInst::ICMP_SGT;
- case TargetOpcode::G_UMIN:
- return CmpInst::ICMP_ULT;
- case TargetOpcode::G_UMAX:
- return CmpInst::ICMP_UGT;
- default:
- llvm_unreachable("not in integer min/max");
- }
-}
-
-static unsigned minMaxToExtend(unsigned Opc) {
- switch (Opc) {
+ case TargetOpcode::G_ASHR:
case TargetOpcode::G_SMIN:
case TargetOpcode::G_SMAX:
return TargetOpcode::G_SEXT;
+ case TargetOpcode::G_LSHR:
case TargetOpcode::G_UMIN:
case TargetOpcode::G_UMAX:
return TargetOpcode::G_ZEXT;
default:
- llvm_unreachable("not in integer min/max");
+ return TargetOpcode::G_ANYEXT;
}
}
@@ -1628,30 +1678,6 @@
return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
}
-static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B,
- CmpInst::Predicate Pred,
- Register Dst, Register Src0,
- Register Src1) {
- const LLT CmpType = LLT::scalar(32);
- auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
- return B.buildSelect(Dst, Cmp, Src0, Src1);
-}
-
-// FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
-void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
- MachineInstr &MI) const {
- Register Dst = MI.getOperand(0).getReg();
- Register Src0 = MI.getOperand(1).getReg();
- Register Src1 = MI.getOperand(2).getReg();
-
- const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
- MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1);
-
- Register CmpReg = Sel->getOperand(1).getReg();
- B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank);
- MI.eraseFromParent();
-}
-
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
static bool substituteSimpleCopyRegs(
@@ -1688,7 +1714,7 @@
const LLT S32 = LLT::scalar(32);
int NumElts = StoreVT.getNumElements();
- return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+ return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
}
static std::pair<Register, unsigned>
@@ -1754,17 +1780,14 @@
return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
}
-static unsigned extractGLC(unsigned CachePolicy) {
- return CachePolicy & 1;
+static unsigned extractCPol(unsigned CachePolicy) {
+ return CachePolicy & AMDGPU::CPol::ALL;
}
-static unsigned extractSLC(unsigned CachePolicy) {
- return (CachePolicy >> 1) & 1;
+static unsigned extractSWZ(unsigned CachePolicy) {
+ return (CachePolicy >> 3) & 1;
}
-static unsigned extractDLC(unsigned CachePolicy) {
- return (CachePolicy >> 2) & 1;
-}
MachineInstr *
AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
@@ -1830,10 +1853,9 @@
MIB.addUse(RSrc)
.addUse(SOffset)
.addImm(ImmOffset)
- .addImm(extractGLC(CachePolicy))
- .addImm(extractSLC(CachePolicy))
+ .addImm(extractCPol(CachePolicy))
.addImm(0) // tfe: FIXME: Remove from inst
- .addImm(extractDLC(CachePolicy))
+ .addImm(extractSWZ(CachePolicy))
.cloneMemRefs(MI);
// FIXME: We need a way to report failure from applyMappingImpl.
@@ -2006,6 +2028,22 @@
return true;
}
+// Insert a cross regbank copy for a register if it already has a bank that
+// differs from the one we want to set.
+static Register constrainRegToBank(MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, Register &Reg,
+ const RegisterBank &Bank) {
+ const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
+ if (CurrBank && *CurrBank != Bank) {
+ Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
+ MRI.setRegBank(Copy, Bank);
+ return Copy;
+ }
+
+ MRI.setRegBank(Reg, Bank);
+ return Reg;
+}
+
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
MachineInstr &MI, MachineRegisterInfo &MRI,
const OperandsMapper &OpdMapper) const {
@@ -2069,17 +2107,18 @@
MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
for (unsigned L = 0; L < NumLanes; ++L) {
- auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
- UnmergeToEltTy.getReg(I * NumLanes + L));
+ Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
+ Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
+ Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
- for (unsigned N : { 0, 2, 3 })
- MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
+ Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
+ MRI.setRegBank(Select, DstBank);
- Ops[I * NumLanes + L] = S->getOperand(0).getReg();
+ Ops[I * NumLanes + L] = Select;
}
}
- LLT MergeTy = LLT::vector(Ops.size(), EltTy);
+ LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
B.buildBuildVector(MI.getOperand(0), Ops);
} else {
@@ -2336,18 +2375,40 @@
MI.eraseFromParent();
return;
}
+ case AMDGPU::G_ABS: {
+ Register SrcReg = MI.getOperand(1).getReg();
+ const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
+
+ // There is no VALU abs instruction so we need to replace it with a sub and
+ // max combination.
+ if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
+ MachineFunction *MF = MI.getParent()->getParent();
+ ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
+ MachineIRBuilder B(MI, Apply);
+ LegalizerHelper Helper(*MF, Apply, B);
+
+ if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
+ llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
+ return;
+ }
+ LLVM_FALLTHROUGH;
+ }
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
case AMDGPU::G_MUL:
case AMDGPU::G_SHL:
case AMDGPU::G_LSHR:
- case AMDGPU::G_ASHR: {
+ case AMDGPU::G_ASHR:
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX:
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
// Packed 16-bit operations need to be scalarized and promoted.
- if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16))
+ if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
break;
const RegisterBank *DstBank =
@@ -2365,10 +2426,11 @@
Register WideSrc0Lo, WideSrc0Hi;
Register WideSrc1Lo, WideSrc1Hi;
+ unsigned ExtendOp = getExtendOp(MI.getOpcode());
std::tie(WideSrc0Lo, WideSrc0Hi)
- = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT);
+ = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
std::tie(WideSrc1Lo, WideSrc1Hi)
- = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT);
+ = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
@@ -2390,73 +2452,6 @@
return;
}
- case AMDGPU::G_SMIN:
- case AMDGPU::G_SMAX:
- case AMDGPU::G_UMIN:
- case AMDGPU::G_UMAX: {
- Register DstReg = MI.getOperand(0).getReg();
- const RegisterBank *DstBank =
- OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
- if (DstBank == &AMDGPU::VGPRRegBank)
- break;
-
- MachineFunction *MF = MI.getParent()->getParent();
- MachineIRBuilder B(MI);
-
- // Turn scalar min/max into a compare and select.
- LLT Ty = MRI.getType(DstReg);
- const LLT S32 = LLT::scalar(32);
- const LLT S16 = LLT::scalar(16);
- const LLT V2S16 = LLT::vector(2, 16);
-
- if (Ty == V2S16) {
- ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
- B.setChangeObserver(ApplySALU);
-
- // Need to widen to s32, and expand as cmp + select, and avoid producing
- // illegal vector extends or unmerges that would need further
- // legalization.
- //
- // TODO: Should we just readfirstlane? That should probably be handled
- // with a UniformVGPR register bank that wouldn't need special
- // consideration here.
-
- Register Dst = MI.getOperand(0).getReg();
- Register Src0 = MI.getOperand(1).getReg();
- Register Src1 = MI.getOperand(2).getReg();
-
- Register WideSrc0Lo, WideSrc0Hi;
- Register WideSrc1Lo, WideSrc1Hi;
-
- unsigned ExtendOp = minMaxToExtend(MI.getOpcode());
-
- std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp);
- std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp);
-
- Register Lo = MRI.createGenericVirtualRegister(S32);
- Register Hi = MRI.createGenericVirtualRegister(S32);
- const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
- buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo);
- buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi);
-
- B.buildBuildVectorTrunc(Dst, {Lo, Hi});
- MI.eraseFromParent();
- } else if (Ty == S16) {
- ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
- B.setChangeObserver(ApplySALU);
- LegalizerHelper Helper(*MF, ApplySALU, B);
-
- // Need to widen to s32, and expand as cmp + select.
- if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
- llvm_unreachable("widenScalar should have succeeded");
-
- // FIXME: This is relying on widenScalar leaving MI in place.
- lowerScalarMinMax(B, MI);
- } else
- lowerScalarMinMax(B, MI);
-
- return;
- }
case AMDGPU::G_SEXT_INREG: {
SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
if (SrcRegs.empty())
@@ -2496,6 +2491,7 @@
return;
}
case AMDGPU::G_CTPOP:
+ case AMDGPU::G_BITREVERSE:
case AMDGPU::G_CTLZ_ZERO_UNDEF:
case AMDGPU::G_CTTZ_ZERO_UNDEF: {
const RegisterBank *DstBank =
@@ -2605,7 +2601,7 @@
case AMDGPU::G_BUILD_VECTOR_TRUNC: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
- if (DstTy != LLT::vector(2, 16))
+ if (DstTy != LLT::fixed_vector(2, 16))
break;
assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
@@ -2737,7 +2733,7 @@
assert(DstTy.getSizeInBits() == 64);
- LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
+ LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
auto One = B.buildConstant(S32, 1);
@@ -2854,7 +2850,7 @@
assert(InsTy.getSizeInBits() == 64);
const LLT S32 = LLT::scalar(32);
- LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
+ LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
MachineIRBuilder B(MI);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
@@ -2953,7 +2949,9 @@
executeInWaterfallLoop(MI, MRI, {2, 5});
return;
}
- case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
applyDefaultMapping(OpdMapper);
executeInWaterfallLoop(MI, MRI, {2, 5});
return;
@@ -3012,10 +3010,10 @@
return;
}
case Intrinsic::amdgcn_sbfe:
- applyMappingBFEIntrinsic(OpdMapper, true);
+ applyMappingBFE(OpdMapper, true);
return;
case Intrinsic::amdgcn_ubfe:
- applyMappingBFEIntrinsic(OpdMapper, false);
+ applyMappingBFE(OpdMapper, false);
return;
case Intrinsic::amdgcn_ballot:
// Use default handling and insert copy to vcc source.
@@ -3107,6 +3105,12 @@
case AMDGPU::G_DYN_STACKALLOC:
applyMappingDynStackAlloc(MI, OpdMapper, MRI);
return;
+ case AMDGPU::G_SBFX:
+ applyMappingBFE(OpdMapper, /*Signed*/ true);
+ return;
+ case AMDGPU::G_UBFX:
+ applyMappingBFE(OpdMapper, /*Signed*/ false);
+ return;
default:
break;
}
@@ -3579,7 +3583,10 @@
case AMDGPU::G_SMAX:
case AMDGPU::G_UMIN:
case AMDGPU::G_UMAX:
+ case AMDGPU::G_ABS:
case AMDGPU::G_SHUFFLE_VECTOR:
+ case AMDGPU::G_SBFX:
+ case AMDGPU::G_UBFX:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
LLVM_FALLTHROUGH;
@@ -3621,6 +3628,8 @@
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
+ case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
+ case AMDGPU::G_AMDGPU_SMED3:
return getDefaultMappingVOP(MI);
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
@@ -3679,7 +3688,7 @@
case AMDGPU::G_BUILD_VECTOR:
case AMDGPU::G_BUILD_VECTOR_TRUNC: {
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
- if (DstTy == LLT::vector(2, 16)) {
+ if (DstTy == LLT::fixed_vector(2, 16)) {
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
@@ -3706,10 +3715,10 @@
OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
break;
}
+ case AMDGPU::G_BITREVERSE:
case AMDGPU::G_BITCAST:
case AMDGPU::G_INTTOPTR:
case AMDGPU::G_PTRTOINT:
- case AMDGPU::G_BITREVERSE:
case AMDGPU::G_FABS:
case AMDGPU::G_FNEG: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -3919,7 +3928,9 @@
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
- case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
// vdata_out
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
@@ -4033,6 +4044,7 @@
case Intrinsic::amdgcn_cvt_pk_u8_f32:
case Intrinsic::amdgcn_alignbit:
case Intrinsic::amdgcn_alignbyte:
+ case Intrinsic::amdgcn_perm:
case Intrinsic::amdgcn_fdot2:
case Intrinsic::amdgcn_sdot2:
case Intrinsic::amdgcn_udot2:
@@ -4052,7 +4064,9 @@
case Intrinsic::amdgcn_update_dpp:
case Intrinsic::amdgcn_mov_dpp8:
case Intrinsic::amdgcn_mov_dpp:
+ case Intrinsic::amdgcn_strict_wwm:
case Intrinsic::amdgcn_wwm:
+ case Intrinsic::amdgcn_strict_wqm:
case Intrinsic::amdgcn_wqm:
case Intrinsic::amdgcn_softwqm:
case Intrinsic::amdgcn_set_inactive:
@@ -4176,7 +4190,14 @@
case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
- case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
+ case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
+ case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
+ case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
+ case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
+ case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
+ case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
+ case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
+ case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
@@ -4250,6 +4271,11 @@
}
case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_global_atomic_csub:
+ case Intrinsic::amdgcn_global_atomic_fmin:
+ case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
@@ -4306,6 +4332,11 @@
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
break;
}
+ case Intrinsic::amdgcn_live_mask: {
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ break;
+ }
+ case Intrinsic::amdgcn_wqm_demote:
case Intrinsic::amdgcn_kill: {
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;
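
The non-constant-width branch of applyMappingBFE above expands a 64-bit VGPR bitfield extract as Src >> Offset << (64 - Width) >> (64 - Width), using arithmetic shifts for the signed form. A scalar reference of that expansion (assumes 1 <= Width <= 64 and Offset + Width <= 64, and relies on two's-complement arithmetic right shifts):

#include <cassert>
#include <cstdint>

static uint64_t ubfx64(uint64_t Src, unsigned Offset, unsigned Width) {
  uint64_t Shifted = Src >> Offset; // extracted field now starts at bit 0
  unsigned Ext = 64 - Width;
  return (Shifted << Ext) >> Ext;   // clear everything above the field
}

static int64_t sbfx64(uint64_t Src, unsigned Offset, unsigned Width) {
  unsigned Ext = 64 - Width;
  // Left shift done in unsigned arithmetic; the final arithmetic right shift
  // sign-extends from the field's top bit.
  return static_cast<int64_t>((Src >> Offset) << Ext) >> Ext;
}

int main() {
  assert(ubfx64(0xF0F0F0F0F0F0F0F0ull, 4, 8) == 0x0F);
  assert(sbfx64(0xFFull << 32, 32, 8) == -1); // all-ones field is negative
  assert(sbfx64(0x7Full << 8, 8, 8) == 127);  // top bit clear stays positive
  return 0;
}
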
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 1c14417..7e051e4 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -81,10 +81,7 @@
MachineRegisterInfo &MRI, int RSrcIdx) const;
bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const;
- bool applyMappingBFEIntrinsic(const OperandsMapper &OpdMapper,
- bool Signed) const;
-
- void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const;
+ bool applyMappingBFE(const OperandsMapper &OpdMapper, bool Signed) const;
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg) const;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 6c70b53..50999a4 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -7,16 +7,16 @@
//===----------------------------------------------------------------------===//
def SGPRRegBank : RegisterBank<"SGPR",
- [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024]
+ [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_512, SReg_1024]
>;
def VGPRRegBank : RegisterBank<"VGPR",
- [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_256, VReg_512, VReg_1024]
+ [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_512, VReg_1024]
>;
// It is helpful to distinguish conditions from ordinary SGPRs.
def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
def AGPRRegBank : RegisterBank <"AGPR",
- [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_256, AReg_512, AReg_1024]
+ [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_512, AReg_1024]
>;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
new file mode 100644
index 0000000..dabb4d00
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
@@ -0,0 +1,460 @@
+//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces all the uses of LDS within non-kernel functions by their
+// corresponding pointer counterparts.
+//
+// The main motivation behind this pass is to *prevent* the subsequent LDS
+// lowering pass from directly packing LDS (assume large LDS) into a struct
+// type, which would otherwise allocate huge amounts of memory for the struct
+// instance within every kernel.
+//
+// A brief sketch of the algorithm implemented in this pass is as follows:
+//
+// 1. Collect all the LDS defined in the module which qualify for pointer
+// replacement, say it is, LDSGlobals set.
+//
+// 2. Collect all the reachable callees for each kernel defined in the module,
+// say it is, KernelToCallees map.
+//
+// 3. FOR (each global GV from LDSGlobals set) DO
+// LDSUsedNonKernels = Collect all non-kernel functions which use GV.
+// FOR (each kernel K in KernelToCallees map) DO
+// ReachableCallees = KernelToCallees[K]
+// ReachableAndLDSUsedCallees =
+// SetIntersect(LDSUsedNonKernels, ReachableCallees)
+// IF (ReachableAndLDSUsedCallees is not empty) THEN
+// Pointer = Create a pointer to point-to GV if not created.
+// Initialize Pointer to point-to GV within kernel K.
+// ENDIF
+// ENDFOR
+//    Replace all uses of GV within non-kernel functions by Pointer.
+// ENDFOR
+//
+// LLVM IR example:
+//
+// Input IR:
+//
+// @lds = internal addrspace(3) global [4 x i32] undef, align 16
+//
+// define internal void @f0() {
+// entry:
+// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds,
+// i32 0, i32 0
+// ret void
+// }
+//
+// define protected amdgpu_kernel void @k0() {
+// entry:
+// call void @f0()
+// ret void
+// }
+//
+// Output IR:
+//
+// @lds = internal addrspace(3) global [4 x i32] undef, align 16
+// @lds.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
+//
+// define internal void @f0() {
+// entry:
+// %0 = load i16, i16 addrspace(3)* @lds.ptr, align 2
+// %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
+// %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
+// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2,
+// i32 0, i32 0
+// ret void
+// }
+//
+// define protected amdgpu_kernel void @k0() {
+// entry:
+// store i16 ptrtoint ([4 x i32] addrspace(3)* @lds to i16),
+// i16 addrspace(3)* @lds.ptr, align 2
+// call void @f0()
+// ret void
+// }
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPULDSUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <vector>
+
+#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer"
+
+using namespace llvm;
+
+namespace {
+
+class ReplaceLDSUseImpl {
+ Module &M;
+ LLVMContext &Ctx;
+ const DataLayout &DL;
+ Constant *LDSMemBaseAddr;
+
+ DenseMap<GlobalVariable *, GlobalVariable *> LDSToPointer;
+ DenseMap<GlobalVariable *, SmallPtrSet<Function *, 8>> LDSToNonKernels;
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees;
+ DenseMap<Function *, SmallPtrSet<GlobalVariable *, 8>> KernelToLDSPointers;
+ DenseMap<Function *, BasicBlock *> KernelToInitBB;
+ DenseMap<Function *, DenseMap<GlobalVariable *, Value *>>
+ FunctionToLDSToReplaceInst;
+
+  // Collect LDS globals whose uses need to be replaced by a pointer.
+ std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() {
+ // Collect LDS which requires module lowering.
+ std::vector<GlobalVariable *> LDSGlobals = AMDGPU::findVariablesToLower(M);
+
+ // Remove LDS which don't qualify for replacement.
+ LDSGlobals.erase(std::remove_if(LDSGlobals.begin(), LDSGlobals.end(),
+ [&](GlobalVariable *GV) {
+ return shouldIgnorePointerReplacement(GV);
+ }),
+ LDSGlobals.end());
+
+ return LDSGlobals;
+ }
+
+  // Returns true if uses of the given LDS global within non-kernel functions
+  // should be kept as they are, without pointer replacement.
+ bool shouldIgnorePointerReplacement(GlobalVariable *GV) {
+    // An LDS global whose size does not exceed the pointer size is not worth
+    // replacing.
+ if (DL.getTypeAllocSize(GV->getValueType()) <= 2)
+ return true;
+
+    // An LDS global which is not used within non-kernel function scope, or
+    // which is only used from global scope, does not qualify for replacement.
+ LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV);
+ return LDSToNonKernels[GV].empty();
+
+ // FIXME: When GV is used within all (or within most of the kernels), then
+ // it does not make sense to create a pointer for it.
+ }
+
+ // Insert new global LDS pointer which points to LDS.
+ GlobalVariable *createLDSPointer(GlobalVariable *GV) {
+    // If an LDS pointer which points to this LDS global has already been
+    // created, return it.
+ auto PointerEntry = LDSToPointer.insert(std::make_pair(GV, nullptr));
+ if (!PointerEntry.second)
+ return PointerEntry.first->second;
+
+    // We need to create a new LDS pointer which points to this LDS global.
+    //
+    // Each CU owns at most 64K of LDS memory, so an LDS address ranges from 0
+    // to 2^16 - 1. Hence a 16-bit pointer is enough to hold the LDS address.
+ auto *I16Ty = Type::getInt16Ty(Ctx);
+ GlobalVariable *LDSPointer = new GlobalVariable(
+ M, I16Ty, false, GlobalValue::InternalLinkage, UndefValue::get(I16Ty),
+ GV->getName() + Twine(".ptr"), nullptr, GlobalVariable::NotThreadLocal,
+ AMDGPUAS::LOCAL_ADDRESS);
+
+ LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer));
+
+ // Mark that an associated LDS pointer is created for LDS.
+ LDSToPointer[GV] = LDSPointer;
+
+ return LDSPointer;
+ }
+
+ // Split entry basic block in such a way that only lane 0 of each wave does
+ // the LDS pointer initialization, and return newly created basic block.
+ BasicBlock *activateLaneZero(Function *K) {
+    // If the entry basic block of kernel K has already been split, then return
+    // the newly created basic block.
+ auto BasicBlockEntry = KernelToInitBB.insert(std::make_pair(K, nullptr));
+ if (!BasicBlockEntry.second)
+ return BasicBlockEntry.first->second;
+
+ // Split entry basic block of kernel K.
+ auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt()));
+ IRBuilder<> Builder(EI);
+
+ Value *Mbcnt =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+ {Builder.getInt32(-1), Builder.getInt32(0)});
+ Value *Cond = Builder.CreateICmpEQ(Mbcnt, Builder.getInt32(0));
+ Instruction *WB = cast<Instruction>(
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {}));
+
+ BasicBlock *NBB = SplitBlockAndInsertIfThen(Cond, WB, false)->getParent();
+
+    // Mark that the entry basic block of kernel K has been split.
+ KernelToInitBB[K] = NBB;
+
+ return NBB;
+ }
+
+ // Within given kernel, initialize given LDS pointer to point to given LDS.
+ void initializeLDSPointer(Function *K, GlobalVariable *GV,
+ GlobalVariable *LDSPointer) {
+ // If LDS pointer is already initialized within K, then nothing to do.
+ auto PointerEntry = KernelToLDSPointers.insert(
+ std::make_pair(K, SmallPtrSet<GlobalVariable *, 8>()));
+ if (!PointerEntry.second)
+ if (PointerEntry.first->second.contains(LDSPointer))
+ return;
+
+ // Insert instructions at EI which initialize LDS pointer to point-to LDS
+ // within kernel K.
+ //
+ // That is, convert pointer type of GV to i16, and then store this converted
+ // i16 value within LDSPointer which is of type i16*.
+ auto *EI = &(*(activateLaneZero(K)->getFirstInsertionPt()));
+ IRBuilder<> Builder(EI);
+ Builder.CreateStore(Builder.CreatePtrToInt(GV, Type::getInt16Ty(Ctx)),
+ LDSPointer);
+
+ // Mark that LDS pointer is initialized within kernel K.
+ KernelToLDSPointers[K].insert(LDSPointer);
+ }
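+
+  // For illustration only (value names are made up; this is not verbatim
+  // output of the pass), the kernel entry produced by activateLaneZero()
+  // together with the store emitted above looks roughly like:
+  //
+  //   entry:
+  //     %mbcnt = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  //     %lane0 = icmp eq i32 %mbcnt, 0
+  //     br i1 %lane0, label %init, label %cont
+  //   init:                              ; reached by lane 0 of the wave only
+  //     store i16 ptrtoint ([4 x i32] addrspace(3)* @lds to i16),
+  //           i16 addrspace(3)* @lds.ptr, align 2
+  //     br label %cont
+  //   cont:
+  //     call void @llvm.amdgcn.wave.barrier()
+  //     ; original kernel body follows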
+
+  // We have created an LDS pointer for the LDS global, and initialized it to
+  // point to the LDS global within all relevant kernels. Now replace all the
+  // uses of the LDS global within non-kernel functions by the LDS pointer.
+ void replaceLDSUseByPointer(GlobalVariable *GV, GlobalVariable *LDSPointer) {
+ SmallVector<User *, 8> LDSUsers(GV->users());
+ for (auto *U : LDSUsers) {
+      // When `U` is a constant expression, the same constant expression may
+      // exist within multiple instructions, and within multiple non-kernel
+      // functions. Collect all those non-kernel functions and all those
+      // instructions within which `U` exists.
+ auto FunctionToInsts =
+ AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/);
+
+ for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end();
+ FI != FE; ++FI) {
+ Function *F = FI->first;
+ auto &Insts = FI->second;
+ for (auto *I : Insts) {
+ // If `U` is a constant expression, then we need to break the
+ // associated instruction into a set of separate instructions by
+ // converting constant expressions into instructions.
+ SmallPtrSet<Instruction *, 8> UserInsts;
+
+ if (U == I) {
+ // `U` is an instruction, conversion from constant expression to
+ // set of instructions is *not* required.
+ UserInsts.insert(I);
+ } else {
+ // `U` is a constant expression, convert it into corresponding set
+ // of instructions.
+ auto *CE = cast<ConstantExpr>(U);
+ convertConstantExprsToInstructions(I, CE, &UserInsts);
+ }
+
+          // Go through all the user instructions; if the LDS global appears in
+          // them as an operand, replace it with the replacement instruction.
+ for (auto *II : UserInsts) {
+ auto *ReplaceInst = getReplacementInst(F, GV, LDSPointer);
+ II->replaceUsesOfWith(GV, ReplaceInst);
+ }
+ }
+ }
+ }
+ }
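+
+  // For illustration only (value names are made up), a constant-expression use
+  // handled by the loop above, such as
+  //
+  //   store i32 7, i32 addrspace(3)* getelementptr inbounds
+  //       ([4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 2)
+  //
+  // inside a non-kernel function, is first broken into separate instructions
+  // and then has @lds swapped for the pointer-based replacement value from
+  // getReplacementInst(), giving roughly:
+  //
+  //   %off = load i16, i16 addrspace(3)* @lds.ptr, align 2
+  //   %base = getelementptr i8, i8 addrspace(3)* null, i16 %off
+  //   %lds.cast = bitcast i8 addrspace(3)* %base to [4 x i32] addrspace(3)*
+  //   %gep = getelementptr inbounds [4 x i32],
+  //       [4 x i32] addrspace(3)* %lds.cast, i32 0, i32 2
+  //   store i32 7, i32 addrspace(3)* %gep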
+
+ // Create a set of replacement instructions which together replace LDS within
+ // non-kernel function F by accessing LDS indirectly using LDS pointer.
+ Value *getReplacementInst(Function *F, GlobalVariable *GV,
+ GlobalVariable *LDSPointer) {
+ // If the instruction which replaces LDS within F is already created, then
+ // return it.
+ auto LDSEntry = FunctionToLDSToReplaceInst.insert(
+ std::make_pair(F, DenseMap<GlobalVariable *, Value *>()));
+ if (!LDSEntry.second) {
+ auto ReplaceInstEntry =
+ LDSEntry.first->second.insert(std::make_pair(GV, nullptr));
+ if (!ReplaceInstEntry.second)
+ return ReplaceInstEntry.first->second;
+ }
+
+ // Get the instruction insertion point within the beginning of the entry
+ // block of current non-kernel function.
+ auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt()));
+ IRBuilder<> Builder(EI);
+
+ // Insert required set of instructions which replace LDS within F.
+ auto *V = Builder.CreateBitCast(
+ Builder.CreateGEP(
+ Builder.getInt8Ty(), LDSMemBaseAddr,
+ Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer)),
+ GV->getType());
+
+    // Mark that the replacement instruction which replaces the LDS global
+    // within F has been created.
+ FunctionToLDSToReplaceInst[F][GV] = V;
+
+ return V;
+ }
+
+public:
+ ReplaceLDSUseImpl(Module &M)
+ : M(M), Ctx(M.getContext()), DL(M.getDataLayout()) {
+ LDSMemBaseAddr = Constant::getIntegerValue(
+ PointerType::get(Type::getInt8Ty(M.getContext()),
+ AMDGPUAS::LOCAL_ADDRESS),
+ APInt(32, 0));
+ }
+
+  // Entry-point function which interfaces ReplaceLDSUseImpl with the outside
+  // of the class.
+ bool replaceLDSUse();
+
+private:
+  // For a given LDS global from the collected LDSGlobals set, replace its
+  // non-kernel function scope uses by the pointer.
+ bool replaceLDSUse(GlobalVariable *GV);
+};
+
+// For a given LDS global from the collected LDSGlobals set, replace its
+// non-kernel function scope uses by the pointer.
+bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) {
+ // Holds all those non-kernel functions within which LDS is being accessed.
+ SmallPtrSet<Function *, 8> &LDSAccessors = LDSToNonKernels[GV];
+
+ // The LDS pointer which points to LDS and replaces all the uses of LDS.
+ GlobalVariable *LDSPointer = nullptr;
+
+  // Traverse each kernel K and, if required, initialize the LDS pointer to
+  // point to the LDS global within K.
+ for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE;
+ ++KI) {
+ Function *K = KI->first;
+ SmallPtrSet<Function *, 8> Callees = KI->second;
+
+ // Compute reachable and LDS used callees for kernel K.
+ set_intersect(Callees, LDSAccessors);
+
+ // None of the LDS accessing non-kernel functions are reachable from
+ // kernel K. Hence, no need to initialize LDS pointer within kernel K.
+ if (Callees.empty())
+ continue;
+
+    // We have found reachable and LDS-used callees for kernel K, so we need to
+    // initialize the LDS pointer within kernel K and to replace the LDS uses
+    // within those callees by the LDS pointer.
+ //
+ // But, first check if LDS pointer is already created, if not create one.
+ LDSPointer = createLDSPointer(GV);
+
+ // Initialize LDS pointer to point to LDS within kernel K.
+ initializeLDSPointer(K, GV, LDSPointer);
+ }
+
+  // We have not found reachable and LDS-used callees for any of the kernels,
+  // and hence we have not created an LDS pointer.
+ if (!LDSPointer)
+ return false;
+
+  // We have created an LDS pointer for the LDS global, and initialized it to
+  // point to the LDS global within all relevant kernels. Now replace all the
+  // uses of the LDS global within non-kernel functions by the LDS pointer.
+ replaceLDSUseByPointer(GV, LDSPointer);
+
+ return true;
+}
+
+// Entry-point function which interfaces ReplaceLDSUseImpl with the outside of
+// the class.
+bool ReplaceLDSUseImpl::replaceLDSUse() {
+  // Collect LDS globals whose uses need to be replaced by a pointer.
+ std::vector<GlobalVariable *> LDSGlobals =
+ collectLDSRequiringPointerReplace();
+
+ // No LDS to pointer-replace. Nothing to do.
+ if (LDSGlobals.empty())
+ return false;
+
+ // Collect reachable callee set for each kernel defined in the module.
+ AMDGPU::collectReachableCallees(M, KernelToCallees);
+
+ if (KernelToCallees.empty()) {
+    // Either the module does not have any kernel definitions, or none of the
+    // kernels has a call to a non-kernel function, or we could not resolve any
+    // of the call sites to proper non-kernel functions because of situations
+    // like inline asm calls. Nothing to replace.
+ return false;
+ }
+
+  // For every LDS global from the collected LDSGlobals set, replace its
+  // non-kernel function scope uses by the pointer.
+ bool Changed = false;
+ for (auto *GV : LDSGlobals)
+ Changed |= replaceLDSUse(GV);
+
+ return Changed;
+}
+
+class AMDGPUReplaceLDSUseWithPointer : public ModulePass {
+public:
+ static char ID;
+
+ AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) {
+ initializeAMDGPUReplaceLDSUseWithPointerPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ }
+};
+
+} // namespace
+
+char AMDGPUReplaceLDSUseWithPointer::ID = 0;
+char &llvm::AMDGPUReplaceLDSUseWithPointerID =
+ AMDGPUReplaceLDSUseWithPointer::ID;
+
+INITIALIZE_PASS_BEGIN(
+ AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE,
+ "Replace within non-kernel function use of LDS with pointer",
+ false /*only look at the cfg*/, false /*analysis pass*/)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(
+ AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE,
+ "Replace within non-kernel function use of LDS with pointer",
+ false /*only look at the cfg*/, false /*analysis pass*/)
+
+bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) {
+ ReplaceLDSUseImpl LDSUseReplacer{M};
+ return LDSUseReplacer.replaceLDSUse();
+}
+
+ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() {
+ return new AMDGPUReplaceLDSUseWithPointer();
+}
+
+PreservedAnalyses
+AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) {
+ ReplaceLDSUseImpl LDSUseReplacer{M};
+ LDSUseReplacer.replaceLDSUse();
+ return PreservedAnalyses::all();
+}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
new file mode 100644
index 0000000..ef46e53
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -0,0 +1,514 @@
+//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes how many registers and other resources are used by
+/// functions.
+///
+/// The results of this analysis are used to fill the register usage, flat
+/// usage, etc. into hardware registers.
+///
+/// The analysis takes callees into account. E.g. if a function A that needs 10
+/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
+/// will return 20.
+/// It is assumed that an indirect call can go into any function except
+/// hardware-entrypoints. Therefore the register usage of functions with
+/// indirect calls is estimated as the maximum of all non-entrypoint functions
+/// in the module.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUResourceUsageAnalysis.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+#define DEBUG_TYPE "amdgpu-resource-usage"
+
+char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
+char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
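+
+// Illustrative sketch (not part of this patch): how a machine pass that has
+// added AMDGPUResourceUsageAnalysis to its analysis usage might consume the
+// results. The helper name is made up and the surrounding pass boilerplate is
+// assumed and omitted.
+static void sketchQueryTotalRegisters(const AMDGPUResourceUsageAnalysis &RUA,
+                                      const MachineFunction &MF,
+                                      int32_t &SGPRs, int32_t &VGPRs) {
+  const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
+      RUA.getResourceInfo(&MF.getFunction());
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  // getTotalNumSGPRs folds in implicitly reserved registers (VCC, flat
+  // scratch, XNACK); getTotalNumVGPRs accounts for the combined VGPR/AGPR
+  // allocation on gfx90a.
+  SGPRs = Info.getTotalNumSGPRs(ST);
+  VGPRs = Info.getTotalNumVGPRs(ST);
+}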
+
+// We need to tell the runtime some amount ahead of time if we don't know the
+// true stack size. Assume a smaller number if this is only due to dynamic /
+// non-entry block allocas.
+static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
+ "amdgpu-assume-external-call-stack-size",
+ cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
+ cl::init(16384));
+
+static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
+ "amdgpu-assume-dynamic-stack-object-size",
+ cl::desc("Assumed extra stack use if there are any "
+ "variable sized objects (in bytes)"),
+ cl::Hidden, cl::init(4096));
+
+INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
+ "Function register usage analysis", true, true)
+
+static const Function *getCalleeFunction(const MachineOperand &Op) {
+ if (Op.isImm()) {
+ assert(Op.getImm() == 0);
+ return nullptr;
+ }
+
+ return cast<Function>(Op.getGlobal());
+}
+
+static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
+ const SIInstrInfo &TII, unsigned Reg) {
+ for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
+ if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
+ return true;
+ }
+
+ return false;
+}
+
+int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
+ const GCNSubtarget &ST) const {
+ return NumExplicitSGPR +
+ IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
+ ST.getTargetID().isXnackOnOrAny());
+}
+
+int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
+ const GCNSubtarget &ST) const {
+ if (ST.hasGFX90AInsts() && NumAGPR)
+ return alignTo(NumVGPR, 4) + NumAGPR;
+ return std::max(NumVGPR, NumAGPR);
+}
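+
+// For example, with the gfx90a rule above, a function using 10 VGPRs and
+// 6 AGPRs reports alignTo(10, 4) + 6 = 18 total VGPRs, whereas otherwise it
+// would report max(10, 6) = 10.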
+
+bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ bool HasIndirectCall = false;
+
+ for (CallGraphNode *I : SCC) {
+ Function *F = I->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ MachineModuleInfo &MMI =
+ getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+
+ auto CI = CallGraphResourceInfo.insert(
+ std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
+ SIFunctionResourceInfo &Info = CI.first->second;
+ assert(CI.second && "should only be called once per function");
+ Info = analyzeResourceUsage(MF, TM);
+ HasIndirectCall |= Info.HasIndirectCall;
+ }
+
+ if (HasIndirectCall)
+ propagateIndirectCallRegisterUsage();
+
+ return false;
+}
+
+AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
+AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
+ const MachineFunction &MF, const TargetMachine &TM) const {
+ SIFunctionResourceInfo Info;
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
+ MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
+ MRI.isLiveIn(MFI->getPreloadedReg(
+ AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
+
+ // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+ // instructions aren't used to access the scratch buffer. Inline assembly may
+ // need it though.
+ //
+ // If we only have implicit uses of flat_scr on flat instructions, it is not
+ // really needed.
+ if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
+ (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
+ !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
+ !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
+ Info.UsesFlatScratch = false;
+ }
+
+ Info.PrivateSegmentSize = FrameInfo.getStackSize();
+
+ // Assume a big number if there are any unknown sized objects.
+ Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+ if (Info.HasDynamicallySizedStack)
+ Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
+
+ if (MFI->isStackRealigned())
+ Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
+
+ Info.UsesVCC =
+ MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+
+ // If there are no calls, MachineRegisterInfo can tell us the used register
+ // count easily.
+ // A tail call isn't considered a call for MachineFrameInfo's purposes.
+ if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
+ MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestVGPRReg = Reg;
+ break;
+ }
+ }
+
+ if (ST.hasMAIInsts()) {
+ MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestAGPRReg = Reg;
+ break;
+ }
+ }
+ Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
+ ? 0
+ : TRI.getHWRegIndex(HighestAGPRReg) + 1;
+ }
+
+ MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestSGPRReg = Reg;
+ break;
+ }
+ }
+
+ // We found the maximum register index. They start at 0, so add one to get
+ // the number of registers.
+ Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
+ ? 0
+ : TRI.getHWRegIndex(HighestVGPRReg) + 1;
+ Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
+ ? 0
+ : TRI.getHWRegIndex(HighestSGPRReg) + 1;
+
+ return Info;
+ }
+
+ int32_t MaxVGPR = -1;
+ int32_t MaxAGPR = -1;
+ int32_t MaxSGPR = -1;
+ uint64_t CalleeFrameSize = 0;
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ // TODO: Check regmasks? Do they occur anywhere except calls?
+ for (const MachineOperand &MO : MI.operands()) {
+ unsigned Width = 0;
+ bool IsSGPR = false;
+ bool IsAGPR = false;
+
+ if (!MO.isReg())
+ continue;
+
+ Register Reg = MO.getReg();
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::SCC:
+ case AMDGPU::M0:
+ case AMDGPU::M0_LO16:
+ case AMDGPU::M0_HI16:
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
+ case AMDGPU::SGPR_NULL:
+ case AMDGPU::MODE:
+ continue;
+
+ case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+ llvm_unreachable("src_pops_exiting_wave_id should not be used");
+
+ case AMDGPU::NoRegister:
+ assert(MI.isDebugInstr() &&
+ "Instruction uses invalid noreg register");
+ continue;
+
+ case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
+ case AMDGPU::VCC_LO_LO16:
+ case AMDGPU::VCC_LO_HI16:
+ case AMDGPU::VCC_HI_LO16:
+ case AMDGPU::VCC_HI_HI16:
+ Info.UsesVCC = true;
+ continue;
+
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ continue;
+
+ case AMDGPU::XNACK_MASK:
+ case AMDGPU::XNACK_MASK_LO:
+ case AMDGPU::XNACK_MASK_HI:
+ llvm_unreachable("xnack_mask registers should not be used");
+
+ case AMDGPU::LDS_DIRECT:
+ llvm_unreachable("lds_direct register should not be used");
+
+ case AMDGPU::TBA:
+ case AMDGPU::TBA_LO:
+ case AMDGPU::TBA_HI:
+ case AMDGPU::TMA:
+ case AMDGPU::TMA_LO:
+ case AMDGPU::TMA_HI:
+ llvm_unreachable("trap handler registers should not be used");
+
+ case AMDGPU::SRC_VCCZ:
+ llvm_unreachable("src_vccz register should not be used");
+
+ case AMDGPU::SRC_EXECZ:
+ llvm_unreachable("src_execz register should not be used");
+
+ case AMDGPU::SRC_SCC:
+ llvm_unreachable("src_scc register should not be used");
+
+ default:
+ break;
+ }
+
+ if (AMDGPU::SReg_32RegClass.contains(Reg) ||
+ AMDGPU::SReg_LO16RegClass.contains(Reg) ||
+ AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 1;
+ } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
+ AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
+ AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 1;
+ } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 1;
+ } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 2;
+ } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 2;
+ } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 2;
+ } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 3;
+ } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 3;
+ } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 3;
+ } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 4;
+ } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 4;
+ } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 4;
+ } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 5;
+ } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 5;
+ } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 5;
+ } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 6;
+ } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 6;
+ } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 6;
+ } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 7;
+ } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 7;
+ } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 7;
+ } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 8;
+ } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 8;
+ } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 8;
+ } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 16;
+ } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 16;
+ } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 16;
+ } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 32;
+ } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 32;
+ } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 32;
+ } else {
+ llvm_unreachable("Unknown register class");
+ }
+ unsigned HWReg = TRI.getHWRegIndex(Reg);
+ int MaxUsed = HWReg + Width - 1;
+ if (IsSGPR) {
+ MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+ } else if (IsAGPR) {
+ MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
+ } else {
+ MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
+ }
+ }
+
+ if (MI.isCall()) {
+ // Pseudo used just to encode the underlying global. Is there a better
+ // way to track this?
+
+ const MachineOperand *CalleeOp =
+ TII->getNamedOperand(MI, AMDGPU::OpName::callee);
+
+ const Function *Callee = getCalleeFunction(*CalleeOp);
+ DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
+ CallGraphResourceInfo.end();
+
+ // Avoid crashing on undefined behavior with an illegal call to a
+ // kernel. If a callsite's calling convention doesn't match the
+ // function's, it's undefined behavior. If the callsite calling
+ // convention does match, that would have errored earlier.
+ if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
+ report_fatal_error("invalid call to entry function");
+
+ bool IsIndirect = !Callee || Callee->isDeclaration();
+ if (!IsIndirect)
+ I = CallGraphResourceInfo.find(Callee);
+
+ if (IsIndirect || I == CallGraphResourceInfo.end()) {
+ CalleeFrameSize =
+ std::max(CalleeFrameSize,
+ static_cast<uint64_t>(AssumedStackSizeForExternalCall));
+
+ // Register usage of indirect calls gets handled later
+ Info.UsesVCC = true;
+ Info.UsesFlatScratch = ST.hasFlatAddressSpace();
+ Info.HasDynamicallySizedStack = true;
+ Info.HasIndirectCall = true;
+ } else {
+ // We force CodeGen to run in SCC order, so the callee's register
+ // usage etc. should be the cumulative usage of all callees.
+ MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
+ MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
+ MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
+ CalleeFrameSize =
+ std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
+ Info.UsesVCC |= I->second.UsesVCC;
+ Info.UsesFlatScratch |= I->second.UsesFlatScratch;
+ Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
+ Info.HasRecursion |= I->second.HasRecursion;
+ Info.HasIndirectCall |= I->second.HasIndirectCall;
+ }
+
+ // FIXME: Call site could have norecurse on it
+ if (!Callee || !Callee->doesNotRecurse())
+ Info.HasRecursion = true;
+ }
+ }
+ }
+
+ Info.NumExplicitSGPR = MaxSGPR + 1;
+ Info.NumVGPR = MaxVGPR + 1;
+ Info.NumAGPR = MaxAGPR + 1;
+ Info.PrivateSegmentSize += CalleeFrameSize;
+
+ return Info;
+}
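+
+// For example, with the default options a function whose own frame is 64 bytes
+// and whose only call is to an external (unknown) callee ends up with
+// PrivateSegmentSize = 64 + 16384, and with UsesVCC, HasDynamicallySizedStack
+// and HasIndirectCall set, since nothing better is known about the callee.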
+
+void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
+ // Collect the maximum number of registers from non-hardware-entrypoints.
+ // All these functions are potential targets for indirect calls.
+ int32_t NonKernelMaxSGPRs = 0;
+ int32_t NonKernelMaxVGPRs = 0;
+ int32_t NonKernelMaxAGPRs = 0;
+
+ for (const auto &I : CallGraphResourceInfo) {
+ if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
+ auto &Info = I.getSecond();
+ NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
+ NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
+ NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
+ }
+ }
+
+ // Add register usage for functions with indirect calls.
+ // For calls to unknown functions, we assume the maximum register usage of
+ // all non-hardware-entrypoints in the current module.
+ for (auto &I : CallGraphResourceInfo) {
+ auto &Info = I.getSecond();
+ if (Info.HasIndirectCall) {
+ Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
+ Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
+ Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
+ }
+ }
+}
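+
+// For example, if the non-entry functions in the module use at most 40 SGPRs
+// and 96 VGPRs, then every function flagged with HasIndirectCall above is
+// bumped to at least those counts, since an indirect call could reach any of
+// them.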
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
new file mode 100644
index 0000000..832e811
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -0,0 +1,79 @@
+//===- AMDGPUResourceUsageAnalysis.h ---- analysis of resources -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes how many registers and other resources are used by
+/// functions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
+
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/ValueMap.h"
+
+namespace llvm {
+
+class GCNSubtarget;
+class MachineFunction;
+class TargetMachine;
+
+struct AMDGPUResourceUsageAnalysis : public CallGraphSCCPass {
+ static char ID;
+
+public:
+ // Track resource usage for callee functions.
+ struct SIFunctionResourceInfo {
+ // Track the number of explicitly used VGPRs. Special registers reserved at
+ // the end are tracked separately.
+ int32_t NumVGPR = 0;
+ int32_t NumAGPR = 0;
+ int32_t NumExplicitSGPR = 0;
+ uint64_t PrivateSegmentSize = 0;
+ bool UsesVCC = false;
+ bool UsesFlatScratch = false;
+ bool HasDynamicallySizedStack = false;
+ bool HasRecursion = false;
+ bool HasIndirectCall = false;
+
+ int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
+ int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
+ };
+
+ AMDGPUResourceUsageAnalysis() : CallGraphSCCPass(ID) {}
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+
+ bool doInitialization(CallGraph &CG) override {
+ CallGraphResourceInfo.clear();
+ return CallGraphSCCPass::doInitialization(CG);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.setPreservesAll();
+ }
+
+ const SIFunctionResourceInfo &getResourceInfo(const Function *F) const {
+ auto Info = CallGraphResourceInfo.find(F);
+ assert(Info != CallGraphResourceInfo.end() &&
+ "Failed to find resource info for function");
+ return Info->getSecond();
+ }
+
+private:
+ SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF,
+ const TargetMachine &TM) const;
+ void propagateIndirectCallRegisterUsage();
+
+ DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+};
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index fd65727..afe0167 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -199,6 +199,12 @@
def : SourceOfDivergence<int_amdgcn_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_ds_fadd>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
@@ -226,6 +232,8 @@
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
@@ -240,9 +248,12 @@
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
+def : SourceOfDivergence<int_amdgcn_live_mask>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;
@@ -274,6 +285,13 @@
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>;
+def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>;
// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index f1a7d74..0c5020d 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -98,12 +98,12 @@
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
// Disable mutually exclusive bits.
- if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
- if (FS.find_lower("wavefrontsize16") == StringRef::npos)
+ if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
+ if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
FullFS += "-wavefrontsize16,";
- if (FS.find_lower("wavefrontsize32") == StringRef::npos)
+ if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
FullFS += "-wavefrontsize32,";
- if (FS.find_lower("wavefrontsize64") == StringRef::npos)
+ if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
FullFS += "-wavefrontsize64,";
}
@@ -163,6 +163,7 @@
WavefrontSizeLog2 = 5;
HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
TargetID.setTargetIDFromFeaturesString(FS);
@@ -176,6 +177,7 @@
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
TargetTriple(TT),
+ GCN3Encoding(false),
Has16BitInsts(false),
HasMadMixInsts(false),
HasMadMacF32Insts(false),
@@ -184,6 +186,7 @@
HasVOP3PInsts(false),
HasMulI24(true),
HasMulU24(true),
+ HasSMulHi(false),
HasInv2PiInlineImm(false),
HasFminFmaxLegacy(true),
EnablePromoteAlloca(false),
@@ -194,7 +197,8 @@
{ }
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const GCNTargetMachine &TM) :
+ const GCNTargetMachine &TM)
+ : // clang-format off
AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
AMDGPUSubtarget(TT),
TargetTriple(TT),
@@ -207,6 +211,7 @@
FastFMAF32(false),
FastDenormalF32(false),
HalfRate64Ops(false),
+ FullRate64Ops(false),
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
@@ -216,6 +221,7 @@
HasApertureRegs(false),
SupportsXNACK(false),
EnableXNACK(false),
+ EnableTgSplit(false),
EnableCuMode(false),
TrapHandler(false),
@@ -227,14 +233,16 @@
DumpCode(false),
FP64(false),
- GCN3Encoding(false),
CIInsts(false),
GFX8Insts(false),
GFX9Insts(false),
+ GFX90AInsts(false),
GFX10Insts(false),
GFX10_3Insts(false),
GFX7GFX8GFX9Insts(false),
SGPRInitBug(false),
+ NegativeScratchOffsetBug(false),
+ NegativeUnalignedScratchOffsetBug(false),
HasSMemRealTime(false),
HasIntClamp(false),
HasFmaMixInsts(false),
@@ -249,10 +257,15 @@
HasSDWAOutModsVOPC(false),
HasDPP(false),
HasDPP8(false),
+ Has64BitDPP(false),
+ HasPackedFP32Ops(false),
+ HasExtendedImageInsts(false),
HasR128A16(false),
HasGFX10A16(false),
HasG16(false),
HasNSAEncoding(false),
+ NSAMaxSize(0),
+ GFX10_AEncoding(false),
GFX10_BEncoding(false),
HasDLInsts(false),
HasDot1Insts(false),
@@ -261,6 +274,7 @@
HasDot4Insts(false),
HasDot5Insts(false),
HasDot6Insts(false),
+ HasDot7Insts(false),
HasMAIInsts(false),
HasPkFmacF16Inst(false),
HasAtomicFaddInsts(false),
@@ -270,6 +284,7 @@
HasVscnt(false),
HasGetWaveIdInst(false),
HasSMemTimeInst(false),
+ HasShaderCyclesRegister(false),
HasRegisterBanking(false),
HasVOP3Literal(false),
HasNoDataDepHazard(false),
@@ -278,12 +293,14 @@
FlatGlobalInsts(false),
FlatScratchInsts(false),
ScalarFlatScratchInsts(false),
+ HasArchitectedFlatScratch(false),
AddNoCarryInsts(false),
HasUnpackedD16VMem(false),
LDSMisalignedBug(false),
HasMFMAInlineLiteralBug(false),
UnalignedBufferAccess(false),
UnalignedDSAccess(false),
+ HasPackedTID(false),
ScalarizeGlobal(false),
@@ -294,6 +311,7 @@
HasVcmpxExecWARHazard(false),
HasLdsBranchVmemWARHazard(false),
HasNSAtoVMEMBug(false),
+ HasNSAClauseBug(false),
HasOffset3fBug(false),
HasFlatSegmentOffsetBug(false),
HasImageStoreD16Bug(false),
@@ -303,6 +321,7 @@
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
+ // clang-format on
MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
@@ -313,7 +332,8 @@
}
bool GCNSubtarget::enableFlatScratch() const {
- return EnableFlatScratch && hasFlatScratchInsts();
+ return flatScratchIsArchitected() ||
+ (EnableFlatScratch && hasFlatScratchInsts());
}
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
@@ -336,6 +356,105 @@
return 2;
}
+/// This list was mostly derived from experimentation.
+bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
+ switch (Opcode) {
+ case AMDGPU::V_CVT_F16_F32_e32:
+ case AMDGPU::V_CVT_F16_F32_e64:
+ case AMDGPU::V_CVT_F16_U16_e32:
+ case AMDGPU::V_CVT_F16_U16_e64:
+ case AMDGPU::V_CVT_F16_I16_e32:
+ case AMDGPU::V_CVT_F16_I16_e64:
+ case AMDGPU::V_RCP_F16_e64:
+ case AMDGPU::V_RCP_F16_e32:
+ case AMDGPU::V_RSQ_F16_e64:
+ case AMDGPU::V_RSQ_F16_e32:
+ case AMDGPU::V_SQRT_F16_e64:
+ case AMDGPU::V_SQRT_F16_e32:
+ case AMDGPU::V_LOG_F16_e64:
+ case AMDGPU::V_LOG_F16_e32:
+ case AMDGPU::V_EXP_F16_e64:
+ case AMDGPU::V_EXP_F16_e32:
+ case AMDGPU::V_SIN_F16_e64:
+ case AMDGPU::V_SIN_F16_e32:
+ case AMDGPU::V_COS_F16_e64:
+ case AMDGPU::V_COS_F16_e32:
+ case AMDGPU::V_FLOOR_F16_e64:
+ case AMDGPU::V_FLOOR_F16_e32:
+ case AMDGPU::V_CEIL_F16_e64:
+ case AMDGPU::V_CEIL_F16_e32:
+ case AMDGPU::V_TRUNC_F16_e64:
+ case AMDGPU::V_TRUNC_F16_e32:
+ case AMDGPU::V_RNDNE_F16_e64:
+ case AMDGPU::V_RNDNE_F16_e32:
+ case AMDGPU::V_FRACT_F16_e64:
+ case AMDGPU::V_FRACT_F16_e32:
+ case AMDGPU::V_FREXP_MANT_F16_e64:
+ case AMDGPU::V_FREXP_MANT_F16_e32:
+ case AMDGPU::V_FREXP_EXP_I16_F16_e64:
+ case AMDGPU::V_FREXP_EXP_I16_F16_e32:
+ case AMDGPU::V_LDEXP_F16_e64:
+ case AMDGPU::V_LDEXP_F16_e32:
+ case AMDGPU::V_LSHLREV_B16_e64:
+ case AMDGPU::V_LSHLREV_B16_e32:
+ case AMDGPU::V_LSHRREV_B16_e64:
+ case AMDGPU::V_LSHRREV_B16_e32:
+ case AMDGPU::V_ASHRREV_I16_e64:
+ case AMDGPU::V_ASHRREV_I16_e32:
+ case AMDGPU::V_ADD_U16_e64:
+ case AMDGPU::V_ADD_U16_e32:
+ case AMDGPU::V_SUB_U16_e64:
+ case AMDGPU::V_SUB_U16_e32:
+ case AMDGPU::V_SUBREV_U16_e64:
+ case AMDGPU::V_SUBREV_U16_e32:
+ case AMDGPU::V_MUL_LO_U16_e64:
+ case AMDGPU::V_MUL_LO_U16_e32:
+ case AMDGPU::V_ADD_F16_e64:
+ case AMDGPU::V_ADD_F16_e32:
+ case AMDGPU::V_SUB_F16_e64:
+ case AMDGPU::V_SUB_F16_e32:
+ case AMDGPU::V_SUBREV_F16_e64:
+ case AMDGPU::V_SUBREV_F16_e32:
+ case AMDGPU::V_MUL_F16_e64:
+ case AMDGPU::V_MUL_F16_e32:
+ case AMDGPU::V_MAX_F16_e64:
+ case AMDGPU::V_MAX_F16_e32:
+ case AMDGPU::V_MIN_F16_e64:
+ case AMDGPU::V_MIN_F16_e32:
+ case AMDGPU::V_MAX_U16_e64:
+ case AMDGPU::V_MAX_U16_e32:
+ case AMDGPU::V_MIN_U16_e64:
+ case AMDGPU::V_MIN_U16_e32:
+ case AMDGPU::V_MAX_I16_e64:
+ case AMDGPU::V_MAX_I16_e32:
+ case AMDGPU::V_MIN_I16_e64:
+ case AMDGPU::V_MIN_I16_e32:
+ // On gfx10, all 16-bit instructions preserve the high bits.
+ return getGeneration() <= AMDGPUSubtarget::GFX9;
+ case AMDGPU::V_MAD_F16_e64:
+ case AMDGPU::V_MADAK_F16:
+ case AMDGPU::V_MADMK_F16:
+ case AMDGPU::V_MAC_F16_e64:
+ case AMDGPU::V_MAC_F16_e32:
+ case AMDGPU::V_FMAMK_F16:
+ case AMDGPU::V_FMAAK_F16:
+ case AMDGPU::V_MAD_U16_e64:
+ case AMDGPU::V_MAD_I16_e64:
+ case AMDGPU::V_FMA_F16_e64:
+ case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_e32:
+ case AMDGPU::V_DIV_FIXUP_F16_e64:
+    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
+    // instructions maintain the legacy behavior of zeroing. Some instructions
+    // changed to preserving the high bits.
+ return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ case AMDGPU::V_MAD_MIXLO_F16:
+ case AMDGPU::V_MAD_MIXHI_F16:
+ default:
+ return false;
+ }
+}
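+
+// Illustrative use of the query above (not from this patch; `ST` and `MI` are
+// assumed to be a GCNSubtarget and a MachineInstr with a 16-bit result that
+// are in scope): a peephole that wants to drop a redundant clearing of the
+// upper half of the destination could be guarded as
+//   if (ST.zeroesHigh16BitsOfDest(MI.getOpcode()))
+//     ...; // bits [31:16] of MI's result are already known to be zero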
+
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
const Function &F) const {
if (NWaves == 1)
@@ -681,12 +800,12 @@
return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
-unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+unsigned
+GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
- if (MFI.hasFlatScratchInit()) {
+ if (HasFlatScratchInit) {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
@@ -698,6 +817,28 @@
return 2; // VCC.
}
+unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
+}
+
+unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
+  // The logic to detect whether the function has flat scratch init is slightly
+  // different from how the SIMachineFunctionInfo constructor derives it. We
+  // don't use the amdgpu-calls / amdgpu-stack-objects attributes or
+  // isAmdHsaOrMesa here, as it doesn't really matter.
+  // TODO: Outline this derivation logic into one common function in the
+  // backend to avoid duplication.
+ bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
+ bool FunctionHasFlatScratchInit = false;
+ if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
+ enableFlatScratch()) {
+ FunctionHasFlatScratchInit = true;
+ }
+ return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
+}
+
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs,
unsigned NumVGPRs) const {
@@ -711,13 +852,11 @@
return Occupancy;
}
-unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
- const Function &F = MF.getFunction();
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-
+unsigned GCNSubtarget::getBaseMaxNumSGPRs(
+ const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
+ unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
// Compute maximum number of SGPRs function can use using default/requested
// minimum number of waves per execution unit.
- std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
@@ -728,7 +867,7 @@
F, "amdgpu-num-sgpr", MaxNumSGPRs);
// Make sure requested value does not violate subtarget's specifications.
- if (Requested && (Requested <= getReservedNumSGPRs(MF)))
+ if (Requested && (Requested <= ReservedNumSGPRs))
Requested = 0;
// If more SGPRs are required to support the input user/system SGPRs,
@@ -738,7 +877,7 @@
// of reserved special registers in total. Theoretically you could re-use
// the last input registers for these special registers, but this would
// require a lot of complexity to deal with the weird aliasing.
- unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
+ unsigned InputNumSGPRs = PreloadedSGPRs;
if (Requested && Requested < InputNumSGPRs)
Requested = InputNumSGPRs;
@@ -757,17 +896,43 @@
if (hasSGPRInitBug())
MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
- return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
- MaxAddressableNumSGPRs);
+ return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}
-unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
+ getReservedNumSGPRs(MF));
+}
+static unsigned getMaxNumPreloadedSGPRs() {
+ // Max number of user SGPRs
+ unsigned MaxUserSGPRs = 4 + // private segment buffer
+ 2 + // Dispatch ptr
+ 2 + // queue ptr
+ 2 + // kernel segment ptr
+ 2 + // dispatch ID
+ 2 + // flat scratch init
+ 2; // Implicit buffer ptr
+ // Max number of system SGPRs
+ unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
+ 1 + // WorkGroupIDY
+ 1 + // WorkGroupIDZ
+ 1 + // WorkGroupInfo
+ 1; // private segment wave byte offset
+ return MaxUserSGPRs + MaxSystemSGPRs;
+}
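+
+// Worked out, the bound above is 16 user SGPRs (4 + 2 + 2 + 2 + 2 + 2 + 2)
+// plus 5 system SGPRs, i.e. at most 21 preloaded SGPRs.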
+
+unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
+ return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
+ getReservedNumSGPRs(F));
+}
+
+unsigned GCNSubtarget::getBaseMaxNumVGPRs(
+ const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
// Compute maximum number of VGPRs function can use using default/requested
// minimum number of waves per execution unit.
- std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
// Check if maximum number of VGPRs was explicitly requested using
@@ -776,6 +941,9 @@
unsigned Requested = AMDGPU::getIntegerAttribute(
F, "amdgpu-num-vgpr", MaxNumVGPRs);
+ if (hasGFX90AInsts())
+ Requested *= 2;
+
// Make sure requested value is compatible with values implied by
// default/requested minimum/maximum number of waves per execution unit.
if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
@@ -791,6 +959,16 @@
return MaxNumVGPRs;
}
+unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
+ return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
+}
+
+unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+ const Function &F = MF.getFunction();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
+}
+
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
int UseOpIdx, SDep &Dep) const {
if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index ba3a8ac..b160cdf 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -45,6 +45,7 @@
Triple TargetTriple;
protected:
+ bool GCN3Encoding;
bool Has16BitInsts;
bool HasMadMixInsts;
bool HasMadMacF32Insts;
@@ -53,6 +54,7 @@
bool HasVOP3PInsts;
bool HasMulI24;
bool HasMulU24;
+ bool HasSMulHi;
bool HasInv2PiInlineImm;
bool HasFminFmaxLegacy;
bool EnablePromoteAlloca;
@@ -124,6 +126,10 @@
return TargetTriple.getArch() == Triple::amdgcn;
}
+ bool isGCN3Encoding() const {
+ return GCN3Encoding;
+ }
+
bool has16BitInsts() const {
return Has16BitInsts;
}
@@ -156,6 +162,10 @@
return HasMulU24;
}
+ bool hasSMulHi() const {
+ return HasSMulHi;
+ }
+
bool hasInv2PiInlineImm() const {
return HasInv2PiInlineImm;
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce7c82e..e4485f8 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -32,6 +32,8 @@
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
@@ -52,6 +54,115 @@
using namespace llvm;
+namespace {
+class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
+public:
+ SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+ : RegisterRegAllocBase(N, D, C) {}
+};
+
+class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
+public:
+ VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+ : RegisterRegAllocBase(N, D, C) {}
+};
+
+static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
+ const TargetRegisterClass &RC) {
+ return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+}
+
+static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
+ const TargetRegisterClass &RC) {
+ return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+}
+
+
+/// -{sgpr|vgpr}-regalloc=... command line option.
+static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
+
+/// A dummy default pass factory indicates whether the register allocator is
+/// overridden on the command line.
+static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
+static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
+
+static SGPRRegisterRegAlloc
+defaultSGPRRegAlloc("default",
+ "pick SGPR register allocator based on -O option",
+ useDefaultRegisterAllocator);
+
+static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
+ RegisterPassParser<SGPRRegisterRegAlloc>>
+SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
+ cl::desc("Register allocator to use for SGPRs"));
+
+static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
+ RegisterPassParser<VGPRRegisterRegAlloc>>
+VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
+ cl::desc("Register allocator to use for VGPRs"));
+
+
+static void initializeDefaultSGPRRegisterAllocatorOnce() {
+ RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
+
+ if (!Ctor) {
+ Ctor = SGPRRegAlloc;
+ SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
+ }
+}
+
+static void initializeDefaultVGPRRegisterAllocatorOnce() {
+ RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
+
+ if (!Ctor) {
+ Ctor = VGPRRegAlloc;
+ VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
+ }
+}
+
+static FunctionPass *createBasicSGPRRegisterAllocator() {
+ return createBasicRegisterAllocator(onlyAllocateSGPRs);
+}
+
+static FunctionPass *createGreedySGPRRegisterAllocator() {
+ return createGreedyRegisterAllocator(onlyAllocateSGPRs);
+}
+
+static FunctionPass *createFastSGPRRegisterAllocator() {
+ return createFastRegisterAllocator(onlyAllocateSGPRs, false);
+}
+
+static FunctionPass *createBasicVGPRRegisterAllocator() {
+ return createBasicRegisterAllocator(onlyAllocateVGPRs);
+}
+
+static FunctionPass *createGreedyVGPRRegisterAllocator() {
+ return createGreedyRegisterAllocator(onlyAllocateVGPRs);
+}
+
+static FunctionPass *createFastVGPRRegisterAllocator() {
+ return createFastRegisterAllocator(onlyAllocateVGPRs, true);
+}
+
+static SGPRRegisterRegAlloc basicRegAllocSGPR(
+ "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
+static SGPRRegisterRegAlloc greedyRegAllocSGPR(
+ "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
+
+static SGPRRegisterRegAlloc fastRegAllocSGPR(
+ "fast", "fast register allocator", createFastSGPRRegisterAllocator);
+
+
+static VGPRRegisterRegAlloc basicRegAllocVGPR(
+ "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc greedyRegAllocVGPR(
+ "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
+
+static VGPRRegisterRegAlloc fastRegAllocVGPR(
+ "fast", "fast register allocator", createFastVGPRRegisterAllocator);
+}
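+
+// Illustrative usage (not part of this patch): with the hooks above, the
+// allocator can be selected independently per register bank on the command
+// line, e.g.
+//   llc -march=amdgcn -sgpr-regalloc=basic -vgpr-regalloc=greedy ...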
+
+
static cl::opt<bool> EnableR600StructurizeCFG(
"r600-ir-structurize",
cl::desc("Use StructurizeCFG IR pass"),
@@ -162,6 +273,11 @@
cl::init(true),
cl::Hidden);
+static cl::opt<bool> OptVGPRLiveRange(
+ "amdgpu-opt-vgpr-liverange",
+ cl::desc("Enable VGPR liverange optimizations for if-else structure"),
+ cl::init(true), cl::Hidden);
+
// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
"amdgpu-atomic-optimizations",
@@ -193,6 +309,21 @@
cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnableLDSReplaceWithPointer(
+ "amdgpu-enable-lds-replace-with-pointer",
+ cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
+ cl::Hidden);
+
+static cl::opt<bool, true> EnableLowerModuleLDS(
+ "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
+ cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
+ cl::Hidden);
+
+static cl::opt<bool> EnablePreRAOptimizations(
+ "amdgpu-enable-pre-ra-optimizations",
+ cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
+ cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -215,9 +346,11 @@
initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
initializeSIOptimizeExecMaskingPreRAPass(*PR);
+ initializeSIOptimizeVGPRLiveRangePass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUFixFunctionBitcastsPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
+ initializeAMDGPUAttributorPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
@@ -228,12 +361,15 @@
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
+ initializeAMDGPURegBankCombinerPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPULateCodeGenPreparePass(*PR);
initializeAMDGPUPropagateAttributesEarlyPass(*PR);
initializeAMDGPUPropagateAttributesLatePass(*PR);
+ initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
+ initializeAMDGPULowerModuleLDSPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
@@ -242,9 +378,8 @@
initializeSIModeRegisterPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
- initializeSIRemoveShortExecBranchesPass(*PR);
initializeSIPreEmitPeepholePass(*PR);
- initializeSIInsertSkipsPass(*PR);
+ initializeSILateBranchLoweringPass(*PR);
initializeSIMemoryLegalizerPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
initializeSIPreAllocateWWMRegsPass(*PR);
@@ -256,9 +391,9 @@
initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
- initializeGCNRegBankReassignPass(*PR);
+ initializeAMDGPUResourceUsageAnalysisPass(*PR);
initializeGCNNSAReassignPass(*PR);
- initializeSIAddIMGInitPass(*PR);
+ initializeGCNPreRAOptimizationsPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -388,6 +523,7 @@
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
+bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
@@ -408,6 +544,7 @@
if (const Function *F = dyn_cast<Function>(&GV))
return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
+ GV.removeDeadConstantUsers();
return !GV.use_empty();
}
@@ -480,8 +617,7 @@
AAM.registerFunctionAnalysis<AMDGPUAA>();
}
-void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
- bool DebugPassManager) {
+void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerPipelineParsingCallback(
[this](StringRef PassName, ModulePassManager &PM,
ArrayRef<PassBuilder::PipelineElement>) {
@@ -501,6 +637,14 @@
PM.addPass(AMDGPUAlwaysInlinePass());
return true;
}
+ if (PassName == "amdgpu-replace-lds-use-with-pointer") {
+ PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
+ return true;
+ }
+ if (PassName == "amdgpu-lower-module-lds") {
+ PM.addPass(AMDGPULowerModuleLDSPass());
+ return true;
+ }
return false;
});
PB.registerPipelineParsingCallback(
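[Editor's aside] The parsing callbacks above expose the two LDS passes to textual new-pass-manager pipelines; they can also be scheduled programmatically. A hedged sketch, assuming an existing llvm::Module M and the target-internal AMDGPU pass headers, and respecting the ordering requirement stated later in this patch (pointer replacement before module-LDS lowering):

    LoopAnalysisManager LAM;
    FunctionAnalysisManager FAM;
    CGSCCAnalysisManager CGAM;
    ModuleAnalysisManager MAM;
    PassBuilder PB;
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    ModulePassManager MPM;
    MPM.addPass(AMDGPUReplaceLDSUseWithPointerPass()); // rewrite LDS uses first
    MPM.addPass(AMDGPULowerModuleLDSPass());           // then lower module LDS
    MPM.run(M, MAM);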
@@ -530,7 +674,6 @@
PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
return true;
}
-
return false;
});
@@ -546,16 +689,16 @@
return false;
});
- PB.registerPipelineStartEPCallback([this, DebugPassManager](
- ModulePassManager &PM,
- PassBuilder::OptimizationLevel Level) {
- FunctionPassManager FPM(DebugPassManager);
- FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
- FPM.addPass(AMDGPUUseNativeCallsPass());
- if (EnableLibCallSimplify && Level != PassBuilder::OptimizationLevel::O0)
- FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
- PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
- });
+ PB.registerPipelineStartEPCallback(
+ [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
+ FunctionPassManager FPM;
+ FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
+ FPM.addPass(AMDGPUUseNativeCallsPass());
+ if (EnableLibCallSimplify &&
+ Level != PassBuilder::OptimizationLevel::O0)
+ FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
+ PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ });
PB.registerPipelineEarlySimplificationEPCallback(
[this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
@@ -577,12 +720,11 @@
});
PB.registerCGSCCOptimizerLateEPCallback(
- [this, DebugPassManager](CGSCCPassManager &PM,
- PassBuilder::OptimizationLevel Level) {
+ [this](CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
if (Level == PassBuilder::OptimizationLevel::O0)
return;
- FunctionPassManager FPM(DebugPassManager);
+ FunctionPassManager FPM;
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
@@ -732,6 +874,9 @@
// anything.
disablePass(&StackMapLivenessID);
disablePass(&FuncletLayoutID);
+ // Garbage collection is not supported.
+ disablePass(&GCLoweringID);
+ disablePass(&ShadowStackGCLoweringID);
}
AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
@@ -754,6 +899,19 @@
bool addGCPasses() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
+
+ /// Check if a pass is enabled given \p Opt option. The option always
+ /// overrides defaults if explicitly used. Otherwise its default will
+ /// be used given that a pass shall work at an optimization \p Level
+ /// minimum.
+ bool isPassEnabled(const cl::opt<bool> &Opt,
+ CodeGenOpt::Level Level = CodeGenOpt::Default) const {
+ if (Opt.getNumOccurrences())
+ return Opt;
+ if (TM->getOptLevel() < Level)
+ return false;
+ return Opt;
+ }
};
std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
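[Editor's aside] isPassEnabled() above folds two decisions into one query: an explicitly passed flag always wins, and otherwise the flag's default only applies once the pass's minimum optimization level is reached. A standalone sketch of that rule, with invented names (flagSetOnCmdLine stands in for Opt.getNumOccurrences() != 0):

    // Returns whether a pass should run, mirroring isPassEnabled() above.
    bool shouldRunPass(bool flagSetOnCmdLine, bool flagValue,
                       int currentOptLevel, int passMinOptLevel) {
      if (flagSetOnCmdLine)
        return flagValue;               // explicit command-line flag always wins
      if (currentOptLevel < passMinOptLevel)
        return false;                   // below the pass's minimum -O level
      return flagValue;                 // otherwise fall back to the default
    }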
@@ -803,9 +961,18 @@
bool addLegalizeMachineIR() override;
void addPreRegBankSelect() override;
bool addRegBankSelect() override;
+ void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
void addFastRegAlloc() override;
void addOptimizedRegAlloc() override;
+
+ FunctionPass *createSGPRAllocPass(bool Optimized);
+ FunctionPass *createVGPRAllocPass(bool Optimized);
+ FunctionPass *createRegAllocPass(bool Optimized) override;
+
+ bool addRegAssignAndRewriteFast() override;
+ bool addRegAssignAndRewriteOptimized() override;
+
void addPreRegAlloc() override;
bool addPreRewrite() override;
void addPostRegAlloc() override;
@@ -856,9 +1023,6 @@
// A call to propagate attributes pass in the backend in case opt was not run.
addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
- addPass(createAtomicExpandPass());
-
-
addPass(createAMDGPULowerIntrinsicsPass());
// Function calls are not supported, so make sure we inline everything.
@@ -878,14 +1042,28 @@
// Replace OpenCL enqueued block function pointers with global variables.
addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
- if (TM.getOptLevel() > CodeGenOpt::None) {
+ // Can increase LDS used by kernel so runs before PromoteAlloca
+ if (EnableLowerModuleLDS) {
+ // The pass "amdgpu-replace-lds-use-with-pointer" need to be run before the
+ // pass "amdgpu-lower-module-lds", and also it required to be run only if
+ // "amdgpu-lower-module-lds" pass is enabled.
+ if (EnableLDSReplaceWithPointer)
+ addPass(createAMDGPUReplaceLDSUseWithPointerPass());
+
+ addPass(createAMDGPULowerModuleLDSPass());
+ }
+
+ if (TM.getOptLevel() > CodeGenOpt::None)
addPass(createInferAddressSpacesPass());
+
+ addPass(createAtomicExpandPass());
+
+ if (TM.getOptLevel() > CodeGenOpt::None) {
addPass(createAMDGPUPromoteAlloca());
if (EnableSROA)
addPass(createSROAPass());
-
- if (EnableScalarIRPasses)
+ if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses();
if (EnableAMDGPUAliasAnalysis) {
@@ -896,11 +1074,11 @@
AAR.addAAResult(WrapperPass->getResult());
}));
}
- }
- if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
- // TODO: May want to move later or split into an early and late one.
- addPass(createAMDGPUCodeGenPreparePass());
+ if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
+ // TODO: May want to move later or split into an early and late one.
+ addPass(createAMDGPUCodeGenPreparePass());
+ }
}
TargetPassConfig::addIRPasses();
@@ -917,7 +1095,7 @@
// %1 = shl %a, 2
//
// but EarlyCSE can do neither of them.
- if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
+ if (isPassEnabled(EnableScalarIRPasses))
addEarlyCSEOrGVNPass();
}
@@ -929,11 +1107,9 @@
EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
- addPass(&AMDGPUPerfHintAnalysisID);
-
TargetPassConfig::addCodeGenPrepare();
- if (EnableLoadStoreVectorizer)
+ if (isPassEnabled(EnableLoadStoreVectorizer))
addPass(createLoadStoreVectorizerPass());
// LowerSwitch pass may introduce unreachable blocks that can
@@ -944,7 +1120,8 @@
}
bool AMDGPUPassConfig::addPreISel() {
- addPass(createFlattenCFGPass());
+ if (TM->getOptLevel() > CodeGenOpt::None)
+ addPass(createFlattenCFGPass());
return false;
}
@@ -1014,13 +1191,15 @@
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
- addPass(createAMDGPULateCodeGenPreparePass());
- if (EnableAtomicOptimizations) {
+ if (TM->getOptLevel() > CodeGenOpt::None)
+ addPass(createAMDGPULateCodeGenPreparePass());
+
+ if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
addPass(createAMDGPUAtomicOptimizerPass());
}
- // FIXME: We need to run a pass to propagate the attributes when calls are
- // supported.
+ if (TM->getOptLevel() > CodeGenOpt::None)
+ addPass(createSinkingPass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
@@ -1032,13 +1211,15 @@
}
addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
}
- addPass(createSinkingPass());
addPass(createAMDGPUAnnotateUniformValues());
if (!LateCFGStructurize) {
addPass(createSIAnnotateControlFlowPass());
}
addPass(createLCSSAPass());
+ if (TM->getOptLevel() > CodeGenOpt::Less)
+ addPass(&AMDGPUPerfHintAnalysisID);
+
return false;
}
@@ -1055,15 +1236,14 @@
addPass(&SIFoldOperandsID);
if (EnableDPPCombine)
addPass(&GCNDPPCombineID);
- addPass(&DeadMachineInstructionElimID);
addPass(&SILoadStoreOptimizerID);
- if (EnableSDWAPeephole) {
+ if (isPassEnabled(EnableSDWAPeephole)) {
addPass(&SIPeepholeSDWAID);
addPass(&EarlyMachineLICMID);
addPass(&MachineCSEID);
addPass(&SIFoldOperandsID);
- addPass(&DeadMachineInstructionElimID);
}
+ addPass(&DeadMachineInstructionElimID);
addPass(createSIShrinkInstructionsPass());
}
@@ -1079,7 +1259,6 @@
AMDGPUPassConfig::addInstSelector();
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
- addPass(createSIAddIMGInitPass());
return false;
}
@@ -1109,12 +1288,13 @@
return false;
}
+void GCNPassConfig::addPreGlobalInstructionSelect() {
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ addPass(createAMDGPURegBankCombiner(IsOptNone));
+}
+
bool GCNPassConfig::addGlobalInstructionSelect() {
- addPass(new InstructionSelect());
- // TODO: Fix instruction selection to do the right thing for image
- // instructions with tfe or lwe in the first place, instead of running a
- // separate pass to fix them up?
- addPass(createSIAddIMGInitPass());
+ addPass(new InstructionSelect(getOptLevel()));
return false;
}
@@ -1147,8 +1327,21 @@
if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
- insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+ if (isPassEnabled(EnablePreRAOptimizations))
+ insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
+
+ // This is not an essential optimization and it has a noticeable impact on
+ // compilation time, so we only enable it from O2.
+ if (TM->getOptLevel() > CodeGenOpt::Less)
+ insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+
+ // FIXME: when an instruction with a killed operand is inside a bundle, only
+ // the BUNDLE instruction appears as the kill of that register in
+ // LiveVariables. This triggers a machine verifier failure, so verification
+ // is skipped here; we should fix that and re-enable the verifier.
+ if (OptVGPRLiveRange)
+ insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID, false);
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
@@ -1161,10 +1354,81 @@
}
bool GCNPassConfig::addPreRewrite() {
- if (EnableRegReassign) {
+ if (EnableRegReassign)
addPass(&GCNNSAReassignID);
- addPass(&GCNRegBankReassignID);
- }
+ return true;
+}
+
+FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
+ // Initialize the global default.
+ llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
+ initializeDefaultSGPRRegisterAllocatorOnce);
+
+ RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
+ if (Ctor != useDefaultRegisterAllocator)
+ return Ctor();
+
+ if (Optimized)
+ return createGreedyRegisterAllocator(onlyAllocateSGPRs);
+
+ return createFastRegisterAllocator(onlyAllocateSGPRs, false);
+}
+
+FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
+ // Initialize the global default.
+ llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
+ initializeDefaultVGPRRegisterAllocatorOnce);
+
+ RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
+ if (Ctor != useDefaultRegisterAllocator)
+ return Ctor();
+
+ if (Optimized)
+ return createGreedyVGPRRegisterAllocator();
+
+ return createFastVGPRRegisterAllocator();
+}
+
+FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
+ llvm_unreachable("should not be used");
+}
+
+static const char RegAllocOptNotSupportedMessage[] =
+ "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
+
+bool GCNPassConfig::addRegAssignAndRewriteFast() {
+ if (!usingDefaultRegAlloc())
+ report_fatal_error(RegAllocOptNotSupportedMessage);
+
+ addPass(createSGPRAllocPass(false));
+
+ // Equivalent of PEI for SGPRs.
+ addPass(&SILowerSGPRSpillsID);
+
+ addPass(createVGPRAllocPass(false));
+ return true;
+}
+
+bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
+ if (!usingDefaultRegAlloc())
+ report_fatal_error(RegAllocOptNotSupportedMessage);
+
+ addPass(createSGPRAllocPass(true));
+
+ // Commit allocated register changes. This is mostly necessary because too
+ // many things rely on the use lists of the physical registers, such as the
+ // verifier. This is only necessary with allocators which use LiveIntervals,
+ // since FastRegAlloc does the replacements itself.
+ addPass(createVirtRegRewriter(false));
+
+ // Equivalent of PEI for SGPRs.
+ addPass(&SILowerSGPRSpillsID);
+
+ addPass(createVGPRAllocPass(true));
+
+ addPreRewrite();
+ addPass(&VirtRegRewriterID);
+
return true;
}
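[Editor's aside] With this change the generic -regalloc flag is rejected for amdgcn (see RegAllocOptNotSupportedMessage above), and the SGPR and VGPR allocators are selected through the two new registries, e.g. llc's -sgpr-regalloc= and -vgpr-regalloc= options named in that message. A hypothetical downstream allocator could be plugged in the same way the built-in entries are registered at the top of this file, roughly:

    // Hypothetical example, not part of the patch: register a custom VGPR
    // allocator in the new VGPRRegisterRegAlloc registry. onlyAllocateVGPRs
    // is the register-class filter defined earlier in this file.
    static FunctionPass *createDemoVGPRRegisterAllocator() {
      return createGreedyRegisterAllocator(onlyAllocateVGPRs);
    }

    static VGPRRegisterRegAlloc demoRegAllocVGPR(
        "demo-greedy", "demo greedy VGPR register allocator",
        createDemoVGPRRegisterAllocator);
    // Then selectable with: llc -march=amdgcn -vgpr-regalloc=demo-greedy ...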
@@ -1173,9 +1437,6 @@
if (getOptLevel() > CodeGenOpt::None)
addPass(&SIOptimizeExecMaskingID);
TargetPassConfig::addPostRegAlloc();
-
- // Equivalent of PEI for SGPRs.
- addPass(&SILowerSGPRSpillsID);
}
void GCNPassConfig::addPreSched2() {
@@ -1185,15 +1446,18 @@
void GCNPassConfig::addPreEmitPass() {
addPass(createSIMemoryLegalizerPass());
addPass(createSIInsertWaitcntsPass());
- addPass(createSIShrinkInstructionsPass());
+
+ if (TM->getOptLevel() > CodeGenOpt::None)
+ addPass(createSIShrinkInstructionsPass());
+
addPass(createSIModeRegisterPass());
if (getOptLevel() > CodeGenOpt::None)
addPass(&SIInsertHardClausesID);
- addPass(&SIRemoveShortExecBranchesID);
- addPass(&SIInsertSkipsPassID);
- addPass(&SIPreEmitPeepholeID);
+ addPass(&SILateBranchLoweringPassID);
+ if (getOptLevel() > CodeGenOpt::None)
+ addPass(&SIPreEmitPeepholeID);
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able to handle all hazards correctly. This is because if there
// are multiple scheduling regions in a basic block, the regions are scheduled
@@ -1217,8 +1481,8 @@
yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- return new yaml::SIMachineFunctionInfo(*MFI,
- *MF.getSubtarget().getRegisterInfo());
+ return new yaml::SIMachineFunctionInfo(
+ *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
}
bool GCNTargetMachine::parseMachineFunctionInfo(
@@ -1229,7 +1493,8 @@
MachineFunction &MF = PFS.MF;
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- MFI->initializeBaseYamlFields(YamlMFI);
+ if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
+ return true;
if (MFI->Occupancy == 0) {
// Fixup the subtarget dependent default value.
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 95aefa2..1bfe026 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -35,6 +35,7 @@
static bool EnableLateStructurizeCFG;
static bool EnableFunctionCalls;
static bool EnableFixedFunctionABI;
+ static bool EnableLowerModuleLDS;
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
@@ -51,8 +52,7 @@
void adjustPassManager(PassManagerBuilder &) override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool DebugPassManager) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
void registerDefaultAliasAnalyses(AAManager &) override;
/// Get the integer value of a null pointer in the given address space.
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 7b8a796..63f449f 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
@@ -39,7 +40,7 @@
static cl::opt<unsigned> UnrollThresholdIf(
"amdgpu-unroll-threshold-if",
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
- cl::init(150), cl::Hidden);
+ cl::init(200), cl::Hidden);
static cl::opt<bool> UnrollRuntimeLocal(
"amdgpu-unroll-runtime-local",
@@ -106,6 +107,10 @@
UP.MaxCount = std::numeric_limits<unsigned>::max();
UP.Partial = true;
+ // Conditional branch in a loop back edge needs 3 additional exec
+ // manipulations on average.
+ UP.BEInsns += 3;
+
// TODO: Do we want runtime unrolling?
// Maximum alloca size than can fit registers. Reserve 16 registers.
@@ -310,8 +315,17 @@
return getHardwareNumberOfRegisters(false) / NumVGPRs;
}
-unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
- return 32;
+TypeSize
+GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(32);
+ case TargetTransformInfo::RGK_FixedWidthVector:
+ return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(0);
+ }
+ llvm_unreachable("Unsupported register kind");
}
unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
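[Editor's aside] getRegisterBitWidth() now returns a TypeSize keyed by register kind, so packed-FP32 subtargets report 64-bit fixed-width vector registers. A hedged sketch of how a caller would consume the new signature, assuming TTI is an already-constructed TargetTransformInfo:

    TypeSize VecBits =
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
    // 64 bits with packed FP32 ops -> room for <2 x float>, otherwise 32 bits.
    unsigned MaxF32Lanes = VecBits.getFixedSize() / 32;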
@@ -321,7 +335,9 @@
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
return 32 * 4 / ElemWidth;
- return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1;
+ return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
+ : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+ : 1;
}
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -495,14 +511,12 @@
}
}
-int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind,
- TTI::OperandValueKind Opd1Info,
- TTI::OperandValueKind Opd2Info,
- TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo,
- ArrayRef<const Value *> Args,
- const Instruction *CxtI) {
+InstructionCost GCNTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
EVT OrigTy = TLI->getValueType(DL, Ty);
if (!OrigTy.isSimple()) {
// FIXME: We're having to query the throughput cost so that the basic
@@ -518,7 +532,7 @@
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
bool IsFloat = Ty->isFPOrFPVectorTy();
// Assume that floating point arithmetic operations cost twice as much as
@@ -542,12 +556,13 @@
// similarly to what getCastInstrCost() does.
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
- unsigned Cost = getArithmeticInstrCost(
+ InstructionCost Cost = getArithmeticInstrCost(
Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo, Args, CxtI);
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
- return getScalarizationOverhead(VTy, Args) + Num * Cost;
+ SmallVector<Type *> Tys(Args.size(), Ty);
+ return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
}
// We don't know anything about this scalar instruction.
@@ -555,7 +570,7 @@
}
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// Because we don't have any legal vector operations, but the legal types, we
@@ -628,6 +643,8 @@
LLVM_FALLTHROUGH;
case ISD::FADD:
case ISD::FSUB:
+ if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
+ NElts = (NElts + 1) / 2;
if (SLT == MVT::f64)
return LT.first * NElts * get64BitInstrCost(CostKind);
@@ -713,8 +730,9 @@
}
}
-int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
if (ICA.getID() == Intrinsic::fabs)
return 0;
@@ -731,45 +749,34 @@
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
- Type *RetTy = ICA.getReturnType();
- unsigned VF = ICA.getVectorFactor().getFixedValue();
unsigned RetVF =
(RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
: 1);
- assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
// Assume that we need to scalarize this intrinsic.
- SmallVector<Type *, 4> Types;
- for (const Value *Op : Args) {
- Type *OpTy = Op->getType();
- assert(VF == 1 || !OpTy->isVectorTy());
- Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
- }
-
- if (VF > 1 && !RetTy->isVoidTy())
- RetTy = FixedVectorType::get(RetTy, VF);
// Compute the scalarization overhead based on Args for a vector
// intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
// CostModel will pass a vector RetTy and VF is 1.
- unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
- if (RetVF > 1 || VF > 1) {
+ InstructionCost ScalarizationCost = InstructionCost::getInvalid();
+ if (RetVF > 1) {
ScalarizationCost = 0;
if (!RetTy->isVoidTy())
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
- ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+ ScalarizationCost +=
+ getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
}
- IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
- ScalarizationCost, I);
+ IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
+ ScalarizationCost);
return getIntrinsicInstrCost(Attrs, CostKind);
}
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
unsigned NElts = LT.second.isVector() ?
LT.second.getVectorNumElements() : 1;
@@ -779,69 +786,96 @@
if (SLT == MVT::f64)
return LT.first * NElts * get64BitInstrCost(CostKind);
- if (ST->has16BitInsts() && SLT == MVT::f16)
+ if ((ST->has16BitInsts() && SLT == MVT::f16) ||
+ (ST->hasPackedFP32Ops() && SLT == MVT::f32))
NElts = (NElts + 1) / 2;
// TODO: Get more refined intrinsic costs?
unsigned InstRate = getQuarterRateInstrCost(CostKind);
- if (ICA.getID() == Intrinsic::fma) {
+
+ switch (ICA.getID()) {
+ case Intrinsic::fma:
InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
: getQuarterRateInstrCost(CostKind);
+ break;
+ case Intrinsic::uadd_sat:
+ case Intrinsic::usub_sat:
+ case Intrinsic::sadd_sat:
+ case Intrinsic::ssub_sat:
+ static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
+ if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
+ NElts = 1;
+ break;
}
return LT.first * NElts * InstRate;
}
-unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
- TTI::TargetCostKind CostKind) {
- if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
- return Opcode == Instruction::PHI ? 0 : 1;
-
- // XXX - For some reason this isn't called for switch.
+InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ assert((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ const bool SCost =
+ (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
+ const int CBrCost = SCost ? 5 : 7;
switch (Opcode) {
- case Instruction::Br:
- case Instruction::Ret:
- return 10;
- default:
- return BaseT::getCFInstrCost(Opcode, CostKind);
+ case Instruction::Br: {
+ // Branch instruction takes about 4 slots on gfx900.
+ auto BI = dyn_cast_or_null<BranchInst>(I);
+ if (BI && BI->isUnconditional())
+ return SCost ? 1 : 4;
+ // Assume a conditional branch takes 3 additional exec-manipulation
+ // instructions on average.
+ return CBrCost;
}
+ case Instruction::Switch: {
+ auto SI = dyn_cast_or_null<SwitchInst>(I);
+ // Each case (including the default) takes 1 cmp + 1 cbr instruction on
+ // average.
+ return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
+ }
+ case Instruction::Ret:
+ return SCost ? 1 : 10;
+ }
+ return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
-int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
- bool IsPairwise,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind) {
+ if (TTI::requiresOrderedReduction(FMF))
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
+
EVT OrigTy = TLI->getValueType(DL, Ty);
// Computes cost on targets that have packed math instructions (which support
// 16-bit types only).
- if (IsPairwise ||
- !ST->hasVOP3PInsts() ||
- OrigTy.getScalarSizeInBits() != 16)
- return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
+ if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
return LT.first * getFullRateInstrCost();
}
-int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
- bool IsPairwise, bool IsUnsigned,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
EVT OrigTy = TLI->getValueType(DL, Ty);
// Computes cost on targets that have packed math instructions (which support
// 16-bit types only).
- if (IsPairwise ||
- !ST->hasVOP3PInsts() ||
- OrigTy.getScalarSizeInBits() != 16)
- return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
- CostKind);
+ if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
+ return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
return LT.first * getHalfRateInstrCost(CostKind);
}
-int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
- unsigned Index) {
+InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
case Instruction::InsertElement: {
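[Editor's worked example] The new control-flow cost table above, evaluated for the default throughput cost kind (the numbers are taken directly from the hunk and are purely illustrative):

    // CBrCost = 7 for TCK_RecipThroughput (5 for size/latency cost kinds).
    int UncondBr = 4;                 // unconditional branch, ~4 slots on gfx900
    int CondBr   = 7;                 // branch + ~3 exec-mask manipulations
    int Switch3  = (3 + 1) * (7 + 1); // 3 cases + default, 1 cmp + 1 cbr each = 32
    int Ret      = 10;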
@@ -1096,8 +1130,10 @@
}
}
-unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
- int Index, VectorType *SubTp) {
+InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
+ VectorType *VT, ArrayRef<int> Mask,
+ int Index, VectorType *SubTp) {
+ Kind = improveShuffleKindFromMask(Kind, Mask);
if (ST->hasVOP3PInsts()) {
if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
@@ -1115,7 +1151,7 @@
}
}
- return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp);
}
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
@@ -1141,9 +1177,15 @@
if (!CallerMode.isInlineCompatible(CalleeMode))
return false;
+ if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
+ Callee->hasFnAttribute(Attribute::InlineHint))
+ return true;
+
// Hack to make compile times reasonable.
- if (InlineMaxBB && !Callee->hasFnAttribute(Attribute::InlineHint)) {
- // Single BB does not increase total BB amount, thus subtract 1.
+ if (InlineMaxBB) {
+ // Single BB does not increase total BB amount.
+ if (Callee->size() == 1)
+ return true;
size_t BBSize = Caller->size() + Callee->size() - 1;
return BBSize <= InlineMaxBB;
}
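[Editor's worked example] The relaxed compatibility check above, with made-up block counts and InlineMaxBB assumed to be left at its default:

    // always_inline / inlinehint callees and single-block callees now pass
    // unconditionally; everything else is held to the block budget.
    size_t CallerBBs = 40, CalleeBBs = 12;
    size_t MaxBB = 1100; // assumed InlineMaxBB default, hypothetical here
    bool Ok = (CalleeBBs == 1) || (CallerBBs + CalleeBBs - 1 <= MaxBB); // true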
@@ -1192,8 +1234,10 @@
}
int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
- return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
- : getQuarterRateInstrCost(CostKind);
+ return ST->hasFullRate64Ops()
+ ? getFullRateInstrCost()
+ : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
+ : getQuarterRateInstrCost(CostKind);
}
R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
@@ -1209,8 +1253,9 @@
return getHardwareNumberOfRegisters(Vec);
}
-unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
- return 32;
+TypeSize
+R600TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ return TypeSize::getFixed(32);
}
unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
@@ -1265,8 +1310,9 @@
return 8;
}
-unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
- TTI::TargetCostKind CostKind) {
+InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
return Opcode == Instruction::PHI ? 0 : 1;
@@ -1276,12 +1322,12 @@
case Instruction::Ret:
return 10;
default:
- return BaseT::getCFInstrCost(Opcode, CostKind);
+ return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
}
-int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
- unsigned Index) {
+InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
case Instruction::InsertElement: {
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b29c941..37c0756 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -25,6 +25,7 @@
namespace llvm {
class AMDGPUTargetLowering;
+class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
@@ -120,7 +121,7 @@
unsigned getHardwareNumberOfRegisters(bool Vector) const;
unsigned getNumberOfRegisters(bool Vector) const;
unsigned getNumberOfRegisters(unsigned RCID) const;
- unsigned getRegisterBitWidth(bool Vector) const;
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -152,7 +153,7 @@
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
- int getArithmeticInstrCost(
+ InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
@@ -162,12 +163,14 @@
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
bool isInlineAsmSourceOfDivergence(const CallInst *CI,
ArrayRef<unsigned> Indices = {}) const;
- int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index);
bool isSourceOfDivergence(const Value *V) const;
bool isAlwaysUniform(const Value *V) const;
@@ -194,10 +197,11 @@
std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) const;
- unsigned getVectorSplitCost() { return 0; }
+ InstructionCost getVectorSplitCost() { return 0; }
- unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
- VectorType *SubTp);
+ InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ ArrayRef<int> Mask, int Index,
+ VectorType *SubTp);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
@@ -207,17 +211,15 @@
int getInlinerVectorBonusPercent() { return 0; }
- int getArithmeticReductionCost(
- unsigned Opcode,
- VectorType *Ty,
- bool IsPairwise,
+ InstructionCost getArithmeticReductionCost(
+ unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
- int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind);
- int getMinMaxReductionCost(
- VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
+ InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getMinMaxReductionCost(
+ VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
};
class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
@@ -242,7 +244,7 @@
TTI::PeelingPreferences &PP);
unsigned getHardwareNumberOfRegisters(bool Vec) const;
unsigned getNumberOfRegisters(bool Vec) const;
- unsigned getRegisterBitWidth(bool Vector) const;
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
@@ -252,8 +254,10 @@
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
unsigned getMaxInterleaveFactor(unsigned VF);
- unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
- int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index);
};
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 84d72e1..4e3d5fd 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This is a variant of the UnifyDivergentExitNodes pass. Rather than ensuring
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
// there is at most one ret and one unreachable instruction, it ensures there is
// at most one divergent exiting block.
//
@@ -54,6 +54,9 @@
namespace {
class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+private:
+ const TargetTransformInfo *TTI = nullptr;
+
public:
static char ID; // Pass identification, replacement for typeid
@@ -63,6 +66,9 @@
// We can preserve non-critical-edgeness when we unify function exit nodes
void getAnalysisUsage(AnalysisUsage &AU) const override;
+ BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
+ ArrayRef<BasicBlock *> ReturningBlocks,
+ StringRef Name);
bool runOnFunction(Function &F) override;
};
@@ -110,12 +116,9 @@
/// XXX - Is there a more efficient way to find this?
static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
BasicBlock &BB) {
- SmallVector<BasicBlock *, 8> Stack;
+ SmallVector<BasicBlock *, 8> Stack(predecessors(&BB));
SmallPtrSet<BasicBlock *, 8> Visited;
- for (BasicBlock *Pred : predecessors(&BB))
- Stack.push_back(Pred);
-
while (!Stack.empty()) {
BasicBlock *Top = Stack.pop_back_val();
if (!DA.isUniform(Top->getTerminator()))
@@ -130,49 +133,15 @@
return true;
}
-static void removeDoneExport(Function &F) {
- ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
- for (BasicBlock &BB : F) {
- for (Instruction &I : BB) {
- if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
- if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
- Intrin->setArgOperand(6, BoolFalse); // done
- } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
- Intrin->setArgOperand(4, BoolFalse); // done
- }
- }
- }
- }
-}
-
-static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
- ArrayRef<BasicBlock *> ReturningBlocks,
- bool InsertExport,
- const TargetTransformInfo &TTI,
- StringRef Name) {
+BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
+ Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
+ StringRef Name) {
// Otherwise, we need to insert a new basic block into the function, add PHI
// nodes (if the function returns values), and convert all of the return
// instructions into unconditional branches.
BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
IRBuilder<> B(NewRetBlock);
- if (InsertExport) {
- // Ensure that there's only one "done" export in the shader by removing the
- // "done" bit set on the original final export. More than one "done" export
- // can lead to undefined behavior.
- removeDoneExport(F);
-
- Value *Undef = UndefValue::get(B.getFloatTy());
- B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
- {
- B.getInt32(AMDGPU::Exp::ET_NULL),
- B.getInt32(0), // enabled channels
- Undef, Undef, Undef, Undef, // values
- B.getTrue(), // done
- B.getTrue(), // valid mask
- });
- }
-
PHINode *PN = nullptr;
if (F.getReturnType()->isVoidTy()) {
B.CreateRetVoid();
@@ -180,7 +149,6 @@
// If the function doesn't return void... add a PHI node to the block...
PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
"UnifiedRetVal");
- assert(!InsertExport);
B.CreateRet(PN);
}
@@ -206,7 +174,7 @@
for (BasicBlock *BB : ReturningBlocks) {
// Cleanup possible branch to unconditional branch to the return.
- simplifyCFG(BB, TTI, RequireAndPreserveDomTree ? &DTU : nullptr,
+ simplifyCFG(BB, *TTI, RequireAndPreserveDomTree ? &DTU : nullptr,
SimplifyCFGOptions().bonusInstThreshold(2));
}
@@ -220,25 +188,21 @@
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- // If there's only one exit, we don't need to do anything, unless this is a
- // pixel shader and that exit is an infinite loop, since we still have to
- // insert an export in that case.
- if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
+ // If there's only one exit, we don't need to do anything.
+ if (PDT.root_size() <= 1)
return false;
LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
// Loop over all of the blocks in a function, tracking all of the blocks that
// return.
SmallVector<BasicBlock *, 4> ReturningBlocks;
- SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
SmallVector<BasicBlock *, 4> UnreachableBlocks;
// Dummy return block for infinite loop.
BasicBlock *DummyReturnBB = nullptr;
- bool InsertExport = false;
-
bool Changed = false;
std::vector<DominatorTree::UpdateType> Updates;
@@ -246,8 +210,6 @@
if (isa<ReturnInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
ReturningBlocks.push_back(BB);
- else
- UniformlyReachedRetBlocks.push_back(BB);
} else if (isa<UnreachableInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
UnreachableBlocks.push_back(BB);
@@ -259,36 +221,6 @@
"DummyReturnBlock", &F);
Type *RetTy = F.getReturnType();
Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
-
- // For pixel shaders, the producer guarantees that an export is
- // executed before each return instruction. However, if there is an
- // infinite loop and we insert a return ourselves, we need to uphold
- // that guarantee by inserting a null export. This can happen e.g. in
- // an infinite loop with kill instructions, which is supposed to
- // terminate. However, we don't need to do this if there is a non-void
- // return value, since then there is an epilog afterwards which will
- // still export.
- //
- // Note: In the case where only some threads enter the infinite loop,
- // this can result in the null export happening redundantly after the
- // original exports. However, The last "real" export happens after all
- // the threads that didn't enter an infinite loop converged, which
- // means that the only extra threads to execute the null export are
- // threads that entered the infinite loop, and they only could've
- // exited through being killed which sets their exec bit to 0.
- // Therefore, unless there's an actual infinite loop, which can have
- // invalid results, or there's a kill after the last export, which we
- // assume the frontend won't do, this export will have the same exec
- // mask as the last "real" export, and therefore the valid mask will be
- // overwritten with the same value and will still be correct. Also,
- // even though this forces an extra unnecessary export wait, we assume
- // that this happens rare enough in practice to that we don't have to
- // worry about performance.
- if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
- RetTy->isVoidTy()) {
- InsertExport = true;
- }
-
ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
ReturningBlocks.push_back(DummyReturnBB);
}
@@ -380,23 +312,9 @@
if (ReturningBlocks.empty())
return Changed; // No blocks return
- if (ReturningBlocks.size() == 1 && !InsertExport)
+ if (ReturningBlocks.size() == 1)
return Changed; // Already has a single return block
- const TargetTransformInfo &TTI
- = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
- // Unify returning blocks. If we are going to insert the export it is also
- // necessary to include blocks that are uniformly reached, because in addition
- // to inserting the export the "done" bits on existing exports will be cleared
- // and we do not want to end up with the normal export in a non-unified,
- // uniformly reached block with the "done" bit cleared.
- auto BlocksToUnify = std::move(ReturningBlocks);
- if (InsertExport) {
- llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
- }
-
- unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport, TTI,
- "UnifiedReturnBlock");
+ unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
return true;
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index b9a8c6b..56befe4 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -171,7 +171,7 @@
static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
for (MachineLoop::iterator iter = LoopInfo.begin(),
iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
- (*iter)->print(dbgs(), 0);
+ (*iter)->print(dbgs());
}
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index af4a479..00032c7 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -11,6 +11,7 @@
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -113,9 +114,7 @@
ImmTyInstOffset,
ImmTyOffset0,
ImmTyOffset1,
- ImmTyDLC,
- ImmTyGLC,
- ImmTySLC,
+ ImmTyCPol,
ImmTySWZ,
ImmTyTFE,
ImmTyD16,
@@ -299,6 +298,8 @@
return isRegKind() && getReg() == AMDGPU::SGPR_NULL;
}
+ bool isVRegWithInputMods() const;
+
bool isSDWAOperand(MVT type) const;
bool isSDWAFP16Operand() const;
bool isSDWAFP32Operand() const;
@@ -336,12 +337,7 @@
bool isFlatOffset() const { return isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
bool isLDS() const { return isImmTy(ImmTyLDS); }
- bool isDLC() const { return isImmTy(ImmTyDLC); }
- bool isGLC() const { return isImmTy(ImmTyGLC); }
- // "GLC_1" is a MatchClass of the GLC_1 operand with the default and forced
- // value of the GLC operand.
- bool isGLC_1() const { return isImmTy(ImmTyGLC); }
- bool isSLC() const { return isImmTy(ImmTySLC); }
+ bool isCPol() const { return isImmTy(ImmTyCPol); }
bool isSWZ() const { return isImmTy(ImmTySWZ); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isD16() const { return isImmTy(ImmTyD16); }
@@ -449,6 +445,26 @@
return isSSrcF16();
}
+ bool isSSrcV2FP32() const {
+ llvm_unreachable("cannot happen");
+ return isSSrcF32();
+ }
+
+ bool isSCSrcV2FP32() const {
+ llvm_unreachable("cannot happen");
+ return isSCSrcF32();
+ }
+
+ bool isSSrcV2INT32() const {
+ llvm_unreachable("cannot happen");
+ return isSSrcB32();
+ }
+
+ bool isSCSrcV2INT32() const {
+ llvm_unreachable("cannot happen");
+ return isSCSrcB32();
+ }
+
bool isSSrcOrLdsB32() const {
return isRegOrInlineNoMods(AMDGPU::SRegOrLds_32RegClassID, MVT::i32) ||
isLiteralImm(MVT::i32) || isExpr();
@@ -502,6 +518,22 @@
return isVSrcB16() || isLiteralImm(MVT::v2i16);
}
+ bool isVCSrcV2FP32() const {
+ return isVCSrcF64();
+ }
+
+ bool isVSrcV2FP32() const {
+ return isVSrcF64() || isLiteralImm(MVT::v2f32);
+ }
+
+ bool isVCSrcV2INT32() const {
+ return isVCSrcB64();
+ }
+
+ bool isVSrcV2INT32() const {
+ return isVSrcB64() || isLiteralImm(MVT::v2i32);
+ }
+
bool isVSrcF32() const {
return isVCSrcF32() || isLiteralImm(MVT::f32) || isExpr();
}
@@ -542,6 +574,102 @@
return isVISrcF16() || isVISrcB32();
}
+ bool isVISrc_64B64() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64);
+ }
+
+ bool isVISrc_64F64() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f64);
+ }
+
+ bool isVISrc_64V2FP32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f32);
+ }
+
+ bool isVISrc_64V2INT32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32);
+ }
+
+ bool isVISrc_256B64() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64);
+ }
+
+ bool isVISrc_256F64() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f64);
+ }
+
+ bool isVISrc_128B16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i16);
+ }
+
+ bool isVISrc_128V2B16() const {
+ return isVISrc_128B16();
+ }
+
+ bool isVISrc_128B32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i32);
+ }
+
+ bool isVISrc_128F32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::f32);
+ }
+
+ bool isVISrc_256V2FP32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32);
+ }
+
+ bool isVISrc_256V2INT32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32);
+ }
+
+ bool isVISrc_512B32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::i32);
+ }
+
+ bool isVISrc_512B16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::i16);
+ }
+
+ bool isVISrc_512V2B16() const {
+ return isVISrc_512B16();
+ }
+
+ bool isVISrc_512F32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f32);
+ }
+
+ bool isVISrc_512F16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f16);
+ }
+
+ bool isVISrc_512V2F16() const {
+ return isVISrc_512F16() || isVISrc_512B32();
+ }
+
+ bool isVISrc_1024B32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::i32);
+ }
+
+ bool isVISrc_1024B16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::i16);
+ }
+
+ bool isVISrc_1024V2B16() const {
+ return isVISrc_1024B16();
+ }
+
+ bool isVISrc_1024F32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::f32);
+ }
+
+ bool isVISrc_1024F16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::f16);
+ }
+
+ bool isVISrc_1024V2F16() const {
+ return isVISrc_1024F16() || isVISrc_1024B32();
+ }
+
bool isAISrcB32() const {
return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i32);
}
@@ -566,6 +694,14 @@
return isAISrcF16() || isAISrcB32();
}
+ bool isAISrc_64B64() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_64RegClassID, MVT::i64);
+ }
+
+ bool isAISrc_64F64() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_64RegClassID, MVT::f64);
+ }
+
bool isAISrc_128B32() const {
return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i32);
}
@@ -590,6 +726,22 @@
return isAISrc_128F16() || isAISrc_128B32();
}
+ bool isVISrc_128F16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::f16);
+ }
+
+ bool isVISrc_128V2F16() const {
+ return isVISrc_128F16() || isVISrc_128B32();
+ }
+
+ bool isAISrc_256B64() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_256RegClassID, MVT::i64);
+ }
+
+ bool isAISrc_256F64() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_256RegClassID, MVT::f64);
+ }
+
bool isAISrc_512B32() const {
return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i32);
}
@@ -837,9 +989,7 @@
case ImmTyInstOffset: OS << "InstOffset"; break;
case ImmTyOffset0: OS << "Offset0"; break;
case ImmTyOffset1: OS << "Offset1"; break;
- case ImmTyDLC: OS << "DLC"; break;
- case ImmTyGLC: OS << "GLC"; break;
- case ImmTySLC: OS << "SLC"; break;
+ case ImmTyCPol: OS << "CPol"; break;
case ImmTySWZ: OS << "SWZ"; break;
case ImmTyTFE: OS << "TFE"; break;
case ImmTyD16: OS << "D16"; break;
@@ -1021,6 +1171,7 @@
bool ForcedDPP = false;
bool ForcedSDWA = false;
KernelScopeInfo KernelScope;
+ unsigned CPolSeen;
/// @name Auto-generated Match Functions
/// {
@@ -1061,7 +1212,8 @@
bool ParseDirectiveHSACodeObjectISA();
bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
bool ParseDirectiveAMDKernelCodeT();
- bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const;
+ // TODO: Possibly make subtargetHasRegister const.
+ bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo);
bool ParseDirectiveAMDGPUHsaKernel();
bool ParseDirectiveISAVersion();
@@ -1105,7 +1257,7 @@
bool updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex,
unsigned RegWidth);
void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands,
- bool IsAtomic, bool IsAtomicReturn, bool IsLds = false);
+ bool IsAtomic, bool IsLds = false);
void cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
bool IsGdsHardcoded);
@@ -1140,7 +1292,7 @@
// AsmParser::parseDirectiveSet() cannot be specialized for specific target.
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
MCContext &Ctx = getContext();
- if (ISA.Major >= 6 && isHsaAbiVersion3(&getSTI())) {
+ if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) {
MCSymbol *Sym =
Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
@@ -1157,7 +1309,7 @@
Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
}
- if (ISA.Major >= 6 && isHsaAbiVersion3(&getSTI())) {
+ if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) {
initializeGprCountSymbol(IS_VGPR);
initializeGprCountSymbol(IS_SGPR);
} else
@@ -1165,10 +1317,6 @@
}
}
- bool hasXNACK() const {
- return AMDGPU::hasXNACK(getSTI());
- }
-
bool hasMIMG_R128() const {
return AMDGPU::hasMIMG_R128(getSTI());
}
@@ -1181,6 +1329,8 @@
return AMDGPU::hasGFX10A16(getSTI());
}
+ bool hasG16() const { return AMDGPU::hasG16(getSTI()); }
+
bool isSI() const {
return AMDGPU::isSI(getSTI());
}
@@ -1197,6 +1347,10 @@
return AMDGPU::isGFX9(getSTI());
}
+ bool isGFX90A() const {
+ return AMDGPU::isGFX90A(getSTI());
+ }
+
bool isGFX9Plus() const {
return AMDGPU::isGFX9Plus(getSTI());
}
@@ -1219,6 +1373,10 @@
return getFeatureBits()[AMDGPU::FeatureFlatInstOffsets];
}
+ bool hasArchitectedFlatScratch() const {
+ return getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+ }
+
bool hasSGPR102_SGPR103() const {
return !isVI() && !isGFX9();
}
@@ -1294,8 +1452,9 @@
bool (*ConvertResult)(int64_t&) = nullptr);
OperandMatchResultTy
- parseNamedBit(const char *Name, OperandVector &Operands,
+ parseNamedBit(StringRef Name, OperandVector &Operands,
AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
+ OperandMatchResultTy parseCPol(OperandVector &Operands);
OperandMatchResultTy parseStringWithPrefix(StringRef Prefix,
StringRef &Value,
SMLoc &StringLoc);
@@ -1379,14 +1538,19 @@
bool validateMIMGAddrSize(const MCInst &Inst);
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGDim(const MCInst &Inst);
- bool validateLdsDirect(const MCInst &Inst);
+ bool validateMIMGMSAA(const MCInst &Inst);
bool validateOpSel(const MCInst &Inst);
+ bool validateDPP(const MCInst &Inst, const OperandVector &Operands);
bool validateVccOperand(unsigned Reg) const;
bool validateVOP3Literal(const MCInst &Inst, const OperandVector &Operands);
bool validateMAIAccWrite(const MCInst &Inst, const OperandVector &Operands);
+ bool validateAGPRLdSt(const MCInst &Inst) const;
+ bool validateVGPRAlign(const MCInst &Inst) const;
+ bool validateGWS(const MCInst &Inst, const OperandVector &Operands);
bool validateDivScale(const MCInst &Inst);
bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands,
const SMLoc &IDLoc);
+ Optional<StringRef> validateLdsDirect(const MCInst &Inst);
unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@@ -1403,6 +1567,7 @@
bool isId(const AsmToken &Token, const StringRef Id) const;
bool isToken(const AsmToken::TokenKind Kind) const;
bool trySkipId(const StringRef Id);
+ bool trySkipId(const StringRef Pref, const StringRef Id);
bool trySkipId(const StringRef Id, const AsmToken::TokenKind Kind);
bool trySkipToken(const AsmToken::TokenKind Kind);
bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg);
@@ -1420,6 +1585,8 @@
void lex();
public:
+ void onBeginOfFile() override;
+
OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
OperandMatchResultTy parseOptionalOpr(OperandVector &Operands);
@@ -1451,16 +1618,12 @@
OperandMatchResultTy parseGPRIdxMode(OperandVector &Operands);
int64_t parseGPRIdxMacro();
- void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
- void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
- void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
- void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false, true); }
+ void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false); }
+ void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true); }
+ void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, true); }
void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
- AMDGPUOperand::Ptr defaultDLC() const;
- AMDGPUOperand::Ptr defaultGLC() const;
- AMDGPUOperand::Ptr defaultGLC_1() const;
- AMDGPUOperand::Ptr defaultSLC() const;
+ AMDGPUOperand::Ptr defaultCPol() const;
AMDGPUOperand::Ptr defaultSMRDOffset8() const;
AMDGPUOperand::Ptr defaultSMEMOffset() const;
@@ -1474,6 +1637,8 @@
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptionalIdx);
void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands);
@@ -1482,6 +1647,9 @@
void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
void cvtIntersectRay(MCInst &Inst, const OperandVector &Operands);
+ void cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands);
+
+ bool parseDimId(unsigned &Encoding);
OperandMatchResultTy parseDim(OperandVector &Operands);
OperandMatchResultTy parseDPP8(OperandVector &Operands);
OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
@@ -1551,11 +1719,16 @@
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
return &APFloat::IEEEsingle();
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return &APFloat::IEEEdouble();
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
@@ -1715,7 +1888,8 @@
// literal goes into the lower half and the upper half is zero. We also
// require that the literal may be losslessly converted to f16.
MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 :
- (type == MVT::v2i16)? MVT::i16 : type;
+ (type == MVT::v2i16)? MVT::i16 :
+ (type == MVT::v2f32)? MVT::f32 : type;
APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
return canLosslesslyConvertToFPType(FPLiteral, ExpectedType);
@@ -1725,6 +1899,13 @@
return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}
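+// A VGPR operand usable with input modifiers: a 32-bit VGPR, or a 64-bit
+// VGPR pair when the target supports 64-bit DPP.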
+bool AMDGPUOperand::isVRegWithInputMods() const {
+ return isRegClass(AMDGPU::VGPR_32RegClassID) ||
+ // GFX90A allows DPP on 64-bit operands.
+ (isRegClass(AMDGPU::VReg_64RegClassID) &&
+ AsmParser->getFeatureBits()[AMDGPU::Feature64BitDPP]);
+}
+
bool AMDGPUOperand::isSDWAOperand(MVT type) const {
if (AsmParser->isVI())
return isVReg32();
@@ -1751,8 +1932,9 @@
}
bool AMDGPUOperand::isBoolReg() const {
- return (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] && isSCSrcB64()) ||
- (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize32] && isSCSrcB32());
+ auto FB = AsmParser->getFeatureBits();
+ return isReg() && ((FB[AMDGPU::FeatureWavefrontSize64] && isSCSrcB64()) ||
+ (FB[AMDGPU::FeatureWavefrontSize32] && isSCSrcB32()));
}
uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
@@ -1806,6 +1988,7 @@
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
@@ -1849,7 +2032,11 @@
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
- case AMDGPU::OPERAND_REG_IMM_V2FP16: {
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32: {
bool lost;
APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
// Convert literal to single precision
@@ -1881,6 +2068,10 @@
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
if (isSafeTruncation(Val, 32) &&
AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
@@ -1897,6 +2088,7 @@
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Val));
setImmKindConst();
@@ -2000,6 +2192,7 @@
case 4: return AMDGPU::VReg_128RegClassID;
case 5: return AMDGPU::VReg_160RegClassID;
case 6: return AMDGPU::VReg_192RegClassID;
+ case 7: return AMDGPU::VReg_224RegClassID;
case 8: return AMDGPU::VReg_256RegClassID;
case 16: return AMDGPU::VReg_512RegClassID;
case 32: return AMDGPU::VReg_1024RegClassID;
@@ -2022,6 +2215,7 @@
case 4: return AMDGPU::SGPR_128RegClassID;
case 5: return AMDGPU::SGPR_160RegClassID;
case 6: return AMDGPU::SGPR_192RegClassID;
+ case 7: return AMDGPU::SGPR_224RegClassID;
case 8: return AMDGPU::SGPR_256RegClassID;
case 16: return AMDGPU::SGPR_512RegClassID;
}
@@ -2034,6 +2228,7 @@
case 4: return AMDGPU::AReg_128RegClassID;
case 5: return AMDGPU::AReg_160RegClassID;
case 6: return AMDGPU::AReg_192RegClassID;
+ case 7: return AMDGPU::AReg_224RegClassID;
case 8: return AMDGPU::AReg_256RegClassID;
case 16: return AMDGPU::AReg_512RegClassID;
case 32: return AMDGPU::AReg_1024RegClassID;
@@ -2529,7 +2724,7 @@
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) {
return nullptr;
}
- if (isHsaAbiVersion3(&getSTI())) {
+ if (isHsaAbiVersion3Or4(&getSTI())) {
if (!updateGprCountSymbols(RegKind, RegNum, RegWidth))
return nullptr;
} else
@@ -3200,7 +3395,7 @@
return true;
unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx);
- unsigned TFESize = Inst.getOperand(TFEIdx).getImm()? 1 : 0;
+ unsigned TFESize = (TFEIdx != -1 && Inst.getOperand(TFEIdx).getImm()) ? 1 : 0;
unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
if (DMask == 0)
DMask = 1;
@@ -3230,6 +3425,7 @@
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);
+ int A16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::a16);
assert(VAddr0Idx != -1);
assert(SrsrcIdx != -1);
@@ -3241,22 +3437,26 @@
unsigned Dim = Inst.getOperand(DimIdx).getImm();
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
bool IsNSA = SrsrcIdx - VAddr0Idx > 1;
- unsigned VAddrSize =
+ unsigned ActualAddrSize =
IsNSA ? SrsrcIdx - VAddr0Idx
: AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4;
+ bool IsA16 = (A16Idx != -1 && Inst.getOperand(A16Idx).getImm());
- unsigned AddrSize = BaseOpcode->NumExtraArgs +
- (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
- (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
- (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ unsigned ExpectedAddrSize =
+ AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16());
+
if (!IsNSA) {
- if (AddrSize > 8)
- AddrSize = 16;
- else if (AddrSize > 4)
- AddrSize = 8;
+ if (ExpectedAddrSize > 8)
+ ExpectedAddrSize = 16;
+
+ // Allow oversized 8 VGPR vaddr when only 5/6/7 VGPRs are required.
+ // This provides backward compatibility for assembly created
+ // before 160b/192b/224b types were directly supported.
+ if (ActualAddrSize == 8 && (ExpectedAddrSize >= 5 && ExpectedAddrSize <= 7))
+ return true;
}
- return VAddrSize == AddrSize;
+ return ActualAddrSize == ExpectedAddrSize;
}
bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) {
@@ -3298,6 +3498,29 @@
return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8;
}
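+// For MIMG opcodes whose base opcode is MSAA, the dim operand must name an
+// MSAA surface type.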
+bool AMDGPUAsmParser::validateMIMGMSAA(const MCInst &Inst) {
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+
+ if (!BaseOpcode->MSAA)
+ return true;
+
+ int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);
+ assert(DimIdx != -1);
+
+ unsigned Dim = Inst.getOperand(DimIdx).getImm();
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
+
+ return DimInfo->MSAA;
+}
+
static bool IsMovrelsSDWAOpcode(const unsigned Opcode)
{
switch (Opcode) {
@@ -3559,7 +3782,7 @@
}
}
-bool AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) {
+Optional<StringRef> AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) {
using namespace SIInstrFlags;
const unsigned Opcode = Inst.getOpcode();
@@ -3567,33 +3790,29 @@
// lds_direct register is defined so that it can be used
// with 9-bit operands only. Ignore encodings which do not accept these.
- if ((Desc.TSFlags & (VOP1 | VOP2 | VOP3 | VOPC | VOP3P | SIInstrFlags::SDWA)) == 0)
- return true;
+ const auto Enc = VOP1 | VOP2 | VOP3 | VOPC | VOP3P | SIInstrFlags::SDWA;
+ if ((Desc.TSFlags & Enc) == 0)
+ return None;
- const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
- const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
- const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
-
- const int SrcIndices[] = { Src1Idx, Src2Idx };
-
- // lds_direct cannot be specified as either src1 or src2.
- for (int SrcIdx : SrcIndices) {
- if (SrcIdx == -1) break;
- const MCOperand &Src = Inst.getOperand(SrcIdx);
+ for (auto SrcName : {OpName::src0, OpName::src1, OpName::src2}) {
+ auto SrcIdx = getNamedOperandIdx(Opcode, SrcName);
+ if (SrcIdx == -1)
+ break;
+ const auto &Src = Inst.getOperand(SrcIdx);
if (Src.isReg() && Src.getReg() == LDS_DIRECT) {
- return false;
+
+ if (isGFX90A())
+ return StringRef("lds_direct is not supported on this GPU");
+
+ if (IsRevOpcode(Opcode) || (Desc.TSFlags & SIInstrFlags::SDWA))
+ return StringRef("lds_direct cannot be used with this instruction");
+
+ if (SrcName != OpName::src0)
+ return StringRef("lds_direct may be used as src0 only");
}
}
- if (Src0Idx == -1)
- return true;
-
- const MCOperand &Src = Inst.getOperand(Src0Idx);
- if (!Src.isReg() || Src.getReg() != LDS_DIRECT)
- return true;
-
- // lds_direct is specified as src0. Check additional limitations.
- return (Desc.TSFlags & SIInstrFlags::SDWA) == 0 && !IsRevOpcode(Opcode);
+ return None;
}
SMLoc AMDGPUAsmParser::getFlatOffsetLoc(const OperandVector &Operands) const {
@@ -3624,7 +3843,7 @@
// For FLAT segment the offset must be positive;
// MSB is ignored and forced to zero.
- if (TSFlags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch)) {
+ if (TSFlags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch)) {
unsigned OffsetSize = AMDGPU::getNumFlatOffsetBits(getSTI(), true);
if (!isIntN(OffsetSize, Op.getImm())) {
Error(getFlatOffsetLoc(Operands),
@@ -3733,6 +3952,28 @@
return true;
}
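+// With 64-bit operands, only dpp_ctrl values that are legal for 64-bit DPP
+// (i.e. row_newbcast) are accepted.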
+bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
+ const OperandVector &Operands) {
+ const unsigned Opc = Inst.getOpcode();
+ int DppCtrlIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp_ctrl);
+ if (DppCtrlIdx < 0)
+ return true;
+ unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
+
+ if (!AMDGPU::isLegal64BitDPPControl(DppCtrl)) {
+ // DPP64 is supported for row_newbcast only.
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ if (Src0Idx >= 0 &&
+ getMRI()->getSubReg(Inst.getOperand(Src0Idx).getReg(), AMDGPU::sub1)) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
+ Error(S, "64 bit dpp only supports row_newbcast");
+ return false;
+ }
+ }
+
+ return true;
+}
+
// Check if VCC register matches wavefront size
bool AMDGPUAsmParser::validateVccOperand(unsigned Reg) const {
auto FB = getFeatureBits();
@@ -3802,18 +4043,148 @@
return true;
}
+// Returns -1 if not a register, 0 if VGPR and 1 if AGPR.
+static int IsAGPROperand(const MCInst &Inst, uint16_t NameIdx,
+ const MCRegisterInfo *MRI) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), NameIdx);
+ if (OpIdx < 0)
+ return -1;
+
+ const MCOperand &Op = Inst.getOperand(OpIdx);
+ if (!Op.isReg())
+ return -1;
+
+ unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
+ auto Reg = Sub ? Sub : Op.getReg();
+ const MCRegisterClass &AGPR32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID);
+ return AGPR32.contains(Reg) ? 1 : 0;
+}
+
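+// For FLAT/MUBUF/MTBUF/MIMG/DS, the data and destination operands must be
+// either all VGPRs or all AGPRs; AGPR forms are only accepted on gfx90a.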
+bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const {
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if ((TSFlags & (SIInstrFlags::FLAT | SIInstrFlags::MUBUF |
+ SIInstrFlags::MTBUF | SIInstrFlags::MIMG |
+ SIInstrFlags::DS)) == 0)
+ return true;
+
+ uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
+ : AMDGPU::OpName::vdata;
+
+ const MCRegisterInfo *MRI = getMRI();
+ int DstAreg = IsAGPROperand(Inst, AMDGPU::OpName::vdst, MRI);
+ int DataAreg = IsAGPROperand(Inst, DataNameIdx, MRI);
+
+ if ((TSFlags & SIInstrFlags::DS) && DataAreg >= 0) {
+ int Data2Areg = IsAGPROperand(Inst, AMDGPU::OpName::data1, MRI);
+ if (Data2Areg >= 0 && Data2Areg != DataAreg)
+ return false;
+ }
+
+ auto FB = getFeatureBits();
+ if (FB[AMDGPU::FeatureGFX90AInsts]) {
+ if (DataAreg < 0 || DstAreg < 0)
+ return true;
+ return DstAreg == DataAreg;
+ }
+
+ return DstAreg < 1 && DataAreg < 1;
+}
+
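+// On gfx90a, VGPR and AGPR tuples must start at an even register index.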
+bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const {
+ auto FB = getFeatureBits();
+ if (!FB[AMDGPU::FeatureGFX90AInsts])
+ return true;
+
+ const MCRegisterInfo *MRI = getMRI();
+ const MCRegisterClass &VGPR32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+ const MCRegisterClass &AGPR32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID);
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ const MCOperand &Op = Inst.getOperand(I);
+ if (!Op.isReg())
+ continue;
+
+ unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
+ if (!Sub)
+ continue;
+
+ if (VGPR32.contains(Sub) && ((Sub - AMDGPU::VGPR0) & 1))
+ return false;
+ if (AGPR32.contains(Sub) && ((Sub - AMDGPU::AGPR0) & 1))
+ return false;
+ }
+
+ return true;
+}
+
+// gfx90a has an undocumented limitation:
+// DS_GWS opcodes must use even aligned registers.
+bool AMDGPUAsmParser::validateGWS(const MCInst &Inst,
+ const OperandVector &Operands) {
+ if (!getFeatureBits()[AMDGPU::FeatureGFX90AInsts])
+ return true;
+
+ int Opc = Inst.getOpcode();
+ if (Opc != AMDGPU::DS_GWS_INIT_vi && Opc != AMDGPU::DS_GWS_BARRIER_vi &&
+ Opc != AMDGPU::DS_GWS_SEMA_BR_vi)
+ return true;
+
+ const MCRegisterInfo *MRI = getMRI();
+ const MCRegisterClass &VGPR32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+ int Data0Pos =
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0);
+ assert(Data0Pos != -1);
+ auto Reg = Inst.getOperand(Data0Pos).getReg();
+ auto RegIdx = Reg - (VGPR32.contains(Reg) ? AMDGPU::VGPR0 : AMDGPU::AGPR0);
+ if (RegIdx & 1) {
+ SMLoc RegLoc = getRegLoc(Reg, Operands);
+ Error(RegLoc, "vgpr must be even aligned");
+ return false;
+ }
+
+ return true;
+}
+
bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
const OperandVector &Operands,
const SMLoc &IDLoc) {
- int GLCPos = AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
- AMDGPU::OpName::glc1);
- if (GLCPos != -1) {
- // -1 is set by GLC_1 default operand. In all cases "glc" must be present
- // in the asm string, and the default value means it is not present.
- if (Inst.getOperand(GLCPos).getImm() == -1) {
+ int CPolPos = AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::cpol);
+ if (CPolPos == -1)
+ return true;
+
+ unsigned CPol = Inst.getOperand(CPolPos).getImm();
+
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if ((TSFlags & (SIInstrFlags::SMRD)) &&
+ (CPol & ~(AMDGPU::CPol::GLC | AMDGPU::CPol::DLC))) {
+ Error(IDLoc, "invalid cache policy for SMRD instruction");
+ return false;
+ }
+
+ if (isGFX90A() && (CPol & CPol::SCC)) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scc")]);
+ Error(S, "scc is not supported on this GPU");
+ return false;
+ }
+
+ if (!(TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet)))
+ return true;
+
+ if (TSFlags & SIInstrFlags::IsAtomicRet) {
+ if (!(TSFlags & SIInstrFlags::MIMG) && !(CPol & CPol::GLC)) {
Error(IDLoc, "instruction must use glc");
return false;
}
+ } else {
+ if (CPol & CPol::GLC) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("glc")]);
+ Error(S, "instruction must not use glc");
+ return false;
+ }
}
return true;
@@ -3822,9 +4193,8 @@
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
const SMLoc &IDLoc,
const OperandVector &Operands) {
- if (!validateLdsDirect(Inst)) {
- Error(getRegLoc(AMDGPU::LDS_DIRECT, Operands),
- "invalid use of lds_direct");
+ if (auto ErrMsg = validateLdsDirect(Inst)) {
+ Error(getRegLoc(LDS_DIRECT, Operands), *ErrMsg);
return false;
}
if (!validateSOPLiteral(Inst)) {
@@ -3851,6 +4221,9 @@
"invalid op_sel operand");
return false;
}
+ if (!validateDPP(Inst, Operands)) {
+ return false;
+ }
// For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
if (!validateMIMGD16(Inst)) {
Error(getImmLoc(AMDGPUOperand::ImmTyD16, Operands),
@@ -3861,6 +4234,11 @@
Error(IDLoc, "dim modifier is required on this GPU");
return false;
}
+ if (!validateMIMGMSAA(Inst)) {
+ Error(getImmLoc(AMDGPUOperand::ImmTyDim, Operands),
+ "invalid dim; must be MSAA type");
+ return false;
+ }
if (!validateMIMGDataSize(Inst)) {
Error(IDLoc,
"image data size does not match dmask and tfe");
@@ -3893,6 +4271,26 @@
if (!validateMAIAccWrite(Inst, Operands)) {
return false;
}
+ if (!validateCoherencyBits(Inst, Operands, IDLoc)) {
+ return false;
+ }
+
+ if (!validateAGPRLdSt(Inst)) {
+ Error(IDLoc, getFeatureBits()[AMDGPU::FeatureGFX90AInsts]
+ ? "invalid register class: data and dst should be all VGPR or AGPR"
+ : "invalid register class: agpr loads and stores not supported on this GPU"
+ );
+ return false;
+ }
+ if (!validateVGPRAlign(Inst)) {
+ Error(IDLoc,
+ "invalid register class: vgpr tuples must be 64 bit aligned");
+ return false;
+ }
+ if (!validateGWS(Inst, Operands)) {
+ return false;
+ }
+
if (!validateDivScale(Inst)) {
Error(IDLoc, "ABS not allowed in VOP3B instructions");
return false;
@@ -4062,21 +4460,19 @@
if (getSTI().getTargetTriple().getArch() != Triple::amdgcn)
return TokError("directive only supported for amdgcn architecture");
- std::string Target;
-
- SMLoc TargetStart = getLoc();
- if (getParser().parseEscapedString(Target))
+ std::string TargetIDDirective;
+ SMLoc TargetStart = getTok().getLoc();
+ if (getParser().parseEscapedString(TargetIDDirective))
return true;
- SMRange TargetRange = SMRange(TargetStart, getLoc());
- std::string ExpectedTarget;
- raw_string_ostream ExpectedTargetOS(ExpectedTarget);
- IsaInfo::streamIsaVersion(&getSTI(), ExpectedTargetOS);
+ SMRange TargetRange = SMRange(TargetStart, getTok().getLoc());
+ if (getTargetStreamer().getTargetID()->toString() != TargetIDDirective)
+ return getParser().Error(TargetRange.Start,
+ (Twine(".amdgcn_target directive's target id ") +
+ Twine(TargetIDDirective) +
+ Twine(" does not match the specified target id ") +
+ Twine(getTargetStreamer().getTargetID()->toString())).str());
- if (Target != ExpectedTargetOS.str())
- return Error(TargetRange.Start, "target must match options", TargetRange);
-
- getTargetStreamer().EmitDirectiveAMDGCNTarget(Target);
return false;
}
@@ -4143,12 +4539,12 @@
SMRange VGPRRange;
uint64_t NextFreeVGPR = 0;
+ uint64_t AccumOffset = 0;
SMRange SGPRRange;
uint64_t NextFreeSGPR = 0;
unsigned UserSGPRCount = 0;
bool ReserveVCC = true;
bool ReserveFlatScr = true;
- bool ReserveXNACK = hasXNACK();
Optional<bool> EnableWavefrontSize32;
while (true) {
@@ -4191,7 +4587,15 @@
if (!isUInt<sizeof(KD.private_segment_fixed_size) * CHAR_BIT>(Val))
return OutOfRangeError(ValRange);
KD.private_segment_fixed_size = Val;
+ } else if (ID == ".amdhsa_kernarg_size") {
+ if (!isUInt<sizeof(KD.kernarg_size) * CHAR_BIT>(Val))
+ return OutOfRangeError(ValRange);
+ KD.kernarg_size = Val;
} else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") {
+ if (hasArchitectedFlatScratch())
+ return Error(IDRange.Start,
+ "directive is not supported with architected flat scratch",
+ IDRange);
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
Val, ValRange);
@@ -4222,6 +4626,10 @@
if (Val)
UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
+ if (hasArchitectedFlatScratch())
+ return Error(IDRange.Start,
+ "directive is not supported with architected flat scratch",
+ IDRange);
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
ValRange);
@@ -4241,10 +4649,20 @@
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
Val, ValRange);
} else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
- PARSE_BITS_ENTRY(
- KD.compute_pgm_rsrc2,
- COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val,
- ValRange);
+ if (hasArchitectedFlatScratch())
+ return Error(IDRange.Start,
+ "directive is not supported with architected flat scratch",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange);
+ } else if (ID == ".amdhsa_enable_private_segment") {
+ if (!hasArchitectedFlatScratch())
+ return Error(
+ IDRange.Start,
+ "directive is not supported without architected flat scratch",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange);
} else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val,
@@ -4271,6 +4689,10 @@
} else if (ID == ".amdhsa_next_free_sgpr") {
SGPRRange = ValRange;
NextFreeSGPR = Val;
+ } else if (ID == ".amdhsa_accum_offset") {
+ if (!isGFX90A())
+ return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
+ AccumOffset = Val;
} else if (ID == ".amdhsa_reserve_vcc") {
if (!isUInt<1>(Val))
return OutOfRangeError(ValRange);
@@ -4278,6 +4700,10 @@
} else if (ID == ".amdhsa_reserve_flat_scratch") {
if (IVersion.Major < 7)
return Error(IDRange.Start, "directive requires gfx7+", IDRange);
+ if (hasArchitectedFlatScratch())
+ return Error(IDRange.Start,
+ "directive is not supported with architected flat scratch",
+ IDRange);
if (!isUInt<1>(Val))
return OutOfRangeError(ValRange);
ReserveFlatScr = Val;
@@ -4286,7 +4712,9 @@
return Error(IDRange.Start, "directive requires gfx8+", IDRange);
if (!isUInt<1>(Val))
return OutOfRangeError(ValRange);
- ReserveXNACK = Val;
+ if (Val != getTargetStreamer().getTargetID()->isXnackOnOrAny())
+ return getParser().Error(IDRange.Start, ".amdhsa_reserve_xnack_mask does not match target id",
+ IDRange);
} else if (ID == ".amdhsa_float_round_mode_32") {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, Val, ValRange);
@@ -4311,6 +4739,11 @@
return Error(IDRange.Start, "directive requires gfx9+", IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val,
ValRange);
+ } else if (ID == ".amdhsa_tg_split") {
+ if (!isGFX90A())
+ return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Val,
+ ValRange);
} else if (ID == ".amdhsa_workgroup_processor_mode") {
if (IVersion.Major < 10)
return Error(IDRange.Start, "directive requires gfx10+", IDRange);
@@ -4372,7 +4805,8 @@
unsigned VGPRBlocks;
unsigned SGPRBlocks;
if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
- ReserveXNACK, EnableWavefrontSize32, NextFreeVGPR,
+ getTargetStreamer().getTargetID()->isXnackOnOrAny(),
+ EnableWavefrontSize32, NextFreeVGPR,
VGPRRange, NextFreeSGPR, SGPRRange, VGPRBlocks,
SGPRBlocks))
return true;
@@ -4395,9 +4829,21 @@
AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT,
UserSGPRCount);
+ if (isGFX90A()) {
+ if (Seen.find(".amdhsa_accum_offset") == Seen.end())
+ return TokError(".amdhsa_accum_offset directive is required");
+ if (AccumOffset < 4 || AccumOffset > 256 || (AccumOffset & 3))
+ return TokError("accum_offset should be in range [4..256] in "
+ "increments of 4");
+ if (AccumOffset > alignTo(std::max((uint64_t)1, NextFreeVGPR), 4))
+ return TokError("accum_offset exceeds total VGPR allocation");
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
+ (AccumOffset / 4 - 1));
+ }
+
getTargetStreamer().EmitAmdhsaKernelDescriptor(
getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC,
- ReserveFlatScr, ReserveXNACK);
+ ReserveFlatScr);
return false;
}
@@ -4423,9 +4869,9 @@
// targeted GPU.
if (isToken(AsmToken::EndOfStatement)) {
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
- getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor,
- ISA.Stepping,
- "AMD", "AMDGPU");
+ getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(ISA.Major, ISA.Minor,
+ ISA.Stepping,
+ "AMD", "AMDGPU");
return false;
}
@@ -4450,8 +4896,8 @@
if (!parseString(ArchName, "invalid arch name"))
return true;
- getTargetStreamer().EmitDirectiveHSACodeObjectISA(Major, Minor, Stepping,
- VendorName, ArchName);
+ getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(Major, Minor, Stepping,
+ VendorName, ArchName);
return false;
}
@@ -4560,19 +5006,11 @@
"architectures");
}
- auto ISAVersionStringFromASM = getToken().getStringContents();
+ auto TargetIDDirective = getLexer().getTok().getStringContents();
+ if (getTargetStreamer().getTargetID()->toString() != TargetIDDirective)
+ return Error(getParser().getTok().getLoc(), "target id must match options");
- std::string ISAVersionStringFromSTI;
- raw_string_ostream ISAVersionStreamFromSTI(ISAVersionStringFromSTI);
- IsaInfo::streamIsaVersion(&getSTI(), ISAVersionStreamFromSTI);
-
- if (ISAVersionStringFromASM != ISAVersionStreamFromSTI.str()) {
- return Error(getLoc(),
- ".amd_amdgpu_isa directive does not match triple and/or mcpu "
- "arguments specified through the command line");
- }
-
- getTargetStreamer().EmitISAVersion(ISAVersionStreamFromSTI.str());
+ getTargetStreamer().EmitISAVersion();
Lex();
return false;
@@ -4582,7 +5020,7 @@
const char *AssemblerDirectiveBegin;
const char *AssemblerDirectiveEnd;
std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) =
- isHsaAbiVersion3(&getSTI())
+ isHsaAbiVersion3Or4(&getSTI())
? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin,
HSAMD::V3::AssemblerDirectiveEnd)
: std::make_tuple(HSAMD::AssemblerDirectiveBegin,
@@ -4599,7 +5037,7 @@
HSAMetadataString))
return true;
- if (isHsaAbiVersion3(&getSTI())) {
+ if (isHsaAbiVersion3Or4(&getSTI())) {
if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
return Error(getLoc(), "invalid HSA metadata");
} else {
@@ -4749,12 +5187,9 @@
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
- if (isHsaAbiVersion3(&getSTI())) {
- if (IDVal == ".amdgcn_target")
- return ParseDirectiveAMDGCNTarget();
-
+ if (isHsaAbiVersion3Or4(&getSTI())) {
if (IDVal == ".amdhsa_kernel")
- return ParseDirectiveAMDHSAKernel();
+ return ParseDirectiveAMDHSAKernel();
// TODO: Restructure/combine with PAL metadata directive.
if (IDVal == AMDGPU::HSAMD::V3::AssemblerDirectiveBegin)
@@ -4779,6 +5214,9 @@
return ParseDirectiveHSAMetadata();
}
+ if (IDVal == ".amdgcn_target")
+ return ParseDirectiveAMDGCNTarget();
+
if (IDVal == ".amdgpu_lds")
return ParseDirectiveAMDGPULDS();
@@ -4792,7 +5230,7 @@
}
bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
- unsigned RegNo) const {
+ unsigned RegNo) {
for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true);
R.isValid(); ++R) {
@@ -4824,7 +5262,7 @@
case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
- return (isVI() || isGFX9()) && hasXNACK();
+ return (isVI() || isGFX9()) && getTargetStreamer().getTargetID()->isXnackSupported();
case AMDGPU::SGPR_NULL:
return isGFX10Plus();
default:
@@ -4881,16 +5319,21 @@
unsigned Prefix = Operands.size();
for (;;) {
+ auto Loc = getLoc();
ResTy = parseReg(Operands);
+ if (ResTy == MatchOperand_NoMatch)
+ Error(Loc, "expected a register");
if (ResTy != MatchOperand_Success)
- return ResTy;
+ return MatchOperand_ParseFail;
RBraceLoc = getLoc();
if (trySkipToken(AsmToken::RBrac))
break;
- if (!trySkipToken(AsmToken::Comma))
+ if (!skipToken(AsmToken::Comma,
+ "expected a comma or a closing square bracket")) {
return MatchOperand_ParseFail;
+ }
}
if (Operands.size() - Prefix > 1) {
@@ -4940,11 +5383,9 @@
OperandMode Mode = OperandMode_Default;
if (IsMIMG && isGFX10Plus() && Operands.size() == 2)
Mode = OperandMode_NSA;
+ CPolSeen = 0;
OperandMatchResultTy Res = parseOperand(Operands, Name, Mode);
- // Eat the comma or space if there is one.
- trySkipToken(AsmToken::Comma);
-
if (Res != MatchOperand_Success) {
checkUnsupportedInstruction(Name, NameLoc);
if (!Parser.hasPendingError()) {
@@ -4959,6 +5400,9 @@
}
return true;
}
+
+ // Eat the comma or space if there is one.
+ trySkipToken(AsmToken::Comma);
}
return false;
@@ -5043,39 +5487,27 @@
}
OperandMatchResultTy
-AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
+AMDGPUAsmParser::parseNamedBit(StringRef Name, OperandVector &Operands,
AMDGPUOperand::ImmTy ImmTy) {
- int64_t Bit = 0;
+ int64_t Bit;
SMLoc S = getLoc();
- // We are at the end of the statement, and this is a default argument, so
- // use a default value.
- if (!isToken(AsmToken::EndOfStatement)) {
- switch(getTokenKind()) {
- case AsmToken::Identifier: {
- StringRef Tok = getTokenStr();
- if (Tok == Name) {
- if (Tok == "r128" && !hasMIMG_R128())
- Error(S, "r128 modifier is not supported on this GPU");
- if (Tok == "a16" && !isGFX9() && !hasGFX10A16())
- Error(S, "a16 modifier is not supported on this GPU");
- Bit = 1;
- Parser.Lex();
- } else if (Tok.startswith("no") && Tok.endswith(Name)) {
- Bit = 0;
- Parser.Lex();
- } else {
- return MatchOperand_NoMatch;
- }
- break;
- }
- default:
- return MatchOperand_NoMatch;
- }
+ if (trySkipId(Name)) {
+ Bit = 1;
+ } else if (trySkipId("no", Name)) {
+ Bit = 0;
+ } else {
+ return MatchOperand_NoMatch;
}
- if (!isGFX10Plus() && ImmTy == AMDGPUOperand::ImmTyDLC)
+ if (Name == "r128" && !hasMIMG_R128()) {
+ Error(S, "r128 modifier is not supported on this GPU");
return MatchOperand_ParseFail;
+ }
+ if (Name == "a16" && !isGFX9() && !hasGFX10A16()) {
+ Error(S, "a16 modifier is not supported on this GPU");
+ return MatchOperand_ParseFail;
+ }
if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16)
ImmTy = AMDGPUOperand::ImmTyR128A16;
@@ -5084,6 +5516,62 @@
return MatchOperand_Success;
}
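+// Parse one cache-policy modifier (glc/slc/dlc/scc or its no* form), reject
+// duplicates and modifiers unsupported on this GPU, and fold the bit into the
+// combined cpol operand.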
+OperandMatchResultTy
+AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
+ unsigned CPolOn = 0;
+ unsigned CPolOff = 0;
+ SMLoc S = getLoc();
+
+ if (trySkipId("glc"))
+ CPolOn = AMDGPU::CPol::GLC;
+ else if (trySkipId("noglc"))
+ CPolOff = AMDGPU::CPol::GLC;
+ else if (trySkipId("slc"))
+ CPolOn = AMDGPU::CPol::SLC;
+ else if (trySkipId("noslc"))
+ CPolOff = AMDGPU::CPol::SLC;
+ else if (trySkipId("dlc"))
+ CPolOn = AMDGPU::CPol::DLC;
+ else if (trySkipId("nodlc"))
+ CPolOff = AMDGPU::CPol::DLC;
+ else if (trySkipId("scc"))
+ CPolOn = AMDGPU::CPol::SCC;
+ else if (trySkipId("noscc"))
+ CPolOff = AMDGPU::CPol::SCC;
+ else
+ return MatchOperand_NoMatch;
+
+ if (!isGFX10Plus() && ((CPolOn | CPolOff) & AMDGPU::CPol::DLC)) {
+ Error(S, "dlc modifier is not supported on this GPU");
+ return MatchOperand_ParseFail;
+ }
+
+ if (!isGFX90A() && ((CPolOn | CPolOff) & AMDGPU::CPol::SCC)) {
+ Error(S, "scc modifier is not supported on this GPU");
+ return MatchOperand_ParseFail;
+ }
+
+ if (CPolSeen & (CPolOn | CPolOff)) {
+ Error(S, "duplicate cache policy modifier");
+ return MatchOperand_ParseFail;
+ }
+
+ CPolSeen |= (CPolOn | CPolOff);
+
+ for (unsigned I = 1; I != Operands.size(); ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ if (Op.isCPol()) {
+ Op.setImm((Op.getImm() | CPolOn) & ~CPolOff);
+ return MatchOperand_Success;
+ }
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, CPolOn, S,
+ AMDGPUOperand::ImmTyCPol));
+
+ return MatchOperand_Success;
+}
+
static void addOptionalImmOperand(
MCInst& Inst, const OperandVector& Operands,
AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx,
@@ -5757,7 +6245,7 @@
}
return false;
}
- if (!isValidMsgOp(Msg.Id, Op.Id, Strict)) {
+ if (!isValidMsgOp(Msg.Id, Op.Id, getSTI(), Strict)) {
Error(Op.Loc, "invalid operation id");
return false;
}
@@ -5765,7 +6253,7 @@
Error(Stream.Loc, "message operation does not support streams");
return false;
}
- if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, Strict)) {
+ if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, getSTI(), Strict)) {
Error(Stream.Loc, "invalid message stream id");
return false;
}
@@ -5934,6 +6422,18 @@
}
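+// Skip the current token if it is the identifier Pref immediately followed by
+// Id, e.g. Pref="no", Id="glc" matches "noglc".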
bool
+AMDGPUAsmParser::trySkipId(const StringRef Pref, const StringRef Id) {
+ if (isToken(AsmToken::Identifier)) {
+ StringRef Tok = getTokenStr();
+ if (Tok.startswith(Pref) && Tok.drop_front(Pref.size()) == Id) {
+ lex();
+ return true;
+ }
+ }
+ return false;
+}
+
+bool
AMDGPUAsmParser::trySkipId(const StringRef Id, const AsmToken::TokenKind Kind) {
if (isId(Id) && peekToken().is(Kind)) {
lex();
@@ -6489,32 +6989,38 @@
// mubuf
//===----------------------------------------------------------------------===//
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDLC() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDLC);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC_1() const {
- return AMDGPUOperand::CreateImm(this, -1, SMLoc(), AMDGPUOperand::ImmTyGLC);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCPol() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyCPol);
}
void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
- const OperandVector &Operands,
- bool IsAtomic,
- bool IsAtomicReturn,
- bool IsLds) {
+ const OperandVector &Operands,
+ bool IsAtomic,
+ bool IsLds) {
bool IsLdsOpcode = IsLds;
bool HasLdsModifier = false;
OptionalImmIndexMap OptionalIdx;
- assert(IsAtomicReturn ? IsAtomic : true);
unsigned FirstOperandIdx = 1;
+ bool IsAtomicReturn = false;
+
+ if (IsAtomic) {
+ for (unsigned i = FirstOperandIdx, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ if (!Op.isCPol())
+ continue;
+ IsAtomicReturn = Op.getImm() & AMDGPU::CPol::GLC;
+ break;
+ }
+
+ if (!IsAtomicReturn) {
+ int NewOpc = AMDGPU::getAtomicNoRetOp(Inst.getOpcode());
+ if (NewOpc != -1)
+ Inst.setOpcode(NewOpc);
+ }
+
+ IsAtomicReturn = MII.get(Inst.getOpcode()).TSFlags &
+ SIInstrFlags::IsAtomicRet;
+ }
for (unsigned i = FirstOperandIdx, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
@@ -6565,18 +7071,12 @@
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
- if (!IsAtomic || IsAtomicReturn) {
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC,
- IsAtomicReturn ? -1 : 0);
- }
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
if (!IsLdsOpcode) { // tfe is not legal with lds opcodes
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
}
-
- if (isGFX10Plus())
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ);
}
void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
@@ -6611,12 +7111,9 @@
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyOffset);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyFORMAT);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
-
- if (isGFX10Plus())
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ);
}
//===----------------------------------------------------------------------===//
@@ -6658,14 +7155,12 @@
if (IsGFX10Plus)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
- if (IsGFX10Plus)
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
if (IsGFX10Plus)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyA16);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::tfe) != -1)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
if (!IsGFX10Plus)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
@@ -6676,6 +7171,61 @@
cvtMIMG(Inst, Operands, true);
}
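+// Custom converter for SMEM atomics: the glc bit of the cache policy selects
+// between the returning and non-returning opcode variants.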
+void AMDGPUAsmParser::cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+ bool IsAtomicReturn = false;
+
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ if (!Op.isCPol())
+ continue;
+ IsAtomicReturn = Op.getImm() & AMDGPU::CPol::GLC;
+ break;
+ }
+
+ if (!IsAtomicReturn) {
+ int NewOpc = AMDGPU::getAtomicNoRetOp(Inst.getOpcode());
+ if (NewOpc != -1)
+ Inst.setOpcode(NewOpc);
+ }
+
+ IsAtomicReturn = MII.get(Inst.getOpcode()).TSFlags &
+ SIInstrFlags::IsAtomicRet;
+
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+ // Add the register arguments
+ if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
+ if (IsAtomicReturn && i == 1)
+ Op.addRegOperands(Inst, 1);
+ continue;
+ }
+
+ // Handle the case where soffset is an immediate
+ if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
+ Op.addImmOperands(Inst, 1);
+ continue;
+ }
+
+ // Handle tokens like 'offen' which are sometimes hard-coded into the
+ // asm string. There are no MCInst operands for these.
+ if (Op.isToken()) {
+ continue;
+ }
+ assert(Op.isImm());
+
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = i;
+ }
+
+ if ((int)Inst.getNumOperands() <=
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset))
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
+}
+
void AMDGPUAsmParser::cvtIntersectRay(MCInst &Inst,
const OperandVector &Operands) {
for (unsigned I = 1; I < Operands.size(); ++I) {
@@ -6747,17 +7297,14 @@
return false;
}
+// Both bound_ctrl:0 and bound_ctrl:1 are encoded as 1.
+// This is intentional and ensures compatibility with sp3.
+// See bug 35397 for details.
static bool ConvertBoundCtrl(int64_t &BoundCtrl) {
- if (BoundCtrl == 0) {
+ if (BoundCtrl == 0 || BoundCtrl == 1) {
BoundCtrl = 1;
return true;
}
-
- if (BoundCtrl == -1) {
- BoundCtrl = 0;
- return true;
- }
-
return false;
}
@@ -6772,9 +7319,7 @@
{"lds", AMDGPUOperand::ImmTyLDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
{"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
- {"dlc", AMDGPUOperand::ImmTyDLC, true, nullptr},
- {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
- {"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
+ {"", AMDGPUOperand::ImmTyCPol, false, nullptr},
{"swz", AMDGPUOperand::ImmTySWZ, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
{"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
@@ -6808,6 +7353,18 @@
{"abid", AMDGPUOperand::ImmTyABID, false, nullptr}
};
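+// Initialize the target ID from the subtarget and, for code object V3/V4,
+// emit the .amdgcn_target directive at the start of the file.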
+void AMDGPUAsmParser::onBeginOfFile() {
+ if (!getParser().getStreamer().getTargetStreamer() ||
+ getSTI().getTargetTriple().getArch() == Triple::r600)
+ return;
+
+ if (!getTargetStreamer().getTargetID())
+ getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString());
+
+ if (isHsaAbiVersion3Or4(&getSTI()))
+ getTargetStreamer().EmitDirectiveAMDGCNTarget();
+}
+
OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
OperandMatchResultTy res = parseOptionalOpr(Operands);
@@ -6857,6 +7414,8 @@
Op.ConvertResult);
} else if (Op.Type == AMDGPUOperand::ImmTyDim) {
res = parseDim(Operands);
+ } else if (Op.Type == AMDGPUOperand::ImmTyCPol) {
+ res = parseCPol(Operands);
} else {
res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
}
@@ -7010,6 +7569,7 @@
Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 ||
Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 ||
Opc == AMDGPU::V_MAC_F16_e64_vi ||
+ Opc == AMDGPU::V_FMAC_F64_e64_gfx90a ||
Opc == AMDGPU::V_FMAC_F32_e64_gfx10 ||
Opc == AMDGPU::V_FMAC_F32_e64_vi ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
@@ -7028,16 +7588,13 @@
cvtVOP3(Inst, Operands, OptionalIdx);
}
-void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst,
- const OperandVector &Operands) {
- OptionalImmIndexMap OptIdx;
+void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptIdx) {
const int Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);
const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
- cvtVOP3(Inst, Operands, OptIdx);
-
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) {
assert(!IsPacked);
Inst.addOperand(Inst.getOperand(0));
@@ -7046,7 +7603,10 @@
// FIXME: This is messy. Parse the modifiers as if it was a normal VOP3
// instruction, and then figure out where to actually put the modifiers
- addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel);
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ if (OpSelIdx != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel);
+ }
int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
if (OpSelHiIdx != -1) {
@@ -7057,7 +7617,6 @@
int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo);
if (NegLoIdx != -1) {
- assert(IsPacked);
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo);
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi);
}
@@ -7069,16 +7628,16 @@
AMDGPU::OpName::src1_modifiers,
AMDGPU::OpName::src2_modifiers };
- int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
-
- unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+ unsigned OpSel = 0;
unsigned OpSelHi = 0;
unsigned NegLo = 0;
unsigned NegHi = 0;
- if (OpSelHiIdx != -1) {
+ if (OpSelIdx != -1)
+ OpSel = Inst.getOperand(OpSelIdx).getImm();
+
+ if (OpSelHiIdx != -1)
OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
- }
if (NegLoIdx != -1) {
int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
@@ -7111,6 +7670,12 @@
}
}
+void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptIdx;
+ cvtVOP3(Inst, Operands, OptIdx);
+ cvtVOP3P(Inst, Operands, OptIdx);
+}
+
//===----------------------------------------------------------------------===//
// dpp
//===----------------------------------------------------------------------===//
@@ -7167,6 +7732,39 @@
return isImm() && isUInt<16>(getImm());
}
+//===----------------------------------------------------------------------===//
+// dim
+//===----------------------------------------------------------------------===//
+
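+// Parse a dim value such as "1D" or "SQ_RSRC_IMG_2D" and return its encoding.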
+bool AMDGPUAsmParser::parseDimId(unsigned &Encoding) {
+ // We want to allow "dim:1D" etc.,
+ // but the initial 1 is tokenized as an integer.
+ std::string Token;
+ if (isToken(AsmToken::Integer)) {
+ SMLoc Loc = getToken().getEndLoc();
+ Token = std::string(getTokenStr());
+ lex();
+ if (getLoc() != Loc)
+ return false;
+ }
+
+ StringRef Suffix;
+ if (!parseId(Suffix))
+ return false;
+ Token += Suffix;
+
+ StringRef DimId = Token;
+ if (DimId.startswith("SQ_RSRC_IMG_"))
+ DimId = DimId.drop_front(12);
+
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByAsmSuffix(DimId);
+ if (!DimInfo)
+ return false;
+
+ Encoding = DimInfo->Encoding;
+ return true;
+}
+
OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) {
if (!isGFX10Plus())
return MatchOperand_NoMatch;
@@ -7176,35 +7774,22 @@
if (!trySkipId("dim", AsmToken::Colon))
return MatchOperand_NoMatch;
- // We want to allow "dim:1D" etc., but the initial 1 is tokenized as an
- // integer.
- std::string Token;
- if (isToken(AsmToken::Integer)) {
- SMLoc Loc = getToken().getEndLoc();
- Token = std::string(getTokenStr());
- lex();
- if (getLoc() != Loc)
- return MatchOperand_ParseFail;
+ unsigned Encoding;
+ SMLoc Loc = getLoc();
+ if (!parseDimId(Encoding)) {
+ Error(Loc, "invalid dim value");
+ return MatchOperand_ParseFail;
}
- if (!isToken(AsmToken::Identifier))
- return MatchOperand_ParseFail;
- Token += getTokenStr();
- StringRef DimId = Token;
- if (DimId.startswith("SQ_RSRC_IMG_"))
- DimId = DimId.substr(12);
-
- const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByAsmSuffix(DimId);
- if (!DimInfo)
- return MatchOperand_ParseFail;
-
- lex();
-
- Operands.push_back(AMDGPUOperand::CreateImm(this, DimInfo->Encoding, S,
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Encoding, S,
AMDGPUOperand::ImmTyDim));
return MatchOperand_Success;
}
+//===----------------------------------------------------------------------===//
+// dpp
+//===----------------------------------------------------------------------===//
+
OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) {
SMLoc S = getLoc();
@@ -7245,6 +7830,9 @@
bool
AMDGPUAsmParser::isSupportedDPPCtrl(StringRef Ctrl,
const OperandVector &Operands) {
+ if (Ctrl == "row_newbcast")
+ return isGFX90A();
+
if (Ctrl == "row_share" ||
Ctrl == "row_xmask")
return isGFX10Plus();
@@ -7322,6 +7910,7 @@
.Case("row_ror", {DppCtrl::ROW_ROR0, 1, 15})
.Case("row_share", {DppCtrl::ROW_SHARE_FIRST, 0, 15})
.Case("row_xmask", {DppCtrl::ROW_XMASK_FIRST, 0, 15})
+ .Case("row_newbcast", {DppCtrl::ROW_NEWBCAST_FIRST, 0, 15})
.Default({-1, 0, 0});
bool Valid;
@@ -7400,6 +7989,9 @@
void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) {
OptionalImmIndexMap OptionalIdx;
+ unsigned Opc = Inst.getOpcode();
+ bool HasModifiers =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1;
unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
@@ -7426,7 +8018,8 @@
if (IsDPP8) {
if (Op.isDPP8()) {
Op.addImmOperands(Inst, 1);
- } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ } else if (HasModifiers &&
+ isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
Op.addRegWithFPInputModsOperands(Inst, 2);
} else if (Op.isFI()) {
Fi = Op.getImm();
@@ -7436,8 +8029,11 @@
llvm_unreachable("Invalid operand type");
}
} else {
- if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ if (HasModifiers &&
+ isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
Op.addRegWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
} else if (Op.isDPPCtrl()) {
Op.addImmOperands(Inst, 1);
} else if (Op.isImm()) {
@@ -7691,8 +8287,6 @@
return Operand.isGDS() ? Match_Success : Match_InvalidOperand;
case MCK_lds:
return Operand.isLDS() ? Match_Success : Match_InvalidOperand;
- case MCK_glc:
- return Operand.isGLC() ? Match_Success : Match_InvalidOperand;
case MCK_idxen:
return Operand.isIdxen() ? Match_Success : Match_InvalidOperand;
case MCK_offen:
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 5dc5481..5f43aa8 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -6,17 +6,12 @@
//
//===----------------------------------------------------------------------===//
-def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
-def MUBUFAddr64 : ComplexPattern<i64, 9, "SelectMUBUFAddr64">;
-def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
+def MUBUFAddr64 : ComplexPattern<i64, 4, "SelectMUBUFAddr64">;
+def MUBUFOffset : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>;
def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>;
-def MUBUFOffset : ComplexPattern<i64, 8, "SelectMUBUFOffset">;
-def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
-def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
-
def BUFAddrKind {
int Offset = 0;
int OffEn = 1;
@@ -105,6 +100,8 @@
bits<1> has_slc = 1;
bits<1> has_tfe = 1;
bits<4> elements = 0;
+ bits<1> has_sccb = 1;
+ bits<1> sccb_value = 0;
}
class MTBUF_Real <MTBUF_Pseudo ps> :
@@ -113,6 +110,10 @@
let isPseudo = 0;
let isCodeGenOnly = 0;
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+ let MTBUF = 1;
+
// copy relevant pseudo op flags
let UseNamedOperandTable = ps.UseNamedOperandTable;
let SubtargetPredicate = ps.SubtargetPredicate;
@@ -120,39 +121,47 @@
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let IsAtomicRet = ps.IsAtomicRet;
+ let IsAtomicNoRet = ps.IsAtomicNoRet;
bits<12> offset;
- bits<1> glc;
- bits<1> dlc;
+ bits<5> cpol;
bits<7> format;
bits<8> vaddr;
- bits<8> vdata;
+ bits<10> vdata;
bits<7> srsrc;
- bits<1> slc;
bits<1> tfe;
bits<8> soffset;
bits<4> dfmt = format{3-0};
bits<3> nfmt = format{6-4};
+
+ // GFX90A+ only: instruction uses AccVGPR for data
+ // Bit supersedes tfe.
+ bits<1> acc = !if(ps.has_vdata, vdata{9}, 0);
}
class getMTBUFInsDA<list<RegisterClass> vdataList,
list<RegisterClass> vaddrList=[]> {
RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz),
+ offset:$offset, FORMAT:$format, CPol:$cpol, TFE:$tfe, SWZ:$swz),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz)
+ offset:$offset, FORMAT:$format, CPol:$cpol, TFE:$tfe, SWZ:$swz)
);
dag InsData = !if(!empty(vaddrList),
- (ins vdataClass:$vdata, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
- SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz),
- (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
- SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz)
+ (ins vdata_op:$vdata, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, CPol:$cpol,
+ TFE:$tfe, SWZ:$swz),
+ (ins vdata_op:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, CPol:$cpol,
+ TFE:$tfe, SWZ:$swz)
);
dag ret = !if(!empty(vdataList), InsNoData, InsData);
}
@@ -202,9 +211,9 @@
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MTBUF_Pseudo<opName,
- (outs vdataClass:$vdata),
+ (outs getLdStRegisterOperand<vdataClass>.ret:$vdata),
getMTBUFIns<addrKindCopy>.ret,
- " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$cpol$tfe$swz",
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -217,17 +226,11 @@
int elems, ValueType load_vt = i32,
SDPatternOperator ld = null_frag> {
- def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems,
- [(set load_vt:$vdata,
- (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format,
- i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>,
- MTBUFAddr64Table<0, NAME>;
+ def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>,
+ MTBUFAddr64Table<0, NAME>;
- def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems,
- [(set load_vt:$vdata,
- (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
- i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>,
- MTBUFAddr64Table<1, NAME>;
+ def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems>,
+ MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
@@ -252,7 +255,7 @@
: MTBUF_Pseudo<opName,
(outs),
getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
- " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$cpol$tfe$swz",
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -265,16 +268,10 @@
int elems, ValueType store_vt = i32,
SDPatternOperator st = null_frag> {
- def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems,
- [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i8:$format, i1:$glc,
- i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
+ def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>,
MTBUFAddr64Table<0, NAME>;
- def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems,
- [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i8:$format, i1:$glc,
- i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
+ def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems>,
MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
@@ -341,6 +338,9 @@
bits<1> has_slc = 1;
bits<1> has_tfe = 1;
bits<4> elements = 0;
+ bits<1> has_sccb = 1;
+ bits<1> sccb_value = 0;
+ bits<1> IsBufferInv = 0;
}
class MUBUF_Real <MUBUF_Pseudo ps> :
@@ -349,6 +349,10 @@
let isPseudo = 0;
let isCodeGenOnly = 0;
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+ let MUBUF = 1;
+
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
@@ -357,16 +361,23 @@
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let IsAtomicRet = ps.IsAtomicRet;
+ let IsAtomicNoRet = ps.IsAtomicNoRet;
bits<12> offset;
- bits<1> glc;
- bits<1> dlc;
+ bits<5> cpol;
bits<8> vaddr;
- bits<8> vdata;
+ bits<10> vdata;
bits<7> srsrc;
- bits<1> slc;
bits<1> tfe;
bits<8> soffset;
+
+ // GFX90A+ only: instruction uses AccVGPR for data
+ // Bit supersedes tfe.
+ bits<1> acc = !if(ps.has_vdata, vdata{9}, 0);
}
@@ -380,7 +391,8 @@
let mayLoad = 0;
let mayStore = 0;
- // Set everything to 0.
+ let IsBufferInv = 1;
+ // Set everything else to 0.
let offen = 0;
let idxen = 0;
let addr64 = 0;
@@ -395,6 +407,8 @@
let has_offset = 0;
let has_slc = 0;
let has_tfe = 0;
+ let has_sccb = 0;
+ let sccb_value = 0;
}
class getMUBUFInsDA<list<RegisterClass> vdataList,
@@ -402,33 +416,31 @@
bit isLds = 0> {
RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, GLC:$glc, SLC:$slc),
+ offset:$offset, CPol_0:$cpol),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, GLC:$glc, SLC:$slc)
+ offset:$offset, CPol_0:$cpol)
);
dag InsData = !if(!empty(vaddrList),
- (ins vdataClass:$vdata, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc),
- (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc)
+ (ins vdata_op:$vdata, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol),
+ (ins vdata_op:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol)
);
dag ret = !con(
!if(!empty(vdataList), InsNoData, InsData),
- !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc,SWZ:$swz))
+ !if(isLds, (ins SWZ_0:$swz), (ins TFE_0:$tfe, SWZ_0:$swz))
);
}
class getMUBUFElements<ValueType vt> {
- // eq does not support ValueType for some reason.
- string vtAsStr = !cast<string>(vt);
-
int ret =
- !if(!eq(vtAsStr, "f16"), 1,
- !if(!eq(vtAsStr, "v2f16"), 2,
- !if(!eq(vtAsStr, "v3f16"), 3,
- !if(!eq(vtAsStr, "v4f16"), 4,
+ !if(!eq(vt, f16), 1,
+ !if(!eq(vt, v2f16), 2,
+ !if(!eq(vt, v3f16), 3,
+ !if(!eq(vt, v4f16), 4,
!if(!eq(vt.Size, 32), 1,
!if(!eq(vt.Size, 64), 2,
!if(!eq(vt.Size, 96), 3,
@@ -482,13 +494,15 @@
bit isLds = 0,
list<dag> pattern=[],
// Workaround bug bz30254
- int addrKindCopy = addrKind>
+ int addrKindCopy = addrKind,
+ RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret,
+ RegisterOperand vdata_op = getLdStRegisterOperand<vdata_rc>.ret>
: MUBUF_Pseudo<opName,
- (outs getVregSrcForVT<vdata_vt>.ret:$vdata),
+ (outs vdata_op:$vdata),
!con(getMUBUFIns<addrKindCopy, [], isLds>.ret,
- !if(HasTiedDest, (ins getVregSrcForVT<vdata_vt>.ret:$vdata_in), (ins))),
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" #
- !if(isLds, " lds", "$tfe") # "$dlc$swz",
+ !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))),
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol" #
+ !if(isLds, " lds", "$tfe") # "$swz",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # !if(isLds, "_lds", "") #
@@ -506,15 +520,15 @@
}
class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat <
- (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
- (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))
+ (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset))),
+ (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset))
>;
class MUBUF_Addr64_Load_Pat <Instruction inst,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag> : Pat <
- (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
- (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))
+ (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))),
+ (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset))
>;
multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> {
@@ -531,7 +545,7 @@
bit TiedDest = 0,
bit isLds = 0> {
- defvar legal_load_vt = !if(!eq(!cast<string>(load_vt), !cast<string>(v3f16)), v4f16, load_vt);
+ defvar legal_load_vt = !if(!eq(load_vt, v3f16), v4f16, load_vt);
def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds>,
MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;
@@ -567,7 +581,7 @@
: MUBUF_Pseudo<opName,
(outs),
getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret]>.ret,
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol$tfe$swz",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -581,16 +595,16 @@
ValueType store_vt = i32,
SDPatternOperator st = null_frag> {
- defvar legal_store_vt = !if(!eq(!cast<string>(store_vt), !cast<string>(v3f16)), v4f16, store_vt);
+ defvar legal_store_vt = !if(!eq(store_vt, v3f16), v4f16, store_vt);
def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt,
[(st legal_store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
+ i16:$offset))]>,
MUBUFAddr64Table<0, NAME>;
def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, legal_store_vt,
[(st legal_store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
+ i16:$offset))]>,
MUBUFAddr64Table<1, NAME>;
def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt>;
@@ -608,8 +622,8 @@
class MUBUF_Pseudo_Store_Lds<string opName>
: MUBUF_Pseudo<opName,
(outs),
- (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz),
- " $srsrc, $soffset$offset lds$glc$slc$swz"> {
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol:$cpol, SWZ:$swz),
+ " $srsrc, $soffset$offset lds$cpol$swz"> {
let mayLoad = 0;
let mayStore = 1;
let maybeAtomic = 1;
@@ -626,18 +640,19 @@
class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
list<RegisterClass> vaddrList=[]> {
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
dag ret = !if(vdata_in,
!if(!empty(vaddrList),
- (ins vdataClass:$vdata_in,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC_1:$glc1, SLC:$slc),
- (ins vdataClass:$vdata_in, vaddrClass:$vaddr,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC_1:$glc1, SLC:$slc)
+ (ins vdata_op:$vdata_in,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_GLC1:$cpol),
+ (ins vdata_op:$vdata_in, vaddrClass:$vaddr,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_GLC1:$cpol)
),
!if(!empty(vaddrList),
- (ins vdataClass:$vdata,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc),
- (ins vdataClass:$vdata, vaddrClass:$vaddr,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc)
+ (ins vdata_op:$vdata,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol),
+ (ins vdata_op:$vdata, vaddrClass:$vaddr,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol)
));
}
@@ -678,7 +693,9 @@
let has_glc = 0;
let has_dlc = 0;
let has_tfe = 0;
+ let has_sccb = 1;
let maybeAtomic = 1;
+ let AsmMatchConverter = "cvtMubufAtomic";
}
class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
@@ -690,13 +707,14 @@
: MUBUF_Atomic_Pseudo<opName, addrKindCopy,
(outs),
getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0>.ret,
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$slc",
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol",
pattern>,
AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 0> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let glc_value = 0;
let dlc_value = 0;
- let AsmMatchConverter = "cvtMubufAtomic";
+ let sccb_value = 0;
+ let IsAtomicNoRet = 1;
}
class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
@@ -704,19 +722,21 @@
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind,
- RegisterClass vdataClassCopy = vdataClass>
+ RegisterClass vdataClassCopy = vdataClass,
+ RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret>
: MUBUF_Atomic_Pseudo<opName, addrKindCopy,
- (outs vdataClassCopy:$vdata),
+ (outs vdata_op:$vdata),
getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1>.ret,
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc1$slc",
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol",
pattern>,
AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> {
let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret;
let glc_value = 1;
let dlc_value = 0;
+ let sccb_value = 0;
+ let IsAtomicRet = 1;
let Constraints = "$vdata = $vdata_in";
let DisableEncoding = "$vdata_in";
- let AsmMatchConverter = "cvtMubufAtomicReturn";
}
multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
@@ -751,15 +771,15 @@
let FPAtomic = isFP in
def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(set vdataType:$vdata,
- (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
+ (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset),
vdataType:$vdata_in))]>,
MUBUFAddr64Table <0, NAME # "_RTN">;
let FPAtomic = isFP in
def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set vdataType:$vdata,
- (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc),
- vdataType:$vdata_in))]>,
+ (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset),
+ vdataType:$vdata_in))]>,
MUBUFAddr64Table <1, NAME # "_RTN">;
let FPAtomic = isFP in
@@ -1106,6 +1126,15 @@
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_noret_32
>;
+
+let OtherPredicates = [isGFX90APlus] in {
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN <
+ "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_32
+>;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
+ "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_32
+>;
+}
} // End SubtargetPredicate = HasAtomicFaddInsts
//===----------------------------------------------------------------------===//
@@ -1154,6 +1183,17 @@
} // End let SubtargetPredicate = isGFX7Plus
+let SubtargetPredicate = isGFX90APlus in {
+ def BUFFER_WBL2 : MUBUF_Invalidate<"buffer_wbl2"> {
+ }
+ def BUFFER_INVL2 : MUBUF_Invalidate<"buffer_invl2"> {
+ }
+
+ defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
+ defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
+ defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+} // End SubtargetPredicate = isGFX90APlus
+
let SubtargetPredicate = isGFX10Plus in {
def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">;
def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">;
@@ -1169,30 +1209,27 @@
multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
- defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mubuf_intrinsic_load<name, memoryVt>);
+ defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_load<name, memoryVt>);
def : GCNPat<
(vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
(vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
(vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1201,8 +1238,7 @@
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
}
@@ -1255,32 +1291,27 @@
multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
- defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mubuf_intrinsic_store<name, memoryVt>);
+ defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_store<name, memoryVt>);
def : GCNPat<
(st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
(st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_glc $auxiliary),
- (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (as_i16timm $offset), (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
(st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_glc $auxiliary),
- (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (as_i16timm $offset), (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1289,9 +1320,8 @@
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
getVregSrcForVT<vt>.ret:$vdata,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary),
- (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_cpol $auxiliary),
+ 0, (extract_swz $auxiliary))
>;
}
@@ -1351,7 +1381,7 @@
timm:$offset, timm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN)
getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_slc $cachepolicy))
+ (as_i16timm $offset), (set_glc $cachepolicy))
>;
def : GCNPat<
@@ -1359,7 +1389,7 @@
timm:$offset, timm:$cachepolicy, timm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_slc $cachepolicy))
+ (as_i16timm $offset), (set_glc $cachepolicy))
>;
def : GCNPat<
@@ -1367,7 +1397,7 @@
i32:$soffset, timm:$offset, timm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_slc $cachepolicy))
+ (as_i16timm $offset), (set_glc $cachepolicy))
>;
def : GCNPat<
@@ -1377,7 +1407,7 @@
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_slc $cachepolicy))
+ (set_glc $cachepolicy))
>;
}
@@ -1425,7 +1455,7 @@
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_slc $cachepolicy))
+ (as_i16timm $offset), $cachepolicy)
>;
def : GCNPat<
@@ -1433,7 +1463,7 @@
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_slc $cachepolicy))
+ (as_i16timm $offset), $cachepolicy)
>;
def : GCNPat<
@@ -1441,7 +1471,7 @@
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_slc $cachepolicy))
+ (as_i16timm $offset), $cachepolicy)
>;
def : GCNPat<
@@ -1451,7 +1481,7 @@
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy))
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), $cachepolicy)
>;
}
@@ -1460,15 +1490,24 @@
defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
}
+let SubtargetPredicate = isGFX90APlus in {
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">;
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
+
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, f64, "BUFFER_ATOMIC_ADD_F64">;
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f64, "BUFFER_ATOMIC_MIN_F64">;
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f64, "BUFFER_ATOMIC_MAX_F64">;
+} // End SubtargetPredicate = isGFX90APlus
+
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset,
timm:$offset, timm:$cachepolicy, 0),
- (EXTRACT_SUBREG
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
(BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
(REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_slc $cachepolicy)), sub0)
+ (set_glc $cachepolicy)), VReg_64)), sub0)
>;
def : GCNPat<
@@ -1476,10 +1515,11 @@
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
- (EXTRACT_SUBREG
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
(BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
(REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
+ VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (set_glc $cachepolicy)), VReg_64)),
sub0)
>;
@@ -1488,10 +1528,11 @@
i32:$data, i32:$cmp, v4i32:$rsrc, 0,
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
- (EXTRACT_SUBREG
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
(BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
(REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
+ VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (set_glc $cachepolicy)), VReg_64)),
sub0)
>;
@@ -1500,32 +1541,32 @@
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
- (EXTRACT_SUBREG
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
(BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
(REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (set_glc $cachepolicy)), VReg_64)),
sub0)
>;
class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
PatFrag constant_ld> : GCNPat <
(vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
+ i16:$offset))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset)
>;
multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_ld> {
def : GCNPat <
- (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$slc))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0)
+ (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset)
>;
def : GCNPat <
- (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
- (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0)
+ (vt (atomic_ld (MUBUFOffset v4i32:$rsrc, i32:$soffset, i16:$offset))),
+ (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset))
>;
}
@@ -1545,9 +1586,8 @@
PatFrag ld> {
def : GCNPat <
- (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
- (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
+ (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset))),
+ (Instr_OFFSET $srsrc, $soffset, $offset)
>;
}
@@ -1570,12 +1610,12 @@
def : GCNPat <
(vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset))),
- (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
>;
def : GCNPat <
(vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
- (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0)
>;
}
@@ -1585,12 +1625,12 @@
ValueType vt, PatFrag ld_frag> {
def : GCNPat <
(ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in),
- (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in)
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, $in)
>;
def : GCNPat <
(ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in),
- (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in)
+ (InstrOffset $srsrc, $soffset, $offset, $in)
>;
}
@@ -1635,14 +1675,13 @@
ValueType vt, PatFrag atomic_st> {
// Store follows atomic op convention so address is first
def : GCNPat <
- (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$slc), vt:$val),
- (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0)
+ (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset), vt:$val),
+ (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset)
>;
def : GCNPat <
- (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
- (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0)
+ (atomic_st (MUBUFOffset v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
+ (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset))
>;
}
let SubtargetPredicate = isGFX6GFX7 in {
@@ -1655,9 +1694,8 @@
PatFrag st> {
def : GCNPat <
- (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)),
- (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
+ (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset)),
+ (Instr_OFFSET $vdata, $srsrc, $soffset, $offset)
>;
}
@@ -1671,13 +1709,13 @@
def : GCNPat <
(st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset)),
- (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
+ (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
>;
def : GCNPat <
(st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
u16imm:$offset)),
- (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
+ (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0)
>;
}
@@ -1716,15 +1754,14 @@
multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
- defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mtbuf_intrinsic_load<name, memoryVt>);
+ defvar st = !if(!eq(memoryVt, vt), name, mtbuf_intrinsic_load<name, memoryVt>);
def : GCNPat<
(vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1732,8 +1769,7 @@
timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1741,8 +1777,7 @@
timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1752,8 +1787,7 @@
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
}
@@ -1784,15 +1818,14 @@
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
- defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mtbuf_intrinsic_store<name, memoryVt>);
+ defvar st = !if(!eq(memoryVt, vt), name, mtbuf_intrinsic_store<name, memoryVt>);
def : GCNPat<
(st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1800,8 +1833,7 @@
timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1809,8 +1841,7 @@
timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1820,8 +1851,7 @@
getVregSrcForVT<vt>.ret:$vdata,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
}
@@ -1863,21 +1893,21 @@
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
- let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
let Inst{16} = ps.lds;
let Inst{24-18} = op;
let Inst{31-26} = 0x38;
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
- let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{55} = !if(ps.has_tfe, tfe, ?);
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
class MUBUF_Real_gfx10<bits<8> op, MUBUF_Pseudo ps> :
Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10> {
- let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value);
+ let Inst{15} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value);
let Inst{25} = op{7};
}
@@ -1891,13 +1921,6 @@
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
- multiclass MUBUF_Real_gfx10_with_name<bits<8> op, string opName,
- string asmName> {
- def _gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(opName)> {
- MUBUF_Pseudo ps = !cast<MUBUF_Pseudo>(opName);
- let AsmString = asmName # ps.AsmOperands;
- }
- }
multiclass MUBUF_Real_AllAddr_gfx10<bits<8> op> {
def _BOTHEN_gfx10 :
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
@@ -1929,16 +1952,33 @@
}
multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> {
def _BOTHEN_RTN_gfx10 :
- MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>,
+ AtomicNoRet<NAME # "_BOTHEN_gfx10", 1>;
def _IDXEN_RTN_gfx10 :
- MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>,
+ AtomicNoRet<NAME # "_IDXEN_gfx10", 1>;
def _OFFEN_RTN_gfx10 :
- MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>,
+ AtomicNoRet<NAME # "_OFFEN_gfx10", 1>;
def _OFFSET_RTN_gfx10 :
- MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>,
+ AtomicNoRet<NAME # "_OFFSET_gfx10", 1>;
}
multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> :
- MUBUF_Real_AllAddr_gfx10<op>, MUBUF_Real_Atomics_RTN_gfx10<op>;
+ MUBUF_Real_Atomics_RTN_gfx10<op> {
+ def _BOTHEN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ AtomicNoRet<NAME # "_BOTHEN_gfx10", 0>;
+ def _IDXEN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ AtomicNoRet<NAME # "_IDXEN_gfx10", 0>;
+ def _OFFEN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ AtomicNoRet<NAME # "_OFFEN_gfx10", 0>;
+ def _OFFSET_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ AtomicNoRet<NAME # "_OFFSET_gfx10", 0>;
+ }
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>;
@@ -2018,18 +2058,38 @@
def _LDS_BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
MUBUFLdsTable<1, NAME # "_BOTHEN_gfx6_gfx7">;
}
- multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> :
- MUBUF_Real_AllAddr_gfx6_gfx7<op> {
+ multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> {
+ def _ADDR64_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>,
+ AtomicNoRet<NAME # "_ADDR64_gfx6_gfx7", 0>;
+ def _BOTHEN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ AtomicNoRet<NAME # "_BOTHEN_gfx6_gfx7", 0>;
+ def _IDXEN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ AtomicNoRet<NAME # "_IDXEN_gfx6_gfx7", 0>;
+ def _OFFEN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ AtomicNoRet<NAME # "_OFFEN_gfx6_gfx7", 0>;
+ def _OFFSET_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ AtomicNoRet<NAME # "_OFFSET_gfx6_gfx7", 0>;
+
def _ADDR64_RTN_gfx6_gfx7 :
- MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>;
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>,
+ AtomicNoRet<NAME # "_ADDR64_gfx6_gfx7", 1>;
def _BOTHEN_RTN_gfx6_gfx7 :
- MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>,
+ AtomicNoRet<NAME # "_BOTHEN_gfx6_gfx7", 1>;
def _IDXEN_RTN_gfx6_gfx7 :
- MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>,
+ AtomicNoRet<NAME # "_IDXEN_gfx6_gfx7", 1>;
def _OFFEN_RTN_gfx6_gfx7 :
- MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>,
+ AtomicNoRet<NAME # "_OFFEN_gfx6_gfx7", 1>;
def _OFFSET_RTN_gfx6_gfx7 :
- MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>,
+ AtomicNoRet<NAME # "_OFFSET_gfx6_gfx7", 1>;
}
} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
@@ -2118,13 +2178,13 @@
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
- let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
let Inst{18-16} = op;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
- let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{55} = !if(ps.has_tfe, tfe, ?);
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
@@ -2135,7 +2195,7 @@
class MTBUF_Real_gfx10<bits<4> op, MTBUF_Pseudo ps> :
Base_MTBUF_Real_gfx6_gfx7_gfx10<op{2-0}, ps, SIEncodingFamily.GFX10> {
- let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value);
+ let Inst{15} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value);
let Inst{25-19} = format;
let Inst{53} = op{3};
}
@@ -2204,33 +2264,58 @@
// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
-class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
+class MUBUF_Real_Base_vi <bits<7> op, MUBUF_Pseudo ps, int Enc,
+ bit has_sccb = ps.has_sccb> :
MUBUF_Real<ps>,
Enc64,
- SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
- let AssemblerPredicate = isGFX8GFX9;
- let DecoderNamespace = "GFX8";
+ SIMCInstr<ps.PseudoInstr, Enc>,
+ AtomicNoRet<!subst("_RTN","",NAME), !if(ps.IsAtomicNoRet, 0,
+ !if(ps.IsAtomicRet, 1, ?))> {
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
- let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
+ let Inst{15} = !if(has_sccb, cpol{CPolBit.SCC}, ps.sccb_value);
let Inst{16} = ps.lds;
- let Inst{17} = !if(ps.has_slc, slc, ?);
+ let Inst{17} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
- let Inst{55} = !if(ps.has_tfe, tfe, ?);
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
+class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps, bit has_sccb = ps.has_sccb> :
+ MUBUF_Real_Base_vi<op, ps, SIEncodingFamily.VI, has_sccb> {
+ let AssemblerPredicate = isGFX8GFX9NotGFX90A;
+ let DecoderNamespace = "GFX8";
+
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+}
+
+class MUBUF_Real_gfx90a <bits<7> op, MUBUF_Pseudo ps,
+ bit has_sccb = ps.has_sccb> :
+ MUBUF_Real_Base_vi<op, ps, SIEncodingFamily.GFX90A, has_sccb> {
+ let AssemblerPredicate = isGFX90APlus;
+ let DecoderNamespace = "GFX90A";
+ let AsmString = ps.Mnemonic # !subst("$sccb", !if(has_sccb, "$sccb",""),
+ !subst("$tfe", "", ps.AsmOperands));
+
+ let Inst{55} = acc;
+}
+
+multiclass MUBUF_Real_vi_gfx90a<bits<7> op, MUBUF_Pseudo ps> {
+ def _vi : MUBUF_Real_vi<op, ps>;
+ def _gfx90a : MUBUF_Real_gfx90a<op, ps, !and(ps.has_sccb,!not(ps.FPAtomic))>;
+}
+
multiclass MUBUF_Real_AllAddr_vi<bits<7> op> {
- def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
- def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
- def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
- def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+ defm _OFFSET : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ defm _OFFEN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ defm _IDXEN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ defm _BOTHEN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
}
multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> {
@@ -2252,6 +2337,24 @@
MUBUFLdsTable<1, NAME # "_IDXEN_vi">;
def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
MUBUFLdsTable<1, NAME # "_BOTHEN_vi">;
+
+ def _OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ MUBUFLdsTable<0, NAME # "_OFFSET_gfx90a">;
+ def _OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ MUBUFLdsTable<0, NAME # "_OFFEN_gfx90a">;
+ def _IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ MUBUFLdsTable<0, NAME # "_IDXEN_gfx90a">;
+ def _BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ MUBUFLdsTable<0, NAME # "_BOTHEN_gfx90a">;
+
+ def _LDS_OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+ MUBUFLdsTable<1, NAME # "_OFFSET_gfx90a">;
+ def _LDS_OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+ MUBUFLdsTable<1, NAME # "_OFFEN_gfx90a">;
+ def _LDS_IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+ MUBUFLdsTable<1, NAME # "_IDXEN_gfx90a">;
+ def _LDS_BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+ MUBUFLdsTable<1, NAME # "_BOTHEN_gfx90a">;
}
class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
@@ -2264,13 +2367,13 @@
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
- let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
let Inst{16} = ps.lds;
- let Inst{17} = !if(ps.has_slc, slc, ?);
+ let Inst{17} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
let Inst{55} = !if(ps.has_tfe, tfe, ?);
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
@@ -2285,10 +2388,10 @@
multiclass MUBUF_Real_Atomic_vi<bits<7> op> :
MUBUF_Real_AllAddr_vi<op> {
- def _OFFSET_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
- def _OFFEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
- def _IDXEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
- def _BOTHEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
+ defm _OFFSET_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+ defm _OFFEN_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+ defm _IDXEN_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+ defm _BOTHEN_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
}
defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_vi <0x00>;
@@ -2374,46 +2477,79 @@
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_vi <0x6b>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_vi <0x6c>;
-def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>;
+defm BUFFER_STORE_LDS_DWORD : MUBUF_Real_vi_gfx90a <0x3d, BUFFER_STORE_LDS_DWORD>;
+let AssemblerPredicate = isGFX8GFX9 in {
def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
+} // End AssemblerPredicate = isGFX8GFX9
let SubtargetPredicate = HasAtomicFaddInsts in {
-defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_AllAddr_vi <0x4d>;
-defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_AllAddr_vi <0x4e>;
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>;
} // End SubtargetPredicate = HasAtomicFaddInsts
-class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
+let SubtargetPredicate = isGFX90APlus in {
+ defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>;
+ defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_vi<0x50>;
+ defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_vi<0x51>;
+} // End SubtargetPredicate = isGFX90APlus
+
+def BUFFER_WBL2_gfx90a : MUBUF_Real_gfx90a<0x28, BUFFER_WBL2> {
+}
+def BUFFER_INVL2_gfx90a : MUBUF_Real_gfx90a<0x29, BUFFER_INVL2>;
+
+class MTBUF_Real_Base_vi <bits<4> op, MTBUF_Pseudo ps, int Enc> :
MTBUF_Real<ps>,
Enc64,
- SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
- let AssemblerPredicate = isGFX8GFX9;
- let DecoderNamespace = "GFX8";
+ SIMCInstr<ps.PseudoInstr, Enc> {
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
- let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
let Inst{18-15} = op;
let Inst{22-19} = dfmt;
let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
- let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{53} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccb_value);
+ let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{55} = !if(ps.has_tfe, tfe, ?);
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
+class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
+ MTBUF_Real_Base_vi <op, ps, SIEncodingFamily.VI> {
+ let AssemblerPredicate = isGFX8GFX9NotGFX90A;
+ let DecoderNamespace = "GFX8";
+
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+}
+
+class MTBUF_Real_gfx90a <bits<4> op, MTBUF_Pseudo ps> :
+ MTBUF_Real_Base_vi <op, ps, SIEncodingFamily.GFX90A> {
+ let AssemblerPredicate = isGFX90APlus;
+ let DecoderNamespace = "GFX90A";
+ let AsmString = ps.Mnemonic # !subst("$tfe", "", ps.AsmOperands);
+
+ let Inst{55} = acc;
+}
+
+multiclass MTBUF_Real_vi_gfx90a<bits<4> op, MTBUF_Pseudo ps> {
+ def _vi : MTBUF_Real_vi<op, ps>;
+ def _gfx90a : MTBUF_Real_gfx90a<op, ps>;
+}
+
multiclass MTBUF_Real_AllAddr_vi<bits<4> op> {
- def _OFFSET_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
- def _OFFEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
- def _IDXEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
- def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+ defm _OFFSET : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ defm _OFFEN : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ defm _IDXEN : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ defm _BOTHEN : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
}
class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> :
@@ -2426,15 +2562,15 @@
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
- let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
let Inst{18-15} = op;
let Inst{22-19} = dfmt;
let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
- let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{55} = !if(ps.has_tfe, tfe, ?);
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
@@ -2478,7 +2614,10 @@
def MUBUFInfoTable : GenericTable {
let FilterClass = "MUBUF_Pseudo";
let CppTypeName = "MUBUFInfo";
- let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"];
+ let Fields = [
+ "Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset",
+ "IsBufferInv"
+ ];
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getMUBUFOpcodeHelper";
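Editor's note on the BUFInstructions.td hunks above: the separate glc/slc/dlc (and, for atomics, glc1) cache-policy operands are folded into one 5-bit $cpol operand, and the encodings now index into it via CPolBit (e.g. Inst{14} = cpol{CPolBit.GLC}, Inst{54} = cpol{CPolBit.SLC}). Below is a minimal C++ sketch of that bit-packing idea, not LLVM code; the bit indices and helper names are assumptions chosen only to keep the example self-contained, since the real CPolBit values are defined elsewhere in the target and not shown in this patch excerpt.

#include <cassert>
#include <cstdint>

namespace cpol_sketch {

// Assumed bit indices for the 5-bit cpol field; illustration only, not the
// target's actual CPolBit definition.
enum CPolBitIdx : unsigned { GLC = 0, SLC = 1, DLC = 2, SCC = 4 };

// Pack the former standalone flags into a single cpol value.
constexpr uint8_t pack(bool glc, bool slc, bool dlc, bool scc) {
  return static_cast<uint8_t>((unsigned(glc) << GLC) | (unsigned(slc) << SLC) |
                              (unsigned(dlc) << DLC) | (unsigned(scc) << SCC));
}

// Read one policy bit back out, mirroring cpol{CPolBit.X} in the encodings.
constexpr bool test(uint8_t cpol, CPolBitIdx bit) { return (cpol >> bit) & 1u; }

} // namespace cpol_sketch

int main() {
  using namespace cpol_sketch;
  // An atomic-with-return pseudo uses CPol_GLC1 above, i.e. GLC preset to 1.
  uint8_t cpol = pack(/*glc=*/true, /*slc=*/false, /*dlc=*/false, /*scc=*/false);
  assert(test(cpol, GLC) && !test(cpol, SLC) && !test(cpol, DLC));
  return 0;
}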
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 71f2026..a696834 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -44,6 +44,7 @@
AMDGPUAliasAnalysis.cpp
AMDGPUAlwaysInlinePass.cpp
AMDGPUAnnotateKernelFeatures.cpp
+ AMDGPUAttributor.cpp
AMDGPUAnnotateUniformValues.cpp
AMDGPUArgumentUsageInfo.cpp
AMDGPUAsmPrinter.cpp
@@ -67,6 +68,7 @@
AMDGPULowerIntrinsics.cpp
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
+ AMDGPULowerModuleLDSPass.cpp
AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
@@ -80,6 +82,7 @@
AMDGPUPropagateAttributes.cpp
AMDGPURegBankCombiner.cpp
AMDGPURegisterBankInfo.cpp
+ AMDGPUReplaceLDSUseWithPointer.cpp
AMDGPURewriteOutArguments.cpp
AMDGPUSubtarget.cpp
AMDGPUTargetMachine.cpp
@@ -90,6 +93,7 @@
AMDGPUPerfHintAnalysis.cpp
AMDILCFGStructurizer.cpp
AMDGPUPrintfRuntimeBinding.cpp
+ AMDGPUResourceUsageAnalysis.cpp
GCNHazardRecognizer.cpp
GCNIterativeScheduler.cpp
GCNMinRegStrategy.cpp
@@ -109,7 +113,6 @@
R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp
R600RegisterInfo.cpp
- SIAddIMGInit.cpp
SIAnnotateControlFlow.cpp
SIFixSGPRCopies.cpp
SIFixVGPRCopies.cpp
@@ -118,7 +121,7 @@
SIFormMemoryClauses.cpp
SIFrameLowering.cpp
SIInsertHardClauses.cpp
- SIInsertSkips.cpp
+ SILateBranchLowering.cpp
SIInsertWaitcnts.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
@@ -131,18 +134,18 @@
SIMemoryLegalizer.cpp
SIOptimizeExecMasking.cpp
SIOptimizeExecMaskingPreRA.cpp
+ SIOptimizeVGPRLiveRange.cpp
SIPeepholeSDWA.cpp
SIPostRABundler.cpp
SIPreEmitPeephole.cpp
SIProgramInfo.cpp
SIRegisterInfo.cpp
- SIRemoveShortExecBranches.cpp
SIShrinkInstructions.cpp
SIWholeQuadMode.cpp
GCNILPSched.cpp
- GCNRegBankReassign.cpp
GCNNSAReassign.cpp
GCNDPPCombine.cpp
+ GCNPreRAOptimizations.cpp
SIModeRegister.cpp
LINK_COMPONENTS
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
index 328c810..ad9528e 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -52,32 +52,41 @@
let Uses = !if(has_m0_read, [M0, EXEC], [EXEC]);
}
-class DS_Real <DS_Pseudo ds> :
- InstSI <ds.OutOperandList, ds.InOperandList, ds.Mnemonic # ds.AsmOperands, []>,
+class DS_Real <DS_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
Enc64 {
let isPseudo = 0;
let isCodeGenOnly = 0;
+ let LGKM_CNT = 1;
let DS = 1;
let UseNamedOperandTable = 1;
// copy relevant pseudo op flags
- let SubtargetPredicate = ds.SubtargetPredicate;
- let OtherPredicates = ds.OtherPredicates;
- let AsmMatchConverter = ds.AsmMatchConverter;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let OtherPredicates = ps.OtherPredicates;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let IsAtomicRet = ps.IsAtomicRet;
+ let IsAtomicNoRet = ps.IsAtomicNoRet;
// encoding fields
- bits<8> vdst;
+ bits<10> vdst;
bits<1> gds;
bits<8> addr;
- bits<8> data0;
- bits<8> data1;
+ bits<10> data0;
+ bits<10> data1;
bits<8> offset0;
bits<8> offset1;
bits<16> offset;
- let offset0 = !if(ds.has_offset, offset{7-0}, ?);
- let offset1 = !if(ds.has_offset, offset{15-8}, ?);
+ let offset0 = !if(ps.has_offset, offset{7-0}, ?);
+ let offset1 = !if(ps.has_offset, offset{15-8}, ?);
+
+ bits<1> acc = !if(ps.has_vdst, vdst{9},
+ !if(!or(ps.has_data0, ps.has_gws_data0), data0{9}, 0));
}
@@ -86,7 +95,7 @@
class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
- (ins rc:$data0, offset:$offset, gds:$gds),
+ (ins getLdStRegisterOperand<rc>.ret:$data0, offset:$offset, gds:$gds),
" $data0$offset$gds"> {
let has_addr = 0;
@@ -97,11 +106,12 @@
class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
- (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
+ (ins VGPR_32:$addr, getLdStRegisterOperand<rc>.ret:$data0, offset:$offset, gds:$gds),
" $addr, $data0$offset$gds"> {
let has_data1 = 0;
let has_vdst = 0;
+ let IsAtomicNoRet = 1;
}
multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
@@ -114,13 +124,22 @@
}
}
-class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32>
+multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterClass rc = VGPR_32> {
+ let has_m0_read = 0 in {
+ def "" : DS_1A1D_NORET<opName, rc>,
+ AtomicNoRet<opName, 0>;
+ }
+}
+
+class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32,
+ RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
: DS_Pseudo<opName,
(outs),
- (ins VGPR_32:$addr, rc:$data0, rc:$data1, offset:$offset, gds:$gds),
+ (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, offset:$offset, gds:$gds),
" $addr, $data0, $data1$offset$gds"> {
let has_vdst = 0;
+ let IsAtomicNoRet = 1;
}
multiclass DS_1A2D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
@@ -133,10 +152,11 @@
}
}
-class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32>
+class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32,
+ RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
: DS_Pseudo<opName,
(outs),
- (ins VGPR_32:$addr, rc:$data0, rc:$data1,
+ (ins VGPR_32:$addr, data_op:$data0, data_op:$data1,
offset0:$offset0, offset1:$offset1, gds:$gds),
" $addr, $data0, $data1$offset0$offset1$gds"> {
@@ -153,14 +173,16 @@
}
}
-class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32>
+class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32,
+ RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
: DS_Pseudo<opName,
- (outs rc:$vdst),
- (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
+ (outs data_op:$vdst),
+ (ins VGPR_32:$addr, data_op:$data0, offset:$offset, gds:$gds),
" $vdst, $addr, $data0$offset$gds"> {
let hasPostISelHook = 1;
let has_data1 = 0;
+ let IsAtomicRet = 1;
}
multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32,
@@ -175,15 +197,27 @@
}
}
+multiclass DS_1A1D_RET_mc_gfx9 <string opName, RegisterClass rc = VGPR_32,
+ string NoRetOp = ""> {
+ let has_m0_read = 0 in {
+ def "" : DS_1A1D_RET<opName, rc>,
+ AtomicNoRet<!if(!eq(NoRetOp, ""), "", NoRetOp),
+ !if(!eq(NoRetOp, ""), 0, 1)>;
+ }
+}
+
class DS_1A2D_RET<string opName,
RegisterClass rc = VGPR_32,
- RegisterClass src = rc>
+ RegisterClass src = rc,
+ RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
+ RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
: DS_Pseudo<opName,
- (outs rc:$vdst),
- (ins VGPR_32:$addr, src:$data0, src:$data1, offset:$offset, gds:$gds),
+ (outs dst_op:$vdst),
+ (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, offset:$offset, gds:$gds),
" $vdst, $addr, $data0, $data1$offset$gds"> {
let hasPostISelHook = 1;
+ let IsAtomicRet = 1;
}
multiclass DS_1A2D_RET_mc<string opName,
@@ -201,10 +235,12 @@
class DS_1A2D_Off8_RET<string opName,
RegisterClass rc = VGPR_32,
- RegisterClass src = rc>
+ RegisterClass src = rc,
+ RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
+ RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
: DS_Pseudo<opName,
- (outs rc:$vdst),
- (ins VGPR_32:$addr, src:$data0, src:$data1, offset0:$offset0, offset1:$offset1, gds:$gds),
+ (outs dst_op:$vdst),
+ (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, offset0:$offset0, offset1:$offset1, gds:$gds),
" $vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
let has_offset = 0;
@@ -224,11 +260,12 @@
}
-class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset>
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset,
+ RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
: DS_Pseudo<opName,
- (outs rc:$vdst),
+ (outs data_op:$vdst),
!if(HasTiedOutput,
- (ins VGPR_32:$addr, ofs:$offset, gds:$gds, rc:$vdst_in),
+ (ins VGPR_32:$addr, ofs:$offset, gds:$gds, data_op:$vdst_in),
(ins VGPR_32:$addr, ofs:$offset, gds:$gds)),
" $vdst, $addr$offset$gds"> {
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
@@ -250,7 +287,7 @@
class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
- (outs rc:$vdst),
+ (outs getLdStRegisterOperand<rc>.ret:$vdst),
(ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds),
" $vdst, $addr$offset0$offset1$gds"> {
@@ -269,7 +306,7 @@
}
class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
- (outs VGPR_32:$vdst),
+ (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst),
(ins VGPR_32:$addr, offset:$offset),
" $vdst, $addr$offset gds"> {
@@ -281,7 +318,7 @@
}
class DS_0A_RET <string opName> : DS_Pseudo<opName,
- (outs VGPR_32:$vdst),
+ (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst),
(ins offset:$offset, gds:$gds),
" $vdst$offset$gds"> {
@@ -336,7 +373,8 @@
class DS_GWS_1D <string opName>
: DS_GWS<opName,
- (ins VGPR_32:$data0, offset:$offset), " $data0$offset gds"> {
+ (ins getLdStRegisterOperand<VGPR_32>.ret:$data0, offset:$offset),
+ " $data0$offset gds"> {
let has_gws_data0 = 1;
let hasSideEffects = 1;
@@ -360,10 +398,11 @@
let has_gds = 0;
}
-class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
+class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
+ RegisterOperand data_op = getLdStRegisterOperand<VGPR_32>.ret>
: DS_Pseudo<opName,
- (outs VGPR_32:$vdst),
- (ins VGPR_32:$addr, VGPR_32:$data0, offset:$offset),
+ (outs data_op:$vdst),
+ (ins VGPR_32:$addr, data_op:$data0, offset:$offset),
" $vdst, $addr, $data0$offset",
[(set i32:$vdst,
(node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > {
@@ -420,6 +459,11 @@
} // End mayLoad = 0
+let SubtargetPredicate = isGFX90APlus in {
+ defm DS_ADD_F64 : DS_1A1D_NORET_mc_gfx9<"ds_add_f64", VReg_64>;
+ defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64, "ds_add_f64">;
+} // End SubtargetPredicate = isGFX90APlus
+
defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">;
defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">;
defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">;
@@ -674,38 +718,6 @@
defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;
-let AddedComplexity = 100 in {
-
-foreach vt = VReg_64.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
-}
-
-let SubtargetPredicate = isGFX7Plus in {
-
-foreach vt = VReg_96.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">;
-}
-
-foreach vt = VReg_128.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">;
-}
-
-let SubtargetPredicate = HasUnalignedAccessMode in {
-
-foreach vt = VReg_96.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
-}
-
-foreach vt = VReg_128.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B128, vt, "load_local">;
-}
-
-} // End SubtargetPredicate = HasUnalignedAccessMode
-
-} // End SubtargetPredicate = isGFX7Plus
-
-} // End AddedComplexity = 100
-
let OtherPredicates = [D16PreservesUnusedBits] in {
def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2i16>;
def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2f16>;
@@ -829,31 +841,38 @@
defm : DS128Bit8ByteAlignedPat_mc<vt>;
}
+// Prefer ds_read over ds_read2 and ds_write over ds_write2, all other things
+// being equal, because they have a larger immediate offset range.
let AddedComplexity = 100 in {
foreach vt = VReg_64.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align8_local">;
}
let SubtargetPredicate = isGFX7Plus in {
foreach vt = VReg_96.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">;
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_align16_local">;
}
foreach vt = VReg_128.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">;
defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
}
let SubtargetPredicate = HasUnalignedAccessMode in {
+// FIXME: From a performance point of view, is ds_read_b96/ds_write_b96 a
+// better choice for unaligned accesses?
foreach vt = VReg_96.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
}
-foreach vt = VReg_128.RegTypes in {
-defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_local">;
-}
+// For performance reasons, *do not* select ds_read_b128/ds_write_b128 for unaligned
+// accesses.
} // End SubtargetPredicate = HasUnalignedAccessMode
@@ -938,6 +957,10 @@
defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap">;
+let SubtargetPredicate = isGFX90APlus in {
+def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>;
+}
+
def : Pat <
(SIds_ordered_count i32:$value, i16:$offset),
(DS_ORDERED_COUNT $value, (as_i16imm $offset))
@@ -959,10 +982,10 @@
let Inst{17} = !if(ps.has_gds, gds, ps.gdsValue);
let Inst{25-18} = op;
let Inst{31-26} = 0x36;
- let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0, 0));
- let Inst{47-40} = !if(ps.has_data0, data0, 0);
- let Inst{55-48} = !if(ps.has_data1, data1, 0);
- let Inst{63-56} = !if(ps.has_vdst, vdst, 0);
+ let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0{7-0}, 0));
+ let Inst{47-40} = !if(ps.has_data0, data0{7-0}, 0);
+ let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0);
+ let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0);
}
//===----------------------------------------------------------------------===//
@@ -1166,22 +1189,23 @@
// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
-class DS_Real_vi <bits<8> op, DS_Pseudo ds> :
- DS_Real <ds>,
- SIMCInstr <ds.Mnemonic, SIEncodingFamily.VI> {
+class DS_Real_vi <bits<8> op, DS_Pseudo ps> :
+ DS_Real <ps>,
+ SIMCInstr <ps.Mnemonic, SIEncodingFamily.VI> {
let AssemblerPredicate = isGFX8GFX9;
let DecoderNamespace = "GFX8";
// encoding
- let Inst{7-0} = !if(ds.has_offset0, offset0, 0);
- let Inst{15-8} = !if(ds.has_offset1, offset1, 0);
- let Inst{16} = !if(ds.has_gds, gds, ds.gdsValue);
+ let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
+ let Inst{15-8} = !if(ps.has_offset1, offset1, 0);
+ let Inst{16} = !if(ps.has_gds, gds, ps.gdsValue);
let Inst{24-17} = op;
+ let Inst{25} = acc;
let Inst{31-26} = 0x36; // ds prefix
- let Inst{39-32} = !if(ds.has_addr, addr, !if(ds.has_gws_data0, data0, 0));
- let Inst{47-40} = !if(ds.has_data0, data0, 0);
- let Inst{55-48} = !if(ds.has_data1, data1, 0);
- let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
+ let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0{7-0}, 0));
+ let Inst{47-40} = !if(ps.has_data0, data0{7-0}, 0);
+ let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0);
+ let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0);
}
def DS_ADD_U32_vi : DS_Real_vi<0x0, DS_ADD_U32>;
@@ -1344,3 +1368,8 @@
def DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>;
def DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>;
def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>;
+
+let SubtargetPredicate = isGFX90APlus in {
+ def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>;
+ def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
+} // End SubtargetPredicate = isGFX90APlus
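As a quick reference for the re-encoded DS_Real_vi fields above (offset0/offset1, gds, op, the new acc bit, and the {7-0} slices of the now-wider addr/data/vdst encodings), here is a minimal C++ sketch of how those fields pack into the 64-bit instruction word. The struct and helper are invented for illustration and are not part of the patch.

#include <cstdint>

// Field values as laid out by the TableGen class DS_Real_vi above.
struct DSFieldsVI {
  uint8_t Offset0, Offset1;          // Inst{7-0}, Inst{15-8}
  bool GDS, Acc;                     // Inst{16}, Inst{25} (acc: GFX90A AGPR data flag)
  uint8_t Op;                        // Inst{24-17}
  uint16_t Addr, Data0, Data1, Vdst; // 10-bit encodings; only bits {7-0} are emitted
};

uint64_t encodeDSWordVI(const DSFieldsVI &F) {
  uint64_t Inst = 0;
  Inst |= uint64_t(F.Offset0);
  Inst |= uint64_t(F.Offset1) << 8;
  Inst |= uint64_t(F.GDS) << 16;
  Inst |= uint64_t(F.Op) << 17;
  Inst |= uint64_t(F.Acc) << 25;
  Inst |= uint64_t(0x36) << 26;          // DS encoding prefix
  Inst |= uint64_t(F.Addr & 0xff) << 32;
  Inst |= uint64_t(F.Data0 & 0xff) << 40;
  Inst |= uint64_t(F.Data1 & 0xff) << 48;
  Inst |= uint64_t(F.Vdst & 0xff) << 56;
  return Inst;
}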
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 8061c6c..fe62b85 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -123,6 +123,7 @@
DECODE_OPERAND_REG(VReg_128)
DECODE_OPERAND_REG(VReg_256)
DECODE_OPERAND_REG(VReg_512)
+DECODE_OPERAND_REG(VReg_1024)
DECODE_OPERAND_REG(SReg_32)
DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
@@ -135,7 +136,9 @@
DECODE_OPERAND_REG(SReg_512)
DECODE_OPERAND_REG(AGPR_32)
+DECODE_OPERAND_REG(AReg_64)
DECODE_OPERAND_REG(AReg_128)
+DECODE_OPERAND_REG(AReg_256)
DECODE_OPERAND_REG(AReg_512)
DECODE_OPERAND_REG(AReg_1024)
DECODE_OPERAND_REG(AV_32)
@@ -157,6 +160,14 @@
return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
}
+static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeOperand_VSrcV232(Imm));
+}
+
static DecodeStatus decodeOperand_VS_16(MCInst &Inst,
unsigned Imm,
uint64_t Addr,
@@ -173,6 +184,14 @@
return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm));
}
+static DecodeStatus decodeOperand_AReg_64(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm | 512));
+}
+
static DecodeStatus decodeOperand_AReg_128(MCInst &Inst,
unsigned Imm,
uint64_t Addr,
@@ -181,6 +200,14 @@
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512));
}
+static DecodeStatus decodeOperand_AReg_256(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm | 512));
+}
+
static DecodeStatus decodeOperand_AReg_512(MCInst &Inst,
unsigned Imm,
uint64_t Addr,
@@ -197,6 +224,127 @@
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512));
}
+static DecodeStatus decodeOperand_VReg_64(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm));
+}
+
+static DecodeStatus decodeOperand_VReg_128(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm));
+}
+
+static DecodeStatus decodeOperand_VReg_256(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm));
+}
+
+static DecodeStatus decodeOperand_VReg_512(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm));
+}
+
+static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm));
+}
+
+static bool IsAGPROperand(const MCInst &Inst, int OpIdx,
+ const MCRegisterInfo *MRI) {
+ if (OpIdx < 0)
+ return false;
+
+ const MCOperand &Op = Inst.getOperand(OpIdx);
+ if (!Op.isReg())
+ return false;
+
+ unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
+ auto Reg = Sub ? Sub : Op.getReg();
+ return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255;
+}
+
+static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst,
+ unsigned Imm,
+ AMDGPUDisassembler::OpWidthTy Opw,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ if (!DAsm->isGFX90A()) {
+ Imm &= 511;
+ } else {
+ // If an atomic has both vdata and vdst, their register classes are tied.
+ // The bit is decoded along with the vdst, the first operand. We need to
+ // change the register class to AGPR if the vdst was an AGPR.
+ // If a DS instruction has both data0 and data1, their register classes
+ // are also tied.
+ unsigned Opc = Inst.getOpcode();
+ uint64_t TSFlags = DAsm->getMCII()->get(Opc).TSFlags;
+ uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
+ : AMDGPU::OpName::vdata;
+ const MCRegisterInfo *MRI = DAsm->getContext().getRegisterInfo();
+ int DataIdx = AMDGPU::getNamedOperandIdx(Opc, DataNameIdx);
+ if ((int)Inst.getNumOperands() == DataIdx) {
+ int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (IsAGPROperand(Inst, DstIdx, MRI))
+ Imm |= 512;
+ }
+
+ if (TSFlags & SIInstrFlags::DS) {
+ int Data2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
+ if ((int)Inst.getNumOperands() == Data2Idx &&
+ IsAGPROperand(Inst, DataIdx, MRI))
+ Imm |= 512;
+ }
+ }
+ return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
+}
+
+static DecodeStatus DecodeAVLdSt_32RegisterClass(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return decodeOperand_AVLdSt_Any(Inst, Imm,
+ AMDGPUDisassembler::OPW32, Decoder);
+}
+
+static DecodeStatus DecodeAVLdSt_64RegisterClass(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return decodeOperand_AVLdSt_Any(Inst, Imm,
+ AMDGPUDisassembler::OPW64, Decoder);
+}
+
+static DecodeStatus DecodeAVLdSt_96RegisterClass(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return decodeOperand_AVLdSt_Any(Inst, Imm,
+ AMDGPUDisassembler::OPW96, Decoder);
+}
+
+static DecodeStatus DecodeAVLdSt_128RegisterClass(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return decodeOperand_AVLdSt_Any(Inst, Imm,
+ AMDGPUDisassembler::OPW128, Decoder);
+}
+
static DecodeStatus decodeOperand_SReg_32(MCInst &Inst,
unsigned Imm,
uint64_t Addr,
@@ -250,6 +398,9 @@
return MCDisassembler::Fail;
}
+// The disassembler is greedy, so we need to check the FI operand value to
+// avoid parsing a dpp instruction when the correct literal is not set. For
+// dpp16 the autogenerated decoder checks the dpp literal.
static bool isValidDPP8(const MCInst &MI) {
using namespace llvm::AMDGPU::DPP;
int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi);
@@ -341,6 +492,12 @@
Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address);
if (Res) break;
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) {
+ Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address);
+ if (Res)
+ break;
+ }
+
if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address);
if (Res) break;
@@ -351,6 +508,13 @@
if (Bytes.size() < 4) break;
const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
+
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) {
+ Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address);
+ if (Res)
+ break;
+ }
+
Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address);
if (Res) break;
@@ -369,6 +533,7 @@
MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 ||
MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 ||
MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F64_e64_gfx90a ||
MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi ||
MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 ||
MI.getOpcode() == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
@@ -379,9 +544,44 @@
}
if (Res && (MCII->get(MI.getOpcode()).TSFlags &
- (SIInstrFlags::MUBUF | SIInstrFlags::FLAT)) &&
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::glc1) != -1) {
- insertNamedMCOperand(MI, MCOperand::createImm(1), AMDGPU::OpName::glc1);
+ (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) {
+ int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::cpol);
+ if (CPolPos != -1) {
+ unsigned CPol =
+ (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsAtomicRet) ?
+ AMDGPU::CPol::GLC : 0;
+ if (MI.getNumOperands() <= (unsigned)CPolPos) {
+ insertNamedMCOperand(MI, MCOperand::createImm(CPol),
+ AMDGPU::OpName::cpol);
+ } else if (CPol) {
+ MI.getOperand(CPolPos).setImm(MI.getOperand(CPolPos).getImm() | CPol);
+ }
+ }
+ }
+
+ if (Res && (MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
+ (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts])) {
+ // GFX90A lost TFE; its place is occupied by ACC.
+ int TFEOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
+ if (TFEOpIdx != -1) {
+ auto TFEIter = MI.begin();
+ std::advance(TFEIter, TFEOpIdx);
+ MI.insert(TFEIter, MCOperand::createImm(0));
+ }
+ }
+
+ if (Res && (MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) {
+ int SWZOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
+ if (SWZOpIdx != -1) {
+ auto SWZIter = MI.begin();
+ std::advance(SWZIter, SWZOpIdx);
+ MI.insert(SWZIter, MCOperand::createImm(0));
+ }
}
if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
@@ -453,6 +653,8 @@
return MCDisassembler::Success;
}
+// We must check FI == literal to reject non-genuine dpp8 instructions, and we
+// must first add the optional MI operands in order to check FI.
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
@@ -513,21 +715,21 @@
if (STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
unsigned DimIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim);
+ int A16Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16);
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
const AMDGPU::MIMGDimInfo *Dim =
AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm());
+ const bool IsA16 = (A16Idx != -1 && MI.getOperand(A16Idx).getImm());
- AddrSize = BaseOpcode->NumExtraArgs +
- (BaseOpcode->Gradients ? Dim->NumGradients : 0) +
- (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
- (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ AddrSize =
+ AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI));
+
IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA;
if (!IsNSA) {
if (AddrSize > 8)
AddrSize = 16;
- else if (AddrSize > 4)
- AddrSize = 8;
} else {
if (AddrSize > Info->VAddrDwords) {
// The NSA encoding does not contain enough operands for the combination
@@ -545,7 +747,7 @@
DstSize = (DstSize + 1) / 2;
}
- if (MI.getOperand(TFEIdx).getImm())
+ if (TFEIdx != -1 && MI.getOperand(TFEIdx).getImm())
DstSize += 1;
if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
@@ -701,6 +903,10 @@
return decodeSrcOp(OPWV216, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VSrcV232(unsigned Val) const {
+ return decodeSrcOp(OPWV232, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
// Some instructions have operand restrictions beyond what the encoding
// allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
@@ -718,10 +924,18 @@
return createRegOperand(AMDGPU::AGPR_32RegClassID, Val & 255);
}
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_64(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_64RegClassID, Val & 255);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_AReg_128(unsigned Val) const {
return createRegOperand(AMDGPU::AReg_128RegClassID, Val & 255);
}
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_256(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_256RegClassID, Val & 255);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const {
return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255);
}
@@ -758,6 +972,10 @@
return createRegOperand(AMDGPU::VReg_512RegClassID, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_1024(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_1024RegClassID, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const {
// table-gen generated disassembler doesn't care about operand types
// leaving only registry class so SSrc_32 operand turns into SReg_32
@@ -914,8 +1132,10 @@
case OPW128: // splat constants
case OPW512:
case OPW1024:
+ case OPWV232:
return MCOperand::createImm(getInlineImmVal32(Imm));
case OPW64:
+ case OPW256:
return MCOperand::createImm(getInlineImmVal64(Imm));
case OPW16:
case OPWV216:
@@ -935,8 +1155,14 @@
case OPW16:
case OPWV216:
return VGPR_32RegClassID;
- case OPW64: return VReg_64RegClassID;
+ case OPW64:
+ case OPWV232: return VReg_64RegClassID;
+ case OPW96: return VReg_96RegClassID;
case OPW128: return VReg_128RegClassID;
+ case OPW160: return VReg_160RegClassID;
+ case OPW256: return VReg_256RegClassID;
+ case OPW512: return VReg_512RegClassID;
+ case OPW1024: return VReg_1024RegClassID;
}
}
@@ -950,8 +1176,11 @@
case OPW16:
case OPWV216:
return AGPR_32RegClassID;
- case OPW64: return AReg_64RegClassID;
+ case OPW64:
+ case OPWV232: return AReg_64RegClassID;
+ case OPW96: return AReg_96RegClassID;
case OPW128: return AReg_128RegClassID;
+ case OPW160: return AReg_160RegClassID;
case OPW256: return AReg_256RegClassID;
case OPW512: return AReg_512RegClassID;
case OPW1024: return AReg_1024RegClassID;
@@ -969,8 +1198,11 @@
case OPW16:
case OPWV216:
return SGPR_32RegClassID;
- case OPW64: return SGPR_64RegClassID;
+ case OPW64:
+ case OPWV232: return SGPR_64RegClassID;
+ case OPW96: return SGPR_96RegClassID;
case OPW128: return SGPR_128RegClassID;
+ case OPW160: return SGPR_160RegClassID;
case OPW256: return SGPR_256RegClassID;
case OPW512: return SGPR_512RegClassID;
}
@@ -986,7 +1218,8 @@
case OPW16:
case OPWV216:
return TTMP_32RegClassID;
- case OPW64: return TTMP_64RegClassID;
+ case OPW64:
+ case OPWV232: return TTMP_64RegClassID;
case OPW128: return TTMP_128RegClassID;
case OPW256: return TTMP_256RegClassID;
case OPW512: return TTMP_512RegClassID;
@@ -1040,6 +1273,7 @@
case OPWV216:
return decodeSpecialReg32(Val);
case OPW64:
+ case OPWV232:
return decodeSpecialReg64(Val);
default:
llvm_unreachable("unexpected immediate type");
@@ -1209,6 +1443,10 @@
bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); }
+bool AMDGPUDisassembler::isGFX90A() const {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
+}
+
bool AMDGPUDisassembler::isGFX9Plus() const { return AMDGPU::isGFX9Plus(STI); }
bool AMDGPUDisassembler::isGFX10() const { return AMDGPU::isGFX10(STI); }
@@ -1217,6 +1455,10 @@
return AMDGPU::isGFX10Plus(STI);
}
+bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
+ return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+}
+
//===----------------------------------------------------------------------===//
// AMDGPU specific symbol handling
//===----------------------------------------------------------------------===//
@@ -1276,7 +1518,8 @@
AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
- KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
+ if (!hasArchitectedFlatScratch())
+ KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n";
@@ -1327,9 +1570,12 @@
uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
using namespace amdhsa;
StringRef Indent = "\t";
- PRINT_DIRECTIVE(
- ".amdhsa_system_sgpr_private_segment_wavefront_offset",
- COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
+ if (hasArchitectedFlatScratch())
+ PRINT_DIRECTIVE(".amdhsa_enable_private_segment",
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
+ else
+ PRINT_DIRECTIVE(".amdhsa_system_sgpr_private_segment_wavefront_offset",
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y",
@@ -1387,7 +1633,6 @@
uint16_t TwoByteBuffer = 0;
uint32_t FourByteBuffer = 0;
- uint64_t EightByteBuffer = 0;
StringRef ReservedBytes;
StringRef Indent = "\t";
@@ -1408,11 +1653,19 @@
<< FourByteBuffer << '\n';
return MCDisassembler::Success;
+ case amdhsa::KERNARG_SIZE_OFFSET:
+ FourByteBuffer = DE.getU32(Cursor);
+ KdStream << Indent << ".amdhsa_kernarg_size "
+ << FourByteBuffer << '\n';
+ return MCDisassembler::Success;
+
case amdhsa::RESERVED0_OFFSET:
- // 8 reserved bytes, must be 0.
- EightByteBuffer = DE.getU64(Cursor);
- if (EightByteBuffer) {
- return MCDisassembler::Fail;
+ // 4 reserved bytes, must be 0.
+ ReservedBytes = DE.getBytes(Cursor, 4);
+ for (int I = 0; I < 4; ++I) {
+ if (ReservedBytes[I] != 0) {
+ return MCDisassembler::Fail;
+ }
}
return MCDisassembler::Success;
@@ -1463,8 +1716,9 @@
using namespace amdhsa;
TwoByteBuffer = DE.getU16(Cursor);
- PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
- KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
+ if (!hasArchitectedFlatScratch())
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr",
@@ -1473,8 +1727,9 @@
KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
- PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
- KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+ if (!hasArchitectedFlatScratch())
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
@@ -1589,6 +1844,8 @@
Inst.addOperand(MCOperand::createExpr(Add));
return true;
}
+ // Add to the list of referenced addresses so the caller can synthesize a label.
+ ReferencedAddresses.push_back(static_cast<uint64_t>(Value));
return false;
}
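A note on the operand-value convention used by the new AV load/store decoders above: decodeOperand_AVLdSt_Any keeps the register number in the low 8 bits, ORs in 256 so decodeSrcOp treats the value as a VGPR-range operand, and (on GFX90A) ORs in 512 when the tied vdst/data operand already decoded to an AGPR. The helper below restates that convention; it is illustrative only and not part of the patch.

#include <string>

// Hypothetical restatement of the bit convention seen in
// decodeOperand_AVLdSt_Any / decodeOperand_AReg_* above: low 8 bits are the
// register number, bit 8 (|256) moves the value into the VGPR/AGPR range that
// decodeSrcOp expects, bit 9 (|512) selects the AGPR bank.
std::string describeAVLdStOperand(unsigned Imm) {
  unsigned RegNo = Imm & 255;
  bool IsAGPR = (Imm & 512) != 0;
  return (IsAGPR ? "a" : "v") + std::to_string(RegNo);
}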
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/src/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 714dabb..dc879ec 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -99,12 +99,14 @@
MCOperand decodeOperand_VS_128(unsigned Val) const;
MCOperand decodeOperand_VSrc16(unsigned Val) const;
MCOperand decodeOperand_VSrcV216(unsigned Val) const;
+ MCOperand decodeOperand_VSrcV232(unsigned Val) const;
MCOperand decodeOperand_VReg_64(unsigned Val) const;
MCOperand decodeOperand_VReg_96(unsigned Val) const;
MCOperand decodeOperand_VReg_128(unsigned Val) const;
MCOperand decodeOperand_VReg_256(unsigned Val) const;
MCOperand decodeOperand_VReg_512(unsigned Val) const;
+ MCOperand decodeOperand_VReg_1024(unsigned Val) const;
MCOperand decodeOperand_SReg_32(unsigned Val) const;
MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const;
@@ -117,7 +119,9 @@
MCOperand decodeOperand_SReg_512(unsigned Val) const;
MCOperand decodeOperand_AGPR_32(unsigned Val) const;
+ MCOperand decodeOperand_AReg_64(unsigned Val) const;
MCOperand decodeOperand_AReg_128(unsigned Val) const;
+ MCOperand decodeOperand_AReg_256(unsigned Val) const;
MCOperand decodeOperand_AReg_512(unsigned Val) const;
MCOperand decodeOperand_AReg_1024(unsigned Val) const;
MCOperand decodeOperand_AV_32(unsigned Val) const;
@@ -126,12 +130,15 @@
enum OpWidthTy {
OPW32,
OPW64,
+ OPW96,
OPW128,
+ OPW160,
OPW256,
OPW512,
OPW1024,
OPW16,
OPWV216,
+ OPWV232,
OPW_LAST_,
OPW_FIRST_ = OPW32
};
@@ -159,11 +166,16 @@
int getTTmpIdx(unsigned Val) const;
+ const MCInstrInfo *getMCII() const { return MCII.get(); }
+
bool isVI() const;
bool isGFX9() const;
+ bool isGFX90A() const;
bool isGFX9Plus() const;
bool isGFX10() const;
bool isGFX10Plus() const;
+
+ bool hasArchitectedFlatScratch() const;
};
//===----------------------------------------------------------------------===//
@@ -173,6 +185,7 @@
class AMDGPUSymbolizer : public MCSymbolizer {
private:
void *DisInfo;
+ std::vector<uint64_t> ReferencedAddresses;
public:
AMDGPUSymbolizer(MCContext &Ctx, std::unique_ptr<MCRelocationInfo> &&RelInfo,
@@ -187,6 +200,10 @@
void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
int64_t Value,
uint64_t Address) override;
+
+ ArrayRef<uint64_t> getReferencedAddresses() const override {
+ return ReferencedAddresses;
+ }
};
} // end namespace llvm
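The header above exposes the addresses collected by the symbolizer through getReferencedAddresses(), matching the comment in the .cpp change about letting the caller synthesize labels for referenced addresses. A hypothetical consumer might look like the sketch below; the function name and labeling scheme are invented for illustration and are not part of the patch.

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Turn each distinct referenced address into a synthesized label name.
std::map<uint64_t, std::string>
synthesizeLabels(const std::vector<uint64_t> &ReferencedAddresses) {
  std::map<uint64_t, std::string> Labels;
  for (uint64_t Addr : ReferencedAddresses) {
    if (Labels.count(Addr))
      continue;                                    // one label per distinct target
    Labels[Addr] = "L" + std::to_string(Labels.size());
  }
  return Labels;
}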
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 8d3e138..596c3d7 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -532,7 +532,10 @@
def : UMad24Pat<MULADD_UINT24_eg>;
def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
-def : FSHRPattern <BIT_ALIGN_INT_eg>;
+def : AMDGPUPat <
+ (fshr i32:$src0, i32:$src1, i32:$src2),
+ (BIT_ALIGN_INT_eg $src0, $src1, $src2)
+>;
def : ROTRPattern <BIT_ALIGN_INT_eg>;
def MULADD_eg : MULADD_Common<0x14>;
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
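For reference, the fshr pattern added above selects BIT_ALIGN_INT for a 32-bit funnel shift right. A minimal C++ model of that operation (shift amount taken modulo 32, per the LLVM fshr definition) is sketched below; it is illustrative only and not part of the patch.

#include <cstdint>

// fshr(Hi, Lo, Amt): shift the 64-bit concatenation Hi:Lo right by Amt % 32
// and return the low 32 bits -- the operation BIT_ALIGN_INT implements.
uint32_t fshr32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  Amt &= 31;
  if (Amt == 0)
    return Lo;
  return (Lo >> Amt) | (Hi << (32 - Amt));
}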
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 57a355a..90f26e5 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -6,9 +6,9 @@
//
//===----------------------------------------------------------------------===//
-def FLATOffset : ComplexPattern<i64, 2, "SelectFlatOffset<false>", [], [SDNPWantRoot], -10>;
-def FLATOffsetSigned : ComplexPattern<i64, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
-def ScratchOffset : ComplexPattern<i32, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
+def FlatOffset : ComplexPattern<i64, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>;
+def GlobalOffset : ComplexPattern<i64, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>;
+def ScratchOffset : ComplexPattern<i32, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>;
def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>;
def ScratchSAddr : ComplexPattern<i32, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>;
@@ -54,6 +54,8 @@
bits<1> glcValue = 0;
bits<1> has_dlc = 1;
bits<1> dlcValue = 0;
+ bits<1> has_sccb = 1;
+ bits<1> sccbValue = 0;
let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
!if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
@@ -67,9 +69,9 @@
let VM_CNT = 1;
let LGKM_CNT = !not(!or(is_flat_global, is_flat_scratch));
- let IsFlatGlobal = is_flat_global;
+ let FlatGlobal = is_flat_global;
- let IsFlatScratch = is_flat_scratch;
+ let FlatScratch = is_flat_scratch;
}
class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
@@ -79,22 +81,29 @@
let isPseudo = 0;
let isCodeGenOnly = 0;
+ let FLAT = 1;
+
// copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let AsmMatchConverter = ps.AsmMatchConverter;
- let OtherPredicates = ps.OtherPredicates;
- let TSFlags = ps.TSFlags;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let OtherPredicates = ps.OtherPredicates;
+ let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let IsAtomicRet = ps.IsAtomicRet;
+ let IsAtomicNoRet = ps.IsAtomicNoRet;
+ let VM_CNT = ps.VM_CNT;
+ let LGKM_CNT = ps.LGKM_CNT;
// encoding fields
bits<8> vaddr;
- bits<8> vdata;
+ bits<10> vdata;
bits<7> saddr;
- bits<8> vdst;
+ bits<10> vdst;
- bits<1> slc;
- bits<1> glc;
- bits<1> dlc;
+ bits<5> cpol;
// Only valid on gfx9
bits<1> lds = 0; // XXX - What does this actually do?
@@ -106,7 +115,8 @@
// Signed offset. Highest bit ignored for flat and treated as 12-bit
// unsigned for flat accesses.
bits<13> offset;
- bits<1> nv = 0; // XXX - What does this actually do?
+ // GFX90A+ only: instruction uses AccVGPR for data
+ bits<1> acc = !if(ps.has_vdst, vdst{9}, !if(ps.has_data, vdata{9}, 0));
// We don't use tfe right now, and it was removed in gfx9.
bits<1> tfe = 0;
@@ -116,17 +126,17 @@
let Inst{13} = lds;
let Inst{15-14} = seg;
- let Inst{16} = !if(ps.has_glc, glc, ps.glcValue);
- let Inst{17} = slc;
+ let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glcValue);
+ let Inst{17} = cpol{CPolBit.SLC};
let Inst{24-18} = op;
let Inst{31-26} = 0x37; // Encoding.
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_data, vdata, ?);
+ let Inst{47-40} = !if(ps.has_data, vdata{7-0}, ?);
let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7f), 0);
// 54-48 is reserved.
- let Inst{55} = nv; // nv on GFX9+, TFE before.
- let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
+ let Inst{55} = acc; // nv on GFX9+, TFE before. AccVGPR for data on GFX90A.
+ let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, ?);
}
class GlobalSaddrTable <bit is_saddr, string Name = ""> {
@@ -139,9 +149,10 @@
// saddr is 32-bit (which isn't handled here yet).
class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
bit HasTiedOutput = 0,
- bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
+ bit HasSaddr = 0, bit EnableSaddr = 0,
+ RegisterOperand vdata_op = getLdStRegisterOperand<regClass>.ret> : FLAT_Pseudo<
opName,
- (outs regClass:$vdst),
+ (outs vdata_op:$vdst),
!con(
!con(
!if(EnableSaddr,
@@ -149,9 +160,9 @@
(ins VReg_64:$vaddr)),
(ins flat_offset:$offset)),
// FIXME: Operands with default values do not work with following non-optional operands.
- !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, regClass:$vdst_in),
- (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))),
- " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> {
+ !if(HasTiedOutput, (ins CPol:$cpol, vdata_op:$vdst_in),
+ (ins CPol_0:$cpol))),
+ " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"> {
let has_data = 0;
let mayLoad = 1;
let has_saddr = HasSaddr;
@@ -169,10 +180,10 @@
(outs),
!con(
!if(EnableSaddr,
- (ins VGPR_32:$vaddr, vdataClass:$vdata, SReg_64:$saddr),
- (ins VReg_64:$vaddr, vdataClass:$vdata)),
- (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc)),
- " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> {
+ (ins VGPR_32:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata, SReg_64:$saddr),
+ (ins VReg_64:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata)),
+ (ins flat_offset:$offset, CPol_0:$cpol)),
+ " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
@@ -196,9 +207,9 @@
opName,
(outs regClass:$vdst),
!con(!if(EnableSaddr, (ins SReg_64:$saddr), (ins)),
- (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
+ (ins flat_offset:$offset, CPol_0:$cpol),
!if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
- " $vdst, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+ " $vdst, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
let is_flat_global = 1;
let has_data = 0;
let mayLoad = 1;
@@ -234,8 +245,8 @@
opName,
(outs),
!con(!if(EnableSaddr, (ins vdataClass:$vdata, SReg_64:$saddr), (ins vdataClass:$vdata)),
- (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
- " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+ (ins flat_offset:$offset, CPol:$cpol)),
+ " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
let is_flat_global = 1;
let mayLoad = 0;
let mayStore = 1;
@@ -266,16 +277,16 @@
bit EnableVaddr = !not(EnableSaddr)>
: FLAT_Pseudo<
opName,
- (outs regClass:$vdst),
+ (outs getLdStRegisterOperand<regClass>.ret:$vdst),
!con(
!if(EnableSaddr,
(ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
!if(EnableVaddr,
(ins VGPR_32:$vaddr, flat_offset:$offset),
(ins flat_offset:$offset))),
- !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, regClass:$vdst_in),
- (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))),
- " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+ !if(HasTiedOutput, (ins CPol:$cpol, getLdStRegisterOperand<regClass>.ret:$vdst_in),
+ (ins CPol_0:$cpol))),
+ " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
let has_data = 0;
let mayLoad = 1;
let has_saddr = 1;
@@ -289,15 +300,16 @@
}
class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0,
- bit EnableVaddr = !not(EnableSaddr)> : FLAT_Pseudo<
+ bit EnableVaddr = !not(EnableSaddr),
+ RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret> : FLAT_Pseudo<
opName,
(outs),
!if(EnableSaddr,
- (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
+ (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol),
!if(EnableVaddr,
- (ins vdataClass:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
- (ins vdataClass:$vdata, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))),
- " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+ (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol),
+ (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol))),
+ " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
@@ -344,7 +356,10 @@
let has_dlc = 0;
let dlcValue = 0;
let has_vdst = 0;
+ let has_sccb = 1;
+ let sccbValue = 0;
let maybeAtomic = 1;
+ let IsAtomicNoRet = 1;
}
class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
@@ -354,6 +369,9 @@
let has_vdst = 1;
let glcValue = 1;
let dlcValue = 0;
+ let sccbValue = 0;
+ let IsAtomicNoRet = 0;
+ let IsAtomicRet = 1;
let PseudoInstr = NAME # "_RTN";
}
@@ -364,11 +382,12 @@
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = isFloatType<data_vt>.ret> {
+ bit isFP = isFloatType<data_vt>.ret,
+ RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC_0:$slc),
- " $vaddr, $vdata$offset$slc">,
+ (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
+ " $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
@@ -377,11 +396,11 @@
}
def _RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc),
- " $vdst, $vaddr, $vdata$offset$glc1$slc",
+ (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst),
+ (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+ " $vdst, $vaddr, $vdata$offset$cpol",
[(set vt:$vdst,
- (atomic (FLATOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
+ (atomic (FlatOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1>{
let FPAtomic = isFP;
@@ -396,12 +415,13 @@
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = isFloatType<data_vt>.ret> {
+ bit isFP = isFloatType<data_vt>.ret,
+ RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC_0:$slc),
- " $vaddr, $vdata, off$offset$slc">,
+ (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
+ " $vaddr, $vdata, off$offset$cpol">,
GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let has_saddr = 1;
@@ -411,8 +431,8 @@
def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VGPR_32:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC_0:$slc),
- " $vaddr, $vdata, $saddr$offset$slc">,
+ (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol),
+ " $vaddr, $vdata, $saddr$offset$cpol">,
GlobalSaddrTable<1, opName>,
AtomicNoRet <opName#"_saddr", 0> {
let has_saddr = 1;
@@ -429,14 +449,16 @@
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = isFloatType<data_vt>.ret> {
+ bit isFP = isFloatType<data_vt>.ret,
+ RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
+ RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
def _RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc),
- " $vdst, $vaddr, $vdata, off$offset$glc1$slc",
+ (outs vdst_op:$vdst),
+ (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+ " $vdst, $vaddr, $vdata, off$offset$cpol",
[(set vt:$vdst,
- (atomic (FLATOffsetSigned i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
+ (atomic (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1> {
let has_saddr = 1;
@@ -444,9 +466,9 @@
}
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs vdst_rc:$vdst),
- (ins VGPR_32:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc),
- " $vdst, $vaddr, $vdata, $saddr$offset$glc1$slc">,
+ (outs vdst_op:$vdst),
+ (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+ " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
GlobalSaddrTable<1, opName#"_rtn">,
AtomicNoRet <opName#"_saddr", 1> {
let has_saddr = 1;
@@ -605,6 +627,15 @@
} // End SubtargetPredicate = isGFX7GFX10
+let SubtargetPredicate = isGFX90APlus in {
+ defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>;
+ defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>;
+ defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>;
+ defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
+ defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
+ defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+} // End SubtargetPredicate = isGFX90APlus
+
defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
@@ -777,6 +808,15 @@
"global_atomic_pk_add_f16", VGPR_32, v2f16
>;
} // End OtherPredicates = [HasAtomicFaddInsts]
+
+let OtherPredicates = [isGFX90APlus] in {
+ defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
+ "global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd
+ >;
+ defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
+ "global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd
+ >;
+} // End OtherPredicates = [isGFX90APlus]
} // End is_flat_global = 1
//===----------------------------------------------------------------------===//
@@ -785,33 +825,33 @@
// Patterns for global loads with no offset.
class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FLATOffset i64:$vaddr, i16:$offset))),
+ (vt (node (FlatOffset i64:$vaddr, i16:$offset))),
(inst $vaddr, $offset)
>;
class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (FLATOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
- (inst $vaddr, $offset, 0, 0, 0, $in)
+ (node (FlatOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
+ (inst $vaddr, $offset, 0, $in)
>;
class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
- (inst $vaddr, $offset, 0, 0, 0, $in)
+ (node (GlobalOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
+ (inst $vaddr, $offset, 0, $in)
>;
class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$in)),
- (inst $saddr, $voffset, $offset, 0, 0, 0, $in)
+ (inst $saddr, $voffset, $offset, 0, $in)
>;
class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset))),
+ (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i16:$offset))),
(inst $vaddr, $offset)
>;
class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset))),
- (inst $saddr, $voffset, $offset, 0, 0, 0)
+ (inst $saddr, $voffset, $offset, 0)
>;
class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
@@ -839,19 +879,19 @@
>;
class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node vt:$data, (FLATOffset i64:$vaddr, i16:$offset)),
+ (node vt:$data, (FlatOffset i64:$vaddr, i16:$offset)),
(inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node vt:$data, (FLATOffsetSigned i64:$vaddr, i16:$offset)),
+ (node vt:$data, (GlobalOffset i64:$vaddr, i16:$offset)),
(inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
- (node (FLATOffset i64:$vaddr, i16:$offset), vt:$data),
+ (node (FlatOffset i64:$vaddr, i16:$offset), vt:$data),
(inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
@@ -859,29 +899,29 @@
ValueType vt, ValueType data_vt = vt> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
- (node (FLATOffset i64:$vaddr, i16:$offset), data_vt:$data),
+ (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data),
(inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : GCNPat <
- (vt (node (FLATOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+ (vt (node (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(inst $vaddr, $data, $offset)
>;
class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (FLATOffset i64:$vaddr, i16:$offset), vt:$data),
+ (node (FlatOffset i64:$vaddr, i16:$offset), vt:$data),
(inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (FLATOffsetSigned i64:$vaddr, i16:$offset), vt:$data),
+ (node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data),
(inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : GCNPat <
- (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset), data_vt:$data)),
+ (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(inst $vaddr, $data, $offset)
>;
@@ -892,7 +932,7 @@
class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset), vt:$in),
- (inst $vaddr, $offset, 0, 0, 0, $in)
+ (inst $vaddr, $offset, 0, $in)
>;
class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -907,7 +947,7 @@
class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset), vt:$in)),
- (inst $saddr, $offset, 0, 0, 0, $in)
+ (inst $saddr, $offset, 0, $in)
>;
class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
@@ -1202,6 +1242,17 @@
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_PK_ADD_F16, atomic_load_fadd_v2f16_global_noret_32, v2f16>;
}
+let OtherPredicates = [isGFX90APlus] in {
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", atomic_load_fadd_global_32, f32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", atomic_load_fadd_v2f16_global_32, v2f16>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", atomic_load_fadd_global_64, f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", atomic_load_fmin_global_64, f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", atomic_load_fmax_global_64, f64>;
+def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64_RTN, atomic_load_fadd_flat_64, f64>;
+def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64_RTN, atomic_load_fmin_flat_64, f64>;
+def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64_RTN, atomic_load_fmax_flat_64, f64>;
+}
+
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
@@ -1337,16 +1388,21 @@
// VI
//===----------------------------------------------------------------------===//
-class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps> :
+class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
FLAT_Real <op, ps>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
let AssemblerPredicate = isGFX8GFX9;
let DecoderNamespace = "GFX8";
+
+ let Inst{25} = !if(has_sccb, cpol{CPolBit.SCC}, ps.sccbValue);
+ let AsmString = ps.Mnemonic #
+ !subst("$sccb", !if(has_sccb, "$sccb",""), ps.AsmOperands);
}
-multiclass FLAT_Real_AllAddr_vi<bits<7> op> {
- def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)>;
- def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
+multiclass FLAT_Real_AllAddr_vi<bits<7> op,
+ bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+ def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
}
def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
@@ -1374,15 +1430,17 @@
def FLAT_LOAD_SHORT_D16_vi : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>;
def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
-multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps> {
- def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
- def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
+multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps,
+ bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
+ def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
}
-multiclass FLAT_Global_Real_Atomics_vi<bits<7> op> :
- FLAT_Real_AllAddr_vi<op> {
- def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN")>;
- def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>;
+multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
+ bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
+ FLAT_Real_AllAddr_vi<op, has_sccb> {
+ def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
+ def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
}
@@ -1489,6 +1547,19 @@
defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>;
defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>;
+let SubtargetPredicate = HasAtomicFaddInsts in {
+defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>;
+defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>;
+}
+
+let SubtargetPredicate = isGFX90AOnly in {
+ defm FLAT_ATOMIC_ADD_F64 : FLAT_Real_Atomics_vi<0x4f, FLAT_ATOMIC_ADD_F64, 0>;
+ defm FLAT_ATOMIC_MIN_F64 : FLAT_Real_Atomics_vi<0x50, FLAT_ATOMIC_MIN_F64, 0>;
+ defm FLAT_ATOMIC_MAX_F64 : FLAT_Real_Atomics_vi<0x51, FLAT_ATOMIC_MAX_F64, 0>;
+ defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_vi<0x4f, 0>;
+ defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_vi<0x50, 0>;
+ defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51, 0>;
+} // End SubtargetPredicate = isGFX90AOnly
//===----------------------------------------------------------------------===//
// GFX10.
@@ -1500,7 +1571,7 @@
let DecoderNamespace = "GFX10";
let Inst{11-0} = offset{11-0};
- let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue);
+ let Inst{12} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlcValue);
let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d);
let Inst{55} = 0;
}
@@ -1695,10 +1766,3 @@
defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x023>;
defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x024>;
defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x025>;
-
-let SubtargetPredicate = HasAtomicFaddInsts in {
-
-defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Real_AllAddr_vi <0x04d>;
-defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Real_AllAddr_vi <0x04e>;
-
-} // End SubtargetPredicate = HasAtomicFaddInsts
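The FLAT changes above fold the separate glc/slc/dlc (and, on GFX90A, sccb) operands into a single 5-bit cache-policy operand, indexed through CPolBit in the encodings and exposed as AMDGPU::CPol on the C++ side. The sketch below shows the general idea; the exact bit positions are assumptions for illustration, and the authoritative values live in SIDefines.h.

#include <cstdint>

namespace CPolSketch {
// Assumed bit layout of the merged cache-policy operand (illustrative only).
constexpr unsigned GLC = 1u << 0; // globally coherent / atomic returns data
constexpr unsigned SLC = 1u << 1; // system-level coherent
constexpr unsigned DLC = 1u << 2; // device-level coherent (GFX10)
constexpr unsigned SCC = 1u << 4; // GFX90A sccb bit
} // namespace CPolSketch

// An atomic-with-return FLAT instruction carries GLC set, matching the
// CPol_GLC1 operand class used for the _RTN pseudos above.
bool isAtomicRetCPol(unsigned CPol) { return (CPol & CPolSketch::GLC) != 0; }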
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index e4eacd1..2bf3651 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -54,21 +54,20 @@
class GCNDPPCombine : public MachineFunctionPass {
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
+ const GCNSubtarget *ST;
using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
- MachineInstr *createDPPInst(MachineInstr &OrigMI,
- MachineInstr &MovMI,
+ MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
RegSubRegPair CombOldVGPR,
- MachineOperand *OldOpnd,
- bool CombBCZ) const;
+ MachineOperand *OldOpnd, bool CombBCZ,
+ bool IsShrinkable) const;
- MachineInstr *createDPPInst(MachineInstr &OrigMI,
- MachineInstr &MovMI,
- RegSubRegPair CombOldVGPR,
- bool CombBCZ) const;
+ MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
+ RegSubRegPair CombOldVGPR, bool CombBCZ,
+ bool IsShrinkable) const;
bool hasNoImmOrEqual(MachineInstr &MI,
unsigned OpndName,
@@ -99,7 +98,8 @@
}
private:
- int getDPPOp(unsigned Op) const;
+ int getDPPOp(unsigned Op, bool IsShrinkable) const;
+ bool isShrinkable(MachineInstr &MI) const;
};
} // end anonymous namespace
@@ -114,11 +114,40 @@
return new GCNDPPCombine();
}
-int GCNDPPCombine::getDPPOp(unsigned Op) const {
+bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
+ unsigned Op = MI.getOpcode();
+ if (!TII->isVOP3(Op)) {
+ return false;
+ }
+ if (!TII->hasVALU32BitEncoding(Op)) {
+ LLVM_DEBUG(dbgs() << " Inst hasn't e32 equivalent\n");
+ return false;
+ }
+ if (const auto *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+ // Give up if there are any uses of the carry-out from instructions like
+ // V_ADD_CO_U32. The shrunken form of the instruction would write it to vcc
+ // instead of to a virtual register.
+ if (!MRI->use_nodbg_empty(SDst->getReg()))
+ return false;
+ }
+ // check if modifiers other than abs|neg are set (opsel, for example)
+ const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
+ if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
+ !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
+ !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
+ !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0)) {
+ LLVM_DEBUG(dbgs() << " Inst has non-default modifiers\n");
+ return false;
+ }
+ return true;
+}
+
+int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
auto DPP32 = AMDGPU::getDPPOp32(Op);
- if (DPP32 == -1) {
+ if (IsShrinkable) {
+ assert(DPP32 == -1);
auto E32 = AMDGPU::getVOPe32(Op);
- DPP32 = (E32 == -1)? -1 : AMDGPU::getDPPOp32(E32);
+ DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
}
return (DPP32 == -1 || TII->pseudoToMCOpcode(DPP32) == -1) ? -1 : DPP32;
}
@@ -137,7 +166,8 @@
case AMDGPU::IMPLICIT_DEF:
return nullptr;
case AMDGPU::COPY:
- case AMDGPU::V_MOV_B32_e32: {
+ case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::V_MOV_B64_PSEUDO: {
auto &Op1 = Def->getOperand(1);
if (Op1.isImm())
return &Op1;
@@ -150,11 +180,13 @@
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
RegSubRegPair CombOldVGPR,
- bool CombBCZ) const {
- assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+ bool CombBCZ,
+ bool IsShrinkable) const {
+ assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
+ MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
auto OrigOp = OrigMI.getOpcode();
- auto DPPOp = getDPPOp(OrigOp);
+ auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
if (DPPOp == -1) {
LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
return nullptr;
@@ -174,7 +206,11 @@
const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
if (OldIdx != -1) {
assert(OldIdx == NumOperands);
- assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
+ assert(isOfRegClass(
+ CombOldVGPR,
+ *MRI->getRegClass(
+ TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
+ *MRI));
auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
CombOldVGPR.SubReg);
@@ -308,11 +344,9 @@
return false;
}
-MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
- MachineInstr &MovMI,
- RegSubRegPair CombOldVGPR,
- MachineOperand *OldOpndValue,
- bool CombBCZ) const {
+MachineInstr *GCNDPPCombine::createDPPInst(
+ MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR,
+ MachineOperand *OldOpndValue, bool CombBCZ, bool IsShrinkable) const {
assert(CombOldVGPR.Reg);
if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
@@ -325,12 +359,14 @@
return nullptr;
}
CombOldVGPR = getRegSubRegPair(*Src1);
- if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
- LLVM_DEBUG(dbgs() << " failed: src1 isn't a VGPR32 register\n");
+ auto MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
+ const TargetRegisterClass *RC = MRI->getRegClass(MovDst->getReg());
+ if (!isOfRegClass(CombOldVGPR, *RC, *MRI)) {
+ LLVM_DEBUG(dbgs() << " failed: src1 has wrong register class\n");
return nullptr;
}
}
- return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ);
+ return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
}
// returns true if MI doesn't have OpndName immediate operand or the
@@ -346,7 +382,8 @@
}
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
- assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+ assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
+ MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
@@ -362,6 +399,17 @@
return false;
}
+ if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
+ auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
+ assert(DppCtrl && DppCtrl->isImm());
+ if (!AMDGPU::isLegal64BitDPPControl(DppCtrl->getImm())) {
+ LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported"
+ " control value\n");
+ // Let it split, then control may become legal.
+ return false;
+ }
+ }
+
auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
assert(RowMaskOpnd && RowMaskOpnd->isImm());
auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
@@ -430,8 +478,9 @@
auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
// try to reuse previous old reg if its undefined (IMPLICIT_DEF)
if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
+ const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg);
CombOldVGPR = RegSubRegPair(
- MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
+ MRI->createVirtualRegister(RC));
auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
DPPMIs.push_back(UndefInst.getInstr());
@@ -482,21 +531,8 @@
continue;
}
- if (TII->isVOP3(OrigOp)) {
- if (!TII->hasVALU32BitEncoding(OrigOp)) {
- LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n");
- break;
- }
- // check if other than abs|neg modifiers are set (opsel for example)
- const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
- if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
- !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
- !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
- !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
- LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n");
- break;
- }
- } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
+ bool IsShrinkable = isShrinkable(OrigMI);
+ if (!(IsShrinkable || TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
break;
}
@@ -521,7 +557,7 @@
LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
if (Use == Src0) {
if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
- OldOpndValue, CombBCZ)) {
+ OldOpndValue, CombBCZ, IsShrinkable)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
@@ -532,8 +568,9 @@
BB->insert(OrigMI, NewMI);
if (TII->commuteInstruction(*NewMI)) {
LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
- if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
- OldOpndValue, CombBCZ)) {
+ if (auto *DPPInst =
+ createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ,
+ IsShrinkable)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
@@ -566,12 +603,12 @@
}
bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
- auto &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasDPP() || skipFunction(MF.getFunction()))
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->hasDPP() || skipFunction(MF.getFunction()))
return false;
MRI = &MF.getRegInfo();
- TII = ST.getInstrInfo();
+ TII = ST->getInstrInfo();
bool Changed = false;
for (auto &MBB : MF) {
@@ -581,12 +618,17 @@
Changed = true;
++NumDPPMovsCombined;
} else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
- auto Split = TII->expandMovDPP64(MI);
- for (auto M : { Split.first, Split.second }) {
- if (combineDPPMov(*M))
- ++NumDPPMovsCombined;
+ if (ST->has64BitDPP() && combineDPPMov(MI)) {
+ Changed = true;
+ ++NumDPPMovsCombined;
+ } else {
+ auto Split = TII->expandMovDPP64(MI);
+ for (auto M : { Split.first, Split.second }) {
+ if (M && combineDPPMov(*M))
+ ++NumDPPMovsCombined;
+ }
+ Changed = true;
}
- Changed = true;
}
}
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index ed1dc77..bc2fb1e 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -23,6 +23,9 @@
// Hazard Recoginizer Implementation
//===----------------------------------------------------------------------===//
+static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
+ const GCNSubtarget &ST);
+
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
IsHazardRecognizerMode(false),
CurrCycleInstr(nullptr),
@@ -32,8 +35,9 @@
TRI(TII.getRegisterInfo()),
ClauseUses(TRI.getNumRegUnits()),
ClauseDefs(TRI.getNumRegUnits()) {
- MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
+ MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
TSchedModel.init(&ST);
+ RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}
void GCNHazardRecognizer::Reset() {
@@ -87,6 +91,25 @@
}
}
+static bool isDGEMM(unsigned Opcode) {
+ return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
+ Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
+ Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
+ Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
+}
+
+static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+
+ if (!SIInstrInfo::isMAI(MI) ||
+ isDGEMM(Opcode) ||
+ Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
+ Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
+ return false;
+
+ return true;
+}
+
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
const MachineInstr &MI) {
if (TII.isAlwaysGDS(MI.getOpcode()))
@@ -138,12 +161,6 @@
if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
return HazardType;
- // FIXME: Should flat be considered vmem?
- if ((SIInstrInfo::isVMEM(*MI) ||
- SIInstrInfo::isFLAT(*MI))
- && checkVMEMHazards(MI) > 0)
- return HazardType;
-
if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
return HazardType;
@@ -153,6 +170,12 @@
if (ST.hasNoDataDepHazard())
return NoHazard;
+ // FIXME: Should flat be considered vmem?
+ if ((SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI))
+ && checkVMEMHazards(MI) > 0)
+ return HazardType;
+
if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
return HazardType;
@@ -165,6 +188,11 @@
if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
return HazardType;
+ if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
+ SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
+ return HazardType;
+
if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
return HazardType;
@@ -251,9 +279,6 @@
if (SIInstrInfo::isSMRD(*MI))
return std::max(WaitStates, checkSMRDHazards(MI));
- if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
- WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
-
if (ST.hasNSAtoVMEMBug())
WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
@@ -262,6 +287,9 @@
if (ST.hasNoDataDepHazard())
return WaitStates;
+ if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
+ WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
+
if (SIInstrInfo::isVALU(*MI))
WaitStates = std::max(WaitStates, checkVALUHazards(MI));
@@ -274,6 +302,11 @@
if (isRWLane(MI->getOpcode()))
WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
+ if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
+ SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
+ WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
+
if (MI->isInlineAsm())
return std::max(WaitStates, checkInlineAsmHazards(MI));
@@ -319,8 +352,7 @@
// Do not track non-instructions which do not affect the wait states.
// If included, these instructions can lead to buffer overflow such that
// detectable hazards are missed.
- if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
- CurrCycleInstr->isKill()) {
+ if (CurrCycleInstr->isMetaInstruction()) {
CurrCycleInstr = nullptr;
return;
}
@@ -359,23 +391,22 @@
// Helper Functions
//===----------------------------------------------------------------------===//
-typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
+typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
// Returns a minimum wait states since \p I walking all predecessors.
// Only scans until \p IsExpired does not return true.
// Can only be run in a hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
- MachineBasicBlock *MBB,
- MachineBasicBlock::reverse_instr_iterator I,
- int WaitStates,
- IsExpiredFn IsExpired,
+ const MachineBasicBlock *MBB,
+ MachineBasicBlock::const_reverse_instr_iterator I,
+ int WaitStates, IsExpiredFn IsExpired,
DenseSet<const MachineBasicBlock *> &Visited) {
for (auto E = MBB->instr_rend(); I != E; ++I) {
// Don't add WaitStates for parent BUNDLE instructions.
if (I->isBundle())
continue;
- if (IsHazard(&*I))
+ if (IsHazard(*I))
return WaitStates;
if (I->isInlineAsm() || I->isMetaInstruction())
@@ -383,12 +414,11 @@
WaitStates += SIInstrInfo::getNumWaitStates(*I);
- if (IsExpired(&*I, WaitStates))
+ if (IsExpired(*I, WaitStates))
return std::numeric_limits<int>::max();
}
- int MinWaitStates = WaitStates;
- bool Found = false;
+ int MinWaitStates = std::numeric_limits<int>::max();
for (MachineBasicBlock *Pred : MBB->predecessors()) {
if (!Visited.insert(Pred).second)
continue;
@@ -396,25 +426,14 @@
int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
WaitStates, IsExpired, Visited);
- if (W == std::numeric_limits<int>::max())
- continue;
-
- MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
- if (IsExpired(nullptr, MinWaitStates))
- return MinWaitStates;
-
- Found = true;
+ MinWaitStates = std::min(MinWaitStates, W);
}
- if (Found)
- return MinWaitStates;
-
- return std::numeric_limits<int>::max();
+ return MinWaitStates;
}
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
- MachineInstr *MI,
- IsExpiredFn IsExpired) {
+ const MachineInstr *MI, IsExpiredFn IsExpired) {
DenseSet<const MachineBasicBlock *> Visited;
return getWaitStatesSince(IsHazard, MI->getParent(),
std::next(MI->getReverseIterator()),
@@ -423,7 +442,7 @@
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
if (IsHazardRecognizerMode) {
- auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
+ auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
return WaitStates >= Limit;
};
return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
@@ -432,7 +451,7 @@
int WaitStates = 0;
for (MachineInstr *MI : EmittedInstrs) {
if (MI) {
- if (IsHazard(MI))
+ if (IsHazard(*MI))
return WaitStates;
if (MI->isInlineAsm())
@@ -451,8 +470,8 @@
int Limit) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
- return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
+ auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
+ return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
};
return getWaitStatesSince(IsHazardFn, Limit);
@@ -460,8 +479,8 @@
int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
int Limit) {
- auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
- return isSSetReg(MI->getOpcode()) && IsHazard(MI);
+ auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
+ return isSSetReg(MI.getOpcode()) && IsHazard(MI);
};
return getWaitStatesSince(IsHazardFn, Limit);
@@ -560,8 +579,12 @@
// A read of an SGPR by SMRD instruction requires 4 wait states when the
// SGPR was written by a VALU instruction.
int SmrdSgprWaitStates = 4;
- auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
- auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
+ auto IsHazardDefFn = [this](const MachineInstr &MI) {
+ return TII.isVALU(MI);
+ };
+ auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
+ return TII.isSALU(MI);
+ };
bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
@@ -601,9 +624,11 @@
// A read of an SGPR by a VMEM instruction requires 5 wait states when the
// SGPR was written by a VALU Instruction.
const int VmemSgprWaitStates = 5;
- auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
+ auto IsHazardDefFn = [this](const MachineInstr &MI) {
+ return TII.isVALU(MI);
+ };
for (const MachineOperand &Use : VMEM->uses()) {
- if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+ if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
continue;
int WaitStatesNeededForUse =
@@ -622,15 +647,18 @@
int DppVgprWaitStates = 2;
int DppExecWaitStates = 5;
int WaitStatesNeeded = 0;
- auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+ auto IsHazardDefFn = [TII](const MachineInstr &MI) {
+ return TII->isVALU(MI);
+ };
for (const MachineOperand &Use : DPP->uses()) {
if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
continue;
int WaitStatesNeededForUse =
- DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
- [](MachineInstr *) { return true; },
- DppVgprWaitStates);
+ DppVgprWaitStates - getWaitStatesSinceDef(
+ Use.getReg(),
+ [](const MachineInstr &) { return true; },
+ DppVgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
@@ -648,7 +676,9 @@
// v_div_fmas requires 4 wait states after a write to vcc from a VALU
// instruction.
const int DivFMasWaitStates = 4;
- auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+ auto IsHazardDefFn = [TII](const MachineInstr &MI) {
+ return TII->isVALU(MI);
+ };
int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
DivFMasWaitStates);
@@ -660,8 +690,8 @@
unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
const int GetRegWaitStates = 2;
- auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
- return GetRegHWReg == getHWReg(TII, *MI);
+ auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
+ return GetRegHWReg == getHWReg(TII, MI);
};
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
@@ -673,8 +703,8 @@
unsigned HWReg = getHWReg(TII, *SetRegInstr);
const int SetRegWaitStates = ST.getSetRegWaitStates();
- auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
- return HWReg == getHWReg(TII, *MI);
+ auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
+ return HWReg == getHWReg(TII, MI);
};
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
return SetRegWaitStates - WaitStatesNeeded;
@@ -739,13 +769,13 @@
const int VALUWaitStates = 1;
int WaitStatesNeeded = 0;
- if (!TRI->isVGPR(MRI, Def.getReg()))
+ if (!TRI->isVectorRegister(MRI, Def.getReg()))
return WaitStatesNeeded;
Register Reg = Def.getReg();
- auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
- int DataIdx = createsVALUHazard(*MI);
+ auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
+ int DataIdx = createsVALUHazard(MI);
return DataIdx >= 0 &&
- TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
+ TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
};
int WaitStatesNeededForDef =
VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
@@ -808,9 +838,7 @@
return 0;
Register LaneSelectReg = LaneSelectOp->getReg();
- auto IsHazardFn = [TII] (MachineInstr *MI) {
- return TII->isVALU(*MI);
- };
+ auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
const int RWLaneWaitStates = 4;
int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
@@ -826,8 +854,8 @@
const int RFEWaitStates = 1;
- auto IsHazardFn = [TII] (MachineInstr *MI) {
- return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
+ auto IsHazardFn = [TII](const MachineInstr &MI) {
+ return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
};
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
return RFEWaitStates - WaitStatesNeeded;
@@ -836,9 +864,7 @@
int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
const SIInstrInfo *TII = ST.getInstrInfo();
const int SMovRelWaitStates = 1;
- auto IsHazardFn = [TII] (MachineInstr *MI) {
- return TII->isSALU(*MI);
- };
+ auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
SMovRelWaitStates);
}
@@ -856,18 +882,12 @@
return false;
const SIInstrInfo *TII = ST.getInstrInfo();
- auto IsHazardFn = [TII] (MachineInstr *MI) {
- return TII->isVOPC(*MI);
- };
+ auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };
- auto IsExpiredFn = [] (MachineInstr *MI, int) {
- if (!MI)
- return false;
- unsigned Opc = MI->getOpcode();
- return SIInstrInfo::isVALU(*MI) &&
- Opc != AMDGPU::V_NOP_e32 &&
- Opc != AMDGPU::V_NOP_e64 &&
- Opc != AMDGPU::V_NOP_sdwa;
+ auto IsExpiredFn = [](const MachineInstr &MI, int) {
+ unsigned Opc = MI.getOpcode();
+ return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
+ Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
};
if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
@@ -900,13 +920,14 @@
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
- if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
- !SIInstrInfo::isFLAT(*I))
+ auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
+ if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
+ !SIInstrInfo::isFLAT(I))
return false;
for (const MachineOperand &Def : MI->defs()) {
- MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
+ const MachineOperand *Op =
+ I.findRegisterUseOperand(Def.getReg(), false, TRI);
if (!Op)
continue;
return true;
@@ -914,12 +935,12 @@
return false;
};
- auto IsExpiredFn = [](MachineInstr *MI, int) {
- return MI && (SIInstrInfo::isVALU(*MI) ||
- (MI->getOpcode() == AMDGPU::S_WAITCNT &&
- !MI->getOperand(0).getImm()) ||
- (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- MI->getOperand(0).getImm() == 0xffe3));
+ auto IsExpiredFn = [](const MachineInstr &MI, int) {
+ return SIInstrInfo::isVALU(MI) ||
+ (MI.getOpcode() == AMDGPU::S_WAITCNT &&
+ !MI.getOperand(0).getImm()) ||
+ (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ MI.getOperand(0).getImm() == 0xffe3);
};
if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
@@ -968,43 +989,41 @@
return false;
const Register SDSTReg = SDST->getReg();
- auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
- return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
+ auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
+ return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
};
- auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
- if (MI) {
- if (TII->isSALU(*MI)) {
- switch (MI->getOpcode()) {
- case AMDGPU::S_SETVSKIP:
- case AMDGPU::S_VERSION:
- case AMDGPU::S_WAITCNT_VSCNT:
- case AMDGPU::S_WAITCNT_VMCNT:
- case AMDGPU::S_WAITCNT_EXPCNT:
- // These instructions cannot not mitigate the hazard.
+ auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
+ if (TII->isSALU(MI)) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_SETVSKIP:
+ case AMDGPU::S_VERSION:
+ case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_VMCNT:
+ case AMDGPU::S_WAITCNT_EXPCNT:
+ // These instructions cannot mitigate the hazard.
+ return false;
+ case AMDGPU::S_WAITCNT_LGKMCNT:
+ // Reducing lgkmcnt count to 0 always mitigates the hazard.
+ return (MI.getOperand(1).getImm() == 0) &&
+ (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ case AMDGPU::S_WAITCNT: {
+ const int64_t Imm = MI.getOperand(0).getImm();
+ AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
+ return (Decoded.LgkmCnt == 0);
+ }
+ default:
+ // SOPP instructions cannot mitigate the hazard.
+ if (TII->isSOPP(MI))
return false;
- case AMDGPU::S_WAITCNT_LGKMCNT:
- // Reducing lgkmcnt count to 0 always mitigates the hazard.
- return (MI->getOperand(1).getImm() == 0) &&
- (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
- case AMDGPU::S_WAITCNT: {
- const int64_t Imm = MI->getOperand(0).getImm();
- AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
- return (Decoded.LgkmCnt == 0);
- }
- default:
- // SOPP instructions cannot mitigate the hazard.
- if (TII->isSOPP(*MI))
- return false;
- // At this point the SALU can be assumed to mitigate the hazard
- // because either:
- // (a) it is independent of the at risk SMEM (breaking chain),
- // or
- // (b) it is dependent on the SMEM, in which case an appropriate
- // s_waitcnt lgkmcnt _must_ exist between it and the at risk
- // SMEM instruction.
- return true;
- }
+ // At this point the SALU can be assumed to mitigate the hazard
+ // because either:
+ // (a) it is independent of the at risk SMEM (breaking chain),
+ // or
+ // (b) it is dependent on the SMEM, in which case an appropriate
+ // s_waitcnt lgkmcnt _must_ exist between it and the at risk
+ // SMEM instruction.
+ return true;
}
}
return false;
@@ -1028,25 +1047,23 @@
if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
return false;
- auto IsHazardFn = [TRI] (MachineInstr *I) {
- if (SIInstrInfo::isVALU(*I))
+ auto IsHazardFn = [TRI](const MachineInstr &I) {
+ if (SIInstrInfo::isVALU(I))
return false;
- return I->readsRegister(AMDGPU::EXEC, TRI);
+ return I.readsRegister(AMDGPU::EXEC, TRI);
};
const SIInstrInfo *TII = ST.getInstrInfo();
- auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
- if (!MI)
- return false;
- if (SIInstrInfo::isVALU(*MI)) {
- if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
+ auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
+ if (SIInstrInfo::isVALU(MI)) {
+ if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
return true;
- for (auto MO : MI->implicit_operands())
+ for (auto MO : MI.implicit_operands())
if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
return true;
}
- if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
+ if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
return true;
return false;
};
@@ -1061,52 +1078,71 @@
return true;
}
-bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
+static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
+ const GCNSubtarget &ST) {
if (!ST.hasLdsBranchVmemWARHazard())
return false;
- auto IsHazardInst = [] (const MachineInstr *MI) {
- if (SIInstrInfo::isDS(*MI))
+ // Check if the necessary condition for the hazard is met: both LDS and VMEM
+ // instructions need to appear in the same function.
+ bool HasLds = false;
+ bool HasVmem = false;
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ HasLds |= SIInstrInfo::isDS(MI);
+ HasVmem |=
+ SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
+ if (HasLds && HasVmem)
+ return true;
+ }
+ }
+ return false;
+}
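A minimal self-contained sketch of the scan above, with illustrative names only (Kind and shouldRunFixup are not part of the pass): the per-branch WAR hazard fixup is worth running only when the function contains at least one LDS (DS) instruction and at least one VMEM or segment-specific FLAT instruction, so a single early-exiting walk over the instructions decides this once per function.

#include <vector>

// Toy classification standing in for SIInstrInfo::isDS / isVMEM /
// isSegmentSpecificFLAT in the real code.
enum class Kind { LDS, VMEM, Other };

bool shouldRunFixup(const std::vector<Kind> &Instrs) {
  bool HasLds = false, HasVmem = false;
  for (Kind K : Instrs) {
    HasLds |= (K == Kind::LDS);
    HasVmem |= (K == Kind::VMEM);
    if (HasLds && HasVmem)
      return true; // both kinds present, the hazard is possible
  }
  return false; // hazard cannot occur, skip the fixup entirely
}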
+
+bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
+ if (!RunLdsBranchVmemWARHazardFixup)
+ return false;
+
+ assert(ST.hasLdsBranchVmemWARHazard());
+
+ auto IsHazardInst = [](const MachineInstr &MI) {
+ if (SIInstrInfo::isDS(MI))
return 1;
- if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
+ if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
return 2;
return 0;
};
- auto InstType = IsHazardInst(MI);
+ auto InstType = IsHazardInst(*MI);
if (!InstType)
return false;
- auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
- return I && (IsHazardInst(I) ||
- (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
- I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
- !I->getOperand(1).getImm()));
+ auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
+ return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+ I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+ !I.getOperand(1).getImm());
};
- auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
- if (!I->isBranch())
+ auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
+ if (!I.isBranch())
return false;
- auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
+ auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
auto InstType2 = IsHazardInst(I);
return InstType2 && InstType != InstType2;
};
- auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
- if (!I)
- return false;
-
+ auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
auto InstType2 = IsHazardInst(I);
if (InstType == InstType2)
return true;
- return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
- I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
- !I->getOperand(1).getImm();
+ return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+ I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+ !I.getOperand(1).getImm();
};
- return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
+ return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
std::numeric_limits<int>::max();
};
@@ -1137,12 +1173,12 @@
if (!Offset || (Offset->getImm() & 6) == 0)
return 0;
- auto IsHazardFn = [TII] (MachineInstr *I) {
- if (!SIInstrInfo::isMIMG(*I))
+ auto IsHazardFn = [TII](const MachineInstr &I) {
+ if (!SIInstrInfo::isMIMG(I))
return false;
- const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
- TII->getInstSizeInBytes(*I) >= 16;
+ TII->getInstSizeInBytes(I) >= 16;
};
return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
@@ -1154,17 +1190,17 @@
if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
return 0;
- auto IsHazardFn = [] (MachineInstr *I) {
- if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
+ auto IsHazardFn = [](const MachineInstr &I) {
+ if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
return false;
- return SIInstrInfo::isFPAtomic(*I);
+ return SIInstrInfo::isFPAtomic(I);
};
- auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
- if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
+ auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
+ if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
return true;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case AMDGPU::S_WAITCNT:
case AMDGPU::S_WAITCNT_VSCNT:
case AMDGPU::S_WAITCNT_VMCNT:
@@ -1179,7 +1215,6 @@
return false;
};
-
return FPAtomicToDenormModeWaitStates -
::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}
@@ -1187,11 +1222,15 @@
int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
assert(SIInstrInfo::isMAI(*MI));
+ return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
+}
+
+int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
int WaitStatesNeeded = 0;
unsigned Opc = MI->getOpcode();
- auto IsVALUFn = [] (MachineInstr *MI) {
- return SIInstrInfo::isVALU(*MI);
+ auto IsVALUFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI);
};
if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
@@ -1220,10 +1259,10 @@
}
}
- auto IsMFMAFn = [] (MachineInstr *MI) {
- return SIInstrInfo::isMAI(*MI) &&
- MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
- MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ auto IsMFMAFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isMAI(MI) &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
};
for (const MachineOperand &Op : MI->explicit_operands()) {
@@ -1245,15 +1284,15 @@
Register Reg = Op.getReg();
unsigned HazardDefLatency = 0;
- auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
- (MachineInstr *MI) {
+ auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
+ this](const MachineInstr &MI) {
if (!IsMFMAFn(MI))
return false;
- Register DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
if (DstReg == Reg)
return false;
- HazardDefLatency = std::max(HazardDefLatency,
- TSchedModel.computeInstrLatency(MI));
+ HazardDefLatency =
+ std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
return TRI.regsOverlap(DstReg, Reg);
};
@@ -1292,10 +1331,10 @@
if (WaitStatesNeeded == MaxWaitStates)
return WaitStatesNeeded; // Early exit.
- auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
- if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
+ auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
+ if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
return false;
- Register DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
return TRI.regsOverlap(Reg, DstReg);
};
@@ -1324,13 +1363,13 @@
Register DstReg = MI->getOperand(0).getReg();
unsigned HazardDefLatency = 0;
- auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
- (MachineInstr *MI) {
+ auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
+ this](const MachineInstr &MI) {
if (!IsMFMAFn(MI))
return false;
- Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
- HazardDefLatency = std::max(HazardDefLatency,
- TSchedModel.computeInstrLatency(MI));
+ Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
+ HazardDefLatency =
+ std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
return TRI.regsOverlap(Reg, DstReg);
};
@@ -1353,14 +1392,171 @@
return WaitStatesNeeded;
}
+int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
+ int WaitStatesNeeded = 0;
+ unsigned Opc = MI->getOpcode();
+
+ auto IsMFMAFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isMAI(MI) &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ };
+
+ auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
+ };
+
+ auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
+ };
+
+ if (!IsMFMAFn(*MI))
+ return WaitStatesNeeded;
+
+ const int VALUWritesExecWaitStates = 4;
+ int WaitStatesNeededForUse = VALUWritesExecWaitStates -
+ getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
+ VALUWritesExecWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+
+ // Loop for both DGEMM and S/HGEMM 2nd instruction.
+ for (const MachineOperand &Use : MI->explicit_uses()) {
+ const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
+ const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
+ const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
+ const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
+ const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
+ const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
+ const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
+ const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
+ const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
+ const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
+ const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
+ const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
+ const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
+ const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
+ const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
+ const int MaxWaitStates = 19;
+
+ if (!Use.isReg())
+ continue;
+ unsigned Reg = Use.getReg();
+ bool FullReg;
+ const MachineInstr *MI1;
+
+ auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
+ this](const MachineInstr &MI) {
+ if (!IsMFMAFn(MI))
+ return false;
+ if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
+ return false;
+ Register DstReg = MI.getOperand(0).getReg();
+ FullReg = (DstReg == Reg);
+ MI1 = &MI;
+ return TRI.regsOverlap(DstReg, Reg);
+ };
+
+ WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
+ getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn,
+ MaxWaitStates);
+ if (NumWaitStates == std::numeric_limits<int>::max())
+ continue;
+
+ int OpNo = MI->getOperandNo(&Use);
+ unsigned Opc1 = MI1->getOpcode();
+ int NeedWaitStates = 0;
+ if (OpNo == SrcCIdx) {
+ if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
+ NeedWaitStates = 0;
+ } else if (FullReg) {
+ if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
+ Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
+ (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
+ Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
+ NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
+ } else {
+ switch (Opc1) {
+ case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
+ if (!isXDL(ST, *MI))
+ NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
+ break;
+ case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
+ case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
+ if (!isXDL(ST, *MI))
+ NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
+ break;
+ default:
+ switch (TSchedModel.computeInstrLatency(MI1)) {
+ case 2:
+ NeedWaitStates = isDGEMM(Opc)
+ ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
+ : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
+ break;
+ case 8:
+ NeedWaitStates = isDGEMM(Opc)
+ ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
+ : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default:
+ NeedWaitStates = isDGEMM(Opc)
+ ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
+ : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
+ }
+ }
+ }
+ } else {
+ switch (Opc1) {
+ case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
+ NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
+ break;
+ case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
+ case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
+ NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
+ break;
+ default:
+ switch (TSchedModel.computeInstrLatency(MI1)) {
+ case 2:
+ NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
+ break;
+ case 8:
+ NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default:
+ NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
+ }
+ }
+ }
+ if (WaitStatesNeeded >= NeedWaitStates)
+ continue;
+
+ WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ break;
+ }
+
+ return WaitStatesNeeded;
+}
+
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
- if (!ST.hasMAIInsts())
+ // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards().
+ if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
return 0;
int WaitStatesNeeded = 0;
- auto IsAccVgprReadFn = [] (MachineInstr *MI) {
- return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
+ auto IsAccVgprReadFn = [](const MachineInstr &MI) {
+ return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
};
for (const MachineOperand &Op : MI->explicit_uses()) {
@@ -1380,12 +1576,12 @@
if (WaitStatesNeeded == MaxWaitStates)
return WaitStatesNeeded; // Early exit.
- auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
- if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
- MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
+ auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
+ if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
return false;
- auto IsVALUFn = [] (MachineInstr *MI) {
- return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
+ auto IsVALUFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
};
return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
std::numeric_limits<int>::max();
@@ -1399,22 +1595,252 @@
return WaitStatesNeeded;
}
+int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
+ if (!ST.hasGFX90AInsts())
+ return 0;
+
+ auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
+ return SIInstrInfo::isMAI(MI) &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ };
+
+ auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
+ return isDGEMM(MI.getOpcode());
+ };
+
+ // This is checked in checkMAIHazards90A()
+ if (IsMFMAFn(*MI))
+ return 0;
+
+ int WaitStatesNeeded = 0;
+
+ bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI) ||
+ SIInstrInfo::isDS(*MI) ||
+ SIInstrInfo::isEXP(*MI);
+ bool IsVALU = SIInstrInfo::isVALU(*MI);
+
+ const MachineInstr *MFMA = nullptr;
+ unsigned Reg;
+ auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA,
+ this](const MachineInstr &MI) {
+ if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
+ return false;
+ if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
+ return false;
+ MFMA = &MI;
+ return true;
+ };
+
+ const MachineInstr *DOT = nullptr;
+ auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
+ if (!SIInstrInfo::isDOT(MI) ||
+ !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
+ return false;
+ DOT = &MI;
+ return true;
+ };
+
+ int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src2);
+
+ if (IsMemOrExport || IsVALU) {
+ const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
+ const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
+ const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
+ const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
+ const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
+ const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
+ const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
+ const int DotWriteSameDotReadSrcAB = 3;
+ const int DotWriteDifferentVALURead = 3;
+ const int MaxWaitStates = 19;
+
+ for (const MachineOperand &Use : MI->explicit_uses()) {
+ if (!Use.isReg())
+ continue;
+ Reg = Use.getReg();
+
+ DOT = nullptr;
+ int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
+ MaxWaitStates);
+ if (DOT) {
+ int NeedWaitStates = 0;
+ if (DOT->getOpcode() == MI->getOpcode()) {
+ if (&Use - &MI->getOperand(0) != SrcCIdx)
+ NeedWaitStates = DotWriteSameDotReadSrcAB;
+ } else {
+ NeedWaitStates = DotWriteDifferentVALURead;
+ }
+
+ int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ MFMA = nullptr;
+ WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
+ MaxWaitStates);
+ if (!MFMA)
+ continue;
+
+ unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
+ int NeedWaitStates = MaxWaitStates;
+ switch (HazardDefLatency) {
+ case 2:
+ NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
+ break;
+ case 4:
+ assert(isDGEMM(MFMA->getOpcode()));
+ NeedWaitStates =
+ IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
+ : DMFMA4x4WriteVgprVALUReadWaitStates;
+ break;
+ case 8:
+ NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default:
+ NeedWaitStates =
+ isDGEMM(MFMA->getOpcode())
+ ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
+ : DMFMA16x16WriteVgprVALUReadWaitStates
+ : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
+ break;
+ }
+
+ int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ break;
+ }
+ }
+
+ unsigned Opc = MI->getOpcode();
+ const int DMFMAToFMA64WaitStates = 2;
+ if ((Opc == AMDGPU::V_FMA_F64_e64 ||
+ Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
+ Opc == AMDGPU::V_FMAC_F64_dpp) &&
+ WaitStatesNeeded < DMFMAToFMA64WaitStates) {
+ int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
+ getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ if (!IsVALU && !IsMemOrExport)
+ return WaitStatesNeeded;
+
+ for (const MachineOperand &Def : MI->defs()) {
+ const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
+ const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
+ const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
+ const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
+ const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
+ const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
+ const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
+ const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
+ const int DotWriteDifferentVALUWrite = 3;
+ const int MaxWaitStates = 19;
+ const int MaxWarWaitStates = 15;
+
+ Reg = Def.getReg();
+
+ DOT = nullptr;
+ int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
+ MaxWaitStates);
+ if (DOT && DOT->getOpcode() != MI->getOpcode())
+ WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
+ WaitStatesSinceDef);
+
+ MFMA = nullptr;
+ WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
+ MaxWaitStates);
+ if (MFMA) {
+ int NeedWaitStates = MaxWaitStates;
+ switch (TSchedModel.computeInstrLatency(MFMA)) {
+ case 2:
+ NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
+ break;
+ case 4:
+ assert(isDGEMM(MFMA->getOpcode()));
+ NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
+ break;
+ case 8:
+ NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default:
+ NeedWaitStates = isDGEMM(MFMA->getOpcode())
+ ? DMFMA16x16WriteVgprVALUWriteWaitStates
+ : SMFMA32x32WriteVgprVALUWawWaitStates;
+ break;
+ }
+
+ int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ break;
+ }
+
+ auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
+ this](const MachineInstr &MI) {
+ if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
+ !MI.readsRegister(Reg, &TRI))
+ return false;
+
+ const MachineOperand *SrcC =
+ TII.getNamedOperand(MI, AMDGPU::OpName::src2);
+ assert(SrcC);
+ if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
+ return false;
+
+ MFMA = &MI;
+ return true;
+ };
+
+ MFMA = nullptr;
+ int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
+ MaxWarWaitStates);
+ if (!MFMA)
+ continue;
+
+ unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
+ int NeedWaitStates = MaxWaitStates;
+ switch (HazardDefLatency) {
+ case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
+ break;
+ case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
+ break;
+ }
+
+ int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ return WaitStatesNeeded;
+}
+
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
if (!SU->isInstr())
return false;
- MachineInstr *MAI = nullptr;
- auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
+ const MachineInstr *MAI = nullptr;
+ auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
MAI = nullptr;
- if (SIInstrInfo::isMAI(*MI) &&
- MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
- MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
- MAI = MI;
+ if (SIInstrInfo::isMAI(MI) &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
+ MAI = &MI;
return MAI != nullptr;
};
MachineInstr *MI = SU->getInstr();
- if (IsMFMAFn(MI)) {
+ if (IsMFMAFn(*MI)) {
int W = getWaitStatesSince(IsMFMAFn, 16);
if (MAI)
return W < (int)TSchedModel.computeInstrLatency(MAI);
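A minimal, self-contained sketch of the wait-state arithmetic these checks share; waitStatesNeeded and its pair-of-ints input are illustrative stand-ins, not LLVM API. Each check pairs a required distance (NeedWaitStates) with the distance already observed since the hazardous def (WaitStatesSinceDef), subtracts, and folds the per-operand results with std::max; a def that was never found inside the search window reports INT_MAX and therefore contributes nothing.

#include <algorithm>
#include <limits>
#include <utility>
#include <vector>

// Each pair is (NeedWaitStates, WaitStatesSinceDef) for one hazardous operand.
int waitStatesNeeded(const std::vector<std::pair<int, int>> &Uses) {
  int WaitStatesNeeded = 0;
  for (const auto &U : Uses) {
    int NeedWaitStates = U.first;      // distance the hazard requires
    int WaitStatesSinceDef = U.second; // distance already elapsed
    if (WaitStatesSinceDef == std::numeric_limits<int>::max())
      continue; // no hazardous def found within the search limit
    WaitStatesNeeded =
        std::max(WaitStatesNeeded, NeedWaitStates - WaitStatesSinceDef);
  }
  return WaitStatesNeeded; // wait states that still need to be inserted
}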
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 447ca82..162121c 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -32,7 +32,7 @@
class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
public:
- typedef function_ref<bool(MachineInstr *)> IsHazardFn;
+ typedef function_ref<bool(const MachineInstr &)> IsHazardFn;
private:
// Distinguish if we are called from scheduler or hazard recognizer
@@ -48,6 +48,7 @@
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
TargetSchedModel TSchedModel;
+ bool RunLdsBranchVmemWARHazardFixup;
/// RegUnits of uses in the current soft memory clause.
BitVector ClauseUses;
@@ -94,6 +95,9 @@
bool fixLdsBranchVmemWARHazard(MachineInstr *MI);
int checkMAIHazards(MachineInstr *MI);
+ int checkMAIHazards908(MachineInstr *MI);
+ int checkMAIHazards90A(MachineInstr *MI);
+ int checkMAIVALUHazards(MachineInstr *MI);
int checkMAILdStHazards(MachineInstr *MI);
public:
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
index fc7105b..9f98f9a 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -190,6 +190,14 @@
if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
return NSA_Status::FIXED;
+ // InlineSpiller does not call LRM::assign() after an LI split, leaving
+ // it in an inconsistent state, so we cannot call LRM::unassign().
+ // See llvm bug #48911.
+ // Skip reassign if a register has originated from such split.
+ // FIXME: Remove the workaround when bug #48911 is fixed.
+ if (VRM->getPreSplitReg(Reg))
+ return NSA_Status::FIXED;
+
const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
new file mode 100644
index 0000000..a51399d
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -0,0 +1,162 @@
+//===-- GCNPreRAOptimizations.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass combines split register tuple initialization into a single pseudo:
+///
+/// undef %0.sub1:sreg_64 = S_MOV_B32 1
+/// %0.sub0:sreg_64 = S_MOV_B32 2
+/// =>
+/// %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x200000001
+///
+/// This is done to allow rematerialization of a value instead of spilling. The
+/// pass is supposed to run after the register coalescer, so the coalescer can
+/// do its job first, and before actual register allocation, so the combined
+/// value can be rematerialized.
+///
+/// Right now the pass only handles 64-bit SGPRs with immediate initializers,
+/// although the same should be possible for other register classes and
+/// instructions if necessary.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
+
+namespace {
+
+class GCNPreRAOptimizations : public MachineFunctionPass {
+private:
+ const SIInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
+
+ bool processReg(Register Reg);
+
+public:
+ static char ID;
+
+ GCNPreRAOptimizations() : MachineFunctionPass(ID) {
+ initializeGCNPreRAOptimizationsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Pre-RA optimizations";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(GCNPreRAOptimizations, DEBUG_TYPE,
+ "AMDGPU Pre-RA optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(GCNPreRAOptimizations, DEBUG_TYPE, "Pre-RA optimizations",
+ false, false)
+
+char GCNPreRAOptimizations::ID = 0;
+
+char &llvm::GCNPreRAOptimizationsID = GCNPreRAOptimizations::ID;
+
+FunctionPass *llvm::createGCNPreRAOptimizationsPass() {
+ return new GCNPreRAOptimizations();
+}
+
+bool GCNPreRAOptimizations::processReg(Register Reg) {
+ MachineInstr *Def0 = nullptr;
+ MachineInstr *Def1 = nullptr;
+ uint64_t Init = 0;
+
+ for (MachineInstr &I : MRI->def_instructions(Reg)) {
+ if (I.getOpcode() != AMDGPU::S_MOV_B32 || I.getOperand(0).getReg() != Reg ||
+ !I.getOperand(1).isImm() || I.getNumOperands() != 2)
+ return false;
+
+ switch (I.getOperand(0).getSubReg()) {
+ default:
+ return false;
+ case AMDGPU::sub0:
+ if (Def0)
+ return false;
+ Def0 = &I;
+ Init |= I.getOperand(1).getImm() & 0xffffffff;
+ break;
+ case AMDGPU::sub1:
+ if (Def1)
+ return false;
+ Def1 = &I;
+ Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32;
+ break;
+ }
+ }
+
+ if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Combining:\n " << *Def0 << " " << *Def1
+ << " =>\n");
+
+ if (SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*Def1),
+ LIS->getInstructionIndex(*Def0)))
+ std::swap(Def0, Def1);
+
+ LIS->RemoveMachineInstrFromMaps(*Def0);
+ LIS->RemoveMachineInstrFromMaps(*Def1);
+ auto NewI = BuildMI(*Def0->getParent(), *Def0, Def0->getDebugLoc(),
+ TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), Reg)
+ .addImm(Init);
+
+ Def0->eraseFromParent();
+ Def1->eraseFromParent();
+ LIS->InsertMachineInstrInMaps(*NewI);
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+
+ LLVM_DEBUG(dbgs() << " " << *NewI);
+
+ return true;
+}
+
+bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ MRI = &MF.getRegInfo();
+ LIS = &getAnalysis<LiveIntervals>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ bool Changed = false;
+
+ for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (!LIS->hasInterval(Reg))
+ continue;
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ if (RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC))
+ continue;
+ Changed |= processReg(Reg);
+ }
+
+ return Changed;
+}
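A small sketch of the immediate merge performed by processReg(); combineHalves is an illustrative helper, not part of the pass. The sub0 S_MOV_B32 supplies the low 32 bits of the S_MOV_B64_IMM_PSEUDO operand and the sub1 S_MOV_B32 the high 32 bits, which reproduces the 0x200000001 value from the file comment.

#include <cstdint>

// Build the 64-bit immediate the same way processReg() accumulates Init.
constexpr uint64_t combineHalves(uint32_t Sub0Imm, uint32_t Sub1Imm) {
  return (static_cast<uint64_t>(Sub1Imm) << 32) | Sub0Imm;
}

// sub0 = 2, sub1 = 1, matching the example in the header comment.
static_assert(combineHalves(2, 1) == 0x200000001ULL,
              "low half from sub0, high half from sub1");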
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 7447ec2..3a68ed1 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -184,6 +184,10 @@
FeatureISAVersion9_0_9.Features
>;
+def : ProcessorModel<"gfx90a", SIDPFullSpeedModel,
+ FeatureISAVersion9_0_A.Features
+>;
+
def : ProcessorModel<"gfx90c", SIQuarterSpeedModel,
FeatureISAVersion9_0_C.Features
>;
@@ -204,6 +208,10 @@
FeatureISAVersion10_1_2.Features
>;
+def : ProcessorModel<"gfx1013", GFX10SpeedModel,
+ FeatureISAVersion10_1_3.Features
+>;
+
def : ProcessorModel<"gfx1030", GFX10SpeedModel,
FeatureISAVersion10_3_0.Features
>;
@@ -219,3 +227,11 @@
def : ProcessorModel<"gfx1033", GFX10SpeedModel,
FeatureISAVersion10_3_0.Features
>;
+
+def : ProcessorModel<"gfx1034", GFX10SpeedModel,
+ FeatureISAVersion10_3_0.Features
+>;
+
+def : ProcessorModel<"gfx1035", GFX10SpeedModel,
+ FeatureISAVersion10_3_0.Features
+>;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
deleted file mode 100644
index a12e9ab..0000000
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ /dev/null
@@ -1,862 +0,0 @@
-//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Try to reassign registers on GFX10+ to reduce register bank
-/// conflicts.
-///
-/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
-/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
-/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
-/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
-///
-/// The shader can read one dword from each of these banks once per cycle.
-/// If an instruction has to read more register operands from the same bank
-/// an additional cycle is needed. HW attempts to pre-load registers through
-/// input operand gathering, but a stall cycle may occur if that fails. For
-/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
-/// potentially incuring 2 stall cycles.
-///
-/// The pass tries to reassign registers to reduce bank conflicts.
-///
-/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
-/// that 4 has to be subtracted from an SGPR bank number to get the real value.
-/// This also corresponds to bit numbers in bank masks used in the pass.
-///
-//===----------------------------------------------------------------------===//
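To make the bank layout described in the comment above concrete, a tiny illustrative sketch (vgprBank and sgprBank are ad-hoc names, not the pass code): VGPR banks are assigned round-robin modulo 4, SGPR banks cover register pairs modulo 8, and SGPR bank numbers are offset by 4 so both kinds fit into one bank mask.

// Illustrative bank numbering matching the description above.
constexpr unsigned NumVgprBanks = 4;
constexpr unsigned NumSgprBanks = 8;
constexpr unsigned SgprBankOffset = NumVgprBanks;

constexpr unsigned vgprBank(unsigned VgprNo) { return VgprNo % NumVgprBanks; }
constexpr unsigned sgprBank(unsigned SgprNo) {
  return (SgprNo / 2) % NumSgprBanks + SgprBankOffset;
}

static_assert(vgprBank(0) == 0 && vgprBank(4) == 0 && vgprBank(8) == 0,
              "v0, v4, v8 all live in VGPR bank 0");
static_assert(sgprBank(0) == sgprBank(16) && sgprBank(2) == sgprBank(18),
              "s0:s1 and s16:s17 share a bank, as do s2:s3 and s18:s19");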
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/LiveRegMatrix.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/InitializePasses.h"
-
-using namespace llvm;
-
-static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
- cl::desc("Verify stall cycles in the regbanks reassign pass"),
- cl::value_desc("0|1|2"),
- cl::init(0), cl::Hidden);
-
-#define DEBUG_TYPE "amdgpu-regbanks-reassign"
-
-#define NUM_VGPR_BANKS 4
-#define NUM_SGPR_BANKS 8
-#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
-#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
-#define VGPR_BANK_MASK 0xf
-#define SGPR_BANK_MASK 0xff0
-#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
-
-STATISTIC(NumStallsDetected,
- "Number of operand read stalls detected");
-STATISTIC(NumStallsRecovered,
- "Number of operand read stalls recovered");
-
-namespace {
-
-class GCNRegBankReassign : public MachineFunctionPass {
-
- class OperandMask {
- public:
- OperandMask(unsigned r, unsigned s, unsigned m)
- : Reg(r), SubReg(s), Mask(m) {}
- Register Reg;
- unsigned SubReg;
- unsigned Mask;
- };
-
- class Candidate {
- public:
- Candidate(MachineInstr *mi, Register reg, unsigned subreg,
- unsigned freebanks)
- : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void dump(const GCNRegBankReassign *P) const {
- MI->dump();
- dbgs() << P->printReg(Reg) << " to banks ";
- dumpFreeBanks(FreeBanks);
- dbgs() << '\n';
- }
-#endif
-
- MachineInstr *MI;
- Register Reg;
- unsigned SubReg;
- unsigned FreeBanks;
- };
-
- class CandidateList : public std::map<unsigned, std::list<Candidate>> {
- public:
- void push(unsigned Weight, const Candidate&& C) {
- operator[](Weight).push_front(C);
- }
-
- Candidate &back() {
- return rbegin()->second.back();
- }
-
- void pop_back() {
- rbegin()->second.pop_back();
- if (rbegin()->second.empty())
- erase(rbegin()->first);
- }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void dump(const GCNRegBankReassign *P) const {
- dbgs() << "\nCandidates:\n\n";
- for (auto &B : *this) {
- dbgs() << " Weight " << B.first << ":\n";
- for (auto &C : B.second)
- C.dump(P);
- }
- dbgs() << "\n\n";
- }
-#endif
- };
-
-public:
- static char ID;
-
-public:
- GCNRegBankReassign() : MachineFunctionPass(ID) {
- initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override { return "GCN RegBank Reassign"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineLoopInfo>();
- AU.addRequired<LiveIntervals>();
- AU.addRequired<VirtRegMap>();
- AU.addRequired<LiveRegMatrix>();
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
-private:
- const GCNSubtarget *ST;
-
- const MachineRegisterInfo *MRI;
-
- const SIRegisterInfo *TRI;
-
- MachineLoopInfo *MLI;
-
- VirtRegMap *VRM;
-
- LiveRegMatrix *LRM;
-
- LiveIntervals *LIS;
-
- unsigned MaxNumVGPRs;
-
- unsigned MaxNumSGPRs;
-
- BitVector RegsUsed;
-
- SmallVector<OperandMask, 8> OperandMasks;
-
- CandidateList Candidates;
-
- const MCPhysReg *CSRegs;
-
- // Returns bank for a phys reg.
- unsigned getPhysRegBank(Register Reg, unsigned SubReg) const;
-
- // Return a bit set for each register bank used. 4 banks for VGPRs and
- // 8 banks for SGPRs.
- // Registers already processed and recorded in RegsUsed are excluded.
- // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
- uint32_t getRegBankMask(Register Reg, unsigned SubReg, int Bank);
-
- // Analyze one instruction returning the number of stalls and a mask of the
- // banks used by all operands.
- // If Reg and Bank are provided, assume all uses of Reg will be replaced with
- // a register chosen from Bank.
- std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
- Register Reg = Register(),
- unsigned SubReg = 0, int Bank = -1);
-
- // Return true if register is regular VGPR or SGPR or their tuples.
- // Returns false for special registers like m0, vcc etc.
- bool isReassignable(Register Reg) const;
-
- // Check if registers' defs are old and may be pre-loaded.
- // Returns 0 if both registers are old enough, 1 or 2 if one or both
- // registers will not likely be pre-loaded.
- unsigned getOperandGatherWeight(const MachineInstr& MI,
- Register Reg1,
- Register Reg2,
- unsigned StallCycles) const;
-
-
- // Find all bank bits in UsedBanks where Mask can be relocated to.
- unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
-
- // Find all bank bits in UsedBanks where Mask can be relocated to.
- // Bank is relative to the register and not its subregister component.
- // Returns 0 is a register is not reassignable.
- unsigned getFreeBanks(Register Reg, unsigned SubReg, unsigned Mask,
- unsigned UsedBanks) const;
-
- // Add cadidate instruction to the work list.
- void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
- unsigned StallCycles);
-
- // Collect cadidate instructions across function. Returns a number stall
- // cycles detected. Only counts stalls if Collect is false.
- unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
-
- // Remove all candidates that read specified register.
- void removeCandidates(Register Reg);
-
- // Compute stalls within the uses of SrcReg replaced by a register from
- // Bank. If Bank is -1 does not perform substitution. If Collect is set
- // candidates are collected and added to work list.
- unsigned computeStallCycles(Register SrcReg,
- Register Reg = Register(),
- unsigned SubReg = 0, int Bank = -1,
- bool Collect = false);
-
- // Search for a register in Bank unused within LI.
- // Returns phys reg or NoRegister.
- MCRegister scavengeReg(LiveInterval &LI, unsigned Bank,
- unsigned SubReg) const;
-
- // Try to reassign candidate. Returns number or stall cycles saved.
- unsigned tryReassign(Candidate &C);
-
- bool verifyCycles(MachineFunction &MF,
- unsigned OriginalCycles, unsigned CyclesSaved);
-
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-public:
- Printable printReg(Register Reg, unsigned SubReg = 0) const {
- return Printable([Reg, SubReg, this](raw_ostream &OS) {
- if (Reg.isPhysical()) {
- OS << llvm::printReg(Reg, TRI);
- return;
- }
- if (!VRM->isAssignedReg(Reg))
- OS << "<unassigned> " << llvm::printReg(Reg, TRI);
- else
- OS << llvm::printReg(Reg, TRI) << '('
- << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
- if (SubReg)
- OS << ':' << TRI->getSubRegIndexName(SubReg);
- });
- }
-
- static Printable printBank(unsigned Bank) {
- return Printable([Bank](raw_ostream &OS) {
- OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
- });
- }
-
- static void dumpFreeBanks(unsigned FreeBanks) {
- for (unsigned L = 0; L < NUM_BANKS; ++L)
- if (FreeBanks & (1 << L))
- dbgs() << printBank(L) << ' ';
- }
-#endif
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
-INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
-INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
- false, false)
-
-
-char GCNRegBankReassign::ID = 0;
-
-char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
-
-unsigned GCNRegBankReassign::getPhysRegBank(Register Reg,
- unsigned SubReg) const {
- assert(Reg.isPhysical());
-
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- unsigned Size = TRI->getRegSizeInBits(*RC);
- if (Size == 16)
- Reg = TRI->get32BitRegister(Reg);
- else if (Size > 32) {
- if (SubReg) {
- const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg);
- Reg = TRI->getSubReg(Reg, SubReg);
- if (TRI->getRegSizeInBits(*SubRC) > 32)
- Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
- } else {
- Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
- }
- }
-
- if (TRI->hasVGPRs(RC)) {
- unsigned RegNo = Reg - AMDGPU::VGPR0;
- return RegNo % NUM_VGPR_BANKS;
- }
-
- unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
- return RegNo % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
-}
-
-uint32_t GCNRegBankReassign::getRegBankMask(Register Reg, unsigned SubReg,
- int Bank) {
- if (Reg.isVirtual()) {
- if (!VRM->isAssignedReg(Reg))
- return 0;
-
- Reg = VRM->getPhys(Reg);
- if (!Reg)
- return 0;
- if (SubReg)
- Reg = TRI->getSubReg(Reg, SubReg);
- }
-
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- unsigned Size = TRI->getRegSizeInBits(*RC);
-
- if (Size == 16) {
- Reg = TRI->get32BitRegister(Reg);
- Size = 1;
- } else {
- Size /= 32;
- if (Size > 1)
- Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
- }
-
- if (TRI->hasVGPRs(RC)) {
- // VGPRs have 4 banks assigned in a round-robin fashion.
- unsigned RegNo = Reg - AMDGPU::VGPR0;
- uint32_t Mask = maskTrailingOnes<uint32_t>(Size);
- unsigned Used = 0;
- // Bitmask lacks an extract method
- for (unsigned I = 0; I < Size; ++I)
- if (RegsUsed.test(RegNo + I))
- Used |= 1 << I;
- RegsUsed.set(RegNo, RegNo + Size);
- Mask &= ~Used;
- Mask <<= (Bank == -1) ? RegNo % NUM_VGPR_BANKS : uint32_t(Bank);
- return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
- }
-
- // SGPRs have 8 banks holding 2 consecutive registers each.
- unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
- unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
- if (RegNo + StartBit >= RegsUsed.size())
- return 0;
-
- if (Size > 1)
- Size /= 2;
- unsigned Mask = (1 << Size) - 1;
- unsigned Used = 0;
- for (unsigned I = 0; I < Size; ++I)
- if (RegsUsed.test(StartBit + RegNo + I))
- Used |= 1 << I;
- RegsUsed.set(StartBit + RegNo, StartBit + RegNo + Size);
- Mask &= ~Used;
- Mask <<= (Bank == -1) ? RegNo % NUM_SGPR_BANKS
- : unsigned(Bank - SGPR_BANK_OFFSET);
- Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
- // Reserve 4 bank ids for VGPRs.
- return Mask << SGPR_BANK_OFFSET;
-}
-
-std::pair<unsigned, unsigned>
-GCNRegBankReassign::analyzeInst(const MachineInstr &MI, Register Reg,
- unsigned SubReg, int Bank) {
- unsigned StallCycles = 0;
- unsigned UsedBanks = 0;
-
- if (MI.isDebugValue())
- return std::make_pair(StallCycles, UsedBanks);
-
- RegsUsed.reset();
- OperandMasks.clear();
- for (const auto& Op : MI.explicit_uses()) {
- // Undef can be assigned to any register, so two vregs can be assigned
- // the same phys reg within the same instruction.
- if (!Op.isReg() || Op.isUndef())
- continue;
-
- const Register R = Op.getReg();
- const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R);
-
- // Do not compute stalls for AGPRs
- if (TRI->hasAGPRs(RC))
- continue;
-
- // Do not compute stalls if sub-register covers all banks
- if (Op.getSubReg()) {
- LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
- if (TRI->hasVGPRs(RC)) {
- if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
- continue;
- } else {
- if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
- continue;
- }
- }
-
- unsigned ShiftedBank = Bank;
-
- if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) {
- unsigned RegOffset =
- TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0);
- unsigned Offset = TRI->getChannelFromSubReg(
- Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0);
- if (Bank < NUM_VGPR_BANKS) {
- unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset);
- ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS;
- } else if (Bank >= SGPR_BANK_OFFSET) {
- unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1);
- ShiftedBank = SGPR_BANK_OFFSET +
- (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS;
- }
- }
-
- uint32_t Mask = getRegBankMask(R, Op.getSubReg(),
- (Reg == R) ? ShiftedBank : -1);
- StallCycles += countPopulation(UsedBanks & Mask);
- UsedBanks |= Mask;
- OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
- }
-
- return std::make_pair(StallCycles, UsedBanks);
-}
-
-unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
- Register Reg1,
- Register Reg2,
- unsigned StallCycles) const
-{
- unsigned Defs = 0;
- MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
- MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
- for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
- if (MI.isDebugInstr())
- continue;
- --Def;
- if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
- continue;
- if (Def->modifiesRegister(Reg1, TRI))
- Defs |= 1;
- if (Def->modifiesRegister(Reg2, TRI))
- Defs |= 2;
- }
- return countPopulation(Defs);
-}
-
-bool GCNRegBankReassign::isReassignable(Register Reg) const {
- if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
- return false;
-
- const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
-
- Register PhysReg = VRM->getPhys(Reg);
-
- if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
- return false;
-
- for (auto U : MRI->use_nodbg_operands(Reg)) {
- if (U.isImplicit())
- return false;
- const MachineInstr *UseInst = U.getParent();
- if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
- return false;
- }
-
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
- unsigned Size = TRI->getRegSizeInBits(*RC);
-
- // TODO: Support 16 bit registers. Those need to be moved with their
- // parent VGPR_32 and potentially a sibling 16 bit sub-register.
- if (Size < 32)
- return false;
-
- if (TRI->hasVGPRs(RC))
- return true;
-
- if (Size == 16)
- return AMDGPU::SGPR_LO16RegClass.contains(PhysReg);
-
- if (Size > 32)
- PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
-
- return AMDGPU::SGPR_32RegClass.contains(PhysReg);
-}
-
-unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
- unsigned UsedBanks) const {
- unsigned Size = countPopulation(Mask);
- unsigned FreeBanks = 0;
- unsigned Bank = findFirstSet(Mask);
-
- UsedBanks &= ~Mask;
-
- // Find free VGPR banks
- if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
- for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
- if (Bank == I)
- continue;
- unsigned NewMask = ((1 << Size) - 1) << I;
- NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
- if (!(UsedBanks & NewMask))
- FreeBanks |= 1 << I;
- }
- return FreeBanks;
- }
-
- // Find free SGPR banks
- // SGPR tuples must be aligned, so the step is the size in banks that a
- // tuple crosses.
- Bank -= SGPR_BANK_OFFSET;
- for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
- if (Bank == I)
- continue;
- unsigned NewMask = ((1 << Size) - 1) << I;
- NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
- if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
- FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
- }
-
- return FreeBanks;
-}
-
-unsigned GCNRegBankReassign::getFreeBanks(Register Reg,
- unsigned SubReg,
- unsigned Mask,
- unsigned UsedBanks) const {
- if (!isReassignable(Reg))
- return 0;
-
- unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
-
- unsigned Offset = TRI->getChannelFromSubReg(SubReg);
- if (Offset && (Mask & VGPR_BANK_MASK)) {
- unsigned Shift = Offset;
- if (Shift >= NUM_VGPR_BANKS)
- return 0;
- unsigned VB = FreeBanks & VGPR_BANK_MASK;
- FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
- VGPR_BANK_MASK;
- } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) {
- unsigned Shift = Offset >> 1;
- if (Shift >= NUM_SGPR_BANKS)
- return 0;
- unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
- FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
- SGPR_BANK_SHIFTED_MASK;
- FreeBanks <<= SGPR_BANK_OFFSET;
- }
-
- LLVM_DEBUG(if (FreeBanks) {
- dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
- << " to banks: "; dumpFreeBanks(FreeBanks);
- dbgs() << '\n'; });
-
- return FreeBanks;
-}
-
-void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
- unsigned UsedBanks,
- unsigned StallCycles) {
- LLVM_DEBUG(MI.dump());
-
- if (!StallCycles)
- return;
-
- LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
-
- for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
- for (unsigned J = I + 1; J != E; ++J) {
- if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
- continue;
-
- Register Reg1 = OperandMasks[I].Reg;
- Register Reg2 = OperandMasks[J].Reg;
- unsigned SubReg1 = OperandMasks[I].SubReg;
- unsigned SubReg2 = OperandMasks[J].SubReg;
- unsigned Mask1 = OperandMasks[I].Mask;
- unsigned Mask2 = OperandMasks[J].Mask;
- unsigned Size1 = countPopulation(Mask1);
- unsigned Size2 = countPopulation(Mask2);
-
- LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
- " and " << printReg(Reg2, SubReg2) << '\n');
-
- unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
- Weight += MLI->getLoopDepth(MI.getParent()) * 10;
-
- LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
-
- unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
- unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
- if (FreeBanks1)
- Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0),
- Candidate(&MI, Reg1, SubReg1, FreeBanks1));
- if (FreeBanks2)
- Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0),
- Candidate(&MI, Reg2, SubReg2, FreeBanks2));
- }
- }
-}
-
-unsigned GCNRegBankReassign::computeStallCycles(Register SrcReg, Register Reg,
- unsigned SubReg, int Bank,
- bool Collect) {
- unsigned TotalStallCycles = 0;
- SmallSet<const MachineInstr *, 16> Visited;
-
- for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
- if (MI.isBundle())
- continue;
- if (!Visited.insert(&MI).second)
- continue;
- unsigned StallCycles;
- unsigned UsedBanks;
- std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank);
- TotalStallCycles += StallCycles;
- if (Collect)
- collectCandidates(MI, UsedBanks, StallCycles);
- }
-
- return TotalStallCycles;
-}
-
-MCRegister GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank,
- unsigned SubReg) const {
- const TargetRegisterClass *RC = MRI->getRegClass(LI.reg());
- unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
- : MaxNumSGPRs;
- unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
- : AMDGPU::SGPR0);
-
- for (MCRegister Reg : RC->getRegisters()) {
- // Check occupancy limit.
- if (TRI->isSubRegisterEq(Reg, MaxReg))
- break;
-
- if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank)
- continue;
-
- for (unsigned I = 0; CSRegs[I]; ++I)
- if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
- !LRM->isPhysRegUsed(CSRegs[I]))
- return MCRegister::from(AMDGPU::NoRegister);
-
- LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
-
- if (!LRM->checkInterference(LI, Reg))
- return Reg;
- }
-
- return MCRegister::from(AMDGPU::NoRegister);
-}
-
-unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
- if (!LIS->hasInterval(C.Reg))
- return 0;
-
- LiveInterval &LI = LIS->getInterval(C.Reg);
- LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
- LI.dump());
-
- // For each candidate bank walk all instructions in the range of live
- // interval and check if replacing the register with one belonging to
- // the candidate bank reduces conflicts.
-
- unsigned OrigStalls = computeStallCycles(C.Reg);
- LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
- if (!OrigStalls)
- return 0;
-
- struct BankStall {
- BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
- bool operator<(const BankStall &RHS) const {
- if (Stalls == RHS.Stalls)
- return Bank < RHS.Bank;
- return Stalls > RHS.Stalls;
- }
- unsigned Bank;
- unsigned Stalls;
- };
- SmallVector<BankStall, 8> BankStalls;
-
- for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
- if (C.FreeBanks & (1 << Bank)) {
- LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
- unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank);
- if (Stalls < OrigStalls) {
- LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
- << Stalls << '\n');
- BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
- }
- }
- }
- llvm::sort(BankStalls);
-
- MCRegister OrigReg = VRM->getPhys(C.Reg);
- LRM->unassign(LI);
- while (!BankStalls.empty()) {
- BankStall BS = BankStalls.pop_back_val();
- MCRegister Reg = scavengeReg(LI, BS.Bank, C.SubReg);
- if (Reg == AMDGPU::NoRegister) {
- LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
- << '\n');
- continue;
- }
- LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
- << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
- << " in bank " << printBank(BS.Bank) << '\n');
-
- LRM->assign(LI, Reg);
-
- LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
-
- return OrigStalls - BS.Stalls;
- }
- LRM->assign(LI, OrigReg);
-
- return 0;
-}
-
-unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
- bool Collect) {
- unsigned TotalStallCycles = 0;
-
- for (MachineBasicBlock &MBB : MF) {
-
- LLVM_DEBUG(if (Collect) {
- if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
- else dbgs() << MBB.getName(); dbgs() << ":\n";
- });
-
- for (MachineInstr &MI : MBB.instrs()) {
- if (MI.isBundle())
- continue; // we analyze the instructions inside the bundle individually
-
- unsigned StallCycles;
- unsigned UsedBanks;
- std::tie(StallCycles, UsedBanks) = analyzeInst(MI);
-
- if (Collect)
- collectCandidates(MI, UsedBanks, StallCycles);
-
- TotalStallCycles += StallCycles;
- }
-
- LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
- }
-
- return TotalStallCycles;
-}
-
-void GCNRegBankReassign::removeCandidates(Register Reg) {
- typename CandidateList::iterator Next;
- for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) {
- Next = std::next(I);
- I->second.remove_if([Reg, this](const Candidate& C) {
- return C.MI->readsRegister(Reg, TRI);
- });
- if (I->second.empty())
- Candidates.erase(I);
- }
-}
-
-bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
- unsigned OriginalCycles,
- unsigned CyclesSaved) {
- unsigned StallCycles = collectCandidates(MF, false);
- LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
- << " stall cycles left\n");
- return StallCycles + CyclesSaved == OriginalCycles;
-}
-
-bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
- ST = &MF.getSubtarget<GCNSubtarget>();
- if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
- return false;
-
- MRI = &MF.getRegInfo();
- TRI = ST->getRegisterInfo();
- MLI = &getAnalysis<MachineLoopInfo>();
- VRM = &getAnalysis<VirtRegMap>();
- LRM = &getAnalysis<LiveRegMatrix>();
- LIS = &getAnalysis<LiveIntervals>();
-
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned Occupancy = MFI->getOccupancy();
- MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
- MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
- MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
- MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
-
- CSRegs = MRI->getCalleeSavedRegs();
- unsigned NumRegBanks = AMDGPU::VGPR_32RegClass.getNumRegs() +
- // Not a tight bound
- AMDGPU::SReg_32RegClass.getNumRegs() / 2 + 1;
- RegsUsed.resize(NumRegBanks);
-
- LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
- << '\n');
-
- unsigned StallCycles = collectCandidates(MF);
- NumStallsDetected += StallCycles;
-
- LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
- "function " << MF.getName() << '\n');
-
- LLVM_DEBUG(Candidates.dump(this));
-
- unsigned CyclesSaved = 0;
- while (!Candidates.empty()) {
- Candidate C = Candidates.back();
- unsigned LocalCyclesSaved = tryReassign(C);
- CyclesSaved += LocalCyclesSaved;
-
- if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
- report_fatal_error("RegBank reassign stall cycles verification failed.");
-
- Candidates.pop_back();
- if (LocalCyclesSaved) {
- removeCandidates(C.Reg);
- computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true);
-
- LLVM_DEBUG(Candidates.dump(this));
- }
- }
- NumStallsRecovered += CyclesSaved;
-
- LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
- << " cycles saved in function " << MF.getName() << '\n');
-
- Candidates.clear();
-
- if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
- report_fatal_error("RegBank reassign stall cycles verification failed.");
-
- RegsUsed.clear();
-
- return CyclesSaved > 0;
-}
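// Editor's note: a minimal, standalone sketch (not LLVM code) of the bank
// model used by the GCNRegBankReassign pass removed above. It assumes only
// the constants visible in the deleted code: 4 VGPR banks assigned
// round-robin by register number, 8 SGPR banks holding 2 consecutive
// registers each (indexed from the hardware encoding), and SGPR bank ids
// offset by 4 so both kinds can share one bit mask.
#include <cassert>
#include <cstdio>

constexpr unsigned NumVGPRBanks = 4;
constexpr unsigned NumSGPRBanks = 8;
constexpr unsigned SGPRBankOffset = NumVGPRBanks;

// Bank id for the N-th VGPR (v0, v1, ...).
unsigned vgprBank(unsigned VGPRNo) { return VGPRNo % NumVGPRBanks; }

// Bank id for an SGPR given its encoding value; two consecutive SGPRs
// share a bank, hence the division by 2.
unsigned sgprBank(unsigned SGPREncoding) {
  return (SGPREncoding / 2) % NumSGPRBanks + SGPRBankOffset;
}

int main() {
  // v0 and v4 land in the same bank and would stall if read together.
  assert(vgprBank(0) == vgprBank(4));
  // s0 and s1 share a bank; s2 moves to the next one.
  assert(sgprBank(0) == sgprBank(1));
  assert(sgprBank(2) == sgprBank(0) + 1);
  std::printf("v0 bank %u, s0 bank %u\n", vgprBank(0), sgprBank(0));
  return 0;
}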
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index aeec3e8..3456f9a 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -125,12 +125,14 @@
unsigned MaxOccupancy) const {
const auto SGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(getSGPRNum()));
- const auto VGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(getVGPRNum()));
+ const auto VGPROcc =
+ std::min(MaxOccupancy,
+ ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
const auto OtherSGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
- const auto OtherVGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(O.getVGPRNum()));
+ const auto OtherVGPROcc =
+ std::min(MaxOccupancy,
+ ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts())));
const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@@ -161,7 +163,8 @@
}
}
return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
- (getVGPRNum() < O.getVGPRNum());
+ (getVGPRNum(ST.hasGFX90AInsts()) <
+ O.getVGPRNum(ST.hasGFX90AInsts()));
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -169,7 +172,9 @@
void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
OS << "VGPRs: " << Value[VGPR32] << ' ';
OS << "AGPRs: " << Value[AGPR32];
- if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
+ if (ST) OS << "(O"
+ << ST->getOccupancyWithNumVGPRs(getVGPRNum(ST->hasGFX90AInsts()))
+ << ')';
OS << ", SGPRs: " << getSGPRNum();
if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')';
OS << ", LVGPR WT: " << getVGPRTuplesWeight()
@@ -384,6 +389,7 @@
void GCNDownwardRPTracker::advanceToNext() {
LastTrackedMI = &*NextMI++;
+ NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
// Add new registers or mask bits.
for (const auto &MO : LastTrackedMI->operands()) {
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index ba8c85a..257561c 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -42,12 +42,19 @@
clear();
}
- bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; }
+ bool empty() const { return getSGPRNum() == 0 && getVGPRNum(false) == 0; }
void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
unsigned getSGPRNum() const { return Value[SGPR32]; }
- unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); }
+ unsigned getVGPRNum(bool UnifiedVGPRFile) const {
+ if (UnifiedVGPRFile) {
+ return Value[AGPR32] ? alignTo(Value[VGPR32], 4) + Value[AGPR32]
+ : Value[VGPR32] + Value[AGPR32];
+ }
+ return std::max(Value[VGPR32], Value[AGPR32]);
+ }
+ unsigned getAGPRNum() const { return Value[AGPR32]; }
unsigned getVGPRTuplesWeight() const { return std::max(Value[VGPR_TUPLE],
Value[AGPR_TUPLE]); }
@@ -55,7 +62,7 @@
unsigned getOccupancy(const GCNSubtarget &ST) const {
return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
- ST.getOccupancyWithNumVGPRs(getVGPRNum()));
+ ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
}
void inc(unsigned Reg,
@@ -160,7 +167,7 @@
public:
GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
- const MachineBasicBlock::const_iterator getNext() const { return NextMI; }
+ MachineBasicBlock::const_iterator getNext() const { return NextMI; }
// Reset tracker to the point before the MI
// filling live regs upon this point using LIS.
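// Editor's note: a standalone sketch of the counting rule added to
// GCNRegPressure::getVGPRNum above. On targets with a unified VGPR/AGPR file
// (hasGFX90AInsts) the AGPRs are placed after the VGPRs with the VGPR count
// rounded up to a multiple of 4; on other targets the two files are separate
// and only the larger count matters. Names here are illustrative, not LLVM's.
#include <algorithm>
#include <cassert>

unsigned alignTo4(unsigned N) { return (N + 3) / 4 * 4; }

unsigned vgprPressure(unsigned NumVGPR, unsigned NumAGPR, bool UnifiedFile) {
  if (UnifiedFile)
    return NumAGPR ? alignTo4(NumVGPR) + NumAGPR : NumVGPR;
  return std::max(NumVGPR, NumAGPR);
}

int main() {
  // 10 VGPRs + 6 AGPRs: separate files report 10, a unified file needs
  // 12 (10 rounded up) + 6 = 18 registers.
  assert(vgprPressure(10, 6, /*UnifiedFile=*/false) == 10);
  assert(vgprPressure(10, 6, /*UnifiedFile=*/true) == 18);
  return 0;
}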
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 6e25502..0212b8e 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -20,7 +20,8 @@
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C) :
- GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { }
+ GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false),
+ HasExcessPressure(false), MF(nullptr) { }
void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
@@ -103,11 +104,13 @@
// marked as RegExcess in tryCandidate() when they are compared with
// instructions that increase the register pressure.
if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
+ HasExcessPressure = true;
Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
}
if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
+ HasExcessPressure = true;
Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
}
@@ -121,6 +124,7 @@
int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
if (SGPRDelta >= 0 || VGPRDelta >= 0) {
+ HasExcessPressure = true;
if (SGPRDelta > VGPRDelta) {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
@@ -279,6 +283,15 @@
if (SU->isBottomReady())
Bot.removeReady(SU);
+ if (!HasClusteredNodes && SU->getInstr()->mayLoadOrStore()) {
+ for (SDep &Dep : SU->Preds) {
+ if (Dep.isCluster()) {
+ HasClusteredNodes = true;
+ break;
+ }
+ }
+ }
+
LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
<< *SU->getInstr());
return SU;
@@ -320,22 +333,30 @@
PressureBefore.print(dbgs()));
}
+ GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
+ // Set HasClusteredNodes to true for late stages where we have already
+ // collected it. That way pickNode() will not scan SDeps when not needed.
+ S.HasClusteredNodes = Stage > InitialSchedule;
+ S.HasExcessPressure = false;
ScheduleDAGMILive::schedule();
Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
RescheduleRegions[RegionIdx] = false;
+ if (Stage == InitialSchedule && S.HasClusteredNodes)
+ RegionsWithClusters[RegionIdx] = true;
+ if (S.HasExcessPressure)
+ RegionsWithHighRP[RegionIdx] = true;
if (!LIS)
return;
// Check the results of scheduling.
- GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
auto PressureAfter = getRealRegPressure();
LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
PressureAfter.print(dbgs()));
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
- PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) {
+ PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
Pressure[RegionIdx] = PressureAfter;
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
@@ -366,9 +387,12 @@
unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
- if (PressureAfter.getVGPRNum() > MaxVGPRs ||
- PressureAfter.getSGPRNum() > MaxSGPRs)
+ if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
+ PressureAfter.getAGPRNum() > MaxVGPRs ||
+ PressureAfter.getSGPRNum() > MaxSGPRs) {
RescheduleRegions[RegionIdx] = true;
+ RegionsWithHighRP[RegionIdx] = true;
+ }
if (WavesAfter >= MinOccupancy) {
if (Stage == UnclusteredReschedule &&
@@ -378,6 +402,9 @@
PressureAfter.less(ST, PressureBefore) ||
!RescheduleRegions[RegionIdx]) {
Pressure[RegionIdx] = PressureAfter;
+ if (!RegionsWithClusters[RegionIdx] &&
+ (Stage + 1) == UnclusteredReschedule)
+ RescheduleRegions[RegionIdx] = false;
return;
} else {
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
@@ -385,7 +412,8 @@
}
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
- RescheduleRegions[RegionIdx] = true;
+ RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
+ (Stage + 1) != UnclusteredReschedule;
RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr())
@@ -460,7 +488,9 @@
I = Rgn.first;
auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
auto LRS = BBLiveInMap.lookup(NonDbgMI);
+#ifdef EXPENSIVE_CHECKS
assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
+#endif
RPTracker.reset(*I, &LRS);
}
@@ -516,7 +546,11 @@
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
RescheduleRegions.resize(Regions.size());
+ RegionsWithClusters.resize(Regions.size());
+ RegionsWithHighRP.resize(Regions.size());
RescheduleRegions.set();
+ RegionsWithClusters.reset();
+ RegionsWithHighRP.reset();
if (!Regions.empty())
BBLiveInMap = getBBLiveInMap();
@@ -561,7 +595,10 @@
SavedMutations.swap(Mutations);
for (auto Region : Regions) {
- if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) {
+ if ((Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) ||
+ (Stage == ClusteredLowOccupancyReschedule &&
+ !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {
+
++RegionIdx;
continue;
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 2d81d99..15eba3f 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -50,6 +50,14 @@
unsigned TargetOccupancy;
+ // schedule() has seen a clustered memory operation. Set it to false
+ // before scheduling a region to know if the region had such clusters.
+ bool HasClusteredNodes;
+
+ // schedule() has seen excess register pressure and had to track
+ // register pressure for the actual scheduling heuristics.
+ bool HasExcessPressure;
+
MachineFunction *MF;
public:
@@ -96,6 +104,12 @@
// or we generally desire to reschedule it.
BitVector RescheduleRegions;
+ // Record regions which use clustered loads/stores.
+ BitVector RegionsWithClusters;
+
+ // Record regions with high register pressure.
+ BitVector RegionsWithHighRP;
+
// Region live-in cache.
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
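// Editor's note: a simplified, standalone sketch (plain C++, not the
// scheduler itself) of how the new per-region flags above are meant to gate
// the rescheduling stages: a region is revisited in the clustered
// low-occupancy stage only if it contained clustered memory operations or hit
// high register pressure. The stage names mirror the diff; the driver loop
// and data layout are invented for illustration.
#include <cstdio>
#include <vector>

enum Stage { InitialSchedule, UnclusteredReschedule,
             ClusteredLowOccupancyReschedule };

bool shouldSkipRegion(Stage S, unsigned Idx,
                      const std::vector<bool> &Resched,
                      const std::vector<bool> &WithClusters,
                      const std::vector<bool> &HighRP) {
  if (S == UnclusteredReschedule && !Resched[Idx])
    return true;
  if (S == ClusteredLowOccupancyReschedule && !WithClusters[Idx] &&
      !HighRP[Idx])
    return true;
  return false;
}

int main() {
  std::vector<bool> Resched  = {true, false, true};
  std::vector<bool> Clusters = {true, false, false};
  std::vector<bool> HighRP   = {false, false, true};
  for (unsigned I = 0; I < 3; ++I)
    std::printf("region %u: %s\n", I,
                shouldSkipRegion(ClusteredLowOccupancyReschedule, I, Resched,
                                 Clusters, HighRP)
                    ? "skipped" : "rescheduled");
  return 0;
}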
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 7a71781..bd0c400 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -41,24 +41,16 @@
using AMDGPUSubtarget::getMaxWavesPerEU;
public:
- enum TrapHandlerAbi {
- TrapHandlerAbiNone = 0,
- TrapHandlerAbiHsa = 1
+ // The following two enums are documented at:
+ // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
+ enum class TrapHandlerAbi {
+ NONE = 0x00,
+ AMDHSA = 0x01,
};
- enum TrapID {
- TrapIDHardwareReserved = 0,
- TrapIDHSADebugTrap = 1,
- TrapIDLLVMTrap = 2,
- TrapIDLLVMDebugTrap = 3,
- TrapIDDebugBreakpoint = 7,
- TrapIDDebugReserved8 = 8,
- TrapIDDebugReservedFE = 0xfe,
- TrapIDDebugReservedFF = 0xff
- };
-
- enum TrapRegValues {
- LLVMTrapHandlerRegValue = 1
+ enum class TrapID {
+ LLVMAMDHSATrap = 0x02,
+ LLVMAMDHSADebugTrap = 0x03,
};
private:
@@ -82,6 +74,7 @@
bool FastFMAF32;
bool FastDenormalF32;
bool HalfRate64Ops;
+ bool FullRate64Ops;
// Dynamically set bits that enable features.
bool FlatForGlobal;
@@ -95,6 +88,7 @@
// for XNACK.
bool EnableXNACK;
+ bool EnableTgSplit;
bool EnableCuMode;
bool TrapHandler;
@@ -110,14 +104,17 @@
bool FP64;
bool FMA;
bool MIMG_R128;
- bool GCN3Encoding;
+ bool IsGCN;
bool CIInsts;
bool GFX8Insts;
bool GFX9Insts;
+ bool GFX90AInsts;
bool GFX10Insts;
bool GFX10_3Insts;
bool GFX7GFX8GFX9Insts;
bool SGPRInitBug;
+ bool NegativeScratchOffsetBug;
+ bool NegativeUnalignedScratchOffsetBug;
bool HasSMemRealTime;
bool HasIntClamp;
bool HasFmaMixInsts;
@@ -132,10 +129,15 @@
bool HasSDWAOutModsVOPC;
bool HasDPP;
bool HasDPP8;
+ bool Has64BitDPP;
+ bool HasPackedFP32Ops;
+ bool HasExtendedImageInsts;
bool HasR128A16;
bool HasGFX10A16;
bool HasG16;
bool HasNSAEncoding;
+ unsigned NSAMaxSize;
+ bool GFX10_AEncoding;
bool GFX10_BEncoding;
bool HasDLInsts;
bool HasDot1Insts;
@@ -144,6 +146,7 @@
bool HasDot4Insts;
bool HasDot5Insts;
bool HasDot6Insts;
+ bool HasDot7Insts;
bool HasMAIInsts;
bool HasPkFmacF16Inst;
bool HasAtomicFaddInsts;
@@ -157,6 +160,7 @@
bool HasVscnt;
bool HasGetWaveIdInst;
bool HasSMemTimeInst;
+ bool HasShaderCyclesRegister;
bool HasRegisterBanking;
bool HasVOP3Literal;
bool HasNoDataDepHazard;
@@ -165,12 +169,19 @@
bool FlatGlobalInsts;
bool FlatScratchInsts;
bool ScalarFlatScratchInsts;
+ bool HasArchitectedFlatScratch;
bool AddNoCarryInsts;
bool HasUnpackedD16VMem;
+ bool R600ALUInst;
+ bool CaymanISA;
+ bool CFALUBug;
bool LDSMisalignedBug;
bool HasMFMAInlineLiteralBug;
+ bool HasVertexCache;
+ short TexVTXClauseSize;
bool UnalignedBufferAccess;
bool UnalignedDSAccess;
+ bool HasPackedTID;
bool ScalarizeGlobal;
bool HasVcmpxPermlaneHazard;
@@ -180,6 +191,7 @@
bool HasVcmpxExecWARHazard;
bool HasLdsBranchVmemWARHazard;
bool HasNSAtoVMEMBug;
+ bool HasNSAClauseBug;
bool HasOffset3fBug;
bool HasFlatSegmentOffsetBug;
bool HasImageStoreD16Bug;
@@ -241,6 +253,10 @@
return RegBankInfo.get();
}
+ const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
+ return TargetID;
+ }
+
// Nothing implemented, just prevent crashes on use.
const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
@@ -271,6 +287,11 @@
unsigned getConstantBusLimit(unsigned Opcode) const;
+ /// Returns true if an instruction with a 16-bit result returned in a 32-bit
+ /// register implicitly zeroes the high 16 bits rather than preserving the
+ /// original value.
+ bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
+
bool hasIntClamp() const {
return HasIntClamp;
}
@@ -295,6 +316,10 @@
return HalfRate64Ops;
}
+ bool hasFullRate64Ops() const {
+ return FullRate64Ops;
+ }
+
bool hasAddr64() const {
return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
}
@@ -370,7 +395,12 @@
}
TrapHandlerAbi getTrapHandlerAbi() const {
- return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
+ return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
+ }
+
+ bool supportsGetDoorbellID() const {
+ // The S_GETREG DOORBELL_ID is supported on GFX9 and all later targets.
+ return getGeneration() >= GFX9;
}
/// True if the offset field of DS instructions works as expected. On SI, the
@@ -510,6 +540,10 @@
return TargetID.isXnackOnOrAny();
}
+ bool isTgSplitEnabled() const {
+ return EnableTgSplit;
+ }
+
bool isCuModeEnabled() const {
return EnableCuMode;
}
@@ -666,6 +700,10 @@
return HasDot6Insts;
}
+ bool hasDot7Insts() const {
+ return HasDot7Insts;
+ }
+
bool hasMAIInsts() const {
return HasMAIInsts;
}
@@ -694,6 +732,10 @@
return HasSMemTimeInst;
}
+ bool hasShaderCyclesRegister() const {
+ return HasShaderCyclesRegister;
+ }
+
bool hasRegisterBanking() const {
return HasRegisterBanking;
}
@@ -780,6 +822,9 @@
return GFX8Insts;
}
+ /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
+ bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
+
bool hasDPP() const {
return HasDPP;
}
@@ -796,6 +841,22 @@
return HasDPP8;
}
+ bool has64BitDPP() const {
+ return Has64BitDPP;
+ }
+
+ bool hasPackedFP32Ops() const {
+ return HasPackedFP32Ops;
+ }
+
+ bool hasFmaakFmamkF32Insts() const {
+ return getGeneration() >= GFX10;
+ }
+
+ bool hasExtendedImageInsts() const {
+ return HasExtendedImageInsts;
+ }
+
bool hasR128A16() const {
return HasR128A16;
}
@@ -818,6 +879,12 @@
bool hasNSAEncoding() const { return HasNSAEncoding; }
+ unsigned getNSAMaxSize() const { return NSAMaxSize; }
+
+ bool hasGFX10_AEncoding() const {
+ return GFX10_AEncoding;
+ }
+
bool hasGFX10_BEncoding() const {
return GFX10_BEncoding;
}
@@ -840,6 +907,12 @@
return SGPRInitBug;
}
+ bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
+
+ bool hasNegativeUnalignedScratchOffsetBug() const {
+ return NegativeUnalignedScratchOffsetBug;
+ }
+
bool hasMFMAInlineLiteralBug() const {
return HasMFMAInlineLiteralBug;
}
@@ -894,8 +967,17 @@
return HasNSAtoVMEMBug;
}
+ bool hasNSAClauseBug() const { return HasNSAClauseBug; }
+
bool hasHardClauses() const { return getGeneration() >= GFX10; }
+ bool hasGFX90AInsts() const { return GFX90AInsts; }
+
+ /// Return true if operations acting on VGPR tuples require even alignment.
+ bool needsAlignedVGPRs() const { return GFX90AInsts; }
+
+ bool hasPackedTID() const { return HasPackedTID; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
@@ -917,6 +999,10 @@
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
+ /// \returns true if the flat_scratch register is initialized by the HW.
+ /// In this case it is readonly.
+ bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
+
/// \returns true if the machine has merged shaders in which s0-s7 are
/// reserved by the hardware and user SGPRs start at s8
bool hasMergedShaders() const {
@@ -955,9 +1041,24 @@
return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
}
- /// \returns Reserved number of SGPRs for given function \p MF.
+ /// \returns Reserved number of SGPRs. This is the common utility function
+ /// called by the MachineFunction and Function variants of
+ /// getReservedNumSGPRs.
+ unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const;
+ /// \returns Reserved number of SGPRs for given machine function \p MF.
unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
+ /// \returns Reserved number of SGPRs for given function \p F.
+ unsigned getReservedNumSGPRs(const Function &F) const;
+
+ /// \returns Maximum number of SGPRs. This is the common utility function
+ /// called by the MachineFunction and Function variants of getMaxNumSGPRs.
+ unsigned getBaseMaxNumSGPRs(const Function &F,
+ std::pair<unsigned, unsigned> WavesPerEU,
+ unsigned PreloadedSGPRs,
+ unsigned ReservedNumSGPRs) const;
+
/// \returns Maximum number of SGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of SGPRs explicitly
/// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
@@ -968,6 +1069,16 @@
/// unit requirement.
unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+ /// \returns Maximum number of SGPRs that meets number of waves per execution
+ /// unit requirement for function \p F, or number of SGPRs explicitly
+ /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per execution
+ /// unit requirement.
+ unsigned getMaxNumSGPRs(const Function &F) const;
+
/// \returns VGPR allocation granularity supported by the subtarget.
unsigned getVGPRAllocGranule() const {
return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
@@ -1000,6 +1111,20 @@
return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
}
+ /// \returns Maximum number of VGPRs. This is the common utility function
+ /// called by the MachineFunction and Function variants of getMaxNumVGPRs.
+ unsigned getBaseMaxNumVGPRs(const Function &F,
+ std::pair<unsigned, unsigned> WavesPerEU) const;
+ /// \returns Maximum number of VGPRs that meets number of waves per execution
+ /// unit requirement for function \p F, or number of VGPRs explicitly
+ /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per execution
+ /// unit requirement.
+ unsigned getMaxNumVGPRs(const Function &F) const;
+
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
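// Editor's note: the trap-handler enums above were converted to scoped enums
// (enum class), so any code that encodes them into an immediate now needs an
// explicit cast. This is a generic C++ illustration of that pattern, not the
// in-tree trap lowering; encodeTrapImm is a hypothetical helper.
#include <cstdint>
#include <cstdio>

enum class TrapHandlerAbi { NONE = 0x00, AMDHSA = 0x01 };
enum class TrapID { LLVMAMDHSATrap = 0x02, LLVMAMDHSADebugTrap = 0x03 };

uint16_t encodeTrapImm(TrapID ID) {
  // Scoped enumerators do not convert implicitly; cast to the underlying
  // integer before building the instruction immediate.
  return static_cast<uint16_t>(ID);
}

int main() {
  std::printf("s_trap imm = %u\n",
              static_cast<unsigned>(encodeTrapImm(TrapID::LLVMAMDHSATrap)));
  return 0;
}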
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 426648d..bb2c298 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -80,9 +80,12 @@
const auto *SymA = Target.getSymA();
assert(SymA);
- Ctx.reportError(Fixup.getLoc(),
- Twine("undefined label '") + SymA->getSymbol().getName() + "'");
- return ELF::R_AMDGPU_NONE;
+ if (SymA->getSymbol().isUndefined()) {
+ Ctx.reportError(Fixup.getLoc(), Twine("undefined label '") +
+ SymA->getSymbol().getName() + "'");
+ return ELF::R_AMDGPU_NONE;
+ }
+ return ELF::R_AMDGPU_REL16;
}
llvm_unreachable("unhandled relocation type");
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index fbf7dc2..9ba0ffb 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -10,6 +10,7 @@
#include "AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
+#include "SIRegisterInfo.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCExpr.h"
@@ -146,7 +147,7 @@
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
bool IsFlatSeg = !(Desc.TSFlags &
- (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch));
+ (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch));
if (IsFlatSeg) { // Unsigned offset
printU16ImmDecOperand(MI, OpNo, O);
@@ -201,20 +202,19 @@
printNamedBit(MI, OpNo, O, "gds");
}
-void AMDGPUInstPrinter::printDLC(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- if (AMDGPU::isGFX10Plus(STI))
- printNamedBit(MI, OpNo, O, "dlc");
-}
-
-void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "glc");
-}
-
-void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "slc");
+void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ auto Imm = MI->getOperand(OpNo).getImm();
+ if (Imm & CPol::GLC)
+ O << " glc";
+ if (Imm & CPol::SLC)
+ O << " slc";
+ if ((Imm & CPol::DLC) && AMDGPU::isGFX10Plus(STI))
+ O << " dlc";
+ if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
+ O << " scc";
+ if (Imm & ~CPol::ALL)
+ O << " /* unexpected cache policy bit */";
}
void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo,
@@ -362,22 +362,30 @@
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ auto Opcode = MI->getOpcode();
+ auto Flags = MII.get(Opcode).TSFlags;
+
if (OpNo == 0) {
- if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3)
- O << "_e64 ";
- else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP)
- O << "_dpp ";
- else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA)
- O << "_sdwa ";
- else
- O << "_e32 ";
+ if (Flags & SIInstrFlags::VOP3) {
+ if (!getVOP3IsSingle(Opcode))
+ O << "_e64";
+ } else if (Flags & SIInstrFlags::DPP) {
+ O << "_dpp";
+ } else if (Flags & SIInstrFlags::SDWA) {
+ O << "_sdwa";
+ } else if (((Flags & SIInstrFlags::VOP1) && !getVOP1IsSingle(Opcode)) ||
+ ((Flags & SIInstrFlags::VOP2) && !getVOP2IsSingle(Opcode))) {
+ O << "_e32";
+ }
+ O << " ";
}
printOperand(MI, OpNo, STI, O);
// Print default vcc/vcc_lo operand.
- switch (MI->getOpcode()) {
+ switch (Opcode) {
default: break;
case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10:
@@ -601,6 +609,10 @@
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
case MCOI::OPERAND_IMMEDIATE:
printImmediate32(Op.getImm(), STI, O);
break;
@@ -608,6 +620,7 @@
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
printImmediate64(Op.getImm(), STI, O);
break;
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
@@ -656,18 +669,19 @@
// custom printer.
llvm_unreachable("unexpected immediate operand type");
}
- } else if (Op.isFPImm()) {
+ } else if (Op.isDFPImm()) {
+ double Value = bit_cast<double>(Op.getDFPImm());
// We special case 0.0 because otherwise it will be printed as an integer.
- if (Op.getFPImm() == 0.0)
+ if (Value == 0.0)
O << "0.0";
else {
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
int RCID = Desc.OpInfo[OpNo].RegClass;
unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
if (RCBits == 32)
- printImmediate32(FloatToBits(Op.getFPImm()), STI, O);
+ printImmediate32(FloatToBits(Value), STI, O);
else if (RCBits == 64)
- printImmediate64(DoubleToBits(Op.getFPImm()), STI, O);
+ printImmediate64(DoubleToBits(Value), STI, O);
else
llvm_unreachable("Invalid register class size");
}
@@ -727,7 +741,7 @@
if (OpNo + 1 < MI->getNumOperands() &&
(InputModifiers & SISrcMods::ABS) == 0) {
const MCOperand &Op = MI->getOperand(OpNo + 1);
- NegMnemo = Op.isImm() || Op.isFPImm();
+ NegMnemo = Op.isImm() || Op.isDFPImm();
}
if (NegMnemo) {
O << "neg(";
@@ -793,7 +807,16 @@
using namespace AMDGPU::DPP;
unsigned Imm = MI->getOperand(OpNo).getImm();
- if (Imm <= DppCtrl::QUAD_PERM_LAST) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src0);
+
+ if (Src0Idx >= 0 &&
+ Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID &&
+ !AMDGPU::isLegal64BitDPPControl(Imm)) {
+ O << " /* 64 bit dpp only supports row_newbcast */";
+ return;
+ } else if (Imm <= DppCtrl::QUAD_PERM_LAST) {
O << "quad_perm:[";
O << formatDec(Imm & 0x3) << ',';
O << formatDec((Imm & 0xc) >> 2) << ',';
@@ -853,11 +876,15 @@
O << "row_bcast:31";
} else if ((Imm >= DppCtrl::ROW_SHARE_FIRST) &&
(Imm <= DppCtrl::ROW_SHARE_LAST)) {
- if (!AMDGPU::isGFX10Plus(STI)) {
- O << "/* row_share is not supported on ASICs earlier than GFX10 */";
+ if (AMDGPU::isGFX90A(STI)) {
+ O << "row_newbcast:";
+ } else if (AMDGPU::isGFX10Plus(STI)) {
+ O << "row_share:";
+ } else {
+ O << " /* row_newbcast/row_share is not supported on ASICs earlier "
+ "than GFX90A/GFX10 */";
return;
}
- O << "row_share:";
printU4ImmDecOperand(MI, OpNo, O);
} else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) &&
(Imm <= DppCtrl::ROW_XMASK_LAST)) {
@@ -891,7 +918,7 @@
raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNo).getImm();
if (Imm) {
- O << " bound_ctrl:0"; // XXX - this syntax is used in sp3
+ O << " bound_ctrl:1";
}
}
@@ -1236,8 +1263,8 @@
decodeMsg(Imm16, MsgId, OpId, StreamId);
if (isValidMsgId(MsgId, STI) &&
- isValidMsgOp(MsgId, OpId) &&
- isValidMsgStream(MsgId, OpId, StreamId)) {
+ isValidMsgOp(MsgId, OpId, STI) &&
+ isValidMsgStream(MsgId, OpId, StreamId, STI)) {
O << "sendmsg(" << getMsgName(MsgId);
if (msgRequiresOp(MsgId)) {
O << ", " << getMsgOpName(MsgId, OpId);
@@ -1560,12 +1587,12 @@
}
} else if (Op.isImm()) {
O << Op.getImm();
- } else if (Op.isFPImm()) {
+ } else if (Op.isDFPImm()) {
// We special case 0.0 because otherwise it will be printed as an integer.
- if (Op.getFPImm() == 0.0)
+ if (Op.getDFPImm() == 0.0)
O << "0.0";
else {
- O << Op.getFPImm();
+ O << bit_cast<double>(Op.getDFPImm());
}
} else if (Op.isExpr()) {
const MCExpr *Exp = Op.getExpr();
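// Editor's note: a standalone sketch of the combined cache-policy operand
// that printCPol above decodes. A single immediate now carries all cache
// bits instead of the separate glc/slc/dlc operands that were removed. The
// bit assignments below are assumed for the sketch; the authoritative values
// live in SIDefines.h (CPol::*).
#include <cstdio>
#include <string>

namespace cpol {
constexpr unsigned GLC = 1u << 0; // assumed bit layout, for illustration only
constexpr unsigned SLC = 1u << 1;
constexpr unsigned DLC = 1u << 2;
constexpr unsigned SCC = 1u << 3;
constexpr unsigned ALL = GLC | SLC | DLC | SCC;
} // namespace cpol

std::string decodeCPol(unsigned Imm, bool IsGFX10Plus, bool IsGFX90A) {
  std::string S;
  if (Imm & cpol::GLC) S += " glc";
  if (Imm & cpol::SLC) S += " slc";
  if ((Imm & cpol::DLC) && IsGFX10Plus) S += " dlc";
  if ((Imm & cpol::SCC) && IsGFX90A) S += " scc";
  if (Imm & ~cpol::ALL) S += " /* unexpected cache policy bit */";
  return S;
}

int main() {
  std::printf("%s\n",
              decodeCPol(cpol::GLC | cpol::SLC, /*IsGFX10Plus=*/true,
                         /*IsGFX90A=*/false).c_str());
  return 0;
}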
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 8d13aa6..3cb4fcb 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -68,12 +68,8 @@
const MCSubtargetInfo &STI, raw_ostream &O);
void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printDLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
+ void printCPol(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printSWZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 1836237..5c728bd 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -42,6 +42,7 @@
HasNoDeadStrip = true;
//===--- Dwarf Emission Directives -----------------------------------===//
SupportsDebugInformation = true;
+ UsesCFIForDebug = true;
DwarfRegNumForCFI = true;
UseIntegratedAssembler = false;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index f0eb11b..9a9a2c9 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -31,6 +31,20 @@
// AMDGPUTargetStreamer
//===----------------------------------------------------------------------===//
+static void convertIsaVersionV2(uint32_t &Major, uint32_t &Minor,
+ uint32_t &Stepping, bool Sramecc, bool Xnack) {
+ if (Major == 9 && Minor == 0) {
+ switch (Stepping) {
+ case 0:
+ case 2:
+ case 4:
+ case 6:
+ if (Xnack)
+ Stepping++;
+ }
+ }
+}
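// Editor's note: a standalone illustration (not LLVM API) of the mapping
// performed by convertIsaVersionV2 above. Code-object-v2 ISA names folded
// XNACK into the stepping, so e.g. 9.0.0 (gfx900) with XNACK enabled was
// reported as 9.0.1 (gfx901); SRAMECC does not change the stepping here.
#include <cassert>
#include <cstdint>

uint32_t v2Stepping(uint32_t Major, uint32_t Minor, uint32_t Stepping,
                    bool Xnack) {
  if (Major == 9 && Minor == 0 &&
      (Stepping == 0 || Stepping == 2 || Stepping == 4 || Stepping == 6) &&
      Xnack)
    ++Stepping;
  return Stepping;
}

int main() {
  assert(v2Stepping(9, 0, 0, /*Xnack=*/true) == 1);  // gfx900 -> gfx901
  assert(v2Stepping(9, 0, 6, /*Xnack=*/false) == 6); // gfx906 unchanged
  return 0;
}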
+
bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
HSAMD::Metadata HSAMetadata;
if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
@@ -86,14 +100,18 @@
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: AK = GK_GFX908; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1013: AK = GK_GFX1013; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030: AK = GK_GFX1030; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1031: AK = GK_GFX1031; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1032: AK = GK_GFX1032; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033: AK = GK_GFX1033; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034: AK = GK_GFX1034; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035: AK = GK_GFX1035; break;
case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
}
@@ -145,14 +163,18 @@
case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906;
case GK_GFX908: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX908;
case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
+ case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A;
case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C;
case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
+ case GK_GFX1013: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1013;
case GK_GFX1030: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030;
case GK_GFX1031: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1031;
case GK_GFX1032: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1032;
case GK_GFX1033: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033;
+ case GK_GFX1034: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034;
+ case GK_GFX1035: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035;
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
}
@@ -180,8 +202,8 @@
getPALMetadata()->reset();
}
-void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {
- OS << "\t.amdgcn_target \"" << Target << "\"\n";
+void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget() {
+ OS << "\t.amdgcn_target \"" << getTargetID()->toString() << "\"\n";
}
void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(
@@ -191,15 +213,14 @@
}
void
-AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
- uint32_t Minor,
- uint32_t Stepping,
- StringRef VendorName,
- StringRef ArchName) {
- OS << "\t.hsa_code_object_isa " <<
- Twine(Major) << "," << Twine(Minor) << "," << Twine(Stepping) <<
- ",\"" << VendorName << "\",\"" << ArchName << "\"\n";
-
+AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major,
+ uint32_t Minor,
+ uint32_t Stepping,
+ StringRef VendorName,
+ StringRef ArchName) {
+ convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny());
+ OS << "\t.hsa_code_object_isa " << Twine(Major) << "," << Twine(Minor) << ","
+ << Twine(Stepping) << ",\"" << VendorName << "\",\"" << ArchName << "\"\n";
}
void
@@ -225,8 +246,8 @@
<< Alignment.value() << '\n';
}
-bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) {
- OS << "\t.amd_amdgpu_isa \"" << IsaVersionString << "\"\n";
+bool AMDGPUTargetAsmStreamer::EmitISAVersion() {
+ OS << "\t.amd_amdgpu_isa \"" << getTargetID()->toString() << "\"\n";
return true;
}
@@ -258,17 +279,32 @@
return true;
}
-bool AMDGPUTargetAsmStreamer::EmitCodeEnd() {
+bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
const uint32_t Encoded_s_code_end = 0xbf9f0000;
- OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n';
- OS << "\t.fill 48, 4, " << Encoded_s_code_end << '\n';
+ const uint32_t Encoded_s_nop = 0xbf800000;
+ uint32_t Encoded_pad = Encoded_s_code_end;
+
+ // Instruction cache line size in bytes.
+ const unsigned Log2CacheLineSize = 6;
+ const unsigned CacheLineSize = 1u << Log2CacheLineSize;
+
+ // Extra padding amount in bytes to support prefetch mode 3.
+ unsigned FillSize = 3 * CacheLineSize;
+
+ if (AMDGPU::isGFX90A(STI)) {
+ Encoded_pad = Encoded_s_nop;
+ FillSize = 16 * CacheLineSize;
+ }
+
+ OS << "\t.p2alignl " << Log2CacheLineSize << ", " << Encoded_pad << '\n';
+ OS << "\t.fill " << (FillSize / 4) << ", 4, " << Encoded_pad << '\n';
return true;
}
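// Editor's note: a small arithmetic check of the padding emitted by
// EmitCodeEnd above. The directive aligns to a 64-byte instruction cache
// line and then fills whole cache lines with a padding opcode: 3 lines of
// s_code_end by default, 16 lines of s_nop on GFX90A. Nothing here is LLVM
// API; it only reproduces the size computation.
#include <cassert>

struct CodeEndPadding {
  unsigned FillDwords;  // count operand of the .fill directive (4-byte words)
  unsigned PadEncoding; // dword written into the padding
};

CodeEndPadding computePadding(bool IsGFX90A) {
  const unsigned CacheLineSize = 1u << 6; // 64 bytes
  const unsigned EncodedSCodeEnd = 0xbf9f0000;
  const unsigned EncodedSNop = 0xbf800000;
  unsigned FillBytes = IsGFX90A ? 16 * CacheLineSize : 3 * CacheLineSize;
  return {FillBytes / 4, IsGFX90A ? EncodedSNop : EncodedSCodeEnd};
}

int main() {
  assert(computePadding(false).FillDwords == 48);  // matches the old ".fill 48, 4, ..."
  assert(computePadding(true).FillDwords == 256);  // 1 KiB of s_nop on GFX90A
  return 0;
}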
void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR,
- bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) {
+ bool ReserveVCC, bool ReserveFlatScr) {
IsaVersion IVersion = getIsaVersion(STI.getCPU());
OS << "\t.amdhsa_kernel " << KernelName << '\n';
@@ -281,10 +317,13 @@
<< '\n';
OS << "\t\t.amdhsa_private_segment_fixed_size "
<< KD.private_segment_fixed_size << '\n';
+ OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n';
- PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
+ if (!hasArchitectedFlatScratch(STI))
+ PRINT_FIELD(
+ OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
@@ -297,9 +336,10 @@
PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
- PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+ if (!hasArchitectedFlatScratch(STI))
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
@@ -307,10 +347,12 @@
PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
- PRINT_FIELD(
- OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD,
- compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
+ PRINT_FIELD(OS,
+ (hasArchitectedFlatScratch(STI)
+ ? ".amdhsa_enable_private_segment"
+ : ".amdhsa_system_sgpr_private_segment_wavefront_offset"),
+ KD, compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD,
compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
@@ -331,12 +373,30 @@
OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n';
OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n';
+ if (AMDGPU::isGFX90A(STI))
+ OS << "\t\t.amdhsa_accum_offset " <<
+ (AMDHSA_BITS_GET(KD.compute_pgm_rsrc3,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4
+ << '\n';
+
if (!ReserveVCC)
OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
- if (IVersion.Major >= 7 && !ReserveFlatScr)
+ if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI))
OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
- if (IVersion.Major >= 8 && ReserveXNACK != hasXNACK(STI))
- OS << "\t\t.amdhsa_reserve_xnack_mask " << ReserveXNACK << '\n';
+
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) {
+ switch (*HsaAbiVer) {
+ default:
+ break;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ break;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ if (getTargetID()->isXnackSupported())
+ OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() << '\n';
+ break;
+ }
+ }
PRINT_FIELD(OS, ".amdhsa_float_round_mode_32", KD,
compute_pgm_rsrc1,
@@ -360,6 +420,10 @@
PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD,
compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
+ if (AMDGPU::isGFX90A(STI))
+ PRINT_FIELD(OS, ".amdhsa_tg_split", KD,
+ compute_pgm_rsrc3,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT);
if (IVersion.Major >= 10) {
PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD,
compute_pgm_rsrc1,
@@ -405,23 +469,7 @@
AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S,
const MCSubtargetInfo &STI)
- : AMDGPUTargetStreamer(S), Streamer(S), Os(STI.getTargetTriple().getOS()) {
- MCAssembler &MCA = getStreamer().getAssembler();
- unsigned EFlags = MCA.getELFHeaderEFlags();
-
- EFlags &= ~ELF::EF_AMDGPU_MACH;
- EFlags |= getElfMach(STI.getCPU());
-
- EFlags &= ~ELF::EF_AMDGPU_XNACK;
- if (AMDGPU::hasXNACK(STI))
- EFlags |= ELF::EF_AMDGPU_XNACK;
-
- EFlags &= ~ELF::EF_AMDGPU_SRAM_ECC;
- if (AMDGPU::hasSRAMECC(STI))
- EFlags |= ELF::EF_AMDGPU_SRAM_ECC;
-
- MCA.setELFHeaderEFlags(EFlags);
-}
+ : AMDGPUTargetStreamer(S), STI(STI), Streamer(S) {}
MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
@@ -431,6 +479,9 @@
// We use it for emitting the accumulated PAL metadata as a .note record.
// The PAL metadata is reset after it is emitted.
void AMDGPUTargetELFStreamer::finish() {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ MCA.setELFHeaderEFlags(getEFlags());
+
std::string Blob;
const char *Vendor = getPALMetadata()->getVendor();
unsigned Type = getPALMetadata()->getType();
@@ -456,7 +507,7 @@
unsigned NoteFlags = 0;
// TODO Apparently, this is currently needed for OpenCL as mentioned in
// https://reviews.llvm.org/D74995
- if (Os == Triple::AMDHSA)
+ if (STI.getTargetTriple().getOS() == Triple::AMDHSA)
NoteFlags = ELF::SHF_ALLOC;
S.PushSection();
@@ -472,24 +523,150 @@
S.PopSection();
}
-void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {}
+unsigned AMDGPUTargetELFStreamer::getEFlags() {
+ switch (STI.getTargetTriple().getArch()) {
+ default:
+ llvm_unreachable("Unsupported Arch");
+ case Triple::r600:
+ return getEFlagsR600();
+ case Triple::amdgcn:
+ return getEFlagsAMDGCN();
+ }
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsR600() {
+ assert(STI.getTargetTriple().getArch() == Triple::r600);
+
+ return getElfMach(STI.getCPU());
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsAMDGCN() {
+ assert(STI.getTargetTriple().getArch() == Triple::amdgcn);
+
+ switch (STI.getTargetTriple().getOS()) {
+ default:
+ // TODO: Why do some tests have "mingw" listed as the OS?
+ // llvm_unreachable("Unsupported OS");
+ case Triple::UnknownOS:
+ return getEFlagsUnknownOS();
+ case Triple::AMDHSA:
+ return getEFlagsAMDHSA();
+ case Triple::AMDPAL:
+ return getEFlagsAMDPAL();
+ case Triple::Mesa3D:
+ return getEFlagsMesa3D();
+ }
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsUnknownOS() {
+ // TODO: Why do some tests have "mingw" listed as the OS?
+ // assert(STI.getTargetTriple().getOS() == Triple::UnknownOS);
+
+ return getEFlagsV3();
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsAMDHSA() {
+ assert(STI.getTargetTriple().getOS() == Triple::AMDHSA);
+
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) {
+ switch (*HsaAbiVer) {
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ return getEFlagsV3();
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ return getEFlagsV4();
+ }
+ }
+
+ llvm_unreachable("HSA OS ABI Version identification must be defined");
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsAMDPAL() {
+ assert(STI.getTargetTriple().getOS() == Triple::AMDPAL);
+
+ return getEFlagsV3();
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsMesa3D() {
+ assert(STI.getTargetTriple().getOS() == Triple::Mesa3D);
+
+ return getEFlagsV3();
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsV3() {
+ unsigned EFlagsV3 = 0;
+
+ // mach.
+ EFlagsV3 |= getElfMach(STI.getCPU());
+
+ // xnack.
+ if (getTargetID()->isXnackOnOrAny())
+ EFlagsV3 |= ELF::EF_AMDGPU_FEATURE_XNACK_V3;
+ // sramecc.
+ if (getTargetID()->isSramEccOnOrAny())
+ EFlagsV3 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_V3;
+
+ return EFlagsV3;
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsV4() {
+ unsigned EFlagsV4 = 0;
+
+ // mach.
+ EFlagsV4 |= getElfMach(STI.getCPU());
+
+ // xnack.
+ switch (getTargetID()->getXnackSetting()) {
+ case AMDGPU::IsaInfo::TargetIDSetting::Unsupported:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4;
+ break;
+ case AMDGPU::IsaInfo::TargetIDSetting::Any:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_ANY_V4;
+ break;
+ case AMDGPU::IsaInfo::TargetIDSetting::Off:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_OFF_V4;
+ break;
+ case AMDGPU::IsaInfo::TargetIDSetting::On:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_ON_V4;
+ break;
+ }
+ // sramecc.
+ switch (getTargetID()->getSramEccSetting()) {
+ case AMDGPU::IsaInfo::TargetIDSetting::Unsupported:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4;
+ break;
+ case AMDGPU::IsaInfo::TargetIDSetting::Any:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_ANY_V4;
+ break;
+ case AMDGPU::IsaInfo::TargetIDSetting::Off:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_OFF_V4;
+ break;
+ case AMDGPU::IsaInfo::TargetIDSetting::On:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_ON_V4;
+ break;
+ }
+
+ return EFlagsV4;
+}
+
+void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget() {}
void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(
uint32_t Major, uint32_t Minor) {
EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()),
- ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) {
+ ELF::NT_AMD_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) {
OS.emitInt32(Major);
OS.emitInt32(Minor);
});
}
void
-AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
- uint32_t Minor,
- uint32_t Stepping,
- StringRef VendorName,
- StringRef ArchName) {
+AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major,
+ uint32_t Minor,
+ uint32_t Stepping,
+ StringRef VendorName,
+ StringRef ArchName) {
uint16_t VendorNameSize = VendorName.size() + 1;
uint16_t ArchNameSize = ArchName.size() + 1;
@@ -497,8 +674,9 @@
sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
VendorNameSize + ArchNameSize;
+ convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny());
EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()),
- ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) {
+ ELF::NT_AMD_HSA_ISA_VERSION, [&](MCELFStreamer &OS) {
OS.emitInt16(VendorNameSize);
OS.emitInt16(ArchNameSize);
OS.emitInt32(Major);
@@ -546,7 +724,7 @@
SymbolELF->setSize(MCConstantExpr::create(Size, getContext()));
}
-bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
+bool AMDGPUTargetELFStreamer::EmitISAVersion() {
// Create two labels to mark the beginning and end of the desc field
// and a MCExpr to calculate the size of the desc field.
auto &Context = getContext();
@@ -556,10 +734,10 @@
MCSymbolRefExpr::create(DescEnd, Context),
MCSymbolRefExpr::create(DescBegin, Context), Context);
- EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_ISA,
+ EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_HSA_ISA_NAME,
[&](MCELFStreamer &OS) {
OS.emitLabel(DescBegin);
- OS.emitBytes(IsaVersionString);
+ OS.emitBytes(getTargetID()->toString());
OS.emitLabel(DescEnd);
});
return true;
@@ -607,7 +785,7 @@
MCSymbolRefExpr::create(DescEnd, Context),
MCSymbolRefExpr::create(DescBegin, Context), Context);
- EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_HSA_METADATA,
+ EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_HSA_METADATA,
[&](MCELFStreamer &OS) {
OS.emitLabel(DescBegin);
OS.emitBytes(HSAMetadataString);
@@ -616,14 +794,28 @@
return true;
}
-bool AMDGPUTargetELFStreamer::EmitCodeEnd() {
+bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
const uint32_t Encoded_s_code_end = 0xbf9f0000;
+ const uint32_t Encoded_s_nop = 0xbf800000;
+ uint32_t Encoded_pad = Encoded_s_code_end;
+
+ // Instruction cache line size in bytes.
+ const unsigned Log2CacheLineSize = 6;
+ const unsigned CacheLineSize = 1u << Log2CacheLineSize;
+
+ // Extra padding amount in bytes to support prefetch mode 3.
+ unsigned FillSize = 3 * CacheLineSize;
+
+ if (AMDGPU::isGFX90A(STI)) {
+ Encoded_pad = Encoded_s_nop;
+ FillSize = 16 * CacheLineSize;
+ }
MCStreamer &OS = getStreamer();
OS.PushSection();
- OS.emitValueToAlignment(64, Encoded_s_code_end, 4);
- for (unsigned I = 0; I < 48; ++I)
- OS.emitInt32(Encoded_s_code_end);
+ OS.emitValueToAlignment(CacheLineSize, Encoded_pad, 4);
+ for (unsigned I = 0; I < FillSize; I += 4)
+ OS.emitInt32(Encoded_pad);
OS.PopSection();
return true;
}
@@ -631,8 +823,7 @@
void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
- bool ReserveXNACK) {
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {
auto &Streamer = getStreamer();
auto &Context = Streamer.getContext();
@@ -659,8 +850,11 @@
Streamer.emitLabel(KernelDescriptorSymbol);
Streamer.emitInt32(KernelDescriptor.group_segment_fixed_size);
Streamer.emitInt32(KernelDescriptor.private_segment_fixed_size);
+ Streamer.emitInt32(KernelDescriptor.kernarg_size);
+
for (uint8_t Res : KernelDescriptor.reserved0)
Streamer.emitInt8(Res);
+
// FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The
// expression being created is:
// (start of kernel code) - (start of kernel descriptor)
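
A minimal, self-contained C++ sketch of the padding arithmetic behind the EmitCodeEnd change above, assuming only the constants visible in the patch; codeEndPaddingWords is an illustrative helper, not an LLVM API.

#include <cstdint>
#include <vector>

// Sketch: number and value of the 32-bit words padded after kernel code.
// Pre-gfx90a targets fill 3 x 64-byte cache lines with s_code_end;
// gfx90a fills 16 cache lines with s_nop instead (prefetch-friendly padding).
static std::vector<uint32_t> codeEndPaddingWords(bool IsGFX90A) {
  const uint32_t EncodedSCodeEnd = 0xbf9f0000;
  const uint32_t EncodedSNop = 0xbf800000;
  const unsigned CacheLineSize = 1u << 6; // 64-byte instruction cache lines

  uint32_t Pad = IsGFX90A ? EncodedSNop : EncodedSCodeEnd;
  unsigned FillSize = (IsGFX90A ? 16 : 3) * CacheLineSize; // bytes

  std::vector<uint32_t> Words;
  for (unsigned I = 0; I < FillSize; I += 4) // one word per 4 bytes
    Words.push_back(Pad);
  return Words; // 48 words (192 bytes) or 256 words (1024 bytes)
}
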
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 1ad6453..cef34a5 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUPALMetadata.h"
#include "llvm/MC/MCStreamer.h"
@@ -23,6 +24,7 @@
class MDNode;
class Module;
class Type;
+class formatted_raw_ostream;
namespace AMDGPU {
namespace HSAMD {
@@ -38,6 +40,9 @@
AMDGPUPALMetadata PALMetadata;
protected:
+ // TODO: Move HSAMetadataStream to AMDGPUTargetStreamer.
+ Optional<AMDGPU::IsaInfo::AMDGPUTargetID> TargetID;
+
MCContext &getContext() const { return Streamer.getContext(); }
public:
@@ -45,15 +50,15 @@
AMDGPUPALMetadata *getPALMetadata() { return &PALMetadata; }
- virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0;
+ virtual void EmitDirectiveAMDGCNTarget() = 0;
virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) = 0;
- virtual void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor,
- uint32_t Stepping,
- StringRef VendorName,
- StringRef ArchName) = 0;
+ virtual void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor,
+ uint32_t Stepping,
+ StringRef VendorName,
+ StringRef ArchName) = 0;
virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0;
@@ -63,7 +68,7 @@
Align Alignment) = 0;
/// \returns True on success, false on failure.
- virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
+ virtual bool EmitISAVersion() = 0;
/// \returns True on success, false on failure.
virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString);
@@ -84,16 +89,32 @@
virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0;
/// \returns True on success, false on failure.
- virtual bool EmitCodeEnd() = 0;
+ virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) = 0;
virtual void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
- bool ReserveXNACK) = 0;
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) = 0;
static StringRef getArchNameFromElfMach(unsigned ElfMach);
static unsigned getElfMach(StringRef GPU);
+
+ const Optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() const {
+ return TargetID;
+ }
+ Optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() {
+ return TargetID;
+ }
+ void initializeTargetID(const MCSubtargetInfo &STI) {
+ assert(TargetID == None && "TargetID can only be initialized once");
+ TargetID.emplace(STI);
+ }
+ void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString) {
+ initializeTargetID(STI);
+
+ assert(getTargetID() != None && "TargetID is None");
+ getTargetID()->setTargetIDFromFeaturesString(FeatureString);
+ }
};
class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
@@ -103,14 +124,14 @@
void finish() override;
- void EmitDirectiveAMDGCNTarget(StringRef Target) override;
+ void EmitDirectiveAMDGCNTarget() override;
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) override;
- void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor,
- uint32_t Stepping, StringRef VendorName,
- StringRef ArchName) override;
+ void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor,
+ uint32_t Stepping, StringRef VendorName,
+ StringRef ArchName) override;
void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
@@ -119,7 +140,7 @@
void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
/// \returns True on success, false on failure.
- bool EmitISAVersion(StringRef IsaVersionString) override;
+ bool EmitISAVersion() override;
/// \returns True on success, false on failure.
bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override;
@@ -128,22 +149,34 @@
bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
/// \returns True on success, false on failure.
- bool EmitCodeEnd() override;
+ bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
- bool ReserveXNACK) override;
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override;
};
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
+ const MCSubtargetInfo &STI;
MCStreamer &Streamer;
- Triple::OSType Os;
void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType,
function_ref<void(MCELFStreamer &)> EmitDesc);
+ unsigned getEFlags();
+
+ unsigned getEFlagsR600();
+ unsigned getEFlagsAMDGCN();
+
+ unsigned getEFlagsUnknownOS();
+ unsigned getEFlagsAMDHSA();
+ unsigned getEFlagsAMDPAL();
+ unsigned getEFlagsMesa3D();
+
+ unsigned getEFlagsV3();
+ unsigned getEFlagsV4();
+
public:
AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
@@ -151,14 +184,14 @@
void finish() override;
- void EmitDirectiveAMDGCNTarget(StringRef Target) override;
+ void EmitDirectiveAMDGCNTarget() override;
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) override;
- void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor,
- uint32_t Stepping, StringRef VendorName,
- StringRef ArchName) override;
+ void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor,
+ uint32_t Stepping, StringRef VendorName,
+ StringRef ArchName) override;
void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
@@ -167,7 +200,7 @@
void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
/// \returns True on success, false on failure.
- bool EmitISAVersion(StringRef IsaVersionString) override;
+ bool EmitISAVersion() override;
/// \returns True on success, false on failure.
bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override;
@@ -176,13 +209,12 @@
bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
/// \returns True on success, false on failure.
- bool EmitCodeEnd() override;
+ bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
- bool ReserveXNACK) override;
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override;
};
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 1a1ffcd..dbce4b2 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -71,6 +71,9 @@
unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
+
+private:
+ uint64_t getImplicitOpSelHiEncoding(int Opcode) const;
};
} // end anonymous namespace
@@ -219,7 +222,7 @@
Imm = C->getValue();
} else {
- assert(!MO.isFPImm());
+ assert(!MO.isDFPImm());
if (!MO.isImm())
return ~0;
@@ -234,12 +237,17 @@
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -274,16 +282,40 @@
}
}
+uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const {
+ using namespace AMDGPU::VOP3PEncoding;
+ using namespace AMDGPU::OpName;
+
+ if (AMDGPU::getNamedOperandIdx(Opcode, op_sel_hi) != -1) {
+ if (AMDGPU::getNamedOperandIdx(Opcode, src2) != -1)
+ return 0;
+ if (AMDGPU::getNamedOperandIdx(Opcode, src1) != -1)
+ return OP_SEL_HI_2;
+ if (AMDGPU::getNamedOperandIdx(Opcode, src0) != -1)
+ return OP_SEL_HI_1 | OP_SEL_HI_2;
+ }
+ return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2;
+}
+
void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
verifyInstructionPredicates(MI,
computeAvailableFeatures(STI.getFeatureBits()));
+ int Opcode = MI.getOpcode();
uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI);
- const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ const MCInstrDesc &Desc = MCII.get(Opcode);
unsigned bytes = Desc.getSize();
+ // Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions.
+ // Note that accvgpr_read/write are MAI, have src0, but do not use op_sel.
+ if ((Desc.TSFlags & SIInstrFlags::VOP3P) ||
+ Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi ||
+ Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) {
+ Encoding |= getImplicitOpSelHiEncoding(Opcode);
+ }
+
for (unsigned i = 0; i < bytes; i++) {
OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
}
@@ -431,6 +463,7 @@
MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
Enc |= 512;
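
A standalone sketch of the implicit op_sel_hi defaulting rule added to SIMCCodeEmitter above; the booleans stand in for the AMDGPU::getNamedOperandIdx presence checks, and the bit constants are illustrative placeholders for the AMDGPU::VOP3PEncoding values.

#include <cstdint>

// Illustrative bit masks; the real values come from AMDGPU::VOP3PEncoding.
constexpr uint64_t OpSelHi0 = 1ull << 59;
constexpr uint64_t OpSelHi1 = 1ull << 60;
constexpr uint64_t OpSelHi2 = 1ull << 14;

// Sketch: op_sel_hi bits for source operands the instruction does not have
// default to 1; bits for operands that are present are left to the explicit
// operand encoding.
static uint64_t implicitOpSelHi(bool HasOpSelHi, bool HasSrc0, bool HasSrc1,
                                bool HasSrc2) {
  if (HasOpSelHi) {
    if (HasSrc2)
      return 0;                   // src0..src2 all present: nothing implicit
    if (HasSrc1)
      return OpSelHi2;            // src2 missing
    if (HasSrc0)
      return OpSelHi1 | OpSelHi2; // src1 and src2 missing
  }
  return OpSelHi0 | OpSelHi1 | OpSelHi2; // no op_sel_hi operand at all
}
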
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 54c8cdf..bacb790 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -11,12 +11,14 @@
//
// - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8)
// - MIMGEncGfx8: encoding introduced with gfx8 for atomics
-// - MIMGEncGfx10Default: gfx default (non-NSA) encoding
+// - MIMGEncGfx90a: encoding for gfx90a for atomics
+// - MIMGEncGfx10Default: gfx10 default (non-NSA) encoding
// - MIMGEncGfx10NSA: gfx10 NSA encoding
class MIMGEncoding;
def MIMGEncGfx6 : MIMGEncoding;
def MIMGEncGfx8 : MIMGEncoding;
+def MIMGEncGfx90a : MIMGEncoding;
def MIMGEncGfx10Default : MIMGEncoding;
def MIMGEncGfx10NSA : MIMGEncoding;
@@ -39,6 +41,8 @@
bit Coordinates = 1;
bit LodOrClampOrMip = 0;
bit HasD16 = 0;
+ bit IsAtomicRet = 0;
+ bit MSAA = 0;
}
def MIMGBaseOpcode : GenericEnum {
@@ -50,7 +54,7 @@
let CppTypeName = "MIMGBaseOpcodeInfo";
let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
"Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
- "LodOrClampOrMip", "HasD16"];
+ "LodOrClampOrMip", "HasD16", "MSAA"];
string TypeOf_BaseOpcode = "MIMGBaseOpcode";
let PrimaryKey = ["BaseOpcode"];
@@ -64,7 +68,7 @@
def MIMGDimInfoTable : GenericTable {
let FilterClass = "AMDGPUDimProps";
let CppTypeName = "MIMGDimInfo";
- let Fields = ["Dim", "NumCoords", "NumGradients", "DA", "Encoding", "AsmSuffix"];
+ let Fields = ["Dim", "NumCoords", "NumGradients", "MSAA", "DA", "Encoding", "AsmSuffix"];
string TypeOf_Dim = "MIMGDim";
let PrimaryKey = ["Dim"];
@@ -81,9 +85,17 @@
let Key = ["AsmSuffix"];
}
-class mimg <bits<8> si_gfx10, bits<8> vi = si_gfx10> {
- field bits<8> SI_GFX10 = si_gfx10;
- field bits<8> VI = vi;
+def MIMG {
+ int NOP = -1;
+}
+
+class mimgopc <int base, int vi = base, int si = base> {
+ field bits<8> BASE = base; // Opcode for all but atomics
+ field bits<8> VI = vi; // VI is only used for atomic instructions
+ field bits<8> SI = si; // SI is only used for atomic instructions
+ bit HAS_BASE = !ne(base, MIMG.NOP);
+ bit HAS_VI = !ne(vi, MIMG.NOP);
+ bit HAS_SI = !ne(si, MIMG.NOP);
}
class MIMGLZMapping<MIMGBaseOpcode l, MIMGBaseOpcode lz> {
@@ -198,14 +210,24 @@
// Base class of all pre-gfx10 MIMG instructions.
class MIMG_gfx6789<bits<8> op, dag outs, string dns = "">
: MIMG<outs, dns>, MIMGe_gfx6789<op> {
- let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
- let AssemblerPredicate = isGFX6GFX7GFX8GFX9;
+ let SubtargetPredicate = isGFX6GFX7GFX8GFX9NotGFX90A;
+ let AssemblerPredicate = isGFX6GFX7GFX8GFX9NotGFX90A;
let MIMGEncoding = MIMGEncGfx6;
let d16 = !if(BaseOpcode.HasD16, ?, 0);
}
+class MIMG_gfx90a<bits<8> op, dag outs, string dns = "">
+ : MIMG<outs, dns>, MIMGe_gfx90a<op> {
+ let SubtargetPredicate = isGFX90APlus;
+ let AssemblerPredicate = isGFX90APlus;
+
+ let MIMGEncoding = MIMGEncGfx90a;
+
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+}
+
// Base class of all non-NSA gfx10 MIMG instructions.
class MIMG_gfx10<int op, dag outs, string dns = "">
: MIMG<outs, dns>, MIMGe_gfx10<op> {
@@ -218,8 +240,8 @@
let nsa = 0;
}
-// Base class for all NSA MIMG instructions. Note that 1-dword addresses always
-// use non-NSA variants.
+// Base class for all NSA MIMG instructions.
+// Note that 1-dword addresses always use non-NSA variants.
class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns="">
: MIMG<outs, dns>, MIMGe_gfx10<op> {
let SubtargetPredicate = isGFX10Plus;
@@ -235,169 +257,229 @@
let nsa = nsah.NSA;
}
-class MIMG_NoSampler_Helper <bits<8> op, string asm,
+class MIMG_NoSampler_Helper <mimgopc op, string asm,
RegisterClass dst_rc,
RegisterClass addr_rc,
string dns="">
- : MIMG_gfx6789 <op, (outs dst_rc:$vdata), dns> {
+ : MIMG_gfx6789 <op.BASE, (outs dst_rc:$vdata), dns> {
let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
- DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_NoSampler_gfx10<int op, string opcode,
+class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm,
+ RegisterClass dst_rc,
+ RegisterClass addr_rc,
+ string dns="">
+ : MIMG_gfx90a <op.BASE, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
+ let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_NoSampler_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, RegisterClass AddrRC,
string dns="">
- : MIMG_gfx10<op, (outs DataRC:$vdata), dns> {
+ : MIMG_gfx10<op.BASE, (outs DataRC:$vdata), dns> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask,
- Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_NoSampler_nsa_gfx10<int op, string opcode,
+class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
string dns="">
- : MIMG_nsa_gfx10<op, (outs DataRC:$vdata), num_addrs, dns> {
+ : MIMG_nsa_gfx10<op.BASE, (outs DataRC:$vdata), num_addrs, dns> {
let InOperandList = !con(AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
- Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-multiclass MIMG_NoSampler_Src_Helper <bits<8> op, string asm,
+multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
RegisterClass dst_rc,
- bit enableDisasm> {
+ bit enableDisasm,
+ bit ExtendedImageInst = 1> {
let ssamp = 0 in {
let VAddrDwords = 1 in {
- def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
- def _V1_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
+ if op.HAS_BASE then {
+ def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ if !not(ExtendedImageInst) then
+ def _V1_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "GFX90A", "")>;
+ def _V1_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
let VAddrDwords = 2 in {
- def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
- def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>;
- def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>;
+ if op.HAS_BASE then {
+ def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
+ if !not(ExtendedImageInst) then
+ def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_64>;
+ def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>;
+ def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>;
+ }
}
let VAddrDwords = 3 in {
- def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
- def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>;
- def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>;
+ if op.HAS_BASE then {
+ def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
+ if !not(ExtendedImageInst) then
+ def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_96>;
+ def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>;
+ def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>;
+ }
}
let VAddrDwords = 4 in {
- def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
- def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>;
- def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4,
- !if(enableDisasm, "AMDGPU", "")>;
+ if op.HAS_BASE then {
+ def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
+ if !not(ExtendedImageInst) then
+ def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_128>;
+ def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>;
+ def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
}
}
-multiclass MIMG_NoSampler <bits<8> op, string asm, bit has_d16, bit mip = 0,
- bit isResInfo = 0> {
+multiclass MIMG_NoSampler <mimgopc op, string asm, bit has_d16, bit mip = 0,
+ bit isResInfo = 0,
+ bit msaa = 0> {
def "" : MIMGBaseOpcode {
let Coordinates = !not(isResInfo);
let LodOrClampOrMip = mip;
let HasD16 = has_d16;
+ let MSAA = msaa;
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
mayLoad = !not(isResInfo) in {
let VDataDwords = 1 in
- defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
+ defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1, msaa>;
let VDataDwords = 2 in
- defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0>;
+ defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0, msaa>;
let VDataDwords = 3 in
- defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
+ defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0, msaa>;
let VDataDwords = 4 in
- defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
+ defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0, msaa>;
let VDataDwords = 5 in
- defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0>;
+ defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0, msaa>;
}
}
-class MIMG_Store_Helper <bits<8> op, string asm,
+class MIMG_Store_Helper <mimgopc op, string asm,
RegisterClass data_rc,
RegisterClass addr_rc,
string dns = "">
- : MIMG_gfx6789<op, (outs), dns> {
+ : MIMG_gfx6789<op.BASE, (outs), dns> {
let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
- DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_Store_gfx10<int op, string opcode,
+class MIMG_Store_Helper_gfx90a <mimgopc op, string asm,
+ RegisterClass data_rc,
+ RegisterClass addr_rc,
+ string dns = "">
+ : MIMG_gfx90a<op.BASE, (outs), dns> {
+ let InOperandList = !con((ins getLdStRegisterOperand<data_rc>.ret:$vdata,
+ addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Store_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, RegisterClass AddrRC,
string dns="">
- : MIMG_gfx10<op, (outs), dns> {
+ : MIMG_gfx10<op.BASE, (outs), dns> {
let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
- DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
- GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_Store_nsa_gfx10<int op, string opcode,
+class MIMG_Store_nsa_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
string dns="">
- : MIMG_nsa_gfx10<op, (outs), num_addrs, dns> {
+ : MIMG_nsa_gfx10<op.BASE, (outs), num_addrs, dns> {
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
- Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-multiclass MIMG_Store_Addr_Helper <int op, string asm,
+multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
RegisterClass data_rc,
bit enableDisasm> {
let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0,
DisableWQM = 1, ssamp = 0 in {
let VAddrDwords = 1 in {
- def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
- def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
+ if op.HAS_BASE then {
+ def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ def _V1_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "GFX90A", "")>;
+ def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
let VAddrDwords = 2 in {
- def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
- def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>;
- def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>;
+ if op.HAS_BASE then {
+ def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
+ def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>;
+ def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>;
+ def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>;
+ }
}
let VAddrDwords = 3 in {
- def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
- def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>;
- def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>;
+ if op.HAS_BASE then {
+ def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
+ def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>;
+ def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>;
+ def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>;
+ }
}
let VAddrDwords = 4 in {
- def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
- def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>;
- def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4,
- !if(enableDisasm, "AMDGPU", "")>;
+ if op.HAS_BASE then {
+ def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
+ def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>;
+ def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>;
+ def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
}
}
-multiclass MIMG_Store <bits<8> op, string asm, bit has_d16, bit mip = 0> {
+multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> {
def "" : MIMGBaseOpcode {
let Store = 1;
let LodOrClampOrMip = mip;
@@ -425,43 +507,63 @@
let AsmMatchConverter = "cvtMIMGAtomic";
let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
- DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
- let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da";
+ let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da";
}
-class MIMG_Atomic_si<mimg op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, string dns="">
+ : MIMG_gfx90a <op, (outs getLdStRegisterOperand<data_rc>.ret:$vdst), dns> {
+ let Constraints = "$vdst = $vdata";
+ let AsmMatchConverter = "cvtMIMGAtomic";
+
+ let InOperandList = (ins getLdStRegisterOperand<data_rc>.ret:$vdata,
+ addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, LWE:$lwe, DA:$da);
+ let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da";
+}
+
+class MIMG_Atomic_si<mimgopc op, string asm, RegisterClass data_rc,
RegisterClass addr_rc, bit enableDasm = 0>
- : MIMG_Atomic_gfx6789_base<op.SI_GFX10, asm, data_rc, addr_rc,
+ : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc,
!if(enableDasm, "GFX6GFX7", "")> {
let AssemblerPredicate = isGFX6GFX7;
}
-class MIMG_Atomic_vi<mimg op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_vi<mimgopc op, string asm, RegisterClass data_rc,
RegisterClass addr_rc, bit enableDasm = 0>
: MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> {
- let AssemblerPredicate = isGFX8GFX9;
+ let AssemblerPredicate = isGFX8GFX9NotGFX90A;
let MIMGEncoding = MIMGEncGfx8;
}
-class MIMG_Atomic_gfx10<mimg op, string opcode,
+class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, bit enableDasm = 0>
+ : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> {
+ let AssemblerPredicate = isGFX90APlus;
+ let MIMGEncoding = MIMGEncGfx90a;
+}
+
+class MIMG_Atomic_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, RegisterClass AddrRC,
bit enableDisasm = 0>
- : MIMG_gfx10<!cast<int>(op.SI_GFX10), (outs DataRC:$vdst),
+ : MIMG_gfx10<!cast<int>(op.BASE), (outs DataRC:$vdst),
!if(enableDisasm, "AMDGPU", "")> {
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
- DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
- GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe);
- let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe";
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe);
+ let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
}
-class MIMG_Atomic_nsa_gfx10<mimg op, string opcode,
+class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
bit enableDisasm = 0>
- : MIMG_nsa_gfx10<!cast<int>(op.SI_GFX10), (outs DataRC:$vdst), num_addrs,
+ : MIMG_nsa_gfx10<!cast<int>(op.BASE), (outs DataRC:$vdst), num_addrs,
!if(enableDisasm, "AMDGPU", "")> {
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
@@ -469,95 +571,137 @@
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
- Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe));
- let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe";
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
}
-multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm,
+multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
RegisterClass data_rc,
- bit enableDasm = 0> {
+ bit enableDasm = 0,
+ bit isFP = 0> {
let hasSideEffects = 1, // FIXME: remove this
mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1,
- ssamp = 0 in {
+ ssamp = 0, FPAtomic = isFP in {
let VAddrDwords = 1 in {
- def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>;
- def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
- def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
+ if op.HAS_SI then {
+ def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
+ if op.HAS_VI then {
+ def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
+ if op.HAS_BASE then {
+ def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
}
let VAddrDwords = 2 in {
- def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>;
- def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
- def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
- def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>;
+ if op.HAS_SI then {
+ def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>;
+ }
+ if op.HAS_VI then {
+ def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
+ def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>;
+ }
+ if op.HAS_BASE then {
+ def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
+ def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>;
+ }
}
let VAddrDwords = 3 in {
- def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>;
- def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
- def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
- def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>;
+ if op.HAS_SI then {
+ def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>;
+ }
+ if op.HAS_VI then {
+ def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
+ def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>;
+ }
+ if op.HAS_BASE then {
+ def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
+ def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>;
+ }
}
let VAddrDwords = 4 in {
- def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>;
- def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
- def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
- def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
+ if op.HAS_SI then {
+ def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>;
+ }
+ if op.HAS_VI then {
+ def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
+ def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>;
+ }
+ if op.HAS_BASE then {
+ def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
+ def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
+ }
}
}
}
-multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atomics
- def "" : MIMGBaseOpcode {
- let Atomic = 1;
- let AtomicX2 = isCmpSwap;
- }
+multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0> { // 64-bit atomics
+ let IsAtomicRet = 1 in {
+ def "" : MIMGBaseOpcode {
+ let Atomic = 1;
+ let AtomicX2 = isCmpSwap;
+ }
- let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
- // _V* variants have different dst size, but the size is encoded implicitly,
- // using dmask and tfe. Only 32-bit variant is registered with disassembler.
- // Other variants are reconstructed by disassembler using dmask and tfe.
- let VDataDwords = !if(isCmpSwap, 2, 1) in
- defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1>;
- let VDataDwords = !if(isCmpSwap, 4, 2) in
- defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64)>;
- }
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
+ // _V* variants have different dst size, but the size is encoded implicitly,
+ // using dmask and tfe. Only 32-bit variant is registered with disassembler.
+ // Other variants are reconstructed by disassembler using dmask and tfe.
+ let VDataDwords = !if(isCmpSwap, 2, 1) in
+ defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1, isFP>;
+ let VDataDwords = !if(isCmpSwap, 4, 2) in
+ defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64), 0, isFP>;
+ }
+ } // End IsAtomicRet = 1
}
-class MIMG_Sampler_Helper <bits<8> op, string asm, RegisterClass dst_rc,
+class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
RegisterClass src_rc, string dns="">
- : MIMG_gfx6789 <op, (outs dst_rc:$vdata), dns> {
+ : MIMG_gfx6789 <op.BASE, (outs dst_rc:$vdata), dns> {
let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
- DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$cpol$r128$tfe$lwe$da"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_Sampler_gfx10<int op, string opcode,
+class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc,
+ RegisterClass src_rc, string dns="">
+ : MIMG_gfx90a<op.BASE, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
+ let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$cpol$r128$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Sampler_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, RegisterClass AddrRC,
string dns="">
- : MIMG_gfx10<op, (outs DataRC:$vdata), dns> {
+ : MIMG_gfx10<op.BASE, (outs DataRC:$vdata), dns> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp,
- DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
- GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm"
- #"$dlc$glc$slc$r128$a16$tfe$lwe"
+ #"$cpol$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_Sampler_nsa_gfx10<int op, string opcode,
+class MIMG_Sampler_nsa_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
string dns="">
- : MIMG_nsa_gfx10<op, (outs DataRC:$vdata), num_addrs, dns> {
+ : MIMG_nsa_gfx10<op.BASE, (outs DataRC:$vdata), num_addrs, dns> {
let InOperandList = !con(AddrIns,
(ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask,
- Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm"
- #"$dlc$glc$slc$r128$a16$tfe$lwe"
+ #"$cpol$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -569,8 +713,11 @@
!if(!eq(NumWords, 2), VReg_64,
!if(!eq(NumWords, 3), VReg_96,
!if(!eq(NumWords, 4), VReg_128,
+ !if(!eq(NumWords, 5), VReg_160,
+ !if(!eq(NumWords, 6), VReg_192,
+ !if(!eq(NumWords, 7), VReg_224,
!if(!le(NumWords, 8), VReg_256,
- !if(!le(NumWords, 16), VReg_512, ?)))))));
+ !if(!le(NumWords, 16), VReg_512, ?))))))))));
// Whether the instruction variant with this vaddr size should be enabled for
// the auto-generated disassembler.
@@ -588,9 +735,9 @@
bit ret = !foldl(0, lst, lhs, y, !or(lhs, !and(!le(min, y), !le(y, max))));
}
-class MIMGAddrSizes_tmp<list<MIMGAddrSize> lst, int min> {
- list<MIMGAddrSize> List = lst;
- int Min = min;
+class MIMGAddrSizes_dw_range<list<int> range> {
+ int Min = !head(range);
+ int Max = !if(!empty(!tail(range)), Min, !head(!tail(range)));
}
class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> {
@@ -600,8 +747,8 @@
list<int> AllNumAddrWords =
!foreach(dw, !if(sample.Gradients,
!if(!eq(sample.LodOrClamp, ""),
- [2, 3, 4, 5, 6, 7, 9],
- [2, 3, 4, 5, 7, 8, 10]),
+ [2, 3, 4, 5, 6, 7, 8, 9],
+ [2, 3, 4, 5, 6, 7, 8, 9, 10]),
!if(!eq(sample.LodOrClamp, ""),
[1, 2, 3],
[1, 2, 3, 4])),
@@ -611,12 +758,17 @@
// required numbers of address words. The disassembler defaults to the
// smallest register class.
list<MIMGAddrSize> MachineInstrs =
- !foldl(MIMGAddrSizes_tmp<[], 0>, [1, 2, 3, 4, 8, 16], lhs, dw,
- !if(isRangeInList<lhs.Min, dw, AllNumAddrWords>.ret,
- MIMGAddrSizes_tmp<
- !listconcat(lhs.List, [MIMGAddrSize<dw, !empty(lhs.List)>]),
- !if(!eq(dw, 3), 3, !add(dw, 1))>, // we still need _V4 for codegen w/ 3 dwords
- lhs)).List;
+ !foldl([]<MIMGAddrSize>,
+ !foreach(range,
+ // V4 is generated for V3 and V4
+ // V8 is generated for V5 through V8
+ // V16 is generated for V9 through V16
+ [[1],[2],[3],[3,4],[5],[6],[7],[5,8],[9,16]],
+ MIMGAddrSizes_dw_range<range>),
+ lhs, dw,
+ !if(isRangeInList<dw.Min, dw.Max, AllNumAddrWords>.ret,
+ !listconcat(lhs, [MIMGAddrSize<dw.Max, !empty(lhs)>]),
+ lhs));
// For NSA, generate machine instructions for all possible numbers of words
// except 1 (which is already covered by the non-NSA case).
@@ -632,25 +784,34 @@
lhs))));
}
-multiclass MIMG_Sampler_Src_Helper <bits<8> op, string asm,
+multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
AMDGPUSampleVariant sample, RegisterClass dst_rc,
- bit enableDisasm = 0> {
+ bit enableDisasm = 0,
+ bit ExtendedImageInst = 1> {
foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in {
let VAddrDwords = addr.NumWords in {
- def _V # addr.NumWords
- : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
- !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
- def _V # addr.NumWords # _gfx10
- : MIMG_Sampler_gfx10 <op, asm, dst_rc, addr.RegClass,
- !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ if op.HAS_BASE then {
+ def _V # addr.NumWords
+ : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ if !not(ExtendedImageInst) then
+ def _V # addr.NumWords # _gfx90a
+ : MIMG_Sampler_gfx90a <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "GFX90A", "")>;
+ def _V # addr.NumWords # _gfx10
+ : MIMG_Sampler_gfx10 <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
}
}
foreach addr = MIMG_Sampler_AddrSizes<sample>.NSAInstrs in {
let VAddrDwords = addr.NumWords in {
- def _V # addr.NumWords # _nsa_gfx10
- : MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords,
- !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ if op.HAS_BASE then {
+ def _V # addr.NumWords # _nsa_gfx10
+ : MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
}
}
}
@@ -663,9 +824,10 @@
let LodOrClampOrMip = !ne(sample.LodOrClamp, "");
}
-multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
+multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
bit isG16 = 0, bit isGetLod = 0,
- string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", "")> {
+ string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", ""),
+ bit ExtendedImageInst = !ne(sample.LowerCaseMod, "")> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
let HasD16 = !not(isGetLod);
let G16 = isG16;
@@ -674,22 +836,22 @@
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
mayLoad = !not(isGetLod) in {
let VDataDwords = 1 in
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1>;
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1, ExtendedImageInst>;
let VDataDwords = 2 in
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>;
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, 0, ExtendedImageInst>;
let VDataDwords = 3 in
- defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96, 0, ExtendedImageInst>;
let VDataDwords = 4 in
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 0, ExtendedImageInst>;
let VDataDwords = 5 in
- defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
+ defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160, 0, ExtendedImageInst>;
}
}
-multiclass MIMG_Sampler_WQM <bits<8> op, AMDGPUSampleVariant sample>
+multiclass MIMG_Sampler_WQM <mimgopc op, AMDGPUSampleVariant sample>
: MIMG_Sampler<op, sample, 1>;
-multiclass MIMG_Gather <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
+multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
string asm = "image_gather4"#sample.LowerCaseMod> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
let HasD16 = 1;
@@ -697,7 +859,7 @@
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
- Gather4 = 1, hasPostISelHook = 0 in {
+ Gather4 = 1 in {
let VDataDwords = 2 in
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
let VDataDwords = 4 in
@@ -707,11 +869,11 @@
}
}
-multiclass MIMG_Gather_WQM <bits<8> op, AMDGPUSampleVariant sample>
+multiclass MIMG_Gather_WQM <mimgopc op, AMDGPUSampleVariant sample>
: MIMG_Gather<op, sample, 1>;
-class MIMG_IntersectRay_gfx10<int op, string opcode, RegisterClass AddrRC, bit A16>
- : MIMG_gfx10<op, (outs VReg_128:$vdata), "AMDGPU"> {
+class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC, bit A16>
+ : MIMG_gfx10<op.BASE, (outs VReg_128:$vdata), "AMDGPU"> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc),
!if(A16, (ins GFX10A16:$a16), (ins)));
@@ -720,25 +882,23 @@
let nsa = 0;
}
-class MIMG_IntersectRay_nsa_gfx10<int op, string opcode, int num_addrs, bit A16>
- : MIMG_nsa_gfx10<op, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> {
+class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs, bit A16>
+ : MIMG_nsa_gfx10<op.BASE, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> {
let InOperandList = !con(nsah.AddrIns,
(ins SReg_128:$srsrc),
!if(A16, (ins GFX10A16:$a16), (ins)));
let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(A16, "$a16", "");
}
-multiclass MIMG_IntersectRay<int op, string opcode, int num_addrs, bit A16> {
+multiclass MIMG_IntersectRay<mimgopc op, string opcode, int num_addrs, bit A16> {
def "" : MIMGBaseOpcode;
- let SubtargetPredicate = HasGFX10_BEncoding,
- AssemblerPredicate = HasGFX10_BEncoding,
+ let SubtargetPredicate = HasGFX10_AEncoding,
+ AssemblerPredicate = HasGFX10_AEncoding,
AsmMatchConverter = !if(A16, "cvtIntersectRay", ""),
dmask = 0xf,
unorm = 1,
d16 = 0,
- glc = 0,
- slc = 0,
- dlc = 0,
+ cpol = 0,
tfe = 0,
lwe = 0,
r128 = 1,
@@ -762,131 +922,133 @@
//===----------------------------------------------------------------------===//
// MIMG Instructions
//===----------------------------------------------------------------------===//
-defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load", 1>;
-defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip", 1, 1>;
-defm IMAGE_LOAD_PCK : MIMG_NoSampler <0x00000002, "image_load_pck", 0>;
-defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <0x00000003, "image_load_pck_sgn", 0>;
-defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <0x00000004, "image_load_mip_pck", 0, 1>;
-defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <0x00000005, "image_load_mip_pck_sgn", 0, 1>;
-defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store", 1>;
-defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip", 1, 1>;
-defm IMAGE_STORE_PCK : MIMG_Store <0x0000000a, "image_store_pck", 0>;
-defm IMAGE_STORE_MIP_PCK : MIMG_Store <0x0000000b, "image_store_mip_pck", 0, 1>;
+defm IMAGE_LOAD : MIMG_NoSampler <mimgopc<0x00>, "image_load", 1>;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <mimgopc<0x01>, "image_load_mip", 1, 1>;
+defm IMAGE_LOAD_PCK : MIMG_NoSampler <mimgopc<0x02>, "image_load_pck", 0>;
+defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <mimgopc<0x03>, "image_load_pck_sgn", 0>;
+defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <mimgopc<0x04>, "image_load_mip_pck", 0, 1>;
+defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <mimgopc<0x05>, "image_load_mip_pck_sgn", 0, 1>;
+defm IMAGE_STORE : MIMG_Store <mimgopc<0x08>, "image_store", 1>;
+defm IMAGE_STORE_MIP : MIMG_Store <mimgopc<0x09>, "image_store_mip", 1, 1>;
+defm IMAGE_STORE_PCK : MIMG_Store <mimgopc<0x0a>, "image_store_pck", 0>;
+defm IMAGE_STORE_MIP_PCK : MIMG_Store <mimgopc<0x0b>, "image_store_mip_pck", 0, 1>;
-defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo", 0, 1, 1>;
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <mimgopc<0x0e>, "image_get_resinfo", 0, 1, 1>;
-defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
-defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", 1>;
-defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
-defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">;
-//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI
-defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimg<0x14>, "image_atomic_smin">;
-defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimg<0x15>, "image_atomic_umin">;
-defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimg<0x16>, "image_atomic_smax">;
-defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimg<0x17>, "image_atomic_umax">;
-defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimg<0x18>, "image_atomic_and">;
-defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
-defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
-defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
-defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
-//let FPAtomic = 1 in {
-//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d, 1>; -- not on VI
-//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
-//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
-//} // End let FPAtomic = 1
-defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>;
-defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>;
-defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>;
-defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>;
-defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <0x000000a2, AMDGPUSample_d, 0, 1>;
-defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <0x000000a3, AMDGPUSample_d_cl, 0, 1>;
-defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>;
-defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>;
-defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>;
-defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>;
-defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>;
-defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>;
-defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <0x000000aa, AMDGPUSample_c_d, 0, 1>;
-defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <0x000000ab, AMDGPUSample_c_d_cl, 0, 1>;
-defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>;
-defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>;
-defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>;
-defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>;
-defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>;
-defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <0x000000b2, AMDGPUSample_d_o, 0, 1>;
-defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <0x000000b3, AMDGPUSample_d_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>;
-defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>;
-defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>;
-defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>;
-defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <0x000000ba, AMDGPUSample_c_d_o, 0, 1>;
-defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <0x000000bb, AMDGPUSample_c_d_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>;
-defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>;
-defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>;
-defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>;
-defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>;
-defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>;
-defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>;
-defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>;
-defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>;
-defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>;
-defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>;
-defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>;
-defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>;
-defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>;
-defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>;
-defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>;
-defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>;
-defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>;
-defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>;
+defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimgopc<0x0f, 0x10, 0x0f>, "image_atomic_swap">;
+defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimgopc<0x10, 0x11, 0x10>, "image_atomic_cmpswap", 1>;
+defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimgopc<0x11, 0x12, 0x11>, "image_atomic_add">;
+defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimgopc<0x12, 0x13, 0x12>, "image_atomic_sub">;
+defm IMAGE_ATOMIC_RSUB : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x13>, "image_atomic_rsub">;
+defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimgopc<0x14>, "image_atomic_smin">;
+defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimgopc<0x15>, "image_atomic_umin">;
+defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimgopc<0x16>, "image_atomic_smax">;
+defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimgopc<0x17>, "image_atomic_umax">;
+defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimgopc<0x18>, "image_atomic_and">;
+defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimgopc<0x19>, "image_atomic_or">;
+defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimgopc<0x1a>, "image_atomic_xor">;
+defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimgopc<0x1b>, "image_atomic_inc">;
+defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimgopc<0x1c>, "image_atomic_dec">;
+defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 0, 1>;
+defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>;
+defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>;
-defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 0, 1, "image_get_lod">;
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x20>, AMDGPUSample>;
+let OtherPredicates = [HasExtendedImageInsts] in {
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x21>, AMDGPUSample_cl>;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x22>, AMDGPUSample_d>;
+defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x23>, AMDGPUSample_d_cl>;
+defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0xa2>, AMDGPUSample_d, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0xa3>, AMDGPUSample_d_cl, 0, 1>;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x24>, AMDGPUSample_l>;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x25>, AMDGPUSample_b>;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x26>, AMDGPUSample_b_cl>;
+defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x27>, AMDGPUSample_lz>;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <mimgopc<0x28>, AMDGPUSample_c>;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x29>, AMDGPUSample_c_cl>;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x2a>, AMDGPUSample_c_d>;
+defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <mimgopc<0x2b>, AMDGPUSample_c_d_cl>;
+defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0xaa>, AMDGPUSample_c_d, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0xab>, AMDGPUSample_c_d_cl, 0, 1>;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <mimgopc<0x2c>, AMDGPUSample_c_l>;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <mimgopc<0x2d>, AMDGPUSample_c_b>;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <mimgopc<0x2e>, AMDGPUSample_c_b_cl>;
+defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <mimgopc<0x2f>, AMDGPUSample_c_lz>;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <mimgopc<0x30>, AMDGPUSample_o>;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <mimgopc<0x31>, AMDGPUSample_cl_o>;
+defm IMAGE_SAMPLE_D_O : MIMG_Sampler <mimgopc<0x32>, AMDGPUSample_d_o>;
+defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <mimgopc<0x33>, AMDGPUSample_d_cl_o>;
+defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0xb2>, AMDGPUSample_d_o, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0xb3>, AMDGPUSample_d_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <mimgopc<0x34>, AMDGPUSample_l_o>;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <mimgopc<0x35>, AMDGPUSample_b_o>;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x36>, AMDGPUSample_b_cl_o>;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <mimgopc<0x37>, AMDGPUSample_lz_o>;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <mimgopc<0x38>, AMDGPUSample_c_o>;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <mimgopc<0x39>, AMDGPUSample_c_cl_o>;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <mimgopc<0x3a>, AMDGPUSample_c_d_o>;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <mimgopc<0x3b>, AMDGPUSample_c_d_cl_o>;
+defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0xba>, AMDGPUSample_c_d_o, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <mimgopc<0x3c>, AMDGPUSample_c_l_o>;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x3e>, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <mimgopc<0x3d>, AMDGPUSample_c_b_o>;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <mimgopc<0x3f>, AMDGPUSample_c_lz_o>;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <mimgopc<0x40>, AMDGPUSample>;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <mimgopc<0x41>, AMDGPUSample_cl>;
+defm IMAGE_GATHER4_L : MIMG_Gather <mimgopc<0x44>, AMDGPUSample_l>;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <mimgopc<0x45>, AMDGPUSample_b>;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <mimgopc<0x46>, AMDGPUSample_b_cl>;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <mimgopc<0x47>, AMDGPUSample_lz>;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <mimgopc<0x48>, AMDGPUSample_c>;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <mimgopc<0x49>, AMDGPUSample_c_cl>;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <mimgopc<0x4c>, AMDGPUSample_c_l>;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <mimgopc<0x4d>, AMDGPUSample_c_b>;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <mimgopc<0x4e>, AMDGPUSample_c_b_cl>;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <mimgopc<0x4f>, AMDGPUSample_c_lz>;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <mimgopc<0x50>, AMDGPUSample_o>;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <mimgopc<0x51>, AMDGPUSample_cl_o>;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <mimgopc<0x54>, AMDGPUSample_l_o>;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <mimgopc<0x55>, AMDGPUSample_b_o>;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <mimgopc<0x56>, AMDGPUSample_b_cl_o>;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <mimgopc<0x57>, AMDGPUSample_lz_o>;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <mimgopc<0x58>, AMDGPUSample_c_o>;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <mimgopc<0x59>, AMDGPUSample_c_cl_o>;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <mimgopc<0x5c>, AMDGPUSample_c_l_o>;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <mimgopc<0x5d>, AMDGPUSample_c_b_o>;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <mimgopc<0x5e>, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x5f>, AMDGPUSample_c_lz_o>;
+//defm IMAGE_GATHER4H : MIMG_Gather_WQM <mimgopc<0x61>, ?>;
-defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>;
-defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>;
-defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>;
-defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>;
-defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>;
-defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>;
-defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>;
-defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>;
-defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <0x000000e8, AMDGPUSample_cd, 0, 1>;
-defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <0x000000e9, AMDGPUSample_cd_cl, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <0x000000ea, AMDGPUSample_c_cd, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <0x000000eb, AMDGPUSample_c_cd_cl, 0, 1>;
-defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <0x000000ec, AMDGPUSample_cd_o, 0, 1>;
-defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <0x000000ed, AMDGPUSample_cd_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <0x000000ee, AMDGPUSample_c_cd_o, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <0x000000ef, AMDGPUSample_c_cd_cl_o, 0, 1>;
+defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">;
+
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<0x68>, AMDGPUSample_cd>;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<0x69>, AMDGPUSample_cd_cl>;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <mimgopc<0x6a>, AMDGPUSample_c_cd>;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <mimgopc<0x6b>, AMDGPUSample_c_cd_cl>;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <mimgopc<0x6c>, AMDGPUSample_cd_o>;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <mimgopc<0x6d>, AMDGPUSample_cd_cl_o>;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <mimgopc<0x6e>, AMDGPUSample_c_cd_o>;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<0x6f>, AMDGPUSample_c_cd_cl_o>;
+defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<0xe8>, AMDGPUSample_cd, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<0xe9>, AMDGPUSample_cd_cl, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<0xea>, AMDGPUSample_c_cd, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<0xeb>, AMDGPUSample_c_cd_cl, 0, 1>;
+defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<0xec>, AMDGPUSample_cd_o, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<0xed>, AMDGPUSample_cd_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<0xee>, AMDGPUSample_c_cd_o, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>;
+} // End OtherPredicates = [HasExtendedImageInsts]
//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
-let SubtargetPredicate = HasGFX10_BEncoding in
-defm IMAGE_MSAA_LOAD : MIMG_NoSampler <0x00000080, "image_msaa_load", 1>;
+let SubtargetPredicate = HasGFX10_AEncoding in
+defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<0x80>, "image_msaa_load", 1, 0, 0, 1>;
-defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<0xe6, "image_bvh_intersect_ray", 11, 0>;
-defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<0xe6, "image_bvh_intersect_ray", 8, 1>;
-defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<0xe7, "image_bvh64_intersect_ray", 12, 0>;
-defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<0xe7, "image_bvh64_intersect_ray", 9, 1>;
+defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 11, 0>;
+defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 8, 1>;
+defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 12, 0>;
+defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 9, 1>;
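The defm lines above now take a mimgopc<...> record instead of a bare immediate. The extra template arguments appear to carry one opcode per MIMG encoding generation, with MIMG.NOP marking an instruction that is not encodable on that generation (IMAGE_ATOMIC_RSUB, for example, keeps only its oldest opcode, matching the old "not on VI" comment). A minimal standalone C++ sketch of that scheme; the slot names are assumptions, since the hunk does not spell them out:

#include <cassert>
#include <cstdint>

// Stand-in for the TableGen mimgopc<...> record: one opcode slot per encoding
// family, with a NOP sentinel meaning "not encodable on that generation".
constexpr uint32_t NOP = 0xffffffff;

struct MIMGOpc {
  uint32_t NewEnc;  // assumed: GFX10-style encoding
  uint32_t MidEnc;  // assumed: VI-style encoding
  uint32_t OldEnc;  // assumed: SI/CI-style encoding
};

constexpr MIMGOpc AtomicSwap = {0x0f, 0x10, 0x0f};
constexpr MIMGOpc AtomicRsub = {NOP, NOP, 0x13}; // only the oldest encoding has it

bool isSupported(const MIMGOpc &Op, unsigned EncIdx) {
  const uint32_t Slots[3] = {Op.NewEnc, Op.MidEnc, Op.OldEnc};
  return Slots[EncIdx] != NOP;
}

int main() {
  assert(isSupported(AtomicSwap, 0) && isSupported(AtomicSwap, 2));
  assert(!isSupported(AtomicRsub, 0) && isSupported(AtomicRsub, 2));
}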
/********** ========================================= **********/
/********** Table of dimension-aware image intrinsics **********/
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index c012090..002ef18 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -451,9 +451,9 @@
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
- case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
+ case ISD::SHL_PARTS:
case ISD::SRA_PARTS:
- case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
+ case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
case ISD::FCOS:
@@ -765,78 +765,11 @@
DAG.getConstantFP(numbers::pif, DL, MVT::f32));
}
-SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
-
- SDValue Lo = Op.getOperand(0);
- SDValue Hi = Op.getOperand(1);
- SDValue Shift = Op.getOperand(2);
- SDValue Zero = DAG.getConstant(0, DL, VT);
- SDValue One = DAG.getConstant(1, DL, VT);
-
- SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
- SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
- SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
- SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
-
- // The dance around Width1 is necessary for 0 special case.
- // Without it the CompShift might be 32, producing incorrect results in
- // Overflow. So we do the shift in two steps, the alternative is to
- // add a conditional to filter the special case.
-
- SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
- Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
-
- SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
- HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
- SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
-
- SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
- SDValue LoBig = Zero;
-
- Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
- Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
-
- return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
-}
-
-SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
-
- SDValue Lo = Op.getOperand(0);
- SDValue Hi = Op.getOperand(1);
- SDValue Shift = Op.getOperand(2);
- SDValue Zero = DAG.getConstant(0, DL, VT);
- SDValue One = DAG.getConstant(1, DL, VT);
-
- const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
-
- SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
- SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
- SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
- SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
-
- // The dance around Width1 is necessary for 0 special case.
- // Without it the CompShift might be 32, producing incorrect results in
- // Overflow. So we do the shift in two steps, the alternative is to
- // add a conditional to filter the special case.
-
- SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
- Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
-
- SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
- SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
- LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
-
- SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
- SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
-
- Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
- Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
-
- return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
+SDValue R600TargetLowering::LowerShiftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Lo, Hi;
+ expandShiftParts(Op.getNode(), Lo, Hi, DAG);
+ return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
}
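The replacement delegates to the target-independent expandShiftParts, but the deleted code above, with its comment about the "dance around Width1", is the clearest description of what that expansion does. A self-contained sketch of the SHL case in plain C++, assuming 32-bit parts, showing why the overflow bits are shifted in two steps:

#include <cassert>
#include <cstdint>

// 64-bit left shift built from 32-bit halves, mirroring the lowering the
// deleted LowerSHLParts performed and expandShiftParts now provides.
// The overflow from Lo is shifted right by (31 - Amt) and then by 1 more,
// so the shift amount never reaches 32 even when Amt == 0; a single
// (Lo >> (32 - Amt)) would be undefined for that case.
void shlParts(uint32_t Lo, uint32_t Hi, unsigned Amt,
              uint32_t &OutLo, uint32_t &OutHi) {
  Amt &= 63;
  if (Amt < 32) {
    uint32_t Overflow = (Lo >> (31 - Amt)) >> 1; // the two-step "dance"
    OutHi = (Hi << Amt) | Overflow;
    OutLo = Lo << Amt;
  } else { // big shift: every result bit comes from Lo
    OutHi = Lo << (Amt - 32);
    OutLo = 0;
  }
}

int main() {
  uint32_t Lo, Hi;
  shlParts(0x80000001u, 0x00000000u, 1, Lo, Hi);
  assert(Lo == 0x00000002u && Hi == 0x00000001u);
  shlParts(0x00000001u, 0x00000000u, 33, Lo, Hi);
  assert(Lo == 0x00000000u && Hi == 0x00000002u);
  shlParts(0x12345678u, 0x9abcdef0u, 0, Lo, Hi); // Amt == 0 stays correct
  assert(Lo == 0x12345678u && Hi == 0x9abcdef0u);
}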
SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
@@ -1239,7 +1172,7 @@
Align Alignment = StoreNode->getAlign();
if (Alignment < MemVT.getStoreSize() &&
- !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
StoreNode->getMemOperand()->getFlags(),
nullptr)) {
return expandUnalignedStore(StoreNode, DAG);
@@ -1640,7 +1573,7 @@
}
bool R600TargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
bool *IsFast) const {
if (IsFast)
*IsFast = false;
@@ -1655,7 +1588,7 @@
if (IsFast)
*IsFast = true;
- return VT.bitsGT(MVT::i32) && Align % 4 == 0;
+ return VT.bitsGT(MVT::i32) && Alignment >= Align(4);
}
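The misaligned-access hook now takes an llvm::Align instead of a raw unsigned. Because alignments are always powers of two, the old divisibility test and the new ordering test agree; a quick standalone check of that equivalence:

#include <cassert>

// For power-of-two alignments, "divisible by 4" and "at least 4" coincide,
// which is why `Align % 4 == 0` can become `Alignment >= Align(4)` without
// changing the result.
int main() {
  for (unsigned A = 1; A <= 4096; A <<= 1) // 1, 2, 4, ..., 4096
    assert((A % 4 == 0) == (A >= 4));
}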
static SDValue CompactSwizzlableVector(
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/src/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index b560da8..920cf3c 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -50,10 +50,19 @@
const SelectionDAG &DAG) const override;
bool allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AS, unsigned Align,
+ EVT VT, unsigned AS, Align Alignment,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *IsFast = nullptr) const override;
+ virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT,
+ bool LegalOperations) const override {
+ // R600 has "custom" lowering for truncating stores despite not supporting
+ // those instructions. If we allow that custom lowering in the DAG combiner
+ // then all truncates are merged into truncating stores, giving worse code
+ // generation. This hook prevents the DAG combiner performing that combine.
+ return isTruncStoreLegal(ValVT, MemVT);
+ }
+
private:
unsigned Gen;
/// Each OpenCL kernel has nine implicit parameters that are stored in the
@@ -85,8 +94,7 @@
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
unsigned mainop, unsigned ovf) const;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
index 5fd912e..8f1a069 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
@@ -301,7 +301,8 @@
}
}
SmallVector<ReturnInst*, 8> Returns;
- CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns);
+ CloneFunctionInto(NewF, F, VMap, CloneFunctionChangeType::LocalChangesOnly,
+ Returns);
// Build new MDNode.
SmallVector<Metadata *, 6> KernelMDArgs;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
deleted file mode 100644
index 3b753cb..0000000
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Any MIMG instructions that use tfe or lwe require an initialization of the
-/// result register that will be written in the case of a memory access failure
-/// The required code is also added to tie this init code to the result of the
-/// img instruction
-///
-//===----------------------------------------------------------------------===//
-//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-
-#define DEBUG_TYPE "si-img-init"
-
-using namespace llvm;
-
-namespace {
-
-class SIAddIMGInit : public MachineFunctionPass {
-public:
- static char ID;
-
-public:
- SIAddIMGInit() : MachineFunctionPass(ID) {
- initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false)
-
-char SIAddIMGInit::ID = 0;
-
-char &llvm::SIAddIMGInitID = SIAddIMGInit::ID;
-
-FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); }
-
-bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo *RI = ST.getRegisterInfo();
- bool Changed = false;
-
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
- ++BI) {
- MachineBasicBlock &MBB = *BI;
- MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
- Next = std::next(I);
- MachineInstr &MI = *I;
-
- auto Opcode = MI.getOpcode();
- if (TII->isMIMG(Opcode) && !MI.mayStore()) {
- MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
- MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
- MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
-
- if (!TFE && !LWE) // intersect_ray
- continue;
-
- unsigned TFEVal = TFE->getImm();
- unsigned LWEVal = LWE->getImm();
- unsigned D16Val = D16 ? D16->getImm() : 0;
-
- if (TFEVal || LWEVal) {
- // At least one of TFE or LWE are non-zero
- // We have to insert a suitable initialization of the result value and
- // tie this to the dest of the image instruction.
-
- const DebugLoc &DL = MI.getDebugLoc();
-
- int DstIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
-
- // Calculate which dword we have to initialize to 0.
- MachineOperand *MO_Dmask =
- TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
-
- // check that dmask operand is found.
- assert(MO_Dmask && "Expected dmask operand in instruction");
-
- unsigned dmask = MO_Dmask->getImm();
- // Determine the number of active lanes taking into account the
- // Gather4 special case
- unsigned ActiveLanes =
- TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
-
- bool Packed = !ST.hasUnpackedD16VMem();
-
- unsigned InitIdx =
- D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
-
- // Abandon attempt if the dst size isn't large enough
- // - this is in fact an error but this is picked up elsewhere and
- // reported correctly.
- uint32_t DstSize =
- RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
- if (DstSize < InitIdx)
- continue;
-
- // Create a register for the intialization value.
- Register PrevDst =
- MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
- unsigned NewDst = 0; // Final initialized value will be in here
-
- // If PRTStrictNull feature is enabled (the default) then initialize
- // all the result registers to 0, otherwise just the error indication
- // register (VGPRn+1)
- unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
- unsigned CurrIdx = ST.usePRTStrictNull() ? 0 : (InitIdx - 1);
-
- if (DstSize == 1) {
- // In this case we can just initialize the result directly
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst)
- .addImm(0);
- NewDst = PrevDst;
- } else {
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
- for (; SizeLeft; SizeLeft--, CurrIdx++) {
- NewDst =
- MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
- // Initialize dword
- Register SubReg =
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
- .addImm(0);
- // Insert into the super-reg
- BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
- .addReg(PrevDst)
- .addReg(SubReg)
- .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
-
- PrevDst = NewDst;
- }
- }
-
- // Add as an implicit operand
- MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit);
-
- // Tie the just added implicit operand to the dst
- MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
-
- Changed = true;
- }
- }
- }
- }
-
- return Changed;
-}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 625749d..397b2f8 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -71,6 +71,8 @@
bool isElse(PHINode *Phi);
+ bool hasKill(const BasicBlock *BB);
+
void eraseIfUnused(PHINode *Phi);
void openIf(BranchInst *Term);
@@ -98,6 +100,7 @@
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<TargetPassConfig>();
FunctionPass::getAnalysisUsage(AU);
@@ -181,6 +184,15 @@
return true;
}
+bool SIAnnotateControlFlow::hasKill(const BasicBlock *BB) {
+ for (const Instruction &I : *BB) {
+ if (const CallInst *CI = dyn_cast<CallInst>(&I))
+ if (CI->getIntrinsicID() == Intrinsic::amdgcn_kill)
+ return true;
+ }
+ return false;
+}
+
// Erase "Phi" if it is not used any more
void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
if (RecursivelyDeleteDeadPHINode(Phi)) {
@@ -339,7 +351,7 @@
if (isTopOfStack(BB)) {
PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
- if (Phi && Phi->getParent() == BB && isElse(Phi)) {
+ if (Phi && Phi->getParent() == BB && isElse(Phi) && !hasKill(BB)) {
insertElse(Term);
eraseIfUnused(Phi);
continue;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h b/src/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h
index c83802b..d3c0d79 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -91,7 +91,7 @@
D16Buf = UINT64_C(1) << 50,
// FLAT instruction accesses FLAT_GLBL segment.
- IsFlatGlobal = UINT64_C(1) << 51,
+ FlatGlobal = UINT64_C(1) << 51,
// Uses floating point double precision rounding mode
FPDPRounding = UINT64_C(1) << 52,
@@ -106,7 +106,13 @@
IsDOT = UINT64_C(1) << 55,
// FLAT instruction accesses FLAT_SCRATCH segment.
- IsFlatScratch = UINT64_C(1) << 56
+ FlatScratch = UINT64_C(1) << 56,
+
+ // Atomic without return.
+ IsAtomicNoRet = UINT64_C(1) << 57,
+
+ // Atomic with return.
+ IsAtomicRet = UINT64_C(1) << 58
};
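These instruction properties live as single bits in a 64-bit TSFlags word; the renamed FlatGlobal/FlatScratch bits and the new IsAtomicNoRet/IsAtomicRet bits follow the same pattern. A minimal sketch of how such flags are defined and queried, with bit positions copied from this hunk and an enum/helper naming that is illustrative only:

#include <cassert>
#include <cstdint>

// Each property is one bit of a 64-bit TSFlags word, tested with a mask.
enum : uint64_t {
  FlatGlobal    = UINT64_C(1) << 51,
  FlatScratch   = UINT64_C(1) << 56,
  IsAtomicNoRet = UINT64_C(1) << 57,
  IsAtomicRet   = UINT64_C(1) << 58
};

static bool isAtomicRet(uint64_t TSFlags) { return TSFlags & IsAtomicRet; }

int main() {
  uint64_t TSFlags = FlatGlobal | IsAtomicRet;
  assert(isAtomicRet(TSFlags));
  assert(!(TSFlags & IsAtomicNoRet)); // an atomic is either ret or no-ret
}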
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -136,6 +142,8 @@
OPERAND_REG_IMM_FP16,
OPERAND_REG_IMM_V2FP16,
OPERAND_REG_IMM_V2INT16,
+ OPERAND_REG_IMM_V2INT32,
+ OPERAND_REG_IMM_V2FP32,
/// Operands with register or inline constant
OPERAND_REG_INLINE_C_INT16,
@@ -144,25 +152,30 @@
OPERAND_REG_INLINE_C_FP16,
OPERAND_REG_INLINE_C_FP32,
OPERAND_REG_INLINE_C_FP64,
- OPERAND_REG_INLINE_C_V2FP16,
OPERAND_REG_INLINE_C_V2INT16,
+ OPERAND_REG_INLINE_C_V2FP16,
+ OPERAND_REG_INLINE_C_V2INT32,
+ OPERAND_REG_INLINE_C_V2FP32,
/// Operands with an AccVGPR register or inline constant
OPERAND_REG_INLINE_AC_INT16,
OPERAND_REG_INLINE_AC_INT32,
OPERAND_REG_INLINE_AC_FP16,
OPERAND_REG_INLINE_AC_FP32,
- OPERAND_REG_INLINE_AC_V2FP16,
+ OPERAND_REG_INLINE_AC_FP64,
OPERAND_REG_INLINE_AC_V2INT16,
+ OPERAND_REG_INLINE_AC_V2FP16,
+ OPERAND_REG_INLINE_AC_V2INT32,
+ OPERAND_REG_INLINE_AC_V2FP32,
OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
- OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2INT16,
+ OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2FP32,
OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
- OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2INT16,
+ OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2FP32,
OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT16,
- OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2INT16,
+ OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2FP32,
OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
@@ -263,15 +276,33 @@
} // namespace AMDGPU
namespace AMDGPU {
+namespace CPol {
+
+enum CPol {
+ GLC = 1,
+ SLC = 2,
+ DLC = 4,
+ SCC = 16,
+ ALL = GLC | SLC | DLC | SCC
+};
+
+} // namespace CPol
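This CPol enum pairs with the TableGen change earlier in the patch, where the separate glc/slc/dlc fields of the MIMG classes collapse into a single cpol operand. A standalone sketch of treating the cache policy as one bitmask, using the values above; the helper is an assumption for illustration, not the in-tree API:

#include <cassert>

// Cache-policy bits from the enum above, combined into one operand value.
enum CPol : unsigned { GLC = 1, SLC = 2, DLC = 4, SCC = 16, ALL = GLC | SLC | DLC | SCC };

static unsigned encodeCPol(bool Glc, bool Slc, bool Dlc) {
  return (Glc ? GLC : 0) | (Slc ? SLC : 0) | (Dlc ? DLC : 0);
}

int main() {
  unsigned C = encodeCPol(/*Glc=*/true, /*Slc=*/false, /*Dlc=*/true);
  assert((C & GLC) && !(C & SLC) && (C & DLC));
  assert((C & ~ALL) == 0); // nothing outside the defined policy bits
}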
+
namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns.
enum Id { // Message ID, width(4) [3:0].
ID_UNKNOWN_ = -1,
ID_INTERRUPT = 1,
- ID_GS,
- ID_GS_DONE,
- ID_GS_ALLOC_REQ = 9,
- ID_GET_DOORBELL = 10,
+ ID_GS = 2,
+ ID_GS_DONE = 3,
+ ID_SAVEWAVE = 4, // added in GFX8
+ ID_STALL_WAVE_GEN = 5, // added in GFX9
+ ID_HALT_WAVES = 6, // added in GFX9
+ ID_ORDERED_PS_DONE = 7, // added in GFX9
+ ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10
+ ID_GS_ALLOC_REQ = 9, // added in GFX9
+ ID_GET_DOORBELL = 10, // added in GFX9
+ ID_GET_DDID = 11, // added in GFX10
ID_SYSMSG = 15,
ID_GAPS_LAST_, // Indicate that sequence has gaps.
ID_GAPS_FIRST_ = ID_INTERRUPT,
@@ -289,16 +320,16 @@
OP_MASK_ = (((1 << OP_WIDTH_) - 1) << OP_SHIFT_),
// GS operations are encoded in bits 5:4
OP_GS_NOP = 0,
- OP_GS_CUT,
- OP_GS_EMIT,
- OP_GS_EMIT_CUT,
+ OP_GS_CUT = 1,
+ OP_GS_EMIT = 2,
+ OP_GS_EMIT_CUT = 3,
OP_GS_LAST_,
OP_GS_FIRST_ = OP_GS_NOP,
// SYS operations are encoded in bits 6:4
OP_SYS_ECC_ERR_INTERRUPT = 1,
- OP_SYS_REG_RD,
- OP_SYS_HOST_TRAP_ACK,
- OP_SYS_TTRACE_PC,
+ OP_SYS_REG_RD = 2,
+ OP_SYS_HOST_TRAP_ACK = 3,
+ OP_SYS_TTRACE_PC = 4,
OP_SYS_LAST_,
OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT,
};
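The explicit numbering added here makes the SIMM16 layout easier to follow: per the comments above, the message ID sits in bits [3:0] and the GS op in bits [5:4]. A standalone sketch of that packing; the constants come from the enums in this hunk, while the encoder helper itself is an illustrative assumption:

#include <cassert>
#include <cstdint>

// Pack an s_sendmsg immediate: ID in [3:0], op starting at bit 4.
constexpr unsigned ID_GS = 2;
constexpr unsigned OP_GS_EMIT = 2;

constexpr uint16_t encodeSendMsg(unsigned Id, unsigned Op) {
  return static_cast<uint16_t>((Id & 0xf) | (Op << 4));
}

int main() {
  // sendmsg(MSG_GS, GS_OP_EMIT): ID 2 in [3:0], op 2 in [5:4] -> 0x22.
  assert(encodeSendMsg(ID_GS, OP_GS_EMIT) == 0x22);
}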
@@ -640,6 +671,7 @@
namespace DPP {
+// clang-format off
enum DppCtrl : unsigned {
QUAD_PERM_FIRST = 0,
QUAD_PERM_ID = 0xE4, // identity permutation
@@ -674,12 +706,17 @@
BCAST31 = 0x143,
DPP_UNUSED8_FIRST = 0x144,
DPP_UNUSED8_LAST = 0x14F,
+ ROW_NEWBCAST_FIRST= 0x150,
+ ROW_NEWBCAST_LAST = 0x15F,
+ ROW_SHARE0 = 0x150,
ROW_SHARE_FIRST = 0x150,
ROW_SHARE_LAST = 0x15F,
+ ROW_XMASK0 = 0x160,
ROW_XMASK_FIRST = 0x160,
ROW_XMASK_LAST = 0x16F,
DPP_LAST = ROW_XMASK_LAST
};
+// clang-format on
enum DppFiMode {
DPP_FI_0 = 0,
@@ -716,6 +753,17 @@
};
} // namespace Exp
+
+namespace VOP3PEncoding {
+
+enum OpSel : uint64_t {
+ OP_SEL_HI_0 = UINT64_C(1) << 59,
+ OP_SEL_HI_1 = UINT64_C(1) << 60,
+ OP_SEL_HI_2 = UINT64_C(1) << 14,
+};
+
+} // namespace VOP3PEncoding
+
} // namespace AMDGPU
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 34f59bf..d5c56bf 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -581,8 +581,9 @@
continue;
case AMDGPU::COPY:
case AMDGPU::WQM:
+ case AMDGPU::STRICT_WQM:
case AMDGPU::SOFT_WQM:
- case AMDGPU::WWM: {
+ case AMDGPU::STRICT_WWM: {
Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *SrcRC, *DstRC;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index d5fa9af..ad91052 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -90,6 +90,8 @@
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
+ bool tryFoldCndMask(MachineInstr &MI) const;
+ bool tryFoldZeroHighBits(MachineInstr &MI) const;
void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
const MachineOperand *isClamp(const MachineInstr &MI) const;
@@ -97,6 +99,9 @@
std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
bool tryFoldOMod(MachineInstr &MI);
+ bool tryFoldRegSequence(MachineInstr &MI);
+ bool tryFoldLCSSAPhi(MachineInstr &MI);
+ bool tryFoldLoad(MachineInstr &MI);
public:
SIFoldOperands() : MachineFunctionPass(ID) {
@@ -135,6 +140,8 @@
return AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
return AMDGPU::V_FMA_LEGACY_F32_e64;
+ case AMDGPU::V_FMAC_F64_e64:
+ return AMDGPU::V_FMA_F64_e64;
}
return AMDGPU::INSTRUCTION_LIST_END;
}
@@ -332,8 +339,8 @@
if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
return;
LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
- << " operand " << OpNo << "\n " << *MI << '\n');
- FoldList.push_back(FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
+ << " operand " << OpNo << "\n " << *MI);
+ FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
}
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
@@ -484,37 +491,37 @@
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
-// Find a def of the UseReg, check if it is a reg_seqence and find initializers
+// Find a def of the UseReg, check if it is a reg_sequence and find initializers
// for each subreg, tracking it to foldable inline immediate if possible.
// Returns true on success.
static bool getRegSeqInit(
SmallVectorImpl<std::pair<MachineOperand*, unsigned>> &Defs,
Register UseReg, uint8_t OpTy,
const SIInstrInfo *TII, const MachineRegisterInfo &MRI) {
- MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+ MachineInstr *Def = MRI.getVRegDef(UseReg);
if (!Def || !Def->isRegSequence())
return false;
for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
MachineOperand *Sub = &Def->getOperand(I);
- assert (Sub->isReg());
+ assert(Sub->isReg());
- for (MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub->getReg());
- SubDef && Sub->isReg() && !Sub->getSubReg() &&
- TII->isFoldableCopy(*SubDef);
- SubDef = MRI.getUniqueVRegDef(Sub->getReg())) {
+ for (MachineInstr *SubDef = MRI.getVRegDef(Sub->getReg());
+ SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
+ !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
+ SubDef = MRI.getVRegDef(Sub->getReg())) {
MachineOperand *Op = &SubDef->getOperand(1);
if (Op->isImm()) {
if (TII->isInlineConstant(*Op, OpTy))
Sub = Op;
break;
}
- if (!Op->isReg())
+ if (!Op->isReg() || Op->getReg().isPhysical())
break;
Sub = Op;
}
- Defs.push_back(std::make_pair(Sub, Def->getOperand(I + 1).getImm()));
+ Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
}
return true;
@@ -531,8 +538,10 @@
return false;
uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
- if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
- OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
+ if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
+ OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) &&
+ (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST ||
+ OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST))
return false;
if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
@@ -548,12 +557,23 @@
if (!UseReg.isVirtual())
return false;
- if (llvm::any_of(FoldList, [UseMI](const FoldCandidate &FC) {
- return FC.UseMI == UseMI;
- }))
+ if (isUseMIInFoldList(FoldList, UseMI))
return false;
MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
+
+ // Maybe it is just a COPY of an immediate itself.
+ MachineInstr *Def = MRI.getVRegDef(UseReg);
+ MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+ if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
+ MachineOperand &DefOp = Def->getOperand(1);
+ if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
+ TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
+ UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
+ return true;
+ }
+ }
+
SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
return false;
@@ -605,22 +625,17 @@
Register RegSeqDstReg = UseMI->getOperand(0).getReg();
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
- MachineRegisterInfo::use_nodbg_iterator Next;
- for (MachineRegisterInfo::use_nodbg_iterator
- RSUse = MRI->use_nodbg_begin(RegSeqDstReg), RSE = MRI->use_nodbg_end();
- RSUse != RSE; RSUse = Next) {
- Next = std::next(RSUse);
-
- MachineInstr *RSUseMI = RSUse->getParent();
+ for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
+ MachineInstr *RSUseMI = RSUse.getParent();
if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
- RSUse.getOperandNo(), FoldList))
+ RSUseMI->getOperandNo(&RSUse), FoldList))
continue;
- if (RSUse->getSubReg() != RegSeqDstSubReg)
+ if (RSUse.getSubReg() != RegSeqDstSubReg)
continue;
- foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
+ foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
CopiesToReplace);
}
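Several hunks in this file replace the manual `Next = std::next(Use)` bookkeeping with make_early_inc_range, which advances the iterator before the loop body runs so the body may rewrite or erase the current use. A plain-C++ sketch of the same early-increment idiom over a std::list:

#include <cassert>
#include <list>

// Early-increment iteration: grab the current element, advance the iterator,
// then operate on (and possibly erase) the element without invalidating the
// loop. This is the pattern llvm::make_early_inc_range wraps up.
int main() {
  std::list<int> Uses = {1, 2, 3, 4};
  for (auto It = Uses.begin(), E = Uses.end(); It != E;) {
    auto Cur = It++;      // advance first
    if (*Cur % 2 == 0)
      Uses.erase(Cur);    // safe: It already points past Cur
  }
  assert(Uses.size() == 2); // 1 and 3 remain
}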
@@ -680,19 +695,15 @@
const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
if (!DestReg.isPhysical()) {
if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
- MachineRegisterInfo::use_nodbg_iterator NextUse;
SmallVector<FoldCandidate, 4> CopyUses;
- for (MachineRegisterInfo::use_nodbg_iterator Use = MRI->use_nodbg_begin(DestReg),
- E = MRI->use_nodbg_end();
- Use != E; Use = NextUse) {
- NextUse = std::next(Use);
+ for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
// There's no point trying to fold into an implicit operand.
- if (Use->isImplicit())
+ if (Use.isImplicit())
continue;
- FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(),
- &UseMI->getOperand(1));
- CopyUses.push_back(FC);
+ CopyUses.emplace_back(Use.getParent(),
+ Use.getParent()->getOperandNo(&Use),
+ &UseMI->getOperand(1));
}
for (auto &F : CopyUses) {
foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
@@ -728,8 +739,7 @@
if (UseMI->isCopy() && OpToFold.isReg() &&
UseMI->getOperand(0).getReg().isVirtual() &&
!UseMI->getOperand(1).getSubReg()) {
- LLVM_DEBUG(dbgs() << "Folding " << OpToFold
- << "\n into " << *UseMI << '\n');
+ LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
unsigned Size = TII->getOpSize(*UseMI, 1);
Register UseReg = OpToFold.getReg();
UseMI->getOperand(1).setReg(UseReg);
@@ -813,7 +823,7 @@
B.addImm(Defs[I].second);
}
- LLVM_DEBUG(dbgs() << "Folded " << *UseMI << '\n');
+ LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
return;
}
@@ -825,6 +835,10 @@
else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
+ else if (ST->hasGFX90AInsts() &&
+ TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
return;
}
@@ -1033,14 +1047,19 @@
// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
-static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
- const SIInstrInfo *TII,
- MachineInstr *MI,
- MachineOperand *ImmOp) {
+static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII,
+ MachineInstr *MI) {
unsigned Opc = MI->getOpcode();
- if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
- Opc == AMDGPU::S_NOT_B32) {
- MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
+
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ if (Src0Idx == -1)
+ return false;
+ MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
+
+ if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
+ Opc == AMDGPU::S_NOT_B32) &&
+ Src0->isImm()) {
+ MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
return true;
}
@@ -1048,9 +1067,6 @@
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
return false;
-
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
if (!Src0->isImm() && !Src1->isImm())
@@ -1134,35 +1150,61 @@
}
// Try to fold an instruction into a simpler one
-static bool tryFoldInst(const SIInstrInfo *TII,
- MachineInstr *MI) {
- unsigned Opc = MI->getOpcode();
+bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
+ if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
+ Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
+ return false;
- if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
- Opc == AMDGPU::V_CNDMASK_B32_e64 ||
- Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
- const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
- const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
- int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
- int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
- if (Src1->isIdenticalTo(*Src0) &&
- (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) &&
- (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) {
- LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
- auto &NewDesc =
- TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
- int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
- if (Src2Idx != -1)
- MI->RemoveOperand(Src2Idx);
- MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
- if (Src1ModIdx != -1)
- MI->RemoveOperand(Src1ModIdx);
- if (Src0ModIdx != -1)
- MI->RemoveOperand(Src0ModIdx);
- mutateCopyOp(*MI, NewDesc);
- LLVM_DEBUG(dbgs() << *MI << '\n');
- return true;
- }
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (!Src1->isIdenticalTo(*Src0)) {
+ auto *Src0Imm = getImmOrMaterializedImm(*MRI, *Src0);
+ auto *Src1Imm = getImmOrMaterializedImm(*MRI, *Src1);
+ if (!Src1Imm->isIdenticalTo(*Src0Imm))
+ return false;
+ }
+
+ int Src1ModIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
+ int Src0ModIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
+ (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
+ auto &NewDesc =
+ TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (Src2Idx != -1)
+ MI.RemoveOperand(Src2Idx);
+ MI.RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
+ if (Src1ModIdx != -1)
+ MI.RemoveOperand(Src1ModIdx);
+ if (Src0ModIdx != -1)
+ MI.RemoveOperand(Src0ModIdx);
+ mutateCopyOp(MI, NewDesc);
+ LLVM_DEBUG(dbgs() << MI);
+ return true;
+}
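tryFoldCndMask above rewrites a cndmask whose two sources are identical, and carry no source modifiers, into a plain copy or mov. The justification is the usual select identity, sketched standalone below; the modifier check matters because a neg/abs modifier on one side would make the values differ even when the registers match:

#include <cassert>
#include <cstdint>

// select(cond, x, x) == x for every condition, so the condition operand (and
// the select itself) can be dropped in favour of a copy.
static uint32_t cndmask(bool Cond, uint32_t Src0, uint32_t Src1) {
  return Cond ? Src1 : Src0; // v_cndmask picks src1 when the mask bit is set
}

int main() {
  const uint32_t X = 0xdeadbeefu;
  assert(cndmask(false, X, X) == X);
  assert(cndmask(true, X, X) == X);
}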
+
+bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
+ if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_AND_B32_e32)
+ return false;
+
+ MachineOperand *Src0 = getImmOrMaterializedImm(*MRI, MI.getOperand(1));
+ if (!Src0->isImm() || Src0->getImm() != 0xffff)
+ return false;
+
+ Register Src1 = MI.getOperand(2).getReg();
+ MachineInstr *SrcDef = MRI->getVRegDef(Src1);
+ if (ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode())) {
+ Register Dst = MI.getOperand(0).getReg();
+ MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
+ MI.eraseFromParent();
+ return true;
}
return false;
@@ -1177,20 +1219,9 @@
SmallVector<FoldCandidate, 4> FoldList;
MachineOperand &Dst = MI.getOperand(0);
- bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
- if (FoldingImm) {
- unsigned NumLiteralUses = 0;
- MachineOperand *NonInlineUse = nullptr;
- int NonInlineUseOpNo = -1;
-
- MachineRegisterInfo::use_nodbg_iterator NextUse;
- for (MachineRegisterInfo::use_nodbg_iterator
- Use = MRI->use_nodbg_begin(Dst.getReg()), E = MRI->use_nodbg_end();
- Use != E; Use = NextUse) {
- NextUse = std::next(Use);
- MachineInstr *UseMI = Use->getParent();
- unsigned OpNo = Use.getOperandNo();
-
+ if (OpToFold.isImm()) {
+ for (auto &UseMI :
+ make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
// Folding the immediate may reveal operations that can be constant
// folded or replaced with a copy. This can happen for example after
// frame indices are lowered to constants or from splitting 64-bit
@@ -1199,18 +1230,21 @@
// We may also encounter cases where one or both operands are
// immediates materialized into a register, which would ordinarily not
// be folded due to multiple uses or operand constraints.
+ if (tryConstantFoldOp(*MRI, TII, &UseMI))
+ LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
+ }
+ }
- if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
- LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');
+ bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
+ if (FoldingImm) {
+ unsigned NumLiteralUses = 0;
+ MachineOperand *NonInlineUse = nullptr;
+ int NonInlineUseOpNo = -1;
- // Some constant folding cases change the same immediate's use to a new
- // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
- // again. The same constant folded instruction could also have a second
- // use operand.
- NextUse = MRI->use_nodbg_begin(Dst.getReg());
- FoldList.clear();
- continue;
- }
+ for (auto &Use :
+ make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) {
+ MachineInstr *UseMI = Use.getParent();
+ unsigned OpNo = UseMI->getOperandNo(&Use);
// Try to fold any inline immediate uses, and then only fold other
// constants if they have one use.
@@ -1230,11 +1264,10 @@
if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
} else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
- foldOperand(OpToFold, UseMI, OpNo, FoldList,
- CopiesToReplace);
+ foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
} else {
if (++NumLiteralUses == 1) {
- NonInlineUse = &*Use;
+ NonInlineUse = &Use;
NonInlineUseOpNo = OpNo;
}
}
@@ -1246,16 +1279,13 @@
}
} else {
// Folding register.
- SmallVector <MachineRegisterInfo::use_nodbg_iterator, 4> UsesToProcess;
- for (MachineRegisterInfo::use_nodbg_iterator
- Use = MRI->use_nodbg_begin(Dst.getReg()), E = MRI->use_nodbg_end();
- Use != E; ++Use) {
- UsesToProcess.push_back(Use);
- }
+ SmallVector <MachineOperand *, 4> UsesToProcess;
+ for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
+ UsesToProcess.push_back(&Use);
for (auto U : UsesToProcess) {
MachineInstr *UseMI = U->getParent();
- foldOperand(OpToFold, UseMI, U.getOperandNo(),
+ foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U),
FoldList, CopiesToReplace);
}
}
@@ -1265,11 +1295,8 @@
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(*MF);
- SmallPtrSet<MachineInstr *, 16> Folded;
for (FoldCandidate &Fold : FoldList) {
assert(!Fold.isReg() || Fold.OpToFold);
- if (Folded.count(Fold.UseMI))
- continue;
if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
Register Reg = Fold.OpToFold->getReg();
MachineInstr *DefMI = Fold.OpToFold->getParent();
@@ -1288,9 +1315,7 @@
}
LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
<< static_cast<int>(Fold.UseOpNo) << " of "
- << *Fold.UseMI << '\n');
- if (tryFoldInst(TII, Fold.UseMI))
- Folded.insert(Fold.UseMI);
+ << *Fold.UseMI);
} else if (Fold.isCommuted()) {
// Restoring instruction's original operand order if fold has failed.
TII->commuteInstruction(*Fold.UseMI, false);
@@ -1341,23 +1366,10 @@
}
}
-// We obviously have multiple uses in a clamp since the register is used twice
-// in the same instruction.
-static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
- int Count = 0;
- for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
- I != E; ++I) {
- if (++Count > 1)
- return false;
- }
-
- return true;
-}
-
// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
const MachineOperand *ClampSrc = isClamp(MI);
- if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
+ if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
return false;
MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
@@ -1370,8 +1382,7 @@
if (!DefClamp)
return false;
- LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
- << '\n');
+ LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
// Clamp is applied after omod, so it is OK if omod is set.
DefClamp->setImm(1);
@@ -1382,6 +1393,18 @@
static int getOModValue(unsigned Opc, int64_t Val) {
switch (Opc) {
+ case AMDGPU::V_MUL_F64_e64: {
+ switch (Val) {
+ case 0x3fe0000000000000: // 0.5
+ return SIOutMods::DIV2;
+ case 0x4000000000000000: // 2.0
+ return SIOutMods::MUL2;
+ case 0x4010000000000000: // 4.0
+ return SIOutMods::MUL4;
+ default:
+ return SIOutMods::NONE;
+ }
+ }
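The new V_MUL_F64 case matches raw 64-bit immediates; as the inline comments note, these are simply the IEEE-754 bit patterns of the three multipliers the output modifier can express (0.5, 2.0, 4.0), mirroring the 32-bit patterns handled just below. A standalone check of those constants:

#include <cassert>
#include <cstdint>
#include <cstring>

// The hex immediates above are the double bit patterns of the omod multipliers.
static uint64_t bitsOf(double D) {
  uint64_t U;
  std::memcpy(&U, &D, sizeof U);
  return U;
}

int main() {
  assert(bitsOf(0.5) == UINT64_C(0x3fe0000000000000)); // SIOutMods::DIV2
  assert(bitsOf(2.0) == UINT64_C(0x4000000000000000)); // SIOutMods::MUL2
  assert(bitsOf(4.0) == UINT64_C(0x4010000000000000)); // SIOutMods::MUL4
}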
case AMDGPU::V_MUL_F32_e64: {
switch (static_cast<uint32_t>(Val)) {
case 0x3f000000: // 0.5
@@ -1418,11 +1441,13 @@
SIFoldOperands::isOMod(const MachineInstr &MI) const {
unsigned Op = MI.getOpcode();
switch (Op) {
+ case AMDGPU::V_MUL_F64_e64:
case AMDGPU::V_MUL_F32_e64:
case AMDGPU::V_MUL_F16_e64: {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
- (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16OutputDenormals))
+ ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64) &&
+ MFI->getMode().FP64FP16OutputDenormals))
return std::make_pair(nullptr, SIOutMods::NONE);
const MachineOperand *RegOp = nullptr;
@@ -1448,11 +1473,13 @@
return std::make_pair(RegOp, OMod);
}
+ case AMDGPU::V_ADD_F64_e64:
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_ADD_F16_e64: {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
- (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16OutputDenormals))
+ ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64) &&
+ MFI->getMode().FP64FP16OutputDenormals))
return std::make_pair(nullptr, SIOutMods::NONE);
// Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
@@ -1481,7 +1508,7 @@
std::tie(RegOp, OMod) = isOMod(MI);
if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
RegOp->getSubReg() != AMDGPU::NoSubRegister ||
- !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
+ !MRI->hasOneNonDBGUser(RegOp->getReg()))
return false;
MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
@@ -1494,7 +1521,7 @@
if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
return false;
- LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
+ LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
DefOMod->setImm(OMod);
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
@@ -1502,6 +1529,198 @@
return true;
}
+// Try to fold a reg_sequence with vgpr output and agpr inputs into an
+// instruction which can take an agpr. So far that means a store.
+bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
+ assert(MI.isRegSequence());
+ auto Reg = MI.getOperand(0).getReg();
+
+ if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
+ !MRI->hasOneNonDBGUse(Reg))
+ return false;
+
+ SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
+ if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER, TII, *MRI))
+ return false;
+
+ for (auto &Def : Defs) {
+ const auto *Op = Def.first;
+ if (!Op->isReg())
+ return false;
+ if (TRI->isAGPR(*MRI, Op->getReg()))
+ continue;
+ // Maybe this is a COPY from AREG
+ const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
+ if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
+ return false;
+ if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
+ return false;
+ }
+
+ MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
+ MachineInstr *UseMI = Op->getParent();
+ while (UseMI->isCopy() && !Op->getSubReg()) {
+ Reg = UseMI->getOperand(0).getReg();
+ if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
+ return false;
+ Op = &*MRI->use_nodbg_begin(Reg);
+ UseMI = Op->getParent();
+ }
+
+ if (Op->getSubReg())
+ return false;
+
+ unsigned OpIdx = Op - &UseMI->getOperand(0);
+ const MCInstrDesc &InstDesc = UseMI->getDesc();
+ const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
+ switch (OpInfo.RegClass) {
+ case AMDGPU::AV_32RegClassID: LLVM_FALLTHROUGH;
+ case AMDGPU::AV_64RegClassID: LLVM_FALLTHROUGH;
+ case AMDGPU::AV_96RegClassID: LLVM_FALLTHROUGH;
+ case AMDGPU::AV_128RegClassID: LLVM_FALLTHROUGH;
+ case AMDGPU::AV_160RegClassID:
+ break;
+ default:
+ return false;
+ }
+
+ const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
+ auto Dst = MRI->createVirtualRegister(NewDstRC);
+ auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(AMDGPU::REG_SEQUENCE), Dst);
+
+ for (unsigned I = 0; I < Defs.size(); ++I) {
+ MachineOperand *Def = Defs[I].first;
+ Def->setIsKill(false);
+ if (TRI->isAGPR(*MRI, Def->getReg())) {
+ RS.add(*Def);
+ } else { // This is a copy
+ MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
+ SubDef->getOperand(1).setIsKill(false);
+ RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
+ }
+ RS.addImm(Defs[I].second);
+ }
+
+ Op->setReg(Dst);
+ if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
+ Op->setReg(Reg);
+ RS->eraseFromParent();
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
+
+ // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
+ // in which case we can erase them all later in runOnMachineFunction.
+ if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
+ MI.eraseFromParentAndMarkDBGValuesForRemoval();
+ return true;
+}
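
tryFoldRegSequence first proves every REG_SEQUENCE input is an AGPR (or a full copy from one), then follows a chain of single-use copies to the one real user before checking whether that user's operand class accepts an AGPR. A toy model of the chain walk, with made-up Node types in place of MachineIR:

#include <cassert>
#include <string>
#include <vector>

struct Node {
  std::string Kind;           // "REG_SEQUENCE", "COPY", "STORE", ...
  std::vector<Node *> Users;  // users of this node's single result
};

// Look through single-use COPY nodes, mirroring the hasOneNonDBGUse() checks
// in the pass, and return the eventual user (or nullptr if the chain fans out).
Node *findUserThroughCopies(Node *Def) {
  while (Def->Users.size() == 1 && Def->Users[0]->Kind == "COPY")
    Def = Def->Users[0];
  return Def->Users.size() == 1 ? Def->Users[0] : nullptr;
}

int main() {
  Node Store{"STORE", {}};
  Node Copy{"COPY", {&Store}};
  Node RegSeq{"REG_SEQUENCE", {&Copy}};
  assert(findUserThroughCopies(&RegSeq) == &Store);
}
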
+
+// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI.
+// This should allow folding of an AGPR into a consumer which may support it.
+// I.e.:
+//
+// loop: // loop:
+// %1:vreg = COPY %0:areg // exit:
+// exit: => // %1:areg = PHI %0:areg, %loop
+// %2:vreg = PHI %1:vreg, %loop // %2:vreg = COPY %1:areg
+bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {
+ assert(PHI.isPHI());
+
+ if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI
+ return false;
+
+ Register PhiIn = PHI.getOperand(1).getReg();
+ Register PhiOut = PHI.getOperand(0).getReg();
+ if (PHI.getOperand(1).getSubReg() ||
+ !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))
+ return false;
+
+ // A single use should not matter for correctness, but if the value has
+ // another use inside the loop we may end up performing the copy twice in
+ // the worst case.
+ if (!MRI->hasOneNonDBGUse(PhiIn))
+ return false;
+
+ MachineInstr *Copy = MRI->getVRegDef(PhiIn);
+ if (!Copy || !Copy->isCopy())
+ return false;
+
+ Register CopyIn = Copy->getOperand(1).getReg();
+ if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())
+ return false;
+
+ const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn);
+ Register NewReg = MRI->createVirtualRegister(ARC);
+ PHI.getOperand(1).setReg(CopyIn);
+ PHI.getOperand(0).setReg(NewReg);
+
+ MachineBasicBlock *MBB = PHI.getParent();
+ BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(),
+ TII->get(AMDGPU::COPY), PhiOut)
+ .addReg(NewReg, RegState::Kill);
+ Copy->eraseFromParent(); // We know this copy had a single use.
+
+ LLVM_DEBUG(dbgs() << "Folded " << PHI);
+
+ return true;
+}
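
The diagram above is the entire transformation: the PHI is retyped to carry the AGPR value, and the copy is re-emitted once right after the PHI in the exit block instead of executing inside the loop. A toy before/after model (not MachineIR) of that rewrite:

#include <cassert>

enum class RC { VGPR, AGPR };

struct LCSSAPhi {
  RC In, Out;
  bool CopyInLoopBody;  // %1:vreg = COPY %0:areg inside the loop
  bool CopyAfterPhi;    // %2:vreg = COPY %1:areg in the exit block
};

// Feed the AGPR straight through the PHI and move the copy past it.
void hoistCopyAcrossPhi(LCSSAPhi &P) {
  P.In = RC::AGPR;
  P.Out = RC::AGPR;
  P.CopyInLoopBody = false;
  P.CopyAfterPhi = true;
}

int main() {
  LCSSAPhi P{RC::VGPR, RC::VGPR, true, false};
  hoistCopyAcrossPhi(P);
  assert(P.Out == RC::AGPR && !P.CopyInLoopBody && P.CopyAfterPhi);
}
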
+
+// Attempt to convert VGPR load to an AGPR load.
+bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
+ assert(MI.mayLoad());
+ if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
+ return false;
+
+ MachineOperand &Def = MI.getOperand(0);
+ if (!Def.isDef())
+ return false;
+
+ Register DefReg = Def.getReg();
+
+ if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
+ return false;
+
+ SmallVector<const MachineInstr*, 8> Users;
+ SmallVector<Register, 8> MoveRegs;
+ for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg)) {
+ Users.push_back(&I);
+ }
+ if (Users.empty())
+ return false;
+
+ // Check that every user is a copy to an agpr or a reg_sequence producing an agpr.
+ while (!Users.empty()) {
+ const MachineInstr *I = Users.pop_back_val();
+ if (!I->isCopy() && !I->isRegSequence())
+ return false;
+ Register DstReg = I->getOperand(0).getReg();
+ if (TRI->isAGPR(*MRI, DstReg))
+ continue;
+ MoveRegs.push_back(DstReg);
+ for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg)) {
+ Users.push_back(&U);
+ }
+ }
+
+ const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+ MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
+ if (!TII->isOperandLegal(MI, 0, &Def)) {
+ MRI->setRegClass(DefReg, RC);
+ return false;
+ }
+
+ while (!MoveRegs.empty()) {
+ Register Reg = MoveRegs.pop_back_val();
+ MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
+ }
+
+ LLVM_DEBUG(dbgs() << "Folded " << MI);
+
+ return true;
+}
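
The legality walk in tryFoldLoad is a worklist over transitive users: every user must be a COPY or REG_SEQUENCE, and a branch is done as soon as it reaches an AGPR definition. A self-contained sketch of that check, with a toy Use type standing in for MachineInstr:

#include <cstdio>
#include <vector>

struct Use {
  bool IsCopyOrRegSequence;  // anything else is a real VGPR use: give up
  bool DefinesAGPR;          // destination register is already an AGPR
  std::vector<Use *> Users;  // users of that destination register
};

bool allUsersEndInAGPRs(const std::vector<Use *> &Roots) {
  std::vector<const Use *> Worklist(Roots.begin(), Roots.end());
  while (!Worklist.empty()) {
    const Use *U = Worklist.back();
    Worklist.pop_back();
    if (!U->IsCopyOrRegSequence)
      return false;              // the loaded value escapes as a VGPR
    if (U->DefinesAGPR)
      continue;                  // this branch already reaches an AGPR
    for (const Use *Next : U->Users)
      Worklist.push_back(Next);  // keep chasing through VGPR copies
  }
  return true;
}

int main() {
  Use AgprCopy{true, true, {}};
  Use VgprSeq{true, false, {&AgprCopy}};
  std::printf("%d\n", allUsersEndInAGPRs({&VgprSeq}));  // prints 1
}
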
+
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -1520,14 +1739,21 @@
bool HasNSZ = MFI->hasNoSignedZerosFPMath();
for (MachineBasicBlock *MBB : depth_first(&MF)) {
- MachineBasicBlock::iterator I, Next;
-
MachineOperand *CurrentKnownM0Val = nullptr;
- for (I = MBB->begin(); I != MBB->end(); I = Next) {
- Next = std::next(I);
- MachineInstr &MI = *I;
+ for (auto &MI : make_early_inc_range(*MBB)) {
+ tryFoldCndMask(MI);
- tryFoldInst(TII, &MI);
+ if (tryFoldZeroHighBits(MI))
+ continue;
+
+ if (MI.isRegSequence() && tryFoldRegSequence(MI))
+ continue;
+
+ if (MI.isPHI() && tryFoldLCSSAPhi(MI))
+ continue;
+
+ if (MI.mayLoad() && tryFoldLoad(MI))
+ continue;
if (!TII->isFoldableCopy(MI)) {
// Saw an unknown clobber of m0, so we no longer know what it is.
@@ -1575,11 +1801,31 @@
// %3 = COPY %vgpr0; VGPR_32:%3
// ...
// %vgpr0 = V_MOV_B32_e32 1, implicit %exec
- MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() && !Dst.getReg().isVirtual())
+ if (!MI.getOperand(0).getReg().isVirtual())
continue;
foldInstOperand(MI, OpToFold);
+
+ // If we managed to fold all uses of this copy then we might as well
+ // delete it now.
+ // The only reason we need to follow chains of copies here is that
+ // tryFoldRegSequence looks forward through copies before folding a
+ // REG_SEQUENCE into its eventual users.
+ auto *InstToErase = &MI;
+ while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
+ auto &SrcOp = InstToErase->getOperand(1);
+ auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
+ InstToErase->eraseFromParentAndMarkDBGValuesForRemoval();
+ InstToErase = nullptr;
+ if (!SrcReg || SrcReg.isPhysical())
+ break;
+ InstToErase = MRI->getVRegDef(SrcReg);
+ if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
+ break;
+ }
+ if (InstToErase && InstToErase->isRegSequence() &&
+ MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg()))
+ InstToErase->eraseFromParentAndMarkDBGValuesForRemoval();
}
}
return true;
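
The clean-up added after foldInstOperand erases a copy once all of its uses have been folded, then walks up through the copies that fed it, deleting any that just became dead. A toy model of that upward walk, with plain use counts standing in for MachineRegisterInfo's use lists:

#include <cstdio>

struct Inst {
  int UseCount = 0;        // remaining non-debug uses of the result
  Inst *SrcDef = nullptr;  // instruction defining the source operand
  bool Erased = false;
};

// Erase Head if its result is dead, then keep erasing up the copy chain while
// each source definition becomes dead in turn.
void eraseDeadCopyChain(Inst *Head) {
  while (Head && Head->UseCount == 0) {
    Head->Erased = true;
    Inst *Src = Head->SrcDef;
    if (Src)
      --Src->UseCount;  // Head no longer uses Src's result
    Head = Src;
  }
}

int main() {
  Inst A, B, C;    // A <- B <- C, e.g. a REG_SEQUENCE feeding two copies
  B.SrcDef = &A;
  C.SrcDef = &B;
  A.UseCount = 1;  // used only by B
  B.UseCount = 1;  // used only by C
  C.UseCount = 0;  // all uses of C were folded away
  eraseDeadCopyChain(&C);
  std::printf("%d %d %d\n", A.Erased, B.Erased, C.Erased);  // prints "1 1 1"
}
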
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index a12e013..80ee7a0 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -6,10 +6,11 @@
//
//===----------------------------------------------------------------------===//
//
-/// \file
-/// This pass creates bundles of SMEM and VMEM instructions forming memory
-/// clauses if XNACK is enabled. Def operands of clauses are marked as early
-/// clobber to make sure we will not override any source within a clause.
+/// \file This pass extends the live ranges of registers used as pointers in
+/// sequences of adjacent SMEM and VMEM instructions if XNACK is enabled. A
+/// load that would overwrite a pointer would require breaking the soft clause.
+/// Artificially extend the live ranges of the pointer operands by adding
+/// implicit-def early-clobber operands throughout the soft clause.
///
//===----------------------------------------------------------------------===//
@@ -59,10 +60,8 @@
}
private:
- template <typename Callable>
- void forAllLanes(Register Reg, LaneBitmask LaneMask, Callable Func) const;
-
- bool canBundle(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
+ bool canBundle(const MachineInstr &MI, const RegUse &Defs,
+ const RegUse &Uses) const;
bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT);
void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses,
@@ -106,12 +105,12 @@
// There is no sense in creating store clauses: they do not define anything,
// thus there is nothing to set early-clobber.
static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
- if (MI.isDebugValue() || MI.isBundled())
+ assert(!MI.isDebugInstr() && "debug instructions should not reach here");
+ if (MI.isBundled())
return false;
if (!MI.mayLoad() || MI.mayStore())
return false;
- if (AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1 ||
- AMDGPU::getAtomicRetOp(MI.getOpcode()) != -1)
+ if (SIInstrInfo::isAtomic(MI))
return false;
if (IsVMEMClause && !isVMEMClauseInst(MI))
return false;
@@ -148,63 +147,10 @@
return S;
}
-template <typename Callable>
-void SIFormMemoryClauses::forAllLanes(Register Reg, LaneBitmask LaneMask,
- Callable Func) const {
- if (LaneMask.all() || Reg.isPhysical() ||
- LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) {
- Func(0);
- return;
- }
-
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- unsigned E = TRI->getNumSubRegIndices();
- SmallVector<unsigned, AMDGPU::NUM_TARGET_SUBREGS> CoveringSubregs;
- for (unsigned Idx = 1; Idx < E; ++Idx) {
- // Is this index even compatible with the given class?
- if (TRI->getSubClassWithSubReg(RC, Idx) != RC)
- continue;
- LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
- // Early exit if we found a perfect match.
- if (SubRegMask == LaneMask) {
- Func(Idx);
- return;
- }
-
- if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
- continue;
-
- CoveringSubregs.push_back(Idx);
- }
-
- llvm::sort(CoveringSubregs, [this](unsigned A, unsigned B) {
- LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A);
- LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B);
- unsigned NA = MaskA.getNumLanes();
- unsigned NB = MaskB.getNumLanes();
- if (NA != NB)
- return NA > NB;
- return MaskA.getHighestLane() > MaskB.getHighestLane();
- });
-
- for (unsigned Idx : CoveringSubregs) {
- LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
- if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
- continue;
-
- Func(Idx);
- LaneMask &= ~SubRegMask;
- if (LaneMask.none())
- return;
- }
-
- llvm_unreachable("Failed to find all subregs to cover lane mask");
-}
-
// Returns false if there is a use of a def already in the map.
// In this case we must break the clause.
-bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
- RegUse &Defs, RegUse &Uses) const {
+bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, const RegUse &Defs,
+ const RegUse &Uses) const {
// Check interference with defs.
for (const MachineOperand &MO : MI.operands()) {
// TODO: Prologue/Epilogue Insertion pass does not process bundled
@@ -221,7 +167,7 @@
if (MO.isTied())
return false;
- RegUse &Map = MO.isDef() ? Uses : Defs;
+ const RegUse &Map = MO.isDef() ? Uses : Defs;
auto Conflict = Map.find(Reg);
if (Conflict == Map.end())
continue;
@@ -249,9 +195,19 @@
RPT.advanceToNext();
GCNRegPressure MaxPressure = RPT.moveMaxPressure();
unsigned Occupancy = MaxPressure.getOccupancy(*ST);
+
+ // Don't push over half the register budget. We don't want to introduce
+ // spilling just to form a soft clause.
+ //
+ // FIXME: This pressure check is fundamentally broken. First, this is checking
+ // the global pressure, not the pressure at this specific point in the
+ // program. Second, it's not accounting for the increased liveness of the use
+ // operands due to the early clobber we will introduce. Third, the pressure
+ // tracking does not account for the alignment requirements for SGPRs, or the
+ // fragmentation of registers the allocator will need to satisfy.
if (Occupancy >= MFI->getMinAllowedOccupancy() &&
- MaxPressure.getVGPRNum() <= MaxVGPRs &&
- MaxPressure.getSGPRNum() <= MaxSGPRs) {
+ MaxPressure.getVGPRNum(ST->hasGFX90AInsts()) <= MaxVGPRs / 2 &&
+ MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
LastRecordedOccupancy = Occupancy;
return true;
}
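
With the /2 budgets above, a clause is only grown when its maximum pressure stays within half of each register file in addition to meeting the minimum occupancy. A small numeric sketch of that decision; the limits below are placeholders, not real subtarget values:

#include <cstdio>

struct MaxPressure { unsigned VGPRs, SGPRs, Occupancy; };

bool clauseFitsBudget(const MaxPressure &P, unsigned MaxVGPRs,
                      unsigned MaxSGPRs, unsigned MinAllowedOccupancy) {
  // Mirror the diff: keep the minimum occupancy and stay at or below half of
  // the VGPR/SGPR budgets so the clause does not introduce spilling.
  return P.Occupancy >= MinAllowedOccupancy &&
         P.VGPRs <= MaxVGPRs / 2 && P.SGPRs <= MaxSGPRs / 2;
}

int main() {
  MaxPressure P{/*VGPRs=*/100, /*SGPRs=*/40, /*Occupancy=*/8};
  std::printf("%d\n", clauseFitsBudget(P, /*MaxVGPRs=*/256, /*MaxSGPRs=*/102,
                                       /*MinAllowedOccupancy=*/4));  // prints 1
}
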
@@ -328,6 +284,9 @@
MachineInstr &MI = *I;
Next = std::next(I);
+ if (MI.isMetaInstruction())
+ continue;
+
bool IsVMEM = isVMEMClauseInst(MI);
if (!isValidClauseInst(MI, IsVMEM))
@@ -347,8 +306,13 @@
continue;
}
+ MachineBasicBlock::iterator LastClauseInst = Next;
unsigned Length = 1;
for ( ; Next != E && Length < FuncMaxClause; ++Next) {
+ // Debug instructions should not change the kill insertion.
+ if (Next->isMetaInstruction())
+ continue;
+
if (!isValidClauseInst(*Next, IsVMEM))
break;
@@ -358,6 +322,7 @@
if (!processRegUses(*Next, Defs, Uses, RPT))
break;
+ LastClauseInst = Next;
++Length;
}
if (Length < 2) {
@@ -368,36 +333,74 @@
Changed = true;
MFI->limitOccupancy(LastRecordedOccupancy);
- auto B = BuildMI(MBB, I, DebugLoc(), TII->get(TargetOpcode::BUNDLE));
- Ind->insertMachineInstrInMaps(*B);
+ assert(!LastClauseInst->isMetaInstruction());
- // Restore the state after processing the bundle.
- RPT.reset(*B, &LiveRegsCopy);
+ SlotIndex ClauseLiveInIdx = LIS->getInstructionIndex(MI);
+ SlotIndex ClauseLiveOutIdx =
+ LIS->getInstructionIndex(*LastClauseInst).getNextIndex();
- for (auto BI = I; BI != Next; ++BI) {
- BI->bundleWithPred();
- Ind->removeSingleMachineInstrFromMaps(*BI);
+ // Track the last inserted kill.
+ MachineInstrBuilder Kill;
- for (MachineOperand &MO : BI->defs())
- if (MO.readsReg())
- MO.setIsInternalRead(true);
- }
-
- for (auto &&R : Defs) {
- forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
- unsigned S = R.second.first | RegState::EarlyClobber;
- if (!SubReg)
- S &= ~(RegState::Undef | RegState::Dead);
- B.addDef(R.first, S, SubReg);
- });
- }
-
+ // Insert one kill per register, with operands covering all necessary
+ // subregisters.
for (auto &&R : Uses) {
- forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
- B.addUse(R.first, R.second.first & ~RegState::Kill, SubReg);
- });
+ Register Reg = R.first;
+ if (Reg.isPhysical())
+ continue;
+
+ // Collect the register operands we should extend the live ranges of.
+ SmallVector<std::tuple<unsigned, unsigned>> KillOps;
+ const LiveInterval &LI = LIS->getInterval(R.first);
+
+ if (!LI.hasSubRanges()) {
+ if (!LI.liveAt(ClauseLiveOutIdx)) {
+ KillOps.emplace_back(R.second.first | RegState::Kill,
+ AMDGPU::NoSubRegister);
+ }
+ } else {
+ LaneBitmask KilledMask;
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if (SR.liveAt(ClauseLiveInIdx) && !SR.liveAt(ClauseLiveOutIdx))
+ KilledMask |= SR.LaneMask;
+ }
+
+ if (KilledMask.none())
+ continue;
+
+ SmallVector<unsigned> KilledIndexes;
+ bool Success = TRI->getCoveringSubRegIndexes(
+ *MRI, MRI->getRegClass(Reg), KilledMask, KilledIndexes);
+ (void)Success;
+ assert(Success && "Failed to find subregister mask to cover lanes");
+ for (unsigned SubReg : KilledIndexes) {
+ KillOps.emplace_back(R.second.first | RegState::Kill, SubReg);
+ }
+ }
+
+ if (KillOps.empty())
+ continue;
+
+ // We only want to extend the live ranges of used registers. If they
+ // already have existing uses beyond the bundle, we don't need the kill.
+ //
+ // It's possible all of the use registers were already live past the
+ // bundle.
+ Kill = BuildMI(*MI.getParent(), std::next(LastClauseInst),
+ DebugLoc(), TII->get(AMDGPU::KILL));
+ for (auto &Op : KillOps)
+ Kill.addUse(Reg, std::get<0>(Op), std::get<1>(Op));
+ Ind->insertMachineInstrInMaps(*Kill);
}
+ if (!Kill) {
+ RPT.reset(MI, &LiveRegsCopy);
+ continue;
+ }
+
+ // Restore the state after processing the end of the bundle.
+ RPT.reset(*Kill, &LiveRegsCopy);
+
for (auto &&R : Defs) {
Register Reg = R.first;
Uses.erase(Reg);
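
The subrange scan above only treats lanes as killed when they are live at the first clause instruction but dead past the last one, and a covering set of subregister indexes is then derived from that mask. A minimal sketch of the mask computation, with a plain 64-bit integer standing in for LaneBitmask:

#include <cstdint>
#include <cstdio>
#include <vector>

struct SubRange {
  uint64_t LaneMask;     // lanes covered by this subrange
  bool LiveAtClauseIn;   // live at the first clause instruction
  bool LiveAtClauseOut;  // live just past the last clause instruction
};

uint64_t killedLanes(const std::vector<SubRange> &Subranges) {
  uint64_t Killed = 0;
  for (const SubRange &SR : Subranges)
    if (SR.LiveAtClauseIn && !SR.LiveAtClauseOut)
      Killed |= SR.LaneMask;  // these lanes end inside the clause
  return Killed;
}

int main() {
  // The first subrange dies inside the clause, the second stays live after it.
  std::vector<SubRange> SRs = {{0x3, true, false}, {0xC, true, true}};
  std::printf("killed mask: 0x%llx\n",
              (unsigned long long)killedLanes(SRs));  // prints 0x3
}
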
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0398d27..c9883d3 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -20,18 +20,16 @@
#define DEBUG_TYPE "frame-info"
+static cl::opt<bool> EnableSpillVGPRToAGPR(
+ "amdgpu-spill-vgpr-to-agpr",
+ cl::desc("Enable spilling VGPRs to AGPRs"),
+ cl::ReallyHidden,
+ cl::init(true));
-// Find a scratch register that we can use at the start of the prologue to
-// re-align the stack pointer. We avoid using callee-save registers since they
-// may appear to be free when this is called from canUseAsPrologue (during
-// shrink wrapping), but then no longer be free when this is called from
-// emitPrologue.
-//
-// FIXME: This is a bit conservative, since in the above case we could use one
-// of the callee-save registers as a scratch temp to re-align the stack pointer,
-// but we would then have to make sure that we were in fact saving at least one
-// callee-save register in the prologue, which is additional complexity that
-// doesn't seem worth the benefit.
+// Find a scratch register that we can use in the prologue. We avoid using
+// callee-save registers since they may appear to be free when this is called
+// from canUseAsPrologue (during shrink wrapping), but then no longer be free
+// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
LivePhysRegs &LiveRegs,
const TargetRegisterClass &RC,
@@ -55,12 +53,6 @@
}
}
- // If we require an unused register, this is used in contexts where failure is
- // an option and has an alternative plan. In other contexts, this must
- // succeed0.
- if (!Unused)
- report_fatal_error("failed to find free scratch register");
-
return MCRegister();
}
@@ -72,10 +64,8 @@
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-#ifndef NDEBUG
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
-#endif
// We need to save and restore the current FP/BP.
@@ -105,7 +95,7 @@
int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
TargetStackID::SGPRSpill);
- if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
+ if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
// 3: There's no free lane to spill, and no free register to save FP/BP,
// so we're forced to spill another VGPR to use for the spill.
FrameIndex = NewFI;
@@ -131,166 +121,45 @@
// We need to specially emit stack operations here because a different frame
// register is used than in the rest of the function, as getFrameRegister would
// use.
-static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
+static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
+ const SIMachineFunctionInfo &FuncInfo,
+ LivePhysRegs &LiveRegs, MachineFunction &MF,
MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const SIInstrInfo *TII, Register SpillReg,
- Register ScratchRsrcReg, Register SPReg, int FI) {
- MachineFunction *MF = MBB.getParent();
- MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineBasicBlock::iterator I, Register SpillReg,
+ int FI) {
+ unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+ : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
- int64_t Offset = MFI.getObjectOffset(FI);
-
- MachineMemOperand *MMO = MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
- MFI.getObjectAlign(FI));
-
- if (ST.enableFlatScratch()) {
- if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR))
- .addReg(SpillReg, RegState::Kill)
- .addReg(SPReg)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // dlc
- .addMemOperand(MMO);
- return;
- }
- } else if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
- .addReg(SpillReg, RegState::Kill)
- .addReg(ScratchRsrcReg)
- .addReg(SPReg)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addImm(0) // dlc
- .addImm(0) // swz
- .addMemOperand(MMO);
- return;
- }
-
- // Don't clobber the TmpVGPR if we also need a scratch reg for the stack
- // offset in the spill.
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
+ FrameInfo.getObjectAlign(FI));
LiveRegs.addReg(SpillReg);
-
- if (ST.enableFlatScratch()) {
- MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
- MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass);
-
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(SPReg)
- .addImm(Offset);
-
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR))
- .addReg(SpillReg, RegState::Kill)
- .addReg(OffsetReg, RegState::Kill)
- .addImm(0)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // dlc
- .addMemOperand(MMO);
- } else {
- MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
- MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
-
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
- .addImm(Offset);
-
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
- .addReg(SpillReg, RegState::Kill)
- .addReg(OffsetReg, RegState::Kill)
- .addReg(ScratchRsrcReg)
- .addReg(SPReg)
- .addImm(0)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addImm(0) // dlc
- .addImm(0) // swz
- .addMemOperand(MMO);
- }
-
+ TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, true,
+ FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
+ &LiveRegs);
LiveRegs.removeReg(SpillReg);
}
-static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const SIInstrInfo *TII, Register SpillReg,
- Register ScratchRsrcReg, Register SPReg, int FI) {
- MachineFunction *MF = MBB.getParent();
- MachineFrameInfo &MFI = MF->getFrameInfo();
- int64_t Offset = MFI.getObjectOffset(FI);
+static void buildEpilogRestore(const GCNSubtarget &ST,
+ const SIRegisterInfo &TRI,
+ const SIMachineFunctionInfo &FuncInfo,
+ LivePhysRegs &LiveRegs, MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, Register SpillReg,
+ int FI) {
+ unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+ : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
- MachineMemOperand *MMO = MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
- MFI.getObjectAlign(FI));
-
- if (ST.enableFlatScratch()) {
- if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
- BuildMI(MBB, I, DebugLoc(),
- TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR), SpillReg)
- .addReg(SPReg)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // dlc
- .addMemOperand(MMO);
- return;
- }
- MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
- MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass);
-
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(SPReg)
- .addImm(Offset);
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR),
- SpillReg)
- .addReg(OffsetReg, RegState::Kill)
- .addImm(0)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // dlc
- .addMemOperand(MMO);
- return;
- }
-
- if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
- BuildMI(MBB, I, DebugLoc(),
- TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
- .addReg(ScratchRsrcReg)
- .addReg(SPReg)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addImm(0) // dlc
- .addImm(0) // swz
- .addMemOperand(MMO);
- return;
- }
-
- MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
- MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
-
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
- .addImm(Offset);
-
- BuildMI(MBB, I, DebugLoc(),
- TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
- .addReg(OffsetReg, RegState::Kill)
- .addReg(ScratchRsrcReg)
- .addReg(SPReg)
- .addImm(0)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addImm(0) // dlc
- .addImm(0) // swz
- .addMemOperand(MMO);
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
+ FrameInfo.getObjectAlign(FI));
+ TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, false,
+ FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
+ &LiveRegs);
}
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
@@ -384,8 +253,7 @@
BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
.addReg(FlatScrInit)
.addImm(EncodedOffset) // offset
- .addImm(0) // glc
- .addImm(0) // dlc
+ .addImm(0) // cpol
.addMemOperand(MMO);
// Mask the offset in [47:0] of the descriptor
@@ -445,9 +313,9 @@
// Add wave offset in bytes to private base offset.
// See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
- .addReg(FlatScrInitLo)
- .addReg(ScratchWaveOffsetReg);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
+ .addReg(FlatScrInitLo)
+ .addReg(ScratchWaveOffsetReg);
// Convert offset to 256-byte units.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
@@ -545,6 +413,7 @@
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
assert(MFI->isEntryFunction());
@@ -622,7 +491,7 @@
Register SPReg = MFI->getStackPtrOffsetReg();
assert(SPReg != AMDGPU::SP_REG);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
- .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST));
+ .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
}
if (hasFP(MF)) {
@@ -631,12 +500,18 @@
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}
- if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
+ bool NeedsFlatScratchInit =
+ MFI->hasFlatScratchInit() &&
+ (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
+ (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
+
+ if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
+ !ST.flatScratchIsArchitected()) {
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
- if (MFI->hasFlatScratchInit()) {
+ if (NeedsFlatScratchInit) {
emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
}
@@ -663,6 +538,7 @@
// The pointer to the GIT is formed from the offset passed in and either
// the amdgpu-git-ptr-high function attribute or the top part of the PC
Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+ Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
buildGitPtr(MBB, I, DL, TII, Rsrc01);
@@ -681,10 +557,23 @@
BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
.addReg(Rsrc01)
.addImm(EncodedOffset) // offset
- .addImm(0) // glc
- .addImm(0) // dlc
+ .addImm(0) // cpol
.addReg(ScratchRsrcReg, RegState::ImplicitDefine)
.addMemOperand(MMO);
+
+ // The driver will always set the SRD for wave 64 (bits 118:117 of
+ // descriptor / bits 22:21 of third sub-reg will be 0b11)
+ // If the shader is actually wave32 we have to modify the const_index_stride
+ // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
+ // reason the driver does this is that there can be cases where it presents
+ // 2 shaders with different wave size (e.g. VsFs).
+ // TODO: convert to using SCRATCH instructions or multiple SRD buffers
+ if (ST.isWave32()) {
+ const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
+ BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
+ .addImm(21)
+ .addReg(Rsrc03);
+ }
} else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@@ -716,8 +605,7 @@
BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
.addReg(MFI->getImplicitBufferPtrUserSGPR())
.addImm(0) // offset
- .addImm(0) // glc
- .addImm(0) // dlc
+ .addImm(0) // cpol
.addMemOperand(MMO)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
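
The S_BITSET0_B32 emitted above clears bit 21 of the descriptor's third dword, turning the driver's wave64 const_index_stride encoding 0b11 into 0b10 (stride 32). A standalone sketch of the same bit manipulation on a plain uint32_t; every descriptor bit other than 22:21 below is made up:

#include <cstdint>
#include <cstdio>

// Bits 118:117 of the 128-bit SRD are bits 22:21 of its third dword.
uint32_t makeStrideWave32(uint32_t rsrc3) {
  return rsrc3 & ~(1u << 21);  // S_BITSET0_B32 rsrc3, 21: 0b11 -> 0b10
}

int main() {
  uint32_t rsrc3 = (0x3u << 21) | 0x1234;  // driver default: wave64 stride
  uint32_t fixed = makeStrideWave32(rsrc3);
  std::printf("const_index_stride field: %u -> %u\n",
              (rsrc3 >> 21) & 3u, (fixed >> 21) & 3u);  // prints "3 -> 2"
}
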
@@ -785,11 +673,28 @@
case TargetStackID::SGPRSpill:
return true;
case TargetStackID::ScalableVector:
+ case TargetStackID::WasmLocal:
return false;
}
llvm_unreachable("Invalid TargetStackID::Value");
}
+static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
+ const SIMachineFunctionInfo *FuncInfo,
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, bool IsProlog) {
+ if (LiveRegs.empty()) {
+ LiveRegs.init(TRI);
+ if (IsProlog) {
+ LiveRegs.addLiveIns(MBB);
+ } else {
+ // In epilog.
+ LiveRegs.addLiveOuts(MBB);
+ LiveRegs.stepBackward(*MBBI);
+ }
+ }
+}
+
// Activate all lanes, returns saved exec.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
MachineFunction &MF,
@@ -804,28 +709,14 @@
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
DebugLoc DL;
- if (LiveRegs.empty()) {
- if (IsProlog) {
- LiveRegs.init(TRI);
- LiveRegs.addLiveIns(MBB);
- if (FuncInfo->SGPRForFPSaveRestoreCopy)
- LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
-
- if (FuncInfo->SGPRForBPSaveRestoreCopy)
- LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
- } else {
- // In epilog.
- LiveRegs.init(*ST.getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- LiveRegs.stepBackward(*MBBI);
- }
- }
+ initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
ScratchExecCopy = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+ if (!ScratchExecCopy)
+ report_fatal_error("failed to find free scratch register");
- if (!IsProlog)
- LiveRegs.removeReg(ScratchExecCopy);
+ LiveRegs.addReg(ScratchExecCopy);
const unsigned OrSaveExec =
ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
@@ -834,6 +725,13 @@
return ScratchExecCopy;
}
+// A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
+// Otherwise we are spilling to memory.
+static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill;
+}
+
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -865,29 +763,122 @@
// turn on all lanes before doing the spill to memory.
Register ScratchExecCopy;
- bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
- bool SpillFPToMemory = false;
- // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
- // Otherwise we are spilling the FP to memory.
- if (HasFPSaveIndex) {
- SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
- TargetStackID::SGPRSpill;
+ Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex;
+ Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex;
+
+ // VGPRs used for SGPR->VGPR spills
+ for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
+ FuncInfo->getSGPRSpillVGPRs()) {
+ if (!Reg.FI)
+ continue;
+
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI,
+ /*IsProlog*/ true);
+
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR,
+ *Reg.FI);
}
- bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
- bool SpillBPToMemory = false;
- // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
- // Otherwise we are spilling the BP to memory.
- if (HasBPSaveIndex) {
- SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
- TargetStackID::SGPRSpill;
+ // VGPRs used for Whole Wave Mode
+ for (const auto &Reg : FuncInfo->WWMReservedRegs) {
+ auto VGPR = Reg.first;
+ auto FI = Reg.second;
+ if (!FI)
+ continue;
+
+ if (!ScratchExecCopy)
+ ScratchExecCopy =
+ buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
+
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI);
+ }
+
+ if (ScratchExecCopy) {
+ // FIXME: Split block and make terminator.
+ unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+ .addReg(ScratchExecCopy, RegState::Kill);
+ LiveRegs.addReg(ScratchExecCopy);
+ }
+
+ if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) {
+ const int FramePtrFI = *FPSaveIndex;
+ assert(!MFI.isDeadObjectIndex(FramePtrFI));
+
+ initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
+
+ MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ if (!TmpVGPR)
+ report_fatal_error("failed to find free scratch register");
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
+ .addReg(FramePtrReg);
+
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR,
+ FramePtrFI);
+ }
+
+ if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) {
+ const int BasePtrFI = *BPSaveIndex;
+ assert(!MFI.isDeadObjectIndex(BasePtrFI));
+
+ initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
+
+ MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ if (!TmpVGPR)
+ report_fatal_error("failed to find free scratch register");
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
+ .addReg(BasePtrReg);
+
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR,
+ BasePtrFI);
+ }
+
+ // In this case, spill the FP to a reserved VGPR.
+ if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) {
+ const int FramePtrFI = *FPSaveIndex;
+ assert(!MFI.isDeadObjectIndex(FramePtrFI));
+
+ assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(FramePtrFI);
+ assert(Spill.size() == 1);
+
+ // Save FP before setting it up.
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
+ .addReg(FramePtrReg)
+ .addImm(Spill[0].Lane)
+ .addReg(Spill[0].VGPR, RegState::Undef);
+ }
+
+ // In this case, spill the BP to a reserved VGPR.
+ if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) {
+ const int BasePtrFI = *BPSaveIndex;
+ assert(!MFI.isDeadObjectIndex(BasePtrFI));
+
+ assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
+ assert(Spill.size() == 1);
+
+ // Save BP before setting it up.
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
+ .addReg(BasePtrReg)
+ .addImm(Spill[0].Lane)
+ .addReg(Spill[0].VGPR, RegState::Undef);
}
// Emit the copy if we need an FP, and are using a free SGPR to save it.
if (FuncInfo->SGPRForFPSaveRestoreCopy) {
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
- .addReg(FramePtrReg)
- .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
+ FuncInfo->SGPRForFPSaveRestoreCopy)
+ .addReg(FramePtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
}
// Emit the copy if we need a BP, and are using a free SGPR to save it.
@@ -914,102 +905,13 @@
MBB.sortUniqueLiveIns();
}
+ if (!LiveRegs.empty()) {
+ LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
+ }
}
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
- : FuncInfo->getSGPRSpillVGPRs()) {
- if (!Reg.FI.hasValue())
- continue;
-
- if (!ScratchExecCopy)
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
-
- buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR,
- FuncInfo->getScratchRSrcReg(),
- StackPtrReg,
- Reg.FI.getValue());
- }
-
- if (HasFPSaveIndex && SpillFPToMemory) {
- assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));
-
- if (!ScratchExecCopy)
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
-
- MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
-
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
- .addReg(FramePtrReg);
-
- buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR,
- FuncInfo->getScratchRSrcReg(), StackPtrReg,
- FuncInfo->FramePointerSaveIndex.getValue());
- }
-
- if (HasBPSaveIndex && SpillBPToMemory) {
- assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));
-
- if (!ScratchExecCopy)
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
-
- MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
-
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
- .addReg(BasePtrReg);
-
- buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR,
- FuncInfo->getScratchRSrcReg(), StackPtrReg,
- *FuncInfo->BasePointerSaveIndex);
- }
-
- if (ScratchExecCopy) {
- // FIXME: Split block and make terminator.
- unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
- .addReg(ScratchExecCopy, RegState::Kill);
- LiveRegs.addReg(ScratchExecCopy);
- }
-
- // In this case, spill the FP to a reserved VGPR.
- if (HasFPSaveIndex && !SpillFPToMemory) {
- const int FI = FuncInfo->FramePointerSaveIndex.getValue();
- assert(!MFI.isDeadObjectIndex(FI));
-
- assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
- FuncInfo->getSGPRToVGPRSpills(FI);
- assert(Spill.size() == 1);
-
- // Save FP before setting it up.
- // FIXME: This should respect spillSGPRToVGPR;
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
- .addReg(FramePtrReg)
- .addImm(Spill[0].Lane)
- .addReg(Spill[0].VGPR, RegState::Undef);
- }
-
- // In this case, spill the BP to a reserved VGPR.
- if (HasBPSaveIndex && !SpillBPToMemory) {
- const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
- assert(!MFI.isDeadObjectIndex(BasePtrFI));
-
- assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
- FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
- assert(Spill.size() == 1);
-
- // Save BP before setting it up.
- // FIXME: This should respect spillSGPRToVGPR;
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
- .addReg(BasePtrReg)
- .addImm(Spill[0].Lane)
- .addReg(Spill[0].VGPR, RegState::Undef);
- }
-
- if (TRI.needsStackRealignment(MF)) {
+ if (TRI.hasStackRealignment(MF)) {
HasFP = true;
const unsigned Alignment = MFI.getMaxAlign().value();
@@ -1017,23 +919,16 @@
if (LiveRegs.empty()) {
LiveRegs.init(TRI);
LiveRegs.addLiveIns(MBB);
- LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
- LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
}
- Register ScratchSPReg = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
- assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
- ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);
-
- // s_add_u32 tmp_reg, s32, NumBytes
- // s_and_b32 s32, tmp_reg, 0b111...0000
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
+ // s_add_i32 s33, s32, NumBytes
+ // s_and_b32 s33, s33, 0b111...0000
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
.addReg(StackPtrReg)
.addImm((Alignment - 1) * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
- .addReg(ScratchSPReg, RegState::Kill)
+ .addReg(FramePtrReg, RegState::Kill)
.addImm(-Alignment * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
FuncInfo->setIsStackRealigned(true);
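
The S_ADD_I32/S_AND_B32 pair above is the usual align-up idiom applied to the scaled stack pointer, now written straight into the frame pointer rather than a scavenged temporary. A small arithmetic sketch with made-up values:

#include <cassert>
#include <cstdint>

uint32_t alignScaledSP(uint32_t SP, uint32_t Alignment, uint32_t Scale) {
  // s_add_i32 s33, s32, (Alignment - 1) * Scale
  uint32_t T = SP + (Alignment - 1) * Scale;
  // s_and_b32 s33, s33, -(Alignment * Scale)   (a power of two)
  return T & ~(Alignment * Scale - 1);
}

int main() {
  // Example values only: a 16-byte alignment request with a scale factor of 4.
  assert(alignScaledSP(/*SP=*/100, /*Alignment=*/16, /*Scale=*/4) == 128);
}
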
@@ -1054,7 +949,7 @@
}
if (HasFP && RoundedSize != 0) {
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
.addReg(StackPtrReg)
.addImm(RoundedSize * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
@@ -1101,58 +996,47 @@
const Register BasePtrReg =
TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
- bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
- bool SpillFPToMemory = false;
- if (HasFPSaveIndex) {
- SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
- TargetStackID::SGPRSpill;
- }
-
- bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
- bool SpillBPToMemory = false;
- if (HasBPSaveIndex) {
- SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
- TargetStackID::SGPRSpill;
- }
+ Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex;
+ Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex;
if (RoundedSize != 0 && hasFP(MF)) {
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
- .addReg(StackPtrReg)
- .addImm(RoundedSize * getScratchScaleFactor(ST))
- .setMIFlag(MachineInstr::FrameDestroy);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
+ .addReg(StackPtrReg)
+ .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
+ .setMIFlag(MachineInstr::FrameDestroy);
}
if (FuncInfo->SGPRForFPSaveRestoreCopy) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
.addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
- .setMIFlag(MachineInstr::FrameSetup);
+ .setMIFlag(MachineInstr::FrameDestroy);
}
if (FuncInfo->SGPRForBPSaveRestoreCopy) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
.addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
- .setMIFlag(MachineInstr::FrameSetup);
+ .setMIFlag(MachineInstr::FrameDestroy);
}
- Register ScratchExecCopy;
- if (HasFPSaveIndex) {
- const int FI = FuncInfo->FramePointerSaveIndex.getValue();
- assert(!MFI.isDeadObjectIndex(FI));
- if (SpillFPToMemory) {
- if (!ScratchExecCopy)
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+ if (FPSaveIndex) {
+ const int FramePtrFI = *FPSaveIndex;
+ assert(!MFI.isDeadObjectIndex(FramePtrFI));
+ if (spilledToMemory(MF, FramePtrFI)) {
+ initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
- MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+ MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
- buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR,
- FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
+ if (!TmpVGPR)
+ report_fatal_error("failed to find free scratch register");
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR,
+ FramePtrFI);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
- .addReg(TempVGPR, RegState::Kill);
+ .addReg(TmpVGPR, RegState::Kill);
} else {
// Reload from VGPR spill.
- assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill);
ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
- FuncInfo->getSGPRToVGPRSpills(FI);
+ FuncInfo->getSGPRToVGPRSpills(FramePtrFI);
assert(Spill.size() == 1);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), FramePtrReg)
.addReg(Spill[0].VGPR)
@@ -1160,19 +1044,20 @@
}
}
- if (HasBPSaveIndex) {
- const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+ if (BPSaveIndex) {
+ const int BasePtrFI = *BPSaveIndex;
assert(!MFI.isDeadObjectIndex(BasePtrFI));
- if (SpillBPToMemory) {
- if (!ScratchExecCopy)
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+ if (spilledToMemory(MF, BasePtrFI)) {
+ initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
- MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+ MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
- buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR,
- FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
+ if (!TmpVGPR)
+ report_fatal_error("failed to find free scratch register");
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR,
+ BasePtrFI);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
- .addReg(TempVGPR, RegState::Kill);
+ .addReg(TmpVGPR, RegState::Kill);
} else {
// Reload from VGPR spill.
assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
@@ -1185,17 +1070,31 @@
}
}
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
+ Register ScratchExecCopy;
+ for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
FuncInfo->getSGPRSpillVGPRs()) {
- if (!Reg.FI.hasValue())
+ if (!Reg.FI)
continue;
if (!ScratchExecCopy)
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+ ScratchExecCopy =
+ buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
- buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR,
- FuncInfo->getScratchRSrcReg(), StackPtrReg,
- Reg.FI.getValue());
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR,
+ *Reg.FI);
+ }
+
+ for (const auto &Reg : FuncInfo->WWMReservedRegs) {
+ auto VGPR = Reg.first;
+ auto FI = Reg.second;
+ if (!FI)
+ continue;
+
+ if (!ScratchExecCopy)
+ ScratchExecCopy =
+ buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
+
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI);
}
if (ScratchExecCopy) {
@@ -1240,9 +1139,73 @@
MachineFrameInfo &MFI = MF.getFrameInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
+ && EnableSpillVGPRToAGPR;
+
+ if (SpillVGPRToAGPR) {
+ // To track the spill frame indices handled in this pass.
+ BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
+
+ bool SeenDbgInstr = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator Next;
+ for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
+ MachineInstr &MI = *I;
+ Next = std::next(I);
+
+ if (MI.isDebugInstr())
+ SeenDbgInstr = true;
+
+ if (TII->isVGPRSpill(MI)) {
+ // Try to eliminate stack used by VGPR spills before frame
+ // finalization.
+ unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vaddr);
+ int FI = MI.getOperand(FIOp).getIndex();
+ Register VReg =
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
+ if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
+ TRI->isAGPR(MRI, VReg))) {
+ // FIXME: change to enterBasicBlockEnd()
+ RS->enterBasicBlock(MBB);
+ TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
+ SpillFIs.set(FI);
+ continue;
+ }
+ }
+ }
+ }
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
+ MBB.addLiveIn(Reg);
+
+ for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
+ MBB.addLiveIn(Reg);
+
+ MBB.sortUniqueLiveIns();
+
+ if (!SpillFIs.empty() && SeenDbgInstr) {
+ // FIXME: The dead frame indices are replaced with a null register from
+ // the debug value instructions. We should instead update it with the
+ // correct register value, but it is not clear that the register value
+ // alone is adequate to lower the debug expression; that is left for later.
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
+ SpillFIs[MI.getOperand(0).getIndex()]) {
+ MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
+ MI.getOperand(0).setIsDebug();
+ }
+ }
+ }
+ }
+ }
+
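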
FuncInfo->removeDeadFrameIndices(MFI);
assert(allSGPRSpillsAreDead(MF) &&
"SGPR spill should have been removed in SILowerSGPRSpills");
@@ -1253,16 +1216,8 @@
if (!allStackObjectsAreDead(MFI)) {
assert(RS && "RegScavenger required if spilling");
- if (FuncInfo->isEntryFunction()) {
- int ScavengeFI = MFI.CreateFixedObject(
- TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
- RS->addScavengingFrameIndex(ScavengeFI);
- } else {
- int ScavengeFI = MFI.CreateStackObject(
- TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
- TRI->getSpillAlign(AMDGPU::SGPR_32RegClass), false);
- RS->addScavengingFrameIndex(ScavengeFI);
- }
+ // Add an emergency spill slot
+ RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));
}
}
@@ -1280,7 +1235,13 @@
const SIRegisterInfo *TRI = ST.getRegisterInfo();
// Ignore the SGPRs the default implementation found.
- SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
+ SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
+
+ // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
+ // In gfx908 there are no AGPR loads and stores, so spilling an AGPR also
+ // requires a temporary VGPR.
+ if (!ST.hasGFX90AInsts())
+ SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
// hasFP only knows about stack objects that already exist. We're now
// determining the stack slots that will be created, so we have to predict
@@ -1335,7 +1296,7 @@
SavedRegs.reset(MFI->getStackPtrOffsetReg());
const BitVector AllSavedRegs = SavedRegs;
- SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
+ SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
// If clearing VGPRs changed the mask, we will have some CSR VGPR spills.
const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs;
@@ -1409,10 +1370,12 @@
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Register SPReg = MFI->getStackPtrOffsetReg();
- unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
- BuildMI(MBB, I, DL, TII->get(Op), SPReg)
- .addReg(SPReg)
- .addImm(Amount * getScratchScaleFactor(ST));
+ Amount *= getScratchScaleFactor(ST);
+ if (IsDestroy)
+ Amount = -Amount;
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
+ .addReg(SPReg)
+ .addImm(Amount);
} else if (CalleePopAmount != 0) {
llvm_unreachable("is this used?");
}
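
The hunk above collapses the allocate/deallocate paths into a single signed add: the scaled amount is negated on the frame-destroy path. A trivial sketch of that arithmetic:

#include <cstdint>
#include <cstdio>

int64_t adjustSP(int64_t SP, int64_t Amount, int64_t Scale, bool IsDestroy) {
  Amount *= Scale;
  if (IsDestroy)
    Amount = -Amount;
  return SP + Amount;  // s_add_i32 SPReg, SPReg, Amount
}

int main() {
  int64_t SP = 256;
  SP = adjustSP(SP, /*Amount=*/8, /*Scale=*/64, /*IsDestroy=*/false);  // allocate
  SP = adjustSP(SP, /*Amount=*/8, /*Scale=*/64, /*IsDestroy=*/true);   // free
  std::printf("%lld\n", (long long)SP);  // prints 256
}
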
@@ -1450,8 +1413,9 @@
}
return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
- MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
- MF.getTarget().Options.DisableFramePointerElim(MF);
+ MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
+ MF) ||
+ MF.getTarget().Options.DisableFramePointerElim(MF);
}
// This is essentially a reduced version of hasFP for entry functions. Since the
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 839437b..d98acfc 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,11 +19,13 @@
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Support/CommandLine.h"
@@ -80,36 +82,49 @@
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
- addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+
+ const SIRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
+
+ addRegisterClass(MVT::f64, V64RegClass);
+ addRegisterClass(MVT::v2f32, V64RegClass);
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
- addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+ addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
- addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+ addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
- addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+ addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
+
+ addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
+ addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
+
+ addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
+ addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
+
+ addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
+ addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
- addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
if (Subtarget->has16BitInsts()) {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
@@ -123,7 +138,7 @@
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
- addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -139,6 +154,8 @@
setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v6i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v7i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
setOperationAction(ISD::LOAD, MVT::i1, Custom);
@@ -148,6 +165,8 @@
setOperationAction(ISD::STORE, MVT::v3i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::v5i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v6i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v7i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -170,6 +189,8 @@
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
@@ -197,8 +218,16 @@
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand);
@@ -239,6 +268,7 @@
// with > 4 elements.
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+ MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
@@ -249,10 +279,10 @@
case ISD::BITCAST:
case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:
- case ISD::INSERT_SUBVECTOR:
case ISD::EXTRACT_SUBVECTOR:
case ISD::SCALAR_TO_VECTOR:
break;
+ case ISD::INSERT_SUBVECTOR:
case ISD::CONCAT_VECTORS:
setOperationAction(Op, VT, Custom);
break;
@@ -284,6 +314,20 @@
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
}
+ for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
+ }
+
for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
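The v3i64/v3f64 promotions above route the new 192-bit vector types through their v6i32 form. A minimal standalone sketch of that reinterpretation (plain C++, hypothetical helper name, little-endian dword layout assumed):

#include <array>
#include <cstdint>
#include <cstring>

// Each 64-bit element contributes two dwords, so <3 x i64> maps onto <6 x i32>.
static std::array<uint32_t, 6> asDwords(const std::array<uint64_t, 3> &V) {
  std::array<uint32_t, 6> D;
  std::memcpy(D.data(), V.data(), sizeof(D));
  return D;
}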
@@ -336,17 +380,14 @@
// Avoid stack access for these.
// TODO: Generalize to more vector types.
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
-
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
+
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
-
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
@@ -362,9 +403,13 @@
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
- // Deal with vec5 vector operations when widened to vec8.
+ // Deal with vec5/6/7 vector operations when widened to vec8.
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
@@ -384,6 +429,7 @@
}
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
// FIXME: This should be narrowed to i32, but that only happens if i64 is
// illegal.
@@ -525,8 +571,8 @@
setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
- setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom);
// F16 - Constant Actions.
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
@@ -718,6 +764,19 @@
setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
+
+ if (Subtarget->hasPackedFP32Ops()) {
+ setOperationAction(ISD::FADD, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f32, Legal);
+
+ for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) {
+ setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::FMUL, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Custom);
+ }
+ }
}
setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
@@ -1128,17 +1187,6 @@
MachineMemOperand::MOVolatile;
return true;
}
- case Intrinsic::amdgcn_global_atomic_fadd: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
- Info.ptrVal = CI.getOperand(0);
- Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile;
- return true;
- }
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1150,6 +1198,22 @@
MachineMemOperand::MODereferenceable;
return true;
}
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ case Intrinsic::amdgcn_global_atomic_fmin:
+ case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOVolatile;
+ return true;
+ }
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1191,6 +1255,9 @@
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_global_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_csub: {
Value *Ptr = II->getArgOperand(0);
AccessTy = II->getType();
@@ -1210,9 +1277,9 @@
}
return AM.Scale == 0 &&
- (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
- AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS,
- /*Signed=*/false));
+ (AM.BaseOffs == 0 ||
+ Subtarget->getInstrInfo()->isLegalFLATOffset(
+ AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT));
}
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
@@ -1220,7 +1287,7 @@
return AM.Scale == 0 &&
(AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS,
- /*Signed=*/true));
+ SIInstrFlags::FlatGlobal));
if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
// Assume the we will use FLAT for all global memory accesses
@@ -1385,10 +1452,15 @@
return true;
}
+ // Either the alignment requirements are "enabled", or there is an unaligned
+ // LDS access related hardware bug even though the alignment requirements are
+ // "disabled". In either case, we need to check for proper alignment
+ // requirements.
+ //
if (Size == 64) {
- // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
- // aligned, 8 byte access in a single operation using ds_read2/write2_b32
- // with adjacent offsets.
+ // An 8-byte access via ds_read/write_b64 requires 8-byte alignment, but we
+ // can do a 4 byte aligned, 8 byte access in a single operation using
+ // ds_read2/write2_b32 with adjacent offsets.
bool AlignedBy4 = Alignment >= Align(4);
if (IsFast)
*IsFast = AlignedBy4;
@@ -1396,22 +1468,23 @@
return AlignedBy4;
}
if (Size == 96) {
- // ds_read/write_b96 require 16-byte alignment on gfx8 and older.
- bool Aligned = Alignment >= Align(16);
+ // A 12-byte access via ds_read/write_b96 requires 16-byte alignment on
+ // gfx8 and older.
+ bool AlignedBy16 = Alignment >= Align(16);
if (IsFast)
- *IsFast = Aligned;
+ *IsFast = AlignedBy16;
- return Aligned;
+ return AlignedBy16;
}
if (Size == 128) {
- // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we
- // can do a 8 byte aligned, 16 byte access in a single operation using
- // ds_read2/write2_b64.
- bool Aligned = Alignment >= Align(8);
+ // A 16-byte access via ds_read/write_b128 requires 16-byte alignment on
+ // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
+ // single operation using ds_read2/write2_b64.
+ bool AlignedBy8 = Alignment >= Align(8);
if (IsFast)
- *IsFast = Aligned;
+ *IsFast = AlignedBy8;
- return Aligned;
+ return AlignedBy8;
}
}
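As a rough illustration of the ds_read2/write2_b32 trick mentioned in the Size == 64 case above: a 4-byte aligned, 8-byte LDS access at byte offset Off can be split into two dword slots with adjacent offsets. Minimal sketch, assuming dword-scaled 8-bit offset fields (hypothetical helper, not the code the pass emits):

#include <cstdint>
#include <utility>

// Returns the {offset0, offset1} pair for a ds_read2/write2_b32 covering the
// 8 bytes at byte offset Off (Off must be a multiple of 4 and small enough to
// fit the 8-bit, dword-scaled offset fields).
static std::pair<uint8_t, uint8_t> adjacentDwordOffsets(uint32_t Off) {
  uint8_t Off0 = static_cast<uint8_t>(Off / 4);
  return {Off0, static_cast<uint8_t>(Off0 + 1)};
}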
@@ -1467,8 +1540,8 @@
}
bool SITargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace, unsigned Alignment,
- MachineMemOperand::Flags Flags, bool *IsFast) const {
+ EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
+ bool *IsFast) const {
if (IsFast)
*IsFast = false;
@@ -1482,7 +1555,7 @@
}
return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
- Align(Alignment), Flags, IsFast);
+ Alignment, Flags, IsFast);
}
EVT SITargetLowering::getOptimalMemOpType(
@@ -1535,8 +1608,8 @@
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(MVT VT) const {
- int NumElts = VT.getVectorNumElements();
- if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16))
+ if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
+ VT.getScalarType().bitsLE(MVT::i16))
return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
@@ -1799,23 +1872,37 @@
MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
+ unsigned Mask = (Subtarget->hasPackedTID() &&
+ Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
+ Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
}
if (Info.hasWorkItemIDY()) {
- Register Reg = AMDGPU::VGPR1;
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+ assert(Info.hasWorkItemIDX());
+ if (Subtarget->hasPackedTID()) {
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
+ 0x3ff << 10));
+ } else {
+ unsigned Reg = AMDGPU::VGPR1;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
- CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
+ CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
+ }
}
if (Info.hasWorkItemIDZ()) {
- Register Reg = AMDGPU::VGPR2;
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+ assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
+ if (Subtarget->hasPackedTID()) {
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
+ 0x3ff << 20));
+ } else {
+ unsigned Reg = AMDGPU::VGPR2;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
- CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
+ CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
+ }
}
}
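The packed-TID path above pulls all three workitem IDs out of a single VGPR0 using 10-bit fields, as the 0x3ff masks at bit offsets 0, 10 and 20 suggest. A minimal sketch of that decoding, assuming exactly those field positions (hypothetical helper):

#include <cstdint>

struct WorkItemIds { uint32_t X, Y, Z; };

// Decode a packed thread-ID word into its three 10-bit components.
static WorkItemIds decodePackedTid(uint32_t Packed) {
  return {Packed & 0x3ffu, (Packed >> 10) & 0x3ffu, (Packed >> 20) & 0x3ffu};
}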
@@ -1865,12 +1952,32 @@
return ArgDescriptor::createRegister(Reg);
}
-static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
- return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
+// If this has a fixed position, we still should allocate the register in the
+// CCInfo state. Technically we could get away with this for values passed
+// outside of the normal argument range.
+static void allocateFixedSGPRInputImpl(CCState &CCInfo,
+ const TargetRegisterClass *RC,
+ MCRegister Reg) {
+ Reg = CCInfo.AllocateReg(Reg);
+ assert(Reg != AMDGPU::NoRegister);
+ MachineFunction &MF = CCInfo.getMachineFunction();
+ MF.addLiveIn(Reg, RC);
}
-static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
- return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
+static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
+ if (Arg) {
+ allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
+ Arg.getRegister());
+ } else
+ Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
+}
+
+static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
+ if (Arg) {
+ allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
+ Arg.getRegister());
+ } else
+ Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
/// Allocate implicit function VGPR arguments at the end of allocated user
@@ -1919,29 +2026,29 @@
// TODO: Unify handling with private memory pointers.
if (Info.hasDispatchPtr())
- ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
+ allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
if (Info.hasQueuePtr())
- ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
+ allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
// constant offset from the kernarg segment.
if (Info.hasImplicitArgPtr())
- ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
+ allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
if (Info.hasDispatchID())
- ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
+ allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
// flat_scratch_init is not applicable for non-kernel functions.
if (Info.hasWorkGroupIDX())
- ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
+ allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
if (Info.hasWorkGroupIDY())
- ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
+ allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
if (Info.hasWorkGroupIDZ())
- ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
+ allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
}
// Allocate special inputs passed in user SGPRs.
@@ -2203,6 +2310,8 @@
return DAG.getEntryNode();
}
+ Info->allocateModuleLDSGlobal(Fn.getParent());
+
SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;
BitVector Skipped(Ins.size());
@@ -2767,6 +2876,7 @@
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
case CallingConv::C:
+ case CallingConv::AMDGPU_Gfx:
return true;
default:
return canGuaranteeTCO(CC);
@@ -2781,6 +2891,11 @@
if (!mayTailCallThisCC(CalleeCC))
return false;
+ // For a divergent call target, we need to do a waterfall loop over the
+ // possible callees, which precludes us from using a simple jump.
+ if (Callee->isDivergent())
+ return false;
+
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF.getCallingConv();
@@ -2888,12 +3003,6 @@
if (!CLI.CB)
report_fatal_error("unsupported libcall legalization");
- if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
- !CLI.CB->getCalledFunction() && CallConv != CallingConv::AMDGPU_Gfx) {
- return lowerUnhandledCall(CLI, InVals,
- "unsupported indirect call to function ");
- }
-
if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
return lowerUnhandledCall(CLI, InVals,
"unsupported required tail call to function ");
@@ -3054,7 +3163,10 @@
// locations, which are supposed to be immutable?
Chain = addTokenForArgument(Chain, DAG, MFI, FI);
} else {
- DstAddr = PtrOff;
+ // Stores to the argument stack area are relative to the stack pointer.
+ SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
+ MVT::i32);
+ DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
Alignment =
commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
@@ -4150,11 +4262,35 @@
return BB;
}
case AMDGPU::DS_GWS_INIT:
- case AMDGPU::DS_GWS_SEMA_V:
case AMDGPU::DS_GWS_SEMA_BR:
+ case AMDGPU::DS_GWS_BARRIER:
+ if (Subtarget->needsAlignedVGPRs()) {
+ // Add implicit aligned super-reg to force alignment on the data operand.
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+ Register DataReg = Op->getReg();
+ bool IsAGPR = TRI->isAGPR(MRI, DataReg);
+ Register Undef = MRI.createVirtualRegister(
+ IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
+ Register NewVR =
+ MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
+ : &AMDGPU::VReg_64_Align2RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), NewVR)
+ .addReg(DataReg, 0, Op->getSubReg())
+ .addImm(AMDGPU::sub0)
+ .addReg(Undef)
+ .addImm(AMDGPU::sub1);
+ Op->setReg(NewVR);
+ Op->setSubReg(AMDGPU::sub0);
+ MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
+ }
+ LLVM_FALLTHROUGH;
+ case AMDGPU::DS_GWS_SEMA_V:
case AMDGPU::DS_GWS_SEMA_P:
case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
- case AMDGPU::DS_GWS_BARRIER:
// A s_waitcnt 0 is required to be the instruction immediately following.
if (getSubtarget()->hasGWSAutoReplay()) {
bundleInstWithWaitcnt(MI);
@@ -4360,7 +4496,8 @@
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
+ VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4381,7 +4518,8 @@
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
+ VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4456,6 +4594,9 @@
return lowerFMINNUM_FMAXNUM(Op, DAG);
case ISD::FMA:
return splitTernaryVectorOp(Op, DAG);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return LowerFP_TO_INT(Op, DAG);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
@@ -5092,12 +5233,35 @@
}
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
+ if (!Subtarget->isTrapHandlerEnabled() ||
+ Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
+ return lowerTrapEndpgm(Op, DAG);
+
+ if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(Subtarget)) {
+ switch (*HsaAbiVer) {
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ return lowerTrapHsaQueuePtr(Op, DAG);
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ return Subtarget->supportsGetDoorbellID() ?
+ lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG);
+ }
+ }
+
+ llvm_unreachable("Unknown trap handler");
+}
+
+SDValue SITargetLowering::lowerTrapEndpgm(
+ SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
+ return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
+}
- if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
- !Subtarget->isTrapHandlerEnabled())
- return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
+SDValue SITargetLowering::lowerTrapHsaQueuePtr(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Chain = Op.getOperand(0);
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
@@ -5108,22 +5272,37 @@
SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
QueuePtr, SDValue());
+
+ uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
SDValue Ops[] = {
ToReg,
- DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
+ DAG.getTargetConstant(TrapID, SL, MVT::i16),
SGPR01,
ToReg.getValue(1)
};
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
+SDValue SITargetLowering::lowerTrapHsa(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Chain = Op.getOperand(0);
+
+ uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
+ SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(TrapID, SL, MVT::i16)
+ };
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
+}
+
SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
MachineFunction &MF = DAG.getMachineFunction();
- if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
- !Subtarget->isTrapHandlerEnabled()) {
+ if (!Subtarget->isTrapHandlerEnabled() ||
+ Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
"debugtrap handler not supported",
Op.getDebugLoc(),
@@ -5133,9 +5312,10 @@
return Chain;
}
+ uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
SDValue Ops[] = {
Chain,
- DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
+ DAG.getTargetConstant(TrapID, SL, MVT::i16)
};
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
@@ -5666,23 +5846,10 @@
ArrayRef<SDValue> Elts) {
assert(!Elts.empty());
MVT Type;
- unsigned NumElts;
+ unsigned NumElts = Elts.size();
- if (Elts.size() == 1) {
- Type = MVT::f32;
- NumElts = 1;
- } else if (Elts.size() == 2) {
- Type = MVT::v2f32;
- NumElts = 2;
- } else if (Elts.size() == 3) {
- Type = MVT::v3f32;
- NumElts = 3;
- } else if (Elts.size() <= 4) {
- Type = MVT::v4f32;
- NumElts = 4;
- } else if (Elts.size() <= 8) {
- Type = MVT::v8f32;
- NumElts = 8;
+ if (NumElts <= 8) {
+ Type = MVT::getVectorVT(MVT::f32, NumElts);
} else {
assert(Elts.size() <= 16);
Type = MVT::v16f32;
@@ -5704,28 +5871,6 @@
return DAG.getBuildVector(Type, DL, VecElts);
}
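With the if/else ladder collapsed above, the address-vector type now comes straight from the element count: an exact vNf32 for up to eight dwords, otherwise the padded v16f32 case. A small sketch mirroring that selection (assumes the Elts.size() <= 16 invariant asserted in the function):

#include "llvm/Support/MachineValueType.h"

static llvm::MVT dwordVectorType(unsigned NumElts) {
  return NumElts <= 8 ? llvm::MVT::getVectorVT(llvm::MVT::f32, NumElts)
                      : llvm::MVT::v16f32; // remaining lanes are padded
}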
-static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
- SDValue *GLC, SDValue *SLC, SDValue *DLC) {
- auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());
-
- uint64_t Value = CachePolicyConst->getZExtValue();
- SDLoc DL(CachePolicy);
- if (GLC) {
- *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
- Value &= ~(uint64_t)0x1;
- }
- if (SLC) {
- *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
- Value &= ~(uint64_t)0x2;
- }
- if (DLC) {
- *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
- Value &= ~(uint64_t)0x4;
- }
-
- return Value == 0;
-}
-
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
SDValue Src, int ExtraElts) {
EVT SrcVT = Src.getValueType();
@@ -5752,7 +5897,7 @@
ArrayRef<EVT> ResultTypes,
bool IsTexFail, bool Unpacked, bool IsD16,
int DMaskPop, int NumVDataDwords,
- const SDLoc &DL, LLVMContext &Context) {
+ const SDLoc &DL) {
// Determine the required return type. This is the same regardless of IsTexFail flag
EVT ReqRetVT = ResultTypes[0];
int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
@@ -5835,11 +5980,11 @@
return Value == 0;
}
-static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op,
- MVT PackVectorVT,
- SmallVectorImpl<SDValue> &PackedAddrs,
- unsigned DimIdx, unsigned EndIdx,
- unsigned NumGradients) {
+static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
+ MVT PackVectorVT,
+ SmallVectorImpl<SDValue> &PackedAddrs,
+ unsigned DimIdx, unsigned EndIdx,
+ unsigned NumGradients) {
SDLoc DL(Op);
for (unsigned I = DimIdx; I < EndIdx; I++) {
SDValue Addr = Op.getOperand(I);
@@ -5994,56 +6139,64 @@
MVT VAddrVT =
Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
MVT VAddrScalarVT = VAddrVT.getScalarType();
- MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
+ MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
VAddrScalarVT = VAddrVT.getScalarType();
+ MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
- if (IsA16 || IsG16) {
- if (IsA16) {
- if (!ST->hasA16()) {
- LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
- "support 16 bit addresses\n");
- return Op;
- }
- if (!IsG16) {
- LLVM_DEBUG(
- dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
- "need 16 bit derivatives but got 32 bit derivatives\n");
- return Op;
- }
- } else if (!ST->hasG16()) {
+
+ if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
+ // 16 bit gradients are supported, but they are tied to the A16 control,
+ // so both gradients and addresses must be 16 bit.
+ LLVM_DEBUG(
+ dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
+ "require 16 bit args for both gradients and addresses");
+ return Op;
+ }
+
+ if (IsA16) {
+ if (!ST->hasA16()) {
LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
- "support 16 bit derivatives\n");
+ "support 16 bit addresses\n");
return Op;
}
+ }
- if (BaseOpcode->Gradients && !IsA16) {
- if (!ST->hasG16()) {
- LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
- "support 16 bit derivatives\n");
- return Op;
- }
- // Activate g16
- const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
- AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
- IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
- }
+ // We've dealt with incorrect input, so we know that if IsA16 or IsG16
+ // is set then we have to compress/pack operands (either addresses,
+ // gradients, or both).
+ // In the case where a16 and gradients are tied (no G16 support) we have
+ // already verified that both IsA16 and IsG16 are true.
+ if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
+ // Activate g16
+ const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
+ AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
+ IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
+ }
- // Don't compress addresses for G16
- const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
- packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs,
- ArgOffset + Intr->GradientStart, PackEndIdx,
- Intr->NumGradients);
-
- if (!IsA16) {
- // Add uncompressed address
- for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
- VAddrs.push_back(Op.getOperand(I));
- }
+ // Add gradients (packed or unpacked)
+ if (IsG16) {
+ // Pack the gradients
+ // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
+ packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
+ ArgOffset + Intr->GradientStart,
+ ArgOffset + Intr->CoordStart, Intr->NumGradients);
} else {
- for (unsigned I = ArgOffset + Intr->GradientStart; I < VAddrEnd; I++)
+ for (unsigned I = ArgOffset + Intr->GradientStart;
+ I < ArgOffset + Intr->CoordStart; I++)
+ VAddrs.push_back(Op.getOperand(I));
+ }
+
+ // Add addresses (packed or unpacked)
+ if (IsA16) {
+ packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
+ ArgOffset + Intr->CoordStart, VAddrEnd,
+ 0 /* No gradients */);
+ } else {
+ // Add uncompressed address
+ for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
VAddrs.push_back(Op.getOperand(I));
}
@@ -6058,8 +6211,9 @@
//
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
- bool UseNSA =
- ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
+ bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
+ VAddrs.size() >= 3 &&
+ VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
SDValue VAddr;
if (!UseNSA)
VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
@@ -6120,19 +6274,12 @@
}
}
- SDValue GLC;
- SDValue SLC;
- SDValue DLC;
- if (BaseOpcode->Atomic) {
- GLC = True; // TODO no-return optimization
- if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex),
- DAG, nullptr, &SLC, IsGFX10Plus ? &DLC : nullptr))
- return Op;
- } else {
- if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex),
- DAG, &GLC, &SLC, IsGFX10Plus ? &DLC : nullptr))
- return Op;
- }
+ unsigned CPol = cast<ConstantSDNode>(
+ Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
+ if (BaseOpcode->Atomic)
+ CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+ if (CPol & ~AMDGPU::CPol::ALL)
+ return Op;
SmallVector<SDValue, 26> Ops;
if (BaseOpcode->Store || BaseOpcode->Atomic)
@@ -6148,16 +6295,17 @@
if (IsGFX10Plus)
Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
Ops.push_back(Unorm);
- if (IsGFX10Plus)
- Ops.push_back(DLC);
- Ops.push_back(GLC);
- Ops.push_back(SLC);
+ Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
Ops.push_back(IsA16 && // r128, a16 for gfx9
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
if (IsGFX10Plus)
Ops.push_back(IsA16 ? True : False);
- Ops.push_back(TFE);
- Ops.push_back(LWE);
+ if (!Subtarget->hasGFX90AInsts()) {
+ Ops.push_back(TFE); // tfe
+ } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) {
+ report_fatal_error("TFE is not supported on this GPU");
+ }
+ Ops.push_back(LWE); // lwe
if (!IsGFX10Plus)
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
@@ -6175,7 +6323,15 @@
: AMDGPU::MIMGEncGfx10Default,
NumVDataDwords, NumVAddrDwords);
} else {
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->hasGFX90AInsts()) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1)
+ report_fatal_error(
+ "requested image instruction is not supported on this GPU");
+ }
+ if (Opcode == -1 &&
+ Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
NumVDataDwords, NumVAddrDwords);
if (Opcode == -1)
@@ -6194,15 +6350,13 @@
SmallVector<SDValue, 1> Elt;
DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
- } else if (!BaseOpcode->Store) {
- return constructRetValue(DAG, NewNode,
- OrigResultTypes, IsTexFail,
- Subtarget->hasUnpackedD16VMem(), IsD16,
- DMaskLanes, NumVDataDwords, DL,
- *DAG.getContext());
}
-
- return SDValue(NewNode, 0);
+ if (BaseOpcode->Store)
+ return SDValue(NewNode, 0);
+ return constructRetValue(DAG, NewNode,
+ OrigResultTypes, IsTexFail,
+ Subtarget->hasUnpackedD16VMem(), IsD16,
+ DMaskLanes, NumVDataDwords, DL);
}
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
@@ -6448,11 +6602,8 @@
return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
- bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
- SDValue GLC;
- SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
- if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
- IsGFX10Plus ? &DLC : nullptr))
+ unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ if (CPol & ~AMDGPU::CPol::ALL)
return Op;
return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
DAG);
@@ -6607,6 +6758,9 @@
case Intrinsic::amdgcn_alignbit:
return DAG.getNode(ISD::FSHR, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_perm:
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_reloc_constant: {
Module *M = const_cast<Module *>(MF.getFunction().getParent());
const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
@@ -6626,28 +6780,29 @@
}
}
-// This function computes an appropriate offset to pass to
-// MachineMemOperand::setOffset() based on the offset inputs to
-// an intrinsic. If any of the offsets are non-contstant or
-// if VIndex is non-zero then this function returns 0. Otherwise,
-// it returns the sum of VOffset, SOffset, and Offset.
-static unsigned getBufferOffsetForMMO(SDValue VOffset,
- SDValue SOffset,
- SDValue Offset,
- SDValue VIndex = SDValue()) {
-
+/// Update \p MMO based on the offset inputs to an intrinsic.
+static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset,
+ SDValue SOffset, SDValue Offset,
+ SDValue VIndex = SDValue()) {
if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
- !isa<ConstantSDNode>(Offset))
- return 0;
-
- if (VIndex) {
- if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue())
- return 0;
+ !isa<ConstantSDNode>(Offset)) {
+ // The combined offset is not known to be constant, so we cannot represent
+ // it in the MMO. Give up.
+ MMO->setValue((Value *)nullptr);
+ return;
}
- return cast<ConstantSDNode>(VOffset)->getSExtValue() +
- cast<ConstantSDNode>(SOffset)->getSExtValue() +
- cast<ConstantSDNode>(Offset)->getSExtValue();
+ if (VIndex && (!isa<ConstantSDNode>(VIndex) ||
+ !cast<ConstantSDNode>(VIndex)->isNullValue())) {
+ // The strided index component of the address is not known to be zero, so we
+ // cannot represent it in the MMO. Give up.
+ MMO->setValue((Value *)nullptr);
+ return;
+ }
+
+ MMO->setOffset(cast<ConstantSDNode>(VOffset)->getSExtValue() +
+ cast<ConstantSDNode>(SOffset)->getSExtValue() +
+ cast<ConstantSDNode>(Offset)->getSExtValue());
}
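updateBufferMMO above only records an offset in the MMO when every component is a known constant (and vindex, if present, is zero); the recorded value is just the sum of the three parts. A trivial sketch of that arithmetic:

#include <cstdint>

// What MMO->setOffset() receives in the all-constant case.
static int64_t combinedBufferOffset(int64_t VOffset, int64_t SOffset,
                                    int64_t ImmOffset) {
  return VOffset + SOffset + ImmOffset;
}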
SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
@@ -6670,13 +6825,21 @@
};
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]);
EVT MemVT = VData.getValueType();
return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
M->getMemOperand());
}
+// Return a value to use for the idxen operand by examining the vindex operand.
+static unsigned getIdxEn(SDValue VIndex) {
+ if (auto VIndexC = dyn_cast<ConstantSDNode>(VIndex))
+ // No need to set idxen if vindex is known to be zero.
+ return VIndexC->getZExtValue() != 0;
+ return 1;
+}
+
SDValue
SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
unsigned NewOpcode) const {
@@ -6697,8 +6860,7 @@
};
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
- Ops[3]));
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
EVT MemVT = VData.getValueType();
return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
@@ -6811,9 +6973,7 @@
case Intrinsic::amdgcn_buffer_load_format: {
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@@ -6824,11 +6984,7 @@
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
-
- unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
+ setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
@@ -6836,7 +6992,7 @@
EVT VT = Op.getValueType();
EVT IntVT = VT.changeTypeToInteger();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
+ updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]);
EVT LoadVT = Op.getValueType();
if (LoadVT.getScalarType() == MVT::f16)
@@ -6868,7 +7024,7 @@
};
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5]));
+ updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5]);
return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_struct_buffer_load:
@@ -6888,8 +7044,7 @@
};
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5],
- Ops[2]));
+ updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]);
return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_tbuffer_load: {
@@ -6900,9 +7055,7 @@
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@@ -6983,9 +7136,7 @@
case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_buffer_atomic_fadd: {
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(4));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // vdata
@@ -6997,14 +7148,12 @@
DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
+ setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
unsigned Opcode = 0;
switch (IntrID) {
@@ -7042,7 +7191,7 @@
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
case Intrinsic::amdgcn_buffer_atomic_fadd:
- if (!Op.getValue(0).use_empty()) {
+ if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
DiagnosticInfoUnsupported
NoFpRet(DAG.getMachineFunction().getFunction(),
"return versions of fp atomics not supported",
@@ -7063,6 +7212,14 @@
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
case Intrinsic::amdgcn_raw_buffer_atomic_add:
@@ -7119,9 +7276,7 @@
case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(5));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
@@ -7134,13 +7289,11 @@
DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
+ setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
+
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
+ updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -7161,7 +7314,7 @@
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7]));
+ updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7]);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -7182,33 +7335,11 @@
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7],
- Ops[4]));
+ updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
- case Intrinsic::amdgcn_global_atomic_fadd: {
- if (!Op.getValue(0).use_empty()) {
- DiagnosticInfoUnsupported
- NoFpRet(DAG.getMachineFunction().getFunction(),
- "return versions of fp atomics not supported",
- DL.getDebugLoc(), DS_Error);
- DAG.getContext()->diagnose(NoFpRet);
- return SDValue();
- }
- MemSDNode *M = cast<MemSDNode>(Op);
- SDValue Ops[] = {
- M->getOperand(0), // Chain
- M->getOperand(2), // Ptr
- M->getOperand(3) // Value
- };
-
- EVT VT = Op.getOperand(3).getValueType();
- return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
- DAG.getVTList(VT, MVT::Other), Ops,
- M->getMemOperand());
- }
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
SDLoc DL(Op);
MemSDNode *M = cast<MemSDNode>(Op);
@@ -7224,6 +7355,11 @@
assert(RayDir.getValueType() == MVT::v4f16 ||
RayDir.getValueType() == MVT::v4f32);
+ if (!Subtarget->hasGFX10_AEncoding()) {
+ emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
+ return SDValue();
+ }
+
bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
bool Is64 = NodePtr.getValueType() == MVT::i64;
unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
@@ -7279,7 +7415,55 @@
DAG.setNodeMemRefs(NewNode, {MemRef});
return SDValue(NewNode, 0);
}
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
+ DiagnosticInfoUnsupported
+ NoFpRet(DAG.getMachineFunction().getFunction(),
+ "return versions of fp atomics not supported",
+ DL.getDebugLoc(), DS_Error);
+ DAG.getContext()->diagnose(NoFpRet);
+ return SDValue();
+ }
+ LLVM_FALLTHROUGH;
+ case Intrinsic::amdgcn_global_atomic_fmin:
+ case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ SDValue Ops[] = {
+ M->getOperand(0), // Chain
+ M->getOperand(2), // Ptr
+ M->getOperand(3) // Value
+ };
+ unsigned Opcode = 0;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fadd: {
+ EVT VT = Op.getOperand(3).getValueType();
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
+ DAG.getVTList(VT, MVT::Other), Ops,
+ M->getMemOperand());
+ }
+ case Intrinsic::amdgcn_global_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmin: {
+ Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
+ break;
+ }
+ case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmax: {
+ Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
+ break;
+ }
+ default:
+ llvm_unreachable("unhandled atomic opcode");
+ }
+ return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
+ M->getVTList(), Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ }
default:
+
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
return lowerImage(Op, ImageDimIntr, DAG, true);
@@ -7448,9 +7632,7 @@
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(4));
SDValue Ops[] = {
Chain,
VData, // vdata
@@ -7461,7 +7643,7 @@
Op.getOperand(7), // offset
DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idexen
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -7486,7 +7668,7 @@
Offsets.second, // offset
Op.getOperand(7), // format
Op.getOperand(8), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(1, DL, MVT::i1), // idexen
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -7511,7 +7693,7 @@
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(0, DL, MVT::i1), // idexen
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -7528,9 +7710,7 @@
VData = handleD16VData(VData, DAG);
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(4));
SDValue Ops[] = {
Chain,
VData,
@@ -7542,15 +7722,13 @@
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
+ setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+
unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
@@ -7597,7 +7775,7 @@
IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
@@ -7644,8 +7822,7 @@
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
- Ops[3]));
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
@@ -7725,9 +7902,9 @@
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
-unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
- SelectionDAG &DAG, SDValue *Offsets,
- Align Alignment) const {
+void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+ SelectionDAG &DAG, SDValue *Offsets,
+ Align Alignment) const {
SDLoc DL(CombinedOffset);
if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
uint32_t Imm = C->getZExtValue();
@@ -7737,7 +7914,7 @@
Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
- return SOffset + ImmOffset;
+ return;
}
}
if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
@@ -7750,13 +7927,12 @@
Offsets[0] = N0;
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
- return 0;
+ return;
}
}
Offsets[0] = CombinedOffset;
Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
- return 0;
}
// Handle 8 bit and 16 bit buffer loads
@@ -8263,8 +8439,8 @@
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
-static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
- const SDLoc &SL, const GCNSubtarget *ST) {
+static SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
+ const SDLoc &SL, const GCNSubtarget *ST) {
assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
? FP_DENORM_FLUSH_NONE
@@ -8794,18 +8970,20 @@
}
// Returns true if argument is a boolean value which is not serialized into
-// memory or argument and does not require v_cmdmask_b32 to be deserialized.
+// memory or argument and does not require v_cndmask_b32 to be deserialized.
static bool isBoolSGPR(SDValue V) {
if (V.getValueType() != MVT::i1)
return false;
switch (V.getOpcode()) {
- default: break;
+ default:
+ break;
case ISD::SETCC:
+ case AMDGPUISD::FP_CLASS:
+ return true;
case ISD::AND:
case ISD::OR:
case ISD::XOR:
- case AMDGPUISD::FP_CLASS:
- return true;
+ return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
}
return false;
}
@@ -9206,63 +9384,6 @@
return SDValue();
}
-// Instructions that will be lowered with a final instruction that zeros the
-// high result bits.
-// XXX - probably only need to list legal operations.
-static bool fp16SrcZerosHighBits(unsigned Opc) {
- switch (Opc) {
- case ISD::FADD:
- case ISD::FSUB:
- case ISD::FMUL:
- case ISD::FDIV:
- case ISD::FREM:
- case ISD::FMA:
- case ISD::FMAD:
- case ISD::FCANONICALIZE:
- case ISD::FP_ROUND:
- case ISD::UINT_TO_FP:
- case ISD::SINT_TO_FP:
- case ISD::FABS:
- // Fabs is lowered to a bit operation, but it's an and which will clear the
- // high bits anyway.
- case ISD::FSQRT:
- case ISD::FSIN:
- case ISD::FCOS:
- case ISD::FPOWI:
- case ISD::FPOW:
- case ISD::FLOG:
- case ISD::FLOG2:
- case ISD::FLOG10:
- case ISD::FEXP:
- case ISD::FEXP2:
- case ISD::FCEIL:
- case ISD::FTRUNC:
- case ISD::FRINT:
- case ISD::FNEARBYINT:
- case ISD::FROUND:
- case ISD::FFLOOR:
- case ISD::FMINNUM:
- case ISD::FMAXNUM:
- case AMDGPUISD::FRACT:
- case AMDGPUISD::CLAMP:
- case AMDGPUISD::COS_HW:
- case AMDGPUISD::SIN_HW:
- case AMDGPUISD::FMIN3:
- case AMDGPUISD::FMAX3:
- case AMDGPUISD::FMED3:
- case AMDGPUISD::FMAD_FTZ:
- case AMDGPUISD::RCP:
- case AMDGPUISD::RSQ:
- case AMDGPUISD::RCP_IFLAG:
- case AMDGPUISD::LDEXP:
- return true;
- default:
- // fcopysign, select and others may be lowered to 32-bit bit operations
- // which don't zero the high bits.
- return false;
- }
-}
-
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (!Subtarget->has16BitInsts() ||
@@ -9277,15 +9398,6 @@
if (Src.getValueType() != MVT::i16)
return SDValue();
- // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
- // FIXME: It is not universally true that the high bits are zeroed on gfx9.
- if (Src.getOpcode() == ISD::BITCAST) {
- SDValue BCSrc = Src.getOperand(0);
- if (BCSrc.getValueType() == MVT::f16 &&
- fp16SrcZerosHighBits(BCSrc.getOpcode()))
- return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
- }
-
return SDValue();
}
@@ -9482,19 +9594,18 @@
// Could be anything.
return false;
- case ISD::BITCAST: {
+ case ISD::BITCAST:
+ return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+ case ISD::TRUNCATE: {
// Hack round the mess we make when legalizing extract_vector_elt
- SDValue Src = Op.getOperand(0);
- if (Src.getValueType() == MVT::i16 &&
- Src.getOpcode() == ISD::TRUNCATE) {
- SDValue TruncSrc = Src.getOperand(0);
+ if (Op.getValueType() == MVT::i16) {
+ SDValue TruncSrc = Op.getOperand(0);
if (TruncSrc.getValueType() == MVT::i32 &&
TruncSrc.getOpcode() == ISD::BITCAST &&
TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
}
}
-
return false;
}
case ISD::INTRINSIC_WO_CHAIN: {
@@ -9527,6 +9638,45 @@
llvm_unreachable("invalid operation");
}
+bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
+ unsigned MaxDepth) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineInstr *MI = MRI.getVRegDef(Reg);
+ unsigned Opcode = MI->getOpcode();
+
+ if (Opcode == AMDGPU::G_FCANONICALIZE)
+ return true;
+
+ if (Opcode == AMDGPU::G_FCONSTANT) {
+ auto F = MI->getOperand(1).getFPImm()->getValueAPF();
+ if (F.isNaN() && F.isSignaling())
+ return false;
+ return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF);
+ }
+
+ if (MaxDepth == 0)
+ return false;
+
+ switch (Opcode) {
+ case AMDGPU::G_FMINNUM_IEEE:
+ case AMDGPU::G_FMAXNUM_IEEE: {
+ if (Subtarget->supportsMinMaxDenormModes() ||
+ denormalsEnabledForType(MRI.getType(Reg), MF))
+ return true;
+ for (unsigned I = 1, E = MI->getNumOperands(); I != E; ++I) {
+ if (!isCanonicalized(MI->getOperand(I).getReg(), MF, MaxDepth - 1))
+ return false;
+ }
+ return true;
+ }
+ default:
+ return denormalsEnabledForType(MRI.getType(Reg), MF) &&
+ isKnownNeverSNaN(Reg, MRI);
+ }
+
+ llvm_unreachable("invalid operation");
+}
+
// Constant fold canonicalize.
SDValue SITargetLowering::getCanonicalConstantFP(
SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
@@ -9694,15 +9844,19 @@
}
// If there isn't a 16-bit med3 operation, convert to 32-bit.
- MVT NVT = MVT::i32;
- unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ if (VT == MVT::i16) {
+ MVT NVT = MVT::i32;
+ unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
- SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
- SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+ SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+ SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+ SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
- SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
- return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
+ SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
+ }
+
+ return SDValue();
}
static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
@@ -10408,7 +10562,7 @@
EVT VT = N->getValueType(0);
SDLoc SL(N);
- if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
+ if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
return SDValue();
// FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
@@ -10791,7 +10945,7 @@
unsigned NewDmask = 0;
unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
- bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
+ bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
unsigned TFCLane = 0;
bool HasChain = Node->getNumValues() > 1;
@@ -11067,6 +11221,95 @@
return Node;
}
+// Any MIMG instruction that uses tfe or lwe requires an initialization of the
+// result register that will be written in the case of a memory access failure.
+// The required code is also added to tie this init code to the result of the
+// img instruction.
+void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ MachineBasicBlock &MBB = *MI.getParent();
+
+ MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
+ MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
+ MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
+
+ if (!TFE && !LWE) // intersect_ray
+ return;
+
+ unsigned TFEVal = TFE ? TFE->getImm() : 0;
+ unsigned LWEVal = LWE->getImm();
+ unsigned D16Val = D16 ? D16->getImm() : 0;
+
+ if (!TFEVal && !LWEVal)
+ return;
+
+ // At least one of TFE or LWE is non-zero, so we have to insert a suitable
+ // initialization of the result value and tie this to the dest of the image
+ // instruction.
+
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ int DstIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+
+ // Calculate which dword we have to initialize to 0.
+ MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
+
+ // Check that the dmask operand is found.
+ assert(MO_Dmask && "Expected dmask operand in instruction");
+
+ unsigned dmask = MO_Dmask->getImm();
+ // Determine the number of active lanes taking into account the
+ // Gather4 special case
+ unsigned ActiveLanes = TII->isGather4(MI) ? 4 : countPopulation(dmask);
+
+ bool Packed = !Subtarget->hasUnpackedD16VMem();
+
+ unsigned InitIdx =
+ D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
+
+ // Abandon the attempt if the dst size isn't large enough
+ // - this is in fact an error, but it is picked up elsewhere and
+ // reported correctly.
+ uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+ if (DstSize < InitIdx)
+ return;
+
+ // Create a register for the initialization value.
+ Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+ unsigned NewDst = 0; // Final initialized value will be in here
+
+ // If the PRTStrictNull feature is enabled (the default), initialize all the
+ // result registers to 0; otherwise initialize just the error indication
+ // register (VGPRn+1).
+ unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
+ unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
+
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
+ for (; SizeLeft; SizeLeft--, CurrIdx++) {
+ NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+ // Initialize dword
+ Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
+ .addImm(0);
+ // Insert into the super-reg
+ BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
+ .addReg(PrevDst)
+ .addReg(SubReg)
+ .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
+
+ PrevDst = NewDst;
+ }
+
+ // Add as an implicit operand
+ MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
+
+ // Tie the just added implicit operand to the dst
+ MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
+}
+
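
A minimal standalone sketch of the dword-count arithmetic used by AddIMGInit above, assuming only what the hunk shows (the helper name imgInitDwords and its parameters are hypothetical, not LLVM API): the active lanes come from the dmask popcount (always 4 for gather4), packed D16 results halve the data dwords, and one extra dword is reserved for the TFE/LWE error code.

#include <bitset>
#include <cassert>

// Hypothetical helper mirroring the arithmetic in AddIMGInit: data dwords =
// popcount(dmask) (always 4 for gather4), halved and rounded up when D16
// results are packed, plus one trailing dword for the TFE/LWE error code.
unsigned imgInitDwords(unsigned Dmask, bool IsGather4, bool D16, bool Packed) {
  unsigned ActiveLanes =
      IsGather4 ? 4u
                : static_cast<unsigned>(std::bitset<4>(Dmask & 0xf).count());
  return (D16 && Packed) ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
}

int main() {
  assert(imgInitDwords(0b0111, false, false, false) == 4); // 3 data dwords + err
  assert(imgInitDwords(0b0111, false, true, true) == 3);   // 2 packed dwords + err
  return 0;
}

With the PRTStrictNull default the loop above zero-initializes all of these dwords; otherwise only the final error dword (VGPRn+1) is written.
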
/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
@@ -11114,10 +11357,12 @@
int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
if (NoRetAtomicOp != -1) {
if (!Node->hasAnyUseOfValue(0)) {
- int Glc1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::glc1);
- if (Glc1Idx != -1)
- MI.RemoveOperand(Glc1Idx);
+ int CPolIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::cpol);
+ if (CPolIdx != -1) {
+ MachineOperand &CPol = MI.getOperand(CPolIdx);
+ CPol.setImm(CPol.getImm() & ~AMDGPU::CPol::GLC);
+ }
MI.RemoveOperand(0);
MI.setDesc(TII->get(NoRetAtomicOp));
return;
@@ -11148,6 +11393,9 @@
}
return;
}
+
+ if (TII->isMIMG(MI) && !MI.mayStore())
+ AddIMGInit(MI);
}
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
@@ -11226,9 +11474,11 @@
//===----------------------------------------------------------------------===//
std::pair<unsigned, const TargetRegisterClass *>
-SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
StringRef Constraint,
MVT VT) const {
+ const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
+
const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
const unsigned BitWidth = VT.getSizeInBits();
@@ -11257,7 +11507,7 @@
RC = &AMDGPU::VGPR_32RegClass;
break;
default:
- RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth);
+ RC = TRI->getVGPRClassForBitWidth(BitWidth);
if (!RC)
return std::make_pair(0U, nullptr);
break;
@@ -11271,7 +11521,7 @@
RC = &AMDGPU::AGPR_32RegClass;
break;
default:
- RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth);
+ RC = TRI->getAGPRClassForBitWidth(BitWidth);
if (!RC)
return std::make_pair(0U, nullptr);
break;
@@ -11444,6 +11694,47 @@
return false;
}
+static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
+ switch (UnalignedClassID) {
+ case AMDGPU::VReg_64RegClassID:
+ return AMDGPU::VReg_64_Align2RegClassID;
+ case AMDGPU::VReg_96RegClassID:
+ return AMDGPU::VReg_96_Align2RegClassID;
+ case AMDGPU::VReg_128RegClassID:
+ return AMDGPU::VReg_128_Align2RegClassID;
+ case AMDGPU::VReg_160RegClassID:
+ return AMDGPU::VReg_160_Align2RegClassID;
+ case AMDGPU::VReg_192RegClassID:
+ return AMDGPU::VReg_192_Align2RegClassID;
+ case AMDGPU::VReg_224RegClassID:
+ return AMDGPU::VReg_224_Align2RegClassID;
+ case AMDGPU::VReg_256RegClassID:
+ return AMDGPU::VReg_256_Align2RegClassID;
+ case AMDGPU::VReg_512RegClassID:
+ return AMDGPU::VReg_512_Align2RegClassID;
+ case AMDGPU::VReg_1024RegClassID:
+ return AMDGPU::VReg_1024_Align2RegClassID;
+ case AMDGPU::AReg_64RegClassID:
+ return AMDGPU::AReg_64_Align2RegClassID;
+ case AMDGPU::AReg_96RegClassID:
+ return AMDGPU::AReg_96_Align2RegClassID;
+ case AMDGPU::AReg_128RegClassID:
+ return AMDGPU::AReg_128_Align2RegClassID;
+ case AMDGPU::AReg_160RegClassID:
+ return AMDGPU::AReg_160_Align2RegClassID;
+ case AMDGPU::AReg_192RegClassID:
+ return AMDGPU::AReg_192_Align2RegClassID;
+ case AMDGPU::AReg_256RegClassID:
+ return AMDGPU::AReg_256_Align2RegClassID;
+ case AMDGPU::AReg_512RegClassID:
+ return AMDGPU::AReg_512_Align2RegClassID;
+ case AMDGPU::AReg_1024RegClassID:
+ return AMDGPU::AReg_1024_Align2RegClassID;
+ default:
+ return -1;
+ }
+}
+
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
@@ -11452,6 +11743,7 @@
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
if (Info->isEntryFunction()) {
// Callable functions have fixed registers used for stack access.
@@ -11474,7 +11766,6 @@
Info->limitOccupancy(MF);
if (ST.isWave32() && !MF.empty()) {
- const SIInstrInfo *TII = ST.getInstrInfo();
for (auto &MBB : MF) {
for (auto &MI : MBB) {
TII->fixImplicitOperands(MI);
@@ -11482,13 +11773,30 @@
}
}
+ // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
+ // classes if required. Ideally the register class constraints would differ
+ // per-subtarget, but there's no easy way to achieve that right now. This is
+ // not a problem for VGPRs because the correctly aligned VGPR class is implied
+ // from using them as the register class for legal types.
+ if (ST.needsAlignedVGPRs()) {
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ const Register Reg = Register::index2VirtReg(I);
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+ if (!RC)
+ continue;
+ int NewClassID = getAlignedAGPRClassID(RC->getID());
+ if (NewClassID != -1)
+ MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
+ }
+ }
+
TargetLoweringBase::finalizeLowering(MF);
// Allocate a VGPR for future SGPR Spill if
// "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
// FIXME: We won't need this hack if we split SGPR allocation from VGPR
- if (VGPRReserveforSGPRSpill && !Info->VGPRReservedForSGPRSpill &&
- !Info->isEntryFunction() && MF.getFrameInfo().hasStackObjects())
+ if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() &&
+ !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction())
Info->reserveVGPRforSGPRSpills(MF);
}
@@ -11690,8 +11998,37 @@
case ISD::INTRINSIC_W_CHAIN:
return AMDGPU::isIntrinsicSourceOfDivergence(
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+ case AMDGPUISD::ATOMIC_CMP_SWAP:
+ case AMDGPUISD::ATOMIC_INC:
+ case AMDGPUISD::ATOMIC_DEC:
+ case AMDGPUISD::ATOMIC_LOAD_FMIN:
+ case AMDGPUISD::ATOMIC_LOAD_FMAX:
+ case AMDGPUISD::BUFFER_ATOMIC_SWAP:
+ case AMDGPUISD::BUFFER_ATOMIC_ADD:
+ case AMDGPUISD::BUFFER_ATOMIC_SUB:
+ case AMDGPUISD::BUFFER_ATOMIC_SMIN:
+ case AMDGPUISD::BUFFER_ATOMIC_UMIN:
+ case AMDGPUISD::BUFFER_ATOMIC_SMAX:
+ case AMDGPUISD::BUFFER_ATOMIC_UMAX:
+ case AMDGPUISD::BUFFER_ATOMIC_AND:
+ case AMDGPUISD::BUFFER_ATOMIC_OR:
+ case AMDGPUISD::BUFFER_ATOMIC_XOR:
+ case AMDGPUISD::BUFFER_ATOMIC_INC:
+ case AMDGPUISD::BUFFER_ATOMIC_DEC:
+ case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
+ case AMDGPUISD::BUFFER_ATOMIC_CSUB:
+ case AMDGPUISD::BUFFER_ATOMIC_FADD:
+ case AMDGPUISD::BUFFER_ATOMIC_FMIN:
+ case AMDGPUISD::BUFFER_ATOMIC_FMAX:
+ // Target-specific read-modify-write atomics are sources of divergence.
+ return true;
+ default:
+ if (auto *A = dyn_cast<AtomicSDNode>(N)) {
+ // Generic read-modify-write atomics are sources of divergence.
+ return A->readMem() && A->writeMem();
+ }
+ return false;
}
- return false;
}
bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
@@ -11707,6 +12044,19 @@
}
}
+bool SITargetLowering::denormalsEnabledForType(LLT Ty,
+ MachineFunction &MF) const {
+ switch (Ty.getScalarSizeInBits()) {
+ case 32:
+ return hasFP32Denormals(MF);
+ case 64:
+ case 16:
+ return hasFP64FP16Denormals(MF);
+ default:
+ return false;
+ }
+}
+
bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN,
@@ -11745,24 +12095,57 @@
if (Ty->isHalfTy())
return AtomicExpansionKind::None;
- if (!Ty->isFloatTy())
+ if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
return AtomicExpansionKind::CmpXChg;
- // TODO: Do have these for flat. Older targets also had them for buffers.
unsigned AS = RMW->getPointerAddressSpace();
- if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
- if (!fpModeMatchesGlobalFPAtomicMode(RMW))
+ if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
+ Subtarget->hasAtomicFaddInsts()) {
+ // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
+ // floating point atomic instructions. May generate more efficient code,
+ // but may not respect rounding and denormal modes, and may give incorrect
+ // results for certain memory destinations.
+ if (RMW->getFunction()
+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+ .getValueAsString() != "true")
return AtomicExpansionKind::CmpXChg;
- return RMW->use_empty() ? AtomicExpansionKind::None :
- AtomicExpansionKind::CmpXChg;
+ if (Subtarget->hasGFX90AInsts()) {
+ if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS)
+ return AtomicExpansionKind::CmpXChg;
+
+ auto SSID = RMW->getSyncScopeID();
+ if (SSID == SyncScope::System ||
+ SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
+ return AtomicExpansionKind::CmpXChg;
+
+ return AtomicExpansionKind::None;
+ }
+
+ if (AS == AMDGPUAS::FLAT_ADDRESS)
+ return AtomicExpansionKind::CmpXChg;
+
+ return RMW->use_empty() ? AtomicExpansionKind::None
+ : AtomicExpansionKind::CmpXChg;
}
// DS FP atomics do repect the denormal mode, but the rounding mode is fixed
// to round-to-nearest-even.
- return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
- AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
+ // The only exception is DS_ADD_F64 which never flushes regardless of mode.
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) {
+ if (!Ty->isDoubleTy())
+ return AtomicExpansionKind::None;
+
+ return (fpModeMatchesGlobalFPAtomicMode(RMW) ||
+ RMW->getFunction()
+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+ .getValueAsString() == "true")
+ ? AtomicExpansionKind::None
+ : AtomicExpansionKind::CmpXChg;
+ }
+
+ return AtomicExpansionKind::CmpXChg;
}
default:
break;
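
The fadd expansion decision above is compact but branchy; the following hedged, self-contained sketch restates it with the subtarget and attribute queries stubbed out as plain booleans. All names here are hypothetical, the f16 early-out is omitted, and the system/"one-as" sync-scope check is folded into a single SystemScope flag.

#include <cassert>

enum class Expansion { None, CmpXChg };
enum class AddrSpace { Global, Flat, Local, Other };

// Sketch of the decision tree in the hunk above with every query stubbed out.
Expansion classifyFAddAtomic(bool IsF32, bool IsF64, AddrSpace AS,
                             bool HasGFX90AInsts, bool HasAtomicFaddInsts,
                             bool HasLDSFPAtomics, bool UnsafeFPAtomics,
                             bool SystemScope, bool FPModeMatches,
                             bool ResultUnused) {
  // Only f32, and f64 on gfx90a, are candidates at all.
  if (!IsF32 && !(HasGFX90AInsts && IsF64))
    return Expansion::CmpXChg;

  if ((AS == AddrSpace::Global || AS == AddrSpace::Flat) && HasAtomicFaddInsts) {
    if (!UnsafeFPAtomics) // the opt-in attribute is required for global/flat fadd
      return Expansion::CmpXChg;
    if (HasGFX90AInsts) {
      if (IsF32 && AS == AddrSpace::Flat)
        return Expansion::CmpXChg;
      return SystemScope ? Expansion::CmpXChg : Expansion::None;
    }
    if (AS == AddrSpace::Flat)
      return Expansion::CmpXChg;
    return ResultUnused ? Expansion::None : Expansion::CmpXChg;
  }

  // DS FP atomics honor the denormal mode; DS_ADD_F64 never flushes.
  if (AS == AddrSpace::Local && HasLDSFPAtomics) {
    if (!IsF64)
      return Expansion::None;
    return (FPModeMatches || UnsafeFPAtomics) ? Expansion::None
                                              : Expansion::CmpXChg;
  }
  return Expansion::CmpXChg;
}

int main() {
  // Global f32 fadd with the unsafe attribute and an unused result stays native.
  assert(classifyFAddAtomic(true, false, AddrSpace::Global,
                            /*HasGFX90AInsts=*/false, /*HasAtomicFaddInsts=*/true,
                            /*HasLDSFPAtomics=*/true, /*UnsafeFPAtomics=*/true,
                            /*SystemScope=*/false, /*FPModeMatches=*/true,
                            /*ResultUnused=*/true) == Expansion::None);
  return 0;
}
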
@@ -11872,10 +12255,11 @@
return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
}
-std::pair<int, MVT>
+std::pair<InstructionCost, MVT>
SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
Type *Ty) const {
- auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> Cost =
+ TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
auto Size = DL.getTypeSizeInBits(Ty);
// Maximum load or store can handle 8 dwords for scalar and 4 for
// vector ALU. Let's assume anything above 8 dwords is expensive
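
The hunk above only shows the head of getTypeLegalizationCost; as a hedged sketch (the tail of the function is not part of the hunk, so the exact penalty below is an assumption), the idea is that anything wider than 8 dwords picks up an extra per-256-bit cost.

#include <cassert>
#include <utility>

// The second pair member stands in for the legalized type; only the cost
// adjustment is of interest here. The >256-bit penalty is an assumption.
std::pair<unsigned, unsigned> typeLegalizationCost(unsigned BaseCost,
                                                   unsigned SizeInBits) {
  std::pair<unsigned, unsigned> Cost{BaseCost, SizeInBits};
  if (SizeInBits <= 256)
    return Cost; // up to 8 dwords: keep the generic cost
  Cost.first += (SizeInBits + 255) / 256; // anything wider is expensive
  return Cost;
}

int main() {
  assert(typeLegalizationCost(1, 128).first == 1); // 4 dwords: unchanged
  assert(typeLegalizationCost(1, 512).first == 3); // 16 dwords: 1 + 2
  return 0;
}
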
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h b/src/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 823d6ec..f3d3426 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -144,7 +144,11 @@
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerTrapHsaQueuePtr(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;
SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
@@ -227,10 +231,8 @@
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
- /// \returns 0 If there is a non-constant offset or if the offset is 0.
- /// Otherwise returns the constant offset.
- unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
- SDValue *Offsets, Align Alignment = Align(4)) const;
+ void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
+ SDValue *Offsets, Align Alignment = Align(4)) const;
// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
@@ -283,7 +285,7 @@
}
bool allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AS, unsigned Alignment,
+ EVT VT, unsigned AS, Align Alignment,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *IsFast = nullptr) const override;
@@ -393,6 +395,7 @@
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
+ void AddIMGInit(MachineInstr &MI) const;
void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
@@ -439,7 +442,10 @@
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
+ bool isCanonicalized(Register Reg, MachineFunction &MF,
+ unsigned MaxDepth = 5) const;
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const;
+ bool denormalsEnabledForType(LLT Ty, MachineFunction &MF) const;
bool isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
@@ -483,8 +489,8 @@
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
- std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL,
- Type *Ty) const;
+ std::pair<InstructionCost, MVT> getTypeLegalizationCost(const DataLayout &DL,
+ Type *Ty) const;
};
} // End namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 5611c9c..7ba20eb 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -63,30 +63,10 @@
HARDCLAUSE_ILLEGAL,
};
-HardClauseType getHardClauseType(const MachineInstr &MI) {
- // On current architectures we only get a benefit from clausing loads.
- if (MI.mayLoad()) {
- if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
- return HARDCLAUSE_VMEM;
- if (SIInstrInfo::isFLAT(MI))
- return HARDCLAUSE_FLAT;
- // TODO: LDS
- if (SIInstrInfo::isSMRD(MI))
- return HARDCLAUSE_SMEM;
- }
-
- // Don't form VALU clauses. It's not clear what benefit they give, if any.
-
- // In practice s_nop is the only internal instruction we're likely to see.
- // It's safe to treat the rest as illegal.
- if (MI.getOpcode() == AMDGPU::S_NOP)
- return HARDCLAUSE_INTERNAL;
- return HARDCLAUSE_ILLEGAL;
-}
-
class SIInsertHardClauses : public MachineFunctionPass {
public:
static char ID;
+ const GCNSubtarget *ST = nullptr;
SIInsertHardClauses() : MachineFunctionPass(ID) {}
@@ -95,6 +75,34 @@
MachineFunctionPass::getAnalysisUsage(AU);
}
+ HardClauseType getHardClauseType(const MachineInstr &MI) {
+
+ // On current architectures we only get a benefit from clausing loads.
+ if (MI.mayLoad()) {
+ if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) {
+ if (ST->hasNSAClauseBug()) {
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
+ if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA)
+ return HARDCLAUSE_ILLEGAL;
+ }
+ return HARDCLAUSE_VMEM;
+ }
+ if (SIInstrInfo::isFLAT(MI))
+ return HARDCLAUSE_FLAT;
+ // TODO: LDS
+ if (SIInstrInfo::isSMRD(MI))
+ return HARDCLAUSE_SMEM;
+ }
+
+ // Don't form VALU clauses. It's not clear what benefit they give, if any.
+
+ // In practice s_nop is the only internal instruction we're likely to see.
+ // It's safe to treat the rest as illegal.
+ if (MI.getOpcode() == AMDGPU::S_NOP)
+ return HARDCLAUSE_INTERNAL;
+ return HARDCLAUSE_ILLEGAL;
+ }
+
// Track information about a clause as we discover it.
struct ClauseInfo {
// The type of all (non-internal) instructions in the clause.
@@ -132,12 +140,12 @@
if (skipFunction(MF.getFunction()))
return false;
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasHardClauses())
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->hasHardClauses())
return false;
- const SIInstrInfo *SII = ST.getInstrInfo();
- const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *SII = ST->getInstrInfo();
+ const TargetRegisterInfo *TRI = ST->getRegisterInfo();
bool Changed = false;
for (auto &MBB : MF) {
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
deleted file mode 100644
index 9d31cd5..0000000
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ /dev/null
@@ -1,504 +0,0 @@
-//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass inserts branches on the 0 exec mask over divergent branches
-/// branches when it's expected that jumping over the untaken control flow will
-/// be cheaper than having every workitem no-op through it.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/InitializePasses.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-insert-skips"
-
-static cl::opt<unsigned> SkipThresholdFlag(
- "amdgpu-skip-threshold-legacy",
- cl::desc("Number of instructions before jumping over divergent control flow"),
- cl::init(12), cl::Hidden);
-
-namespace {
-
-class SIInsertSkips : public MachineFunctionPass {
-private:
- const SIRegisterInfo *TRI = nullptr;
- const SIInstrInfo *TII = nullptr;
- unsigned SkipThreshold = 0;
- MachineDominatorTree *MDT = nullptr;
-
- MachineBasicBlock *EarlyExitBlock = nullptr;
- bool EarlyExitClearsExec = false;
-
- bool shouldSkip(const MachineBasicBlock &From,
- const MachineBasicBlock &To) const;
-
- bool dominatesAllReachable(MachineBasicBlock &MBB);
- void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec);
- void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- DebugLoc DL);
-
- bool kill(MachineInstr &MI);
- void earlyTerm(MachineInstr &MI);
-
- bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
-
-public:
- static char ID;
-
- SIInsertSkips() : MachineFunctionPass(ID) {}
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override {
- return "SI insert s_cbranch_execz instructions";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-char SIInsertSkips::ID = 0;
-
-INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
- "SI insert s_cbranch_execz instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
- "SI insert s_cbranch_execz instructions", false, false)
-
-char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
-
-static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
- if (MI.isMetaInstruction())
- return true;
-
- // Handle target specific opcodes.
- switch (MI.getOpcode()) {
- case AMDGPU::SI_MASK_BRANCH:
- return true;
- default:
- return false;
- }
-}
-
-bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
- const MachineBasicBlock &To) const {
- unsigned NumInstr = 0;
- const MachineFunction *MF = From.getParent();
-
- for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
- MBBI != End && MBBI != ToI; ++MBBI) {
- const MachineBasicBlock &MBB = *MBBI;
-
- for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
- NumInstr < SkipThreshold && I != E; ++I) {
- if (opcodeEmitsNoInsts(*I))
- continue;
-
- // FIXME: Since this is required for correctness, this should be inserted
- // during SILowerControlFlow.
-
- // When a uniform loop is inside non-uniform control flow, the branch
- // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
- // when EXEC = 0. We should skip the loop lest it becomes infinite.
- if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
- I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
- return true;
-
- if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
- return true;
-
- // These instructions are potentially expensive even if EXEC = 0.
- if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
- I->getOpcode() == AMDGPU::S_WAITCNT)
- return true;
-
- ++NumInstr;
- if (NumInstr >= SkipThreshold)
- return true;
- }
- }
-
- return false;
-}
-
-/// Check whether \p MBB dominates all blocks that are reachable from it.
-bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
- for (MachineBasicBlock *Other : depth_first(&MBB)) {
- if (!MDT->dominates(&MBB, Other))
- return false;
- }
- return true;
-}
-
-static void generateEndPgm(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- const SIInstrInfo *TII, bool IsPS) {
- // "null export"
- if (IsPS) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
- .addImm(AMDGPU::Exp::ET_NULL)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addImm(1) // vm
- .addImm(0) // compr
- .addImm(0); // en
- }
- // s_endpgm
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
-}
-
-void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB,
- bool ClearExec) {
- MachineFunction *MF = MBB.getParent();
- DebugLoc DL;
-
- if (!EarlyExitBlock) {
- EarlyExitBlock = MF->CreateMachineBasicBlock();
- MF->insert(MF->end(), EarlyExitBlock);
- generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII,
- MF->getFunction().getCallingConv() ==
- CallingConv::AMDGPU_PS);
- EarlyExitClearsExec = false;
- }
-
- if (ClearExec && !EarlyExitClearsExec) {
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- auto ExitI = EarlyExitBlock->getFirstNonPHI();
- BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0);
- EarlyExitClearsExec = true;
- }
-}
-
-static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
- MachineDominatorTree *MDT) {
- MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);
-
- // Update dominator tree
- using DomTreeT = DomTreeBase<MachineBasicBlock>;
- SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
- for (MachineBasicBlock *Succ : SplitBB->successors()) {
- DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
- DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
- }
- DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
- MDT->getBase().applyUpdates(DTUpdates);
-}
-
-/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
-/// iterator. Only applies to pixel shaders.
-void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL) {
- MachineFunction *MF = MBB.getParent();
- (void)MF;
- assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
-
- // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
- // basic block that has no further successors (e.g., there was an
- // `unreachable` there in IR). This can happen with original source of the
- // form:
- //
- // if (uniform_condition) {
- // write_to_memory();
- // discard;
- // }
- //
- // In this case, we write the "null_export; s_endpgm" skip code in the
- // already-existing basic block.
- auto NextBBI = std::next(MBB.getIterator());
- bool NoSuccessor =
- I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI);
-
- if (NoSuccessor) {
- generateEndPgm(MBB, I, DL, TII, true);
- } else {
- ensureEarlyExitBlock(MBB, false);
-
- MachineInstr *BranchMI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
- .addMBB(EarlyExitBlock);
-
- // Split the block if the branch will not come at the end.
- auto Next = std::next(BranchMI->getIterator());
- if (Next != MBB.end() && !Next->isTerminator())
- splitBlock(MBB, *BranchMI, MDT);
-
- MBB.addSuccessor(EarlyExitBlock);
- MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
- }
-}
-
-/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
-/// Return true unless the terminator is a no-op.
-bool SIInsertSkips::kill(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
-
- switch (MI.getOpcode()) {
- case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
- unsigned Opcode = 0;
-
- // The opcodes are inverted because the inline immediate has to be
- // the first operand, e.g. from "x < imm" to "imm > x"
- switch (MI.getOperand(2).getImm()) {
- case ISD::SETOEQ:
- case ISD::SETEQ:
- Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
- break;
- case ISD::SETOGT:
- case ISD::SETGT:
- Opcode = AMDGPU::V_CMPX_LT_F32_e64;
- break;
- case ISD::SETOGE:
- case ISD::SETGE:
- Opcode = AMDGPU::V_CMPX_LE_F32_e64;
- break;
- case ISD::SETOLT:
- case ISD::SETLT:
- Opcode = AMDGPU::V_CMPX_GT_F32_e64;
- break;
- case ISD::SETOLE:
- case ISD::SETLE:
- Opcode = AMDGPU::V_CMPX_GE_F32_e64;
- break;
- case ISD::SETONE:
- case ISD::SETNE:
- Opcode = AMDGPU::V_CMPX_LG_F32_e64;
- break;
- case ISD::SETO:
- Opcode = AMDGPU::V_CMPX_O_F32_e64;
- break;
- case ISD::SETUO:
- Opcode = AMDGPU::V_CMPX_U_F32_e64;
- break;
- case ISD::SETUEQ:
- Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
- break;
- case ISD::SETUGT:
- Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
- break;
- case ISD::SETUGE:
- Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
- break;
- case ISD::SETULT:
- Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
- break;
- case ISD::SETULE:
- Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
- break;
- case ISD::SETUNE:
- Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
- break;
- default:
- llvm_unreachable("invalid ISD:SET cond code");
- }
-
- const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
- if (ST.hasNoSdstCMPX())
- Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);
-
- assert(MI.getOperand(0).isReg());
-
- if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
- MI.getOperand(0).getReg())) {
- Opcode = AMDGPU::getVOPe32(Opcode);
- BuildMI(MBB, &MI, DL, TII->get(Opcode))
- .add(MI.getOperand(1))
- .add(MI.getOperand(0));
- } else {
- auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
- if (!ST.hasNoSdstCMPX())
- I.addReg(AMDGPU::VCC, RegState::Define);
-
- I.addImm(0) // src0 modifiers
- .add(MI.getOperand(1))
- .addImm(0) // src1 modifiers
- .add(MI.getOperand(0));
-
- I.addImm(0); // omod
- }
- return true;
- }
- case AMDGPU::SI_KILL_I1_TERMINATOR: {
- const MachineFunction *MF = MI.getParent()->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- const MachineOperand &Op = MI.getOperand(0);
- int64_t KillVal = MI.getOperand(1).getImm();
- assert(KillVal == 0 || KillVal == -1);
-
- // Kill all threads if Op0 is an immediate and equal to the Kill value.
- if (Op.isImm()) {
- int64_t Imm = Op.getImm();
- assert(Imm == 0 || Imm == -1);
-
- if (Imm == KillVal) {
- BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
- : AMDGPU::S_MOV_B64), Exec)
- .addImm(0);
- return true;
- }
- return false;
- }
-
- unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
- if (ST.isWave32())
- Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
- BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
- .addReg(Exec)
- .add(Op);
- return true;
- }
- default:
- llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
- }
-}
-
-void SIInsertSkips::earlyTerm(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- const DebugLoc DL = MI.getDebugLoc();
-
- ensureEarlyExitBlock(MBB, true);
-
- auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
- .addMBB(EarlyExitBlock);
- auto Next = std::next(MI.getIterator());
-
- if (Next != MBB.end() && !Next->isTerminator())
- splitBlock(MBB, *BranchMI, MDT);
-
- MBB.addSuccessor(EarlyExitBlock);
- MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
-}
-
-// Returns true if a branch over the block was inserted.
-bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
- MachineBasicBlock &SrcMBB) {
- MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
-
- if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
- return false;
-
- const DebugLoc &DL = MI.getDebugLoc();
- MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
-
- BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
- .addMBB(DestBB);
-
- return true;
-}
-
-bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- TII = ST.getInstrInfo();
- TRI = &TII->getRegisterInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
- SkipThreshold = SkipThresholdFlag;
-
- SmallVector<MachineInstr *, 4> KillInstrs;
- SmallVector<MachineInstr *, 4> EarlyTermInstrs;
- bool MadeChange = false;
-
- for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
- Next = std::next(I);
- MachineInstr &MI = *I;
-
- switch (MI.getOpcode()) {
- case AMDGPU::SI_MASK_BRANCH:
- MadeChange |= skipMaskBranch(MI, MBB);
- break;
-
- case AMDGPU::S_BRANCH:
- // Optimize out branches to the next block.
- // FIXME: Shouldn't this be handled by BranchFolding?
- if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
- assert(&MI == &MBB.back());
- MI.eraseFromParent();
- MadeChange = true;
- }
- break;
-
- case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
- case AMDGPU::SI_KILL_I1_TERMINATOR: {
- MadeChange = true;
- bool CanKill = kill(MI);
-
- // Check if we can add an early "if exec=0 { end shader }".
- //
- // Note that we _always_ do this if it is correct, even if the kill
- // happens fairly late in the shader, because the null export should
- // generally still be cheaper than normal export(s).
- //
- // TODO: The dominatesAllReachable check is conservative: if the
- // dominance is only missing due to _uniform_ branches, we could
- // in fact insert the early-exit as well.
- if (CanKill &&
- MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
- dominatesAllReachable(MBB)) {
- // Mark the instruction for kill-if-dead insertion. We delay this
- // change because it modifies the CFG.
- KillInstrs.push_back(&MI);
- } else {
- MI.eraseFromParent();
- }
- break;
- }
-
- case AMDGPU::SI_KILL_CLEANUP:
- if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
- dominatesAllReachable(MBB)) {
- KillInstrs.push_back(&MI);
- } else {
- MI.eraseFromParent();
- }
- break;
-
- case AMDGPU::SI_EARLY_TERMINATE_SCC0:
- EarlyTermInstrs.push_back(&MI);
- break;
-
- default:
- break;
- }
- }
- }
-
- for (MachineInstr *Instr : EarlyTermInstrs) {
- // Early termination in GS does nothing
- if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
- earlyTerm(*Instr);
- Instr->eraseFromParent();
- }
- for (MachineInstr *Kill : KillInstrs) {
- skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
- Kill->getDebugLoc());
- Kill->eraseFromParent();
- }
- KillInstrs.clear();
- EarlyTermInstrs.clear();
- EarlyExitBlock = nullptr;
-
- return MadeChange;
-}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c127455..7d6f799 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -27,6 +27,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachinePostDominators.h"
@@ -131,7 +132,8 @@
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
- SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
+ SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
+ AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
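
A small standalone sketch of the register-to-slot mapping this enum enables (constants mirrored from the hunk, with AGPR_OFFSET taken as the maximum ArchVGPR count; the helper name is hypothetical): ArchVGPRs keep slots [0, 256) and AGPRs share the same scoring table starting at AGPR_OFFSET, which is why SQ_MAX_PGM_VGPRS grows to 512.

#include <cassert>

constexpr unsigned SQ_MAX_PGM_VGPRS = 512;
constexpr unsigned AGPR_OFFSET = 256;

// Map a vector register number to its waitcnt scoring-table slot.
unsigned vgprScoreSlot(unsigned RegNo, bool IsAGPR) {
  unsigned Slot = RegNo + (IsAGPR ? AGPR_OFFSET : 0u);
  assert(Slot < SQ_MAX_PGM_VGPRS && "register outside the scoring table");
  return Slot;
}

int main() {
  assert(vgprScoreSlot(3, /*IsAGPR=*/false) == 3);  // v3
  assert(vgprScoreSlot(3, /*IsAGPR=*/true) == 259); // a3
  return 0;
}
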
@@ -244,8 +246,8 @@
const SIRegisterInfo *TRI, unsigned OpNo) const;
bool counterOutOfOrder(InstCounterType T) const;
- bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
- bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+ void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+ void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
void determineWait(InstCounterType T, unsigned ScoreToWait,
AMDGPU::Waitcnt &Wait) const;
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
@@ -417,7 +419,7 @@
}
if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
- DebugCounter::shouldExecute(ForceLgkmCounter)) {
+ DebugCounter::shouldExecute(ForceLgkmCounter)) {
ForceEmitWaitcnt[LGKM_CNT] = true;
} else {
ForceEmitWaitcnt[LGKM_CNT] = false;
@@ -441,6 +443,9 @@
WaitcntBrackets *ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
+ bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr,
+ AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
};
} // end anonymous namespace
@@ -451,8 +456,7 @@
const SIRegisterInfo *TRI,
unsigned OpNo) const {
const MachineOperand &Op = MI->getOperand(OpNo);
- assert(Op.isReg());
- if (!TRI->isInAllocatableClass(Op.getReg()) || TRI->isAGPR(*MRI, Op.getReg()))
+ if (!TRI->isInAllocatableClass(Op.getReg()))
return {-1, -1};
// A use via a PW operand does not need a waitcnt.
@@ -463,9 +467,11 @@
unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
- if (TRI->isVGPR(*MRI, Op.getReg())) {
+ if (TRI->isVectorRegister(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
Result.first = Reg - RegisterEncoding.VGPR0;
+ if (TRI->isAGPR(*MRI, Op.getReg()))
+ Result.first += AGPR_OFFSET;
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
} else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
@@ -491,7 +497,7 @@
const MachineRegisterInfo *MRI, unsigned OpNo,
unsigned Val) {
RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
- assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg()));
+ assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
setRegScore(RegNo, EXP_CNT, Val);
}
@@ -538,7 +544,7 @@
AMDGPU::OpName::data1),
CurrScore);
}
- } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
+ } else if (SIInstrInfo::isAtomicRet(Inst) &&
Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
@@ -549,7 +555,8 @@
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
const MachineOperand &Op = Inst.getOperand(I);
- if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) {
+ if (Op.isReg() && !Op.isDef() &&
+ TRI->isVectorRegister(*MRI, Op.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
}
@@ -560,7 +567,7 @@
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
CurrScore);
- } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+ } else if (SIInstrInfo::isAtomicRet(Inst)) {
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
@@ -569,7 +576,7 @@
} else if (TII->isMIMG(Inst)) {
if (Inst.mayStore()) {
setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
- } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+ } else if (SIInstrInfo::isAtomicRet(Inst)) {
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
@@ -582,7 +589,7 @@
} else if (TII->isMUBUF(Inst)) {
if (Inst.mayStore()) {
setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
- } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+ } else if (SIInstrInfo::isAtomicRet(Inst)) {
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
@@ -606,7 +613,8 @@
}
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
MachineOperand &MO = Inst.getOperand(I);
- if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) {
+ if (MO.isReg() && !MO.isDef() &&
+ TRI->isVectorRegister(*MRI, MO.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
}
@@ -704,22 +712,23 @@
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
-bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
- return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
- simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
- simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
- simplifyWaitcnt(VS_CNT, Wait.VsCnt);
+void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
+ simplifyWaitcnt(VM_CNT, Wait.VmCnt);
+ simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
+ simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+ simplifyWaitcnt(VS_CNT, Wait.VsCnt);
}
-bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
+void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
unsigned &Count) const {
const unsigned LB = getScoreLB(T);
const unsigned UB = getScoreUB(T);
- if (Count < UB && UB - Count > LB)
- return true;
- Count = ~0u;
- return false;
+ // The number of outstanding events for this type, T, can be calculated
+ // as (UB - LB). If the current Count is greater than or equal to the number
+ // of outstanding events, then the wait for this counter is redundant.
+ if (Count >= UB - LB)
+ Count = ~0u;
}
void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
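
The redundancy rule in the comment above can be shown with a few lines of standalone code (a sketch with the score bracket passed in as plain integers rather than the pass's state): a wait survives only when it is stricter than the number of still-outstanding events, i.e. Count < UB - LB.

#include <cassert>

// [LB, UB) counts outstanding events of one counter type, so waiting for
// Count only matters when Count < UB - LB; ~0u means "no wait", as in the pass.
void simplifyWait(unsigned LB, unsigned UB, unsigned &Count) {
  if (Count >= UB - LB)
    Count = ~0u;
}

int main() {
  unsigned Count = 2;
  simplifyWait(5, 10, Count); // 5 events outstanding: waiting for 2 is kept
  assert(Count == 2);
  simplifyWait(5, 7, Count);  // only 2 outstanding: the wait is redundant
  assert(Count == ~0u);
  return 0;
}
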
@@ -794,6 +803,107 @@
return new SIInsertWaitcnts();
}
+/// Combine consecutive waitcnt instructions that precede \p MI and follow
+/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
+/// by previous passes. Currently this pass conservatively assumes that these
+/// preexisting waitcnt are required for correctness.
+bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr,
+ AMDGPU::Waitcnt &Wait,
+ const MachineInstr *MI) {
+ bool Modified = false;
+ MachineInstr *WaitcntInstr = nullptr;
+ MachineInstr *WaitcntVsCntInstr = nullptr;
+ for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
+ &*II != MI; II = NextI, ++NextI) {
+ if (II->isMetaInstruction())
+ continue;
+
+ if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+ // Conservatively update required wait if this waitcnt was added in an
+ // earlier pass. In this case it will not exist in the tracked waitcnt
+ // set.
+ if (!TrackedWaitcntSet.count(&*II)) {
+ unsigned IEnc = II->getOperand(0).getImm();
+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+ Wait = Wait.combined(OldWait);
+ }
+
+ // Merge consecutive waitcnt of the same type by erasing multiples.
+ if (!WaitcntInstr) {
+ WaitcntInstr = &*II;
+ } else {
+ II->eraseFromParent();
+ Modified = true;
+ }
+
+ } else {
+ assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ if (!TrackedWaitcntSet.count(&*II)) {
+ unsigned OldVSCnt =
+ TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
+ Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
+ }
+
+ if (!WaitcntVsCntInstr) {
+ WaitcntVsCntInstr = &*II;
+ } else {
+ II->eraseFromParent();
+ Modified = true;
+ }
+ }
+ }
+
+ // Update the encoding of the merged waitcnt with the required wait.
+ if (WaitcntInstr) {
+ if (Wait.hasWaitExceptVsCnt()) {
+ unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
+ unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
+ if (OldEnc != NewEnc) {
+ WaitcntInstr->getOperand(0).setImm(NewEnc);
+ Modified = true;
+ }
+ ScoreBrackets.applyWaitcnt(Wait);
+ Wait.VmCnt = ~0u;
+ Wait.LgkmCnt = ~0u;
+ Wait.ExpCnt = ~0u;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+ << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
+ << '\n');
+ } else {
+ WaitcntInstr->eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ if (WaitcntVsCntInstr) {
+ if (Wait.hasWaitVsCnt()) {
+ assert(ST->hasVscnt());
+ unsigned OldVSCnt =
+ TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
+ ->getImm();
+ if (Wait.VsCnt != OldVSCnt) {
+ TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
+ ->setImm(Wait.VsCnt);
+ Modified = true;
+ }
+ ScoreBrackets.applyWaitcnt(Wait);
+ Wait.VsCnt = ~0u;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+ << "Old Instr: " << MI
+ << "New Instr: " << *WaitcntVsCntInstr << '\n');
+ } else {
+ WaitcntVsCntInstr->eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ return Modified;
+}
+
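
When applyPreexistingWaitcnt folds an earlier waitcnt into the required wait it takes the stricter count per counter; below is a hedged standalone sketch of that combine step (the struct mimics, but is not, AMDGPU::Waitcnt).

#include <algorithm>
#include <cassert>

// A stand-in for the waitcnt tuple; ~0u means "no wait required" for that
// counter, and combining two requirements keeps the stricter (smaller) count.
struct WaitReq {
  unsigned VmCnt = ~0u, ExpCnt = ~0u, LgkmCnt = ~0u, VsCnt = ~0u;
  WaitReq combined(const WaitReq &O) const {
    return {std::min(VmCnt, O.VmCnt), std::min(ExpCnt, O.ExpCnt),
            std::min(LgkmCnt, O.LgkmCnt), std::min(VsCnt, O.VsCnt)};
  }
};

int main() {
  WaitReq Required{~0u, 0, 2, ~0u};    // needs exp(0) and lgkm(2)
  WaitReq Preexisting{3, ~0u, 1, ~0u}; // an earlier pass already waits vm(3), lgkm(1)
  WaitReq Merged = Required.combined(Preexisting);
  assert(Merged.VmCnt == 3 && Merged.ExpCnt == 0 && Merged.LgkmCnt == 1 &&
         Merged.VsCnt == ~0u);
  return 0;
}
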
static bool readsVCCZ(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
@@ -829,15 +939,17 @@
MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr) {
setForceEmitWaitcnt();
- bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
if (MI.isMetaInstruction())
return false;
AMDGPU::Waitcnt Wait;
+ bool Modified = false;
- // See if this instruction has a forced S_WAITCNT VM.
- // TODO: Handle other cases of NeedsWaitcntVmBefore()
+ // FIXME: This should have already been handled by the memory legalizer.
+ // Removing this currently doesn't affect any lit tests, but we need to
+ // verify that nothing was relying on this. The number of buffer invalidates
+ // being handled here should not be expanded.
if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
@@ -1003,7 +1115,7 @@
RegInterval Interval =
ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
- const bool IsVGPR = TRI->isVGPR(*MRI, Op.getReg());
+ const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (IsVGPR) {
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
@@ -1049,32 +1161,8 @@
}
}
- // Early-out if no wait is indicated.
- if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
- bool Modified = false;
- if (OldWaitcntInstr) {
- for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
- &*II != &MI; II = NextI, ++NextI) {
- if (II->isDebugInstr())
- continue;
-
- if (TrackedWaitcntSet.count(&*II)) {
- TrackedWaitcntSet.erase(&*II);
- II->eraseFromParent();
- Modified = true;
- } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
- int64_t Imm = II->getOperand(0).getImm();
- ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
- } else {
- assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
- assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
- auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
- ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W));
- }
- }
- }
- return Modified;
- }
+ // Verify that the wait is actually needed.
+ ScoreBrackets.simplifyWaitcnt(Wait);
if (ForceEmitZeroWaitcnts)
Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
@@ -1088,57 +1176,19 @@
if (ForceEmitWaitcnt[VS_CNT])
Wait.VsCnt = 0;
- ScoreBrackets.applyWaitcnt(Wait);
-
- AMDGPU::Waitcnt OldWait;
- bool Modified = false;
-
if (OldWaitcntInstr) {
- for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
- &*II != &MI; II = NextI, NextI++) {
- if (II->isDebugInstr())
- continue;
-
- if (II->getOpcode() == AMDGPU::S_WAITCNT) {
- unsigned IEnc = II->getOperand(0).getImm();
- AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
- OldWait = OldWait.combined(IWait);
- if (!TrackedWaitcntSet.count(&*II))
- Wait = Wait.combined(IWait);
- unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
- if (IEnc != NewEnc) {
- II->getOperand(0).setImm(NewEnc);
- Modified = true;
- }
- Wait.VmCnt = ~0u;
- Wait.LgkmCnt = ~0u;
- Wait.ExpCnt = ~0u;
- } else {
- assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
- assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
-
- unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)
- ->getImm();
- OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
- if (!TrackedWaitcntSet.count(&*II))
- Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
- if (Wait.VsCnt != ICnt) {
- TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt);
- Modified = true;
- }
- Wait.VsCnt = ~0u;
- }
-
- LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI
- << "New Instr: " << *II << '\n');
-
- if (!Wait.hasWait())
- return Modified;
- }
+ // Try to merge the required wait with preexisting waitcnt instructions.
+ // Also erase redundant waitcnt.
+ Modified =
+ applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
+ } else {
+ // Update waitcnt brackets after determining the required wait.
+ ScoreBrackets.applyWaitcnt(Wait);
}
- if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
+ // Build new waitcnt instructions unless no wait is needed or the old waitcnt
+ // instruction was modified to handle the required wait.
+ if (Wait.hasWaitExceptVsCnt()) {
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
@@ -1151,7 +1201,7 @@
<< "New Instr: " << *SWaitInst << '\n');
}
- if (Wait.VsCnt != ~0u) {
+ if (Wait.hasWaitVsCnt()) {
assert(ST->hasVscnt());
auto SWaitInst =
@@ -1208,6 +1258,10 @@
if (!TII->usesLGKM_CNT(MI))
return false;
+ // If in tgsplit mode then there can be no use of LDS.
+ if (ST->isTgSplitEnabled())
+ return false;
+
// If there are no memory operands then conservatively assume the flat
// operation may access LDS.
if (MI.memoperands_empty())
@@ -1246,8 +1300,7 @@
++FlatASCount;
if (!ST->hasVscnt())
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
- else if (Inst.mayLoad() &&
- AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
+ else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst))
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
else
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
@@ -1267,16 +1320,10 @@
if (FlatASCount > 1)
ScoreBrackets->setPendingFlat();
} else if (SIInstrInfo::isVMEM(Inst) &&
- // TODO: get a better carve out.
- Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
- Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
- Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
- Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
- Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
+ !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
if (!ST->hasVscnt())
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
- else if ((Inst.mayLoad() &&
- AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
+ else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
/* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
(TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
@@ -1284,7 +1331,7 @@
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
if (ST->vmemWriteNeedsExpWaitcnt() &&
- (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
+ (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
}
} else if (TII->isSMRD(Inst)) {
@@ -1424,7 +1471,8 @@
Iter != E;) {
MachineInstr &Inst = *Iter;
- // Track pre-existing waitcnts from earlier iterations.
+ // Track pre-existing waitcnts that were added in earlier iterations or by
+ // the memory legalizer.
if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
(Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
Inst.getOperand(0).isReg() &&
@@ -1473,8 +1521,12 @@
if (TII->isSMRD(Inst)) {
for (const MachineMemOperand *Memop : Inst.memoperands()) {
- const Value *Ptr = Memop->getValue();
- SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
+ // No need to handle invariant loads when avoiding WAR conflicts, as
+ // there cannot be a vector store to the same memory location.
+ if (!Memop->isInvariant()) {
+ const Value *Ptr = Memop->getValue();
+ SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
+ }
}
if (ST->hasReadVCCZBug()) {
// This smem read could complete and clobber vccz at any time.
@@ -1550,6 +1602,28 @@
TrackedWaitcntSet.clear();
BlockInfos.clear();
+ bool Modified = false;
+
+ if (!MFI->isEntryFunction()) {
+ // Wait for any outstanding memory operations that the input registers may
+ // depend on. We can't track them and it's better to do the wait after the
+ // costly call sequence.
+
+ // TODO: Could insert earlier and schedule more liberally with operations
+ // that only use caller preserved registers.
+ MachineBasicBlock &EntryBB = MF.front();
+ MachineBasicBlock::iterator I = EntryBB.begin();
+ for (MachineBasicBlock::iterator E = EntryBB.end();
+ I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
+ ;
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+ if (ST->hasVscnt())
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+
+ Modified = true;
+ }
// Keep iterating over the blocks in reverse post order, inserting and
// updating s_waitcnt where needed, until a fix point is reached.
@@ -1557,7 +1631,6 @@
BlockInfos.insert({MBB, BlockInfo(MBB)});
std::unique_ptr<WaitcntBrackets> Brackets;
- bool Modified = false;
bool Repeat;
do {
Repeat = false;
@@ -1657,26 +1730,5 @@
}
}
- if (!MFI->isEntryFunction()) {
- // Wait for any outstanding memory operations that the input registers may
- // depend on. We can't track them and it's better to the wait after the
- // costly call sequence.
-
- // TODO: Could insert earlier and schedule more liberally with operations
- // that only use caller preserved registers.
- MachineBasicBlock &EntryBB = MF.front();
- MachineBasicBlock::iterator I = EntryBB.begin();
- for (MachineBasicBlock::iterator E = EntryBB.end();
- I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
- ;
- BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
- if (ST->hasVscnt())
- BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(0);
-
- Modified = true;
- }
-
return Modified;
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 7ce042b..e39f528 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -113,7 +113,7 @@
// This field indicates that FLAT instruction accesses FLAT_GLBL segment.
// Must be 0 for non-FLAT instructions.
- field bit IsFlatGlobal = 0;
+ field bit FlatGlobal = 0;
// Reads the mode register, usually for FP environment.
field bit ReadsModeReg = 0;
@@ -133,7 +133,13 @@
// This field indicates that FLAT instruction accesses FLAT_SCRATCH segment.
// Must be 0 for non-FLAT instructions.
- field bit IsFlatScratch = 0;
+ field bit FlatScratch = 0;
+
+ // Atomic without a return.
+ field bit IsAtomicNoRet = 0;
+
+ // Atomic with return.
+ field bit IsAtomicRet = 0;
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
@@ -193,7 +199,7 @@
let TSFlags{50} = D16Buf;
- let TSFlags{51} = IsFlatGlobal;
+ let TSFlags{51} = FlatGlobal;
let TSFlags{52} = FPDPRounding;
@@ -203,7 +209,11 @@
let TSFlags{55} = IsDOT;
- let TSFlags{56} = IsFlatScratch;
+ let TSFlags{56} = FlatScratch;
+
+ let TSFlags{57} = IsAtomicNoRet;
+
+ let TSFlags{58} = IsAtomicRet;
let SchedRW = [Write32Bit];
@@ -251,6 +261,13 @@
int Size = 8;
}
+def CPolBit {
+ int GLC = 0;
+ int SLC = 1;
+ int DLC = 2;
+ int SCC = 4;
+}
+
class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
class VINTRPe <bits<2> op> : Enc32 {
@@ -268,27 +285,25 @@
}
class MIMGe : Enc64 {
- bits<8> vdata;
+ bits<10> vdata;
bits<4> dmask;
bits<1> unorm;
- bits<1> glc;
+ bits<5> cpol;
bits<1> r128;
bits<1> tfe;
bits<1> lwe;
- bits<1> slc;
bit d16;
bits<7> srsrc;
bits<7> ssamp;
let Inst{11-8} = dmask;
let Inst{12} = unorm;
- let Inst{13} = glc;
+ let Inst{13} = cpol{CPolBit.GLC};
let Inst{15} = r128;
- let Inst{16} = tfe;
let Inst{17} = lwe;
- let Inst{25} = slc;
+ let Inst{25} = cpol{CPolBit.SLC};
let Inst{31-26} = 0x3c;
- let Inst{47-40} = vdata;
+ let Inst{47-40} = vdata{7-0};
let Inst{52-48} = srsrc{6-2};
let Inst{57-53} = ssamp{6-2};
let Inst{63} = d16;
@@ -299,7 +314,21 @@
bits<1> da;
let Inst{0} = op{7};
+ let Inst{7} = cpol{CPolBit.SCC};
let Inst{14} = da;
+ let Inst{16} = tfe;
+ let Inst{24-18} = op{6-0};
+ let Inst{39-32} = vaddr;
+}
+
+class MIMGe_gfx90a <bits<8> op> : MIMGe {
+ bits<8> vaddr;
+ bits<1> da;
+
+ let Inst{0} = op{7};
+ let Inst{7} = cpol{CPolBit.SCC};
+ let Inst{14} = da;
+ let Inst{16} = vdata{9}; // ACC bit
let Inst{24-18} = op{6-0};
let Inst{39-32} = vaddr;
}
@@ -308,13 +337,13 @@
bits<8> vaddr0;
bits<3> dim;
bits<2> nsa;
- bits<1> dlc;
bits<1> a16;
let Inst{0} = op{7};
let Inst{2-1} = nsa;
let Inst{5-3} = dim;
- let Inst{7} = dlc;
+ let Inst{7} = cpol{CPolBit.DLC};
+ let Inst{16} = tfe;
let Inst{24-18} = op{6-0};
let Inst{39-32} = vaddr0;
let Inst{62} = a16;
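
The TableGen change above folds the old glc/slc/dlc/scc operands into a single cpol immediate whose bits are indexed by CPolBit; a standalone sketch (the mask values are assumptions derived from those bit indices, not the LLVM header) shows how a policy bit can then be tested or cleared, e.g. dropping GLC for the no-return atomic rewrite earlier in this patch.

#include <cassert>
#include <cstdint>

// Assumed masks derived from CPolBit (GLC=bit 0, SLC=bit 1, DLC=bit 2, SCC=bit 4).
namespace cpol {
constexpr uint32_t GLC = 1u << 0;
constexpr uint32_t SLC = 1u << 1;
constexpr uint32_t DLC = 1u << 2;
constexpr uint32_t SCC = 1u << 4;
} // namespace cpol

// Rewrite a returning atomic's cache policy for the no-return form by
// clearing GLC, mirroring the cpol handling earlier in this patch.
uint32_t makeNoRetPolicy(uint32_t Policy) { return Policy & ~cpol::GLC; }

int main() {
  uint32_t Policy = cpol::GLC | cpol::SLC;
  assert(makeNoRetPolicy(Policy) == cpol::SLC);
  assert((Policy & cpol::DLC) == 0);
  return 0;
}
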
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index dfd0075..7ab0f7a 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
@@ -107,20 +108,26 @@
bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
AAResults *AA) const {
- // TODO: The generic check fails for VALU instructions that should be
- // rematerializable due to implicit reads of exec. We really want all of the
- // generic logic for this except for this.
- switch (MI.getOpcode()) {
- case AMDGPU::V_MOV_B32_e32:
- case AMDGPU::V_MOV_B32_e64:
- case AMDGPU::V_MOV_B64_PSEUDO:
- case AMDGPU::V_ACCVGPR_READ_B32_e64:
- case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
- // No implicit operands.
- return MI.getNumOperands() == MI.getDesc().getNumOperands();
- default:
- return false;
+ if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI)) {
+ // Normally VALU use of exec would block the rematerialization, but that
+ // is OK in this case to have an implicit exec read as all VALU do.
+ // We really want all of the generic logic for this except for this.
+
+ // Another potential implicit use is mode register. The core logic of
+ // the RA will not attempt rematerialization if mode is set anywhere
+ // in the function, otherwise it is safe since mode is not changed.
+ return !MI.hasImplicitDef() &&
+ MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses() &&
+ !MI.mayRaiseFPException();
}
+
+ return false;
+}
+
+bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
+ // Any implicit use of exec by VALU is not a real register read.
+ return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
+ isVALU(*MO.getParent());
}
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
@@ -313,39 +320,22 @@
}
if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
- const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
- if (SOffset && SOffset->isReg()) {
- // We can only handle this if it's a stack access, as any other resource
- // would require reporting multiple base registers.
- const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
- if (AddrReg && !AddrReg->isFI())
- return false;
-
- const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
- const SIMachineFunctionInfo *MFI
- = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
- if (RSrc->getReg() != MFI->getScratchRSrcReg())
- return false;
-
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- BaseOps.push_back(RSrc);
- BaseOps.push_back(SOffset);
- Offset = OffsetImm->getImm();
- } else {
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
- if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL
- return false;
+ const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
+ if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
+ return false;
+ BaseOps.push_back(RSrc);
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ if (BaseOp && !BaseOp->isFI())
BaseOps.push_back(BaseOp);
-
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
- if (BaseOp)
- BaseOps.push_back(BaseOp);
-
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- Offset = OffsetImm->getImm();
- if (SOffset) // soffset can be an inline immediate.
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ Offset = OffsetImm->getImm();
+ const MachineOperand *SOffset =
+ getNamedOperand(LdSt, AMDGPU::OpName::soffset);
+ if (SOffset) {
+ if (SOffset->isReg())
+ BaseOps.push_back(SOffset);
+ else
Offset += SOffset->getImm();
}
// Get appropriate operand, and compute width accordingly.
@@ -576,15 +566,18 @@
if (!Tmp)
report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
RS.setRegUsed(Tmp);
- // Only loop through if there are any free registers left, otherwise
- // scavenger may report a fatal error without emergency spill slot
- // or spill with the slot.
- while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
- Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
- if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
- break;
- Tmp = Tmp2;
- RS.setRegUsed(Tmp);
+
+ if (!TII.getSubtarget().hasGFX90AInsts()) {
+ // Only loop through if there are any free registers left; otherwise the
+ // scavenger may report a fatal error when there is no emergency spill
+ // slot, or it may spill using that slot.
+ while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
+ Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
+ break;
+ Tmp = Tmp2;
+ RS.setRegUsed(Tmp);
+ }
}
// Insert copy to temporary VGPR.
@@ -782,7 +775,6 @@
return;
}
-
if (RC == &AMDGPU::AGPR_32RegClass) {
if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
@@ -790,6 +782,12 @@
return;
}
+ if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
// FIXME: Pass should maintain scavenger to avoid scan through the block on
// every AGPR spill.
RegScavenger RS;
@@ -797,7 +795,8 @@
return;
}
- if (RI.getRegSizeInBits(*RC) == 16) {
+ const unsigned Size = RI.getRegSizeInBits(*RC);
+ if (Size == 16) {
assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
@@ -863,9 +862,27 @@
return;
}
+ const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg);
+ if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
+ if (ST.hasPackedFP32Ops()) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addReg(SrcReg)
+ .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
+ .addReg(SrcReg)
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0) // clamp
+ .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
+ return;
+ }
+ }
+
const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
if (RI.isSGPRClass(RC)) {
- if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
+ if (!RI.isSGPRClass(SrcRC)) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
@@ -873,12 +890,21 @@
return;
}
+ unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.hasAGPRs(RC)) {
- Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ?
+ Opcode = (RI.hasVGPRs(SrcRC)) ?
AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
- } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
+ } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) {
Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
+ } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
+ (RI.isProperlyAlignedRC(*RC) &&
+ (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
+ // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
+ if (ST.hasPackedFP32Ops()) {
+ Opcode = AMDGPU::V_PK_MOV_B32;
+ EltSize = 8;
+ }
}
// For the cases where we need an intermediate instruction/temporary register
@@ -890,7 +916,7 @@
if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
RS.reset(new RegScavenger());
- ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, 4);
+ ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
// If there is an overlap, we can't kill the super-register on the last
// instruction, since it will also kill the components made live by this def.
@@ -911,6 +937,23 @@
indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
RI.getSubReg(SrcReg, SubIdx), UseKill, *RS,
ImpDefSuper, ImpUseSuper);
+ } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
+ Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
+ Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addReg(SrcSubReg)
+ .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
+ .addReg(SrcSubReg)
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0) // clamp
+ .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
+ if (Idx == 0)
+ MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
} else {
MachineInstrBuilder Builder =
BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
@@ -969,7 +1012,7 @@
.addImm(Value);
return;
}
- if (RegClass == &AMDGPU::VReg_64RegClass) {
+ if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
.addImm(Value);
return;
@@ -1301,6 +1344,8 @@
return AMDGPU::SI_SPILL_S160_SAVE;
case 24:
return AMDGPU::SI_SPILL_S192_SAVE;
+ case 28:
+ return AMDGPU::SI_SPILL_S224_SAVE;
case 32:
return AMDGPU::SI_SPILL_S256_SAVE;
case 64:
@@ -1326,6 +1371,8 @@
return AMDGPU::SI_SPILL_V160_SAVE;
case 24:
return AMDGPU::SI_SPILL_V192_SAVE;
+ case 28:
+ return AMDGPU::SI_SPILL_V224_SAVE;
case 32:
return AMDGPU::SI_SPILL_V256_SAVE;
case 64:
@@ -1351,6 +1398,8 @@
return AMDGPU::SI_SPILL_A160_SAVE;
case 24:
return AMDGPU::SI_SPILL_A192_SAVE;
+ case 28:
+ return AMDGPU::SI_SPILL_A224_SAVE;
case 32:
return AMDGPU::SI_SPILL_A256_SAVE;
case 64:
@@ -1434,6 +1483,8 @@
return AMDGPU::SI_SPILL_S160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_S192_RESTORE;
+ case 28:
+ return AMDGPU::SI_SPILL_S224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_S256_RESTORE;
case 64:
@@ -1459,6 +1510,8 @@
return AMDGPU::SI_SPILL_V160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_V192_RESTORE;
+ case 28:
+ return AMDGPU::SI_SPILL_V224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_V256_RESTORE;
case 64:
@@ -1484,6 +1537,8 @@
return AMDGPU::SI_SPILL_A160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_A192_RESTORE;
+ case 28:
+ return AMDGPU::SI_SPILL_A224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_A256_RESTORE;
case 64:
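[Editor's note, illustrative only, not part of the patch] The switch keys in the spill-opcode helpers above are spill sizes in bytes, while the opcode names encode the same width in bits, so the new case 28 entries select the 224-bit (seven 32-bit registers) spill pseudos. A minimal C++ sketch of that relationship; the helper name is invented for the example:

// Hypothetical helper: spill size in bytes -> register width in bits.
constexpr unsigned spillWidthInBits(unsigned Bytes) { return Bytes * 8; }

// 28-byte spills correspond to the new *224_SAVE / *224_RESTORE pseudos.
static_assert(spillWidthInBits(28) == 224, "28 bytes is a 224-bit spill");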
@@ -1590,6 +1645,7 @@
}
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
switch (MI.getOpcode()) {
@@ -1640,6 +1696,18 @@
MI.setDesc(get(AMDGPU::S_ANDN2_B32));
break;
+ case AMDGPU::S_AND_B64_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_AND_B64));
+ break;
+
+ case AMDGPU::S_AND_B32_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_AND_B32));
+ break;
+
case AMDGPU::V_MOV_B64_PSEUDO: {
Register Dst = MI.getOperand(0).getReg();
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -1650,20 +1718,49 @@
assert(!SrcOp.isFPImm());
if (SrcOp.isImm()) {
APInt Imm(64, SrcOp.getImm());
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
- .addImm(Imm.getLoBits(32).getZExtValue())
- .addReg(Dst, RegState::Implicit | RegState::Define);
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
- .addImm(Imm.getHiBits(32).getZExtValue())
- .addReg(Dst, RegState::Implicit | RegState::Define);
+ APInt Lo(32, Imm.getLoBits(32).getZExtValue());
+ APInt Hi(32, Imm.getHiBits(32).getZExtValue());
+ if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addImm(Lo.getSExtValue())
+ .addImm(SISrcMods::OP_SEL_1)
+ .addImm(Lo.getSExtValue())
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0); // clamp
+ } else {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+ .addImm(Lo.getSExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+ .addImm(Hi.getSExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ }
} else {
assert(SrcOp.isReg());
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
- .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
- .addReg(Dst, RegState::Implicit | RegState::Define);
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
- .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
- .addReg(Dst, RegState::Implicit | RegState::Define);
+ if (ST.hasPackedFP32Ops() &&
+ !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
+ .addImm(SISrcMods::OP_SEL_1) // src0_mod
+ .addReg(SrcOp.getReg())
+ .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
+ .addReg(SrcOp.getReg())
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0); // clamp
+ } else {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ }
}
MI.eraseFromParent();
break;
@@ -1672,11 +1769,35 @@
expandMovDPP64(MI);
break;
}
+ case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
+ const MachineOperand &SrcOp = MI.getOperand(1);
+ assert(!SrcOp.isFPImm());
+ APInt Imm(64, SrcOp.getImm());
+ if (Imm.isIntN(32) || isInlineConstant(Imm)) {
+ MI.setDesc(get(AMDGPU::S_MOV_B64));
+ break;
+ }
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+ Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+
+ APInt Lo(32, Imm.getLoBits(32).getZExtValue());
+ APInt Hi(32, Imm.getHiBits(32).getZExtValue());
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
+ .addImm(Lo.getSExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
+ .addImm(Hi.getSExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ MI.eraseFromParent();
+ break;
+ }
case AMDGPU::V_SET_INACTIVE_B32: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
+ auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
+ FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
.add(MI.getOperand(2));
BuildMI(MBB, MI, DL, get(NotOpc), Exec)
@@ -1687,8 +1808,8 @@
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
+ auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
+ FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
MI.getOperand(0).getReg())
.add(MI.getOperand(2));
@@ -1848,16 +1969,29 @@
MI.eraseFromParent();
break;
}
- case AMDGPU::ENTER_WWM: {
+ case AMDGPU::ENTER_STRICT_WWM: {
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
- // WWM is entered.
+ // Whole Wave Mode is entered.
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64));
break;
}
- case AMDGPU::EXIT_WWM: {
+ case AMDGPU::ENTER_STRICT_WQM: {
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
- // WWM is exited.
+ // STRICT_WQM is entered.
+ const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
+ const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
+ BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
+
+ MI.eraseFromParent();
+ break;
+ }
+ case AMDGPU::EXIT_STRICT_WWM:
+ case AMDGPU::EXIT_STRICT_WQM: {
+ // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
+ // WWM/STRICT_WQM is exited.
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
@@ -1877,7 +2011,6 @@
unsigned Part = 0;
MachineInstr *Split[2];
-
for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
if (Dst.isPhysical()) {
@@ -2098,32 +2231,36 @@
// s_getpc_b64. Insert pc arithmetic code before last terminator.
MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
- // TODO: Handle > 32-bit block address.
- if (BrOffset >= 0) {
- BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
+ auto &MCCtx = MF->getContext();
+ MCSymbol *PostGetPCLabel =
+ MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
+ GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
+
+ MCSymbol *OffsetLo =
+ MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
+ MCSymbol *OffsetHi =
+ MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub0)
.addReg(PCReg, 0, AMDGPU::sub0)
- .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD);
- BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
+ .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub1)
.addReg(PCReg, 0, AMDGPU::sub1)
- .addImm(0);
- } else {
- // Backwards branch.
- BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
- .addReg(PCReg, RegState::Define, AMDGPU::sub0)
- .addReg(PCReg, 0, AMDGPU::sub0)
- .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD);
- BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
- .addReg(PCReg, RegState::Define, AMDGPU::sub1)
- .addReg(PCReg, 0, AMDGPU::sub1)
- .addImm(0);
- }
+ .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
// Insert the indirect branch after the other terminator.
BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
.addReg(PCReg);
+ auto ComputeBlockSize = [](const TargetInstrInfo *TII,
+ const MachineBasicBlock &MBB) {
+ unsigned Size = 0;
+ for (const MachineInstr &MI : MBB)
+ Size += TII->getInstSizeInBytes(MI);
+ return Size;
+ };
+
// FIXME: If spilling is necessary, this will fail because this scavenger has
// no emergency stack slots. It is non-trivial to spill in this situation,
// because the restore code needs to be specially placed after the
@@ -2168,7 +2305,16 @@
MRI.clearVirtRegs();
RS->setRegUsed(Scav);
- return 4 + 8 + 4 + 4;
+ // Now the branch distance can be defined.
+ auto *Offset = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
+ MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
+ // Add offset assignments.
+ auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
+ OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
+ auto *ShAmt = MCConstantExpr::create(32, MCCtx);
+ OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
+ return ComputeBlockSize(this, MBB);
}
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
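[Editor's note, illustrative only, not part of the patch] The hunk above drops the hard-coded size and the separate forward/backward paths: the branch offset is now expressed as MC symbol arithmetic, offset_lo = (DestBB - post_getpc) & 0xffffffff and offset_hi = (DestBB - post_getpc) >> 32 (arithmetic shift), which S_ADD_U32/S_ADDC_U32 then add to the PC pair. A minimal sketch of that arithmetic; the function and variable names are invented, and a signed right shift is assumed to be arithmetic, matching the MC AShr expression:

#include <cassert>
#include <cstdint>

// Split a 64-bit PC-relative offset into the two halves consumed by
// S_ADD_U32 (low, with carry-out) and S_ADDC_U32 (high, with carry-in).
static void splitFarBranchOffset(int64_t Dest, int64_t PostGetPC) {
  int64_t Offset = Dest - PostGetPC;                                  // may be negative
  uint64_t Lo = static_cast<uint64_t>(Offset) & 0xffffffffULL;        // offset_lo
  uint64_t Hi = static_cast<uint64_t>(Offset >> 32) & 0xffffffffULL;  // offset_hi
  // Adding the halves back onto the PC reproduces the destination address.
  uint64_t Sum = static_cast<uint64_t>(PostGetPC) + (Hi << 32) + Lo;
  assert(Sum == static_cast<uint64_t>(Dest));
  (void)Sum;
}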
@@ -2263,18 +2409,18 @@
// Skip over the instructions that are artificially terminators for special
// exec management.
- while (I != E && !I->isBranch() && !I->isReturn() &&
- I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
+ while (I != E && !I->isBranch() && !I->isReturn()) {
switch (I->getOpcode()) {
- case AMDGPU::SI_MASK_BRANCH:
case AMDGPU::S_MOV_B64_term:
case AMDGPU::S_XOR_B64_term:
case AMDGPU::S_OR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
+ case AMDGPU::S_AND_B64_term:
case AMDGPU::S_MOV_B32_term:
case AMDGPU::S_XOR_B32_term:
case AMDGPU::S_OR_B32_term:
case AMDGPU::S_ANDN2_B32_term:
+ case AMDGPU::S_AND_B32_term:
break;
case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:
@@ -2292,34 +2438,7 @@
if (I == E)
return false;
- if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
- return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
-
- ++I;
-
- // TODO: Should be able to treat as fallthrough?
- if (I == MBB.end())
- return true;
-
- if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
- return true;
-
- MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
-
- // Specifically handle the case where the conditional branch is to the same
- // destination as the mask branch. e.g.
- //
- // si_mask_branch BB8
- // s_cbranch_execz BB8
- // s_cbranch BB9
- //
- // This is required to understand divergent loops which may need the branches
- // to be relaxed.
- if (TBB != MaskBrDest || Cond.empty())
- return true;
-
- auto Pred = Cond[0].getImm();
- return (Pred != EXECZ && Pred != EXECNZ);
+ return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
}
unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
@@ -2330,11 +2449,6 @@
unsigned RemovedSize = 0;
while (I != MBB.end()) {
MachineBasicBlock::iterator Next = std::next(I);
- if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
- I = Next;
- continue;
- }
-
RemovedSize += getInstSizeInBytes(*I);
I->eraseFromParent();
++Count;
@@ -2400,6 +2514,7 @@
MachineInstr *CondBr =
BuildMI(&MBB, DL, get(Opcode))
.addMBB(TBB);
+ fixImplicitOperands(*CondBr);
BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
.addMBB(FBB);
@@ -2593,6 +2708,7 @@
case AMDGPU::COPY:
case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
case AMDGPU::V_ACCVGPR_READ_B32_e64:
+ case AMDGPU::V_ACCVGPR_MOV_B32:
return true;
default:
return false;
@@ -2983,7 +3099,9 @@
unsigned Opc = MI.getOpcode();
bool IsF16 = false;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
- Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
+ Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
+ bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
switch (Opc) {
default:
@@ -2994,13 +3112,15 @@
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_FMAC_F32_e64:
+ case AMDGPU::V_FMAC_F64_e64:
break;
case AMDGPU::V_MAC_F16_e32:
case AMDGPU::V_FMAC_F16_e32:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e32:
- case AMDGPU::V_FMAC_F32_e32: {
+ case AMDGPU::V_FMAC_F32_e32:
+ case AMDGPU::V_FMAC_F64_e32: {
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::src0);
const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
@@ -3026,7 +3146,7 @@
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
MachineInstrBuilder MIB;
- if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
+ if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 &&
// If we have an SGPR input, we will violate the constant bus restriction.
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
@@ -3074,7 +3194,9 @@
}
}
- unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 : AMDGPU::V_FMA_F32_e64)
+ unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64
+ : IsF64 ? AMDGPU::V_FMA_F64_e64
+ : AMDGPU::V_FMA_F32_e64)
: (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64);
if (pseudoToMCOpcode(NewOpc) == -1)
return nullptr;
@@ -3262,6 +3384,10 @@
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
int32_t Trunc = static_cast<int32_t>(Imm);
@@ -3271,6 +3397,7 @@
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return AMDGPU::isInlinableLiteral64(MO.getImm(),
ST.hasInv2PiInlineImm());
case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -3382,6 +3509,10 @@
}
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
+ // GFX90A does not have V_MUL_LEGACY_F32_e32.
+ if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
+ return false;
+
int Op32 = AMDGPU::getVOPe32(Opcode);
if (Op32 == -1)
return false;
@@ -3439,6 +3570,7 @@
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F32_e64:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F64_e64:
if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
return false;
@@ -3663,7 +3795,8 @@
// Make sure the register classes are correct.
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
- if (MI.getOperand(i).isFPImm()) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isFPImm()) {
ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
"all fp values to integers.";
return false;
@@ -3690,8 +3823,8 @@
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
- const MachineOperand &MO = MI.getOperand(i);
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
ErrInfo = "Illegal immediate value for operand.";
return false;
@@ -3712,12 +3845,37 @@
continue;
}
- if (!MI.getOperand(i).isReg())
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
continue;
+ // FIXME: Ideally we would have separate instruction definitions with the
+ // aligned register constraint.
+ // FIXME: We do not verify inline asm operands, but custom inline asm
+ // verification is broken anyway
+ if (ST.needsAlignedVGPRs()) {
+ const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
+ const bool IsVGPR = RI.hasVGPRs(RC);
+ const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC);
+ if ((IsVGPR || IsAGPR) && MO.getSubReg()) {
+ const TargetRegisterClass *SubRC =
+ RI.getSubRegClass(RC, MO.getSubReg());
+ RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
+ if (RC)
+ RC = SubRC;
+ }
+
+ // Check that this is the aligned version of the class.
+ if (!RC || !RI.isProperlyAlignedRC(*RC)) {
+ ErrInfo = "Subtarget requires even aligned vector registers";
+ return false;
+ }
+ }
+
if (RegClass != -1) {
- Register Reg = MI.getOperand(i).getReg();
- if (Reg == AMDGPU::NoRegister || Reg.isVirtual())
+ if (Reg.isVirtual())
continue;
const TargetRegisterClass *RC = RI.getRegClass(RegClass);
@@ -3864,7 +4022,8 @@
const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
unsigned ConstantBusCount = 0;
- unsigned LiteralCount = 0;
+ bool UsesLiteral = false;
+ const MachineOperand *LiteralVal = nullptr;
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
++ConstantBusCount;
@@ -3886,8 +4045,15 @@
SGPRsUsed.push_back(SGPRUsed);
}
} else {
- ++ConstantBusCount;
- ++LiteralCount;
+ if (!UsesLiteral) {
+ ++ConstantBusCount;
+ UsesLiteral = true;
+ LiteralVal = &MO;
+ } else if (!MO.isIdenticalTo(*LiteralVal)) {
+ assert(isVOP3(MI));
+ ErrInfo = "VOP3 instruction uses more than one literal";
+ return false;
+ }
}
}
}
@@ -3911,15 +4077,9 @@
return false;
}
- if (isVOP3(MI) && LiteralCount) {
- if (!ST.hasVOP3Literal()) {
- ErrInfo = "VOP3 instruction uses literal";
- return false;
- }
- if (LiteralCount > 1) {
- ErrInfo = "VOP3 instruction uses more than one literal";
- return false;
- }
+ if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
+ ErrInfo = "VOP3 instruction uses literal";
+ return false;
}
}
@@ -4113,25 +4273,10 @@
IsA16 = A16->getImm() != 0;
}
- bool PackDerivatives = IsA16 || BaseOpcode->G16;
bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
- unsigned AddrWords = BaseOpcode->NumExtraArgs;
- unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
- (BaseOpcode->LodOrClampOrMip ? 1 : 0);
- if (IsA16)
- AddrWords += (AddrComponents + 1) / 2;
- else
- AddrWords += AddrComponents;
-
- if (BaseOpcode->Gradients) {
- if (PackDerivatives)
- // There are two gradients per coordinate, we pack them separately.
- // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv)
- AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2;
- else
- AddrWords += Dim->NumGradients;
- }
+ unsigned AddrWords =
+ AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
unsigned VAddrWords;
if (IsNSA) {
@@ -4141,12 +4286,6 @@
VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
if (AddrWords > 8)
AddrWords = 16;
- else if (AddrWords > 4)
- AddrWords = 8;
- else if (AddrWords == 4)
- AddrWords = 4;
- else if (AddrWords == 3)
- AddrWords = 3;
}
if (VAddrWords != AddrWords) {
@@ -4187,8 +4326,89 @@
}
if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
ST.getGeneration() < AMDGPUSubtarget::GFX10) {
+ if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
+ DC <= DppCtrl::ROW_NEWBCAST_LAST &&
+ !ST.hasGFX90AInsts()) {
+ ErrInfo = "Invalid dpp_ctrl value: "
+ "row_newbroadcast/row_share is not supported before "
+ "GFX90A/GFX10";
+ return false;
+ } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
+ ErrInfo = "Invalid dpp_ctrl value: "
+ "row_share and row_xmask are not supported before GFX10";
+ return false;
+ }
+ }
+
+ int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+
+ if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
+ ((DstIdx >= 0 &&
+ (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID ||
+ Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) ||
+ ((Src0Idx >= 0 &&
+ (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID ||
+ Desc.OpInfo[Src0Idx].RegClass ==
+ AMDGPU::VReg_64_Align2RegClassID)))) &&
+ !AMDGPU::isLegal64BitDPPControl(DC)) {
ErrInfo = "Invalid dpp_ctrl value: "
- "row_share and row_xmask are not supported before GFX10";
+ "64 bit dpp only support row_newbcast";
+ return false;
+ }
+ }
+
+ if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
+ const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
+ uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
+ : AMDGPU::OpName::vdata;
+ const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
+ const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
+ if (Data && !Data->isReg())
+ Data = nullptr;
+
+ if (ST.hasGFX90AInsts()) {
+ if (Dst && Data &&
+ (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
+ ErrInfo = "Invalid register class: "
+ "vdata and vdst should be both VGPR or AGPR";
+ return false;
+ }
+ if (Data && Data2 &&
+ (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
+ ErrInfo = "Invalid register class: "
+ "both data operands should be VGPR or AGPR";
+ return false;
+ }
+ } else {
+ if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
+ (Data && RI.isAGPR(MRI, Data->getReg())) ||
+ (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
+ ErrInfo = "Invalid register class: "
+ "agpr loads and stores not supported on this GPU";
+ return false;
+ }
+ }
+ }
+
+ if (ST.needsAlignedVGPRs() &&
+ (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
+ MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
+ MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) {
+ const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0);
+ Register Reg = Op->getReg();
+ bool Aligned = true;
+ if (Reg.isPhysical()) {
+ Aligned = !(RI.getHWRegIndex(Reg) & 1);
+ } else {
+ const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
+ Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
+ !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
+ }
+
+ if (!Aligned) {
+ ErrInfo = "Subtarget requires even aligned vector registers "
+ "for DS_GWS instructions";
return false;
}
}
@@ -4205,7 +4425,8 @@
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
case AMDGPU::WQM: return AMDGPU::WQM;
case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
- case AMDGPU::WWM: return AMDGPU::WWM;
+ case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
+ case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
case AMDGPU::S_MOV_B32: {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
return MI.getOperand(1).isReg() ||
@@ -4276,6 +4497,59 @@
"Unexpected scalar opcode without corresponding vector one!");
}
+static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST,
+ const MachineRegisterInfo &MRI,
+ const MCInstrDesc &TID,
+ unsigned RCID,
+ bool IsAllocatable) {
+ if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
+ (TID.mayLoad() || TID.mayStore() ||
+ (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
+ switch (RCID) {
+ case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID;
+ case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID;
+ case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID;
+ case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID;
+ case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID;
+ default:
+ break;
+ }
+ }
+ return RCID;
+}
+
+const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
+ unsigned OpNum, const TargetRegisterInfo *TRI,
+ const MachineFunction &MF)
+ const {
+ if (OpNum >= TID.getNumOperands())
+ return nullptr;
+ auto RegClass = TID.OpInfo[OpNum].RegClass;
+ bool IsAllocatable = false;
+ if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
+ // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
+ // with two data operands. Request a register class constrained to VGPR only
+ // if both operands are present, as Machine Copy Propagation cannot check this
+ // constraint and possibly other passes too.
+ //
+ // The check is limited to FLAT and DS because atomics in non-flat encoding
+ // have their vdst and vdata tied to be the same register.
+ const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
+ AMDGPU::OpName::vdst);
+ const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
+ (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
+ : AMDGPU::OpName::vdata);
+ if (DataIdx != -1) {
+ IsAllocatable = VDstIdx != -1 ||
+ AMDGPU::getNamedOperandIdx(TID.Opcode,
+ AMDGPU::OpName::data1) != -1;
+ }
+ }
+ RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass,
+ IsAllocatable);
+ return RI.getRegClass(RegClass);
+}
+
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
unsigned OpNo) const {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
@@ -4290,6 +4564,7 @@
}
unsigned RCID = Desc.OpInfo[OpNo].RegClass;
+ RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true);
return RI.getRegClass(RCID);
}
@@ -4308,8 +4583,9 @@
Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
- if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
- VRC = &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *VRC64 = RI.getVGPR64Class();
+ if (RI.getCommonSubClass(VRC64, VRC))
+ VRC = VRC64;
else
VRC = &AMDGPU::VGPR_32RegClass;
@@ -4466,7 +4742,40 @@
if (MO->isReg()) {
assert(DefinedRC);
- return isLegalRegOperand(MRI, OpInfo, *MO);
+ if (!isLegalRegOperand(MRI, OpInfo, *MO))
+ return false;
+ bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
+ if (IsAGPR && !ST.hasMAIInsts())
+ return false;
+ unsigned Opc = MI.getOpcode();
+ if (IsAGPR &&
+ (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
+ (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
+ return false;
+ // Atomics should have both vdst and vdata either vgpr or agpr.
+ const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
+ isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
+ if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
+ MI.getOperand(DataIdx).isReg() &&
+ RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
+ return false;
+ if ((int)OpIdx == DataIdx) {
+ if (VDstIdx != -1 &&
+ RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
+ return false;
+ // DS instructions with 2 src operands also must have tied RC.
+ const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
+ AMDGPU::OpName::data1);
+ if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
+ RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
+ return false;
+ }
+ if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
+ RI.isSGPRReg(MRI, MO->getReg()))
+ return false;
+ return true;
}
// Handle non-register types that are treated like immediates.
@@ -4740,6 +5049,86 @@
}
}
+bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
+ unsigned Opc = Inst.getOpcode();
+ int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
+ if (OldSAddrIdx < 0)
+ return false;
+
+ assert(isSegmentSpecificFLAT(Inst));
+
+ int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
+ if (NewOpc < 0)
+ NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
+ if (NewOpc < 0)
+ return false;
+
+ MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
+ MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
+ if (RI.isSGPRReg(MRI, SAddr.getReg()))
+ return false;
+
+ int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
+ if (NewVAddrIdx < 0)
+ return false;
+
+ int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
+
+ // Check vaddr; it must be zero or absent.
+ MachineInstr *VAddrDef = nullptr;
+ if (OldVAddrIdx >= 0) {
+ MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
+ VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
+ if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
+ !VAddrDef->getOperand(1).isImm() ||
+ VAddrDef->getOperand(1).getImm() != 0)
+ return false;
+ }
+
+ const MCInstrDesc &NewDesc = get(NewOpc);
+ Inst.setDesc(NewDesc);
+
+ // Callers expect the iterator to be valid after this call, so modify the
+ // instruction in place.
+ if (OldVAddrIdx == NewVAddrIdx) {
+ MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
+ // Clear use list from the old vaddr holding a zero register.
+ MRI.removeRegOperandFromUseList(&NewVAddr);
+ MRI.moveOperands(&NewVAddr, &SAddr, 1);
+ Inst.RemoveOperand(OldSAddrIdx);
+ // Update the use list with the pointer we have just moved from vaddr to
+ // saddr position. Otherwise the new vaddr will be missing from the use list.
+ MRI.removeRegOperandFromUseList(&NewVAddr);
+ MRI.addRegOperandToUseList(&NewVAddr);
+ } else {
+ assert(OldSAddrIdx == NewVAddrIdx);
+
+ if (OldVAddrIdx >= 0) {
+ int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
+ AMDGPU::OpName::vdst_in);
+
+ // RemoveOperand doesn't try to fix up tied operand indexes as it goes, so
+ // it asserts. Untie the operands for now and retie them afterwards.
+ if (NewVDstIn != -1) {
+ int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
+ Inst.untieRegOperand(OldVDstIn);
+ }
+
+ Inst.RemoveOperand(OldVAddrIdx);
+
+ if (NewVDstIn != -1) {
+ int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
+ Inst.tieOperands(NewVDst, NewVDstIn);
+ }
+ }
+ }
+
+ if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
+ VAddrDef->eraseFromParent();
+
+ return true;
+}
+
// FIXME: Remove this when SelectionDAG is obsoleted.
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
@@ -4752,6 +5141,9 @@
if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
return;
+ if (moveFlatAddrToVGPR(MI))
+ return;
+
Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
SAddr->setReg(ToSGPR);
}
@@ -4905,7 +5297,7 @@
.addReg(Exec)
.addReg(SaveExec);
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
}
// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
@@ -5316,17 +5708,10 @@
.add(*SOffset)
.add(*Offset);
- // Atomics do not have this operand.
- if (const MachineOperand *GLC =
- getNamedOperand(MI, AMDGPU::OpName::glc)) {
- MIB.addImm(GLC->getImm());
+ if (const MachineOperand *CPol =
+ getNamedOperand(MI, AMDGPU::OpName::cpol)) {
+ MIB.addImm(CPol->getImm());
}
- if (const MachineOperand *DLC =
- getNamedOperand(MI, AMDGPU::OpName::dlc)) {
- MIB.addImm(DLC->getImm());
- }
-
- MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
if (const MachineOperand *TFE =
getNamedOperand(MI, AMDGPU::OpName::tfe)) {
@@ -5346,7 +5731,7 @@
.addReg(NewSRsrc)
.add(*SOffset)
.add(*Offset)
- .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
+ .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
.cloneMemRefs(MI);
}
@@ -5449,6 +5834,11 @@
Inst.eraseFromParent();
continue;
+ case AMDGPU::S_BREV_B64:
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
+ Inst.eraseFromParent();
+ continue;
+
case AMDGPU::S_NOT_B64:
splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
Inst.eraseFromParent();
@@ -5654,6 +6044,8 @@
// Only propagate through live-def of SCC.
if (Op.isDef() && !Op.isDead())
addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
+ if (Op.isUse())
+ addSCCDefsToVALUWorklist(Op, Worklist);
Inst.RemoveOperand(i);
}
}
@@ -5999,7 +6391,7 @@
void SIInstrInfo::splitScalar64BitUnaryOp(
SetVectorType &Worklist, MachineInstr &Inst,
- unsigned Opcode) const {
+ unsigned Opcode, bool Swap) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6032,6 +6424,9 @@
Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
+ if (Swap)
+ std::swap(DestSub0, DestSub1);
+
Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
@@ -6341,7 +6736,8 @@
case AMDGPU::COPY:
case AMDGPU::WQM:
case AMDGPU::SOFT_WQM:
- case AMDGPU::WWM:
+ case AMDGPU::STRICT_WWM:
+ case AMDGPU::STRICT_WQM:
case AMDGPU::REG_SEQUENCE:
case AMDGPU::PHI:
case AMDGPU::INSERT_SUBREG:
@@ -6485,6 +6881,32 @@
}
}
+// Instructions that use SCC may be converted to VALU instructions. When that
+// happens, the SCC register is changed to VCC_LO. The instruction that defines
+// SCC must be changed to an instruction that defines VCC. This function makes
+// sure that the instruction that defines SCC is added to the moveToVALU
+// worklist.
+void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op,
+ SetVectorType &Worklist) const {
+ assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse());
+
+ MachineInstr *SCCUseInst = Op.getParent();
+ // Look for a preceding instruction that either defines VCC or SCC. If VCC,
+ // then there is nothing to do because the defining instruction has been
+ // converted to a VALU already. If SCC then that instruction needs to be
+ // converted to a VALU.
+ for (MachineInstr &MI :
+ make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
+ SCCUseInst->getParent()->rend())) {
+ if (MI.modifiesRegister(AMDGPU::VCC, &RI))
+ break;
+ if (MI.definesRegister(AMDGPU::SCC, &RI)) {
+ Worklist.insert(&MI);
+ break;
+ }
+ }
+}
+
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
const MachineInstr &Inst) const {
const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
@@ -6499,7 +6921,8 @@
case AMDGPU::INSERT_SUBREG:
case AMDGPU::WQM:
case AMDGPU::SOFT_WQM:
- case AMDGPU::WWM: {
+ case AMDGPU::STRICT_WWM:
+ case AMDGPU::STRICT_WQM: {
const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
if (RI.hasAGPRs(SrcRC)) {
if (RI.hasAGPRs(NewDstRC))
@@ -6614,7 +7037,7 @@
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
- return (22ULL << 44) | // IMG_FORMAT_32_FLOAT
+ return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) |
(1ULL << 56) | // RESOURCE_LEVEL = 1
(3ULL << 60); // OOB_SELECT = 3
}
@@ -6786,11 +7209,6 @@
}
switch (Opc) {
- case TargetOpcode::IMPLICIT_DEF:
- case TargetOpcode::KILL:
- case TargetOpcode::DBG_VALUE:
- case TargetOpcode::EH_LABEL:
- return 0;
case TargetOpcode::BUNDLE:
return getInstBundleSize(MI);
case TargetOpcode::INLINEASM:
@@ -6800,6 +7218,8 @@
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
}
default:
+ if (MI.isMetaInstruction())
+ return 0;
return DescSize;
}
}
@@ -7026,36 +7446,92 @@
return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}
+// Depending on the used address space and instructions, some immediate offsets
+// are allowed and some are not.
+// In general, flat instruction offsets can only be non-negative, global and
+// scratch instruction offsets can also be negative.
+//
+// There are several bugs related to these offsets:
+// On gfx10.1, flat instructions that go into the global address space cannot
+// use an offset.
+//
+// For scratch instructions, the address can be either an SGPR or a VGPR.
+// The following offsets can be used, depending on the architecture (x means
+// cannot be used):
+// +----------------------------+------+------+
+// | Address-Mode | SGPR | VGPR |
+// +----------------------------+------+------+
+// | gfx9 | | |
+// | negative, 4-aligned offset | x | ok |
+// | negative, unaligned offset | x | ok |
+// +----------------------------+------+------+
+// | gfx10 | | |
+// | negative, 4-aligned offset | ok | ok |
+// | negative, unaligned offset | ok | x |
+// +----------------------------+------+------+
+// | gfx10.3 | | |
+// | negative, 4-aligned offset | ok | ok |
+// | negative, unaligned offset | ok | ok |
+// +----------------------------+------+------+
+//
+// This function ignores the addressing mode, so if an offset cannot be used in
+// one addressing mode, it is considered illegal.
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
- bool Signed) const {
+ uint64_t FlatVariant) const {
// TODO: Should 0 be special cased?
if (!ST.hasFlatInstOffsets())
return false;
- if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
+ if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
+ (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
+ AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
return false;
+ bool Signed = FlatVariant != SIInstrFlags::FLAT;
+ if (ST.hasNegativeScratchOffsetBug() &&
+ FlatVariant == SIInstrFlags::FlatScratch)
+ Signed = false;
+ if (ST.hasNegativeUnalignedScratchOffsetBug() &&
+ FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
+ (Offset % 4) != 0) {
+ return false;
+ }
+
unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed);
return Signed ? isIntN(N, Offset) : isUIntN(N, Offset);
}
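[Editor's note, illustrative only, not part of the patch] The final isIntN/isUIntN check above only asks whether the offset fits the N-bit immediate field, signed or unsigned depending on the FLAT variant. A minimal equivalent, assuming 0 < N < 63:

#include <cstdint>

// Range check for an N-bit offset field, signed or unsigned.
static bool fitsOffsetField(int64_t Offset, unsigned N, bool Signed) {
  const int64_t Lim = INT64_C(1) << (Signed ? N - 1 : N);
  return Signed ? (Offset >= -Lim && Offset < Lim)
                : (Offset >= 0 && Offset < Lim);
}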
-std::pair<int64_t, int64_t> SIInstrInfo::splitFlatOffset(int64_t COffsetVal,
- unsigned AddrSpace,
- bool IsSigned) const {
+// See the comment on SIInstrInfo::isLegalFLATOffset for what is legal and what is not.
+std::pair<int64_t, int64_t>
+SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
+ uint64_t FlatVariant) const {
int64_t RemainderOffset = COffsetVal;
int64_t ImmField = 0;
- const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, IsSigned);
- if (IsSigned) {
+ bool Signed = FlatVariant != SIInstrFlags::FLAT;
+ if (ST.hasNegativeScratchOffsetBug() &&
+ FlatVariant == SIInstrFlags::FlatScratch)
+ Signed = false;
+
+ const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed);
+ if (Signed) {
// Use signed division by a power of two to truncate towards 0.
int64_t D = 1LL << (NumBits - 1);
RemainderOffset = (COffsetVal / D) * D;
ImmField = COffsetVal - RemainderOffset;
+
+ if (ST.hasNegativeUnalignedScratchOffsetBug() &&
+ FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
+ (ImmField % 4) != 0) {
+ // Make ImmField a multiple of 4
+ RemainderOffset += ImmField % 4;
+ ImmField -= ImmField % 4;
+ }
} else if (COffsetVal >= 0) {
ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
RemainderOffset = COffsetVal - ImmField;
}
- assert(isLegalFLATOffset(ImmField, AddrSpace, IsSigned));
+ assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
assert(RemainderOffset + ImmField == COffsetVal);
return {ImmField, RemainderOffset};
}
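[Editor's note, illustrative only, not part of the patch] The signed path above truncates toward zero with a power-of-two division so the immediate keeps the sign of the original offset, and the new workaround then rounds a negative, unaligned immediate toward zero to a multiple of 4 while pushing the residue into the remainder. A standalone sketch of just that arithmetic, with invented names:

#include <cstdint>
#include <utility>

// Split COffsetVal into {immediate, remainder} for a signed NumBits-wide field.
// NeedsAlign4 models the negative-unaligned-scratch-offset workaround.
static std::pair<int64_t, int64_t>
splitSignedOffset(int64_t COffsetVal, unsigned NumBits, bool NeedsAlign4) {
  const int64_t D = INT64_C(1) << (NumBits - 1); // signed division truncates toward 0
  int64_t Remainder = (COffsetVal / D) * D;
  int64_t Imm = COffsetVal - Remainder;          // fits the signed NumBits field
  if (NeedsAlign4 && Imm < 0 && (Imm % 4) != 0) {
    Remainder += Imm % 4;                        // move the misaligned residue out
    Imm -= Imm % 4;                              // immediate is now a multiple of 4
  }
  return {Imm, Remainder};                       // invariant: Imm + Remainder == COffsetVal
}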
@@ -7069,7 +7545,8 @@
GFX80 = 4,
GFX9 = 5,
GFX10 = 6,
- SDWA10 = 7
+ SDWA10 = 7,
+ GFX90A = 8
};
static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
@@ -7141,6 +7618,15 @@
if (MCOp == -1)
return Opcode;
+ if (ST.hasGFX90AInsts()) {
+ uint16_t NMCOp = (uint16_t)-1;
+ NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
+ if (NMCOp == (uint16_t)-1)
+ NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
+ if (NMCOp != (uint16_t)-1)
+ MCOp = NMCOp;
+ }
+
// (uint16_t)-1 means that Opcode is a pseudo instruction that has
// no encoding in the given subtarget generation.
if (MCOp == (uint16_t)-1)
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index ce59fe8..fc5e5be 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -96,7 +96,8 @@
unsigned Opcode) const;
void splitScalar64BitUnaryOp(SetVectorType &Worklist,
- MachineInstr &Inst, unsigned Opcode) const;
+ MachineInstr &Inst, unsigned Opcode,
+ bool Swap = false) const;
void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
@@ -122,6 +123,8 @@
void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
SetVectorType &Worklist) const;
+ void addSCCDefsToVALUWorklist(MachineOperand &Op,
+ SetVectorType &Worklist) const;
const TargetRegisterClass *
getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
@@ -158,8 +161,7 @@
// MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI.
MO_REL32_HI = 5,
- MO_LONG_BRANCH_FORWARD = 6,
- MO_LONG_BRANCH_BACKWARD = 7,
+ MO_FAR_BRANCH_OFFSET = 6,
MO_ABS32_LO = 8,
MO_ABS32_HI = 9,
@@ -171,9 +173,15 @@
return RI;
}
+ const GCNSubtarget &getSubtarget() const {
+ return ST;
+ }
+
bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
AAResults *AA) const override;
+ bool isIgnorableUse(const MachineOperand &MO) const override;
+
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
int64_t &Offset1,
int64_t &Offset2) const override;
@@ -501,28 +509,28 @@
// i.e. global_* or scratch_*.
static bool isSegmentSpecificFLAT(const MachineInstr &MI) {
auto Flags = MI.getDesc().TSFlags;
- return Flags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch);
+ return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch);
}
bool isSegmentSpecificFLAT(uint16_t Opcode) const {
auto Flags = get(Opcode).TSFlags;
- return Flags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch);
+ return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch);
}
static bool isFLATGlobal(const MachineInstr &MI) {
- return MI.getDesc().TSFlags & SIInstrFlags::IsFlatGlobal;
+ return MI.getDesc().TSFlags & SIInstrFlags::FlatGlobal;
}
bool isFLATGlobal(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::IsFlatGlobal;
+ return get(Opcode).TSFlags & SIInstrFlags::FlatGlobal;
}
static bool isFLATScratch(const MachineInstr &MI) {
- return MI.getDesc().TSFlags & SIInstrFlags::IsFlatScratch;
+ return MI.getDesc().TSFlags & SIInstrFlags::FlatScratch;
}
bool isFLATScratch(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::IsFlatScratch;
+ return get(Opcode).TSFlags & SIInstrFlags::FlatScratch;
}
// Any FLAT encoded instruction, including global_* and scratch_*.
@@ -538,6 +546,32 @@
return get(Opcode).TSFlags & SIInstrFlags::EXP;
}
+ static bool isAtomicNoRet(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicNoRet;
+ }
+
+ bool isAtomicNoRet(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsAtomicNoRet;
+ }
+
+ static bool isAtomicRet(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicRet;
+ }
+
+ bool isAtomicRet(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsAtomicRet;
+ }
+
+ static bool isAtomic(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & (SIInstrFlags::IsAtomicRet |
+ SIInstrFlags::IsAtomicNoRet);
+ }
+
+ bool isAtomic(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & (SIInstrFlags::IsAtomicRet |
+ SIInstrFlags::IsAtomicNoRet);
+ }
+
static bool isWQM(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::WQM;
}
@@ -915,6 +949,10 @@
MachineBasicBlock *
legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT = nullptr) const;
+ /// Change the SADDR form of a FLAT \p Inst to its VADDR form if the saddr
+ /// operand was moved to a VGPR. \returns true on success.
+ bool moveFlatAddrToVGPR(MachineInstr &Inst) const;
+
/// Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
/// VALU if necessary. If present, \p MDT is updated.
@@ -1039,13 +1077,13 @@
/// encoded instruction. If \p Signed, this is for an instruction that
/// interprets the offset as signed.
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
- bool Signed) const;
+ uint64_t FlatVariant) const;
/// Split \p COffsetVal into {immediate offset field, remainder offset}
/// values.
std::pair<int64_t, int64_t> splitFlatOffset(int64_t COffsetVal,
unsigned AddrSpace,
- bool IsSigned) const;
+ uint64_t FlatVariant) const;
/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
@@ -1059,11 +1097,7 @@
const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum,
const TargetRegisterInfo *TRI,
const MachineFunction &MF)
- const override {
- if (OpNum >= TID.getNumOperands())
- return nullptr;
- return RI.getRegClass(TID.OpInfo[OpNum].RegClass);
- }
+ const override;
void fixImplicitOperands(MachineInstr &MI) const;
@@ -1166,26 +1200,39 @@
int getMUBUFNoLdsInst(uint16_t Opcode);
LLVM_READONLY
- int getAtomicRetOp(uint16_t Opcode);
-
- LLVM_READONLY
int getAtomicNoRetOp(uint16_t Opcode);
LLVM_READONLY
int getSOPKOp(uint16_t Opcode);
+ /// \returns SADDR form of a FLAT Global instruction given an \p Opcode
+ /// of a VADDR form.
LLVM_READONLY
int getGlobalSaddrOp(uint16_t Opcode);
+ /// \returns VADDR form of a FLAT Global instruction given an \p Opcode
+ /// of a SADDR form.
+ LLVM_READONLY
+ int getGlobalVaddrOp(uint16_t Opcode);
+
LLVM_READONLY
int getVCMPXNoSDstOp(uint16_t Opcode);
+ /// \returns ST form with only immediate offset of a FLAT Scratch instruction
+ /// given an \p Opcode of an SS (SADDR) form.
LLVM_READONLY
int getFlatScratchInstSTfromSS(uint16_t Opcode);
+ /// \returns SS (SADDR) form of a FLAT Scratch instruction given an \p Opcode
+ /// of an SV (VADDR) form.
LLVM_READONLY
int getFlatScratchInstSSfromSV(uint16_t Opcode);
+ /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode
+ /// of an SS (SADDR) form.
+ LLVM_READONLY
+ int getFlatScratchInstSVfromSS(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 5adc9e8..25b647d 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -16,7 +16,7 @@
Predicate VIAssemblerPredicate = isGFX8GFX9;
}
-// Execpt for the NONE field, this must be kept in sync with the
+// Except for the NONE field, this must be kept in sync with the
// SIEncodingFamily enum in AMDGPUInstrInfo.cpp
def SIEncodingFamily {
int NONE = -1;
@@ -28,6 +28,7 @@
int GFX9 = 5;
int GFX10 = 6;
int SDWA10 = 7;
+ int GFX90A = 8;
}
//===----------------------------------------------------------------------===//
@@ -186,6 +187,8 @@
def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
+def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
+def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
SDTypeProfile<1, 9,
@@ -265,21 +268,25 @@
!eq(SrcVT.Value, v2f16.Value),
!eq(SrcVT.Value, v4f16.Value),
!eq(SrcVT.Value, v2f32.Value),
- !eq(SrcVT.Value, v2f64.Value));
+ !eq(SrcVT.Value, v2f64.Value),
+ !eq(SrcVT.Value, v4f64.Value));
}
class isIntType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, i16.Value),
!eq(SrcVT.Value, i32.Value),
- !eq(SrcVT.Value, i64.Value));
+ !eq(SrcVT.Value, i64.Value),
+ !eq(SrcVT.Value, v2i32.Value));
}
class isPackedType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, v2i16.Value),
!eq(SrcVT.Value, v2f16.Value),
- !eq(SrcVT.Value, v4f16.Value));
+ !eq(SrcVT.Value, v4f16.Value),
+ !eq(SrcVT.Value, v2f32.Value));
}
+
//===----------------------------------------------------------------------===//
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
@@ -629,6 +636,11 @@
(add (ctpop $src0), $src1)
>;
+def xnor : PatFrag <
+ (ops node:$src0, node:$src1),
+ (not (xor $src0, $src1))
+>;
+
foreach I = 1-4 in {
def shl#I#_add : PatFrag <
(ops node:$src0, node:$src1),
@@ -802,26 +814,28 @@
(isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode()));
}], getNegV2I16Imm>;
+
+def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
+ return fp16SrcZerosHighBits(N->getOpcode());
+}]>;
+
+
//===----------------------------------------------------------------------===//
// MUBUF/SMEM Patterns
//===----------------------------------------------------------------------===//
-def extract_glc : SDNodeXForm<timm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8);
-}]>;
-
-def extract_slc : SDNodeXForm<timm, [{
- return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
-}]>;
-
-def extract_dlc : SDNodeXForm<timm, [{
- return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
+def extract_cpol : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & AMDGPU::CPol::ALL, SDLoc(N), MVT::i8);
}]>;
def extract_swz : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
}]>;
+def set_glc : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8);
+}]>;
+
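[Editor's note, illustrative only, not part of the patch] extract_cpol and set_glc above fold the former separate glc/slc/dlc flags into one cache-policy (cpol) immediate: extract_cpol masks with AMDGPU::CPol::ALL and set_glc ORs in the GLC bit. A minimal sketch of that packing; the bit positions below are assumptions made for the example, and the authoritative values live in AMDGPU::CPol:

#include <cstdint>

namespace cpol_sketch {
// Assumed bit layout, for illustration only.
constexpr uint32_t GLC = 1u << 0;
constexpr uint32_t SLC = 1u << 1;
constexpr uint32_t DLC = 1u << 2;
constexpr uint32_t ALL = GLC | SLC | DLC;

constexpr uint32_t extractCPol(uint32_t Imm) { return Imm & ALL; } // mirrors extract_cpol
constexpr uint32_t setGLC(uint32_t Imm) { return Imm | GLC; }      // mirrors set_glc
} // namespace cpol_sketch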
//===----------------------------------------------------------------------===//
// Custom Operands
//===----------------------------------------------------------------------===//
@@ -1074,6 +1088,12 @@
let ParserMatchClass = MatchClass;
}
+class NamedOperandU32Default1<string Name, AsmOperandClass MatchClass> :
+ OperandWithDefaultOps<i32, (ops (i32 1))> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
let OperandType = "OPERAND_IMMEDIATE" in {
def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>;
@@ -1097,18 +1117,14 @@
def clampmod0 : NamedOperandBit_0<"ClampSI", NamedMatchClass<"ClampSI">>;
def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>;
-def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>;
-def DLC_0 : NamedOperandBit_0<"DLC", NamedMatchClass<"DLC">>;
-
-def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
-def GLC_0 : NamedOperandBit_0<"GLC", NamedMatchClass<"GLC">>;
-def GLC_1 : NamedOperandBit_1<"GLC", NamedMatchClass<"GLC_1">>;
-
-def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
-def SLC_0 : NamedOperandBit_0<"SLC", NamedMatchClass<"SLC">>;
+def CPol : NamedOperandU32<"CPol", NamedMatchClass<"CPol">>;
+def CPol_0 : NamedOperandU32Default0<"CPol", NamedMatchClass<"CPol">>;
+def CPol_GLC1 : NamedOperandU32Default1<"CPol", NamedMatchClass<"CPol">>;
def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
+def TFE_0 : NamedOperandBit_0<"TFE", NamedMatchClass<"TFE">>;
def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>;
+def SWZ_0 : NamedOperandBit_0<"SWZ", NamedMatchClass<"SWZ">>;
def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>;
@@ -1243,7 +1259,7 @@
def FPVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
- let PredicateMethod = "isVReg32";
+ let PredicateMethod = "isVRegWithInputMods";
}
def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
@@ -1270,7 +1286,7 @@
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
let ParserMethod = "parseRegWithIntInputMods";
- let PredicateMethod = "isVReg32";
+ let PredicateMethod = "isVRegWithInputMods";
}
def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
@@ -1363,11 +1379,6 @@
int NONE = 0;
}
-def TRAPID{
- int LLVM_TRAP = 2;
- int LLVM_DEBUG_TRAP = 3;
-}
-
def HWREG {
int MODE = 1;
int STATUS = 2;
@@ -1507,8 +1518,12 @@
VSrc_128,
!if(!eq(VT.Size, 64),
!if(isFP,
- VSrc_f64,
- VSrc_b64),
+ !if(!eq(VT.Value, v2f32.Value),
+ VSrc_v2f32,
+ VSrc_f64),
+ !if(!eq(VT.Value, v2i32.Value),
+ VSrc_v2b32,
+ VSrc_b64)),
!if(!eq(VT.Value, i1.Value),
SSrc_i1,
!if(isFP,
@@ -1541,7 +1556,9 @@
!eq(SrcVT.Value, f32.Value),
!eq(SrcVT.Value, f64.Value),
!eq(SrcVT.Value, v2f16.Value),
- !eq(SrcVT.Value, v2i16.Value));
+ !eq(SrcVT.Value, v2i16.Value),
+ !eq(SrcVT.Value, v2f32.Value),
+ !eq(SrcVT.Value, v2i32.Value));
}
// Return type of input modifiers operand for specified input operand
@@ -1598,8 +1615,11 @@
!if (!eq(NumSrcArgs, 1),
!if (HasModifiers,
// VOP1 with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- clampmod0:$clamp, omod0:$omod)
+ !if(HasOMod,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ clampmod0:$clamp, omod0:$omod),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ clampmod0:$clamp))
/* else */,
// VOP1 without modifiers
!if (HasClamp,
@@ -1695,7 +1715,7 @@
Src0Mod, Src1Mod, Src2Mod, 1/*HasOpSel*/, 0>.ret;
}
-class getInsDPPBase <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
int NumSrcArgs, bit HasModifiers,
Operand Src0Mod, Operand Src1Mod> {
@@ -1705,45 +1725,45 @@
!if (!eq(NumSrcArgs, 1),
!if (HasModifiers,
// VOP1_DPP with modifiers
- (ins DstRC:$old, Src0Mod:$src0_modifiers,
+ (ins OldRC:$old, Src0Mod:$src0_modifiers,
Src0RC:$src0)
/* else */,
// VOP1_DPP without modifiers
- (ins DstRC:$old, Src0RC:$src0)
+ (ins OldRC:$old, Src0RC:$src0)
/* endif */),
!if (HasModifiers,
// VOP2_DPP with modifiers
- (ins DstRC:$old,
+ (ins OldRC:$old,
Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1)
/* else */,
// VOP2_DPP without modifiers
- (ins DstRC:$old,
+ (ins OldRC:$old,
Src0RC:$src0, Src1RC:$src1)
)));
}
-class getInsDPP <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+class getInsDPP <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
int NumSrcArgs, bit HasModifiers,
Operand Src0Mod, Operand Src1Mod> {
- dag ret = !con(getInsDPPBase<DstRC, Src0RC, Src1RC, NumSrcArgs,
+ dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod>.ret,
(ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl));
}
-class getInsDPP16 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+class getInsDPP16 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
int NumSrcArgs, bit HasModifiers,
Operand Src0Mod, Operand Src1Mod> {
- dag ret = !con(getInsDPP<DstRC, Src0RC, Src1RC, NumSrcArgs,
+ dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod>.ret,
(ins FI:$fi));
}
-class getInsDPP8 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+class getInsDPP8 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
int NumSrcArgs, bit HasModifiers,
Operand Src0Mod, Operand Src1Mod> {
- dag ret = !con(getInsDPPBase<DstRC, Src0RC, Src1RC, NumSrcArgs,
+ dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod>.ret,
(ins dpp8:$dpp8, FI:$fi));
}
@@ -1846,7 +1866,7 @@
// instruction.
class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers,
bit HasClamp, ValueType DstVT = i32> {
- string dst = " $vdst";
+ string dst = "$vdst";
string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
string src1 = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1",
@@ -1867,7 +1887,7 @@
bit Src0HasMods,
bit Src1HasMods,
bit Src2HasMods> {
- string dst = " $vdst";
+ string dst = "$vdst";
string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
string isrc1 = !if(!eq(NumSrcArgs, 1), "",
@@ -1972,14 +1992,29 @@
string ret = dst#args#sdwa;
}
-
-// Function that checks if instruction supports DPP and SDWA
-class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
- ValueType Src1VT = i32> {
+class getHas64BitOps <int NumSrcArgs, ValueType DstVT, ValueType Src0VT,
+ ValueType Src1VT> {
bit ret = !if(!eq(NumSrcArgs, 3),
- 0, // NumSrcArgs == 3 - No DPP or SDWA for VOP3
+ 0,
!if(!eq(DstVT.Size, 64),
- 0, // 64-bit dst - No DPP or SDWA for 64-bit operands
+ 1,
+ !if(!eq(Src0VT.Size, 64),
+ 1,
+ !if(!eq(Src1VT.Size, 64),
+ 1,
+ 0
+ )
+ )
+ )
+ );
+}
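
The nested !if chain above reduces to a single boolean expression guarded by the VOP3 check. A minimal C++ restatement, purely illustrative (the function name and sample values are mine; the parameters mirror the TableGen arguments and are not an LLVM API):

    #include <cassert>

    // Illustrative restatement of getHas64BitOps: true when the op is not
    // VOP3 and any of the destination or source types is 64 bits wide.
    static bool has64BitOps(int NumSrcArgs, unsigned DstSize, unsigned Src0Size,
                            unsigned Src1Size) {
      if (NumSrcArgs == 3)
        return false; // VOP3 is excluded outright.
      return DstSize == 64 || Src0Size == 64 || Src1Size == 64;
    }

    int main() {
      assert(has64BitOps(1, 64, 32, 32));  // 64-bit dst
      assert(has64BitOps(2, 32, 64, 32));  // 64-bit src0
      assert(!has64BitOps(2, 32, 32, 32)); // all operands 32-bit
      assert(!has64BitOps(3, 64, 64, 64)); // VOP3 never qualifies
    }
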
+
+class getHasSDWA <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+ ValueType Src1VT = i32> {
+ bit ret = !if(!eq(NumSrcArgs, 3),
+ 0, // NumSrcArgs == 3 - No SDWA for VOP3
+ !if(!eq(DstVT.Size, 64),
+ 0, // 64-bit dst - No SDWA for 64-bit operands
!if(!eq(Src0VT.Size, 64),
0, // 64-bit src0
!if(!eq(Src1VT.Size, 64),
@@ -1993,8 +2028,42 @@
class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
- bit ret = !if(!eq(NumSrcArgs, 0), 0,
- getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+ bit ret = !if(!eq(NumSrcArgs, 3),
+ 0, // NumSrcArgs == 3 - No DPP for VOP3
+ 1);
+}
+
+class getHasExt64BitDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+ ValueType Src1VT = i32> {
+ bit ret = !and(getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret,
+ getHas64BitOps<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+}
+
+// Function that checks if instruction supports DPP and SDWA
+class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+ ValueType Src1VT = i32> {
+ bit ret = !or(getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret,
+ getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+}
+
+// Return an AGPR+VGPR operand class for the given VGPR register class.
+class getLdStRegisterOperand<RegisterClass RC> {
+ RegisterOperand ret =
+ !if(!eq(RC.Size, 32), AVLdSt_32,
+ !if(!eq(RC.Size, 64), AVLdSt_64,
+ !if(!eq(RC.Size, 96), AVLdSt_96,
+ !if(!eq(RC.Size, 128), AVLdSt_128,
+ !if(!eq(RC.Size, 160), AVLdSt_160,
+ RegisterOperand<VReg_1> // invalid register
+ )))));
+}
+
+class BitOr<bit a, bit b> {
+ bit ret = !if(a, 1, !if(b, 1, 0));
+}
+
+class BitAnd<bit a, bit b> {
+ bit ret = !if(a, !if(b, 1, 0), 0);
}
def PatGenMode {
@@ -2037,6 +2106,7 @@
field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;
field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case
+ field bit EmitDstSel = EmitDst;
field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret;
field bit HasSrc0 = !ne(Src0VT.Value, untyped.Value);
field bit HasSrc1 = !ne(Src1VT.Value, untyped.Value);
@@ -2077,12 +2147,14 @@
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
- field bit HasExtSDWA = HasExt;
- field bit HasExtSDWA9 = HasExt;
+ field bit HasExt64BitDPP = getHasExt64BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+ field bit HasExtSDWA = getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+ field bit HasExtSDWA9 = HasExtSDWA;
field int NeedPatGen = PatGenMode.NoPattern;
field bit IsMAI = 0;
field bit IsDOT = 0;
+ field bit IsSingle = 0;
field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
@@ -2134,7 +2206,9 @@
field string AsmDPP = !if(HasExtDPP,
getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, "");
field string AsmDPP16 = getAsmDPP16<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
- field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0, DstVT>.ret;
+ // DPP8 encoding has no fields for modifiers, and it is enforced by setting
+ // the asm operand name via this HasModifiers flag
+ field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
@@ -2144,6 +2218,7 @@
class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
let HasExt = 0;
let HasExtDPP = 0;
+ let HasExt64BitDPP = 0;
let HasExtSDWA = 0;
let HasExtSDWA9 = 0;
}
@@ -2191,6 +2266,7 @@
def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>;
def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>;
+def VOP_I64_I64 : VOPProfile <[i64, i64, untyped, untyped]>;
def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
@@ -2234,6 +2310,16 @@
def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>;
def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>;
+def VOP_V4F64_F64_F64_V4F64 : VOPProfile <[v4f64, f64, f64, v4f64]>;
+def VOP_V1F64_F64_F64_V1F64 : VOPProfile <[v1f64, f64, f64, v1f64]>;
+
+def VOP_V2F32_V2F32_V2F32_V2F32 : VOPProfile <[v2f32, v2f32, v2f32, v2f32]>;
+def VOP_V2F32_V2F32_V2F32 : VOPProfile <[v2f32, v2f32, v2f32, untyped]>;
+def VOP_V2I32_V2I32_V2I32 : VOPProfile <[v2i32, v2i32, v2i32, untyped]>;
+def VOP_V4F32_V4I16_V4I16_V4F32 : VOPProfile <[v4f32, v4i16, v4i16, v4f32]>;
+def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>;
+def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>;
+
class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
@@ -2372,7 +2458,8 @@
[!cast<string>(SIEncodingFamily.GFX80)],
[!cast<string>(SIEncodingFamily.GFX9)],
[!cast<string>(SIEncodingFamily.GFX10)],
- [!cast<string>(SIEncodingFamily.SDWA10)]];
+ [!cast<string>(SIEncodingFamily.SDWA10)],
+ [!cast<string>(SIEncodingFamily.GFX90A)]];
}
// Get equivalent SOPK instruction.
@@ -2408,15 +2495,6 @@
let ValueCols = [["0"]];
}
-// Maps an atomic opcode to its version with a return value.
-def getAtomicRetOp : InstrMapping {
- let FilterClass = "AtomicNoRet";
- let RowFields = ["NoRetOp"];
- let ColFields = ["IsRet"];
- let KeyCol = ["0"];
- let ValueCols = [["1"]];
-}
-
// Maps an atomic opcode to its returnless version.
def getAtomicNoRetOp : InstrMapping {
let FilterClass = "AtomicNoRet";
@@ -2435,6 +2513,15 @@
let ValueCols = [["1"]];
}
+// Maps a GLOBAL SADDR to its VADDR form.
+def getGlobalVaddrOp : InstrMapping {
+ let FilterClass = "GlobalSaddrTable";
+ let RowFields = ["SaddrOp"];
+ let ColFields = ["IsSaddr"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
+}
+
// Maps a v_cmpx opcode with sdst to opcode without sdst.
def getVCMPXNoSDstOp : InstrMapping {
let FilterClass = "VCMPXNoSDstTable";
@@ -2470,6 +2557,14 @@
let ValueCols = [["SS"]];
}
+def getFlatScratchInstSVfromSS : InstrMapping {
+ let FilterClass = "FlatScratchInst";
+ let RowFields = ["SVOp"];
+ let ColFields = ["Mode"];
+ let KeyCol = ["SS"];
+ let ValueCols = [["SV"]];
+}
+
include "SIInstructions.td"
include "DSInstructions.td"
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
index 7c1cbd6..fbf4634 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -41,18 +41,21 @@
(i32 timm:$attrchan), (i32 timm:$attr), M0))]
>;
-let OtherPredicates = [has32BankLDS] in {
+let OtherPredicates = [has32BankLDS, isNotGFX90APlus] in {
defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;
-} // End OtherPredicates = [has32BankLDS]
+} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus]
-let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
+let OtherPredicates = [has16BankLDS, isNotGFX90APlus],
+ Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
-} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
+} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus],
+ // Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
+let OtherPredicates = [isNotGFX90APlus] in {
let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
defm V_INTERP_P2_F32 : VINTRP_m <
@@ -73,6 +76,8 @@
[(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
(i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
+} // End OtherPredicates = [isNotGFX90APlus]
+
} // End Uses = [MODE, M0, EXEC]
//===----------------------------------------------------------------------===//
@@ -86,11 +91,6 @@
let maybeAtomic = 1;
}
-def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> {
- let HasExt = 1;
- let HasExtDPP = 1;
-}
-
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// For use in patterns
@@ -104,13 +104,31 @@
// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
- (ins VSrc_b64:$src0)>;
+ (ins VSrc_b64:$src0)> {
+ let isReMaterializable = 1;
+ let isAsCheapAsAMove = 1;
+ let isMoveImm = 1;
+ let SchedRW = [Write64Bit];
+ let Size = 16; // Needs a maximum of 2 v_mov_b32 instructions, 8 bytes long each.
+}
// 64-bit vector move with dpp. Expanded post-RA.
-def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> {
+def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {
let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
}
+// 64-bit scalar move immediate instruction. This is used to avoid subreg
+// initialization and to allow rematerialization.
+def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
+ (ins i64imm:$src0)> {
+ let isReMaterializable = 1;
+ let isAsCheapAsAMove = 1;
+ let isMoveImm = 1;
+ let SchedRW = [WriteSALU, Write64Bit];
+ let Size = 16; // Needs a maximum of 2 s_mov_b32 instructions, 8 bytes long each.
+ let Uses = [];
+}
+
// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
@@ -119,17 +137,18 @@
// turned into a copy by WQM pass, but does not seed WQM requirements.
def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
-// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
+// Pseudoinstruction for @llvm.amdgcn.strict.wwm. It is turned into a copy post-RA, so
// that the @earlyclobber is respected. The @earlyclobber is to make sure that
-// the instruction that defines $src0 (which is run in WWM) doesn't
+// the instruction that defines $src0 (which is run in Whole Wave Mode) doesn't
// accidentally clobber inactive channels of $vdst.
let Constraints = "@earlyclobber $vdst" in {
-def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
}
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
-def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
+def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
let Uses = [EXEC];
let Defs = [EXEC, SCC];
let hasSideEffects = 0;
@@ -137,7 +156,21 @@
let mayStore = 0;
}
-def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
+def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
+def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
+ let Uses = [EXEC];
+ let Defs = [EXEC, SCC];
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
+def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
let hasSideEffects = 0;
let mayLoad = 0;
let mayStore = 0;
@@ -145,6 +178,7 @@
// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
// restoring it after we're done.
+let Defs = [SCC] in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
(ins VGPR_32: $src, VSrc_b32:$inactive),
[(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
@@ -156,6 +190,7 @@
[(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
let Constraints = "$src = $vdst";
}
+} // End Defs = [SCC]
let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
@@ -230,6 +265,7 @@
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
+def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
}
let WaveSizePredicate = isWave32 in {
@@ -237,6 +273,7 @@
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
+def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
}
@@ -255,19 +292,6 @@
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
-// Dummy terminator instruction to use after control flow instructions
-// replaced with exec mask operations.
-def SI_MASK_BRANCH : VPseudoInstSI <
- (outs), (ins brtarget:$target)> {
- let isBranch = 0;
- let isTerminator = 1;
- let isBarrier = 0;
- let SchedRW = [];
- let hasNoSchedulingInfo = 1;
- let FixedSize = 1;
- let Size = 0;
-}
-
let isTerminator = 1 in {
let OtherPredicates = [EnableLateCFGStructurize] in {
@@ -294,6 +318,14 @@
let hasSideEffects = 1;
}
+def SI_WATERFALL_LOOP : CFPseudoInstSI <
+ (outs),
+ (ins brtarget:$target), [], 1> {
+ let Size = 8;
+ let isBranch = 1;
+ let Defs = [];
+}
+
def SI_LOOP : CFPseudoInstSI <
(outs), (ins SReg_1:$saved, brtarget:$target),
[(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
@@ -337,24 +369,22 @@
// required in degenerate cases (when V_CMPX cannot be used due to constant
// bus limitations) and because it allows us to avoid having to track SCC
// liveness across basic blocks.
- let Defs = [EXEC,VCC,SCC] in
+ let Defs = [EXEC,SCC] in
def _PSEUDO : PseudoInstSI <(outs), ins> {
let isConvergent = 1;
let usesCustomInserter = 1;
}
- let Defs = [EXEC,VCC,SCC] in
+ let Defs = [EXEC,SCC] in
def _TERMINATOR : SPseudoInstSI <(outs), ins> {
let isTerminator = 1;
}
}
defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
+let Defs = [VCC] in
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
-let Defs = [EXEC] in
-def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
-
let Defs = [EXEC,VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
(outs unknown:$dst), (ins unknown:$src),
@@ -376,6 +406,18 @@
let SALU = 1;
}
+let Uses = [EXEC] in {
+def SI_LIVE_MASK : PseudoInstSI <
+ (outs SReg_1:$dst), (ins),
+ [(set i1:$dst, (int_amdgcn_live_mask))]> {
+ let SALU = 1;
+}
+let Defs = [EXEC,SCC] in {
+// Demote: Turn a pixel shader thread into a helper lane.
+def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>;
+} // End Defs = [EXEC,SCC]
+} // End Uses = [EXEC]
+
def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
[(int_amdgcn_unreachable)],
"; divergent unreachable"> {
@@ -463,7 +505,7 @@
// Tail call handling pseudo
def SI_TCRETURN : SPseudoInstSI <(outs),
- (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff),
+ (ins SReg_64:$src0, unknown:$callee, i32imm:$fpdiff),
[(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
let Size = 4;
let isCall = 1;
@@ -476,6 +518,11 @@
let isConvergent = 1;
}
+// Handle selecting indirect tail calls
+def : GCNPat<
+ (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)),
+ (SI_TCRETURN SReg_64:$src0, (i64 0), i32imm:$fpdiff)
+>;
def ADJCALLSTACKUP : SPseudoInstSI<
(outs), (ins i32imm:$amt0, i32imm:$amt1),
@@ -654,6 +701,7 @@
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
+defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
@@ -697,6 +745,7 @@
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
+defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
@@ -707,6 +756,7 @@
defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
+defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
@@ -749,6 +799,16 @@
(SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;
+def : Pat <
+ (int_amdgcn_wqm_demote i1:$src),
+ (SI_DEMOTE_I1 SCSrc_i1:$src, 0)
+>;
+
+def : Pat <
+ (int_amdgcn_wqm_demote (i1 (not i1:$src))),
+ (SI_DEMOTE_I1 SCSrc_i1:$src, -1)
+>;
+
// TODO: we could add more variants for other types of conditionals
def : Pat <
@@ -1021,6 +1081,38 @@
>;
}
+foreach Index = 0-5 in {
+ def Extract_Element_v6i32_#Index : Extract_Element <
+ i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v6i32_#Index : Insert_Element <
+ i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v6f32_#Index : Extract_Element <
+ f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v6f32_#Index : Insert_Element <
+ f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
+foreach Index = 0-6 in {
+ def Extract_Element_v7i32_#Index : Extract_Element <
+ i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v7i32_#Index : Insert_Element <
+ i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v7f32_#Index : Extract_Element <
+ f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v7f32_#Index : Insert_Element <
+ f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
foreach Index = 0-7 in {
def Extract_Element_v8i32_#Index : Extract_Element <
i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -1171,8 +1263,32 @@
def : BitConvert <v2i64, v4f32, VReg_128>;
// 160-bit bitcast
-def : BitConvert <v5i32, v5f32, SGPR_160>;
-def : BitConvert <v5f32, v5i32, SGPR_160>;
+def : BitConvert <v5i32, v5f32, SReg_160>;
+def : BitConvert <v5f32, v5i32, SReg_160>;
+def : BitConvert <v5i32, v5f32, VReg_160>;
+def : BitConvert <v5f32, v5i32, VReg_160>;
+
+// 192-bit bitcast
+def : BitConvert <v6i32, v6f32, SReg_192>;
+def : BitConvert <v6f32, v6i32, SReg_192>;
+def : BitConvert <v6i32, v6f32, VReg_192>;
+def : BitConvert <v6f32, v6i32, VReg_192>;
+def : BitConvert <v3i64, v3f64, VReg_192>;
+def : BitConvert <v3f64, v3i64, VReg_192>;
+def : BitConvert <v3i64, v6i32, VReg_192>;
+def : BitConvert <v3i64, v6f32, VReg_192>;
+def : BitConvert <v3f64, v6i32, VReg_192>;
+def : BitConvert <v3f64, v6f32, VReg_192>;
+def : BitConvert <v6i32, v3i64, VReg_192>;
+def : BitConvert <v6f32, v3i64, VReg_192>;
+def : BitConvert <v6i32, v3f64, VReg_192>;
+def : BitConvert <v6f32, v3f64, VReg_192>;
+
+// 224-bit bitcast
+def : BitConvert <v7i32, v7f32, SReg_224>;
+def : BitConvert <v7f32, v7i32, SReg_224>;
+def : BitConvert <v7i32, v7f32, VReg_224>;
+def : BitConvert <v7f32, v7i32, VReg_224>;
// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
@@ -1349,6 +1465,19 @@
// sub1)
// >;
+// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead
+// of the real value.
+def : GCNPat <
+ (fneg (v2f32 SReg_64:$src)),
+ (v2f32 (REG_SEQUENCE SReg_64,
+ (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
+ (i32 (S_MOV_B32 (i32 0x80000000)))),
+ SReg_32)), sub0,
+ (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
+ (i32 (S_MOV_B32 (i32 0x80000000)))),
+ SReg_32)), sub1))
+>;
+
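
The pattern above folds fneg of a uniform v2f32 into two scalar S_XOR_B32 operations that flip the sign bit of each 32-bit lane. A small host-side sketch, assuming IEEE-754 binary32 floats and C++20 std::bit_cast (both assumptions of this illustration, not part of the patch), showing why the sign-bit XOR is equivalent to negation:

    #include <bit>
    #include <cassert>
    #include <cstdint>

    int main() {
      float Lanes[2] = {1.5f, -2.25f}; // stand-ins for the two v2f32 lanes
      for (float F : Lanes) {
        // S_XOR_B32 with 0x80000000 flips only the sign bit of the lane.
        uint32_t Bits = std::bit_cast<uint32_t>(F) ^ 0x80000000u;
        assert(std::bit_cast<float>(Bits) == -F);
      }
    }
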
} // End let AddedComplexity = 1
def : GCNPat <
@@ -1414,6 +1543,15 @@
>;
def : GCNPat <
+ (getDivergentFrag<fneg>.ret (v2f32 VReg_64:$src)),
+ (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | NEG_HI */, VReg_64:$src,
+ 11 /* OP_SEL_1 | NEG_LO | NEG_HI */, 0,
+ 0, 0, 0, 0, 0)
+> {
+ let SubtargetPredicate = HasPackedFP32Ops;
+}
+
+def : GCNPat <
(fcopysign f16:$src0, f16:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;
@@ -1532,9 +1670,16 @@
/********** Intrinsic Patterns **********/
/********** ================== **********/
+let OtherPredicates = [isNotGFX90APlus] in
// FIXME: Should use _e64 and select source modifiers.
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
+let OtherPredicates = [isGFX90APlus] in
+def : GCNPat <
+ (fpow f32:$src0, f32:$src1),
+ (V_EXP_F32_e32 (V_MUL_LEGACY_F32_e64 0, f32:$src1, SRCMODS.NONE, (V_LOG_F32_e32 f32:$src0), 0, 0))
+>;
+
def : GCNPat <
(i32 (sext i1:$src0)),
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
@@ -1793,6 +1938,8 @@
(i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;
+def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
+def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
@@ -1930,11 +2077,19 @@
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
-def : GCNPat <
- (i32 (AMDGPUfp16_zext f16:$src)),
- (COPY $src)
->;
+// Eliminate a zero extension from an fp16 operation if it already
+// zeros the high bits of the 32-bit register.
+//
+// This is complicated on gfx9+. Some instructions maintain the legacy
+// zeroing behavior, but others preserve the high bits. Some have a
+// control bit to change the behavior. We can't simply say with
+// certainty what the source behavior is without more context on how
+// the src is lowered. e.g. fptrunc + fma may be lowered to a
+// v_fma_mix* instruction, which may or may not zero the high bits.
+def : GCNPat<
+ (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
+ (COPY VSrc_b16:$src)>;
def : GCNPat <
(i32 (trunc i64:$a)),
@@ -2141,6 +2296,17 @@
SRCMODS.NONE, $src2)
>;
+let SubtargetPredicate = isGFX90APlus in
+def : GCNPat <
+ (fma (f64 (VOP3Mods0 f64:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (f64 (VOP3Mods f64:$src1, i32:$src1_modifiers)),
+ (f64 (VOP3NoMods f64:$src2))),
+ (V_FMAC_F64_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ SRCMODS.NONE, $src2, $clamp, $omod)
+>;
+
+// COPY is a workaround for a TableGen bug with the multiple outputs
+// that result from S_LSHL_B32's implicit scc def.
def : GCNPat <
(v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i16 16))
@@ -2207,9 +2373,13 @@
(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;
+def : GCNPat <
+ (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
+ (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
+ (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
+>;
} // End SubtargetPredicate = HasVOP3PInsts
-
def : GCNPat <
(v2f16 (scalar_to_vector f16:$src0)),
(COPY $src0)
@@ -2233,7 +2403,7 @@
def : GCNPat <
(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
timm:$bank_mask, timm:$bound_ctrl)),
- (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src,
+ (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src,
(as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
(as_i32timm $bank_mask),
(as_i1timm $bound_ctrl))
@@ -2242,7 +2412,7 @@
def : GCNPat <
(i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
timm:$bank_mask, timm:$bound_ctrl)),
- (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl),
+ (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl),
(as_i32timm $row_mask), (as_i32timm $bank_mask),
(as_i1timm $bound_ctrl))
>;
@@ -2573,6 +2743,24 @@
}
}
+def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_SMED3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_UMED3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to explicit
// operands.
@@ -2614,6 +2802,8 @@
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
new file mode 100644
index 0000000..d560b47
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -0,0 +1,231 @@
+//===-- SILateBranchLowering.cpp - Final preparation of branches ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass mainly lowers early terminate pseudo instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-late-branch-lowering"
+
+namespace {
+
+class SILateBranchLowering : public MachineFunctionPass {
+private:
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ MachineDominatorTree *MDT = nullptr;
+
+ void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);
+
+public:
+ static char ID;
+
+ unsigned MovOpc;
+ Register ExecReg;
+
+ SILateBranchLowering() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI Final Branch Preparation";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char SILateBranchLowering::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
+
+char &llvm::SILateBranchLoweringPassID = SILateBranchLowering::ID;
+
+static void generateEndPgm(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ const SIInstrInfo *TII, MachineFunction &MF) {
+ const Function &F = MF.getFunction();
+ bool IsPS = F.getCallingConv() == CallingConv::AMDGPU_PS;
+
+ // Check if hardware has been configured to expect color or depth exports.
+ bool HasExports =
+ AMDGPU::getHasColorExport(F) || AMDGPU::getHasDepthExport(F);
+
+ // Prior to GFX10, hardware always expects at least one export for PS.
+ bool MustExport = !AMDGPU::isGFX10Plus(TII->getSubtarget());
+
+ if (IsPS && (HasExports || MustExport)) {
+ // Generate "null export" if hardware is expecting PS to export.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
+ .addImm(AMDGPU::Exp::ET_NULL)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addImm(1) // vm
+ .addImm(0) // compr
+ .addImm(0); // en
+ }
+
+ // s_endpgm
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
+}
+
+static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineDominatorTree *MDT) {
+ MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);
+
+ // Update dominator tree
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+ for (MachineBasicBlock *Succ : SplitBB->successors()) {
+ DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
+ }
+ DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
+ MDT->getBase().applyUpdates(DTUpdates);
+}
+
+void SILateBranchLowering::earlyTerm(MachineInstr &MI,
+ MachineBasicBlock *EarlyExitBlock) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc DL = MI.getDebugLoc();
+
+ auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
+ .addMBB(EarlyExitBlock);
+ auto Next = std::next(MI.getIterator());
+
+ if (Next != MBB.end() && !Next->isTerminator())
+ splitBlock(MBB, *BranchMI, MDT);
+
+ MBB.addSuccessor(EarlyExitBlock);
+ MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
+}
+
+bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
+
+ MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+ SmallVector<MachineInstr *, 4> EarlyTermInstrs;
+ SmallVector<MachineInstr *, 1> EpilogInstrs;
+ bool MadeChange = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+ MachineInstr &MI = *I;
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_BRANCH:
+ // Optimize out branches to the next block.
+ // This only occurs in -O0 when BranchFolding is not executed.
+ if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
+ assert(&MI == &MBB.back());
+ MI.eraseFromParent();
+ MadeChange = true;
+ }
+ break;
+
+ case AMDGPU::SI_EARLY_TERMINATE_SCC0:
+ EarlyTermInstrs.push_back(&MI);
+ break;
+
+ case AMDGPU::SI_RETURN_TO_EPILOG:
+ EpilogInstrs.push_back(&MI);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+
+ // Lower any early exit branches first
+ if (!EarlyTermInstrs.empty()) {
+ MachineBasicBlock *EarlyExitBlock = MF.CreateMachineBasicBlock();
+ DebugLoc DL;
+
+ MF.insert(MF.end(), EarlyExitBlock);
+ BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc),
+ ExecReg)
+ .addImm(0);
+ generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, MF);
+
+ for (MachineInstr *Instr : EarlyTermInstrs) {
+ // Early termination in GS does nothing
+ if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
+ earlyTerm(*Instr, EarlyExitBlock);
+ Instr->eraseFromParent();
+ }
+
+ EarlyTermInstrs.clear();
+ MadeChange = true;
+ }
+
+ // Now check that return-to-epilog instructions occur at the function end
+ if (!EpilogInstrs.empty()) {
+ MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+ assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+ // If there are multiple returns to epilog then all will
+ // become jumps to a new empty end block.
+ if (EpilogInstrs.size() > 1) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ for (auto MI : EpilogInstrs) {
+ auto MBB = MI->getParent();
+ if (MBB == &MF.back() && MI == &MBB->back())
+ continue;
+
+ // SI_RETURN_TO_EPILOG is not the last instruction.
+ // Jump to empty block at function end.
+ if (!EmptyMBBAtEnd) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ MBB->addSuccessor(EmptyMBBAtEnd);
+ MDT->getBase().insertEdge(MBB, EmptyMBBAtEnd);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(EmptyMBBAtEnd);
+ MI->eraseFromParent();
+ MadeChange = true;
+ }
+
+ EpilogInstrs.clear();
+ }
+
+ return MadeChange;
+}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index b39420f..493c1ad 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -104,9 +104,7 @@
unsigned BaseOff;
unsigned DMask;
InstClassEnum InstClass;
- bool GLC;
- bool SLC;
- bool DLC;
+ unsigned CPol = 0;
bool UseST64;
int AddrIdx[MaxAddressRegs];
const MachineOperand *AddrReg[MaxAddressRegs];
@@ -199,6 +197,7 @@
const CombineInfo &Paired);
const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
const CombineInfo &Paired);
+ const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
SmallVectorImpl<MachineInstr *> &InstsToMove);
@@ -304,6 +303,16 @@
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
return 4;
+ case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ return 1;
+ case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return 2;
default:
return 0;
}
@@ -521,11 +530,7 @@
if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
Offset &= 0xffff;
} else if (InstClass != MIMG) {
- GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
- if (InstClass != S_BUFFER_LOAD_IMM) {
- SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
- }
- DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
+ CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
}
AddressRegs Regs = getRegs(Opc, TII);
@@ -675,10 +680,9 @@
return false;
// Check other optional immediate operands for equality.
- unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
- AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
- AMDGPU::OpName::da, AMDGPU::OpName::r128,
- AMDGPU::OpName::a16, AMDGPU::OpName::dlc};
+ unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
+ AMDGPU::OpName::unorm, AMDGPU::OpName::da,
+ AMDGPU::OpName::r128, AMDGPU::OpName::a16};
for (auto op : OperandsToMatch) {
int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
@@ -725,6 +729,16 @@
return NewFormatInfo->Format;
}
+// Return the value in the inclusive range [Lo,Hi] that is aligned to the
+// highest power of two. Note that the result is well defined for all inputs
+// including corner cases like:
+// - if Lo == Hi, return that value
+// - if Lo == 0, return 0 (even though the "- 1" below underflows)
+// - if Lo > Hi, return 0 (as if the range wrapped around)
+static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
+ return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
+}
+
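
A standalone sketch of mostAlignedValueInRange using C++20 <bit> in place of LLVM's maskLeadingOnes/countLeadingZeros helpers (the reimplementation and the sample values are illustrative only), exercising the corner cases listed in the comment above:

    #include <bit>
    #include <cassert>
    #include <cstdint>

    static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
      // Keep the common leading bit prefix of (Lo - 1) and Hi, plus one more
      // bit, taken from Hi; everything below that is cleared.
      unsigned KeepBits = std::countl_zero((Lo - 1) ^ Hi) + 1;
      uint32_t Mask = KeepBits >= 32 ? ~0u : ~0u << (32 - KeepBits);
      return Hi & Mask;
    }

    int main() {
      // 0x180 is the only multiple of 0x80 in [0x123, 0x1F0], and no multiple
      // of 0x100 fits, so 0x180 is the most aligned value in the range.
      assert(mostAlignedValueInRange(0x123, 0x1F0) == 0x180);
      assert(mostAlignedValueInRange(7, 7) == 7); // Lo == Hi
      assert(mostAlignedValueInRange(0, 5) == 0); // Lo == 0 ("- 1" underflows)
    }
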
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
const GCNSubtarget &STI,
CombineInfo &Paired,
@@ -764,20 +778,19 @@
return false;
}
- unsigned EltOffset0 = CI.Offset / CI.EltSize;
- unsigned EltOffset1 = Paired.Offset / CI.EltSize;
+ uint32_t EltOffset0 = CI.Offset / CI.EltSize;
+ uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
CI.UseST64 = false;
CI.BaseOff = 0;
- // Handle DS instructions.
+ // Handle all non-DS instructions.
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
return (EltOffset0 + CI.Width == EltOffset1 ||
EltOffset1 + Paired.Width == EltOffset0) &&
- CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
- (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
+ CI.CPol == Paired.CPol &&
+ (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
}
- // Handle SMEM and VMEM instructions.
// If the offset in elements doesn't fit in 8-bits, we might be able to use
// the stride 64 versions.
if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
@@ -800,22 +813,36 @@
}
// Try to shift base address to decrease offsets.
- unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
- CI.BaseOff = std::min(CI.Offset, Paired.Offset);
+ uint32_t Min = std::min(EltOffset0, EltOffset1);
+ uint32_t Max = std::max(EltOffset0, EltOffset1);
- if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
+ const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
+ if (((Max - Min) & ~Mask) == 0) {
if (Modify) {
- CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
- Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
+ // From the range of values we could use for BaseOff, choose the one that
+ // is aligned to the highest power of two, to maximise the chance that
+ // the same offset can be reused for other load/store pairs.
+ uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
+ // Copy the low bits of the offsets, so that when we adjust them by
+ // subtracting BaseOff they will be multiples of 64.
+ BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
+ CI.BaseOff = BaseOff * CI.EltSize;
+ CI.Offset = (EltOffset0 - BaseOff) / 64;
+ Paired.Offset = (EltOffset1 - BaseOff) / 64;
CI.UseST64 = true;
}
return true;
}
- if (isUInt<8>(OffsetDiff)) {
+ if (isUInt<8>(Max - Min)) {
if (Modify) {
- CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
- Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
+ // From the range of values we could use for BaseOff, choose the one that
+ // is aligned to the highest power of two, to maximise the chance that
+ // the same offset can be reused for other load/store pairs.
+ uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
+ CI.BaseOff = BaseOff * CI.EltSize;
+ CI.Offset = EltOffset0 - BaseOff;
+ Paired.Offset = EltOffset1 - BaseOff;
}
return true;
}
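
A worked example of the 8-bit re-basing branch above, with hypothetical element offsets (the concrete numbers are illustrative, not taken from the pass): for EltOffset0 = 0x1234 and EltOffset1 = 0x1250 the difference fits in 8 bits, mostAlignedValueInRange(Max - 0xff, Min) yields BaseOff = 0x1200 (the only 0x200-aligned value in [0x1151, 0x1234]), and both rebased offsets then fit in the instruction's 8-bit offset fields:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t EltOffset0 = 0x1234, EltOffset1 = 0x1250; // hypothetical offsets
      uint32_t Min = EltOffset0, Max = EltOffset1;       // Max - Min = 0x1c <= 0xff
      uint32_t BaseOff = 0x1200; // == mostAlignedValueInRange(Max - 0xff, Min)
      assert(BaseOff >= Max - 0xff && BaseOff <= Min);   // a legal common base
      assert(EltOffset0 - BaseOff == 0x34);              // both rebased offsets
      assert(EltOffset1 - BaseOff == 0x50);              // now fit in 8 bits
    }
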
@@ -841,6 +868,26 @@
}
}
+const TargetRegisterClass *
+SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
+ if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+ return TRI->getRegClassForReg(*MRI, Dst->getReg());
+ }
+ if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
+ return TRI->getRegClassForReg(*MRI, Src->getReg());
+ }
+ if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
+ return TRI->getRegClassForReg(*MRI, Src->getReg());
+ }
+ if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+ return TRI->getRegClassForReg(*MRI, Dst->getReg());
+ }
+ if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
+ return TRI->getRegClassForReg(*MRI, Src->getReg());
+ }
+ return nullptr;
+}
+
/// This function assumes that CI comes before Paired in a basic block.
bool SILoadStoreOptimizer::checkAndPrepareMerge(
CombineInfo &CI, CombineInfo &Paired,
@@ -873,6 +920,9 @@
DenseSet<Register> PhysRegUsesToMove;
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
+ const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
+ bool IsAGPR = TRI->hasAGPRs(DataRC);
+
MachineBasicBlock::iterator E = std::next(Paired.I);
MachineBasicBlock::iterator MBBI = std::next(CI.I);
MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
@@ -941,6 +991,17 @@
continue;
if (&*MBBI == &*Paired.I) {
+ if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
+ return false;
+ // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
+ // operands. However, we report that ds_write2 shall have
+ // only VGPR data so that machine copy propagation does not
+ // create an illegal instruction with mixed VGPR and AGPR sources.
+ // Consequently, if we create such an instruction the verifier
+ // will complain.
+ if (IsAGPR && CI.InstClass == DS_WRITE)
+ return false;
+
// We need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
@@ -1014,8 +1075,7 @@
const MCInstrDesc &Read2Desc = TII->get(Opc);
- const TargetRegisterClass *SuperRC =
- (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Register DestReg = MRI->createVirtualRegister(SuperRC);
DebugLoc DL = CI.I->getDebugLoc();
@@ -1229,8 +1289,7 @@
BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
.addImm(MergedOffset) // offset
- .addImm(CI.GLC) // glc
- .addImm(CI.DLC) // dlc
+ .addImm(CI.CPol) // cpol
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
@@ -1289,10 +1348,8 @@
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(MergedOffset) // offset
- .addImm(CI.GLC) // glc
- .addImm(CI.SLC) // slc
+ .addImm(CI.CPol) // cpol
.addImm(0) // tfe
- .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1356,10 +1413,8 @@
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(MergedOffset) // offset
.addImm(JoinedFormat) // format
- .addImm(CI.GLC) // glc
- .addImm(CI.SLC) // slc
+ .addImm(CI.CPol) // cpol
.addImm(0) // tfe
- .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(
combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1436,10 +1491,8 @@
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(std::min(CI.Offset, Paired.Offset)) // offset
.addImm(JoinedFormat) // format
- .addImm(CI.GLC) // glc
- .addImm(CI.SLC) // slc
+ .addImm(CI.CPol) // cpol
.addImm(0) // tfe
- .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(
combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1536,18 +1589,12 @@
case 16:
return &AMDGPU::SGPR_512RegClass;
}
- } else {
- switch (CI.Width + Paired.Width) {
- default:
- return nullptr;
- case 2:
- return &AMDGPU::VReg_64RegClass;
- case 3:
- return &AMDGPU::VReg_96RegClass;
- case 4:
- return &AMDGPU::VReg_128RegClass;
- }
}
+
+ unsigned BitWidth = 32 * (CI.Width + Paired.Width);
+ return TRI->hasAGPRs(getDataRegClass(*CI.I))
+ ? TRI->getAGPRClassForBitWidth(BitWidth)
+ : TRI->getVGPRClassForBitWidth(BitWidth);
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
@@ -1596,10 +1643,8 @@
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(std::min(CI.Offset, Paired.Offset)) // offset
- .addImm(CI.GLC) // glc
- .addImm(CI.SLC) // slc
+ .addImm(CI.CPol) // cpol
.addImm(0) // tfe
- .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1671,7 +1716,7 @@
(void)HiHalf;
LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
- Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
MachineInstr *FullBase =
BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 5839e59..0f2836e 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -72,10 +72,9 @@
MachineRegisterInfo *MRI = nullptr;
SetVector<MachineInstr*> LoweredEndCf;
DenseSet<Register> LoweredIf;
- SmallSet<MachineInstr *, 16> NeedsKillCleanup;
+ SmallSet<MachineBasicBlock *, 4> KillBlocks;
const TargetRegisterClass *BoolRC = nullptr;
- bool InsertKillCleanups;
unsigned AndOpc;
unsigned OrOpc;
unsigned XorOpc;
@@ -86,6 +85,8 @@
unsigned OrSaveExecOpc;
unsigned Exec;
+ bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);
+
void emitIf(MachineInstr &MI);
void emitElse(MachineInstr &MI);
void emitIfBreak(MachineInstr &MI);
@@ -163,8 +164,8 @@
char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
-static bool hasKill(const MachineBasicBlock *Begin,
- const MachineBasicBlock *End, const SIInstrInfo *TII) {
+bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
+ const MachineBasicBlock *End) {
DenseSet<const MachineBasicBlock*> Visited;
SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());
@@ -173,9 +174,8 @@
if (MBB == End || !Visited.insert(MBB).second)
continue;
- for (auto &Term : MBB->terminators())
- if (TII->isKillTerminator(Term.getOpcode()))
- return true;
+ if (KillBlocks.contains(MBB))
+ return true;
Worklist.append(MBB->succ_begin(), MBB->succ_end());
}
@@ -211,32 +211,11 @@
// just cleared bits.
bool SimpleIf = isSimpleIf(MI, MRI);
- if (InsertKillCleanups) {
- // Check for SI_KILL_*_TERMINATOR on full path of control flow and
- // flag the associated SI_END_CF for insertion of a kill cleanup.
- auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
- while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
- assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
- assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
- MachineOperand &NextExec = UseMI->getOperand(0);
- Register NextExecReg = NextExec.getReg();
- if (NextExec.isDead()) {
- assert(!SimpleIf);
- break;
- }
- UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
- }
- if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
- if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
- NeedsKillCleanup.insert(&*UseMI);
- SimpleIf = false;
- }
- }
- } else if (SimpleIf) {
+ if (SimpleIf) {
// Check for SI_KILL_*_TERMINATOR on path from if to endif.
// if there is any such terminator simplifications are not safe.
auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
- SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
+ SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
}
// Add an implicit def of exec to discourage scheduling VALU after this which
@@ -451,8 +430,6 @@
auto E = B->end();
for ( ; It != E; ++It) {
- if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
- continue;
if (TII->mayReadEXEC(*MRI, *It))
break;
}
@@ -505,18 +482,8 @@
LoweredEndCf.insert(NewMI);
- // If this ends control flow which contains kills (as flagged in emitIf)
- // then insert an SI_KILL_CLEANUP immediately following the exec mask
- // manipulation. This can be lowered to early termination if appropriate.
- MachineInstr *CleanUpMI = nullptr;
- if (NeedsKillCleanup.count(&MI))
- CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
-
- if (LIS) {
+ if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
- if (CleanUpMI)
- LIS->InsertMachineInstrInMaps(*CleanUpMI);
- }
MI.eraseFromParent();
@@ -633,6 +600,10 @@
emitLoop(MI);
break;
+ case AMDGPU::SI_WATERFALL_LOOP:
+ MI.setDesc(TII->get(AMDGPU::S_CBRANCH_EXECNZ));
+ break;
+
case AMDGPU::SI_END_CF:
SplitBB = emitEndCf(MI);
break;
@@ -811,8 +782,6 @@
LIS = getAnalysisIfAvailable<LiveIntervals>();
MRI = &MF.getRegInfo();
BoolRC = TRI->getBoolRC();
- InsertKillCleanups =
- MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
if (ST.isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
@@ -836,7 +805,27 @@
Exec = AMDGPU::EXEC;
}
- SmallVector<MachineInstr *, 32> Worklist;
+ // Compute set of blocks with kills
+ const bool CanDemote =
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
+ for (auto &MBB : MF) {
+ bool IsKillBlock = false;
+ for (auto &Term : MBB.terminators()) {
+ if (TII->isKillTerminator(Term.getOpcode())) {
+ KillBlocks.insert(&MBB);
+ IsKillBlock = true;
+ break;
+ }
+ }
+ if (CanDemote && !IsKillBlock) {
+ for (auto &MI : MBB) {
+ if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
+ KillBlocks.insert(&MBB);
+ break;
+ }
+ }
+ }
+ }
MachineFunction::iterator NextBB;
for (MachineFunction::iterator BI = MF.begin();
@@ -853,18 +842,12 @@
switch (MI.getOpcode()) {
case AMDGPU::SI_IF:
- SplitMBB = process(MI);
- break;
-
case AMDGPU::SI_ELSE:
case AMDGPU::SI_IF_BREAK:
+ case AMDGPU::SI_WATERFALL_LOOP:
case AMDGPU::SI_LOOP:
case AMDGPU::SI_END_CF:
- // Only build worklist if SI_IF instructions must be processed first.
- if (InsertKillCleanups)
- Worklist.push_back(&MI);
- else
- SplitMBB = process(MI);
+ SplitMBB = process(MI);
break;
// FIXME: find a better place for this
@@ -886,14 +869,11 @@
}
}
- for (MachineInstr *MI : Worklist)
- process(*MI);
-
optimizeEndCf();
LoweredEndCf.clear();
LoweredIf.clear();
- NeedsKillCleanup.clear();
+ KillBlocks.clear();
return true;
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 9570680..672266f 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -598,6 +598,11 @@
MachineBasicBlock *PostDomBound =
PDT->findNearestCommonDominator(DomBlocks);
+
+ // FIXME: This fails to find irreducible cycles. If we have a def (other
+ // than a constant) in a pair of blocks that end up looping back to each
+ // other, it will be mishandled. Due to structurization this shouldn't occur
+ // in practice.
unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
SSAUpdater.Initialize(DstReg);
@@ -732,6 +737,9 @@
const MachineInstr *MI;
for (;;) {
MI = MRI->getUniqueVRegDef(Reg);
+ if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
+ return true;
+
if (MI->getOpcode() != AMDGPU::COPY)
break;
@@ -808,9 +816,9 @@
MachineBasicBlock::iterator I,
const DebugLoc &DL, unsigned DstReg,
unsigned PrevReg, unsigned CurReg) {
- bool PrevVal;
+ bool PrevVal = false;
bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
- bool CurVal;
+ bool CurVal = false;
bool CurConstant = isConstantLaneMask(CurReg, CurVal);
if (PrevConstant && CurConstant) {
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 3040505..38b9d85 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -31,12 +31,6 @@
namespace {
-static cl::opt<bool> EnableSpillVGPRToAGPR(
- "amdgpu-spill-vgpr-to-agpr",
- cl::desc("Enable spilling VGPRs to AGPRs"),
- cl::ReallyHidden,
- cl::init(true));
-
class SILowerSGPRSpills : public MachineFunctionPass {
private:
const SIRegisterInfo *TRI = nullptr;
@@ -71,6 +65,7 @@
INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE,
"SI lower SGPR spill instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE,
"SI lower SGPR spill instructions", false, false)
@@ -88,6 +83,8 @@
MachineBasicBlock::iterator I = SaveBlock.begin();
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
for (const CalleeSavedInfo &CS : CSI) {
// Insert the spill to the stack frame.
MCRegister Reg = CS.getReg();
@@ -96,8 +93,13 @@
const TargetRegisterClass *RC =
TRI->getMinimalPhysRegClass(Reg, MVT::i32);
- TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
- TRI);
+ // If this value was already livein, we probably have a direct use of the
+ // incoming register value, so don't kill at the spill point. This happens
+ // since we pass some special inputs (workgroup IDs) in the callee saved
+ // range.
+ const bool IsLiveIn = MRI.isLiveIn(Reg);
+ TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(),
+ RC, TRI);
if (LIS) {
assert(std::distance(MIS.begin(), I) == 1);
@@ -255,13 +257,10 @@
if (!LowestAvailableVGPR)
LowestAvailableVGPR = PreReservedVGPR;
- const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- Optional<int> FI;
- // Check if we are reserving a CSR. Create a stack object for a possible spill
- // in the function prologue.
- if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR))
- FI = FrameInfo.CreateSpillStackObject(4, Align(4));
+ // Create a stack object for a possible spill in the function prologue.
+ // Note: non-CSR VGPRs also need this, as we may overwrite inactive lanes.
+ Optional<int> FI = FrameInfo.CreateSpillStackObject(4, Align(4));
// Find saved info about the pre-reserved register.
const auto *ReservedVGPRInfoItr =
@@ -291,6 +290,7 @@
TRI = &TII->getRegisterInfo();
VRM = getAnalysisIfAvailable<VirtRegMap>();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
assert(SaveBlocks.empty() && RestoreBlocks.empty());
@@ -300,29 +300,28 @@
bool HasCSRs = spillCalleeSavedRegs(MF);
MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
if (!MFI.hasStackObjects() && !HasCSRs) {
SaveBlocks.clear();
RestoreBlocks.clear();
+ if (FuncInfo->VGPRReservedForSGPRSpill) {
+ // Free the reserved VGPR for later possible use by frame lowering.
+ FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
+ MRI.freezeReservedRegs(MF);
+ }
return false;
}
- MachineRegisterInfo &MRI = MF.getRegInfo();
- SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
- && EnableSpillVGPRToAGPR;
-
bool MadeChange = false;
-
- const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts();
- std::unique_ptr<RegScavenger> RS;
-
bool NewReservedRegs = false;
// TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
// handled as SpilledToReg in regular PrologEpilogInserter.
const bool HasSGPRSpillToVGPR = TRI->spillSGPRToVGPR() &&
(HasCSRs || FuncInfo->hasSpilledSGPRs());
- if (HasSGPRSpillToVGPR || SpillVGPRToAGPR) {
+ if (HasSGPRSpillToVGPR) {
// Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
// are spilled to VGPRs, in which case we can eliminate the stack usage.
//
@@ -331,33 +330,15 @@
lowerShiftReservedVGPR(MF, ST);
+ // To track the spill frame indices handled in this pass.
+ BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
+
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator Next;
for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
MachineInstr &MI = *I;
Next = std::next(I);
- if (SpillToAGPR && TII->isVGPRSpill(MI)) {
- // Try to eliminate stack used by VGPR spills before frame
- // finalization.
- unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::vaddr);
- int FI = MI.getOperand(FIOp).getIndex();
- Register VReg =
- TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
- if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
- TRI->isAGPR(MRI, VReg))) {
- NewReservedRegs = true;
- if (!RS)
- RS.reset(new RegScavenger());
-
- // FIXME: change to enterBasicBlockEnd()
- RS->enterBasicBlock(MBB);
- TRI->eliminateFrameIndex(MI, 0, FIOp, RS.get());
- continue;
- }
- }
-
if (!TII->isSGPRSpill(MI))
continue;
@@ -365,24 +346,32 @@
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
NewReservedRegs = true;
- bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr);
+ bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI,
+ nullptr, LIS);
(void)Spilled;
assert(Spilled && "failed to spill SGPR to VGPR when allocated");
+ SpillFIs.set(FI);
}
}
}
+ // FIXME: Adding to live-ins is redundant with reserving registers.
for (MachineBasicBlock &MBB : MF) {
for (auto SSpill : FuncInfo->getSGPRSpillVGPRs())
MBB.addLiveIn(SSpill.VGPR);
-
- for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
- MBB.addLiveIn(Reg);
-
- for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
- MBB.addLiveIn(Reg);
-
MBB.sortUniqueLiveIns();
+
+ // FIXME: The dead frame indices are replaced with a null register from
+ // the debug value instructions. We should instead update them with the
+ // correct register value. However, it is not clear that the register value
+ // alone is adequate to lower the DIExpression. It should be worked out later.
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
+ SpillFIs[MI.getOperand(0).getIndex()]) {
+ MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
+ MI.getOperand(0).setIsDebug();
+ }
+ }
}
MadeChange = true;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 9a0cdc7..85cfe36 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -8,6 +8,22 @@
#include "SIMachineFunctionInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "AMDGPUSubtarget.h"
+#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include <cassert>
+#include <vector>
#define MAX_LANES 64
@@ -49,6 +65,7 @@
// Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't
// have any calls.
const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ CC != CallingConv::AMDGPU_Gfx &&
(!isEntryFunction() || HasCalls);
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
@@ -61,6 +78,9 @@
}
if (!isEntryFunction()) {
+ if (UseFixedABI)
+ ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
+
// TODO: Pick a high register, and shift down, similar to a kernel.
FrameOffsetReg = AMDGPU::SGPR33;
StackPtrOffsetReg = AMDGPU::SGPR32;
@@ -119,13 +139,15 @@
if (WorkItemIDZ)
WorkItemIDY = true;
- PrivateSegmentWaveByteOffset = true;
+ if (!ST.flatScratchIsArchitected()) {
+ PrivateSegmentWaveByteOffset = true;
- // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
- if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
- (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
- ArgInfo.PrivateSegmentWaveByteOffset =
- ArgDescriptor::createRegister(AMDGPU::SGPR5);
+ // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
+ (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
+ ArgInfo.PrivateSegmentWaveByteOffset =
+ ArgDescriptor::createRegister(AMDGPU::SGPR5);
+ }
}
bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
@@ -156,13 +178,14 @@
if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
KernargSegmentPtr = true;
+ // TODO: This could be refined a lot. The attribute is a poor way of
+ // detecting calls or stack objects that may require it before argument
+ // lowering.
if (ST.hasFlatAddressSpace() && isEntryFunction() &&
- (isAmdHsaOrMesa || ST.enableFlatScratch())) {
- // TODO: This could be refined a lot. The attribute is a poor way of
- // detecting calls or stack objects that may require it before argument
- // lowering.
- if (HasCalls || HasStackObjects || ST.enableFlatScratch())
- FlatScratchInit = true;
+ (isAmdHsaOrMesa || ST.enableFlatScratch()) &&
+ (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
+ !ST.flatScratchIsArchitected()) {
+ FlatScratchInit = true;
}
Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
@@ -285,8 +308,6 @@
assert(Size >= 4 && "invalid sgpr spill size");
assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
- const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
-
// Make sure to handle the case where a wide SGPR spill may span between two
// VGPRs.
for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
@@ -309,16 +330,24 @@
// partially spill the SGPR to VGPRs.
SGPRToVGPRSpills.erase(FI);
NumVGPRSpillLanes -= I;
+
+#if 0
+ DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(),
+ "VGPRs for SGPR spilling",
+ 0, DS_Error);
+ MF.getFunction().getContext().diagnose(DiagOutOfRegs);
+#endif
return false;
}
- Optional<int> CSRSpillFI;
- if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
- isCalleeSavedReg(CSRegs, LaneVGPR)) {
- CSRSpillFI = FrameInfo.CreateSpillStackObject(4, Align(4));
+ Optional<int> SpillFI;
+ // We need to preserve inactive lanes, so always save, even caller-save
+ // registers.
+ if (!isEntryFunction()) {
+ SpillFI = FrameInfo.CreateSpillStackObject(4, Align(4));
}
- SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
+ SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, SpillFI));
// Add this register as live-in to all blocks to avoid machine verifier
// complaining about use of an undefined physical register.
@@ -344,7 +373,7 @@
MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true);
if (LaneVGPR == Register())
return false;
- SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, None));
+ SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, None));
FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR;
return true;
}
@@ -437,6 +466,21 @@
}
}
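+// Lazily create and cache the emergency scavenge frame index. Entry functions
+// use a fixed object at offset 0; other functions create an ordinary spill
+// slot sized and aligned for a 32-bit SGPR.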
+int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
+ const SIRegisterInfo &TRI) {
+ if (ScavengeFI)
+ return *ScavengeFI;
+ if (isEntryFunction()) {
+ ScavengeFI = MFI.CreateFixedObject(
+ TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
+ } else {
+ ScavengeFI = MFI.CreateStackObject(
+ TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
+ TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
+ }
+ return *ScavengeFI;
+}
+
MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
return AMDGPU::SGPR0 + NumUserSGPRs;
@@ -529,7 +573,8 @@
}
yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
- const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI)
+ const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
+ const llvm::MachineFunction &MF)
: ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
@@ -543,6 +588,9 @@
FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), Mode(MFI.getMode()) {
+ auto SFI = MFI.getOptionalScavengeFI();
+ if (SFI)
+ ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
}
void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
@@ -550,7 +598,8 @@
}
bool SIMachineFunctionInfo::initializeBaseYamlFields(
- const yaml::SIMachineFunctionInfo &YamlMFI) {
+ const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
+ PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign);
LDSSize = YamlMFI.LDSSize;
@@ -563,6 +612,24 @@
WaveLimiter = YamlMFI.WaveLimiter;
HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
+
+ if (YamlMFI.ScavengeFI) {
+ auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
+ if (!FIOrErr) {
+ // Create a diagnostic for the frame index.
+ const MemoryBuffer &Buffer =
+ *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
+
+ Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
+ SourceMgr::DK_Error, toString(FIOrErr.takeError()),
+ "", None, None);
+ SourceRange = YamlMFI.ScavengeFI->SourceRange;
+ return true;
+ }
+ ScavengeFI = *FIOrErr;
+ } else {
+ ScavengeFI = None;
+ }
return false;
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/src/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 35fb431..fb6d4f8 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -17,6 +17,7 @@
#include "AMDGPUMachineFunction.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/raw_ostream.h"
@@ -288,10 +289,12 @@
Optional<SIArgumentInfo> ArgInfo;
SIMode Mode;
+ Optional<FrameIndex> ScavengeFI;
SIMachineFunctionInfo() = default;
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
- const TargetRegisterInfo &TRI);
+ const TargetRegisterInfo &TRI,
+ const llvm::MachineFunction &MF);
void mappingImpl(yaml::IO &YamlIO) override;
~SIMachineFunctionInfo() = default;
@@ -321,6 +324,7 @@
YamlIO.mapOptional("highBitsOf32BitAddress",
MFI.HighBitsOf32BitAddress, 0u);
YamlIO.mapOptional("occupancy", MFI.Occupancy, 0);
+ YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI);
}
};
@@ -445,15 +449,15 @@
bool hasReg() { return VGPR != 0;}
};
- struct SGPRSpillVGPRCSR {
+ struct SGPRSpillVGPR {
// VGPR used for SGPR spills
Register VGPR;
- // If the VGPR is a CSR, the stack slot used to save/restore it in the
- // prolog/epilog.
+ // If the VGPR is used for SGPR spills in a non-entrypoint function, the
+ // stack slot used to save/restore it in the prolog/epilog.
Optional<int> FI;
- SGPRSpillVGPRCSR(Register V, Optional<int> F) : VGPR(V), FI(F) {}
+ SGPRSpillVGPR(Register V, Optional<int> F) : VGPR(V), FI(F) {}
};
struct VGPRSpillToAGPR {
@@ -461,16 +465,16 @@
bool FullyAllocated = false;
};
- SparseBitVector<> WWMReservedRegs;
-
- void ReserveWWMRegister(Register Reg) { WWMReservedRegs.set(Reg); }
+ // Map WWM VGPR to a stack slot that is used to save/restore it in the
+ // prolog/epilog.
+ MapVector<Register, Optional<int>> WWMReservedRegs;
private:
// Track VGPR + wave index for each subregister of the SGPR spilled to
// frameindex key.
DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
unsigned NumVGPRSpillLanes = 0;
- SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
+ SmallVector<SGPRSpillVGPR, 2> SpillVGPRs;
DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills;
@@ -480,6 +484,10 @@
// VGPRs used for AGPR spills.
SmallVector<MCPhysReg, 32> SpillVGPR;
+ // Emergency stack slot. Sometimes, we create this before finalizing the stack
+ // frame, so save it here and add it to the RegScavenger later.
+ Optional<int> ScavengeFI;
+
public: // FIXME
/// If this is set, an SGPR used for save/restore of the register used for the
/// frame pointer.
@@ -497,7 +505,14 @@
public:
SIMachineFunctionInfo(const MachineFunction &MF);
- bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI);
+ bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI,
+ const MachineFunction &MF,
+ PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error, SMRange &SourceRange);
+
+ void reserveWWMRegister(Register Reg, Optional<int> FI) {
+ WWMReservedRegs.insert(std::make_pair(Reg, FI));
+ }
ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
auto I = SGPRToVGPRSpills.find(FrameIndex);
@@ -505,9 +520,7 @@
ArrayRef<SpilledReg>() : makeArrayRef(I->second);
}
- ArrayRef<SGPRSpillVGPRCSR> getSGPRSpillVGPRs() const {
- return SpillVGPRs;
- }
+ ArrayRef<SGPRSpillVGPR> getSGPRSpillVGPRs() const { return SpillVGPRs; }
void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) {
SpillVGPRs[Index].VGPR = NewVGPR;
@@ -538,6 +551,9 @@
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
void removeDeadFrameIndices(MachineFrameInfo &MFI);
+ int getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI);
+ Optional<int> getOptionalScavengeFI() const { return ScavengeFI; }
+
bool hasCalculatedTID() const { return TIDReg != 0; };
Register getTIDReg() const { return TIDReg; };
void setTIDReg(Register Reg) { TIDReg = Reg; }
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 3caa75e..71be73c 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -84,22 +84,6 @@
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
-/// Sets named bit \p BitName to "true" if present in instruction \p MI.
-/// \returns Returns true if \p MI is modified, false otherwise.
-template <uint16_t BitName>
-bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
- int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
- if (BitIdx == -1)
- return false;
-
- MachineOperand &Bit = MI->getOperand(BitIdx);
- if (Bit.getImm() != 0)
- return false;
-
- Bit.setImm(1);
- return true;
-}
-
class SIMemOpInfo final {
private:
@@ -129,12 +113,43 @@
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
IsVolatile(IsVolatile),
IsNonTemporal(IsNonTemporal) {
+
+ if (Ordering == AtomicOrdering::NotAtomic) {
+ assert(Scope == SIAtomicScope::NONE &&
+ OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
+ !IsCrossAddressSpaceOrdering &&
+ FailureOrdering == AtomicOrdering::NotAtomic);
+ return;
+ }
+
+ assert(Scope != SIAtomicScope::NONE &&
+ (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
+ SIAtomicAddrSpace::NONE &&
+ (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
+ SIAtomicAddrSpace::NONE &&
+ !isStrongerThan(FailureOrdering, Ordering));
+
// There is also no cross address space ordering if the ordering
// address space is the same as the instruction address space and
// only contains a single address space.
if ((OrderingAddrSpace == InstrAddrSpace) &&
isPowerOf2_32(uint32_t(InstrAddrSpace)))
this->IsCrossAddressSpaceOrdering = false;
+
+ // Limit the scope to the maximum supported by the instruction's address
+ // spaces.
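+ // For example, an access that only touches the scratch address space can
+ // never be observed by another thread, so its scope can safely be reduced to
+ // SINGLETHREAD.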
+ if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
+ SIAtomicAddrSpace::NONE) {
+ this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
+ } else if ((InstrAddrSpace &
+ ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
+ SIAtomicAddrSpace::NONE) {
+ this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
+ } else if ((InstrAddrSpace &
+ ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
+ SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
+ this->Scope = std::min(Scope, SIAtomicScope::AGENT);
+ }
}
public:
@@ -202,12 +217,12 @@
void reportUnsupported(const MachineBasicBlock::iterator &MI,
const char *Msg) const;
- /// Inspects the target synchonization scope \p SSID and determines
+ /// Inspects the target synchronization scope \p SSID and determines
/// the SI atomic scope it corresponds to, the address spaces it
/// covers, and whether the memory ordering applies between address
/// spaces.
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
- toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;
+ toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
/// \return Return a bit set of the address spaces accessed by \p AS.
SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
@@ -257,6 +272,11 @@
SICacheControl(const GCNSubtarget &ST);
+ /// Sets the cache policy bit \p Bit to "true" if present in instruction \p MI.
+ /// \returns Returns true if \p MI is modified, false otherwise.
+ bool enableNamedBit(const MachineBasicBlock::iterator MI,
+ AMDGPU::CPol::CPol Bit) const;
+
public:
/// Create a cache control for the subtarget \p ST.
@@ -269,6 +289,20 @@
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const = 0;
+ /// Update \p MI memory store instruction to bypass any caches up to
+ /// the \p Scope memory scope for address spaces \p
+ /// AddrSpace. Return true iff the instruction was modified.
+ virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const = 0;
+
+ /// Update \p MI memory read-modify-write instruction to bypass any caches up
+ /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
+ /// iff the instruction was modified.
+ virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const = 0;
+
/// Update \p MI memory instruction of kind \p Op associated with address
/// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
/// true iff the instruction was modified.
@@ -324,13 +358,13 @@
/// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit<AMDGPU::OpName::glc>(MI);
+ return enableNamedBit(MI, AMDGPU::CPol::GLC);
}
/// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit<AMDGPU::OpName::slc>(MI);
+ return enableNamedBit(MI, AMDGPU::CPol::SLC);
}
public:
@@ -341,6 +375,14 @@
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
+ bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
@@ -377,13 +419,54 @@
};
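+/// Cache control for gfx90a targets. Extends the GFX7 handling with the L2
+/// writeback/invalidate instructions (BUFFER_WBL2 / BUFFER_INVL2) and widens
+/// scopes where needed when threadgroup split mode is enabled.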
+class SIGfx90ACacheControl : public SIGfx7CacheControl {
+public:
+
+ SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile,
+ bool IsNonTemporal) const override;
+
+ bool insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+
+ bool insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+ bool insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+};
+
class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:
/// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit<AMDGPU::OpName::dlc>(MI);
+ return enableNamedBit(MI, AMDGPU::CPol::DLC);
}
public:
@@ -424,7 +507,7 @@
/// Return true iff instruction \p MI is an atomic instruction that
/// returns a result.
bool isAtomicRet(const MachineInstr &MI) const {
- return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
+ return SIInstrInfo::isAtomicRet(MI);
}
/// Removes all processed atomic pseudo instructions from the current
@@ -476,7 +559,7 @@
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
- SIAtomicAddrSpace InstrScope) const {
+ SIAtomicAddrSpace InstrAddrSpace) const {
if (SSID == SyncScope::System)
return std::make_tuple(SIAtomicScope::SYSTEM,
SIAtomicAddrSpace::ATOMIC,
@@ -499,23 +582,23 @@
true);
if (SSID == MMI->getSystemOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SYSTEM,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getAgentOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::AGENT,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WORKGROUP,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WAVEFRONT,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SINGLETHREAD,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
return None;
}
@@ -557,7 +640,7 @@
IsVolatile |= MMO->isVolatile();
InstrAddrSpace |=
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
- AtomicOrdering OpOrdering = MMO->getOrdering();
+ AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
if (OpOrdering != AtomicOrdering::NotAtomic) {
const auto &IsSyncScopeInclusion =
MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
@@ -568,9 +651,9 @@
}
SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
- Ordering =
- isStrongerThan(Ordering, OpOrdering) ?
- Ordering : MMO->getOrdering();
+ Ordering = isStrongerThan(Ordering, OpOrdering)
+ ? Ordering
+ : MMO->getSuccessOrdering();
assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
FailureOrdering =
@@ -591,7 +674,8 @@
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
ScopeOrNone.getValue();
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
- ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
+ ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
reportUnsupported(MI, "Unsupported atomic address space");
return None;
}
@@ -659,7 +743,7 @@
}
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
- IsCrossAddressSpaceOrdering);
+ IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
}
Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -682,9 +766,21 @@
InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}
+bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
+ AMDGPU::CPol::CPol Bit) const {
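+ // The cache policy bits (GLC/SLC/DLC) are carried in a single "cpol"
+ // immediate operand; OR the requested bit into it if the instruction has
+ // such an operand.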
+ MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
+ if (!CPol)
+ return false;
+
+ CPol->setImm(CPol->getImm() | Bit);
+ return true;
+}
+
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
+ if (ST.hasGFX90AInsts())
+ return std::make_unique<SIGfx90ACacheControl>(ST);
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
@@ -725,6 +821,32 @@
return Changed;
}
+bool SIGfx6CacheControl::enableStoreCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(!MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ /// The L1 cache is write-through so does not need to be bypassed. There is no
+ /// bypass control for the L2 cache at the ISA level.
+
+ return Changed;
+}
+
+bool SIGfx6CacheControl::enableRMWCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ /// The L1 cache is write-through so does not need to be bypassed. There is no
+ /// bypass control for the L2 cache at the ISA level.
+
+ return Changed;
+}
+
bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile, bool IsNonTemporal) const {
@@ -968,6 +1090,292 @@
return Changed;
}
+bool SIGfx90ACacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed.
+ if (ST.isTgSplitEnabled())
+ Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx90ACacheControl::enableStoreCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(!MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ /// Do not set glc for store atomic operations as they implicitly write
+ /// through the L1 cache.
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass. Store atomics implicitly write through the L1
+ // cache.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx90ACacheControl::enableRMWCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ /// Do not set glc for RMW atomic operations as they implicitly bypass
+ /// the L1 cache, and the glc bit is instead used to indicate if they are
+ /// return or no-return.
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ return Changed;
+}
+
+bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
+ MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile, bool IsNonTemporal) const {
+ // Only handle load and store, not atomic read-modify-write instructions. The
+ // latter use glc to indicate if the atomic returns a result and so must not
+ // be used for cache control.
+ assert(MI->mayLoad() ^ MI->mayStore());
+
+ // Only update load and store, not LLVM IR atomic read-modify-write
+ // instructions. The latter are always marked as volatile, so we cannot
+ // sensibly handle them without pessimizing all atomics. Also, they do not
+ // support the nontemporal attribute.
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+ bool Changed = false;
+
+ if (IsVolatile) {
+ if (Op == SIMemOp::LOAD) {
+ Changed |= enableGLCBit(MI);
+ }
+
+ // Ensure operation has completed at system scope to cause all volatile
+ // operations to be visible outside the program in a global order. Do not
+ // request cross address space as only the global address space can be
+ // observable outside the program, so no need to cause a waitcnt for LDS
+ // address space operations.
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+ Position::AFTER);
+
+ return Changed;
+ }
+
+ if (IsNonTemporal) {
+ // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
+ Changed |= enableGLCBit(MI);
+ Changed |= enableSLCBit(MI);
+ return Changed;
+ }
+
+ return Changed;
+}
+
+bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ if (ST.isTgSplitEnabled()) {
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to wait for global or GDS memory operations
+ // to complete to ensure they are visible to waves in the other CUs.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are on
+ // the same CU, so no need to wait for global memory as all waves in the
+ // work-group access the same L1, nor wait for GDS as accesses are ordered
+ // on a CU.
+ if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
+ SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
+ (Scope == SIAtomicScope::WORKGROUP)) {
+ // Same as GFX7 using agent scope.
+ Scope = SIAtomicScope::AGENT;
+ }
+ // In threadgroup split mode LDS cannot be allocated so no need to wait for
+ // LDS memory operations.
+ AddrSpace &= ~SIAtomicAddrSpace::LDS;
+ }
+ return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
+ IsCrossAddrSpaceOrdering, Pos);
+}
+
+bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
+ // CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
+ // remove any cache lines of earlier writes by the same wave and ensures
+ // later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ // Same as GFX7.
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to invalidate the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be invalidated.
+ if (ST.isTgSplitEnabled()) {
+ // Same as GFX7 using agent scope.
+ Scope = SIAtomicScope::AGENT;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Same as GFX7.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
+
+ return Changed;
+}
+
+bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+ // to initiate writeback of any dirty cache lines of earlier writes by the
+ // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
+ // writeback has completed.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
+ // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
+ // vmcnt(0)" needed by the "BUFFER_WBL2".
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Same as GFX7.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ Changed |=
+ SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
+ IsCrossAddrSpaceOrdering, Pos);
+
+ return Changed;
+}
+
bool SIGfx10CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -1292,6 +1700,13 @@
bool Changed = false;
if (MOI.isAtomic()) {
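+ // For atomic stores with monotonic or stronger ordering, give the target
+ // cache control a chance to bypass caches up to the requested scope.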
+ if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
+ MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace());
+ }
+
if (MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertRelease(MI, MOI.getScope(),
@@ -1336,7 +1751,7 @@
Position::BEFORE);
// TODO: If both release and invalidate are happening they could be combined
- // to use the single "BUFFER_WBL2" instruction. This could be done by
+ // to use the single "BUFFER_WBINV*" instruction. This could be done by
// reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
// track cache invalidate and write back instructions.
@@ -1360,6 +1775,15 @@
bool Changed = false;
if (MOI.isAtomic()) {
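+ // Atomic read-modify-write operations likewise get a chance to bypass
+ // caches up to the requested scope, for any atomic ordering.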
+ if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
+ MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
+ MOI.getInstrAddrSpace());
+ }
+
if (MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
@@ -1375,7 +1799,7 @@
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
Changed |= CC->insertWait(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
+ MOI.getInstrAddrSpace(),
isAtomicRet(*MI) ? SIMemOp::LOAD :
SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
@@ -1401,7 +1825,7 @@
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
// Unbundle instructions after the post-RA scheduler.
- if (MI->isBundle()) {
+ if (MI->isBundle() && MI->mayLoadOrStore()) {
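+ // Only bundles that access memory need to be expanded here; other bundles
+ // are left intact.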
MachineBasicBlock::instr_iterator II(MI->getIterator());
for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
I != E && I->isBundledWithPred(); ++I) {
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 54f2091..b9c839f 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -220,6 +220,18 @@
MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
return true;
}
+ case AMDGPU::S_AND_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_AND_B64));
+ return true;
+ }
+ case AMDGPU::S_AND_B32_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_AND_B32));
+ return true;
+ }
default:
return false;
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 162e966..5f89f38 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -416,15 +416,20 @@
continue;
Register SavedExec = I->getOperand(0).getReg();
- if (SavedExec.isVirtual() && MRI->hasOneNonDBGUse(SavedExec) &&
- MRI->use_instr_nodbg_begin(SavedExec)->getParent() ==
- I->getParent()) {
- LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n');
- LIS->RemoveMachineInstrFromMaps(*I);
- I->eraseFromParent();
- MRI->replaceRegWith(SavedExec, ExecReg);
- LIS->removeInterval(SavedExec);
- Changed = true;
+ if (SavedExec.isVirtual() && MRI->hasOneNonDBGUse(SavedExec)) {
+ MachineInstr *SingleExecUser = &*MRI->use_instr_nodbg_begin(SavedExec);
+ int Idx = SingleExecUser->findRegisterUseOperandIdx(SavedExec);
+ assert(Idx != -1);
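+ // Only fold the copy when its single user is in the same block, reads the
+ // register explicitly, and can legally encode exec as that operand.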
+ if (SingleExecUser->getParent() == I->getParent() &&
+ !SingleExecUser->getOperand(Idx).isImplicit() &&
+ TII->isOperandLegal(*SingleExecUser, Idx, &I->getOperand(1))) {
+ LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n');
+ LIS->RemoveMachineInstrFromMaps(*I);
+ I->eraseFromParent();
+ MRI->replaceRegWith(SavedExec, ExecReg);
+ LIS->removeInterval(SavedExec);
+ Changed = true;
+ }
}
break;
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
new file mode 100644
index 0000000..307c9eb
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -0,0 +1,637 @@
+//===--------------------- SIOptimizeVGPRLiveRange.cpp -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass tries to remove unnecessary VGPR live ranges in divergent if-else
+/// structures and waterfall loops.
+///
+/// When we do structurization, we usually transform an if-else into two
+/// successive if-then (with a flow block to do predicate inversion). Consider a
+/// simple case after structurization: A divergent value %a was defined before
+/// if-else and used in both THEN (use in THEN is optional) and ELSE part:
+/// bb.if:
+/// %a = ...
+/// ...
+/// bb.then:
+/// ... = op %a
+/// ... // %a can be dead here
+/// bb.flow:
+/// ...
+/// bb.else:
+/// ... = %a
+/// ...
+/// bb.endif
+///
+/// As the register allocator has no idea of the thread control flow, it will just
+/// assume %a would be alive in the whole range of bb.then because of a later
+/// use in bb.else. On AMDGPU architecture, the VGPR is accessed with respect
+/// to exec mask. For this if-else case, the lanes active in bb.then will be
+/// inactive in bb.else, and vice-versa. So we are safe to say that %a was dead
+/// after the last use in bb.then until the end of the block. The reason is
+/// the instructions in bb.then will only overwrite lanes that will never be
+/// accessed in bb.else.
+///
+/// This pass aims to tell the register allocator that %a is in fact dead, by
+/// inserting a phi-node in bb.flow saying that %a is undef when coming from
+/// bb.then, and then replacing the uses in bb.else with the result of the
+/// newly inserted phi.
+///
+/// Two key conditions must be met to ensure correctness:
+/// 1.) The def-point should be in the same loop-level as if-else-endif to make
+/// sure the second loop iteration still gets correct data.
+/// 2.) There should be no further uses after the IF-ELSE region.
+///
+///
+/// Waterfall loops get inserted around instructions that use divergent values
+/// but can only be executed with a uniform value. For example an indirect call
+/// to a divergent address:
+/// bb.start:
+/// %a = ...
+/// %fun = ...
+/// ...
+/// bb.loop:
+/// call %fun (%a)
+/// ... // %a can be dead here
+/// loop %bb.loop
+///
+/// The loop block is executed multiple times, but it is run exactly once for
+/// each active lane. Similar to the if-else case, the register allocator
+/// assumes that %a is live throughout the loop as it is used again in the next
+/// iteration. If %a is a VGPR that is unused after the loop, it does not need
+/// to be live after its last use in the loop block. By inserting a phi-node at
+/// the start of bb.loop that is undef when coming from bb.loop, the register
+/// allocator knows that the value of %a does not need to be preserved through
+/// iterations of the loop.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-opt-vgpr-liverange"
+
+namespace {
+
+class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
+private:
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ LiveVariables *LV = nullptr;
+ MachineDominatorTree *MDT = nullptr;
+ const MachineLoopInfo *Loops = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+
+public:
+ static char ID;
+
+ MachineBasicBlock *getElseTarget(MachineBasicBlock *MBB) const;
+
+ void collectElseRegionBlocks(MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &) const;
+
+ void
+ collectCandidateRegisters(MachineBasicBlock *If, MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks,
+ SmallVectorImpl<Register> &CandidateRegs) const;
+
+ void collectWaterfallCandidateRegisters(
+ MachineBasicBlock *Loop,
+ SmallSetVector<Register, 16> &CandidateRegs) const;
+
+ void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB,
+ SmallVectorImpl<MachineInstr *> &Uses) const;
+
+ void updateLiveRangeInThenRegion(Register Reg, MachineBasicBlock *If,
+ MachineBasicBlock *Flow) const;
+
+ void updateLiveRangeInElseRegion(
+ Register Reg, Register NewReg, MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const;
+
+ void
+ optimizeLiveRange(Register Reg, MachineBasicBlock *If,
+ MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const;
+
+ void optimizeWaterfallLiveRange(Register Reg, MachineBasicBlock *If) const;
+
+ SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI Optimize VGPR LiveRange";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveVariables>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<LiveVariables>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
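+ // The pass operates on machine SSA form, so it must run before register
+ // allocation.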
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+};
+
+} // end anonymous namespace
+
+// Check whether the MBB is an else flow block and get the branching target, which
+// is the Endif block
+MachineBasicBlock *
+SIOptimizeVGPRLiveRange::getElseTarget(MachineBasicBlock *MBB) const {
+ for (auto &BR : MBB->terminators()) {
+ if (BR.getOpcode() == AMDGPU::SI_ELSE)
+ return BR.getOperand(2).getMBB();
+ }
+ return nullptr;
+}
+
+void SIOptimizeVGPRLiveRange::collectElseRegionBlocks(
+ MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &Blocks) const {
+ assert(Flow != Endif);
+
+ MachineBasicBlock *MBB = Endif;
+ unsigned Cur = 0;
+ while (MBB) {
+ for (auto *Pred : MBB->predecessors()) {
+ if (Pred != Flow && !Blocks.contains(Pred))
+ Blocks.insert(Pred);
+ }
+
+ if (Cur < Blocks.size())
+ MBB = Blocks[Cur++];
+ else
+ MBB = nullptr;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Found Else blocks: ";
+ for (auto *MBB : Blocks)
+ dbgs() << printMBBReference(*MBB) << ' ';
+ dbgs() << '\n';
+ });
+}
+
+/// Find the instructions (excluding PHIs) in \p MBB that use \p Reg.
+void SIOptimizeVGPRLiveRange::findNonPHIUsesInBlock(
+ Register Reg, MachineBasicBlock *MBB,
+ SmallVectorImpl<MachineInstr *> &Uses) const {
+ for (auto &UseMI : MRI->use_nodbg_instructions(Reg)) {
+ if (UseMI.getParent() == MBB && !UseMI.isPHI())
+ Uses.push_back(&UseMI);
+ }
+}
+
+/// Collect the killed registers in the ELSE region which are not alive through
+/// the whole THEN region.
+void SIOptimizeVGPRLiveRange::collectCandidateRegisters(
+ MachineBasicBlock *If, MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks,
+ SmallVectorImpl<Register> &CandidateRegs) const {
+
+ SmallSet<Register, 8> KillsInElse;
+
+ for (auto *Else : ElseBlocks) {
+ for (auto &MI : Else->instrs()) {
+ if (MI.isDebugInstr())
+ continue;
+
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.getReg() || MO.isDef())
+ continue;
+
+ Register MOReg = MO.getReg();
+ // We can only optimize AGPR/VGPR virtual registers
+ if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg))
+ continue;
+
+ if (MO.readsReg()) {
+ LiveVariables::VarInfo &VI = LV->getVarInfo(MOReg);
+ const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
+ // Make sure two conditions are met:
+ // a.) the value is defined before/in the IF block
+ // b.) should be defined in the same loop-level.
+ if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) &&
+ Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If)) {
+ // Check if the register is live into the endif block. If not,
+ // consider it killed in the else region.
+ LiveVariables::VarInfo &VI = LV->getVarInfo(MOReg);
+ if (!VI.isLiveIn(*Endif, MOReg, *MRI)) {
+ KillsInElse.insert(MOReg);
+ } else {
+ LLVM_DEBUG(dbgs() << "Excluding " << printReg(MOReg, TRI)
+ << " as Live in Endif\n");
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Check the phis in the Endif, looking for value coming from the ELSE
+ // region. Make sure the phi-use is the last use.
+ for (auto &MI : Endif->phis()) {
+ for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) {
+ auto &MO = MI.getOperand(Idx);
+ auto *Pred = MI.getOperand(Idx + 1).getMBB();
+ if (Pred == Flow)
+ continue;
+ assert(ElseBlocks.contains(Pred) && "Should be from Else region\n");
+
+ if (!MO.isReg() || !MO.getReg() || MO.isUndef())
+ continue;
+
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical() || !TRI->isVectorRegister(*MRI, Reg))
+ continue;
+
+ LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+
+ if (VI.isLiveIn(*Endif, Reg, *MRI)) {
+ LLVM_DEBUG(dbgs() << "Excluding " << printReg(Reg, TRI)
+ << " as Live in Endif\n");
+ continue;
+ }
+ // Make sure two conditions are met:
+ // a.) the value is defined before/in the IF block
+ // b.) should be defined in the same loop-level.
+ const MachineBasicBlock *DefMBB = MRI->getVRegDef(Reg)->getParent();
+ if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) &&
+ Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If))
+ KillsInElse.insert(Reg);
+ }
+ }
+
+ auto IsLiveThroughThen = [&](Register Reg) {
+ for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
+ ++I) {
+ if (!I->readsReg())
+ continue;
+ auto *UseMI = I->getParent();
+ auto *UseMBB = UseMI->getParent();
+ if (UseMBB == Flow || UseMBB == Endif) {
+ if (!UseMI->isPHI())
+ return true;
+
+ auto *IncomingMBB = UseMI->getOperand(I.getOperandNo() + 1).getMBB();
+ // The register is live through the path If->Flow or Flow->Endif.
+ // we should not optimize for such cases.
+ if ((UseMBB == Flow && IncomingMBB != If) ||
+ (UseMBB == Endif && IncomingMBB == Flow))
+ return true;
+ }
+ }
+ return false;
+ };
+
+ for (auto Reg : KillsInElse) {
+ if (!IsLiveThroughThen(Reg))
+ CandidateRegs.push_back(Reg);
+ }
+}
+
+/// Collect the registers used in the waterfall loop block that are defined
+/// before.
+void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters(
+ MachineBasicBlock *Loop,
+ SmallSetVector<Register, 16> &CandidateRegs) const {
+
+ for (auto &MI : Loop->instrs()) {
+ if (MI.isDebugInstr())
+ continue;
+
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.getReg() || MO.isDef())
+ continue;
+
+ Register MOReg = MO.getReg();
+ // We can only optimize AGPR/VGPR virtual registers
+ if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg))
+ continue;
+
+ if (MO.readsReg()) {
+ const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
+ // Make sure the value is defined before the LOOP block
+ if (DefMBB != Loop && !CandidateRegs.contains(MOReg)) {
+ // If the variable is used after the loop, the register coalescer will
+ // merge the newly created register and remove the phi node again.
+ // Just do nothing in that case.
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(MOReg);
+ bool IsUsed = false;
+ for (auto *Succ : Loop->successors()) {
+ if (Succ != Loop && OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) {
+ IsUsed = true;
+ break;
+ }
+ }
+ if (!IsUsed) {
+ LLVM_DEBUG(dbgs() << "Found candidate reg: "
+ << printReg(MOReg, TRI, 0, MRI) << '\n');
+ CandidateRegs.insert(MOReg);
+ } else {
+ LLVM_DEBUG(dbgs() << "Reg is used after loop, ignoring: "
+ << printReg(MOReg, TRI, 0, MRI) << '\n');
+ }
+ }
+ }
+ }
+ }
+}
+
+// Re-calculate the liveness of \p Reg in the THEN-region
+void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion(
+ Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const {
+
+ SmallPtrSet<MachineBasicBlock *, 16> PHIIncoming;
+
+ MachineBasicBlock *ThenEntry = nullptr;
+ for (auto *Succ : If->successors()) {
+ if (Succ != Flow) {
+ ThenEntry = Succ;
+ break;
+ }
+ }
+ assert(ThenEntry && "No successor in Then region?");
+
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+ df_iterator_default_set<MachineBasicBlock *, 16> Visited;
+
+ for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
+ if (MBB == Flow)
+ break;
+
+ // Clear Live bit, as we will recalculate afterwards
+ LLVM_DEBUG(dbgs() << "Clear AliveBlock " << printMBBReference(*MBB)
+ << '\n');
+ OldVarInfo.AliveBlocks.reset(MBB->getNumber());
+ }
+
+ // Get the blocks the Reg should be alive through
+ for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
+ ++I) {
+ auto *UseMI = I->getParent();
+ if (UseMI->isPHI() && I->readsReg()) {
+ if (Visited.contains(UseMI->getParent()))
+ PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB());
+ }
+ }
+
+ Visited.clear();
+
+ for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
+ if (MBB == Flow)
+ break;
+
+ SmallVector<MachineInstr *> Uses;
+ // PHI instructions have been processed before.
+ findNonPHIUsesInBlock(Reg, MBB, Uses);
+
+ if (Uses.size() == 1) {
+ LLVM_DEBUG(dbgs() << "Found one Non-PHI use in "
+ << printMBBReference(*MBB) << '\n');
+ LV->HandleVirtRegUse(Reg, MBB, *(*Uses.begin()));
+ } else if (Uses.size() > 1) {
+ // Process the instructions in-order
+ LLVM_DEBUG(dbgs() << "Found " << Uses.size() << " Non-PHI uses in "
+ << printMBBReference(*MBB) << '\n');
+ for (MachineInstr &MI : *MBB) {
+ if (llvm::is_contained(Uses, &MI))
+ LV->HandleVirtRegUse(Reg, MBB, MI);
+ }
+ }
+
+ // Mark Reg alive through the block if this is a PHI incoming block
+ if (PHIIncoming.contains(MBB))
+ LV->MarkVirtRegAliveInBlock(OldVarInfo, MRI->getVRegDef(Reg)->getParent(),
+ MBB);
+ }
+
+ // Set the isKilled flag if we get new Kills in the THEN region.
+ for (auto *MI : OldVarInfo.Kills) {
+ if (Visited.contains(MI->getParent()))
+ MI->addRegisterKilled(Reg, TRI);
+ }
+}
+
+void SIOptimizeVGPRLiveRange::updateLiveRangeInElseRegion(
+ Register Reg, Register NewReg, MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const {
+ LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg);
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+
+ // Transfer aliveBlocks from Reg to NewReg
+ for (auto *MBB : ElseBlocks) {
+ unsigned BBNum = MBB->getNumber();
+ if (OldVarInfo.AliveBlocks.test(BBNum)) {
+ NewVarInfo.AliveBlocks.set(BBNum);
+ LLVM_DEBUG(dbgs() << "Removing AliveBlock " << printMBBReference(*MBB)
+ << '\n');
+ OldVarInfo.AliveBlocks.reset(BBNum);
+ }
+ }
+
+ // Transfer the possible Kills in ElseBlocks from Reg to NewReg
+ auto I = OldVarInfo.Kills.begin();
+ while (I != OldVarInfo.Kills.end()) {
+ if (ElseBlocks.contains((*I)->getParent())) {
+ NewVarInfo.Kills.push_back(*I);
+ I = OldVarInfo.Kills.erase(I);
+ } else {
+ ++I;
+ }
+ }
+}
+
+void SIOptimizeVGPRLiveRange::optimizeLiveRange(
+ Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const {
+ // Insert a new PHI, marking the value from the THEN region being
+ // undef.
+ LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n');
+ const auto *RC = MRI->getRegClass(Reg);
+ Register NewReg = MRI->createVirtualRegister(RC);
+ Register UndefReg = MRI->createVirtualRegister(RC);
+ MachineInstrBuilder PHI = BuildMI(*Flow, Flow->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), NewReg);
+ for (auto *Pred : Flow->predecessors()) {
+ if (Pred == If)
+ PHI.addReg(Reg).addMBB(Pred);
+ else
+ PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred);
+ }
+
+ // Replace all uses in the ELSE region or the PHIs in ENDIF block
+ // Use early increment range because setReg() will update the linked list.
+ for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) {
+ auto *UseMI = O.getParent();
+ auto *UseBlock = UseMI->getParent();
+ // Replace uses in Endif block
+ if (UseBlock == Endif) {
+ assert(UseMI->isPHI() && "Uses should be PHI in Endif block");
+ O.setReg(NewReg);
+ continue;
+ }
+
+ // Replace uses in Else region
+ if (ElseBlocks.contains(UseBlock))
+ O.setReg(NewReg);
+ }
+
+ // The optimized Reg is not alive through Flow blocks anymore.
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+ OldVarInfo.AliveBlocks.reset(Flow->getNumber());
+
+ updateLiveRangeInElseRegion(Reg, NewReg, Flow, Endif, ElseBlocks);
+ updateLiveRangeInThenRegion(Reg, If, Flow);
+}
+
+void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange(
+ Register Reg, MachineBasicBlock *Loop) const {
+ // Insert a new PHI, marking the value from the last loop iteration undef.
+ LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n');
+ const auto *RC = MRI->getRegClass(Reg);
+ Register NewReg = MRI->createVirtualRegister(RC);
+ Register UndefReg = MRI->createVirtualRegister(RC);
+
+ // Replace all uses in the LOOP region
+ // Use early increment range because setReg() will update the linked list.
+ for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) {
+ auto *UseMI = O.getParent();
+ auto *UseBlock = UseMI->getParent();
+ // Replace uses in Loop block
+ if (UseBlock == Loop)
+ O.setReg(NewReg);
+ }
+
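+ // Build the PHI at the top of the loop block: the old value comes in from
+ // outside the loop, and an undef value comes in on the backedge.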
+ MachineInstrBuilder PHI = BuildMI(*Loop, Loop->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), NewReg);
+ for (auto *Pred : Loop->predecessors()) {
+ if (Pred == Loop)
+ PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred);
+ else
+ PHI.addReg(Reg).addMBB(Pred);
+ }
+
+ LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg);
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+
+ // collectWaterfallCandidateRegisters only collects registers that are dead
+ // after the loop. So we know that the old reg is not live throughout the
+ // whole block anymore.
+ OldVarInfo.AliveBlocks.reset(Loop->getNumber());
+
+ // Mark the last use as kill
+ for (auto &MI : reverse(Loop->instrs())) {
+ if (MI.readsRegister(NewReg, TRI)) {
+ MI.addRegisterKilled(NewReg, TRI);
+ NewVarInfo.Kills.push_back(&MI);
+ break;
+ }
+ }
+ assert(!NewVarInfo.Kills.empty() &&
+ "Failed to find last usage of register in loop");
+}
+
+char SIOptimizeVGPRLiveRange::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
+ "SI Optimize VGPR LiveRange", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
+INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
+ "SI Optimize VGPR LiveRange", false, false)
+
+char &llvm::SIOptimizeVGPRLiveRangeID = SIOptimizeVGPRLiveRange::ID;
+
+FunctionPass *llvm::createSIOptimizeVGPRLiveRangePass() {
+ return new SIOptimizeVGPRLiveRange();
+}
+
+bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ Loops = &getAnalysis<MachineLoopInfo>();
+ LV = &getAnalysis<LiveVariables>();
+ MRI = &MF.getRegInfo();
+
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ bool MadeChange = false;
+
+ // TODO: we need to think about the order of visiting the blocks to get
+ // optimal results for nested if-else cases.
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto &MI : MBB.terminators()) {
+ // Detect the if-else blocks
+ if (MI.getOpcode() == AMDGPU::SI_IF) {
+ MachineBasicBlock *IfTarget = MI.getOperand(2).getMBB();
+ auto *Endif = getElseTarget(IfTarget);
+ if (!Endif)
+ continue;
+
+ SmallSetVector<MachineBasicBlock *, 16> ElseBlocks;
+ SmallVector<Register> CandidateRegs;
+
+ LLVM_DEBUG(dbgs() << "Checking IF-ELSE-ENDIF: "
+ << printMBBReference(MBB) << ' '
+ << printMBBReference(*IfTarget) << ' '
+ << printMBBReference(*Endif) << '\n');
+
+ // Collect all the blocks in the ELSE region
+ collectElseRegionBlocks(IfTarget, Endif, ElseBlocks);
+
+ // Collect the registers that can be optimized
+ collectCandidateRegisters(&MBB, IfTarget, Endif, ElseBlocks,
+ CandidateRegs);
+ MadeChange |= !CandidateRegs.empty();
+ // Now we are safe to optimize.
+ for (auto Reg : CandidateRegs)
+ optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks);
+ } else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) {
+ LLVM_DEBUG(dbgs() << "Checking Waterfall loop: "
+ << printMBBReference(MBB) << '\n');
+
+ SmallSetVector<Register, 16> CandidateRegs;
+ collectWaterfallCandidateRegisters(&MBB, CandidateRegs);
+ MadeChange |= !CandidateRegs.empty();
+ // Now we are safe to optimize.
+ for (auto Reg : CandidateRegs)
+ optimizeWaterfallLiveRange(Reg, &MBB);
+ }
+ }
+ }
+
+ return MadeChange;
+}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index ab05081..e05aafe 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -48,10 +48,18 @@
SmallSet<Register, 16> Defs;
- bool isDependentLoad(const MachineInstr &MI) const;
+ void collectUsedRegUnits(const MachineInstr &MI,
+ BitVector &UsedRegUnits) const;
+ bool isBundleCandidate(const MachineInstr &MI) const;
+ bool isDependentLoad(const MachineInstr &MI) const;
+ bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const;
};
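+// Memory instruction classes eligible for post-RA bundling; an instruction is
+// only clustered with others that match the same subset of these flags.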
+constexpr uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
+ SIInstrFlags::SMRD | SIInstrFlags::DS |
+ SIInstrFlags::FLAT | SIInstrFlags::MIMG;
+
} // End anonymous namespace.
INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false)
@@ -80,55 +88,125 @@
return false;
}
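+// Record every register unit read by \p MI in \p UsedRegUnits.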
+void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI,
+ BitVector &UsedRegUnits) const {
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg() || !Op.readsReg())
+ continue;
+
+ Register Reg = Op.getReg();
+ assert(!Op.getSubReg() &&
+ "subregister indexes should not be present after RA");
+
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ UsedRegUnits.set(*Units);
+ }
+}
+
+bool SIPostRABundler::isBundleCandidate(const MachineInstr &MI) const {
+ const uint64_t IMemFlags = MI.getDesc().TSFlags & MemFlags;
+ return IMemFlags != 0 && MI.mayLoadOrStore() && !MI.isBundled();
+}
+
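+// Return true if \p NextMI can be appended to a clause currently ending in
+// \p MI: both must be memory operations of the same kind (same flags and
+// load/store direction) and NextMI must not read a register defined earlier
+// in the clause.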
+bool SIPostRABundler::canBundle(const MachineInstr &MI,
+ const MachineInstr &NextMI) const {
+ const uint64_t IMemFlags = MI.getDesc().TSFlags & MemFlags;
+
+ return (IMemFlags != 0 && MI.mayLoadOrStore() && !NextMI.isBundled() &&
+ NextMI.mayLoad() == MI.mayLoad() && NextMI.mayStore() == MI.mayStore() &&
+ ((NextMI.getDesc().TSFlags & MemFlags) == IMemFlags) &&
+ !isDependentLoad(NextMI));
+}
+
bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
- bool Changed = false;
- const uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
- SIInstrFlags::SMRD | SIInstrFlags::DS |
- SIInstrFlags::FLAT | SIInstrFlags::MIMG;
+ BitVector BundleUsedRegUnits(TRI->getNumRegUnits());
+ BitVector KillUsedRegUnits(TRI->getNumRegUnits());
+ bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::instr_iterator Next;
MachineBasicBlock::instr_iterator B = MBB.instr_begin();
MachineBasicBlock::instr_iterator E = MBB.instr_end();
+
for (auto I = B; I != E; I = Next) {
Next = std::next(I);
-
- const uint64_t IMemFlags = I->getDesc().TSFlags & MemFlags;
-
- if (IMemFlags == 0 || I->isBundled() || !I->mayLoadOrStore() ||
- B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() ||
- ((B->getDesc().TSFlags & MemFlags) != IMemFlags) ||
- isDependentLoad(*I)) {
-
- if (B != I) {
- if (std::next(B) != I) {
- finalizeBundle(MBB, B, I);
- Changed = true;
- }
- Next = I;
- }
-
- B = Next;
- Defs.clear();
+ if (!isBundleCandidate(*I))
continue;
+
+ assert(Defs.empty());
+
+ if (I->getNumExplicitDefs() != 0)
+ Defs.insert(I->defs().begin()->getReg());
+
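+ // Grow the clause forward from I, absorbing compatible memory instructions
+ // and tolerating meta instructions in between.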
+ MachineBasicBlock::instr_iterator BundleStart = I;
+ MachineBasicBlock::instr_iterator BundleEnd = I;
+ unsigned ClauseLength = 1;
+ for (I = Next; I != E; I = Next) {
+ Next = std::next(I);
+
+ assert(BundleEnd != I);
+ if (canBundle(*BundleEnd, *I)) {
+ BundleEnd = I;
+ if (I->getNumExplicitDefs() != 0)
+ Defs.insert(I->defs().begin()->getReg());
+ ++ClauseLength;
+ } else if (!I->isMetaInstruction()) {
+ // Allow meta instructions in between bundle candidates, but do not
+ // start or end a bundle on one.
+ //
+ // TODO: It may be better to move meta instructions like dbg_value
+ // after the bundle. We're relying on the memory legalizer to unbundle
+ // these.
+ break;
+ }
}
- if (I->getNumExplicitDefs() == 0)
- continue;
+ Next = std::next(BundleEnd);
+ if (ClauseLength > 1) {
+ Changed = true;
- Defs.insert(I->defs().begin()->getReg());
+ // Before register allocation, kills are inserted after potential soft
+ // clauses to hint the register allocator. Look for kills that look like
+ // this, and erase them.
+ if (Next != E && Next->isKill()) {
+
+ // TODO: Should maybe back-propagate kill flags to the bundle.
+ for (const MachineInstr &BundleMI : make_range(BundleStart, Next))
+ collectUsedRegUnits(BundleMI, BundleUsedRegUnits);
+
+ BundleUsedRegUnits.flip();
+
+ while (Next != E && Next->isKill()) {
+ MachineInstr &Kill = *Next;
+ collectUsedRegUnits(Kill, KillUsedRegUnits);
+
+ KillUsedRegUnits &= BundleUsedRegUnits;
+
+ // Erase the kill if it's a subset of the used registers.
+ //
+ // TODO: Should we just remove all kills? Is there any real reason to
+ // keep them after RA?
+ if (KillUsedRegUnits.none()) {
+ ++Next;
+ Kill.eraseFromParent();
+ } else
+ break;
+
+ KillUsedRegUnits.reset();
+ }
+
+ BundleUsedRegUnits.reset();
+ }
+
+ finalizeBundle(MBB, BundleStart, Next);
+ }
+
+ Defs.clear();
}
-
- if (B != E && std::next(B) != E) {
- finalizeBundle(MBB, B, E);
- Changed = true;
- }
-
- Defs.clear();
}
return Changed;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index dc08d9d..c2e2875 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -38,6 +38,9 @@
RegisterClassInfo RegClassInfo;
std::vector<unsigned> RegsToRewrite;
+#ifndef NDEBUG
+ void printWWMInfo(const MachineInstr &MI);
+#endif
public:
static char ID;
@@ -139,13 +142,26 @@
}
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
for (unsigned Reg : RegsToRewrite) {
LIS->removeInterval(Reg);
const Register PhysReg = VRM->getPhys(Reg);
assert(PhysReg != 0);
- MFI->ReserveWWMRegister(PhysReg);
+
+ // Check if PhysReg is already reserved
+ if (!MFI->WWMReservedRegs.count(PhysReg)) {
+ Optional<int> FI;
+ if (!MFI->isEntryFunction()) {
+ // Create a stack object for a possible spill in the function prologue.
+ // Note: Non-CSR VGPRs also need this, as we may overwrite inactive lanes.
+ const TargetRegisterClass *RC = TRI->getPhysRegClass(PhysReg);
+ FI = FrameInfo.CreateSpillStackObject(TRI->getSpillSize(*RC),
+ TRI->getSpillAlign(*RC));
+ }
+ MFI->reserveWWMRegister(PhysReg, FI);
+ }
}
RegsToRewrite.clear();
@@ -154,6 +170,31 @@
MRI->freezeReservedRegs(MF);
}
+#ifndef NDEBUG
+LLVM_DUMP_METHOD void
+SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
+
+ unsigned Opc = MI.getOpcode();
+
+ if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
+ dbgs() << "Entering ";
+ } else {
+ assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
+ dbgs() << "Exiting ";
+ }
+
+ if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
+ dbgs() << "Strict WWM ";
+ } else {
+ assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
+ dbgs() << "Strict WQM ";
+ }
+
+ dbgs() << "region: " << MI;
+}
+
+#endif
+
bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n");
@@ -185,21 +226,23 @@
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
RegsAssigned |= processDef(MI.getOperand(0));
- if (MI.getOpcode() == AMDGPU::ENTER_WWM) {
- LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n");
+ if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
+ MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
+ LLVM_DEBUG(printWWMInfo(MI));
InWWM = true;
continue;
}
- if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
- LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n");
+ if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
+ MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
+ LLVM_DEBUG(printWWMInfo(MI));
InWWM = false;
}
if (!InWWM)
continue;
- LLVM_DEBUG(dbgs() << "processing " << MI << "\n");
+ LLVM_DEBUG(dbgs() << "Processing " << MI);
for (MachineOperand &DefOpnd : MI.defs()) {
RegsAssigned |= processDef(DefOpnd);
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 9ca4351..dce0f4b 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -14,13 +14,20 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;
#define DEBUG_TYPE "si-pre-emit-peephole"
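+// Threshold on the size of a skipped region: execz branches over regions with
+// this many instructions or more are kept (see mustRetainExeczBranch below).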
+static unsigned SkipThreshold;
+
+static cl::opt<unsigned, true> SkipThresholdFlag(
+ "amdgpu-skip-threshold", cl::Hidden,
+ cl::desc(
+ "Number of instructions before jumping over divergent control flow"),
+ cl::location(SkipThreshold), cl::init(12));
+
namespace {
class SIPreEmitPeephole : public MachineFunctionPass {
@@ -30,6 +37,13 @@
bool optimizeVccBranch(MachineInstr &MI) const;
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
+ bool getBlockDestinations(MachineBasicBlock &SrcMBB,
+ MachineBasicBlock *&TrueMBB,
+ MachineBasicBlock *&FalseMBB,
+ SmallVectorImpl<MachineOperand> &Cond);
+ bool mustRetainExeczBranch(const MachineBasicBlock &From,
+ const MachineBasicBlock &To) const;
+ bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
public:
static char ID;
@@ -219,8 +233,11 @@
return false;
// Scan back to find an identical S_SET_GPR_IDX_ON
- for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
- E = MI.getIterator(); I != E; ++I) {
+ for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
+ E = MI.getIterator();
+ I != E; ++I) {
+ if (I->isBundle())
+ continue;
switch (I->getOpcode()) {
case AMDGPU::S_SET_GPR_IDX_MODE:
return false;
@@ -249,9 +266,77 @@
}
}
- MI.eraseFromParent();
+ MI.eraseFromBundle();
for (MachineInstr *RI : ToRemove)
- RI->eraseFromParent();
+ RI->eraseFromBundle();
+ return true;
+}
+
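+// Analyze the branch at the end of \p SrcMBB and return its true and false
+// destinations; if there is no explicit false destination, fall through to
+// the next block.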
+bool SIPreEmitPeephole::getBlockDestinations(
+ MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
+ MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
+ if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
+ return false;
+
+ if (!FalseMBB)
+ FalseMBB = SrcMBB.getNextNode();
+
+ return true;
+}
+
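+// Conservatively check whether the execz branch over the blocks from \p From
+// to \p To must be kept: conditional branches, instructions with unwanted
+// effects when EXEC is zero, memory operations, and regions of SkipThreshold
+// or more instructions all force the branch to stay.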
+bool SIPreEmitPeephole::mustRetainExeczBranch(
+ const MachineBasicBlock &From, const MachineBasicBlock &To) const {
+ unsigned NumInstr = 0;
+ const MachineFunction *MF = From.getParent();
+
+ for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+ MBBI != End && MBBI != ToI; ++MBBI) {
+ const MachineBasicBlock &MBB = *MBBI;
+
+ for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ // When a uniform loop is inside non-uniform control flow, the branch
+ // leaving the loop might never be taken when EXEC = 0.
+ // Hence we should retain the cbranch out of the loop lest it become infinite.
+ if (I->isConditionalBranch())
+ return true;
+
+ if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
+ return true;
+
+ // These instructions are potentially expensive even if EXEC = 0.
+ if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
+ TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
+ return true;
+
+ ++NumInstr;
+ if (NumInstr >= SkipThreshold)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Returns true if the skip branch instruction is removed.
+bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
+ MachineBasicBlock &SrcMBB) {
+ MachineBasicBlock *TrueMBB = nullptr;
+ MachineBasicBlock *FalseMBB = nullptr;
+ SmallVector<MachineOperand, 1> Cond;
+
+ if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
+ return false;
+
+ // Consider only the forward branches.
+ if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
+ mustRetainExeczBranch(*FalseMBB, *TrueMBB))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
+ MI.eraseFromParent();
+ SrcMBB.removeSuccessor(TrueMBB);
+
return true;
}
@@ -259,52 +344,25 @@
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- MachineBasicBlock *EmptyMBBAtEnd = nullptr;
bool Changed = false;
+ MF.RenumberBlocks();
+
for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
- MachineBasicBlock::iterator TermI = MBBE;
- // Check first terminator for VCC branches to optimize
+ MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
+ // Check first terminator for branches to optimize
if (TermI != MBB.end()) {
MachineInstr &MI = *TermI;
switch (MI.getOpcode()) {
case AMDGPU::S_CBRANCH_VCCZ:
case AMDGPU::S_CBRANCH_VCCNZ:
Changed |= optimizeVccBranch(MI);
- continue;
- default:
+ break;
+ case AMDGPU::S_CBRANCH_EXECZ:
+ Changed |= removeExeczBranch(MI, MBB);
break;
}
}
- // Check all terminators for SI_RETURN_TO_EPILOG
- // FIXME: This is not an optimization and should be moved somewhere else.
- while (TermI != MBB.end()) {
- MachineInstr &MI = *TermI;
- if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
- assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
-
- // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
- // because external bytecode will be appended at the end.
- if (&MBB != &MF.back() || &MI != &MBB.back()) {
- // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
- // at the end and jump there.
- if (!EmptyMBBAtEnd) {
- EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
- MF.insert(MF.end(), EmptyMBBAtEnd);
- }
-
- MBB.addSuccessor(EmptyMBBAtEnd);
- BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
- .addMBB(EmptyMBBAtEnd);
- MI.eraseFromParent();
- MBBE = MBB.getFirstTerminator();
- TermI = MBBE;
- continue;
- }
- }
- TermI++;
- }
if (!ST.hasVGPRIndexMode())
continue;
@@ -315,10 +373,10 @@
// Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
// second is not needed. Do expensive checks in the optimizeSetGPR()
// and limit the distance to 20 instructions for compile time purposes.
- for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
- MachineInstr &MI = *MBBI;
- ++MBBI;
-
+ // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
+ // may be bundled with the instructions they modify.
+ for (auto &MI :
+ make_early_inc_range(make_range(MBB.instr_begin(), MBB.instr_end()))) {
if (Count == Threshold)
SetGPRMI = nullptr;
else
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/src/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 9b72d08..b13afce 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -41,10 +41,13 @@
uint32_t ScratchBlocks = 0;
uint64_t ComputePGMRSrc2 = 0;
+ uint64_t ComputePGMRSrc3GFX90A = 0;
uint32_t NumVGPR = 0;
uint32_t NumArchVGPR = 0;
uint32_t NumAccVGPR = 0;
+ uint32_t AccumOffset = 0;
+ uint32_t TgSplit = 0;
uint32_t NumSGPR = 0;
uint32_t LDSSize = 0;
bool FlatUsed = false;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 7a45d8c..bba5bf7 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -43,6 +43,233 @@
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
+namespace llvm {
+
+// A temporary struct to spill SGPRs.
+// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
+// just v_writelane and v_readlane.
+//
+// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
+// is saved to scratch (or the other way around for loads).
+// For this, a VGPR is required where the needed lanes can be clobbered. The
+// RegScavenger can provide a VGPR where currently active lanes can be
+// clobbered, but we still need to save inactive lanes.
+// The high-level steps are:
+// - Try to scavenge SGPR(s) to save exec
+// - Try to scavenge VGPR
+// - Save needed, all or inactive lanes of a TmpVGPR
+// - Spill/Restore SGPRs using TmpVGPR
+// - Restore TmpVGPR
+//
+// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
+// cannot scavenge temporary SGPRs to save exec, we use the following code:
+// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
+// s_not exec, exec
+// buffer_store_dword TmpVGPR ; save inactive lanes
+// s_not exec, exec
+struct SGPRSpillBuilder {
+ struct PerVGPRData {
+ unsigned PerVGPR;
+ unsigned NumVGPRs;
+ int64_t VGPRLanes;
+ };
+
+ // The SGPR to save
+ Register SuperReg;
+ MachineBasicBlock::iterator MI;
+ ArrayRef<int16_t> SplitParts;
+ unsigned NumSubRegs;
+ bool IsKill;
+ const DebugLoc &DL;
+
+ /* When spilling to stack */
+ // The SGPRs are written into this VGPR, which is then written to scratch
+ // (or vice versa for loads).
+ Register TmpVGPR = AMDGPU::NoRegister;
+ // Temporary spill slot to save TmpVGPR to.
+ int TmpVGPRIndex = 0;
+ // True if TmpVGPR is live before the spill (no dead VGPR could be
+ // scavenged), so its original contents must be saved and restored.
+ bool TmpVGPRLive = false;
+ // Scavenged SGPR to save EXEC.
+ Register SavedExecReg = AMDGPU::NoRegister;
+ // Stack index to write the SGPRs to.
+ int Index;
+ unsigned EltSize = 4;
+
+ RegScavenger *RS;
+ MachineBasicBlock &MBB;
+ MachineFunction &MF;
+ SIMachineFunctionInfo &MFI;
+ const SIInstrInfo &TII;
+ const SIRegisterInfo &TRI;
+ bool IsWave32;
+ Register ExecReg;
+ unsigned MovOpc;
+ unsigned NotOpc;
+
+ SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
+ bool IsWave32, MachineBasicBlock::iterator MI, int Index,
+ RegScavenger *RS)
+ : SuperReg(MI->getOperand(0).getReg()), MI(MI),
+ IsKill(MI->getOperand(0).isKill()), DL(MI->getDebugLoc()), Index(Index),
+ RS(RS), MBB(*MI->getParent()), MF(*MBB.getParent()),
+ MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
+ IsWave32(IsWave32) {
+ const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg);
+ SplitParts = TRI.getRegSplitParts(RC, EltSize);
+ NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+
+ if (IsWave32) {
+ ExecReg = AMDGPU::EXEC_LO;
+ MovOpc = AMDGPU::S_MOV_B32;
+ NotOpc = AMDGPU::S_NOT_B32;
+ } else {
+ ExecReg = AMDGPU::EXEC;
+ MovOpc = AMDGPU::S_MOV_B64;
+ NotOpc = AMDGPU::S_NOT_B64;
+ }
+
+ assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+ assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
+ SuperReg != AMDGPU::EXEC && "exec should never spill");
+ }
+
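+ // Number of lanes that fit into one VGPR (the wavefront size), the number
+ // of VGPRs needed to hold all sub-registers, and the lane mask selecting
+ // the lanes that carry spilled data.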
+ PerVGPRData getPerVGPRData() {
+ PerVGPRData Data;
+ Data.PerVGPR = IsWave32 ? 32 : 64;
+ Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
+ Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
+ return Data;
+ }
+
+ // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
+ // free.
+ // Writes these instructions if an SGPR can be scavenged:
+ // s_mov_b64 s[6:7], exec ; Save exec
+ // s_mov_b64 exec, 3 ; Wanted lanemask
+ // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
+ //
+ // Writes these instructions if no SGPR can be scavenged:
+ // buffer_store_dword v0 ; Only if no free VGPR was found
+ // s_not_b64 exec, exec
+ // buffer_store_dword v0 ; Save inactive lanes
+ // ; exec stays inverted, it is flipped back in
+ // ; restore.
+ void prepare() {
+ // Scavenged temporary VGPR to use. It must be scavenged once for any number
+ // of spilled subregs.
+ // FIXME: The liveness analysis is limited and does not tell if a register
+ // is in use in lanes that are currently inactive. We can never be sure if
+ // a register is actually in use in another lane, so we need to save all
+ // used lanes of the chosen VGPR.
+ assert(RS && "Cannot spill SGPR to memory without RegScavenger");
+ TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false);
+
+ // Reserve temporary stack slot
+ TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
+ if (TmpVGPR) {
+ // Found a register that is dead in the currently active lanes, we only
+ // need to spill inactive lanes.
+ TmpVGPRLive = false;
+ } else {
+ // Pick v0 because it doesn't make a difference.
+ TmpVGPR = AMDGPU::VGPR0;
+ TmpVGPRLive = true;
+ }
+
+ // Try to scavenge SGPRs to save exec
+ assert(!SavedExecReg && "Exec is already saved, refuse to save again");
+ const TargetRegisterClass &RC =
+ IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
+ RS->setRegUsed(SuperReg);
+ SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false);
+
+ int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
+
+ if (SavedExecReg) {
+ RS->setRegUsed(SavedExecReg);
+ // Set exec to needed lanes
+ BuildMI(MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
+ auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
+ if (!TmpVGPRLive)
+ I.addReg(TmpVGPR, RegState::ImplicitDefine);
+ // Spill needed lanes
+ TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
+ } else {
+ // Spill active lanes
+ if (TmpVGPRLive)
+ TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
+ /*IsKill*/ false);
+ // Spill inactive lanes
+ auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ if (!TmpVGPRLive)
+ I.addReg(TmpVGPR, RegState::ImplicitDefine);
+ TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
+ }
+ }
+
+ // Writes these instructions if an SGPR can be scavenged:
+ // buffer_load_dword v1 ; Reload scavenged VGPR from emergency slot
+ // s_waitcnt vmcnt(0) ; If a free VGPR was found
+ // s_mov_b64 exec, s[6:7] ; Restore exec
+ //
+ // Writes these instructions if no SGPR can be scavenged:
+ // buffer_load_dword v0 ; Restore inactive lanes
+ // s_waitcnt vmcnt(0) ; If a free VGPR was found
+ // s_not_b64 exec, exec
+ // buffer_load_dword v0 ; Only if no free VGPR was found
+ void restore() {
+ if (SavedExecReg) {
+ // Restore used lanes
+ TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
+ /*IsKill*/ false);
+ // Restore exec
+ auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg)
+ .addReg(SavedExecReg, RegState::Kill);
+ // Add an implicit use of the load so it is not dead.
+ // FIXME: This inserts an unnecessary waitcnt.
+ if (!TmpVGPRLive) {
+ I.addReg(TmpVGPR, RegState::ImplicitKill);
+ }
+ } else {
+ // Restore inactive lanes
+ TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
+ /*IsKill*/ false);
+ auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ if (!TmpVGPRLive) {
+ I.addReg(TmpVGPR, RegState::ImplicitKill);
+ }
+ // Restore active lanes
+ if (TmpVGPRLive)
+ TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
+ }
+ }
+
+ // Write TmpVGPR to memory or read TmpVGPR from memory.
+ // Either using a single buffer_load/store if exec is set to the needed mask
+ // or using
+ // buffer_load
+ // s_not exec, exec
+ // buffer_load
+ // s_not exec, exec
+ void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
+ if (SavedExecReg) {
+ // Spill needed lanes
+ TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
+ } else {
+ // Spill active lanes
+ TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
+ /*IsKill*/ false);
+ // Spill inactive lanes
+ BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
+ BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ }
+ }
+};
+
+} // namespace llvm
+
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
: AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
@@ -122,7 +349,9 @@
case CallingConv::Fast:
case CallingConv::Cold:
case CallingConv::AMDGPU_Gfx:
- return CSR_AMDGPU_HighRegs_SaveList;
+ return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
+ ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
+ : CSR_AMDGPU_HighRegs_SaveList;
default: {
// Dummy to not crash RegisterClassInfo.
static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
@@ -143,7 +372,9 @@
case CallingConv::Fast:
case CallingConv::Cold:
case CallingConv::AMDGPU_Gfx:
- return CSR_AMDGPU_HighRegs_RegMask;
+ return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
+ ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
+ : CSR_AMDGPU_HighRegs_RegMask;
default:
return nullptr;
}
@@ -172,7 +403,7 @@
// When we need stack realignment, we can't reference off of the
// stack pointer, so we reserve a base pointer.
const MachineFrameInfo &MFI = MF.getFrameInfo();
- return MFI.getNumFixedObjects() && needsStackRealignment(MF);
+ return MFI.getNumFixedObjects() && shouldRealignStack(MF);
}
Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
@@ -181,6 +412,14 @@
return CSR_AMDGPU_AllVGPRs_RegMask;
}
+const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
+ return CSR_AMDGPU_AllAGPRs_RegMask;
+}
+
+const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
+ return CSR_AMDGPU_AllVectorRegs_RegMask;
+}
+
const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
}
@@ -263,6 +502,12 @@
}
unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
+ // TODO: In an entry function without calls and without AGPR usage it is
+ // possible to use the whole register budget for VGPRs. Moreover, it should
+ // be possible to estimate the maximum AGPR/VGPR pressure and split the
+ // register file accordingly.
+ if (ST.hasGFX90AInsts())
+ MaxNumVGPRs /= 2;
unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
@@ -323,10 +568,21 @@
assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
}
- for (MCRegister Reg : MFI->WWMReservedRegs) {
- reserveRegisterTuples(Reserved, Reg);
+ for (auto Reg : MFI->WWMReservedRegs) {
+ reserveRegisterTuples(Reserved, Reg.first);
}
+ // Reserve VGPRs used for SGPR spilling.
+ // Note we treat freezeReservedRegs unusually because we run register
+ // allocation in two phases. It's OK to re-freeze with new registers for the
+ // second run.
+#if 0
+ for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) {
+ for (auto &SpilledVGPR : SpilledFI.second)
+ reserveRegisterTuples(Reserved, SpilledVGPR.VGPR);
+ }
+#endif
+
// FIXME: Stop using reserved registers for this.
for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
reserveRegisterTuples(Reserved, Reg);
@@ -340,7 +596,7 @@
return Reserved;
}
-bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
// On entry, the base address is 0, so it can't possibly need any more
// alignment.
@@ -350,7 +606,7 @@
if (Info->isEntryFunction())
return false;
- return TargetRegisterInfo::canRealignStack(MF);
+ return TargetRegisterInfo::shouldRealignStack(MF);
}
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
@@ -408,7 +664,7 @@
}
bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
- if (!MI->mayLoadOrStore())
+ if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
return false;
int64_t FullOffset = Offset + getScratchInstrOffset(MI);
@@ -417,7 +673,8 @@
return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
const SIInstrInfo *TII = ST.getInstrInfo();
- return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
+ return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch);
}
Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
@@ -457,7 +714,7 @@
.addFrameIndex(FrameIdx);
if (ST.enableFlatScratch() ) {
- BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg)
+ BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
.addReg(OffsetReg, RegState::Kill)
.addReg(FIReg);
return BaseReg;
@@ -500,7 +757,8 @@
assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
if (IsFlat) {
- assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) &&
+ assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch) &&
"offset should be legal");
FIOp->ChangeToRegister(BaseReg, false);
OffsetOp->setImm(NewOffset);
@@ -531,7 +789,8 @@
return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
const SIInstrInfo *TII = ST.getInstrInfo();
- return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
+ return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch);
}
const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
@@ -566,6 +825,13 @@
case AMDGPU::SI_SPILL_A256_SAVE:
case AMDGPU::SI_SPILL_A256_RESTORE:
return 8;
+ case AMDGPU::SI_SPILL_S224_SAVE:
+ case AMDGPU::SI_SPILL_S224_RESTORE:
+ case AMDGPU::SI_SPILL_V224_SAVE:
+ case AMDGPU::SI_SPILL_V224_RESTORE:
+ case AMDGPU::SI_SPILL_A224_SAVE:
+ case AMDGPU::SI_SPILL_A224_RESTORE:
+ return 7;
case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S192_RESTORE:
case AMDGPU::SI_SPILL_V192_SAVE:
@@ -667,13 +933,11 @@
}
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
+ MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- int Index,
- unsigned Lane,
- unsigned ValueReg,
- bool IsKill) {
- MachineBasicBlock *MBB = MI->getParent();
- MachineFunction *MF = MI->getParent()->getParent();
+ int Index, unsigned Lane,
+ unsigned ValueReg, bool IsKill) {
+ MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -691,8 +955,8 @@
unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
: AMDGPU::V_ACCVGPR_READ_B32_e64;
- auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
- .addReg(Src, getKillRegState(IsKill));
+ auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
+ .addReg(Src, getKillRegState(IsKill));
MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
return MIB;
}
@@ -716,7 +980,7 @@
return false;
const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
- if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr())
+ if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
return true;
MachineInstrBuilder NewMI =
@@ -725,10 +989,8 @@
.add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
.addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
+ .addImm(0) // cpol
.addImm(0) // tfe
- .addImm(0) // dlc
.addImm(0) // swz
.cloneMemRefs(*MI);
@@ -774,23 +1036,20 @@
return LoadStoreOp;
}
-void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
- unsigned LoadStoreOp,
- int Index,
- Register ValueReg,
- bool IsKill,
- MCRegister ScratchOffsetReg,
- int64_t InstOffset,
- MachineMemOperand *MMO,
- RegScavenger *RS) const {
- MachineBasicBlock *MBB = MI->getParent();
- MachineFunction *MF = MI->getParent()->getParent();
+void SIRegisterInfo::buildSpillLoadStore(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
+ MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
+ RegScavenger *RS, LivePhysRegs *LiveRegs) const {
+ assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both");
+
+ MachineFunction *MF = MBB.getParent();
const SIInstrInfo *TII = ST.getInstrInfo();
const MachineFrameInfo &MFI = MF->getFrameInfo();
const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
- const DebugLoc &DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
bool IsStore = Desc->mayStore();
bool IsFlat = TII->isFLATScratch(LoadStoreOp);
@@ -798,7 +1057,8 @@
MCRegister SOffset = ScratchOffsetReg;
const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
- const bool IsAGPR = hasAGPRs(RC);
+ // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
+ const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC);
const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;
// Always use 4 byte operations for AGPRs because we need to scavenge
@@ -823,9 +1083,10 @@
assert((IsFlat || ((Offset % EltSize) == 0)) &&
"unexpected VGPR spill offset");
- bool IsOffsetLegal = IsFlat
- ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true)
- : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
+ bool IsOffsetLegal =
+ IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch)
+ : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
SOffset = MCRegister();
@@ -836,9 +1097,17 @@
Offset *= ST.getWavefrontSize();
// We don't have access to the register scavenger if this function is called
- // during PEI::scavengeFrameVirtualRegs().
- if (RS)
+ // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case.
+ if (RS) {
SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
+ } else if (LiveRegs) {
+ for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
+ if (LiveRegs->available(MF->getRegInfo(), Reg)) {
+ SOffset = Reg;
+ break;
+ }
+ }
+ }
if (!SOffset) {
// There are no free SGPRs, and since we are in the process of spilling
@@ -860,10 +1129,9 @@
report_fatal_error("could not scavenge SGPR to spill in entry function");
if (ScratchOffsetReg == AMDGPU::NoRegister) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset)
- .addImm(Offset);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
} else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
.addReg(ScratchOffsetReg)
.addImm(Offset);
}
@@ -916,7 +1184,7 @@
Register Sub = IsSubReg
? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
: ValueReg;
- auto MIB = spillVGPRtoAGPR(ST, MI, Index, Lane, Sub, IsKill);
+ auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
if (!MIB.getInstr())
break;
if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) {
@@ -962,9 +1230,9 @@
RS->setRegUsed(TmpReg);
}
if (IsStore) {
- auto AccRead = BuildMI(*MBB, MI, DL,
- TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
- .addReg(SubReg, getKillRegState(IsKill));
+ auto AccRead = BuildMI(MBB, MI, DL,
+ TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
+ .addReg(SubReg, getKillRegState(IsKill));
if (NeedSuperRegDef)
AccRead.addReg(ValueReg, RegState::ImplicitDefine);
AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
@@ -977,9 +1245,9 @@
MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
commonAlignment(Alignment, RemRegOffset));
- auto MIB = BuildMI(*MBB, MI, DL, *Desc)
- .addReg(SubReg,
- getDefRegState(!IsStore) | getKillRegState(IsKill));
+ auto MIB =
+ BuildMI(MBB, MI, DL, *Desc)
+ .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
if (!IsFlat)
MIB.addReg(FuncInfo->getScratchRSrcReg());
@@ -990,11 +1258,9 @@
MIB.addReg(SOffset, SOffsetRegState);
}
MIB.addImm(Offset + RemRegOffset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0); // tfe for MUBUF or dlc for FLAT
+ .addImm(0); // cpol
if (!IsFlat)
- MIB.addImm(0) // dlc
+ MIB.addImm(0) // tfe
.addImm(0); // swz
MIB.addMemOperand(NewMMO);
@@ -1002,9 +1268,9 @@
MIB.addReg(ValueReg, RegState::ImplicitDefine);
if (!IsStore && TmpReg != AMDGPU::NoRegister) {
- MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
+ MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
FinalReg)
- .addReg(TmpReg, RegState::Kill);
+ .addReg(TmpReg, RegState::Kill);
MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
}
@@ -1014,321 +1280,239 @@
if (ScratchOffsetRegDelta != 0) {
// Subtract the offset we added to the ScratchOffset register.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset)
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
.addReg(SOffset)
- .addImm(ScratchOffsetRegDelta);
+ .addImm(-ScratchOffsetRegDelta);
}
}
-// Generate a VMEM access which loads or stores the VGPR containing an SGPR
-// spill such that all the lanes set in VGPRLanes are loaded or stored.
-// This generates exec mask manipulation and will use SGPRs available in MI
-// or VGPR lanes in the VGPR to save and restore the exec mask.
-void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
- int Index, int Offset,
- unsigned EltSize, Register VGPR,
- int64_t VGPRLanes,
- RegScavenger *RS,
- bool IsLoad) const {
- MachineBasicBlock *MBB = MI->getParent();
- MachineFunction *MF = MBB->getParent();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const SIInstrInfo *TII = ST.getInstrInfo();
-
- Register SuperReg = MI->getOperand(0).getReg();
- const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
- ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
- unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
- unsigned FirstPart = Offset * 32;
- unsigned ExecLane = 0;
-
- bool IsKill = MI->getOperand(0).isKill();
- const DebugLoc &DL = MI->getDebugLoc();
-
- // Cannot handle load/store to EXEC
- assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
- SuperReg != AMDGPU::EXEC && "exec should never spill");
-
- // On Wave32 only handle EXEC_LO.
- // On Wave64 only update EXEC_HI if there is sufficent space for a copy.
- bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI;
-
- unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- Register SavedExecReg;
-
- // Backup EXEC
- if (OnlyExecLo) {
- SavedExecReg =
- NumSubRegs == 1
- ? SuperReg
- : Register(getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]));
- } else {
- // If src/dst is an odd size it is possible subreg0 is not aligned.
- for (; ExecLane < (NumSubRegs - 1); ++ExecLane) {
- SavedExecReg = getMatchingSuperReg(
- getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0,
- &AMDGPU::SReg_64_XEXECRegClass);
- if (SavedExecReg)
- break;
- }
- }
- assert(SavedExecReg);
- BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
-
- // Setup EXEC
- BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);
-
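+// Load or store SB.TmpVGPR to/from the stack slot \p Index at the given
+// element offset, using the base pointer or frame pointer as appropriate.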
+void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
+ int Offset, bool IsLoad,
+ bool IsKill) const {
// Load/store VGPR
- MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
- Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
- ? getBaseRegister()
- : getFrameRegister(*MF);
+ Register FrameReg =
+ FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
+ ? getBaseRegister()
+ : getFrameRegister(SB.MF);
Align Alignment = FrameInfo.getObjectAlign(Index);
- MachinePointerInfo PtrInfo =
- MachinePointerInfo::getFixedStack(*MF, Index);
- MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
+ MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
- EltSize, Alignment);
+ SB.EltSize, Alignment);
if (IsLoad) {
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
: AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
- buildSpillLoadStore(MI, Opc,
- Index,
- VGPR, false,
- FrameReg,
- Offset * EltSize, MMO,
- RS);
+ buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, false, FrameReg,
+ Offset * SB.EltSize, MMO, SB.RS);
} else {
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
- buildSpillLoadStore(MI, Opc, Index, VGPR,
- IsKill, FrameReg,
- Offset * EltSize, MMO, RS);
+ buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, IsKill, FrameReg,
+ Offset * SB.EltSize, MMO, SB.RS);
// This only ever adds one VGPR spill
- MFI->addToSpilledVGPRs(1);
- }
-
- // Restore EXEC
- BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
- .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill));
-
- // Restore clobbered SGPRs
- if (IsLoad) {
- // Nothing to do; register will be overwritten
- } else if (!IsKill) {
- // Restore SGPRs from appropriate VGPR lanes
- if (!OnlyExecLo) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
- getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1]))
- .addReg(VGPR)
- .addImm(ExecLane + 1);
- }
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
- NumSubRegs == 1 ? SavedExecReg
- : Register(getSubReg(
- SuperReg, SplitParts[FirstPart + ExecLane])))
- .addReg(VGPR, RegState::Kill)
- .addImm(ExecLane);
+ SB.MFI.addToSpilledVGPRs(1);
}
}
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS,
+ LiveIntervals *LIS,
bool OnlyToVGPR) const {
- MachineBasicBlock *MBB = MI->getParent();
- MachineFunction *MF = MBB->getParent();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
- = MFI->getSGPRToVGPRSpills(Index);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
+ SB.MFI.getSGPRToVGPRSpills(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
- const SIInstrInfo *TII = ST.getInstrInfo();
-
- Register SuperReg = MI->getOperand(0).getReg();
- bool IsKill = MI->getOperand(0).isKill();
- const DebugLoc &DL = MI->getDebugLoc();
-
- assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
- SuperReg != MFI->getFrameOffsetReg()));
-
- assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
- assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
- SuperReg != AMDGPU::EXEC && "exec should never spill");
-
- unsigned EltSize = 4;
- const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
-
- ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
- unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+ assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
+ SB.SuperReg != SB.MFI.getFrameOffsetReg()));
if (SpillToVGPR) {
- for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- Register SubReg = NumSubRegs == 1
- ? SuperReg
- : Register(getSubReg(SuperReg, SplitParts[i]));
+ for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
+ Register SubReg =
+ SB.NumSubRegs == 1
+ ? SB.SuperReg
+ : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
- bool UseKill = IsKill && i == NumSubRegs - 1;
+ bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1;
// Mark the "old value of vgpr" input undef only if this is the first sgpr
// spill to this specific vgpr in the first basic block.
- auto MIB =
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
- .addReg(SubReg, getKillRegState(UseKill))
- .addImm(Spill.Lane)
- .addReg(Spill.VGPR);
-
- if (i == 0 && NumSubRegs > 1) {
- // We may be spilling a super-register which is only partially defined,
- // and need to ensure later spills think the value is defined.
- MIB.addReg(SuperReg, RegState::ImplicitDefine);
+ auto MIB = BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
+ Spill.VGPR)
+ .addReg(SubReg, getKillRegState(UseKill))
+ .addImm(Spill.Lane)
+ .addReg(Spill.VGPR);
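+ // Keep LiveIntervals' slot indexes in sync: the first writelane takes over
+ // the spill pseudo's slot, subsequent ones get new slots.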
+ if (LIS) {
+ if (i == 0)
+ LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
+ else
+ LIS->InsertMachineInstrInMaps(*MIB);
}
- if (NumSubRegs > 1)
- MIB.addReg(SuperReg, getKillRegState(UseKill) | RegState::Implicit);
+ if (i == 0 && SB.NumSubRegs > 1) {
+ // We may be spilling a super-register which is only partially defined,
+ // and need to ensure later spills think the value is defined.
+ MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
+ }
+
+ if (SB.NumSubRegs > 1)
+ MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
// FIXME: Since this spills to another register instead of an actual
// frame index, we should delete the frame index when all references to
// it are fixed.
}
} else {
- // Scavenged temporary VGPR to use. It must be scavenged once for any number
- // of spilled subregs.
- Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
- RS->setRegUsed(TmpVGPR);
+ SB.prepare();
- // SubReg carries the "Kill" flag when SubReg == SuperReg.
- unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
+ // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
+ unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
- unsigned PerVGPR = 32;
- unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
- int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
+ // Per VGPR helper data
+ auto PVD = SB.getPerVGPRData();
- for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
+ for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
unsigned TmpVGPRFlags = RegState::Undef;
// Write sub registers into the VGPR
- for (unsigned i = Offset * PerVGPR,
- e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
+ for (unsigned i = Offset * PVD.PerVGPR,
+ e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
i < e; ++i) {
- Register SubReg = NumSubRegs == 1
- ? SuperReg
- : Register(getSubReg(SuperReg, SplitParts[i]));
+ Register SubReg =
+ SB.NumSubRegs == 1
+ ? SB.SuperReg
+ : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
MachineInstrBuilder WriteLane =
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR)
+ BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
+ SB.TmpVGPR)
.addReg(SubReg, SubKillState)
- .addImm(i % PerVGPR)
- .addReg(TmpVGPR, TmpVGPRFlags);
+ .addImm(i % PVD.PerVGPR)
+ .addReg(SB.TmpVGPR, TmpVGPRFlags);
TmpVGPRFlags = 0;
+ if (LIS) {
+ if (i == 0)
+ LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane);
+ else
+ LIS->InsertMachineInstrInMaps(*WriteLane);
+ }
+
// There could be undef components of a spilled super register.
// TODO: Can we detect this and skip the spill?
- if (NumSubRegs > 1) {
- // The last implicit use of the SuperReg carries the "Kill" flag.
+ if (SB.NumSubRegs > 1) {
+ // The last implicit use of the SB.SuperReg carries the "Kill" flag.
unsigned SuperKillState = 0;
- if (i + 1 == NumSubRegs)
- SuperKillState |= getKillRegState(IsKill);
- WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
+ if (i + 1 == SB.NumSubRegs)
+ SuperKillState |= getKillRegState(SB.IsKill);
+ WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
}
}
// Write out VGPR
- buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
- RS, false);
+ SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
}
+
+ SB.restore();
}
MI->eraseFromParent();
- MFI->addToSpilledSGPRs(NumSubRegs);
+ SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
+
+ if (LIS)
+ LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
+
return true;
}
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS,
+ LiveIntervals *LIS,
bool OnlyToVGPR) const {
- MachineFunction *MF = MI->getParent()->getParent();
- MachineBasicBlock *MBB = MI->getParent();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
- = MFI->getSGPRToVGPRSpills(Index);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
+ SB.MFI.getSGPRToVGPRSpills(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
- const SIInstrInfo *TII = ST.getInstrInfo();
- const DebugLoc &DL = MI->getDebugLoc();
-
- Register SuperReg = MI->getOperand(0).getReg();
-
- assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
- assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
- SuperReg != AMDGPU::EXEC && "exec should never spill");
-
- unsigned EltSize = 4;
-
- const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
-
- ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
- unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
-
if (SpillToVGPR) {
- for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- Register SubReg = NumSubRegs == 1
- ? SuperReg
- : Register(getSubReg(SuperReg, SplitParts[i]));
+ for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
+ Register SubReg =
+ SB.NumSubRegs == 1
+ ? SB.SuperReg
+ : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
- auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
- .addReg(Spill.VGPR)
- .addImm(Spill.Lane);
- if (NumSubRegs > 1 && i == 0)
- MIB.addReg(SuperReg, RegState::ImplicitDefine);
+ auto MIB =
+ BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
+ .addReg(Spill.VGPR)
+ .addImm(Spill.Lane);
+ if (SB.NumSubRegs > 1 && i == 0)
+ MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
+ if (LIS) {
+ if (i == e - 1)
+ LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
+ else
+ LIS->InsertMachineInstrInMaps(*MIB);
+ }
+
}
} else {
- Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
- RS->setRegUsed(TmpVGPR);
+ SB.prepare();
- unsigned PerVGPR = 32;
- unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
- int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
+ // Per VGPR helper data
+ auto PVD = SB.getPerVGPRData();
- for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
+ for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// Load in VGPR data
- buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
- RS, true);
+ SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
// Unpack lanes
- for (unsigned i = Offset * PerVGPR,
- e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
+ for (unsigned i = Offset * PVD.PerVGPR,
+ e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
i < e; ++i) {
- Register SubReg = NumSubRegs == 1
- ? SuperReg
- : Register(getSubReg(SuperReg, SplitParts[i]));
+ Register SubReg =
+ SB.NumSubRegs == 1
+ ? SB.SuperReg
+ : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
bool LastSubReg = (i + 1 == e);
- auto MIB =
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
- .addReg(TmpVGPR, getKillRegState(LastSubReg))
- .addImm(i);
- if (NumSubRegs > 1 && i == 0)
- MIB.addReg(SuperReg, RegState::ImplicitDefine);
+ auto MIB = BuildMI(SB.MBB, MI, SB.DL,
+ SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
+ .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
+ .addImm(i);
+ if (SB.NumSubRegs > 1 && i == 0)
+ MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
+ if (LIS) {
+ if (i == e - 1)
+ LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
+ else
+ LIS->InsertMachineInstrInMaps(*MIB);
+ }
}
}
+
+ SB.restore();
}
MI->eraseFromParent();
+
+ if (LIS)
+ LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
+
return true;
}
@@ -1338,28 +1522,31 @@
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
MachineBasicBlock::iterator MI,
int FI,
- RegScavenger *RS) const {
+ RegScavenger *RS,
+ LiveIntervals *LIS) const {
switch (MI->getOpcode()) {
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S224_SAVE:
case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE:
- return spillSGPR(MI, FI, RS, true);
+ return spillSGPR(MI, FI, RS, LIS, true);
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S224_RESTORE:
case AMDGPU::SI_SPILL_S192_RESTORE:
case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE:
- return restoreSGPR(MI, FI, RS, true);
+ return restoreSGPR(MI, FI, RS, LIS, true);
default:
llvm_unreachable("not an SGPR spill instruction");
}
@@ -1389,6 +1576,7 @@
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S224_SAVE:
case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
@@ -1403,6 +1591,7 @@
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S224_RESTORE:
case AMDGPU::SI_SPILL_S192_RESTORE:
case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
@@ -1417,6 +1606,7 @@
case AMDGPU::SI_SPILL_V1024_SAVE:
case AMDGPU::SI_SPILL_V512_SAVE:
case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V224_SAVE:
case AMDGPU::SI_SPILL_V192_SAVE:
case AMDGPU::SI_SPILL_V160_SAVE:
case AMDGPU::SI_SPILL_V128_SAVE:
@@ -1426,6 +1616,7 @@
case AMDGPU::SI_SPILL_A1024_SAVE:
case AMDGPU::SI_SPILL_A512_SAVE:
case AMDGPU::SI_SPILL_A256_SAVE:
+ case AMDGPU::SI_SPILL_A224_SAVE:
case AMDGPU::SI_SPILL_A192_SAVE:
case AMDGPU::SI_SPILL_A160_SAVE:
case AMDGPU::SI_SPILL_A128_SAVE:
@@ -1439,13 +1630,11 @@
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
- buildSpillLoadStore(MI, Opc,
- Index,
- VData->getReg(), VData->isKill(),
- FrameReg,
- TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
- *MI->memoperands_begin(),
- RS);
+ auto *MBB = MI->getParent();
+ buildSpillLoadStore(
+ *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(), RS);
MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
MI->eraseFromParent();
break;
@@ -1456,6 +1645,7 @@
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_V160_RESTORE:
case AMDGPU::SI_SPILL_V192_RESTORE:
+ case AMDGPU::SI_SPILL_V224_RESTORE:
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_V512_RESTORE:
case AMDGPU::SI_SPILL_V1024_RESTORE:
@@ -1465,6 +1655,7 @@
case AMDGPU::SI_SPILL_A128_RESTORE:
case AMDGPU::SI_SPILL_A160_RESTORE:
case AMDGPU::SI_SPILL_A192_RESTORE:
+ case AMDGPU::SI_SPILL_A224_RESTORE:
case AMDGPU::SI_SPILL_A256_RESTORE:
case AMDGPU::SI_SPILL_A512_RESTORE:
case AMDGPU::SI_SPILL_A1024_RESTORE: {
@@ -1475,18 +1666,17 @@
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
: AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
- buildSpillLoadStore(MI, Opc,
- Index,
- VData->getReg(), VData->isKill(),
- FrameReg,
- TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
- *MI->memoperands_begin(),
- RS);
+ auto *MBB = MI->getParent();
+ buildSpillLoadStore(
+ *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(), RS);
MI->eraseFromParent();
break;
}
default: {
+      // Other accesses to the frame index
const DebugLoc &DL = MI->getDebugLoc();
int64_t Offset = FrameInfo.getObjectOffset(Index);
@@ -1507,7 +1697,7 @@
TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
int64_t NewOffset = Offset + OffsetOp->getImm();
if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
- true)) {
+ SIInstrFlags::FlatScratch)) {
OffsetOp->setImm(NewOffset);
if (FrameReg)
return;
@@ -1580,9 +1770,9 @@
FIOp.setIsKill(false);
}
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg)
- .addReg(FrameReg)
- .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
+ .addReg(FrameReg)
+ .addImm(Offset);
if (!UseSGPR)
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
@@ -1590,10 +1780,10 @@
if (TmpSReg == FrameReg) {
// Undo frame register modification.
- BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32),
+ BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
FrameReg)
- .addReg(FrameReg)
- .addImm(Offset);
+ .addReg(FrameReg)
+ .addImm(-Offset);
}
return;
@@ -1667,17 +1857,17 @@
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
.addReg(FrameReg)
.addImm(ST.getWavefrontSizeLog2());
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
- .addReg(ScaledReg, RegState::Kill)
- .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(Offset);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
.addReg(ScaledReg, RegState::Kill);
// If there were truly no free SGPRs, we need to undo everything.
if (!TmpScaledReg.isValid()) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
- .addReg(ScaledReg, RegState::Kill)
- .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(-Offset);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
.addReg(FrameReg)
.addImm(ST.getWavefrontSizeLog2());
@@ -1735,14 +1925,8 @@
return AMDGPUInstPrinter::getRegisterName(Reg);
}
-const TargetRegisterClass *
-SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth == 1)
- return &AMDGPU::VReg_1RegClass;
- if (BitWidth <= 16)
- return &AMDGPU::VGPR_LO16RegClass;
- if (BitWidth <= 32)
- return &AMDGPU::VGPR_32RegClass;
+static const TargetRegisterClass *
+getAnyVGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth <= 64)
return &AMDGPU::VReg_64RegClass;
if (BitWidth <= 96)
@@ -1753,6 +1937,8 @@
return &AMDGPU::VReg_160RegClass;
if (BitWidth <= 192)
return &AMDGPU::VReg_192RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::VReg_224RegClass;
if (BitWidth <= 256)
return &AMDGPU::VReg_256RegClass;
if (BitWidth <= 512)
@@ -1763,12 +1949,44 @@
return nullptr;
}
+static const TargetRegisterClass *
+getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
+ if (BitWidth <= 64)
+ return &AMDGPU::VReg_64_Align2RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::VReg_96_Align2RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::VReg_128_Align2RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::VReg_160_Align2RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::VReg_192_Align2RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::VReg_224_Align2RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::VReg_256_Align2RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::VReg_512_Align2RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::VReg_1024_Align2RegClass;
+
+ return nullptr;
+}
+
const TargetRegisterClass *
-SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) {
+SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
+ if (BitWidth == 1)
+ return &AMDGPU::VReg_1RegClass;
if (BitWidth <= 16)
- return &AMDGPU::AGPR_LO16RegClass;
+ return &AMDGPU::VGPR_LO16RegClass;
if (BitWidth <= 32)
- return &AMDGPU::AGPR_32RegClass;
+ return &AMDGPU::VGPR_32RegClass;
+ return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
+ : getAnyVGPRClassForBitWidth(BitWidth);
+}
+
+static const TargetRegisterClass *
+getAnyAGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth <= 64)
return &AMDGPU::AReg_64RegClass;
if (BitWidth <= 96)
@@ -1779,6 +1997,8 @@
return &AMDGPU::AReg_160RegClass;
if (BitWidth <= 192)
return &AMDGPU::AReg_192RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::AReg_224RegClass;
if (BitWidth <= 256)
return &AMDGPU::AReg_256RegClass;
if (BitWidth <= 512)
@@ -1789,6 +2009,40 @@
return nullptr;
}
+static const TargetRegisterClass *
+getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
+ if (BitWidth <= 64)
+ return &AMDGPU::AReg_64_Align2RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::AReg_96_Align2RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::AReg_128_Align2RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::AReg_160_Align2RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::AReg_192_Align2RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::AReg_224_Align2RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::AReg_256_Align2RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::AReg_512_Align2RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::AReg_1024_Align2RegClass;
+
+ return nullptr;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
+ if (BitWidth <= 16)
+ return &AMDGPU::AGPR_LO16RegClass;
+ if (BitWidth <= 32)
+ return &AMDGPU::AGPR_32RegClass;
+ return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
+ : getAnyAGPRClassForBitWidth(BitWidth);
+}
+
const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth <= 16)
@@ -1805,6 +2059,8 @@
return &AMDGPU::SGPR_160RegClass;
if (BitWidth <= 192)
return &AMDGPU::SGPR_192RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::SGPR_224RegClass;
if (BitWidth <= 256)
return &AMDGPU::SGPR_256RegClass;
if (BitWidth <= 512)
@@ -1827,29 +2083,51 @@
&AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
&AMDGPU::AGPR_32RegClass,
+ &AMDGPU::AGPR_32RegClass,
+ &AMDGPU::VReg_64_Align2RegClass,
&AMDGPU::VReg_64RegClass,
&AMDGPU::SReg_64RegClass,
+ &AMDGPU::AReg_64_Align2RegClass,
&AMDGPU::AReg_64RegClass,
+ &AMDGPU::VReg_96_Align2RegClass,
&AMDGPU::VReg_96RegClass,
&AMDGPU::SReg_96RegClass,
+ &AMDGPU::AReg_96_Align2RegClass,
&AMDGPU::AReg_96RegClass,
+ &AMDGPU::VReg_128_Align2RegClass,
&AMDGPU::VReg_128RegClass,
&AMDGPU::SReg_128RegClass,
+ &AMDGPU::AReg_128_Align2RegClass,
&AMDGPU::AReg_128RegClass,
+ &AMDGPU::VReg_160_Align2RegClass,
&AMDGPU::VReg_160RegClass,
&AMDGPU::SReg_160RegClass,
+ &AMDGPU::AReg_160_Align2RegClass,
&AMDGPU::AReg_160RegClass,
+ &AMDGPU::VReg_192_Align2RegClass,
&AMDGPU::VReg_192RegClass,
&AMDGPU::SReg_192RegClass,
+ &AMDGPU::AReg_192_Align2RegClass,
&AMDGPU::AReg_192RegClass,
+ &AMDGPU::VReg_224_Align2RegClass,
+ &AMDGPU::VReg_224RegClass,
+ &AMDGPU::SReg_224RegClass,
+ &AMDGPU::AReg_224_Align2RegClass,
+ &AMDGPU::AReg_224RegClass,
+ &AMDGPU::VReg_256_Align2RegClass,
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,
+ &AMDGPU::AReg_256_Align2RegClass,
&AMDGPU::AReg_256RegClass,
+ &AMDGPU::VReg_512_Align2RegClass,
&AMDGPU::VReg_512RegClass,
&AMDGPU::SReg_512RegClass,
+ &AMDGPU::AReg_512_Align2RegClass,
&AMDGPU::AReg_512RegClass,
&AMDGPU::SReg_1024RegClass,
+ &AMDGPU::VReg_1024_Align2RegClass,
&AMDGPU::VReg_1024RegClass,
+ &AMDGPU::AReg_1024_Align2RegClass,
&AMDGPU::AReg_1024RegClass,
&AMDGPU::SCC_CLASSRegClass,
&AMDGPU::Pseudo_SReg_32RegClass,
@@ -1949,6 +2227,16 @@
return RC;
}
+const TargetRegisterClass *
+SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
+ const TargetRegisterClass *SubRC,
+ unsigned SubIdx) const {
+ // Ensure this subregister index is aligned in the super register.
+ const TargetRegisterClass *MatchRC =
+ getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
+ return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
+}
+
bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
@@ -2147,6 +2435,12 @@
return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}
+const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
+ // VGPR tuples have an alignment requirement on gfx90a variants.
+ return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
+ : &AMDGPU::VReg_64RegClass;
+}
+
const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
switch ((int)RCID) {
@@ -2234,6 +2528,18 @@
return AMDGPU::NoRegister;
}
+bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
+ if (!ST.needsAlignedVGPRs())
+ return true;
+
+ if (hasVGPRs(&RC))
+ return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
+ if (hasAGPRs(&RC))
+ return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
+
+ return true;
+}
+
bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
switch (PhysReg) {
case AMDGPU::SGPR_NULL:
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/src/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 963da9b..2a92051 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -21,7 +21,9 @@
class GCNSubtarget;
class LiveIntervals;
+class LivePhysRegs;
class RegisterBank;
+struct SGPRSpillBuilder;
class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
@@ -79,7 +81,7 @@
bool hasBasePointer(const MachineFunction &MF) const;
Register getBaseRegister() const;
- bool canRealignStack(const MachineFunction &MF) const override;
+ bool shouldRealignStack(const MachineFunction &MF) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
@@ -106,18 +108,18 @@
const TargetRegisterClass *getPointerRegClass(
const MachineFunction &MF, unsigned Kind = 0) const override;
- void buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index,
- int Offset, unsigned EltSize, Register VGPR,
- int64_t VGPRLanes, RegScavenger *RS,
- bool IsLoad) const;
+ void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset,
+ bool IsLoad, bool IsKill = true) const;
/// If \p OnlyToVGPR is true, this will only succeed if this
bool spillSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS,
+ LiveIntervals *LIS = nullptr,
bool OnlyToVGPR = false) const;
bool restoreSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS,
+ LiveIntervals *LIS = nullptr,
bool OnlyToVGPR = false) const;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
@@ -125,7 +127,8 @@
RegScavenger *RS) const override;
bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
- int FI, RegScavenger *RS) const;
+ int FI, RegScavenger *RS,
+ LiveIntervals *LIS = nullptr) const;
StringRef getRegAsmName(MCRegister Reg) const override;
@@ -134,8 +137,13 @@
return getEncodingValue(Reg) & 0xff;
}
- static const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth);
- static const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth);
+ LLVM_READONLY
+ const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth) const;
+
+ LLVM_READONLY
+ const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth) const;
+
+ LLVM_READONLY
static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth);
/// Return the 'base' register class for this register.
@@ -182,12 +190,21 @@
const TargetRegisterClass *
getEquivalentSGPRClass(const TargetRegisterClass *VRC) const;
- /// \returns The register class that is used for a sub-register of \p RC for
- /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will
- /// be returned.
+ /// \returns The canonical register class that is used for a sub-register of
+ /// \p RC for the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC
+ /// will be returned.
const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
unsigned SubIdx) const;
+ /// Returns a register class which is compatible with \p SuperRC, such that a
+ /// subregister exists with class \p SubRC with subregister index \p
+ /// SubIdx. If this is impossible (e.g., an unaligned subregister index within
+ /// a register tuple), return null.
+ const TargetRegisterClass *
+ getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
+ const TargetRegisterClass *SubRC,
+ unsigned SubIdx) const;
+
bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
unsigned DefSubReg,
const TargetRegisterClass *SrcRC,
@@ -268,6 +285,10 @@
: &AMDGPU::SReg_64_XEXECRegClass;
}
+ // Return the appropriate register class to use for 64-bit VGPRs for the
+ // subtarget.
+ const TargetRegisterClass *getVGPR64Class() const;
+
MCRegister getVCC() const;
const TargetRegisterClass *getRegClass(unsigned RCID) const;
@@ -279,6 +300,8 @@
LiveIntervals *LIS) const;
const uint32_t *getAllVGPRRegMask() const;
+ const uint32_t *getAllAGPRRegMask() const;
+ const uint32_t *getAllVectorRegMask() const;
const uint32_t *getAllAllocatableSRegMask() const;
// \returns number of 32 bit registers covered by a \p LM
@@ -306,6 +329,10 @@
// \returns \p Reg otherwise.
MCPhysReg get32BitRegister(MCPhysReg Reg) const;
+ // Returns true if a given register class is properly aligned for
+ // the subtarget.
+ bool isProperlyAlignedRC(const TargetRegisterClass &RC) const;
+
/// Return all SGPR128 which satisfy the waves per execution unit requirement
/// of the subtarget.
ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const;
@@ -318,16 +345,16 @@
/// of the subtarget.
ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;
-private:
- void buildSpillLoadStore(MachineBasicBlock::iterator MI,
- unsigned LoadStoreOp,
- int Index,
- Register ValueReg,
- bool ValueIsKill,
- MCRegister ScratchOffsetReg,
- int64_t InstrOffset,
- MachineMemOperand *MMO,
- RegScavenger *RS) const;
+ // Insert spill or restore instructions.
+ // When lowering spill pseudos, the RegScavenger should be set.
+ // For creating spill instructions during frame lowering, where no scavenger
+ // is available, LiveRegs can be used.
+ void buildSpillLoadStore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, unsigned LoadStoreOp,
+ int Index, Register ValueReg, bool ValueIsKill,
+ MCRegister ScratchOffsetReg, int64_t InstrOffset,
+ MachineMemOperand *MMO, RegScavenger *RS,
+ LivePhysRegs *LiveRegs = nullptr) const;
};
} // End namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/src/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 92390f1..6e3c4e8 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -36,12 +36,12 @@
foreach Size = {2...6,8,16} in {
foreach Index = Indexes<!sub(33, Size)>.slice in {
- def !foldl("", Indexes<Size>.slice, acc, cur,
- !strconcat(acc#!if(!eq(acc,""),"","_"), "sub"#!add(cur, Index))) :
+ def !interleave(!foreach(cur, Indexes<Size>.slice, "sub"#!add(cur, Index)),
+ "_") :
SubRegIndex<!mul(Size, 32), !shl(Index, 5)> {
let CoveringSubRegIndices =
- !foldl([]<SubRegIndex>, Indexes<Size>.slice, acc, cur,
- !listconcat(acc, [!cast<SubRegIndex>(sub#!add(cur, Index))]));
+ !foreach(cur, Indexes<Size>.slice,
+ !cast<SubRegIndex>(sub#!add(cur, Index)));
}
}
}
@@ -58,6 +58,7 @@
list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4];
list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5];
+ list<SubRegIndex> ret7 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6];
list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
sub4, sub5, sub6, sub7,
@@ -77,9 +78,10 @@
!if(!eq(size, 4), ret4,
!if(!eq(size, 5), ret5,
!if(!eq(size, 6), ret6,
- !if(!eq(size, 8), ret8,
- !if(!eq(size, 16), ret16,
- ret32)))))));
+ !if(!eq(size, 7), ret7,
+ !if(!eq(size, 8), ret8,
+ !if(!eq(size, 16), ret16,
+ ret32))))))));
}
// Generates list of sequential register tuple names.
@@ -350,9 +352,12 @@
// SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs.
def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">;
-// SGPR 192-bit registers
+// SGPR 192-bit registers. No operations use these, but for symmetry with 192-bit VGPRs.
def SGPR_192Regs : SIRegisterTuples<getSubRegs<6>.ret, SGPR_32, 105, 4, 6, "s">;
+// SGPR 224-bit registers. No operations use these, but for symmetry with 224-bit VGPRs.
+def SGPR_224Regs : SIRegisterTuples<getSubRegs<7>.ret, SGPR_32, 105, 4, 7, "s">;
+
// SGPR 256-bit registers
def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;
@@ -368,6 +373,7 @@
let isAllocatable = 0;
}
+// Trap handler TMP 16-bit registers
def TTMP_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16,
(add (sequence "TTMP%u_LO16", 0, 15))> {
let Size = 16;
@@ -377,11 +383,25 @@
// Trap handler TMP 64-bit registers
def TTMP_64Regs : SIRegisterTuples<getSubRegs<2>.ret, TTMP_32, 15, 2, 2, "ttmp">;
+// Trap handler TMP 96-bit registers
+def TTMP_96Regs : SIRegisterTuples<getSubRegs<3>.ret, TTMP_32, 15, 3, 3, "ttmp">;
+
// Trap handler TMP 128-bit registers
def TTMP_128Regs : SIRegisterTuples<getSubRegs<4>.ret, TTMP_32, 15, 4, 4, "ttmp">;
+// Trap handler TMP 160-bit registers
+def TTMP_160Regs : SIRegisterTuples<getSubRegs<5>.ret, TTMP_32, 15, 4, 5, "ttmp">;
+
+// Trap handler TMP 192-bit registers
+def TTMP_192Regs : SIRegisterTuples<getSubRegs<6>.ret, TTMP_32, 15, 4, 6, "ttmp">;
+
+// Trap handler TMP 224-bit registers
+def TTMP_224Regs : SIRegisterTuples<getSubRegs<7>.ret, TTMP_32, 15, 4, 7, "ttmp">;
+
+// Trap handler TMP 256-bit registers
def TTMP_256Regs : SIRegisterTuples<getSubRegs<8>.ret, TTMP_32, 15, 4, 8, "ttmp">;
+// Trap handler TMP 512-bit registers
def TTMP_512Regs : SIRegisterTuples<getSubRegs<16>.ret, TTMP_32, 15, 4, 16, "ttmp">;
class TmpRegTuplesBase<int index, int size,
@@ -508,6 +528,9 @@
// VGPR 192-bit registers
def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">;
+// VGPR 224-bit registers
+def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 255, 1, 7, "v">;
+
// VGPR 256-bit registers
def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
@@ -547,6 +570,9 @@
// AGPR 192-bit registers
def AGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, AGPR_32, 255, 1, 6, "a">;
+// AGPR 224-bit registers
+def AGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, AGPR_32, 255, 1, 7, "a">;
+
// AGPR 256-bit registers
def AGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, AGPR_32, 255, 1, 8, "a">;
@@ -682,111 +708,53 @@
let isAllocatable = 0;
}
-// Requires 2 s_mov_b64 to copy
-let CopyCost = 2 in {
+multiclass SRegClass<int numRegs, int priority,
+ list<ValueType> regTypes,
+ SIRegisterTuples regList,
+ SIRegisterTuples ttmpList = regList,
+ int copyCost = !sra(!add(numRegs, 1), 1)> {
+ defvar hasTTMP = !ne(regList, ttmpList);
+ defvar suffix = !cast<string>(!mul(numRegs, 32));
+ defvar sgprName = !strconcat("SGPR_", suffix);
+ defvar ttmpName = !strconcat("TTMP_", suffix);
-// There are no 3-component scalar instructions, but this is needed
-// for symmetry with VGPRs.
-def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
- (add SGPR_96Regs)> {
- let AllocationPriority = 14;
+ let AllocationPriority = priority, CopyCost = copyCost in {
+ def "" # sgprName : RegisterClass<"AMDGPU", regTypes, 32, (add regList)> {
+ }
+
+ if hasTTMP then {
+ def "" # ttmpName : RegisterClass<"AMDGPU", regTypes, 32, (add ttmpList)> {
+ let isAllocatable = 0;
+ }
+ }
+
+ def SReg_ # suffix :
+ RegisterClass<"AMDGPU", regTypes, 32,
+ !con(!dag(add, [!cast<RegisterClass>(sgprName)], ["sgpr"]),
+ !if(hasTTMP,
+ !dag(add, [!cast<RegisterClass>(ttmpName)], ["ttmp"]),
+ (add)))> {
+ let isAllocatable = 0;
+ }
+ }
}
-def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
- (add SGPR_96)> {
- let AllocationPriority = 14;
-}
-
-def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
- (add SGPR_128Regs)> {
- let AllocationPriority = 15;
-}
-
-def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
- (add TTMP_128Regs)> {
- let isAllocatable = 0;
-}
-
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add SGPR_128, TTMP_128)> {
- let isAllocatable = 0;
-}
-
-} // End CopyCost = 2
-
-// There are no 5-component scalar instructions, but this is needed
-// for symmetry with VGPRs.
-def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
- (add SGPR_160Regs)> {
- let AllocationPriority = 16;
-}
-
-def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
- (add SGPR_160)> {
- // FIXME: Should be isAllocatable = 0, but that causes all TableGen-generated
- // subclasses of SGPR_160 to be marked unallocatable too.
-}
-
-def SGPR_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192Regs)> {
- let Size = 192;
- let AllocationPriority = 17;
-}
-
-def SReg_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192)> {
- let Size = 192;
- let isAllocatable = 0;
-}
-
-def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add SGPR_256Regs)> {
- let AllocationPriority = 18;
-}
-
-def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add TTMP_256Regs)> {
- let isAllocatable = 0;
-}
-
-def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32,
- (add SGPR_256, TTMP_256)> {
- // Requires 4 s_mov_b64 to copy
- let CopyCost = 4;
- let isAllocatable = 0;
-}
-
-def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
- (add SGPR_512Regs)> {
- let AllocationPriority = 19;
-}
-
-def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
- (add TTMP_512Regs)> {
- let isAllocatable = 0;
-}
-
-def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
- (add SGPR_512, TTMP_512)> {
- // Requires 8 s_mov_b64 to copy
- let CopyCost = 8;
- let isAllocatable = 0;
-}
+defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
+defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64], SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
+defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
+defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
+defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64], SGPR_256Regs, TTMP_256Regs>;
+defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
+defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add VGPR_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
}
-def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
- (add SGPR_1024Regs)> {
- let AllocationPriority = 20;
-}
-
-def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
- (add SGPR_1024)> {
- let CopyCost = 16;
- let isAllocatable = 0;
-}
-
// Register class for all vector registers (VGPRs + Interpolation Registers)
-class VRegClass<int numRegs, list<ValueType> regTypes, dag regList> :
+class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
RegisterClass<"AMDGPU", regTypes, 32, regList> {
let Size = !mul(numRegs, 32);
@@ -796,31 +764,48 @@
let Weight = numRegs;
}
-def VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
- (add VGPR_64)>;
-def VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
-def VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, i128], (add VGPR_128)>;
-def VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
-def VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>;
-def VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
-def VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
-def VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
+// Define a register tuple class, along with one requiring an even
+// aligned base register.
+multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
+ // Define the regular class.
+ def "" : VRegClassBase<numRegs, regTypes, regList>;
-class ARegClass<int numRegs, list<ValueType> regTypes, dag regList> :
- VRegClass<numRegs, regTypes, regList> {
- // Requires n v_accvgpr_write and n v_accvgpr_read to copy + burn 1 vgpr
- let CopyCost = !add(numRegs, numRegs, 1);
+ // Define 2-aligned variant
+ def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;
}
-def AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
+defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
+ (add VGPR_64)>;
+defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
+defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>;
+defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
+
+defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
+defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
+defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
+defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
+defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
+
+multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
+ let CopyCost = !add(numRegs, numRegs, 1) in {
+ // Define the regular class.
+ def "" : VRegClassBase<numRegs, regTypes, regList>;
+
+ // Define 2-aligned variant
+ def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;
+ }
+}
+
+defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
(add AGPR_64)>;
-def AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
-def AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
-def AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
-def AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>;
-def AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
-def AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
-def AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;
+defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
+defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
+defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
+defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
+defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
+defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
+defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
+defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;
} // End GeneratePressureSet = 0
@@ -847,21 +832,36 @@
let isAllocatable = 0;
}
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64, v2f32], 32, (add VReg_64, SReg_64)> {
let isAllocatable = 0;
}
-def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def AV_32 : RegisterClass<"AMDGPU", VGPR_32.RegTypes, 32,
(add AGPR_32, VGPR_32)> {
let isAllocatable = 0;
}
-def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32,
+def AV_64 : RegisterClass<"AMDGPU", VReg_64.RegTypes, 32,
(add AReg_64, VReg_64)> {
let isAllocatable = 0;
}
} // End GeneratePressureSet = 0
+def AV_96 : RegisterClass<"AMDGPU", VReg_96.RegTypes, 32,
+ (add AReg_96, VReg_96)> {
+ let isAllocatable = 0;
+}
+
+def AV_128 : RegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
+ (add AReg_128, VReg_128)> {
+ let isAllocatable = 0;
+}
+
+def AV_160 : RegisterClass<"AMDGPU", VReg_160.RegTypes, 32,
+ (add AReg_160, VReg_160)> {
+ let isAllocatable = 0;
+}
+
//===----------------------------------------------------------------------===//
// Register operands
//===----------------------------------------------------------------------===//
@@ -912,21 +912,38 @@
}
}
-multiclass SIRegOperand <string rc, string MatchName, string opType> :
- SIRegOperand32<rc, MatchName, opType> {
+multiclass SIRegOperand64 <string rc, string MatchName, string opType,
+ string rc_suffix = "_64", bit Vectors = 1> {
let OperandNamespace = "AMDGPU" in {
- def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+ def _b64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_INT64";
let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
}
- def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+ def _f64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_FP64";
let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
}
+
+ if Vectors then
+ def _v2f32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
+ let OperandType = opType#"_V2FP32";
+ let ParserMatchClass = RegImmMatcher<MatchName#"V2FP32">;
+ let DecoderMethod = "decodeOperand_VSrcV232";
+ }
+ if Vectors then
+ def _v2b32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
+ let OperandType = opType#"_V2INT32";
+ let ParserMatchClass = RegImmMatcher<MatchName#"V2INT32">;
+ let DecoderMethod = "decodeOperand_VSrcV232";
+ }
}
}
+multiclass SIRegOperand <string rc, string MatchName, string opType> :
+ SIRegOperand32<rc, MatchName, opType>,
+ SIRegOperand64<rc, MatchName, opType>;
+
// FIXME: 64-bit sources can sometimes use 32-bit constants.
multiclass RegImmOperand <string rc, string MatchName>
: SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">;
@@ -938,10 +955,18 @@
string rc_suffix = "_32">
: SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>;
+multiclass RegInlineOperand64 <string rc, string MatchName,
+ string rc_suffix = "_64">
+ : SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>;
+
multiclass RegInlineOperandAC <string rc, string MatchName,
string rc_suffix = "_32">
: SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix>;
+multiclass RegInlineOperandAC64 <string rc, string MatchName,
+ string rc_suffix = "_64">
+ : SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix, 0>;
+
//===----------------------------------------------------------------------===//
// SSrc_* Operands with an SGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
@@ -971,7 +996,7 @@
}
//===----------------------------------------------------------------------===//
-// VSrc_* Operands with an VGPR
+// VRegSrc_* Operands with a VGPR
//===----------------------------------------------------------------------===//
// This is for operands with the enum(9), VSrc encoding restriction,
@@ -1001,6 +1026,13 @@
//===----------------------------------------------------------------------===//
defm VISrc : RegInlineOperand32<"VGPR", "VISrc">;
+let DecoderMethod = "decodeOperand_VReg_64" in
+defm VISrc_64 : RegInlineOperand64<"VReg", "VISrc_64", "_64">;
+defm VISrc_128 : RegInlineOperandAC<"VReg", "VISrc_128", "_128">;
+let DecoderMethod = "decodeOperand_VReg_256" in
+defm VISrc_256 : RegInlineOperand64<"VReg", "VISrc_256", "_256">;
+defm VISrc_512 : RegInlineOperandAC<"VReg", "VISrc_512", "_512">;
+defm VISrc_1024 : RegInlineOperandAC<"VReg", "VISrc_1024", "_1024">;
//===----------------------------------------------------------------------===//
// AVSrc_* Operands with an AGPR or VGPR
@@ -1016,6 +1048,31 @@
let EncoderMethod = "getAVOperandEncoding";
}
+def AVLdSt_32 : RegisterOperand<AV_32> {
+ let DecoderMethod = "DecodeAVLdSt_32RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVLdSt_64 : RegisterOperand<AV_64> {
+ let DecoderMethod = "DecodeAVLdSt_64RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVLdSt_96 : RegisterOperand<AV_96> {
+ let DecoderMethod = "DecodeAVLdSt_96RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVLdSt_128 : RegisterOperand<AV_128> {
+ let DecoderMethod = "DecodeAVLdSt_128RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVLdSt_160 : RegisterOperand<AV_160> {
+ let DecoderMethod = "DecodeAVLdSt_160RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
//===----------------------------------------------------------------------===//
// ACSrc_* Operands with an AGPR or an inline constant
//===----------------------------------------------------------------------===//
@@ -1024,3 +1081,8 @@
defm AISrc_128 : RegInlineOperandAC<"AReg", "AISrc_128", "_128">;
defm AISrc_512 : RegInlineOperandAC<"AReg", "AISrc_512", "_512">;
defm AISrc_1024 : RegInlineOperandAC<"AReg", "AISrc_1024", "_1024">;
+
+let DecoderMethod = "decodeOperand_AReg_64" in
+defm AISrc_64 : RegInlineOperandAC64<"AReg", "AISrc_64", "_64">;
+let DecoderMethod = "decodeOperand_AReg_256" in
+defm AISrc_256 : RegInlineOperandAC64<"AReg", "AISrc_256", "_256">;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
deleted file mode 100644
index d30ff4a..0000000
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-//===-- SIRemoveShortExecBranches.cpp ------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass optmizes the s_cbranch_execz instructions.
-/// The pass removes this skip instruction for short branches,
-/// if there is no unwanted sideeffect in the fallthrough code sequence.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/Support/CommandLine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-remove-short-exec-branches"
-
-static unsigned SkipThreshold;
-
-static cl::opt<unsigned, true> SkipThresholdFlag(
- "amdgpu-skip-threshold", cl::Hidden,
- cl::desc(
- "Number of instructions before jumping over divergent control flow"),
- cl::location(SkipThreshold), cl::init(12));
-
-namespace {
-
-class SIRemoveShortExecBranches : public MachineFunctionPass {
-private:
- const SIInstrInfo *TII = nullptr;
- bool getBlockDestinations(MachineBasicBlock &SrcMBB,
- MachineBasicBlock *&TrueMBB,
- MachineBasicBlock *&FalseMBB,
- SmallVectorImpl<MachineOperand> &Cond);
- bool mustRetainExeczBranch(const MachineBasicBlock &From,
- const MachineBasicBlock &To) const;
- bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
-
-public:
- static char ID;
-
- SIRemoveShortExecBranches() : MachineFunctionPass(ID) {
- initializeSIRemoveShortExecBranchesPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIRemoveShortExecBranches, DEBUG_TYPE,
- "SI remove short exec branches", false, false)
-
-char SIRemoveShortExecBranches::ID = 0;
-
-char &llvm::SIRemoveShortExecBranchesID = SIRemoveShortExecBranches::ID;
-
-bool SIRemoveShortExecBranches::getBlockDestinations(
- MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
- MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
- if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
- return false;
-
- if (!FalseMBB)
- FalseMBB = SrcMBB.getNextNode();
-
- return true;
-}
-
-bool SIRemoveShortExecBranches::mustRetainExeczBranch(
- const MachineBasicBlock &From, const MachineBasicBlock &To) const {
- unsigned NumInstr = 0;
- const MachineFunction *MF = From.getParent();
-
- for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
- MBBI != End && MBBI != ToI; ++MBBI) {
- const MachineBasicBlock &MBB = *MBBI;
-
- for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- // When a uniform loop is inside non-uniform control flow, the branch
- // leaving the loop might never be taken when EXEC = 0.
- // Hence we should retain cbranch out of the loop lest it become infinite.
- if (I->isConditionalBranch())
- return true;
-
- if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
- return true;
-
- if (TII->isKillTerminator(I->getOpcode()))
- return true;
-
- // These instructions are potentially expensive even if EXEC = 0.
- if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
- I->getOpcode() == AMDGPU::S_WAITCNT)
- return true;
-
- ++NumInstr;
- if (NumInstr >= SkipThreshold)
- return true;
- }
- }
-
- return false;
-}
-
-// Returns true if the skip branch instruction is removed.
-bool SIRemoveShortExecBranches::removeExeczBranch(MachineInstr &MI,
- MachineBasicBlock &SrcMBB) {
- MachineBasicBlock *TrueMBB = nullptr;
- MachineBasicBlock *FalseMBB = nullptr;
- SmallVector<MachineOperand, 1> Cond;
-
- if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
- return false;
-
- // Consider only the forward branches.
- if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
- mustRetainExeczBranch(*FalseMBB, *TrueMBB))
- return false;
-
- LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
- MI.eraseFromParent();
- SrcMBB.removeSuccessor(TrueMBB);
-
- return true;
-}
-
-bool SIRemoveShortExecBranches::runOnMachineFunction(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- TII = ST.getInstrInfo();
- MF.RenumberBlocks();
- bool Changed = false;
-
- for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
- if (MBBI == MBB.end())
- continue;
-
- MachineInstr &MI = *MBBI;
- switch (MI.getOpcode()) {
- case AMDGPU::S_CBRANCH_EXECZ:
- Changed = removeExeczBranch(MI, MBB);
- break;
- default:
- break;
- }
- }
-
- return Changed;
-}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td b/src/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td
index db4a009..b24c061 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -54,10 +54,15 @@
// Half rate 64-bit instructions.
def Write64Bit : SchedWrite;
+// Integer multiplications.
+def WriteIntMul : SchedWrite;
+
// mAI multipass instructions.
def Write2PassMAI : SchedWrite;
def Write8PassMAI : SchedWrite;
def Write16PassMAI : SchedWrite;
+def Write4PassDGEMM : SchedWrite;
+def Write8PassDGEMM : SchedWrite;
// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
@@ -80,6 +85,7 @@
def SIFullSpeedModel : SISchedMachineModel;
def SIQuarterSpeedModel : SISchedMachineModel;
+def SIDPFullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;
// XXX: Are the resource counts correct?
@@ -101,6 +107,9 @@
def HWVALU : ProcResource<1> {
let BufferSize = 1;
}
+def HWTransVALU : ProcResource<1> { // Transcendental VALU
+ let BufferSize = 1;
+}
def HWRC : ProcResource<1> { // Register destination cache
let BufferSize = 1;
}
@@ -137,11 +146,13 @@
def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???
def : HWVALUWriteRes<Write32Bit, 1>;
- def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteFloatCvt, 4>;
def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+ def : HWVALUWriteRes<Write4PassDGEMM, 4>;
+ def : HWVALUWriteRes<Write8PassDGEMM, 16>;
+
let ResourceCycles = [2] in
def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;
let ResourceCycles = [8] in
@@ -150,7 +161,6 @@
def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;
def : ReadAdvance<MIVGPRRead, -2>;
- def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
// Technically mfma reads can be from 0 to 4 cycles but that does not make
// sense to model because its register setup is huge. In particular if we
@@ -159,10 +169,6 @@
// need to consume 2 or 4 more vgprs to be initialized before the acc
// write sequence. Just assume worst case here.
def : ReadAdvance<MIMFMARead, -4>;
-
- def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
- def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
- def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
}
def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
@@ -176,11 +182,13 @@
defm : SICommonWriteRes;
-def : HWVALUWriteRes<WriteFloatFMA, 1>;
-def : HWVALUWriteRes<WriteDouble, 4>;
-def : HWVALUWriteRes<WriteDoubleAdd, 2>;
-def : HWVALUWriteRes<WriteDoubleCvt, 4>;
-def : HWVALUWriteRes<WriteTrans64, 4>;
+def : HWVALUWriteRes<Write64Bit, 2>;
+def : HWVALUWriteRes<WriteIntMul, 4>;
+def : HWVALUWriteRes<WriteFloatFMA, 1>;
+def : HWVALUWriteRes<WriteDouble, 4>;
+def : HWVALUWriteRes<WriteDoubleAdd, 2>;
+def : HWVALUWriteRes<WriteDoubleCvt, 4>;
+def : HWVALUWriteRes<WriteTrans64, 4>;
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -190,16 +198,44 @@
defm : SICommonWriteRes;
-def : HWVALUWriteRes<WriteFloatFMA, 16>;
-def : HWVALUWriteRes<WriteDouble, 16>;
-def : HWVALUWriteRes<WriteDoubleAdd, 8>;
-def : HWVALUWriteRes<WriteDoubleCvt, 4>;
-def : HWVALUWriteRes<WriteTrans64, 16>;
+def : HWVALUWriteRes<Write64Bit, 2>;
+def : HWVALUWriteRes<WriteIntMul, 4>;
+def : HWVALUWriteRes<WriteFloatFMA, 16>;
+def : HWVALUWriteRes<WriteDouble, 16>;
+def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+def : HWVALUWriteRes<WriteDoubleCvt, 4>;
+def : HWVALUWriteRes<WriteTrans64, 16>;
def : InstRW<[WriteCopy], (instrs COPY)>;
+def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
+def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
+def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
} // End SchedModel = SIQuarterSpeedModel
+let SchedModel = SIDPFullSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 1>;
+def : HWVALUWriteRes<WriteDouble, 1>;
+def : HWVALUWriteRes<WriteDoubleAdd, 1>;
+def : HWVALUWriteRes<WriteDoubleCvt, 1>;
+def : HWVALUWriteRes<WriteTrans64, 4>;
+def : HWVALUWriteRes<WriteIntMul, 1>;
+def : HWVALUWriteRes<Write64Bit, 1>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
+def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X")>;
+def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X")>;
+def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>;
+def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;
+
+} // End SchedModel = SIDPFullSpeedModel
+
let SchedModel = GFX10SpeedModel in {
// The latency values are 1 / (operations / cycle).
@@ -207,13 +243,14 @@
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
-def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>;
+def : HWWriteRes<WriteTrans32, [HWTransVALU, HWRC], 10>;
def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>;
def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 22>;
def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 22>;
def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 22>;
-def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 24>;
+def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
+def : HWWriteRes<WriteTrans64, [HWVALU, HWTransVALU, HWRC], 24>;
def : HWWriteRes<WriteBranch, [HWBranch], 32>;
def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index cdb78aa..45dd57e 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -232,9 +232,14 @@
RC = &AMDGPU::VReg_96RegClass;
} else if (Info->VAddrDwords == 4) {
RC = &AMDGPU::VReg_128RegClass;
- } else if (Info->VAddrDwords <= 8) {
+ } else if (Info->VAddrDwords == 5) {
+ RC = &AMDGPU::VReg_160RegClass;
+ } else if (Info->VAddrDwords == 6) {
+ RC = &AMDGPU::VReg_192RegClass;
+ } else if (Info->VAddrDwords == 7) {
+ RC = &AMDGPU::VReg_224RegClass;
+ } else if (Info->VAddrDwords == 8) {
RC = &AMDGPU::VReg_256RegClass;
- NewAddrDwords = 8;
} else {
RC = &AMDGPU::VReg_512RegClass;
NewAddrDwords = 16;
@@ -573,7 +578,7 @@
dropInstructionKeepingImpDefs(*MovY, TII);
MachineInstr *Next = &*std::next(MovT.getIterator());
- if (MRI.use_nodbg_empty(T)) {
+ if (T.isVirtual() && MRI.use_nodbg_empty(T)) {
dropInstructionKeepingImpDefs(MovT, TII);
} else {
Xop.setIsKill(false);
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 0640e24..38548ea 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -7,14 +7,17 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// This pass adds instructions to enable whole quad mode for pixel
-/// shaders, and whole wavefront mode for all programs.
+/// This pass adds instructions to enable whole quad mode (strict or non-strict)
+/// for pixel shaders, and strict whole wavefront mode for all programs.
+///
+/// The "strict" prefix indicates that inactive lanes do not take part in
+/// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
+/// always be enabled irrespective of control flow decisions. Conversely, in
+/// non-strict WQM, inactive lanes may be affected by control flow decisions.
///
/// Whole quad mode is required for derivative computations, but it interferes
-/// with shader side effects (stores and atomics). This pass is run on the
-/// scheduled machine IR but before register coalescing, so that machine SSA is
-/// available for analysis. It ensures that WQM is enabled when necessary, but
-/// disabled around stores and atomics.
+/// with shader side effects (stores and atomics). It ensures that WQM is
+/// enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
@@ -28,12 +31,21 @@
/// ...
/// S_MOV_B64 EXEC, Tmp
///
-/// We also compute when a sequence of instructions requires Whole Wavefront
-/// Mode (WWM) and insert instructions to save and restore it:
+/// We also compute when a sequence of instructions requires strict whole
+/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
///
-/// S_OR_SAVEEXEC_B64 Tmp, -1
-/// ...
-/// S_MOV_B64 EXEC, Tmp
+/// S_OR_SAVEEXEC_B64 Tmp, -1
+/// ...
+/// S_MOV_B64 EXEC, Tmp
+///
+/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
+/// we use a similar save and restore mechanism and force whole quad mode for
+/// those instructions:
+///
+/// S_MOV_B64 Tmp, EXEC
+/// S_WQM_B64 EXEC, EXEC
+/// ...
+/// S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
@@ -62,8 +74,10 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/raw_ostream.h"
@@ -76,8 +90,10 @@
enum {
StateWQM = 0x1,
- StateWWM = 0x2,
- StateExact = 0x4,
+ StateStrictWWM = 0x2,
+ StateStrictWQM = 0x4,
+ StateExact = 0x8,
+ StateStrict = StateStrictWWM | StateStrictWQM,
};
struct PrintState {
@@ -89,19 +105,23 @@
#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
- if (PS.State & StateWQM)
- OS << "WQM";
- if (PS.State & StateWWM) {
- if (PS.State & StateWQM)
- OS << '|';
- OS << "WWM";
- }
- if (PS.State & StateExact) {
- if (PS.State & (StateWQM | StateWWM))
- OS << '|';
- OS << "Exact";
- }
+ static const std::pair<char, const char *> Mapping[] = {
+ std::make_pair(StateWQM, "WQM"),
+ std::make_pair(StateStrictWWM, "StrictWWM"),
+ std::make_pair(StateStrictWQM, "StrictWQM"),
+ std::make_pair(StateExact, "Exact")};
+ char State = PS.State;
+ for (auto M : Mapping) {
+ if (State & M.first) {
+ OS << M.second;
+ State &= ~M.first;
+
+ if (State)
+ OS << '|';
+ }
+ }
+ assert(State == 0);
return OS;
}
#endif
@@ -116,6 +136,8 @@
char Needs = 0;
char InNeeds = 0;
char OutNeeds = 0;
+ char InitialState = 0;
+ bool NeedsLowering = false;
};
struct WorkItem {
@@ -129,23 +151,33 @@
class SIWholeQuadMode : public MachineFunctionPass {
private:
- CallingConv::ID CallingConv;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const GCNSubtarget *ST;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
+ MachineDominatorTree *MDT;
+ MachinePostDominatorTree *PDT;
unsigned AndOpc;
- unsigned XorTermrOpc;
+ unsigned AndN2Opc;
+ unsigned XorOpc;
+ unsigned AndSaveExecOpc;
unsigned OrSaveExecOpc;
- unsigned Exec;
+ unsigned WQMOpc;
+ Register Exec;
+ Register LiveMaskReg;
DenseMap<const MachineInstr *, InstrInfo> Instructions;
MapVector<MachineBasicBlock *, BlockInfo> Blocks;
- SmallVector<MachineInstr *, 1> LiveMaskQueries;
+
+ // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
+ DenseMap<const MachineInstr *, char> StateTransition;
+
+ SmallVector<MachineInstr *, 2> LiveMaskQueries;
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
+ SmallVector<MachineInstr *, 4> KillInstrs;
void printInfo();
@@ -153,6 +185,8 @@
std::vector<WorkItem> &Worklist);
void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
+ void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
+ std::vector<WorkItem> &Worklist);
void markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
@@ -167,17 +201,27 @@
MachineBasicBlock::iterator Last, bool PreferLast,
bool SaveSCC);
void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
- unsigned SaveWQM, unsigned LiveMaskReg);
+ Register SaveWQM);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
- unsigned SavedWQM);
- void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
- unsigned SaveOrig);
- void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
- unsigned SavedOrig);
- void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
+ Register SavedWQM);
+ void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ Register SaveOrig, char StrictStateNeeded);
+ void fromStrictMode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before, Register SavedOrig,
+ char NonStrictState, char CurrentStrictState);
- void lowerLiveMaskQueries(unsigned LiveMaskReg);
+ MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
+
+ MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
+ bool IsWQM);
+ MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
+
+ void lowerBlock(MachineBasicBlock &MBB);
+ void processBlock(MachineBasicBlock &MBB, bool IsEntry);
+
+ void lowerLiveMaskQueries();
void lowerCopyInstrs();
+ void lowerKillInstrs(bool IsWQM);
public:
static char ID;
@@ -193,9 +237,17 @@
AU.addRequired<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
- AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addPreserved<MachinePostDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
+
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
};
} // end anonymous namespace
@@ -205,6 +257,8 @@
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)
@@ -241,8 +295,6 @@
assert(!(Flag & StateExact) && Flag != 0);
- LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
-
// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if
// the result of an atomic is used by instruction that requires WQM, where
@@ -254,6 +306,7 @@
if ((II.Needs & Flag) == Flag)
return;
+ LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
II.Needs |= Flag;
Worklist.push_back(&MI);
}
@@ -262,108 +315,167 @@
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
Register Reg, unsigned SubReg, char Flag,
std::vector<WorkItem> &Worklist) {
- assert(!MRI->isSSA());
-
LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
- if (!UseLRQ.valueIn())
+ const VNInfo *Value = UseLRQ.valueIn();
+ if (!Value)
return;
- SmallPtrSet<const VNInfo *, 4> Visited;
- SmallVector<const VNInfo *, 4> ToProcess;
- ToProcess.push_back(UseLRQ.valueIn());
+ // Note: this code assumes that lane masks on AMDGPU completely
+ // cover registers.
+ const LaneBitmask UseLanes =
+ SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
+ : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
+ : LaneBitmask::getNone());
+
+ // Perform a depth-first iteration of the LiveRange graph marking defs.
+ // Stop processing of a given branch when all use lanes have been defined.
+ // The first definition stops processing for a physical register.
+ struct PhiEntry {
+ const VNInfo *Phi;
+ unsigned PredIdx;
+ LaneBitmask DefinedLanes;
+
+ PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
+ : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
+ };
+ using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
+ SmallVector<PhiEntry, 2> PhiStack;
+ SmallSet<VisitKey, 4> Visited;
+ LaneBitmask DefinedLanes;
+ unsigned NextPredIdx = 0; // Only used for processing phi nodes
do {
- const VNInfo *Value = ToProcess.pop_back_val();
- Visited.insert(Value);
+ const VNInfo *NextValue = nullptr;
+ const VisitKey Key(Value, DefinedLanes);
+
+ if (!Visited.count(Key)) {
+ Visited.insert(Key);
+      // On the first visit to a phi, start processing the first predecessor
+ NextPredIdx = 0;
+ }
if (Value->isPHIDef()) {
- // Need to mark all defs used in the PHI node
+ // Each predecessor node in the phi must be processed as a subgraph
const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
assert(MBB && "Phi-def has no defining MBB");
- for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end();
- PI != PE; ++PI) {
+
+ // Find next predecessor to process
+ unsigned Idx = NextPredIdx;
+ auto PI = MBB->pred_begin() + Idx;
+ auto PE = MBB->pred_end();
+ for (; PI != PE && !NextValue; ++PI, ++Idx) {
if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
- if (!Visited.count(VN))
- ToProcess.push_back(VN);
+ if (!Visited.count(VisitKey(VN, DefinedLanes)))
+ NextValue = VN;
}
}
+
+      // If there are more predecessors to process, add the phi to the stack
+ if (PI != PE)
+ PhiStack.emplace_back(Value, Idx, DefinedLanes);
} else {
MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
assert(MI && "Def has no defining instruction");
- markInstruction(*MI, Flag, Worklist);
- // Iterate over all operands to find relevant definitions
- for (const MachineOperand &Op : MI->operands()) {
- if (!(Op.isReg() && Op.getReg() == Reg))
- continue;
+ if (Reg.isVirtual()) {
+ // Iterate over all operands to find relevant definitions
+ bool HasDef = false;
+ for (const MachineOperand &Op : MI->operands()) {
+ if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
+ continue;
- // Does this def cover whole register?
- bool DefinesFullReg =
- Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg;
- if (!DefinesFullReg) {
- // Partial definition; need to follow and mark input value
+ // Compute lanes defined and overlap with use
+ LaneBitmask OpLanes =
+ Op.isUndef() ? LaneBitmask::getAll()
+ : TRI->getSubRegIndexLaneMask(Op.getSubReg());
+ LaneBitmask Overlap = (UseLanes & OpLanes);
+
+          // Record whether this instruction defined any lanes of the use
+ HasDef |= Overlap.any();
+
+ // Mark any lanes defined
+ DefinedLanes |= OpLanes;
+ }
+
+        // Check whether all lanes of the use have been defined
+ if ((DefinedLanes & UseLanes) != UseLanes) {
+ // Definition not complete; need to process input value
LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
if (const VNInfo *VN = LRQ.valueIn()) {
- if (!Visited.count(VN))
- ToProcess.push_back(VN);
+ if (!Visited.count(VisitKey(VN, DefinedLanes)))
+ NextValue = VN;
}
}
+
+ // Only mark the instruction if it defines some part of the use
+ if (HasDef)
+ markInstruction(*MI, Flag, Worklist);
+ } else {
+ // For physical registers simply mark the defining instruction
+ markInstruction(*MI, Flag, Worklist);
}
}
- } while (!ToProcess.empty());
+
+ if (!NextValue && !PhiStack.empty()) {
+      // Reached the end of a chain; revert to processing the last phi
+ PhiEntry &Entry = PhiStack.back();
+ NextValue = Entry.Phi;
+ NextPredIdx = Entry.PredIdx;
+ DefinedLanes = Entry.DefinedLanes;
+ PhiStack.pop_back();
+ }
+
+ Value = NextValue;
+ } while (Value);
+}
+
+void SIWholeQuadMode::markOperand(const MachineInstr &MI,
+ const MachineOperand &Op, char Flag,
+ std::vector<WorkItem> &Worklist) {
+ assert(Op.isReg());
+ Register Reg = Op.getReg();
+
+ // Ignore some hardware registers
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ return;
+ default:
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
+ << " for " << MI);
+ if (Reg.isVirtual()) {
+ LiveRange &LR = LIS->getInterval(Reg);
+ markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
+ } else {
+ // Handle physical registers that we need to track; this is mostly relevant
+ // for VCC, which can appear as the (implicit) input of a uniform branch,
+ // e.g. when a loop counter is stored in a VGPR.
+ for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
+ ++RegUnit) {
+ LiveRange &LR = LIS->getRegUnit(*RegUnit);
+ const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+ if (!Value)
+ continue;
+
+ markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
+ }
+ }
}
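
For readers following the new markDefs/markOperand logic above, a minimal standalone sketch of the lane-coverage idea may help: walk the (possibly partial) definitions reaching a use, accumulate which lanes each one writes, and stop once every lane the use needs is covered. Plain unsigned masks stand in for llvm::LaneBitmask, and the flat DefInfo list is a hypothetical stand-in for the LiveRange/VNInfo chain, so this illustrates only the termination condition, not the pass's real data structures.

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  struct DefInfo {
    const char *Name;      // defining instruction (for printing only)
    uint32_t DefinedLanes; // lanes this (possibly partial) def writes
  };

  // Walk defs from most recent to oldest, marking each def that contributes
  // lanes to the use, and stop as soon as every use lane is covered.
  void markDefsSketch(uint32_t UseLanes, const std::vector<DefInfo> &Defs) {
    uint32_t Covered = 0;
    for (const DefInfo &D : Defs) {
      uint32_t Overlap = D.DefinedLanes & UseLanes;
      if (Overlap)
        std::printf("mark %s (lanes 0x%x)\n", D.Name, (unsigned)Overlap);
      Covered |= D.DefinedLanes;
      if ((Covered & UseLanes) == UseLanes)
        return; // definition of the use is complete; stop the walk
    }
  }

  int main() {
    // A use of sub0..sub1 (0b11) reached by two partial writes; the older
    // full def is never visited because the use is already fully covered.
    markDefsSketch(0b11, {{"def of sub1", 0b10},
                          {"def of sub0", 0b01},
                          {"older full def", 0b11}});
  }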
/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
-
LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
<< MI);
for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() || !Use.isUse())
continue;
-
- Register Reg = Use.getReg();
-
- // Handle physical registers that we need to track; this is mostly relevant
- // for VCC, which can appear as the (implicit) input of a uniform branch,
- // e.g. when a loop counter is stored in a VGPR.
- if (!Reg.isVirtual()) {
- if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
- continue;
-
- for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
- ++RegUnit) {
- LiveRange &LR = LIS->getRegUnit(*RegUnit);
- const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
- if (!Value)
- continue;
-
- if (MRI->isSSA()) {
- // Since we're in machine SSA, we do not need to track physical
- // registers across basic blocks.
- if (Value->isPHIDef())
- continue;
- markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
- Worklist);
- } else {
- markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
- }
- }
-
- continue;
- }
-
- if (MRI->isSSA()) {
- for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
- markInstruction(DefMI, Flag, Worklist);
- } else {
- LiveRange &LR = LIS->getInterval(Reg);
- markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
- }
+ markOperand(MI, Use, Flag, Worklist);
}
}
@@ -392,6 +504,9 @@
char Flags = 0;
if (TII->isWQM(Opcode)) {
+        // If LOD is not supported, WQM is not needed.
+ if (!ST->hasExtendedImageInsts())
+ continue;
// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.
@@ -407,27 +522,31 @@
LowerToCopyInstrs.push_back(&MI);
SoftWQMInstrs.push_back(&MI);
continue;
- } else if (Opcode == AMDGPU::WWM) {
- // The WWM intrinsic doesn't make the same guarantee, and plus it needs
- // to be executed in WQM or Exact so that its copy doesn't clobber
- // inactive lanes.
- markInstructionUses(MI, StateWWM, Worklist);
- GlobalFlags |= StateWWM;
+ } else if (Opcode == AMDGPU::STRICT_WWM) {
+        // The STRICT_WWM intrinsic doesn't make the same guarantee; in addition,
+        // it needs to be executed in WQM or Exact so that its copy doesn't
+ // clobber inactive lanes.
+ markInstructionUses(MI, StateStrictWWM, Worklist);
+ GlobalFlags |= StateStrictWWM;
+ LowerToMovInstrs.push_back(&MI);
+ continue;
+ } else if (Opcode == AMDGPU::STRICT_WQM) {
+        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
+        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
+ // quads that have at least one active thread.
+ markInstructionUses(MI, StateStrictWQM, Worklist);
+ GlobalFlags |= StateStrictWQM;
LowerToMovInstrs.push_back(&MI);
continue;
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
Opcode == AMDGPU::V_SET_INACTIVE_B64) {
- III.Disabled = StateWWM;
+ III.Disabled = StateStrict;
MachineOperand &Inactive = MI.getOperand(2);
if (Inactive.isReg()) {
if (Inactive.isUndef()) {
LowerToCopyInstrs.push_back(&MI);
} else {
- Register Reg = Inactive.getReg();
- if (Reg.isVirtual()) {
- for (MachineInstr &DefMI : MRI->def_instructions(Reg))
- markInstruction(DefMI, StateWWM, Worklist);
- }
+ markOperand(MI, Inactive, StateStrictWWM, Worklist);
}
}
SetInactiveInstrs.push_back(&MI);
@@ -439,15 +558,21 @@
Worklist.push_back(&MBB);
}
GlobalFlags |= StateExact;
- III.Disabled = StateWQM | StateWWM;
+ III.Disabled = StateWQM | StateStrict;
continue;
} else {
- if (Opcode == AMDGPU::SI_PS_LIVE) {
+ if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
LiveMaskQueries.push_back(&MI);
+ } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
+ Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
+ Opcode == AMDGPU::SI_DEMOTE_I1) {
+ KillInstrs.push_back(&MI);
+ BBI.NeedsLowering = true;
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
// VGPRs correspond to shader inputs and outputs. Inputs are
// only used, outputs are only defined.
+ // FIXME: is this still valid?
for (const MachineOperand &MO : MI.defs()) {
if (!MO.isReg())
continue;
@@ -510,7 +635,7 @@
// Propagate backwards within block
if (MachineInstr *PrevMI = MI.getPrevNode()) {
- char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
+ char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
if (!PrevMI->isPHI()) {
InstrInfo &PrevII = Instructions[PrevMI];
if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
@@ -526,10 +651,12 @@
if (II.Needs != 0)
markInstructionUses(MI, II.Needs, Worklist);
- // Ensure we process a block containing WWM, even if it does not require any
- // WQM transitions.
- if (II.Needs & StateWWM)
- BI.Needs |= StateWWM;
+ // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
+ // not require any WQM transitions.
+ if (II.Needs & StateStrictWWM)
+ BI.Needs |= StateStrictWWM;
+ if (II.Needs & StateStrictWQM)
+ BI.Needs |= StateStrictWQM;
}
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -604,6 +731,339 @@
return Restore;
}
+MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
+ MachineInstr *TermMI) {
+ LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
+ << *TermMI << "\n");
+
+ MachineBasicBlock *SplitBB =
+ BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
+
+ // Convert last instruction in block to a terminator.
+ // Note: this only covers the expected patterns
+ unsigned NewOpcode = 0;
+ switch (TermMI->getOpcode()) {
+ case AMDGPU::S_AND_B32:
+ NewOpcode = AMDGPU::S_AND_B32_term;
+ break;
+ case AMDGPU::S_AND_B64:
+ NewOpcode = AMDGPU::S_AND_B64_term;
+ break;
+ case AMDGPU::S_MOV_B32:
+ NewOpcode = AMDGPU::S_MOV_B32_term;
+ break;
+ case AMDGPU::S_MOV_B64:
+ NewOpcode = AMDGPU::S_MOV_B64_term;
+ break;
+ default:
+ break;
+ }
+ if (NewOpcode)
+ TermMI->setDesc(TII->get(NewOpcode));
+
+ if (SplitBB != BB) {
+ // Update dominator trees
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+ for (MachineBasicBlock *Succ : SplitBB->successors()) {
+ DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
+ }
+ DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
+ if (MDT)
+ MDT->getBase().applyUpdates(DTUpdates);
+ if (PDT)
+ PDT->getBase().applyUpdates(DTUpdates);
+
+ // Link blocks
+ MachineInstr *MI =
+ BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(SplitBB);
+ LIS->InsertMachineInstrInMaps(*MI);
+ }
+
+ return SplitBB;
+}
+
+MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
+ MachineInstr &MI) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opcode = 0;
+
+ assert(MI.getOperand(0).isReg());
+
+  // Comparison is for live lanes; however, here we compute the inverse
+  // (killed lanes). This is because VCMP will always generate 0 bits
+  // for inactive lanes, so a mask of live lanes would not be correct
+  // inside control flow.
+ // Invert the comparison by swapping the operands and adjusting
+ // the comparison codes.
+
+ switch (MI.getOperand(2).getImm()) {
+ case ISD::SETUEQ:
+ Opcode = AMDGPU::V_CMP_LG_F32_e64;
+ break;
+ case ISD::SETUGT:
+ Opcode = AMDGPU::V_CMP_GE_F32_e64;
+ break;
+ case ISD::SETUGE:
+ Opcode = AMDGPU::V_CMP_GT_F32_e64;
+ break;
+ case ISD::SETULT:
+ Opcode = AMDGPU::V_CMP_LE_F32_e64;
+ break;
+ case ISD::SETULE:
+ Opcode = AMDGPU::V_CMP_LT_F32_e64;
+ break;
+ case ISD::SETUNE:
+ Opcode = AMDGPU::V_CMP_EQ_F32_e64;
+ break;
+ case ISD::SETO:
+ Opcode = AMDGPU::V_CMP_O_F32_e64;
+ break;
+ case ISD::SETUO:
+ Opcode = AMDGPU::V_CMP_U_F32_e64;
+ break;
+ case ISD::SETOEQ:
+ case ISD::SETEQ:
+ Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
+ break;
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ Opcode = AMDGPU::V_CMP_NLT_F32_e64;
+ break;
+ case ISD::SETOGE:
+ case ISD::SETGE:
+ Opcode = AMDGPU::V_CMP_NLE_F32_e64;
+ break;
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ Opcode = AMDGPU::V_CMP_NGT_F32_e64;
+ break;
+ case ISD::SETOLE:
+ case ISD::SETLE:
+ Opcode = AMDGPU::V_CMP_NGE_F32_e64;
+ break;
+ case ISD::SETONE:
+ case ISD::SETNE:
+ Opcode = AMDGPU::V_CMP_NLG_F32_e64;
+ break;
+ default:
+ llvm_unreachable("invalid ISD:SET cond code");
+ }
+
+ // Pick opcode based on comparison type.
+ MachineInstr *VcmpMI;
+ const MachineOperand &Op0 = MI.getOperand(0);
+ const MachineOperand &Op1 = MI.getOperand(1);
+ if (TRI->isVGPR(*MRI, Op0.getReg())) {
+ Opcode = AMDGPU::getVOPe32(Opcode);
+ VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
+ } else {
+ VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
+ .addReg(AMDGPU::VCC, RegState::Define)
+ .addImm(0) // src0 modifiers
+ .add(Op1)
+ .addImm(0) // src1 modifiers
+ .add(Op0)
+ .addImm(0); // omod
+ }
+
+ // VCC represents lanes killed.
+ Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+
+ MachineInstr *MaskUpdateMI =
+ BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ .addReg(LiveMaskReg)
+ .addReg(VCC);
+
+  // The state of SCC represents whether any lanes are live in the mask;
+  // if SCC is 0, no lanes will be alive anymore.
+ MachineInstr *EarlyTermMI =
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
+
+ MachineInstr *ExecMaskMI =
+ BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
+
+ assert(MBB.succ_size() == 1);
+ MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
+ .addMBB(*MBB.succ_begin());
+
+ // Update live intervals
+ LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
+ MBB.remove(&MI);
+
+ LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
+ LIS->InsertMachineInstrInMaps(*ExecMaskMI);
+ LIS->InsertMachineInstrInMaps(*EarlyTermMI);
+ LIS->InsertMachineInstrInMaps(*NewTerm);
+
+ return NewTerm;
+}
+
+MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
+ MachineInstr &MI, bool IsWQM) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineInstr *MaskUpdateMI = nullptr;
+
+ const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
+ const MachineOperand &Op = MI.getOperand(0);
+ int64_t KillVal = MI.getOperand(1).getImm();
+ MachineInstr *ComputeKilledMaskMI = nullptr;
+ Register CndReg = !Op.isImm() ? Op.getReg() : Register();
+ Register TmpReg;
+
+ // Is this a static or dynamic kill?
+ if (Op.isImm()) {
+ if (Op.getImm() == KillVal) {
+ // Static: all active lanes are killed
+ MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ .addReg(LiveMaskReg)
+ .addReg(Exec);
+ } else {
+ // Static: kill does nothing
+ MachineInstr *NewTerm = nullptr;
+ if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
+ LIS->RemoveMachineInstrFromMaps(MI);
+ } else {
+ assert(MBB.succ_size() == 1);
+ NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
+ .addMBB(*MBB.succ_begin());
+ LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
+ }
+ MBB.remove(&MI);
+ return NewTerm;
+ }
+ } else {
+ if (!KillVal) {
+ // Op represents live lanes after kill,
+ // so exec mask needs to be factored in.
+ TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
+ ComputeKilledMaskMI =
+ BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
+ MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ .addReg(LiveMaskReg)
+ .addReg(TmpReg);
+ } else {
+ // Op represents lanes to kill
+ MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ .addReg(LiveMaskReg)
+ .add(Op);
+ }
+ }
+
+  // The state of SCC represents whether any lanes are live in the mask;
+  // if SCC is 0, no lanes will be alive anymore.
+ MachineInstr *EarlyTermMI =
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
+
+  // If we got this far, some lanes are still live;
+  // update EXEC to deactivate lanes as appropriate.
+ MachineInstr *NewTerm;
+ MachineInstr *WQMMaskMI = nullptr;
+ Register LiveMaskWQM;
+ if (IsDemote) {
+    // Demotes deactivate quads that contain only helper lanes
+ LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
+ WQMMaskMI =
+ BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
+ NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
+ .addReg(Exec)
+ .addReg(LiveMaskWQM);
+ } else {
+ // Kills deactivate lanes
+ if (Op.isImm()) {
+ unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
+ } else if (!IsWQM) {
+ NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
+ .addReg(Exec)
+ .addReg(LiveMaskReg);
+ } else {
+ unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
+ NewTerm =
+ BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
+ }
+ }
+
+ // Update live intervals
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MBB.remove(&MI);
+ assert(EarlyTermMI);
+ assert(MaskUpdateMI);
+ assert(NewTerm);
+ if (ComputeKilledMaskMI)
+ LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
+ LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
+ LIS->InsertMachineInstrInMaps(*EarlyTermMI);
+ if (WQMMaskMI)
+ LIS->InsertMachineInstrInMaps(*WQMMaskMI);
+ LIS->InsertMachineInstrInMaps(*NewTerm);
+
+ if (CndReg) {
+ LIS->removeInterval(CndReg);
+ LIS->createAndComputeVirtRegInterval(CndReg);
+ }
+ if (TmpReg)
+ LIS->createAndComputeVirtRegInterval(TmpReg);
+ if (LiveMaskWQM)
+ LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
+
+ return NewTerm;
+}
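
A rough sketch of the wave-mask arithmetic behind the kill/demote lowering above, using plain 64-bit masks: kills remove lanes from both the live mask and EXEC, while demotes keep whole quads active as long as at least one lane in the quad is still live. The wqm() helper is a hypothetical stand-in for S_WQM_B64, and the function names are illustrative rather than the pass's API.

  #include <cassert>
  #include <cstdint>

  // Widen a lane mask to whole quads: a quad is kept if any of its four
  // lanes is set (the effect of S_WQM_B64 on a 64-lane wave).
  uint64_t wqm(uint64_t Mask) {
    uint64_t Out = 0;
    for (int Q = 0; Q < 16; ++Q) {
      uint64_t QuadBits = 0xFull << (4 * Q);
      if (Mask & QuadBits)
        Out |= QuadBits;
    }
    return Out;
  }

  struct WaveState {
    uint64_t Exec;     // currently active lanes
    uint64_t LiveMask; // lanes that are still logically live (non-helpers)
  };

  // Kill: killed lanes leave both the live mask and EXEC.
  void kill(WaveState &W, uint64_t KilledLanes) {
    W.LiveMask &= ~KilledLanes; // S_ANDN2 on the live mask
    W.Exec &= ~KilledLanes;     // deactivate the killed lanes
  }

  // Demote (in WQM): demoted lanes become helpers; EXEC keeps any quad
  // that still contains at least one live lane.
  void demote(WaveState &W, uint64_t DemotedLanes) {
    W.LiveMask &= ~DemotedLanes; // S_ANDN2 on the live mask
    W.Exec &= wqm(W.LiveMask);   // S_WQM of the live mask
  }

  int main() {
    WaveState W{/*Exec=*/0xFF, /*LiveMask=*/0xFF}; // two quads active
    demote(W, 0b0111);      // demote three of the four lanes in quad 0
    assert(W.Exec == 0xFF); // quad 0 still has a live lane, stays active
    kill(W, 0x0F);          // kill all of quad 0
    assert(W.Exec == 0xF0); // only quad 1 remains active
  }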
+
+// Replace (or supplement) instructions accessing the live mask.
+// This can only happen once all the live mask registers have been created
+// and the execution state (WQM/StrictWWM/Exact) of each instruction is known.
+void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
+ auto BII = Blocks.find(&MBB);
+ if (BII == Blocks.end())
+ return;
+
+ const BlockInfo &BI = BII->second;
+ if (!BI.NeedsLowering)
+ return;
+
+ LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
+
+ SmallVector<MachineInstr *, 4> SplitPoints;
+ char State = BI.InitialState;
+
+ auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+ while (II != IE) {
+ auto Next = std::next(II);
+ MachineInstr &MI = *II;
+
+ if (StateTransition.count(&MI))
+ State = StateTransition[&MI];
+
+ MachineInstr *SplitPoint = nullptr;
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_DEMOTE_I1:
+ case AMDGPU::SI_KILL_I1_TERMINATOR:
+ SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
+ break;
+ case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+ SplitPoint = lowerKillF32(MBB, MI);
+ break;
+ default:
+ break;
+ }
+ if (SplitPoint)
+ SplitPoints.push_back(SplitPoint);
+
+ II = Next;
+ }
+
+ // Perform splitting after instruction scan to simplify iteration.
+ if (!SplitPoints.empty()) {
+ MachineBasicBlock *BB = &MBB;
+ for (MachineInstr *MI : SplitPoints) {
+ BB = splitBlock(BB, MI);
+ }
+ }
+}
+
// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
@@ -680,93 +1140,108 @@
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
- unsigned SaveWQM, unsigned LiveMaskReg) {
+ Register SaveWQM) {
MachineInstr *MI;
if (SaveWQM) {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
- AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
- SaveWQM)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
.addReg(LiveMaskReg);
} else {
- unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
- AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
- Exec)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
.addReg(Exec)
.addReg(LiveMaskReg);
}
LIS->InsertMachineInstrInMaps(*MI);
+ StateTransition[MI] = StateExact;
}
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
- unsigned SavedWQM) {
+ Register SavedWQM) {
MachineInstr *MI;
- unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
if (SavedWQM) {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
.addReg(SavedWQM);
} else {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
- AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
- Exec)
- .addReg(Exec);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
}
LIS->InsertMachineInstrInMaps(*MI);
+ StateTransition[MI] = StateWQM;
}
-void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator Before,
- unsigned SaveOrig) {
+void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ Register SaveOrig, char StrictStateNeeded) {
MachineInstr *MI;
-
assert(SaveOrig);
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
- .addImm(-1);
+ assert(StrictStateNeeded == StateStrictWWM ||
+ StrictStateNeeded == StateStrictWQM);
+
+ if (StrictStateNeeded == StateStrictWWM) {
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
+ SaveOrig)
+ .addImm(-1);
+ } else {
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
+ SaveOrig)
+ .addImm(-1);
+ }
LIS->InsertMachineInstrInMaps(*MI);
+ StateTransition[MI] = StateStrictWWM;
}
-void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator Before,
- unsigned SavedOrig) {
+void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ Register SavedOrig, char NonStrictState,
+ char CurrentStrictState) {
MachineInstr *MI;
assert(SavedOrig);
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
- ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
- .addReg(SavedOrig);
+ assert(CurrentStrictState == StateStrictWWM ||
+ CurrentStrictState == StateStrictWQM);
+
+ if (CurrentStrictState == StateStrictWWM) {
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
+ Exec)
+ .addReg(SavedOrig);
+ } else {
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
+ Exec)
+ .addReg(SavedOrig);
+ }
LIS->InsertMachineInstrInMaps(*MI);
+ StateTransition[MI] = NonStrictState;
}
-void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
- bool isEntry) {
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
auto BII = Blocks.find(&MBB);
if (BII == Blocks.end())
return;
- const BlockInfo &BI = BII->second;
+ BlockInfo &BI = BII->second;
// This is a non-entry block that is WQM throughout, so no need to do
// anything.
- if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
+ if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
+ BI.InitialState = StateWQM;
return;
+ }
LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
<< ":\n");
- unsigned SavedWQMReg = 0;
- unsigned SavedNonWWMReg = 0;
- bool WQMFromExec = isEntry;
- char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
- char NonWWMState = 0;
+ Register SavedWQMReg;
+ Register SavedNonStrictReg;
+ bool WQMFromExec = IsEntry;
+ char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
+ char NonStrictState = 0;
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
- if (isEntry) {
+ if (IsEntry) {
// Skip the instruction that saves LiveMask
if (II != IE && II->getOpcode() == AMDGPU::COPY)
++II;
@@ -776,22 +1251,25 @@
// Exact or vice versa.
MachineBasicBlock::iterator FirstWQM = IE;
- // This stores the first instruction where it's safe to switch from WWM to
- // Exact/WQM or to switch to WWM. It must always be the same as, or after,
- // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
- // switch to/from WQM as well.
- MachineBasicBlock::iterator FirstWWM = IE;
+ // This stores the first instruction where it's safe to switch from Strict
+ // mode to Exact/WQM or to switch to Strict mode. It must always be the same
+ // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
+ // be safe to switch to/from WQM as well.
+ MachineBasicBlock::iterator FirstStrict = IE;
+
+  // Record the initial state in the block information.
+ BI.InitialState = State;
for (;;) {
MachineBasicBlock::iterator Next = II;
- char Needs = StateExact | StateWQM; // WWM is disabled by default
+ char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
char OutNeeds = 0;
if (FirstWQM == IE)
FirstWQM = II;
- if (FirstWWM == IE)
- FirstWWM = II;
+ if (FirstStrict == IE)
+ FirstStrict = II;
// First, figure out the allowed states (Needs) based on the propagated
// flags.
@@ -801,8 +1279,10 @@
if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
auto III = Instructions.find(&MI);
if (III != Instructions.end()) {
- if (III->second.Needs & StateWWM)
- Needs = StateWWM;
+ if (III->second.Needs & StateStrictWWM)
+ Needs = StateStrictWWM;
+ else if (III->second.Needs & StateStrictWQM)
+ Needs = StateStrictWQM;
else if (III->second.Needs & StateWQM)
Needs = StateWQM;
else
@@ -811,8 +1291,8 @@
}
} else {
// If the instruction doesn't actually need a correct EXEC, then we can
- // safely leave WWM enabled.
- Needs = StateExact | StateWQM | StateWWM;
+ // safely leave Strict mode enabled.
+ Needs = StateExact | StateWQM | StateStrict;
}
if (MI.isTerminator() && OutNeeds == StateExact)
@@ -832,32 +1312,56 @@
// Now, transition if necessary.
if (!(Needs & State)) {
MachineBasicBlock::iterator First;
- if (State == StateWWM || Needs == StateWWM) {
- // We must switch to or from WWM
- First = FirstWWM;
+ if (State == StateStrictWWM || Needs == StateStrictWWM ||
+ State == StateStrictWQM || Needs == StateStrictWQM) {
+ // We must switch to or from Strict mode.
+ First = FirstStrict;
} else {
- // We only need to switch to/from WQM, so we can use FirstWQM
+ // We only need to switch to/from WQM, so we can use FirstWQM.
First = FirstWQM;
}
+ // Whether we need to save SCC depends on start and end states.
+ bool SaveSCC = false;
+ switch (State) {
+ case StateExact:
+ case StateStrictWWM:
+ case StateStrictWQM:
+ // Exact/Strict -> Strict: save SCC
+ // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
+ // Exact/Strict -> Exact: no save
+ SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
+ break;
+ case StateWQM:
+ // WQM -> Exact/Strict: save SCC
+ SaveSCC = !(Needs & StateWQM);
+ break;
+ default:
+ llvm_unreachable("Unknown state");
+ break;
+ }
MachineBasicBlock::iterator Before =
- prepareInsertion(MBB, First, II, Needs == StateWQM,
- Needs == StateExact || WQMFromExec);
+ prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
- if (State == StateWWM) {
- assert(SavedNonWWMReg);
- fromWWM(MBB, Before, SavedNonWWMReg);
- LIS->createAndComputeVirtRegInterval(SavedNonWWMReg);
- SavedNonWWMReg = 0;
- State = NonWWMState;
+ if (State & StateStrict) {
+ assert(State == StateStrictWWM || State == StateStrictWQM);
+ assert(SavedNonStrictReg);
+ fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
+
+ LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
+ SavedNonStrictReg = 0;
+ State = NonStrictState;
}
- if (Needs == StateWWM) {
- NonWWMState = State;
- assert(!SavedNonWWMReg);
- SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
- toWWM(MBB, Before, SavedNonWWMReg);
- State = StateWWM;
+ if (Needs & StateStrict) {
+ NonStrictState = State;
+ assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
+ assert(!SavedNonStrictReg);
+ SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
+
+ toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
+ State = Needs;
+
} else {
if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
if (!WQMFromExec && (OutNeeds & StateWQM)) {
@@ -865,7 +1369,7 @@
SavedWQMReg = MRI->createVirtualRegister(BoolRC);
}
- toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+ toExact(MBB, Before, SavedWQMReg);
State = StateExact;
} else if (State == StateExact && (Needs & StateWQM) &&
!(Needs & StateExact)) {
@@ -879,17 +1383,18 @@
}
State = StateWQM;
} else {
- // We can get here if we transitioned from WWM to a non-WWM state that
- // already matches our needs, but we shouldn't need to do anything.
+ // We can get here if we transitioned from StrictWWM to a
+ // non-StrictWWM state that already matches our needs, but we
+ // shouldn't need to do anything.
assert(Needs & State);
}
}
}
- if (Needs != (StateExact | StateWQM | StateWWM)) {
+ if (Needs != (StateExact | StateWQM | StateStrict)) {
if (Needs != (StateExact | StateWQM))
FirstWQM = IE;
- FirstWWM = IE;
+ FirstStrict = IE;
}
if (II == IE)
@@ -898,10 +1403,10 @@
II = Next;
}
assert(!SavedWQMReg);
- assert(!SavedNonWWMReg);
+ assert(!SavedNonStrictReg);
}
-void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+void SIWholeQuadMode::lowerLiveMaskQueries() {
for (MachineInstr *MI : LiveMaskQueries) {
const DebugLoc &DL = MI->getDebugLoc();
Register Dest = MI->getOperand(0).getReg();
@@ -931,9 +1436,12 @@
const unsigned MovOp = TII->getMovOpcode(regClass);
MI->setDesc(TII->get(MovOp));
- // And make it implicitly depend on exec (like all VALU movs should do).
- MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
- } else if (!MRI->isSSA()) {
+ // Check that it already implicitly depends on exec (like all VALU movs
+ // should do).
+ assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
+ return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
+ }));
+ } else {
// Remove early-clobber and exec dependency from simple SGPR copies.
// This allows some to be eliminated during/post RA.
LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
@@ -969,13 +1477,38 @@
}
}
+void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
+ for (MachineInstr *MI : KillInstrs) {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *SplitPoint = nullptr;
+ switch (MI->getOpcode()) {
+ case AMDGPU::SI_DEMOTE_I1:
+ case AMDGPU::SI_KILL_I1_TERMINATOR:
+ SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
+ break;
+ case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+ SplitPoint = lowerKillF32(*MBB, *MI);
+ break;
+ default:
+ continue;
+ }
+ if (SplitPoint)
+ splitBlock(MBB, SplitPoint);
+ }
+}
+
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
+ << " ------------- \n");
+ LLVM_DEBUG(MF.dump(););
+
Instructions.clear();
Blocks.clear();
LiveMaskQueries.clear();
LowerToCopyInstrs.clear();
LowerToMovInstrs.clear();
- CallingConv = MF.getFunction().getCallingConv();
+ KillInstrs.clear();
+ StateTransition.clear();
ST = &MF.getSubtarget<GCNSubtarget>();
@@ -983,64 +1516,72 @@
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ PDT = &getAnalysis<MachinePostDominatorTree>();
if (ST->isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
- XorTermrOpc = AMDGPU::S_XOR_B32_term;
+ AndN2Opc = AMDGPU::S_ANDN2_B32;
+ XorOpc = AMDGPU::S_XOR_B32;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
+ WQMOpc = AMDGPU::S_WQM_B32;
Exec = AMDGPU::EXEC_LO;
} else {
AndOpc = AMDGPU::S_AND_B64;
- XorTermrOpc = AMDGPU::S_XOR_B64_term;
+ AndN2Opc = AMDGPU::S_ANDN2_B64;
+ XorOpc = AMDGPU::S_XOR_B64;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
+ WQMOpc = AMDGPU::S_WQM_B64;
Exec = AMDGPU::EXEC;
}
- char GlobalFlags = analyzeFunction(MF);
- unsigned LiveMaskReg = 0;
- if (!(GlobalFlags & StateWQM)) {
- lowerLiveMaskQueries(Exec);
- if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty())
- return !LiveMaskQueries.empty();
- } else {
- // Store a copy of the original live mask when required
- MachineBasicBlock &Entry = MF.front();
- MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
+ const char GlobalFlags = analyzeFunction(MF);
+ const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
- if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
- LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
- MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
- TII->get(AMDGPU::COPY), LiveMaskReg)
- .addReg(Exec);
- LIS->InsertMachineInstrInMaps(*MI);
- }
+ LiveMaskReg = Exec;
- lowerLiveMaskQueries(LiveMaskReg);
+  // The shader is simple and does not need any state changes or any complex lowering
+ if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
+ LowerToMovInstrs.empty() && KillInstrs.empty()) {
+ lowerLiveMaskQueries();
+ return !LiveMaskQueries.empty();
+ }
- if (GlobalFlags == StateWQM) {
- // For a shader that needs only WQM, we can just set it once.
- auto MI = BuildMI(Entry, EntryMI, DebugLoc(),
- TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
- : AMDGPU::S_WQM_B64),
- Exec)
- .addReg(Exec);
- LIS->InsertMachineInstrInMaps(*MI);
+ MachineBasicBlock &Entry = MF.front();
+ MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
- lowerCopyInstrs();
- // EntryMI may become invalid here
- return true;
- }
+ // Store a copy of the original live mask when required
+ if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
+ LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
+ MachineInstr *MI =
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+ .addReg(Exec);
+ LIS->InsertMachineInstrInMaps(*MI);
}
LLVM_DEBUG(printInfo());
+ lowerLiveMaskQueries();
lowerCopyInstrs();
- // Handle the general case
- for (auto BII : Blocks)
- processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+ // Shader only needs WQM
+ if (GlobalFlags == StateWQM) {
+ auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
+ .addReg(Exec);
+ LIS->InsertMachineInstrInMaps(*MI);
+ lowerKillInstrs(true);
+ } else {
+ for (auto BII : Blocks)
+ processBlock(*BII.first, BII.first == &Entry);
+    // Lowering blocks causes block splitting, so perform it as a second pass.
+ for (auto BII : Blocks)
+ lowerBlock(*BII.first);
+ }
- if (LiveMaskReg)
+ // Compute live range for live mask
+ if (LiveMaskReg != Exec)
LIS->createAndComputeVirtRegInterval(LiveMaskReg);
// Physical registers like SCC aren't tracked by default anyway, so just
@@ -1048,5 +1589,9 @@
// the analysis results.
LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+ // If we performed any kills then recompute EXEC
+ if (!KillInstrs.empty())
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
+
return true;
}
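
The pass encodes its execution modes as bit flags (the enum near the top of this file) so that composite queries, such as whether any strict mode is needed, are single AND operations. A small self-contained sketch of those tests follows, reusing the same flag values but with otherwise hypothetical helper names.

  #include <cassert>

  enum : char {
    StateWQM = 0x1,
    StateStrictWWM = 0x2,
    StateStrictWQM = 0x4,
    StateExact = 0x8,
    StateStrict = StateStrictWWM | StateStrictWQM,
  };

  // Needs is the set of states an instruction can legally execute in;
  // State is the mode currently installed in EXEC. A transition is only
  // required when the current mode is not among the allowed ones.
  bool needsTransition(char State, char Needs) { return !(Needs & State); }

  int main() {
    // A strict-WWM instruction while the block is currently in WQM:
    assert(needsTransition(StateWQM, StateStrictWWM));
    // Either strict state is detected through the combined mask:
    assert(StateStrictWQM & StateStrict);
    // An instruction that tolerates Exact or WQM does not force a switch
    // out of WQM:
    assert(!needsTransition(StateWQM, StateExact | StateWQM));
  }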
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
index 5b8896c..8502ed6 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -57,10 +57,19 @@
Instruction Opcode = !cast<Instruction>(NAME);
// copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let AsmMatchConverter = ps.AsmMatchConverter;
+ let LGKM_CNT = ps.LGKM_CNT;
+ let SMRD = ps.SMRD;
+ let mayStore = ps.mayStore;
+ let mayLoad = ps.mayLoad;
+ let hasSideEffects = ps.hasSideEffects;
let UseNamedOperandTable = ps.UseNamedOperandTable;
- let SMRD = ps.SMRD;
+ let SchedRW = ps.SchedRW;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let IsAtomicRet = ps.IsAtomicRet;
+ let IsAtomicNoRet = ps.IsAtomicNoRet;
+
+ let TSFlags = ps.TSFlags;
bit is_buffer = ps.is_buffer;
@@ -69,6 +78,7 @@
bits<7> sdst;
bits<32> offset;
bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0);
+ bits<5> cpol;
}
class SM_Probe_Pseudo <string opName, dag ins, bit isImm>
@@ -120,8 +130,8 @@
RegisterClass dstClass> {
def _IMM : SM_Load_Pseudo <opName,
(outs dstClass:$sdst),
- (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc),
- " $sdst, $sbase, $offset$glc$dlc", []> {
+ (ins baseClass:$sbase, i32imm:$offset, CPol:$cpol),
+ " $sdst, $sbase, $offset$cpol", []> {
let offset_is_imm = 1;
let BaseClass = baseClass;
let PseudoInstr = opName # "_IMM";
@@ -131,8 +141,8 @@
def _SGPR : SM_Load_Pseudo <opName,
(outs dstClass:$sdst),
- (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc),
- " $sdst, $sbase, $offset$glc$dlc", []> {
+ (ins baseClass:$sbase, SReg_32:$soff, CPol:$cpol),
+ " $sdst, $sbase, $offset$cpol", []> {
let BaseClass = baseClass;
let PseudoInstr = opName # "_SGPR";
let has_glc = 1;
@@ -144,8 +154,8 @@
RegisterClass baseClass,
RegisterClass srcClass> {
def _IMM : SM_Store_Pseudo <opName,
- (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc),
- " $sdata, $sbase, $offset$glc$dlc", []> {
+ (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, CPol:$cpol),
+ " $sdata, $sbase, $offset$cpol", []> {
let offset_is_imm = 1;
let BaseClass = baseClass;
let SrcClass = srcClass;
@@ -153,8 +163,8 @@
}
def _SGPR : SM_Store_Pseudo <opName,
- (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc),
- " $sdata, $sbase, $offset$glc$dlc", []> {
+ (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, CPol:$cpol),
+ " $sdata, $sbase, $offset$cpol", []> {
let BaseClass = baseClass;
let SrcClass = srcClass;
let PseudoInstr = opName # "_SGPR";
@@ -227,24 +237,32 @@
let ScalarStore = 1;
let hasSideEffects = 1;
let maybeAtomic = 1;
+
+ let IsAtomicNoRet = !not(isRet);
+ let IsAtomicRet = isRet;
+
+ let AsmMatchConverter = "cvtSMEMAtomic";
}
class SM_Pseudo_Atomic<string opName,
RegisterClass baseClass,
RegisterClass dataClass,
bit isImm,
- bit isRet> :
+ bit isRet,
+ string opNameWithSuffix = opName # !if(isImm,
+ !if(isRet, "_IMM_RTN", "_IMM"),
+ !if(isRet, "_SGPR_RTN", "_SGPR")),
+ Operand CPolTy = !if(isRet, CPol_GLC1, CPol)> :
SM_Atomic_Pseudo<opName,
!if(isRet, (outs dataClass:$sdst), (outs)),
!if(isImm,
- (ins dataClass:$sdata, baseClass:$sbase, smem_offset:$offset, DLC:$dlc),
- (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, DLC:$dlc)),
- !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", "") # "$dlc",
- isRet> {
+ (ins dataClass:$sdata, baseClass:$sbase, smem_offset:$offset, CPolTy:$cpol),
+ (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, CPolTy:$cpol)),
+ !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset$cpol",
+ isRet>,
+ AtomicNoRet <opNameWithSuffix, isRet> {
let offset_is_imm = isImm;
- let PseudoInstr = opName # !if(isImm,
- !if(isRet, "_IMM_RTN", "_IMM"),
- !if(isRet, "_SGPR_RTN", "_SGPR"));
+ let PseudoInstr = opNameWithSuffix;
let Constraints = !if(isRet, "$sdst = $sdata", "");
let DisableEncoding = !if(isRet, "$sdata", "");
@@ -456,13 +474,13 @@
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_si : SMRD_Real_si <op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, CPol:$cpol);
}
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _SGPR_si : SMRD_Real_si <op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
}
}
@@ -490,32 +508,31 @@
: SM_Real<ps>
, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI>
, Enc64 {
- bit glc;
-
let AssemblerPredicate = isGFX8GFX9;
let DecoderNamespace = "GFX8";
let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
- let Inst{16} = !if(ps.has_glc, glc, ?);
+ let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?);
let Inst{17} = imm;
let Inst{25-18} = op;
let Inst{31-26} = 0x30; //encoding
// VI supports 20-bit unsigned offsets while GFX9+ supports 21-bit signed.
// Offset value is corrected accordingly when offset is encoded/decoded.
- let Inst{52-32} = !if(ps.has_offset, offset{20-0}, ?);
+ let Inst{38-32} = !if(ps.has_offset, offset{6-0}, ?);
+ let Inst{52-39} = !if(ps.has_offset, !if(imm, offset{20-7}, ?), ?);
}
multiclass SM_Real_Loads_vi<bits<8> op, string ps,
SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_vi : SMEM_Real_vi <op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
}
def _SGPR_vi : SMEM_Real_vi <op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
}
}
@@ -533,11 +550,11 @@
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _IMM_vi : SMEM_Real_Store_vi <op, immPs> {
- let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
}
def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> {
- let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
}
}
@@ -589,15 +606,16 @@
//===----------------------------------------------------------------------===//
class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps>
- : SMEM_Real_vi <op, ps> {
+ : SMEM_Real_vi <op, ps>,
+ AtomicNoRet <!subst("_RTN","",NAME), ps.glc> {
bits<7> sdata;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
- let glc = ps.glc;
- let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0});
+ let cpol{CPolBit.GLC} = ps.glc;
+ let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0});
}
multiclass SM_Real_Atomics_vi<bits<8> op, string ps> {
@@ -686,13 +704,7 @@
let AssemblerPredicate = isGFX7Only;
let DecoderNamespace = "GFX7";
- let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc, DLC:$dlc);
-
- let LGKM_CNT = ps.LGKM_CNT;
- let mayLoad = ps.mayLoad;
- let mayStore = ps.mayStore;
- let hasSideEffects = ps.hasSideEffects;
- let SchedRW = ps.SchedRW;
+ let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, CPol:$cpol);
let Inst{7-0} = 0xff;
let Inst{8} = 0;
@@ -764,26 +776,26 @@
// 1. IMM offset
def : GCNPat <
(smrd_load (SMRDImm i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0, 0))
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
>;
// 2. 32-bit IMM offset on CI
def : GCNPat <
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
- (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0, 0))> {
+ (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
let OtherPredicates = [isGFX7Only];
}
// 3. SGPR offset
def : GCNPat <
(smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0, 0))
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
>;
// 4. No offset
def : GCNPat <
(vt (smrd_load (i64 SReg_64:$sbase))),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0, 0))
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))
>;
}
@@ -791,8 +803,7 @@
// 1. Offset as an immediate
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_glc $cachepolicy),
- (extract_dlc $cachepolicy)))> {
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_cpol $cachepolicy)))> {
let AddedComplexity = 2;
}
@@ -800,7 +811,7 @@
def : GCNPat <
(vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)),
(!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset,
- (extract_glc $cachepolicy), (extract_dlc $cachepolicy))> {
+ (extract_cpol $cachepolicy))> {
let OtherPredicates = [isGFX7Only];
let AddedComplexity = 1;
}
@@ -808,8 +819,7 @@
// 3. Offset loaded in an 32bit SGPR
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, i32:$offset, timm:$cachepolicy),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$offset, (extract_glc $cachepolicy),
- (extract_dlc $cachepolicy)))
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$offset, (extract_cpol $cachepolicy)))
>;
}
@@ -858,14 +868,16 @@
>;
} // let OtherPredicates = [HasSMemTimeInst]
-let OtherPredicates = [HasNoSMemTimeInst] in {
+let OtherPredicates = [HasShaderCyclesRegister] in {
def : GCNPat <
(i64 (readcyclecounter)),
(REG_SEQUENCE SReg_64,
(S_GETREG_B32 getHwRegImm<HWREG.SHADER_CYCLES, 0, -12>.ret), sub0,
- (S_MOV_B32 (i32 0)), sub1)
->;
-} // let OtherPredicates = [HasNoSMemTimeInst]
+ (S_MOV_B32 (i32 0)), sub1)> {
+ // Prefer this to s_memtime because it has lower and more predictable latency.
+ let AddedComplexity = 1;
+}
+} // let OtherPredicates = [HasShaderCyclesRegister]
//===----------------------------------------------------------------------===//
// GFX10.
@@ -873,16 +885,13 @@
class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> :
SM_Real<ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10>, Enc64 {
- bit glc;
- bit dlc;
-
let AssemblerPredicate = isGFX10Plus;
let DecoderNamespace = "GFX10";
let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
- let Inst{14} = !if(ps.has_dlc, dlc, ?);
- let Inst{16} = !if(ps.has_glc, glc, ?);
+ let Inst{14} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ?);
+ let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?);
let Inst{25-18} = op;
let Inst{31-26} = 0x3d;
let Inst{52-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{20-0}, ?), ?);
@@ -894,10 +903,10 @@
SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_gfx10 : SMEM_Real_gfx10<op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
}
def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
}
}
@@ -914,11 +923,11 @@
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> {
- let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
}
def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> {
- let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
}
}
@@ -973,18 +982,18 @@
defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx10 <0x27, "S_ATC_PROBE_BUFFER">;
class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps>
- : SMEM_Real_gfx10 <op, ps> {
+ : SMEM_Real_gfx10 <op, ps>,
+ AtomicNoRet <!subst("_RTN","",NAME), ps.glc> {
bits<7> sdata;
- bit dlc;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
- let glc = ps.glc;
+ let cpol{CPolBit.GLC} = ps.glc;
- let Inst{14} = !if(ps.has_dlc, dlc, 0);
- let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0});
+ let Inst{14} = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0);
+ let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0});
}
multiclass SM_Real_Atomics_gfx10<bits<8> op, string ps> {
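
The SMEM changes above replace the separate glc/dlc immediates with a single cache-policy operand whose bits are selected as cpol{CPolBit.GLC} and cpol{CPolBit.DLC}. As a loose illustration of that packing, here is a small sketch; the bit positions below are assumptions chosen for the example only, not the actual CPolBit encoding.

  #include <cassert>
  #include <cstdint>

  // Hypothetical bit positions, for the example only.
  constexpr unsigned GLC_BIT = 0;
  constexpr unsigned DLC_BIT = 1;

  // Pack the individual cache-policy flags into one operand word.
  constexpr uint32_t packCPol(bool GLC, bool DLC) {
    return (uint32_t(GLC) << GLC_BIT) | (uint32_t(DLC) << DLC_BIT);
  }
  // Extract a single flag, mirroring the per-bit selection in the encodings.
  constexpr bool getGLC(uint32_t CPol) { return (CPol >> GLC_BIT) & 1; }
  constexpr bool getDLC(uint32_t CPol) { return (CPol >> DLC_BIT) & 1; }

  int main() {
    uint32_t CPol = packCPol(/*GLC=*/true, /*DLC=*/false);
    assert(getGLC(CPol) && !getDLC(CPol));
  }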
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 7426af9..e969701 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -59,6 +59,8 @@
real_name # " " # ps.AsmOperands, []>,
Enc32 {
+ let SALU = 1;
+ let SOP1 = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
let Size = 4;
@@ -66,6 +68,9 @@
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
// encoding
bits<7> sdst;
@@ -157,7 +162,7 @@
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def S_MOV_B32 : SOP1_32 <"s_mov_b32">;
def S_MOV_B64 : SOP1_64 <"s_mov_b64">;
- } // End isRematerializeable = 1
+ } // End isReMaterializable = 1
let Uses = [SCC] in {
def S_CMOV_B32 : SOP1_32 <"s_cmov_b32">;
@@ -192,10 +197,14 @@
>;
}
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def S_BREV_B32 : SOP1_32 <"s_brev_b32",
[(set i32:$sdst, (bitreverse i32:$src0))]
>;
-def S_BREV_B64 : SOP1_64 <"s_brev_b64">;
+def S_BREV_B64 : SOP1_64 <"s_brev_b64",
+ [(set i64:$sdst, (bitreverse i64:$src0))]
+>;
+} // End isReMaterializable = 1, isAsCheapAsAMove = 1
let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
@@ -208,6 +217,7 @@
>;
} // End Defs = [SCC]
+let isReMaterializable = 1 in {
def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64",
@@ -235,11 +245,13 @@
def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16",
[(set i32:$sdst, (sext_inreg i32:$src0, i16))]
>;
+} // End isReMaterializable = 1
def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32", [], 1>;
def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>;
def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>;
def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>;
+
def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64",
[(set i64:$sdst, (int_amdgcn_s_getpc))]
>;
@@ -291,7 +303,9 @@
} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9
let Defs = [SCC] in {
-def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
+def S_ABS_I32 : SOP1_32 <"s_abs_i32",
+ [(set i32:$sdst, (abs i32:$src0))]
+ >;
} // End Defs = [SCC]
let SubtargetPredicate = HasVGPRIndexMode in {
@@ -309,6 +323,7 @@
def S_ANDN2_WREXEC_B64 : SOP1_64<"s_andn2_wrexec_b64">;
} // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
+ let isReMaterializable = 1 in
def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
} // End SubtargetPredicate = isGFX9Plus
@@ -363,14 +378,19 @@
InstSI <ps.OutOperandList, ps.InOperandList,
real_name # " " # ps.AsmOperands, []>,
Enc32 {
+ let SALU = 1;
+ let SOP2 = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
// copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let AsmMatchConverter = ps.AsmMatchConverter;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
let UseNamedOperandTable = ps.UseNamedOperandTable;
- let TSFlags = ps.TSFlags;
+ let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
// encoding
bits<7> sdst;
@@ -596,6 +616,7 @@
>;
} // End Defs = [SCC]
+let isReMaterializable = 1 in {
def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
[(set i32:$sdst, (UniformBinFrag<AMDGPUbfm> i32:$src0, i32:$src1))]>;
def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">;
@@ -605,7 +626,7 @@
[(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
let isCommutable = 1;
}
-
+} // End isReMaterializable = 1
} // End AddedComplexity = 1
let Defs = [SCC] in {
@@ -640,9 +661,11 @@
}
let SubtargetPredicate = isGFX9Plus in {
- def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">;
- def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">;
- def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
+ let isReMaterializable = 1 in {
+ def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">;
+ def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">;
+ def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
+ } // End isReMaterializable = 1
let Defs = [SCC] in {
def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32",
@@ -659,12 +682,12 @@
>;
} // End Defs = [SCC]
- let isCommutable = 1 in {
+ let isCommutable = 1, isReMaterializable = 1 in {
def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32",
[(set i32:$sdst, (UniformBinFrag<mulhu> SSrc_b32:$src0, SSrc_b32:$src1))]>;
def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32",
[(set i32:$sdst, (UniformBinFrag<mulhs> SSrc_b32:$src0, SSrc_b32:$src1))]>;
- }
+ } // End isCommutable = 1, isReMaterializable = 1
} // End SubtargetPredicate = isGFX9Plus
//===----------------------------------------------------------------------===//
@@ -693,6 +716,8 @@
class SOPK_Real<bits<5> op, SOPK_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList,
ps.Mnemonic # " " # ps.AsmOperands, []> {
+ let SALU = 1;
+ let SOPK = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -701,6 +726,11 @@
let AsmMatchConverter = ps.AsmMatchConverter;
let DisableEncoding = ps.DisableEncoding;
let Constraints = ps.Constraints;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let isBranch = ps.isBranch;
+ let isCall = ps.isCall;
// encoding
bits<7> sdst;
@@ -947,15 +977,20 @@
InstSI <ps.OutOperandList, ps.InOperandList,
real_name # " " # ps.AsmOperands, []>,
Enc32 {
+ let SALU = 1;
+ let SOPC = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
// copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let OtherPredicates = ps.OtherPredicates;
- let AsmMatchConverter = ps.AsmMatchConverter;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let OtherPredicates = ps.OtherPredicates;
+ let AsmMatchConverter = ps.AsmMatchConverter;
let UseNamedOperandTable = ps.UseNamedOperandTable;
- let TSFlags = ps.TSFlags;
+ let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
// encoding
bits<8> src0;
@@ -1075,15 +1110,20 @@
class SOPP_Real<bits<7> op, SOPP_Pseudo ps, string real_name = ps.Mnemonic> :
InstSI <ps.OutOperandList, ps.InOperandList,
real_name # ps.AsmOperands, []> {
+ let SALU = 1;
+ let SOPP = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
// copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let OtherPredicates = ps.OtherPredicates;
- let AsmMatchConverter = ps.AsmMatchConverter;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let OtherPredicates = ps.OtherPredicates;
+ let AsmMatchConverter = ps.AsmMatchConverter;
let UseNamedOperandTable = ps.UseNamedOperandTable;
- let TSFlags = ps.TSFlags;
+ let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
bits <16> simm16;
}
@@ -1226,7 +1266,8 @@
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins WAIT_FLAG:$simm16), "$simm16",
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
-def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i16imm:$simm16), "$simm16">;
+def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
+ [(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
// On SI the documentation says sleep for approximately 64 * low 2
@@ -1433,8 +1474,9 @@
//===----------------------------------------------------------------------===//
multiclass SOP1_Real_gfx10<bits<8> op> {
- def _gfx10 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
- Select_gfx10<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOP1_Pseudo>(NAME);
+ def _gfx10 : SOP1_Real<op, ps>,
+ Select_gfx10<ps.Mnemonic>;
}
defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>;
@@ -1462,8 +1504,9 @@
multiclass SOP1_Real_gfx6_gfx7<bits<8> op> {
- def _gfx6_gfx7 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
- Select_gfx6_gfx7<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOP1_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOP1_Real<op, ps>,
+ Select_gfx6_gfx7<ps.Mnemonic>;
}
multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> :
@@ -1524,8 +1567,9 @@
//===----------------------------------------------------------------------===//
multiclass SOP2_Real_gfx10<bits<7> op> {
- def _gfx10 : SOP2_Real<op, !cast<SOP2_Pseudo>(NAME)>,
- Select_gfx10<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOP2_Pseudo>(NAME);
+ def _gfx10 : SOP2_Real<op, ps>,
+ Select_gfx10<ps.Mnemonic>;
}
defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>;
@@ -1543,8 +1587,9 @@
//===----------------------------------------------------------------------===//
multiclass SOP2_Real_gfx6_gfx7<bits<7> op> {
- def _gfx6_gfx7 : SOP2_Real<op, !cast<SOP_Pseudo>(NAME)>,
- Select_gfx6_gfx7<!cast<SOP_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOP_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOP2_Real<op, ps>,
+ Select_gfx6_gfx7<ps.Mnemonic>;
}
multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> :
@@ -1600,13 +1645,15 @@
//===----------------------------------------------------------------------===//
multiclass SOPK_Real32_gfx10<bits<5> op> {
- def _gfx10 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPK_Pseudo>(NAME);
+ def _gfx10 : SOPK_Real32<op, ps>,
+ Select_gfx10<ps.Mnemonic>;
}
multiclass SOPK_Real64_gfx10<bits<5> op> {
- def _gfx10 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPK_Pseudo>(NAME);
+ def _gfx10 : SOPK_Real64<op, ps>,
+ Select_gfx10<ps.Mnemonic>;
}
defm S_VERSION : SOPK_Real32_gfx10<0x001>;
@@ -1623,13 +1670,15 @@
//===----------------------------------------------------------------------===//
multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> {
- def _gfx6_gfx7 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPK_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOPK_Real32<op, ps>,
+ Select_gfx6_gfx7<ps.Mnemonic>;
}
multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> {
- def _gfx6_gfx7 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPK_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOPK_Real64<op, ps>,
+ Select_gfx6_gfx7<ps.Mnemonic>;
}
multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> :
@@ -1665,21 +1714,24 @@
//===----------------------------------------------------------------------===//
multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic> {
- def _gfx6_gfx7 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>,
- Select_gfx6_gfx7<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
- SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx6_gfx7">;
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOPP_Real_32<op, ps, real_name>,
+ Select_gfx6_gfx7<ps.Mnemonic>,
+ SOPPRelaxTable<0, ps.KeyName, "_gfx6_gfx7">;
}
multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
- def _vi : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>,
- Select_vi<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
- SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_vi">;
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _vi : SOPP_Real_32<op, ps, real_name>,
+ Select_vi<ps.Mnemonic>,
+ SOPPRelaxTable<0, ps.KeyName, "_vi">;
}
multiclass SOPP_Real_32_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
- def _gfx10 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>,
- Select_gfx10<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
- SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx10">;
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _gfx10 : SOPP_Real_32<op, ps, real_name>,
+ Select_gfx10<ps.Mnemonic>,
+ SOPPRelaxTable<0, ps.KeyName, "_gfx10">;
}
multiclass SOPP_Real_32_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
@@ -1693,21 +1745,24 @@
//64 bit encodings, for Relaxation
multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
- def _gfx6_gfx7 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>,
- Select_gfx6_gfx7<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
- SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx6_gfx7">;
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOPP_Real_64<op, ps, real_name>,
+ Select_gfx6_gfx7<ps.Mnemonic>,
+ SOPPRelaxTable<1, ps.KeyName, "_gfx6_gfx7">;
}
multiclass SOPP_Real_64_gfx8_gfx9<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
- def _vi : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>,
- Select_vi<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
- SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_vi">;
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _vi : SOPP_Real_64<op, ps, real_name>,
+ Select_vi<ps.Mnemonic>,
+ SOPPRelaxTable<1, ps.KeyName, "_vi">;
}
multiclass SOPP_Real_64_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
- def _gfx10 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>,
- Select_gfx10<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
- SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx10">;
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _gfx10 : SOPP_Real_64<op, ps, real_name>,
+ Select_gfx10<ps.Mnemonic>,
+ SOPPRelaxTable<1, ps.KeyName, "_gfx10">;
}
multiclass SOPP_Real_64_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
@@ -1727,18 +1782,7 @@
defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x000>;
defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x001, "s_endpgm">;
-defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x002>;
defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>;
-defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x004>;
-defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x005>;
-defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x006>;
-defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x007>;
-defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x008>;
-defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x009>;
-defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x017>;
-defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x018>;
-defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x019>;
-defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x01A>;
defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>;
defm S_WAITCNT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00c>;
defm S_SETHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00d>;
@@ -1765,23 +1809,40 @@
defm S_DENORM_MODE : SOPP_Real_32_gfx10<0x025>;
defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx10<0x028>;
+let isBranch = 1 in {
+defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x002>;
+defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x004>;
+defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x005>;
+defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x006>;
+defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x007>;
+defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x008>;
+defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x009>;
+defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x017>;
+defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x018>;
+defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x019>;
+defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x01A>;
+}
+
//===----------------------------------------------------------------------===//
// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10
//===----------------------------------------------------------------------===//
multiclass SOPC_Real_gfx6_gfx7<bits<7> op> {
- def _gfx6_gfx7 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
- Select_gfx6_gfx7<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPC_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOPC_Real<op, ps>,
+ Select_gfx6_gfx7<ps.Mnemonic>;
}
multiclass SOPC_Real_gfx8_gfx9<bits<7> op> {
- def _vi : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
- Select_vi<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPC_Pseudo>(NAME);
+ def _vi : SOPC_Real<op, ps>,
+ Select_vi<ps.Mnemonic>;
}
multiclass SOPC_Real_gfx10<bits<7> op> {
- def _gfx10 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
- Select_gfx10<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPC_Pseudo>(NAME);
+ def _gfx10 : SOPC_Real<op, ps>,
+ Select_gfx10<ps.Mnemonic>;
}
multiclass SOPC_Real_gfx8_gfx9_gfx10<bits<7> op> :
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index c8a85d7..0bee902 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -15,19 +15,19 @@
namespace SendMsg {
// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h.
-const char* const IdSymbolic[] = {
+const char *const IdSymbolic[ID_GAPS_LAST_] = {
nullptr,
"MSG_INTERRUPT",
"MSG_GS",
"MSG_GS_DONE",
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- nullptr,
+ "MSG_SAVEWAVE",
+ "MSG_STALL_WAVE_GEN",
+ "MSG_HALT_WAVES",
+ "MSG_ORDERED_PS_DONE",
+ "MSG_EARLY_PRIM_DEALLOC",
"MSG_GS_ALLOC_REQ",
"MSG_GET_DOORBELL",
- nullptr,
+ "MSG_GET_DDID",
nullptr,
nullptr,
nullptr,
@@ -35,7 +35,7 @@
};
// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
-const char* const OpSysSymbolic[] = {
+const char *const OpSysSymbolic[OP_SYS_LAST_] = {
nullptr,
"SYSMSG_OP_ECC_ERR_INTERRUPT",
"SYSMSG_OP_REG_RD",
@@ -43,7 +43,7 @@
"SYSMSG_OP_TTRACE_PC"
};
-const char* const OpGsSymbolic[] = {
+const char *const OpGsSymbolic[OP_GS_LAST_] = {
"GS_OP_NOP",
"GS_OP_CUT",
"GS_OP_EMIT",
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index 3eb27c5..d1deb57 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
+#include "SIDefines.h"
+
namespace llvm {
class StringLiteral;
@@ -17,9 +19,9 @@
namespace SendMsg { // Symbolic names for the sendmsg(...) syntax.
-extern const char* const IdSymbolic[];
-extern const char* const OpSysSymbolic[];
-extern const char* const OpGsSymbolic[];
+extern const char *const IdSymbolic[ID_GAPS_LAST_];
+extern const char *const OpSysSymbolic[OP_SYS_LAST_];
+extern const char *const OpGsSymbolic[OP_GS_LAST_];
} // namespace SendMsg
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4c1e4de..29bbf50 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -30,7 +30,8 @@
static llvm::cl::opt<unsigned> AmdhsaCodeObjectVersion(
"amdhsa-code-object-version", llvm::cl::Hidden,
- llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(3));
+ llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(4),
+ llvm::cl::ZeroOrMore);
namespace {
@@ -96,23 +97,36 @@
return ELF::ELFABIVERSION_AMDGPU_HSA_V2;
case 3:
return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ case 4:
+ return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
default:
- return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") +
+ Twine(AmdhsaCodeObjectVersion));
}
}
bool isHsaAbiVersion2(const MCSubtargetInfo *STI) {
- if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
- return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V2;
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V2;
return false;
}
bool isHsaAbiVersion3(const MCSubtargetInfo *STI) {
- if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
- return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
return false;
}
+bool isHsaAbiVersion4(const MCSubtargetInfo *STI) {
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4;
+ return false;
+}
+
+bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) {
+ return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI);
+}
+
#define GET_MIMGBaseOpcodesTable_IMPL
#define GET_MIMGDimInfoTable_IMPL
#define GET_MIMGInfoTable_IMPL
@@ -141,6 +155,34 @@
return NewInfo ? NewInfo->Opcode : -1;
}
+unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode,
+ const MIMGDimInfo *Dim, bool IsA16,
+ bool IsG16Supported) {
+ unsigned AddrWords = BaseOpcode->NumExtraArgs;
+ unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ if (IsA16)
+ AddrWords += divideCeil(AddrComponents, 2);
+ else
+ AddrWords += AddrComponents;
+
+ // Note: For subtargets that support A16 but not G16, enabling A16 also
+ // enables 16 bit gradients.
+ // For subtargets that support A16 (operand) and G16 (done with a different
+ // instruction encoding), they are independent.
+
+ if (BaseOpcode->Gradients) {
+ if ((IsA16 && !IsG16Supported) || BaseOpcode->G16)
+      // There are two gradients per coordinate; we pack them separately.
+      // For the 3d case, we get
+      // (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv).
+ AddrWords += alignTo<2>(Dim->NumGradients / 2);
+ else
+ AddrWords += Dim->NumGradients;
+ }
+ return AddrWords;
+}
+
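
As a rough standalone illustration of the address-word arithmetic in getAddrSizeMIMGOp above (the operand counts below are hypothetical and not taken from the generated MIMG tables): a 2D gradient sample with 16-bit addresses on a target without G16 packs both the coordinates and the gradients into 16-bit pairs.

// Hypothetical 2D sample_d: 2 coordinates, 4 gradients, no extra arguments.
static unsigned addrWordsSketch() {
  const unsigned NumExtraArgs = 0;
  const unsigned NumCoords = 2;     // 2D
  const unsigned NumGradients = 4;  // dx/du, dy/du, dx/dv, dy/dv
  const bool IsA16 = true, IsG16Supported = false;

  unsigned AddrWords = NumExtraArgs;
  unsigned AddrComponents = NumCoords;           // no LOD/clamp/mip here
  AddrWords += IsA16 ? (AddrComponents + 1) / 2  // 16-bit coords packed in pairs
                     : AddrComponents;

  // A16 without G16 support also packs the gradients pairwise.
  if (IsA16 && !IsG16Supported)
    AddrWords += (NumGradients / 2 + 1) & ~1u;   // alignTo<2>(NumGradients / 2)
  else
    AddrWords += NumGradients;

  return AddrWords; // 1 coordinate word + 2 gradient words = 3
}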
struct MUBUFInfo {
uint16_t Opcode;
uint16_t BaseOpcode;
@@ -148,6 +190,7 @@
bool has_vaddr;
bool has_srsrc;
bool has_soffset;
+ bool IsBufferInv;
};
struct MTBUFInfo {
@@ -164,12 +207,23 @@
bool IsBuffer;
};
+struct VOPInfo {
+ uint16_t Opcode;
+ bool IsSingle;
+};
+
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
#define GET_MUBUFInfoTable_IMPL
#define GET_SMInfoTable_DECL
#define GET_SMInfoTable_IMPL
+#define GET_VOP1InfoTable_DECL
+#define GET_VOP1InfoTable_IMPL
+#define GET_VOP2InfoTable_DECL
+#define GET_VOP2InfoTable_IMPL
+#define GET_VOP3InfoTable_DECL
+#define GET_VOP3InfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMTBUFBaseOpcode(unsigned Opc) {
@@ -232,11 +286,31 @@
return Info ? Info->has_soffset : false;
}
+bool getMUBUFIsBufferInv(unsigned Opc) {
+ const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+ return Info ? Info->IsBufferInv : false;
+}
+
bool getSMEMIsBuffer(unsigned Opc) {
const SMInfo *Info = getSMEMOpcodeHelper(Opc);
return Info ? Info->IsBuffer : false;
}
+bool getVOP1IsSingle(unsigned Opc) {
+ const VOPInfo *Info = getVOP1OpcodeHelper(Opc);
+ return Info ? Info->IsSingle : false;
+}
+
+bool getVOP2IsSingle(unsigned Opc) {
+ const VOPInfo *Info = getVOP2OpcodeHelper(Opc);
+ return Info ? Info->IsSingle : false;
+}
+
+bool getVOP3IsSingle(unsigned Opc) {
+ const VOPInfo *Info = getVOP3OpcodeHelper(Opc);
+ return Info ? Info->IsSingle : false;
+}
+
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
@@ -247,7 +321,8 @@
namespace IsaInfo {
AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
- : XnackSetting(TargetIDSetting::Any), SramEccSetting(TargetIDSetting::Any) {
+ : STI(STI), XnackSetting(TargetIDSetting::Any),
+ SramEccSetting(TargetIDSetting::Any) {
if (!STI.getFeatureBits().test(FeatureSupportsXNACK))
XnackSetting = TargetIDSetting::Unsupported;
if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC))
@@ -334,25 +409,109 @@
}
}
-void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
- auto TargetTriple = STI->getTargetTriple();
- auto Version = getIsaVersion(STI->getCPU());
+std::string AMDGPUTargetID::toString() const {
+ std::string StringRep = "";
+ raw_string_ostream StreamRep(StringRep);
- Stream << TargetTriple.getArchName() << '-'
- << TargetTriple.getVendorName() << '-'
- << TargetTriple.getOSName() << '-'
- << TargetTriple.getEnvironmentName() << '-'
- << "gfx"
- << Version.Major
- << Version.Minor
- << Version.Stepping;
+ auto TargetTriple = STI.getTargetTriple();
+ auto Version = getIsaVersion(STI.getCPU());
- if (hasXNACK(*STI))
- Stream << "+xnack";
- if (hasSRAMECC(*STI))
- Stream << "+sramecc";
+ StreamRep << TargetTriple.getArchName() << '-'
+ << TargetTriple.getVendorName() << '-'
+ << TargetTriple.getOSName() << '-'
+ << TargetTriple.getEnvironmentName() << '-';
- Stream.flush();
+ std::string Processor = "";
+ // TODO: The following else statement is present here because we used various
+ // alias names for GPUs up until GFX9 (e.g. 'fiji' is the same as 'gfx803').
+ // Remove once all aliases are removed from GCNProcessors.td.
+ if (Version.Major >= 9)
+ Processor = STI.getCPU().str();
+ else
+ Processor = (Twine("gfx") + Twine(Version.Major) + Twine(Version.Minor) +
+ Twine(Version.Stepping))
+ .str();
+
+ std::string Features = "";
+ if (Optional<uint8_t> HsaAbiVersion = getHsaAbiVersion(&STI)) {
+ switch (*HsaAbiVersion) {
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ // Code object V2 only supported specific processors and had fixed
+ // settings for the XNACK.
+ if (Processor == "gfx600") {
+ } else if (Processor == "gfx601") {
+ } else if (Processor == "gfx602") {
+ } else if (Processor == "gfx700") {
+ } else if (Processor == "gfx701") {
+ } else if (Processor == "gfx702") {
+ } else if (Processor == "gfx703") {
+ } else if (Processor == "gfx704") {
+ } else if (Processor == "gfx705") {
+ } else if (Processor == "gfx801") {
+ if (!isXnackOnOrAny())
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor +
+ " without XNACK");
+ } else if (Processor == "gfx802") {
+ } else if (Processor == "gfx803") {
+ } else if (Processor == "gfx805") {
+ } else if (Processor == "gfx810") {
+ if (!isXnackOnOrAny())
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor +
+ " without XNACK");
+ } else if (Processor == "gfx900") {
+ if (isXnackOnOrAny())
+ Processor = "gfx901";
+ } else if (Processor == "gfx902") {
+ if (isXnackOnOrAny())
+ Processor = "gfx903";
+ } else if (Processor == "gfx904") {
+ if (isXnackOnOrAny())
+ Processor = "gfx905";
+ } else if (Processor == "gfx906") {
+ if (isXnackOnOrAny())
+ Processor = "gfx907";
+ } else if (Processor == "gfx90c") {
+ if (isXnackOnOrAny())
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor +
+ " with XNACK being ON or ANY");
+ } else {
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor);
+ }
+ break;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ // xnack.
+ if (isXnackOnOrAny())
+ Features += "+xnack";
+ // In code object v2 and v3, the "sramecc" feature was spelled with a
+ // hyphen ("sram-ecc").
+ if (isSramEccOnOrAny())
+ Features += "+sram-ecc";
+ break;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ // sramecc.
+ if (getSramEccSetting() == TargetIDSetting::Off)
+ Features += ":sramecc-";
+ else if (getSramEccSetting() == TargetIDSetting::On)
+ Features += ":sramecc+";
+ // xnack.
+ if (getXnackSetting() == TargetIDSetting::Off)
+ Features += ":xnack-";
+ else if (getXnackSetting() == TargetIDSetting::On)
+ Features += ":xnack+";
+ break;
+ default:
+ break;
+ }
+ }
+
+ StreamRep << Processor << Features;
+
+ StreamRep.flush();
+ return StringRep;
}
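
For a quick sanity check of the V4 branch above: under code object V4, a gfx90a target with SRAMECC on and XNACK off would, as far as this logic goes, produce something like "amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-". Below is a minimal standalone sketch of just the feature-suffix part, using a local tri-state enum in place of the real TargetIDSetting.

#include <string>

enum class Setting { Unsupported, Any, Off, On };

// Builds the V4-style feature suffix appended after the processor name.
static std::string targetIDSuffixV4(Setting SramEcc, Setting Xnack) {
  std::string Features;
  if (SramEcc == Setting::Off)
    Features += ":sramecc-";
  else if (SramEcc == Setting::On)
    Features += ":sramecc+";
  if (Xnack == Setting::Off)
    Features += ":xnack-";
  else if (Xnack == Setting::On)
    Features += ":xnack+";
  return Features; // e.g. ":sramecc+:xnack-"; empty when both settings are Any
}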
unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
@@ -402,6 +561,8 @@
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
// FIXME: Need to take scratch memory into account.
+ if (isGFX90A(*STI))
+ return 8;
if (!isGFX10Plus(*STI))
return 10;
return hasGFX10_3Insts(*STI) ? 16 : 20;
@@ -531,6 +692,9 @@
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
Optional<bool> EnableWavefrontSize32) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 8;
+
bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -543,6 +707,8 @@
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
Optional<bool> EnableWavefrontSize32) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 8;
bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
@@ -552,12 +718,16 @@
}
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 512;
if (!isGFX10Plus(*STI))
return 256;
return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
}
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 512;
return 256;
}
@@ -653,6 +823,11 @@
AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1);
}
+ if (AMDGPU::isGFX90A(*STI)) {
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc3,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+ STI->getFeatureBits().test(FeatureTgSplit) ? 1 : 0);
+ }
return KD;
}
@@ -1049,23 +1224,32 @@
return ID_UNKNOWN_;
}
-static bool isValidMsgId(int64_t MsgId) {
- return (ID_GAPS_FIRST_ <= MsgId && MsgId < ID_GAPS_LAST_) && IdSymbolic[MsgId];
-}
-
bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) {
if (Strict) {
- if (MsgId == ID_GS_ALLOC_REQ || MsgId == ID_GET_DOORBELL)
+ switch (MsgId) {
+ case ID_SAVEWAVE:
+ return isVI(STI) || isGFX9Plus(STI);
+ case ID_STALL_WAVE_GEN:
+ case ID_HALT_WAVES:
+ case ID_ORDERED_PS_DONE:
+ case ID_GS_ALLOC_REQ:
+ case ID_GET_DOORBELL:
return isGFX9Plus(STI);
- else
- return isValidMsgId(MsgId);
+ case ID_EARLY_PRIM_DEALLOC:
+ return isGFX9(STI);
+ case ID_GET_DDID:
+ return isGFX10Plus(STI);
+ default:
+ return 0 <= MsgId && MsgId < ID_GAPS_LAST_ && IdSymbolic[MsgId];
+ }
} else {
return 0 <= MsgId && isUInt<ID_WIDTH_>(MsgId);
}
}
StringRef getMsgName(int64_t MsgId) {
- return isValidMsgId(MsgId)? IdSymbolic[MsgId] : "";
+ assert(0 <= MsgId && MsgId < ID_GAPS_LAST_);
+ return IdSymbolic[MsgId];
}
int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
@@ -1080,7 +1264,9 @@
return OP_UNKNOWN_;
}
-bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict) {
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
+ bool Strict) {
+ assert(isValidMsgId(MsgId, STI, Strict));
if (!Strict)
return 0 <= OpId && isUInt<OP_WIDTH_>(OpId);
@@ -1103,7 +1289,9 @@
return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId];
}
-bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict) {
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
+ const MCSubtargetInfo &STI, bool Strict) {
+ assert(isValidMsgOp(MsgId, OpId, STI, Strict));
if (!Strict)
return 0 <= StreamId && isUInt<STREAM_ID_WIDTH_>(StreamId);
@@ -1156,6 +1344,17 @@
return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}
+bool getHasColorExport(const Function &F) {
+ // As a safe default always respond as if PS has color exports.
+ return getIntegerAttribute(
+ F, "amdgpu-color-export",
+ F.getCallingConv() == CallingConv::AMDGPU_PS ? 1 : 0) != 0;
+}
+
+bool getHasDepthExport(const Function &F) {
+ return getIntegerAttribute(F, "amdgpu-depth-export", 0) != 0;
+}
+
bool isShader(CallingConv::ID cc) {
switch(cc) {
case CallingConv::AMDGPU_VS:
@@ -1259,6 +1458,10 @@
return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
}
+bool isGFX10_AEncoding(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10_AEncoding];
+}
+
bool isGFX10_BEncoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding];
}
@@ -1267,6 +1470,14 @@
return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts];
}
+bool isGFX90A(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
+}
+
+bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+}
+
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
@@ -1374,6 +1585,9 @@
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return true;
default:
return false;
@@ -1413,41 +1627,67 @@
case AMDGPU::VReg_64RegClassID:
case AMDGPU::AReg_64RegClassID:
case AMDGPU::SReg_64_XEXECRegClassID:
+ case AMDGPU::VReg_64_Align2RegClassID:
+ case AMDGPU::AReg_64_Align2RegClassID:
return 64;
case AMDGPU::SGPR_96RegClassID:
case AMDGPU::SReg_96RegClassID:
case AMDGPU::VReg_96RegClassID:
case AMDGPU::AReg_96RegClassID:
+ case AMDGPU::VReg_96_Align2RegClassID:
+ case AMDGPU::AReg_96_Align2RegClassID:
+ case AMDGPU::AV_96RegClassID:
return 96;
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::SReg_128RegClassID:
case AMDGPU::VReg_128RegClassID:
case AMDGPU::AReg_128RegClassID:
+ case AMDGPU::VReg_128_Align2RegClassID:
+ case AMDGPU::AReg_128_Align2RegClassID:
+ case AMDGPU::AV_128RegClassID:
return 128;
case AMDGPU::SGPR_160RegClassID:
case AMDGPU::SReg_160RegClassID:
case AMDGPU::VReg_160RegClassID:
case AMDGPU::AReg_160RegClassID:
+ case AMDGPU::VReg_160_Align2RegClassID:
+ case AMDGPU::AReg_160_Align2RegClassID:
+ case AMDGPU::AV_160RegClassID:
return 160;
case AMDGPU::SGPR_192RegClassID:
case AMDGPU::SReg_192RegClassID:
case AMDGPU::VReg_192RegClassID:
case AMDGPU::AReg_192RegClassID:
+ case AMDGPU::VReg_192_Align2RegClassID:
+ case AMDGPU::AReg_192_Align2RegClassID:
return 192;
+ case AMDGPU::SGPR_224RegClassID:
+ case AMDGPU::SReg_224RegClassID:
+ case AMDGPU::VReg_224RegClassID:
+ case AMDGPU::AReg_224RegClassID:
+ case AMDGPU::VReg_224_Align2RegClassID:
+ case AMDGPU::AReg_224_Align2RegClassID:
+ return 224;
case AMDGPU::SGPR_256RegClassID:
case AMDGPU::SReg_256RegClassID:
case AMDGPU::VReg_256RegClassID:
case AMDGPU::AReg_256RegClassID:
+ case AMDGPU::VReg_256_Align2RegClassID:
+ case AMDGPU::AReg_256_Align2RegClassID:
return 256;
case AMDGPU::SGPR_512RegClassID:
case AMDGPU::SReg_512RegClassID:
case AMDGPU::VReg_512RegClassID:
case AMDGPU::AReg_512RegClassID:
+ case AMDGPU::VReg_512_Align2RegClassID:
+ case AMDGPU::AReg_512_Align2RegClassID:
return 512;
case AMDGPU::SGPR_1024RegClassID:
case AMDGPU::SReg_1024RegClassID:
case AMDGPU::VReg_1024RegClassID:
case AMDGPU::AReg_1024RegClassID:
+ case AMDGPU::VReg_1024_Align2RegClassID:
+ case AMDGPU::AReg_1024_Align2RegClassID:
return 1024;
default:
llvm_unreachable("Unexpected register class");
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f937869..72c872d 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -44,6 +44,12 @@
/// \returns True if HSA OS ABI Version identification is 3,
/// false otherwise.
bool isHsaAbiVersion3(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 4,
+/// false otherwise.
+bool isHsaAbiVersion4(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 3 or 4,
+/// false otherwise.
+bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI);
struct GcnBufferFormatInfo {
unsigned Format;
@@ -78,6 +84,7 @@
class AMDGPUTargetID {
private:
+ const MCSubtargetInfo &STI;
TargetIDSetting XnackSetting;
TargetIDSetting SramEccSetting;
@@ -145,10 +152,10 @@
void setTargetIDFromFeaturesString(StringRef FS);
void setTargetIDFromTargetIDStream(StringRef TargetID);
-};
-/// Streams isa version string for given subtarget \p STI into \p Stream.
-void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
+ /// \returns String representation of an object.
+ std::string toString() const;
+};
/// \returns Wavefront size for given subtarget \p STI.
unsigned getWavefrontSize(const MCSubtargetInfo *STI);
@@ -284,6 +291,7 @@
bool Coordinates;
bool LodOrClampOrMip;
bool HasD16;
+ bool MSAA;
};
LLVM_READONLY
@@ -293,6 +301,7 @@
MIMGDim Dim;
uint8_t NumCoords;
uint8_t NumGradients;
+ bool MSAA;
bool DA;
uint8_t Encoding;
const char *AsmSuffix;
@@ -338,6 +347,11 @@
LLVM_READONLY
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
+LLVM_READONLY
+unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode,
+ const MIMGDimInfo *Dim, bool IsA16,
+ bool IsG16Supported);
+
struct MIMGInfo {
uint16_t Opcode;
uint16_t BaseOpcode;
@@ -386,9 +400,21 @@
bool getMUBUFHasSoffset(unsigned Opc);
LLVM_READONLY
+bool getMUBUFIsBufferInv(unsigned Opc);
+
+LLVM_READONLY
bool getSMEMIsBuffer(unsigned Opc);
LLVM_READONLY
+bool getVOP1IsSingle(unsigned Opc);
+
+LLVM_READONLY
+bool getVOP2IsSingle(unsigned Opc);
+
+LLVM_READONLY
+bool getVOP3IsSingle(unsigned Opc);
+
+LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
uint8_t NumFormat,
@@ -459,6 +485,14 @@
return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
}
+ bool hasWaitExceptVsCnt() const {
+ return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u;
+ }
+
+ bool hasWaitVsCnt() const {
+ return VsCnt != ~0u;
+ }
+
bool dominates(const Waitcnt &Other) const {
return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;
@@ -627,10 +661,12 @@
bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true);
LLVM_READNONE
-bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict = true);
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
+ bool Strict = true);
LLVM_READNONE
-bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict = true);
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
+ const MCSubtargetInfo &STI, bool Strict = true);
LLVM_READNONE
bool msgRequiresOp(int64_t MsgId);
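
A hedged usage sketch of the widened validation API above, written as if it sat next to these declarations (namespace qualification elided; canEncodeSendMsg is a made-up name): an assembler-side check that a complete sendmsg operand is encodable for the current subtarget.

// Assumes the same namespace scope as the declarations above.
inline bool canEncodeSendMsg(int64_t MsgId, int64_t OpId, int64_t StreamId,
                             const MCSubtargetInfo &STI) {
  // All three checks are strict by default, and the Op/Stream validators
  // assert that the earlier operands are valid, so short-circuiting matters.
  return isValidMsgId(MsgId, STI) && isValidMsgOp(MsgId, OpId, STI) &&
         isValidMsgStream(MsgId, OpId, StreamId, STI);
}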
@@ -653,6 +689,10 @@
unsigned getInitialPSInputAddr(const Function &F);
+bool getHasColorExport(const Function &F);
+
+bool getHasDepthExport(const Function &F);
+
LLVM_READNONE
bool isShader(CallingConv::ID CC);
@@ -701,8 +741,11 @@
bool isGFX10(const MCSubtargetInfo &STI);
bool isGFX10Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
+bool isGFX10_AEncoding(const MCSubtargetInfo &STI);
bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
+bool isGFX90A(const MCSubtargetInfo &STI);
+bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -746,12 +789,17 @@
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
return 4;
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return 8;
case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -847,6 +895,11 @@
const GCNSubtarget *Subtarget,
Align Alignment = Align(4));
+LLVM_READNONE
+inline bool isLegal64BitDPPControl(unsigned DC) {
+ return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
+}
+
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
new file mode 100644
index 0000000..da8fcf3
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -0,0 +1,355 @@
+//===- AMDGPULDSUtils.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU LDS related helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULDSUtils.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/ReplaceConstant.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// A helper class for collecting all reachable callees for each kernel defined
+// within the module.
+class CollectReachableCallees {
+ Module &M;
+ CallGraph CG;
+ SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;
+
+ // Collect all address taken functions within the module.
+ void collectAddressTakenFunctions() {
+ auto *ECNode = CG.getExternalCallingNode();
+
+ for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) {
+ auto *CGN = GI->second;
+ auto *F = CGN->getFunction();
+ if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F))
+ continue;
+ AddressTakenFunctions.insert(CGN);
+ }
+ }
+
+ // For the given kernel, collect all its reachable non-kernel functions.
+ SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
+ SmallPtrSet<Function *, 8> ReachableCallees;
+
+ // Call graph node which represents this kernel.
+ auto *KCGN = CG[K];
+
+ // Go through all call graph nodes reachable from the node representing this
+ // kernel and visit all their call sites. If a call site is direct, add the
+ // corresponding callee to the reachable callee set. If it is indirect,
+ // resolve it to its potential reachable callees, add them to the reachable
+ // callee set, and repeat the process for the newly added potential callee
+ // nodes.
+ //
+ // FIXME: Need to handle bit-casted function pointers.
+ //
+ SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN));
+ SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
+ while (!CGNStack.empty()) {
+ auto *CGN = CGNStack.pop_back_val();
+
+ if (!VisitedCGNodes.insert(CGN).second)
+ continue;
+
+ for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
+ auto *RCB = cast<CallBase>(GI->first.getValue());
+ auto *RCGN = GI->second;
+
+ if (auto *DCallee = RCGN->getFunction()) {
+ ReachableCallees.insert(DCallee);
+ } else if (RCB->isIndirectCall()) {
+ auto *RCBFTy = RCB->getFunctionType();
+ for (auto *ACGN : AddressTakenFunctions) {
+ auto *ACallee = ACGN->getFunction();
+ if (ACallee->getFunctionType() == RCBFTy) {
+ ReachableCallees.insert(ACallee);
+ CGNStack.append(df_begin(ACGN), df_end(ACGN));
+ }
+ }
+ }
+ }
+ }
+
+ return ReachableCallees;
+ }
+
+public:
+ explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) {
+ // Collect address taken functions.
+ collectAddressTakenFunctions();
+ }
+
+ void collectReachableCallees(
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
+ // Collect reachable callee set for each kernel defined in the module.
+ for (Function &F : M.functions()) {
+ if (!AMDGPU::isKernelCC(&F))
+ continue;
+ Function *K = &F;
+ KernelToCallees[K] = collectReachableCallees(K);
+ }
+ }
+};
+
+void collectReachableCallees(
+ Module &M,
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
+ CollectReachableCallees CRC{M};
+ CRC.collectReachableCallees(KernelToCallees);
+}
+
+SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
+ SmallPtrSet<Function *, 8> LDSAccessors;
+ SmallVector<User *, 8> UserStack(GV->users());
+ SmallPtrSet<User *, 8> VisitedUsers;
+
+ while (!UserStack.empty()) {
+ auto *U = UserStack.pop_back_val();
+
+ // `U` is already visited? continue to next one.
+ if (!VisitedUsers.insert(U).second)
+ continue;
+
+ // `U` is a global variable which is initialized with LDS. Ignore LDS.
+ if (isa<GlobalValue>(U))
+ return SmallPtrSet<Function *, 8>();
+
+ // Recursively explore constant users.
+ if (isa<Constant>(U)) {
+ append_range(UserStack, U->users());
+ continue;
+ }
+
+ // `U` should be an instruction; if it belongs to a non-kernel function F,
+ // then collect F.
+ Function *F = cast<Instruction>(U)->getFunction();
+ if (!AMDGPU::isKernelCC(F))
+ LDSAccessors.insert(F);
+ }
+
+ return LDSAccessors;
+}
+
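
The helpers in this file share one worklist pattern: walk a global's users, look through constants, and act on the enclosing function of each instruction user. A stripped-down sketch of just that pattern (illustrative only, not part of the patch):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"

// Returns every function containing an instruction that uses GV, whether the
// use is direct or goes through a constant expression.
static llvm::SmallPtrSet<llvm::Function *, 8>
functionsUsingGlobal(llvm::GlobalVariable &GV) {
  llvm::SmallPtrSet<llvm::Function *, 8> Funcs;
  llvm::SmallVector<llvm::User *, 16> Stack(GV.users());
  llvm::SmallPtrSet<llvm::User *, 16> Visited;
  while (!Stack.empty()) {
    llvm::User *U = Stack.pop_back_val();
    if (!Visited.insert(U).second)
      continue;
    if (auto *I = llvm::dyn_cast<llvm::Instruction>(U)) {
      Funcs.insert(I->getFunction());
      continue;
    }
    // Constant users (e.g. constant expressions) are walked transitively.
    Stack.append(U->user_begin(), U->user_end());
  }
  return Funcs;
}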
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
+getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
+ DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts;
+ SmallVector<User *, 8> UserStack;
+ SmallPtrSet<User *, 8> VisitedUsers;
+
+ UserStack.push_back(U);
+
+ while (!UserStack.empty()) {
+ auto *UU = UserStack.pop_back_val();
+
+ if (!VisitedUsers.insert(UU).second)
+ continue;
+
+ if (isa<GlobalValue>(UU))
+ continue;
+
+ if (isa<Constant>(UU)) {
+ append_range(UserStack, UU->users());
+ continue;
+ }
+
+ auto *I = cast<Instruction>(UU);
+ Function *F = I->getFunction();
+ if (CollectKernelInsts) {
+ if (!AMDGPU::isKernelCC(F)) {
+ continue;
+ }
+ } else {
+ if (AMDGPU::isKernelCC(F)) {
+ continue;
+ }
+ }
+
+ FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>()));
+ FunctionToInsts[F].insert(I);
+ }
+
+ return FunctionToInsts;
+}
+
+bool isKernelCC(const Function *Func) {
+ return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
+}
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+ return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+ GV->getValueType());
+}
+
+static void collectFunctionUses(User *U, const Function *F,
+ SetVector<Instruction *> &InstUsers) {
+ SmallVector<User *> Stack{U};
+
+ while (!Stack.empty()) {
+ U = Stack.pop_back_val();
+
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ if (I->getFunction() == F)
+ InstUsers.insert(I);
+ continue;
+ }
+
+ if (!isa<ConstantExpr>(U))
+ continue;
+
+ append_range(Stack, U->users());
+ }
+}
+
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
+ SetVector<Instruction *> InstUsers;
+
+ collectFunctionUses(C, F, InstUsers);
+ for (Instruction *I : InstUsers) {
+ convertConstantExprsToInstructions(I, C);
+ }
+}
+
+bool hasUserInstruction(const GlobalValue *GV) {
+ SmallPtrSet<const User *, 8> Visited;
+ SmallVector<const User *, 16> Stack(GV->users());
+
+ while (!Stack.empty()) {
+ const User *U = Stack.pop_back_val();
+
+ if (!Visited.insert(U).second)
+ continue;
+
+ if (isa<Instruction>(U))
+ return true;
+
+ append_range(Stack, U->users());
+ }
+
+ return false;
+}
+
+bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
+ // We are not interested in kernel LDS lowering for module LDS itself.
+ if (F && GV.getName() == "llvm.amdgcn.module.lds")
+ return false;
+
+ bool Ret = false;
+ SmallPtrSet<const User *, 8> Visited;
+ SmallVector<const User *, 16> Stack(GV.users());
+ SmallPtrSet<const GlobalValue *, 8> GlobalUsers;
+
+ assert(!F || isKernelCC(F));
+
+ while (!Stack.empty()) {
+ const User *V = Stack.pop_back_val();
+ Visited.insert(V);
+
+ if (auto *G = dyn_cast<GlobalValue>(V)) {
+ StringRef GName = G->getName();
+ if (F && GName != "llvm.used" && GName != "llvm.compiler.used") {
+ // For kernel LDS lowering, if G is not the llvm.used or llvm.compiler.used
+ // list, then we cannot lower the LDS GV, since we cannot replace the use of
+ // GV within G.
+ return false;
+ }
+ GlobalUsers.insert(G);
+ continue;
+ }
+
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ const Function *UF = I->getFunction();
+ if (UF == F) {
+ // Used from this kernel, we want to put it into the structure.
+ Ret = true;
+ } else if (!F) {
+ // For module LDS lowering, lowering is required if the user instruction
+ // is from a non-kernel function.
+ Ret |= !isKernelCC(UF);
+ }
+ continue;
+ }
+
+ // User V should be a constant, recursively visit users of V.
+ assert(isa<Constant>(V) && "Expected a constant.");
+ append_range(Stack, V->users());
+ }
+
+ if (!F && !Ret) {
+ // For module LDS lowering, we have not yet decided if we should lower GV
+ // or not. Explore all global users of GV and check if at least one of these
+ // global users appears as a use within an instruction (possibly a nested
+ // use via a constant expression); if so, conservatively lower LDS.
+ for (auto *G : GlobalUsers)
+ Ret |= hasUserInstruction(G);
+ }
+
+ return Ret;
+}
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+ const Function *F) {
+ std::vector<llvm::GlobalVariable *> LocalVars;
+ for (auto &GV : M.globals()) {
+ if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+ continue;
+ }
+ if (!GV.hasInitializer()) {
+ // addrspace(3) without an initializer implies CUDA/HIP extern __shared__.
+ // The semantics for such a variable appear to be that all extern
+ // __shared__ variables alias one another, in which case this transform
+ // is not required.
+ continue;
+ }
+ if (!isa<UndefValue>(GV.getInitializer())) {
+ // Initializers are unimplemented for local address space.
+ // Leave such variables in place for consistent error reporting.
+ continue;
+ }
+ if (GV.isConstant()) {
+ // A constant undef variable can't be written to, and any load is
+ // undef, so it should be eliminated by the optimizer. It could be
+ // dropped by the back end if not. This pass skips over it.
+ continue;
+ }
+ if (!shouldLowerLDSToStruct(GV, F)) {
+ continue;
+ }
+ LocalVars.push_back(&GV);
+ }
+ return LocalVars;
+}
+
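
A hedged sketch of how the two lowering modes above might be queried (queryLDSLoweringCandidates, M, and KernelFn are made-up names; only findVariablesToLower comes from this file):

#include "AMDGPULDSUtils.h"
#include "llvm/IR/Module.h"
#include <vector>

static void queryLDSLoweringCandidates(llvm::Module &M,
                                       llvm::Function *KernelFn) {
  // Module-LDS lowering: F == nullptr, selects globals whose users reach
  // non-kernel functions (possibly through constant expressions).
  std::vector<llvm::GlobalVariable *> ModuleLDS =
      llvm::AMDGPU::findVariablesToLower(M);
  // Kernel-LDS lowering: KernelFn must be a kernel; selects globals used
  // directly from that kernel.
  std::vector<llvm::GlobalVariable *> KernelLDS =
      llvm::AMDGPU::findVariablesToLower(M, KernelFn);
  (void)ModuleLDS;
  (void)KernelLDS;
}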
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
+ SmallPtrSet<GlobalValue *, 32> UsedList;
+
+ SmallVector<GlobalValue *, 32> TmpVec;
+ collectUsedGlobalVariables(M, TmpVec, true);
+ UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+ TmpVec.clear();
+ collectUsedGlobalVariables(M, TmpVec, false);
+ UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+ return UsedList;
+}
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
new file mode 100644
index 0000000..ffcafb9
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -0,0 +1,70 @@
+//===- AMDGPULDSUtils.h - LDS related helper functions -*- C++ -*----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU LDS related helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Constants.h"
+
+namespace llvm {
+
+class ConstantExpr;
+
+namespace AMDGPU {
+
+/// Collect reachable callees for each kernel defined in the module \p M and
+/// return the collected callees in \p KernelToCallees.
+void collectReachableCallees(
+ Module &M,
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees);
+
+/// For the given LDS global \p GV, visit all its users, collect all
+/// non-kernel functions within which \p GV is used, and return the collected
+/// list of such non-kernel functions.
+SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV);
+
+/// Collect all the instructions to which user \p U belongs. \p U could be an
+/// instruction itself or a constant expression which is used within an
+/// instruction. If \p CollectKernelInsts is true, collect instructions only
+/// from kernels; otherwise collect instructions only from non-kernel functions.
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
+getFunctionToInstsMap(User *U, bool CollectKernelInsts);
+
+bool isKernelCC(const Function *Func);
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
+
+/// \returns true if a given global variable \p GV (or its global users)
+/// appears as a use within some instruction (either from a kernel or from a
+/// non-kernel function).
+bool hasUserInstruction(const GlobalValue *GV);
+
+/// \returns true if an LDS global requires lowering to a module LDS structure
+/// if \p F is not given. If \p F is given, it must be a kernel, and this
+/// function \returns true if the LDS global is directly used from that kernel
+/// and it is safe to replace its uses with a kernel LDS structure member.
+bool shouldLowerLDSToStruct(const GlobalVariable &GV,
+ const Function *F = nullptr);
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+ const Function *F = nullptr);
+
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
+
+/// Replace all uses of constant \p C with instructions in \p F.
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index b7dd757..f6b5975 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -41,7 +41,7 @@
}
return;
}
- BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ BlobType = ELF::NT_AMD_PAL_METADATA;
NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
if (!NamedMD || !NamedMD->getNumOperands()) {
// Emit msgpack metadata by default
@@ -69,7 +69,7 @@
// Metadata.
bool AMDGPUPALMetadata::setFromBlob(unsigned Type, StringRef Blob) {
BlobType = Type;
- if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+ if (Type == ELF::NT_AMD_PAL_METADATA)
return setFromLegacyBlob(Blob);
return setFromMsgPackBlob(Blob);
}
@@ -243,6 +243,27 @@
Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
}
+// Set the amount of LDS used in bytes in the metadata.
+void AMDGPUPALMetadata::setFunctionLdsSize(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".lds_size"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the number of used vgprs in the metadata.
+void AMDGPUPALMetadata::setFunctionNumUsedVgprs(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".vgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the number of used sgprs in the metadata.
+void AMDGPUPALMetadata::setFunctionNumUsedSgprs(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".sgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void AMDGPUPALMetadata::setWave32(unsigned CC) {
@@ -592,6 +613,41 @@
{0xa2c1, "VGT_STRMOUT_VTX_STRIDE_3"},
{0xa316, "VGT_VERTEX_REUSE_BLOCK_CNTL"},
+ {0x2e28, "COMPUTE_PGM_RSRC3"},
+ {0x2e2a, "COMPUTE_SHADER_CHKSUM"},
+ {0x2e24, "COMPUTE_USER_ACCUM_0"},
+ {0x2e25, "COMPUTE_USER_ACCUM_1"},
+ {0x2e26, "COMPUTE_USER_ACCUM_2"},
+ {0x2e27, "COMPUTE_USER_ACCUM_3"},
+ {0xa1ff, "GE_MAX_OUTPUT_PER_SUBGROUP"},
+ {0xa2d3, "GE_NGG_SUBGRP_CNTL"},
+ {0xc25f, "GE_STEREO_CNTL"},
+ {0xc262, "GE_USER_VGPR_EN"},
+ {0xc258, "IA_MULTI_VGT_PARAM_PIPED"},
+ {0xa210, "PA_STEREO_CNTL"},
+ {0xa1c2, "SPI_SHADER_IDX_FORMAT"},
+ {0x2c80, "SPI_SHADER_PGM_CHKSUM_GS"},
+ {0x2d00, "SPI_SHADER_PGM_CHKSUM_HS"},
+ {0x2c06, "SPI_SHADER_PGM_CHKSUM_PS"},
+ {0x2c45, "SPI_SHADER_PGM_CHKSUM_VS"},
+ {0x2c88, "SPI_SHADER_PGM_LO_GS"},
+ {0x2cb2, "SPI_SHADER_USER_ACCUM_ESGS_0"},
+ {0x2cb3, "SPI_SHADER_USER_ACCUM_ESGS_1"},
+ {0x2cb4, "SPI_SHADER_USER_ACCUM_ESGS_2"},
+ {0x2cb5, "SPI_SHADER_USER_ACCUM_ESGS_3"},
+ {0x2d32, "SPI_SHADER_USER_ACCUM_LSHS_0"},
+ {0x2d33, "SPI_SHADER_USER_ACCUM_LSHS_1"},
+ {0x2d34, "SPI_SHADER_USER_ACCUM_LSHS_2"},
+ {0x2d35, "SPI_SHADER_USER_ACCUM_LSHS_3"},
+ {0x2c32, "SPI_SHADER_USER_ACCUM_PS_0"},
+ {0x2c33, "SPI_SHADER_USER_ACCUM_PS_1"},
+ {0x2c34, "SPI_SHADER_USER_ACCUM_PS_2"},
+ {0x2c35, "SPI_SHADER_USER_ACCUM_PS_3"},
+ {0x2c72, "SPI_SHADER_USER_ACCUM_VS_0"},
+ {0x2c73, "SPI_SHADER_USER_ACCUM_VS_1"},
+ {0x2c74, "SPI_SHADER_USER_ACCUM_VS_2"},
+ {0x2c75, "SPI_SHADER_USER_ACCUM_VS_3"},
+
{0, nullptr}};
auto Entry = RegInfoTable;
for (; Entry->Num && Entry->Num != RegNum; ++Entry)
@@ -653,7 +709,7 @@
// a .note record of the specified AMD type. Returns an empty blob if
// there is no PAL metadata,
void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) {
- if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+ if (Type == ELF::NT_AMD_PAL_METADATA)
toLegacyBlob(Blob);
else if (Type)
toMsgPackBlob(Blob);
@@ -790,7 +846,7 @@
}
// Get .note record type of metadata blob to be emitted:
-// ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+// ELF::NT_AMD_PAL_METADATA (legacy key=val format), or
// ELF::NT_AMDGPU_METADATA (MsgPack format), or
// 0 (no PAL metadata).
unsigned AMDGPUPALMetadata::getType() const {
@@ -799,12 +855,12 @@
// Return whether the blob type is legacy PAL metadata.
bool AMDGPUPALMetadata::isLegacy() const {
- return BlobType == ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ return BlobType == ELF::NT_AMD_PAL_METADATA;
}
// Set legacy PAL metadata format.
void AMDGPUPALMetadata::setLegacy() {
- BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ BlobType = ELF::NT_AMD_PAL_METADATA;
}
// Erase all PAL metadata.
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 8fa1f73..7fdd9a8 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -80,6 +80,21 @@
// Set the stack frame size of a function in the metadata.
void setFunctionScratchSize(const MachineFunction &MF, unsigned Val);
+ // Set the amount of LDS used in bytes in the metadata. This is an optional
+ // advisory record for logging etc; wave dispatch actually uses the rsrc1
+ // register for the shader stage to determine the amount of LDS to allocate.
+ void setFunctionLdsSize(const MachineFunction &MF, unsigned Val);
+
+ // Set the number of used vgprs in the metadata. This is an optional advisory
+ // record for logging etc; wave dispatch actually uses the rsrc1 register for
+ // the shader stage to determine the number of vgprs to allocate.
+ void setFunctionNumUsedVgprs(const MachineFunction &MF, unsigned Val);
+
+ // Set the number of used sgprs in the metadata. This is an optional advisory
+ // record for logging etc; wave dispatch actually uses the rsrc1 register for
+ // the shader stage to determine the number of sgprs to allocate.
+ void setFunctionNumUsedSgprs(const MachineFunction &MF, unsigned Val);
+
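
A hedged sketch of how a backend caller might fill in these advisory records at emission time (recordShaderFunctionStats and its count parameters are made-up; only the three setters come from this header):

#include "AMDGPUPALMetadata.h"

static void recordShaderFunctionStats(llvm::AMDGPUPALMetadata &MD,
                                      const llvm::MachineFunction &MF,
                                      unsigned LdsBytes, unsigned NumVgprs,
                                      unsigned NumSgprs) {
  // Advisory only; wave dispatch still reads the rsrc registers.
  MD.setFunctionLdsSize(MF, LdsBytes);
  MD.setFunctionNumUsedVgprs(MF, NumVgprs);
  MD.setFunctionNumUsedSgprs(MF, NumSgprs);
}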
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void setWave32(unsigned CC);
@@ -95,7 +110,7 @@
const char *getVendor() const;
// Get .note record type of metadata blob to be emitted:
- // ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+ // ELF::NT_AMD_PAL_METADATA (legacy key=val format), or
// ELF::NT_AMDGPU_METADATA (MsgPack format), or
// 0 (no PAL metadata).
unsigned getType() const;
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index fdb6e2e..a9f9d0e 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -1,10 +1,12 @@
add_llvm_component_library(LLVMAMDGPUUtils
- AMDGPUBaseInfo.cpp
- AMDKernelCodeTUtils.cpp
AMDGPUAsmUtils.cpp
+ AMDGPUBaseInfo.cpp
+ AMDGPULDSUtils.cpp
AMDGPUPALMetadata.cpp
+ AMDKernelCodeTUtils.cpp
LINK_COMPONENTS
+ Analysis
Core
MC
BinaryFormat
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index f1e4700..35d5fe1 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -60,9 +60,12 @@
}
class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
+ VOP_Real <ps>,
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+ let VALU = 1;
+ let VOP1 = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -79,6 +82,10 @@
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let TRANS = ps.TRANS;
}
class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -144,6 +151,15 @@
def VOP1_F32_I32 : VOPProfileI2F <f32, i32>;
def VOP1_F16_I16 : VOPProfileI2F <f16, i16>;
+class VOP_SPECIAL_OMOD_PROF<ValueType dstVt, ValueType srcVt> :
+ VOPProfile<[dstVt, srcVt, untyped, untyped]> {
+
+ let HasOMod = 1;
+}
+def VOP_I32_F32_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f32>;
+def VOP_I32_F64_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f64>;
+def VOP_I16_F16_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i16, f16>;
+
//===----------------------------------------------------------------------===//
// VOP1 Instructions
//===----------------------------------------------------------------------===//
@@ -187,8 +203,10 @@
let Inst{31-25} = 0x3f; //encoding
}
+let isReMaterializable = 1 in {
let SchedRW = [WriteDoubleCvt] in {
-defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
+// OMod clears exceptions when set in this instruction
+defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_sint>;
let mayRaiseFPException = 0 in {
defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
@@ -196,7 +214,8 @@
defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
-defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
+// OMod clears exceptions when set in this instruction
+defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_uint>;
let mayRaiseFPException = 0 in {
defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
@@ -213,11 +232,12 @@
defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
}
-defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
-defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
-let FPDPRounding = 1 in {
+// OMod clears exceptions when set in these two instructions
+defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32_SPECIAL_OMOD, fp_to_uint>;
+defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32_SPECIAL_OMOD, fp_to_sint>;
+let FPDPRounding = 1, isReMaterializable = 0 in {
defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
-} // End FPDPRounding = 1
+} // End FPDPRounding = 1, isReMaterializable = 0
defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
@@ -268,7 +288,7 @@
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>;
let SchedRW = [WriteDoubleAdd] in {
-defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
+defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64_SPECIAL_OMOD, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
let FPDPRounding = 1 in {
defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
@@ -277,6 +297,7 @@
defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>;
+} // End isReMaterializable = 1
let VOPAsmPrefer32Bit = 1 in {
defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
@@ -337,6 +358,7 @@
defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_MOVRELSD>;
} // End Uses = [M0, EXEC]
+let isReMaterializable = 1 in {
let SubtargetPredicate = isGFX6GFX7 in {
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_LOG_CLAMP_F32 :
@@ -351,12 +373,12 @@
VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, int_amdgcn_rsq_legacy>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
- let SchedRW = [WriteDouble] in {
+ let SchedRW = [WriteTrans64] in {
defm V_RCP_CLAMP_F64 :
VOP1Inst<"v_rcp_clamp_f64", VOP_F64_F64>;
defm V_RSQ_CLAMP_F64 :
VOP1Inst<"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
- } // End SchedRW = [WriteDouble]
+ } // End SchedRW = [WriteTrans64]
} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = isGFX7GFX8GFX9 in {
@@ -374,6 +396,7 @@
defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>;
} // End SchedRW = [WriteDoubleAdd]
} // End SubtargetPredicate = isGFX7Plus
+} // End isReMaterializable = 1
let SubtargetPredicate = Has16BitInsts in {
@@ -381,8 +404,9 @@
defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
} // End FPDPRounding = 1
-defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
-defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
+// OMod clears exceptions when set in these two instructions
+defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16_SPECIAL_OMOD, fp_to_uint>;
+defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16_SPECIAL_OMOD, fp_to_sint>;
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
@@ -393,7 +417,7 @@
defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
-defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
+defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16_SPECIAL_OMOD, int_amdgcn_frexp_exp>;
defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
@@ -434,11 +458,12 @@
let SchedRW = [Write64Bit, Write64Bit];
}
+ let isReMaterializable = 1 in
defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>;
let mayRaiseFPException = 0 in {
- defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
- defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+ defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16_SPECIAL_OMOD>;
+ defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16_SPECIAL_OMOD>;
} // End mayRaiseFPException = 0
} // End SubtargetPredicate = isGFX9Plus
@@ -461,6 +486,18 @@
} // End Uses = [M0]
} // End SubtargetPredicate = isGFX10Plus
+def VOPProfileAccMov : VOP_NO_EXT<VOP_I32_I32> {
+ let DstRC = RegisterOperand<AGPR_32>;
+ let Src0RC32 = RegisterOperand<AGPR_32>;
+ let Asm32 = " $vdst, $src0";
+}
+
+def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1> {
+ let SubtargetPredicate = isGFX90APlus;
+ let isReMaterializable = 1;
+ let isAsCheapAsAMove = 1;
+}
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
@@ -471,6 +508,7 @@
let Defs = ps.Defs;
let SchedRW = ps.SchedRW;
let Uses = ps.Uses;
+ let TRANS = ps.TRANS;
bits<8> vdst;
let Inst{8-0} = 0xfa;
@@ -498,9 +536,6 @@
let Inst{16-9} = op;
let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f;
-
- let AssemblerPredicate = HasDPP8;
- let SubtargetPredicate = HasDPP8;
}
//===----------------------------------------------------------------------===//
@@ -823,6 +858,8 @@
defm V_CVT_NORM_I16_F16 : VOP1_Real_vi<0x4d>;
defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>;
+defm V_ACCVGPR_MOV_B32 : VOP1Only_Real_vi<0x52>;
+
// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
// indexing mode. vdst can't be treated as a def for codegen purposes,
// and an implicit use and def of the super register should be added.
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 7a334ea..7860b7e 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -81,9 +81,12 @@
}
class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
+ VOP_Real <ps>,
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+ let VALU = 1;
+ let VOP2 = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -101,6 +104,9 @@
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
}
class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -237,7 +243,9 @@
}
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
- Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)> {
+ let isReMaterializable = 1;
+ }
}
}
@@ -267,10 +275,9 @@
(ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm),
(ins VCSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm));
field bit HasExt = 0;
+ let IsSingle = 1;
- // Hack to stop printing _e64
- let DstRC = RegisterOperand<VGPR_32>;
- field string Asm32 = " $vdst, $src0, $src1, $imm";
+ field string Asm32 = "$vdst, $src0, $src1, $imm";
}
def VOP_MADAK_F16 : VOP_MADAK <f16>;
@@ -280,37 +287,38 @@
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
field bit HasExt = 0;
+ let IsSingle = 1;
- // Hack to stop printing _e64
- let DstRC = RegisterOperand<VGPR_32>;
- field string Asm32 = " $vdst, $src0, $imm, $src1";
+ field string Asm32 = "$vdst, $src0, $imm, $src1";
}
def VOP_MADMK_F16 : VOP_MADMK <f16>;
def VOP_MADMK_F32 : VOP_MADMK <f32>;
+class getRegisterOperandForVT<ValueType VT> {
+ RegisterOperand ret = RegisterOperand<getVregSrcForVT<VT>.ret>;
+}
+
// FIXME: Remove src2_modifiers. It isn't used, so is wasting memory
// and processing time but it makes it easier to convert to mad.
class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> {
- let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
- let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT>.ret:$src2);
+ let Ins64 = getIns64<Src0RC64, Src1RC64, getRegisterOperandForVT<Src2VT>.ret, 3,
0, HasModifiers, HasModifiers, HasOMod,
Src0Mod, Src1Mod, Src2Mod>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
- VGPR_32:$src2, // stub argument
+ getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsDPP16 = !con(InsDPP, (ins FI:$fi));
-
let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
- VGPR_32:$src2, // stub argument
+ getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
dpp8:$dpp8, FI:$fi);
-
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
- VGPR_32:$src2, // stub argument
+ getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
clampmod:$clamp, omod:$omod,
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
@@ -335,6 +343,8 @@
def VOP_MAC_F32 : VOP_MAC <f32>;
let HasExtDPP = 0 in
def VOP_MAC_LEGACY_F32 : VOP_MAC <f32>;
+let HasExtSDWA = 0, HasExt64BitDPP = 1 in
+def VOP_MAC_F64 : VOP_MAC <f64>;
class VOP_DOT_ACC<ValueType vt0, ValueType vt1> : VOP_MAC<vt0, vt1> {
let HasClamp = 0;
@@ -448,6 +458,7 @@
let HasExt = 0;
let HasExtDPP = 0;
+ let HasExt64BitDPP = 0;
let HasExtSDWA = 0;
let HasExtSDWA9 = 0;
}
@@ -464,6 +475,7 @@
let HasExt = 0;
let HasExtDPP = 0;
+ let HasExt64BitDPP = 0;
let HasExtSDWA = 0;
let HasExtSDWA9 = 0;
}
@@ -473,10 +485,11 @@
//===----------------------------------------------------------------------===//
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
-let SubtargetPredicate = HasMadMacF32Insts in
+let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in
def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
let isCommutable = 1 in {
+let isReMaterializable = 1 in {
defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, any_fadd>;
defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, any_fsub>;
defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
@@ -498,6 +511,7 @@
defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
+} // End isReMaterializable = 1
let mayRaiseFPException = 0 in {
let OtherPredicates = [HasMadMacF32Insts] in {
@@ -510,6 +524,7 @@
} // End Constraints = "$vdst = $src2", DisableEncoding="$src2",
// isConvertibleToThreeAddress = 1
+let isReMaterializable = 1 in
def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>;
} // End OtherPredicates = [HasMadMacF32Insts]
} // End mayRaiseFPException = 0
@@ -524,7 +539,7 @@
defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32", 1>;
-let SubtargetPredicate = HasAddNoCarryInsts in {
+let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in {
defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_add_u32", 1>;
defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
@@ -543,12 +558,12 @@
} // End $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
+let isReMaterializable = 1 in {
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, add_ctpop>;
defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>;
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
-defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_i16_f32>;
@@ -572,7 +587,9 @@
defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
} // End SubtargetPredicate = isGFX6GFX7
} // End isCommutable = 1
+} // End isReMaterializable = 1
+defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
GCNPat<
@@ -672,7 +689,8 @@
let SubtargetPredicate = HasDLInsts in {
-defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32>;
+let isReMaterializable = 1 in
+defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>;
let Constraints = "$vdst = $src2",
DisableEncoding = "$src2",
@@ -692,6 +710,14 @@
} // End SubtargetPredicate = HasFmaLegacy32
+let SubtargetPredicate = isGFX90APlus,
+ Constraints = "$vdst = $src2",
+ DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1,
+ isCommutable = 1,
+ SchedRW = [WriteDoubleAdd] in
+defm V_FMAC_F64 : VOP2Inst <"v_fmac_f64", VOP_MAC_F64>;
+
let Constraints = "$vdst = $src2",
DisableEncoding="$src2",
isConvertibleToThreeAddress = 1,
@@ -735,17 +761,21 @@
}
} // End AddedComplexity = 30
+let SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1 in {
+def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">;
+
+let isCommutable = 1 in
+def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">;
+}
+
let SubtargetPredicate = isGFX10Plus in {
-def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">;
-let FPDPRounding = 1 in
+let FPDPRounding = 1 in {
def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
-let isCommutable = 1 in {
-def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">;
-let FPDPRounding = 1 in
+let isCommutable = 1 in
def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">;
-} // End isCommutable = 1
+} // End FPDPRounding = 1
let Constraints = "$vdst = $src2",
DisableEncoding="$src2",
@@ -913,8 +943,6 @@
let Inst{30-25} = op;
let Inst{31} = 0x0;
- let AssemblerPredicate = HasDPP8;
- let SubtargetPredicate = HasDPP8;
let OtherPredicates = ps.OtherPredicates;
}
@@ -1122,14 +1150,18 @@
multiclass VOP3Only_Real_gfx10<bits<10> op> {
def _e64_gfx10 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
- VOP3e_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ VOP3e_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ let IsSingle = 1;
+ }
}
//===---------------------------- VOP3beOnly ----------------------------===//
multiclass VOP3beOnly_Real_gfx10<bits<10> op> {
def _e64_gfx10 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
- VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ let IsSingle = 1;
+ }
}
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
@@ -1177,7 +1209,10 @@
defm V_MAX_F16 : VOP2_Real_gfx10<0x039>;
defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>;
defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>;
+
+let IsSingle = 1 in {
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
+}
// VOP2 no carry-in, carry-out.
defm V_ADD_NC_U32 :
@@ -1251,20 +1286,20 @@
VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
}
- multiclass VOP2_Real_e32_gfx6_gfx7<bits<6> op, string PseudoName = NAME> {
+ multiclass VOP2_Real_e32_gfx6_gfx7<bits<6> op, string opName = NAME> {
def _e32_gfx6_gfx7 :
- VOP2_Real<!cast<VOP2_Pseudo>(PseudoName#"_e32"), SIEncodingFamily.SI>,
- VOP2e<op{5-0}, !cast<VOP2_Pseudo>(PseudoName#"_e32").Pfl>;
+ VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.SI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(opName#"_e32").Pfl>;
}
- multiclass VOP2_Real_e64_gfx6_gfx7<bits<6> op, string PseudoName = NAME> {
+ multiclass VOP2_Real_e64_gfx6_gfx7<bits<6> op, string opName = NAME> {
def _e64_gfx6_gfx7 :
- VOP3_Real<!cast<VOP3_Pseudo>(PseudoName#"_e64"), SIEncodingFamily.SI>,
- VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(PseudoName#"_e64").Pfl>;
+ VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(opName#"_e64").Pfl>;
}
- multiclass VOP2be_Real_e64_gfx6_gfx7<bits<6> op, string PseudoName = NAME> {
+ multiclass VOP2be_Real_e64_gfx6_gfx7<bits<6> op, string opName = NAME> {
def _e64_gfx6_gfx7 :
- VOP3_Real<!cast<VOP3_Pseudo>(PseudoName#"_e64"), SIEncodingFamily.SI>,
- VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(PseudoName#"_e64").Pfl>;
+ VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.SI>,
+ VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(opName#"_e64").Pfl>;
}
} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
@@ -1281,16 +1316,16 @@
VOP2_Real_e32_gfx6_gfx7<op>, VOP2be_Real_e64_gfx6_gfx7<op>;
multiclass VOP2be_Real_gfx6_gfx7_with_name<bits<6> op,
- string PseudoName, string asmName> {
- defvar ps32 = !cast<VOP2_Pseudo>(PseudoName#"_e32");
- defvar ps64 = !cast<VOP3_Pseudo>(PseudoName#"_e64");
+ string opName, string asmName> {
+ defvar ps32 = !cast<VOP2_Pseudo>(opName#"_e32");
+ defvar ps64 = !cast<VOP3_Pseudo>(opName#"_e64");
let AsmString = asmName # ps32.AsmOperands in {
- defm "" : VOP2_Real_e32_gfx6_gfx7<op, PseudoName>;
+ defm "" : VOP2_Real_e32_gfx6_gfx7<op, opName>;
}
let AsmString = asmName # ps64.AsmOperands in {
- defm "" : VOP2be_Real_e64_gfx6_gfx7<op, PseudoName>;
+ defm "" : VOP2be_Real_e64_gfx6_gfx7<op, opName>;
}
}
@@ -1391,10 +1426,7 @@
def _e64_vi :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
- // Hack to stop printing _e64
- VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME#"_e64");
- let OutOperandList = (outs VGPR_32:$vdst);
- let AsmString = ps.Mnemonic # " " # ps.AsmOperands;
+ let IsSingle = 1;
}
}
@@ -1525,6 +1557,7 @@
defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>;
defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>;
defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>;
+let AssemblerPredicate = isGCN3ExcludingGFX90A in
defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_vi <0x4>;
defm V_MUL_F32 : VOP2_Real_e32e64_vi <0x5>;
defm V_MUL_I32_I24 : VOP2_Real_e32e64_vi <0x6>;
@@ -1641,6 +1674,42 @@
} // End SubtargetPredicate = HasDLInsts
+let AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" in {
+ multiclass VOP2_Real_e32_gfx90a <bits<6> op> {
+ def _e32_gfx90a :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX90A>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+ }
+
+ multiclass VOP2_Real_e64_gfx90a <bits<10> op> {
+ def _e64_gfx90a :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX90A>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+
+ multiclass Base_VOP2_Real_e32e64_gfx90a <bits<6> op> :
+ VOP2_Real_e32_gfx90a<op>,
+ VOP2_Real_e64_gfx90a<{0, 1, 0, 0, op{5-0}}>;
+
+ multiclass VOP2_Real_e32e64_gfx90a <bits<6> op> :
+ Base_VOP2_Real_e32e64_gfx90a<op> {
+
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx90a :
+ VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX90A>,
+ VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
+ let DecoderNamespace = "SDWA9";
+ }
+ }
+} // End AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A"
+
+let SubtargetPredicate = isGFX90APlus in {
+ defm V_FMAC_F64 : VOP2_Real_e32e64_gfx90a <0x4>;
+ let IsSingle = 1 in {
+ defm V_MUL_LEGACY_F32 : VOP2_Real_e64_gfx90a <0x2a1>;
+ }
+} // End SubtargetPredicate = isGFX90APlus
+
multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> {
def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 42dc995..ee3b87f 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -184,47 +184,24 @@
let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers));
-
- // FIXME: Hack to stop printing _e64
- let Outs64 = (outs DstRC.RegClass:$vdst);
- let Asm64 =
- " " # !if(Features.HasOpSel,
- getAsmVOP3OpSel<NumSrcArgs,
- HasIntClamp,
- P.HasOMod,
- HasSrc0FloatMods,
- HasSrc1FloatMods,
- HasSrc2FloatMods>.ret,
- !if(Features.HasClamp,
- getAsm64<HasDst, NumSrcArgs, HasIntClamp,
- HasModifiers, HasOMod, DstVT>.ret,
- P.Asm64));
- let NeedPatGen = P.NeedPatGen;
+ let IsSingle = 1;
}
class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
- let Asm64 = " $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod";
+ let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod";
+ let IsSingle = 1;
}
-def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> {
- // FIXME: Hack to stop printing _e64
- let DstRC = RegisterOperand<VGPR_32>;
-}
-
-def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
- // FIXME: Hack to stop printing _e64
- let DstRC = RegisterOperand<VReg_64>;
-}
+def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>;
+def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>;
def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
let HasClamp = 1;
-
- // FIXME: Hack to stop printing _e64
- let DstRC = RegisterOperand<VReg_64>;
+ let IsSingle = 1;
let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
- let Asm64 = " $vdst, $sdst, $src0, $src1, $src2$clamp";
+ let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
}
//===----------------------------------------------------------------------===//
@@ -287,7 +264,7 @@
let HasOMod = !ne(DstVT.Value, f16.Value);
let HasHigh = 1;
- let Outs64 = (outs VGPR_32:$vdst);
+ let Outs64 = (outs DstRC.RegClass:$vdst);
let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod>.ret;
let Asm64 = getInterp16Asm<HasSrc2, HasOMod>.ret;
}
@@ -298,6 +275,7 @@
let isCommutable = 1 in {
+let isReMaterializable = 1 in {
let mayRaiseFPException = 0 in {
let SubtargetPredicate = HasMadMacF32Insts in {
defm V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
@@ -325,12 +303,13 @@
defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
} // End SchedRW = [WriteDoubleAdd]
-let SchedRW = [WriteQuarterRate32] in {
+let SchedRW = [WriteIntMul] in {
defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
-} // End SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteIntMul]
+} // End isReMaterializable = 1
let Uses = [MODE, VCC, EXEC] in {
// v_div_fmas_f32:
@@ -351,6 +330,7 @@
} // End isCommutable = 1
+let isReMaterializable = 1 in {
let mayRaiseFPException = 0 in {
defm V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
defm V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
@@ -364,22 +344,27 @@
defm V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>;
defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
-let mayRaiseFPException = 0 in { // XXX - Seems suspect but manual doesn't say it does
-defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
-defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
-defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
-defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
-defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
-defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
-defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
-defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
-defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+// XXX - Not raising FP exceptions seems suspect, but the manual doesn't say these do.
+let mayRaiseFPException = 0 in {
+ let isCommutable = 1 in {
+ defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
+ defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
+ defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
+ defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
+ defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
+ defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+ } // End isCommutable = 1
+ defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
+ defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
+ defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
} // End mayRaiseFPException = 0
-defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+let isCommutable = 1 in {
+ defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+ defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+ defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+ defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+} // End isCommutable = 1
defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
@@ -388,6 +373,7 @@
defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
+} // End isReMaterializable = 1
let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does.
@@ -399,6 +385,7 @@
defm V_DIV_SCALE_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1>;
} // End mayRaiseFPException = 0
+let isReMaterializable = 1 in
defm V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
let Constraints = "@earlyclobber $vdst" in {
@@ -406,6 +393,7 @@
} // End Constraints = "@earlyclobber $vdst"
+let isReMaterializable = 1 in {
let SchedRW = [WriteDouble] in {
defm V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, int_amdgcn_trig_preop>;
} // End SchedRW = [WriteDouble]
@@ -423,12 +411,14 @@
defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, ashr_rev>;
} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write64Bit]
+} // End isReMaterializable = 1
def : GCNPat<
(i32 (getDivergentFrag<sext>.ret i16:$src)),
(i32 (V_BFE_I32_e64 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
>;
+let isReMaterializable = 1 in {
let SubtargetPredicate = isGFX6GFX7GFX10 in {
defm V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
} // End SubtargetPredicate = isGFX6GFX7GFX10
@@ -438,6 +428,7 @@
defm V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write32Bit]
+} // End isReMaterializable = 1
let SubtargetPredicate = isGFX7Plus in {
@@ -447,10 +438,10 @@
} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
let isCommutable = 1 in {
-let SchedRW = [WriteQuarterRate32, WriteSALU] in {
+let SchedRW = [WriteIntMul, WriteSALU] in {
defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
-} // End SchedRW = [WriteQuarterRate32, WriteSALU]
+} // End SchedRW = [WriteIntMul, WriteSALU]
} // End isCommutable = 1
} // End SubtargetPredicate = isGFX7Plus
@@ -476,6 +467,7 @@
let FPDPRounding = 1 in {
defm V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
let Uses = [MODE, M0, EXEC] in {
+ let OtherPredicates = [isNotGFX90APlus] in
// For some reason the intrinsic operands are in a different order
// from the instruction operands.
def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
@@ -497,24 +489,24 @@
let SubtargetPredicate = isGFX9Plus in {
defm V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
defm V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
+let OtherPredicates = [isNotGFX90APlus] in
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
} // End SubtargetPredicate = isGFX9Plus
-let Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
+// This predicate should only apply to the selection pattern. The
+// instruction still exists and should decode on subtargets with
+// other bank counts.
+let OtherPredicates = [isNotGFX90APlus, has32BankLDS], Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
[(set f32:$vdst, (int_amdgcn_interp_p1_f16 (VOP3Mods f32:$src0, i32:$src0_modifiers),
(i32 timm:$attrchan),
(i32 timm:$attr),
- (i1 timm:$high), M0))]> {
- // This predicate should only apply to the selection pattern. The
- // instruction still exists and should decode on subtargets with
- // other bank counts.
- let OtherPredicates = [has32BankLDS];
-}
+ (i1 timm:$high), M0))]>;
+} // End OtherPredicates = [isNotGFX90APlus, has32BankLDS], Uses = [MODE, M0, EXEC], FPDPRounding = 1
-
+let OtherPredicates = [isNotGFX90APlus], Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
-} // End Uses = [MODE, M0, EXEC], FPDPRounding = 1
+} // End OtherPredicates = [isNotGFX90APlus], Uses = [MODE, M0, EXEC], FPDPRounding = 1
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
@@ -527,11 +519,11 @@
), VGPR_32)), sub1)
>;
-let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] in {
+let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
-} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC]
+} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus]
let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
@@ -618,16 +610,16 @@
}
let SubtargetPredicate = isGFX9Plus in {
-defm V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
-defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-
-defm V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-
+let isCommutable = 1, isReMaterializable = 1 in {
+ defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_ADD_I32 : VOP3Inst <"v_add_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
+ defm V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+} // End isCommutable = 1, isReMaterializable = 1
+// TODO: src0 contains the opsel bit for dst, so if we commute, we need to mask
+// and swap this to the new src0.
defm V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>;
defm V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>;
defm V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>;
@@ -649,8 +641,13 @@
defm V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
defm V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
-defm V_ADD_I32 : VOP3Inst <"v_add_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
+defm V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
+
+let isReMaterializable = 1 in {
defm V_SUB_I32 : VOP3Inst <"v_sub_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
+defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+} // End isReMaterializable = 1
class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
@@ -729,7 +726,9 @@
let SubtargetPredicate = isGFX10Plus in {
- defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ let isCommutable = 1, isReMaterializable = 1 in {
+ defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ } // End isCommutable = 1, isReMaterializable = 1
def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
@@ -833,6 +832,7 @@
VOP3e_gfx10<op, !cast<VOP3_Pseudo>(opName#"_e64").Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName#"_e64");
let AsmString = asmName # ps.AsmOperands;
+ let IsSingle = 1;
}
}
multiclass VOP3be_Real_gfx10<bits<10> op> {
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 64e70b8..48f5eb1 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -10,71 +10,82 @@
// VOP3P Classes
//===----------------------------------------------------------------------===//
-class VOP3PInst<string OpName, VOPProfile P,
- SDPatternOperator node = null_frag,
- bit HasExplicitClamp = 0> :
- VOP3P_Pseudo<OpName, P,
- !if(P.HasModifiers, getVOP3PModPat<P, node, HasExplicitClamp>.ret, getVOP3Pat<P, node>.ret)
->;
+// Used for FMA_MIX* and MAD_MIX* insts
+// Their operands are only sort of f16 operands. Depending on
+// op_sel_hi, these may be interpreted as f32. The inline immediate
+// values are really f16 converted to f32, so we treat these as f16
+// operands.
+class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
+ bit useTiedOutput = 0> : VOP3_Profile<P, Features> {
+ bit UseTiedOutput = useTiedOutput;
+
+ dag srcs =
+ (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
+ FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
+ FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+
+ // FIXME: clampmod0 misbehaves with the non-default vdst_in
+ // following it. For now, work around this by requiring clamp
+ // in tied patterns. This should use undef_tied_input, but it
+ // seems underdeveloped and doesn't apply the right register
+ // class constraints.
+ dag mods = !con(!if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in),
+ (ins clampmod0:$clamp)),
+ (ins op_sel0:$op_sel, op_sel_hi0:$op_sel_hi));
+ // We use Ins64 because that is the field which populates InOperandList,
+ // due to the logic in class VOP3_Pseudo.
+ let Ins64 = !con(srcs, mods);
+ let Asm64 =
+ "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
+}
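As an aside on the comment above about inline immediates being "really f16 converted to f32": the numeric effect can be reproduced with LLVM's APFloat. This is only an illustration of that conversion, under the stated assumption that the immediate is encoded as an IEEE-754 half (e.g. 0x3C00 for 1.0); it is not the code path the assembler or instruction selector actually uses, and showMixImmediateWidening is a made-up name.

// Sketch: widen an f16-encoded inline immediate to the f32 value used when
// op_sel_hi selects the f32 interpretation. Standalone illustration only.
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/Support/raw_ostream.h"

static void showMixImmediateWidening() {
  using namespace llvm;
  APFloat Imm(APFloat::IEEEhalf(), APInt(16, 0x3C00)); // f16 1.0
  bool LosesInfo = false;
  Imm.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);
  outs() << "f16 0x3C00 widens to f32 " << Imm.convertToFloat()
         << (LosesInfo ? " (inexact)\n" : " (exact)\n");
}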
+
+multiclass VOP3PInst<string OpName, VOPProfile P,
+ SDPatternOperator node = null_frag, bit HasExplicitClamp = 0> {
+ def NAME : VOP3P_Pseudo<OpName, P,
+ !if (P.HasModifiers,
+ getVOP3PModPat<P, node, HasExplicitClamp>.ret,
+ getVOP3Pat<P, node>.ret)>;
+}
+
// Non-packed instructions that use the VOP3P encoding.
// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed.
-class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0,
- SDPatternOperator node = null_frag> :
- VOP3P_Pseudo<OpName, P> {
- // These operands are only sort of f16 operands. Depending on
- // op_sel_hi, these may be interpreted as f32. The inline immediate
- // values are really f16 converted to f32, so we treat these as f16
- // operands.
- let InOperandList =
- !con(
- !con(
- (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
- FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2),
- // FIXME: clampmod0 misbehaves with the non-default vdst_in
- // following it. For now workaround this by requiring clamp
- // in tied patterns. This should use undef_tied_input, but it
- // seems underdeveloped and doesn't apply the right register
- // class constraints.
- !if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in),
- (ins clampmod0:$clamp))),
- (ins op_sel0:$op_sel, op_sel_hi0:$op_sel_hi));
-
- let Constraints = !if(UseTiedOutput, "$vdst = $vdst_in", "");
- let DisableEncoding = !if(UseTiedOutput, "$vdst_in", "");
- let AsmOperands =
- " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
+multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P,
+ SDPatternOperator node = null_frag> {
+ def NAME : VOP3P_Pseudo<OpName, P> {
+ let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
+ }
}
let isCommutable = 1 in {
-def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
-def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
let FPDPRounding = 1 in {
-def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
-def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
-def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
+defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
+defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
+defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
} // End FPDPRounding = 1
-def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
-def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
+defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
+defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
-def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
-def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
+defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
+defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
-def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
-def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
-def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
-def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
+defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
+defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
+defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
+defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
}
-def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
+defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
-def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
-def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
-def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
+defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
+defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
let SubtargetPredicate = HasVOP3PInsts in {
@@ -169,14 +180,14 @@
// Size of src arguments (16/32) is controlled by op_sel.
// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
let isCommutable = 1, mayRaiseFPException = 0 in {
-def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
+defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
-def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+defm V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
let ClampLo = 0, ClampHi = 1 in {
-def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
}
} // End FPDPRounding = 1
}
@@ -188,14 +199,14 @@
// Essentially the same as the mad_mix versions
let SubtargetPredicate = HasFmaMixInsts in {
let isCommutable = 1 in {
-def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
+defm V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
-def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
let ClampLo = 0, ClampHi = 1 in {
-def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
}
} // End FPDPRounding = 1
}
@@ -287,25 +298,30 @@
let IsDOT = 1 in {
let SubtargetPredicate = HasDot2Insts in {
-def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
- VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
- AMDGPUfdot2, 1/*ExplicitClamp*/>;
-def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
+defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
-def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
+defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
-def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
- VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
-def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
- VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
} // End SubtargetPredicate = HasDot2Insts
+let SubtargetPredicate = HasDot7Insts in {
+
+defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
+ VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
+ AMDGPUfdot2, 1/*ExplicitClamp*/>;
+defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
+ VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
+ VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
+
+} // End SubtargetPredicate = HasDot7Insts
+
let SubtargetPredicate = HasDot1Insts in {
-def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
+defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
-def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
+defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
} // End SubtargetPredicate = HasDot1Insts
@@ -319,7 +335,7 @@
def : GCNPat <
!cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
(add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))),
- (!cast<VOP3PInst>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+ (!cast<VOP3P_Pseudo>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
foreach Type = ["U", "I"] in
let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
@@ -327,7 +343,7 @@
!cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
[1, 2, 3, 4, 5, 6, 7], lhs, y,
(NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
- (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+ (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
// in the compile time. Directly handle the pattern generated by the FE here.
@@ -337,12 +353,19 @@
!cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
[7, 1, 2, 3, 4, 5, 6], lhs, y,
(NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
- (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+ (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
def ADst_32 : VOPDstOperand<AGPR_32>;
+def ADst_64 : VOPDstOperand<AReg_64>;
def ADst_128 : VOPDstOperand<AReg_128>;
+def ADst_256 : VOPDstOperand<AReg_256>;
def ADst_512 : VOPDstOperand<AReg_512>;
def ADst_1024 : VOPDstOperand<AReg_1024>;
+def VDst_64 : VOPDstOperand<VReg_64>;
+def VDst_128 : VOPDstOperand<VReg_128>;
+def VDst_256 : VOPDstOperand<VReg_256>;
+def VDst_512 : VOPDstOperand<VReg_512>;
+def VDst_1024 : VOPDstOperand<VReg_1024>;
def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
let Src0RC64 = ARegSrc_32;
@@ -362,7 +385,10 @@
let Src2RC64 = _SrcRC;
let HasOpSel = 0;
let HasClamp = 0;
- let Asm64 = " $vdst, $src0, $src1, $src2$cbsz$abid$blgp";
+ let HasIntClamp = 0;
+ let HasOMod = 0;
+ let HasModifiers = 0;
+ let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp";
let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
}
@@ -378,6 +404,29 @@
def VOPProfileMAI_F32_V4F16_X4 : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X4 : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
+def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, AISrc_256_f64, ADst_256, AVSrc_64>;
+def VOPProfileMAI_F64_4X4X4F64 : VOPProfileMAI<VOP_F64_F64_F64_F64, AISrc_64_f64, ADst_64, AVSrc_64>;
+
+def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, VISrc_128_f32, VDst_128>;
+def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, VISrc_512_f32, VDst_512>;
+def VOPProfileMAI_F32_F32_X32_VCD : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32, VISrc_1024_f32, VDst_1024>;
+def VOPProfileMAI_I32_I32_X4_VCD : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32, VISrc_128_b32, VDst_128>;
+def VOPProfileMAI_I32_I32_X16_VCD : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32, VISrc_512_b32, VDst_512>;
+def VOPProfileMAI_I32_I32_X32_VCD : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32, VISrc_1024_b32, VDst_1024>;
+def VOPProfileMAI_F32_V2I16_X4_VCD : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32, VISrc_128_b32, VDst_128>;
+def VOPProfileMAI_F32_V2I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, VISrc_512_b32, VDst_512>;
+def VOPProfileMAI_F32_V2I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, VISrc_1024_b32, VDst_1024>;
+def VOPProfileMAI_F32_V4F16_X4_VCD : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V4F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V4F16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X4_VCD : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
+def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>;
+def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>;
let Predicates = [HasMAIInsts] in {
@@ -388,32 +437,57 @@
} // End isMoveImm = 1
} // End isAsCheapAsAMove = 1, isReMaterializable = 1
-// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
-let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
-defm V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>;
-defm V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>;
-defm V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>;
-defm V_MFMA_F32_4X4X2BF16 : VOP3Inst<"v_mfma_f32_4x4x2bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_4x4x2bf16>;
-defm V_MFMA_F32_16X16X1F32 : VOP3Inst<"v_mfma_f32_16x16x1f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_16x16x1f32>;
-defm V_MFMA_F32_16X16X4F32 : VOP3Inst<"v_mfma_f32_16x16x4f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_16x16x4f32>;
-defm V_MFMA_F32_16X16X4F16 : VOP3Inst<"v_mfma_f32_16x16x4f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>;
-defm V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_16x16x16f16>;
-defm V_MFMA_I32_16X16X4I8 : VOP3Inst<"v_mfma_i32_16x16x4i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_16x16x4i8>;
-defm V_MFMA_I32_16X16X16I8 : VOP3Inst<"v_mfma_i32_16x16x16i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_16x16x16i8>;
-defm V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>;
-defm V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_16x16x8bf16>;
-defm V_MFMA_F32_32X32X1F32 : VOP3Inst<"v_mfma_f32_32x32x1f32", VOPProfileMAI_F32_F32_X32, int_amdgcn_mfma_f32_32x32x1f32>;
-defm V_MFMA_F32_32X32X2F32 : VOP3Inst<"v_mfma_f32_32x32x2f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_32x32x2f32>;
-defm V_MFMA_F32_32X32X4F16 : VOP3Inst<"v_mfma_f32_32x32x4f16", VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>;
-defm V_MFMA_F32_32X32X8F16 : VOP3Inst<"v_mfma_f32_32x32x8f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>;
-defm V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I32_I32_X32, int_amdgcn_mfma_i32_32x32x4i8>;
-defm V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>;
-defm V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
-defm V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
-} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
+multiclass MAIInst<string OpName, string P, SDPatternOperator node> {
+ let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
+ // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
+ defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>;
+
+ let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in
+ defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>;
+ } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
+}
+
+defm V_MFMA_F32_4X4X1F32 : MAIInst<"v_mfma_f32_4x4x1f32", "F32_F32_X4", int_amdgcn_mfma_f32_4x4x1f32>;
+defm V_MFMA_F32_4X4X4F16 : MAIInst<"v_mfma_f32_4x4x4f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_4x4x4f16>;
+defm V_MFMA_I32_4X4X4I8 : MAIInst<"v_mfma_i32_4x4x4i8", "I32_I32_X4", int_amdgcn_mfma_i32_4x4x4i8>;
+defm V_MFMA_F32_16X16X1F32 : MAIInst<"v_mfma_f32_16x16x1f32", "F32_F32_X16", int_amdgcn_mfma_f32_16x16x1f32>;
+defm V_MFMA_F32_16X16X4F32 : MAIInst<"v_mfma_f32_16x16x4f32", "F32_F32_X4", int_amdgcn_mfma_f32_16x16x4f32>;
+defm V_MFMA_F32_16X16X4F16 : MAIInst<"v_mfma_f32_16x16x4f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_16x16x4f16>;
+defm V_MFMA_F32_16X16X16F16 : MAIInst<"v_mfma_f32_16x16x16f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_16x16x16f16>;
+defm V_MFMA_I32_16X16X4I8 : MAIInst<"v_mfma_i32_16x16x4i8", "I32_I32_X16", int_amdgcn_mfma_i32_16x16x4i8>;
+defm V_MFMA_F32_32X32X1F32 : MAIInst<"v_mfma_f32_32x32x1f32", "F32_F32_X32", int_amdgcn_mfma_f32_32x32x1f32>;
+defm V_MFMA_F32_32X32X2F32 : MAIInst<"v_mfma_f32_32x32x2f32", "F32_F32_X16", int_amdgcn_mfma_f32_32x32x2f32>;
+defm V_MFMA_F32_32X32X4F16 : MAIInst<"v_mfma_f32_32x32x4f16", "F32_V4F16_X32", int_amdgcn_mfma_f32_32x32x4f16>;
+defm V_MFMA_F32_32X32X8F16 : MAIInst<"v_mfma_f32_32x32x8f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_32x32x8f16>;
+defm V_MFMA_I32_32X32X4I8 : MAIInst<"v_mfma_i32_32x32x4i8", "I32_I32_X32", int_amdgcn_mfma_i32_32x32x4i8>;
+defm V_MFMA_I32_16X16X16I8 : MAIInst<"v_mfma_i32_16x16x16i8", "I32_I32_X4", int_amdgcn_mfma_i32_16x16x16i8>;
+defm V_MFMA_I32_32X32X8I8 : MAIInst<"v_mfma_i32_32x32x8i8", "I32_I32_X16", int_amdgcn_mfma_i32_32x32x8i8>;
+defm V_MFMA_F32_4X4X2BF16 : MAIInst<"v_mfma_f32_4x4x2bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_4x4x2bf16>;
+defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_16x16x2bf16>;
+defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_16x16x8bf16>;
+defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32", int_amdgcn_mfma_f32_32x32x2bf16>;
+defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>;
} // End SubtargetPredicate = HasMAIInsts
+let Predicates = [isGFX90APlus] in {
+ defm V_MFMA_F32_32X32X4BF16_1K : MAIInst<"v_mfma_f32_32x32x4bf16_1k", "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>;
+ defm V_MFMA_F32_16X16X4BF16_1K : MAIInst<"v_mfma_f32_16x16x4bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_16x16x4bf16_1k>;
+ defm V_MFMA_F32_4X4X4BF16_1K : MAIInst<"v_mfma_f32_4x4x4bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_4x4x4bf16_1k>;
+ defm V_MFMA_F32_32X32X8BF16_1K : MAIInst<"v_mfma_f32_32x32x8bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_32x32x8bf16_1k>;
+ defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_16x16x16bf16_1k>;
+
+ defm V_MFMA_F64_16X16X4F64 : MAIInst<"v_mfma_f64_16x16x4f64", "F64_16X16X4F64", int_amdgcn_mfma_f64_16x16x4f64>;
+ defm V_MFMA_F64_4X4X4F64 : MAIInst<"v_mfma_f64_4x4x4f64", "F64_4X4X4F64", int_amdgcn_mfma_f64_4x4x4f64>;
+} // End Predicates = [isGFX90APlus]
+
+let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in {
+ defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
+ defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
+ defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
+ defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
+} // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1
+
def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
@@ -430,23 +504,36 @@
VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
let AssemblerPredicate = HasVOP3PInsts;
let DecoderNamespace = "GFX8";
+ let VOP3P = 1;
}
}
multiclass VOP3P_Real_MAI<bits<7> op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
- VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
let AssemblerPredicate = HasMAIInsts;
let DecoderNamespace = "GFX8";
- let Inst{14} = 1; // op_sel_hi(2) default value
- let Inst{59} = 1; // op_sel_hi(0) default value
- let Inst{60} = 1; // op_sel_hi(1) default value
+ let Inst{14} = ?; // op_sel_hi(2)
+ let Inst{59} = ?; // op_sel_hi(0)
+ let Inst{60} = ?; // op_sel_hi(1)
}
}
-multiclass VOP3P_Real_MFMA<bits<7> op> {
+multiclass VOP3P_Real_MFMA_gfx90a<bits<7> op> {
+ let SubtargetPredicate = isGFX90AOnly,
+ AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" in {
+ def _gfx90a_acd : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX90A>,
+ VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, 1>;
+
+ def _gfx90a_vcd : VOP3P_Real<!cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64"), SIEncodingFamily.GFX90A>,
+ VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64").Pfl, 0>;
+ } // End AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A"
+}
+
+multiclass VOP3P_Real_MFMA<bits<7> op> :
+ VOP3P_Real_MFMA_gfx90a <op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
- VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
let AssemblerPredicate = HasMAIInsts;
let DecoderNamespace = "GFX8";
}
@@ -494,13 +581,18 @@
let SubtargetPredicate = HasDot2Insts in {
-defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>;
defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>;
defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>;
+
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot7Insts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>;
defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x29>;
defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x2b>;
-} // End SubtargetPredicate = HasDot2Insts
+} // End SubtargetPredicate = HasDot7Insts
let SubtargetPredicate = HasDot1Insts in {
@@ -536,16 +628,31 @@
} // End SubtargetPredicate = HasMAIInsts
+defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x63>;
+defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x64>;
+defm V_MFMA_F32_4X4X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x65>;
+defm V_MFMA_F32_32X32X8BF16_1K : VOP3P_Real_MFMA_gfx90a <0x66>;
+defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>;
+defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>;
+defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;
+
+let SubtargetPredicate = HasPackedFP32Ops in {
+ defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
+ defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
+ defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>;
+ defm V_PK_MOV_B32 : VOP3P_Real_vi <0x33>;
+} // End SubtargetPredicate = HasPackedFP32Ops
+
//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 in {
multiclass VOP3P_Real_gfx10<bits<7> op> {
def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
}
-} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1
defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x00>;
defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x01>;
@@ -572,13 +679,18 @@
let SubtargetPredicate = HasDot2Insts in {
-defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>;
defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>;
defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>;
+
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot7Insts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>;
defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x17>;
defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x19>;
-} // End SubtargetPredicate = HasDot2Insts
+} // End SubtargetPredicate = HasDot7Insts
let SubtargetPredicate = HasDot1Insts in {
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 99599c5..c0cc9102 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -27,10 +27,6 @@
let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
let Inst{24-17} = op;
let Inst{31-25} = 0x3e; // encoding
-
- // VOPC disallows dst_sel and dst_unused as they have no effect on destination
- let Inst{42-40} = 0;
- let Inst{44-43} = 0;
}
class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
@@ -56,6 +52,8 @@
let Asm32 = "$src0, $src1";
// The destination for 32-bit encoding is implicit.
let HasDst32 = 0;
+ // VOPC disallows dst_sel and dst_unused as they have no effect on destination
+ let EmitDstSel = 0;
let Outs64 = (outs VOPDstS64orS32:$sdst);
list<SchedReadWrite> Schedule = sched;
}
@@ -106,6 +104,8 @@
InstSI <ps.OutOperandList, ps.InOperandList, ps.PseudoInstr # " " # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+ let VALU = 1;
+ let VOPC = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -121,6 +121,9 @@
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
}
class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -760,7 +763,7 @@
// We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith()
// complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place.
-multiclass ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> {
+multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
let WaveSizePredicate = isWave64 in
def : GCNPat <
(i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
@@ -807,7 +810,7 @@
defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>;
defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>;
-multiclass FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> {
+multiclass FCMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
let WaveSizePredicate = isWave64 in
def : GCNPat <
(i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
diff --git a/src/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td b/src/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 282c100..5f6f664 100644
--- a/src/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/src/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -140,10 +140,18 @@
let VOP3P = 1;
}
+class VOP_Real<VOP_Pseudo ps> {
+ Instruction Opcode = !cast<Instruction>(NAME);
+ bit IsSingle = ps.Pfl.IsSingle;
+}
+
class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
+ VOP_Real <ps>,
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+ let VALU = 1;
+ let VOP3 = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
let UseNamedOperandTable = 1;
@@ -162,6 +170,10 @@
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let TRANS = ps.TRANS;
VOPProfile Pfl = ps.Pfl;
}
@@ -317,7 +329,7 @@
let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
- let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, 0); // op_sel_hi(2)
+ let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2)
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
@@ -326,14 +338,14 @@
let Inst{40-32} = !if(P.HasSrc0, src0, 0);
let Inst{49-41} = !if(P.HasSrc1, src1, 0);
let Inst{58-50} = !if(P.HasSrc2, src2, 0);
- let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, 0); // op_sel_hi(0)
- let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, 0); // op_sel_hi(1)
+ let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0)
+ let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1)
let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
}
-class VOP3Pe_MAI <bits<7> op, VOPProfile P> : Enc64 {
+class VOP3Pe_MAI <bits<7> op, VOPProfile P, bit acc_cd = 0> : Enc64 {
bits<8> vdst;
bits<10> src0;
bits<10> src1;
@@ -341,14 +353,13 @@
bits<3> blgp;
bits<3> cbsz;
bits<4> abid;
- bits<1> clamp;
let Inst{7-0} = vdst;
let Inst{10-8} = !if(P.HasSrc1, cbsz, 0);
let Inst{14-11} = !if(P.HasSrc1, abid, 0);
- let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+ let Inst{15} = acc_cd;
let Inst{22-16} = op;
let Inst{31-23} = 0x1a7; //encoding
@@ -411,8 +422,8 @@
bits<1> clamp;
let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
- let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
- let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
+ let Inst{42-40} = !if(P.EmitDstSel, dst_sel{2-0}, ?);
+ let Inst{44-43} = !if(P.EmitDstSel, dst_unused{1-0}, ?);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
@@ -462,8 +473,8 @@
bits<1> clamp;
bits<2> omod;
- let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
- let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
+ let Inst{42-40} = !if(P.EmitDstSel, dst_sel{2-0}, ?);
+ let Inst{44-43} = !if(P.EmitDstSel, dst_unused{1-0}, ?);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0);
}
@@ -515,12 +526,13 @@
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA> {
+ let VALU = 1;
+ let SDWA = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
let Defs = ps.Defs;
let Uses = ps.Uses;
- let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
let Constraints = ps.Constraints;
@@ -536,17 +548,22 @@
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let TRANS = ps.TRANS;
}
class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []> {
+ let VALU = 1;
+ let SDWA = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
let Defs = ps.Defs;
let Uses = ps.Uses;
- let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
let Constraints = ps.Constraints;
@@ -564,6 +581,10 @@
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let TRANS = ps.TRANS;
}
class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
@@ -628,8 +649,8 @@
string AsmOperands = P.AsmDPP;
let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
- let SubtargetPredicate = HasDPP;
- let AssemblerPredicate = HasDPP;
+ let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
+ let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
@@ -643,12 +664,13 @@
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+ let VALU = 1;
+ let DPP = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
let Defs = ps.Defs;
let Uses = ps.Uses;
- let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
let Constraints = ps.Constraints;
@@ -665,6 +687,10 @@
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let TRANS = ps.TRANS;
}
class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
@@ -683,8 +709,8 @@
let Size = 8;
let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
- let SubtargetPredicate = HasDPP;
- let AssemblerPredicate = HasDPP;
+ let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
+ let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
@@ -795,3 +821,17 @@
include "VOP2Instructions.td"
include "VOP3Instructions.td"
include "VOP3PInstructions.td"
+
+
+class VOPInfoTable <string Format> : GenericTable {
+ let FilterClass = Format # "_Real";
+ let CppTypeName = "VOPInfo";
+ let Fields = ["Opcode", "IsSingle"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "get" # Format # "OpcodeHelper";
+}
+
+def VOP1InfoTable : VOPInfoTable<"VOP1">;
+def VOP2InfoTable : VOPInfoTable<"VOP2">;
+def VOP3InfoTable : VOPInfoTable<"VOP3">;
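For orientation, a hedged sketch of how the table TableGen emits for these VOPInfoTable defs might be consumed. The row shape and the get<Format>OpcodeHelper naming follow the Fields and PrimaryKeyName settings above, but the exact generated signature and the placeholder rows are assumptions, not the real .inc output.

```cpp
#include <algorithm>
#include <cstdint>
#include <iterator>

// One row per <Format>_Real instruction: Opcode is the primary key,
// IsSingle is copied from the pseudo's profile via VOP_Real above.
struct VOPInfo {
  uint16_t Opcode;
  bool IsSingle;
};

// Placeholder rows; the generated table lives in a TableGen'd .inc file
// and is sorted by the primary key.
static const VOPInfo VOP3Infos[] = {
    {0x140, false},
    {0x141, true},
};

// Lookup in the spirit of the generated getVOP3OpcodeHelper routine.
static const VOPInfo *getVOP3OpcodeHelper(uint16_t Opcode) {
  const VOPInfo *It = std::lower_bound(
      std::begin(VOP3Infos), std::end(VOP3Infos), Opcode,
      [](const VOPInfo &Row, uint16_t Op) { return Row.Opcode < Op; });
  if (It == std::end(VOP3Infos) || It->Opcode != Opcode)
    return nullptr;
  return It;
}

// Typical client: ask whether a real VOP3 opcode came from a "single" profile.
bool isSingleVOP3(uint16_t Opcode) {
  const VOPInfo *Row = getVOP3OpcodeHelper(Opcode);
  return Row && Row->IsSingle;
}
```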
diff --git a/src/llvm-project/llvm/lib/Target/ARC/ARC.td b/src/llvm-project/llvm/lib/Target/ARC/ARC.td
index 846f1bb..142ce7f 100644
--- a/src/llvm-project/llvm/lib/Target/ARC/ARC.td
+++ b/src/llvm-project/llvm/lib/Target/ARC/ARC.td
@@ -8,6 +8,18 @@
include "llvm/Target/Target.td"
+//===----------------------------------------------------------------------===//
+// ARC Subtarget features
+//===----------------------------------------------------------------------===//
+
+def FeatureNORM
+ : SubtargetFeature<"norm", "Xnorm", "true",
+ "Enable support for norm instruction.">;
+
+//===----------------------------------------------------------------------===//
+// Registers, calling conventions, instruction descriptions
+//===----------------------------------------------------------------------===//
+
include "ARCRegisterInfo.td"
include "ARCInstrInfo.td"
include "ARCCallingConv.td"
diff --git a/src/llvm-project/llvm/lib/Target/ARC/ARCBranchFinalize.cpp b/src/llvm-project/llvm/lib/Target/ARC/ARCBranchFinalize.cpp
index 2f05ac4..0e3e4d3 100644
--- a/src/llvm-project/llvm/lib/Target/ARC/ARCBranchFinalize.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARC/ARCBranchFinalize.cpp
@@ -10,8 +10,6 @@
// range conditional branches.
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "arc-branch-finalize"
-
#include "ARCInstrInfo.h"
#include "ARCTargetMachine.h"
#include "MCTargetDesc/ARCInfo.h"
@@ -24,6 +22,8 @@
#include "llvm/Support/Debug.h"
#include <vector>
+#define DEBUG_TYPE "arc-branch-finalize"
+
using namespace llvm;
namespace llvm {
diff --git a/src/llvm-project/llvm/lib/Target/ARC/ARCFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/ARC/ARCFrameLowering.cpp
index ead5931..6b5802d 100644
--- a/src/llvm-project/llvm/lib/Target/ARC/ARCFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARC/ARCFrameLowering.cpp
@@ -493,6 +493,6 @@
bool HasFP = MF.getTarget().Options.DisableFramePointerElim(MF) ||
MF.getFrameInfo().hasVarSizedObjects() ||
MF.getFrameInfo().isFrameAddressTaken() ||
- RegInfo->needsStackRealignment(MF);
+ RegInfo->hasStackRealignment(MF);
return HasFP;
}
diff --git a/src/llvm-project/llvm/lib/Target/ARC/ARCInstrFormats.td b/src/llvm-project/llvm/lib/Target/ARC/ARCInstrFormats.td
index 584844d..5f539c9 100644
--- a/src/llvm-project/llvm/lib/Target/ARC/ARCInstrFormats.td
+++ b/src/llvm-project/llvm/lib/Target/ARC/ARCInstrFormats.td
@@ -261,6 +261,32 @@
let Inst{5-0} = subop;
}
+// Single Operand Immediate Instructions.
+// 1-register, unsigned 6-bit immediate Single Operand instruction with
+// condition code.
+// |26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|11|10|9|8|7|6|5|4|3|2|1|0|
+// |B[2-0] | 1| 1| subop| F|B[5-3] |U6 |1|cc |
+class F32_SOP_CC_RU6<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
+ string asmstr, list<dag> pattern> :
+ InstARC<4, outs, ins, asmstr, pattern> {
+
+ bits<5> cc;
+ bits<6> U6;
+ bits<6> B;
+
+ let Inst{31-27} = major;
+ let Inst{26-24} = B{2-0};
+ let Inst{23-22} = 0b11;
+ let Inst{21-16} = subop;
+ let Inst{15} = F;
+ let Inst{14-12} = B{5-3};
+ let Inst{11-6} = U6;
+ let Inst{5} = 1;
+ let Inst{4-0} = cc;
+
+ let DecoderMethod = "DecodeCCRU6Instruction";
+}
+
// Dual Operand Instructions. Inst[21-16] specifies the specific operation
// for this format.
@@ -327,6 +353,30 @@
let Inst{5-0} = A;
}
+// 2-register, unsigned 6-bit immediate Dual Operand instruction with
+// condition code. This instruction uses B as the first 2 operands
+// (i.e., add.cc B, B, u6).
+// |26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|11|10|9|8|7|6|5|4|3|2|1|0|
+// |B[2-0] | 1| 1| subop| F|B[5-3] |U6 |1|cc |
+class F32_DOP_CC_RRU6<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
+ string asmstr, list<dag> pattern> :
+ InstARC<4, outs, ins, asmstr, pattern> {
+ bits<5> cc;
+ bits<6> U6;
+ bits<6> B;
+ bits<6> A;
+
+ let Inst{31-27} = major;
+ let Inst{26-24} = B{2-0};
+ let Inst{23-22} = 0b11;
+ let Inst{21-16} = subop;
+ let Inst{15} = F;
+ let Inst{14-12} = B{5-3};
+ let Inst{11-6} = U6;
+ let Inst{5} = 1;
+ let Inst{4-0} = cc;
+}
+
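The two encoding classes added above share the bit layout shown in their comments; the following standalone encoder (illustrative only, not part of the imported change) packs the same fields into a 32-bit word:

```cpp
#include <cstdint>

// Packs the CC_RU6/CC_RRU6 layout documented above: major in 31-27,
// B[2-0] in 26-24, constant 0b11 in 23-22, subop in 21-16, F in 15,
// B[5-3] in 14-12, U6 in 11-6, constant 1 in bit 5, cc in 4-0.
uint32_t encodeCCRU6(uint8_t Major, uint8_t Subop, bool F, uint8_t B,
                     uint8_t U6, uint8_t CC) {
  uint32_t Inst = 0;
  Inst |= uint32_t(Major & 0x1f) << 27;
  Inst |= uint32_t(B & 0x7) << 24;        // B[2-0]
  Inst |= uint32_t(0b11) << 22;
  Inst |= uint32_t(Subop & 0x3f) << 16;
  Inst |= uint32_t(F ? 1 : 0) << 15;
  Inst |= uint32_t((B >> 3) & 0x7) << 12; // B[5-3]
  Inst |= uint32_t(U6 & 0x3f) << 6;
  Inst |= uint32_t(1) << 5;
  Inst |= uint32_t(CC & 0x1f);
  return Inst;
}
```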
// 2-register, signed 12-bit immediate Dual Operand instruction.
// This instruction uses B as the first 2 operands (i.e., add B, B, -128).
// |26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|11|10|9|8|7|6|5|4|3|2|1|0|
diff --git a/src/llvm-project/llvm/lib/Target/ARC/ARCInstrInfo.td b/src/llvm-project/llvm/lib/Target/ARC/ARCInstrInfo.td
index 8fe393d..ea3e416 100644
--- a/src/llvm-project/llvm/lib/Target/ARC/ARCInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/ARC/ARCInstrInfo.td
@@ -12,6 +12,24 @@
include "ARCInstrFormats.td"
+//===----------------------------------------------------------------------===//
+// Operand Pattern Stuff.
+//===----------------------------------------------------------------------===//
+
+// Operand for printing out a condition code.
+let PrintMethod = "printCCOperand" in
+ def CCOp : PredicateOperand<i32, (ops i32imm), (ops)>;
+
+// The "u6" operand of a RRU6-type instruction
+let PrintMethod = "printU6" in {
+ def u6 : Operand<i32>, ImmLeaf<i32, [{
+ return isUInt<6>(Imm);
+ }]>;
+ def wide_u6 : Operand<i64>, ImmLeaf<i64, [{
+ return isUInt<6>(Imm);
+ }]>;
+}
+
// ---------------------------------------------------------------------------
// Selection DAG Nodes.
// ---------------------------------------------------------------------------
@@ -118,12 +136,13 @@
// Generic 3 operand binary instructions (i.e., add r0, r1, r2).
multiclass ArcBinaryInst<bits<5> major, bits<6> mincode,
- string opasm> {
+ string opasm, bit Commutable> {
// 3 register variant.
def _rrr : F32_DOP_RR<major, mincode, 0, (outs GPR32:$A),
(ins GPR32:$B, GPR32:$C),
!strconcat(opasm, "\t$A, $B, $C"),
- []>;
+ []>
+ { let isCommutable = Commutable; }
def _f_rrr : F32_DOP_RR<major, mincode, 1, (outs GPR32:$A),
(ins GPR32:$B, GPR32:$C),
!strconcat(opasm, ".f\t$A, $B, $C"),
@@ -141,6 +160,23 @@
[]>
{ let Defs = [STATUS32]; }
+ def _cc_rru6 : F32_DOP_CC_RRU6<major, mincode, 0, (outs GPR32:$A),
+ (ins immU6:$U6, ccond:$cc, GPR32:$B),
+ !strconcat(opasm, ".$cc\t$A, $B, $U6"),
+ []> {
+ let Uses = [STATUS32];
+ let Constraints = "$A = $B";
+ }
+
+ def _cc_f_rru6 : F32_DOP_CC_RRU6<major, mincode, 1, (outs GPR32:$A),
+ (ins immU6:$U6, ccond:$cc, GPR32:$B),
+ !strconcat(opasm, ".$cc.f\t$A, $B, $U6"),
+ []> {
+ let Defs = [STATUS32];
+ let Uses = [STATUS32];
+ let Constraints = "$A = $B";
+ }
+
// 2 register with 32-bit immediate variant.
def _rrlimm : F32_DOP_RLIMM<major, mincode, 0,
(outs GPR32:$A),
@@ -198,13 +234,15 @@
}
-multiclass ArcBinaryGEN4Inst<bits<6> mincode, string opasm> :
- ArcBinaryInst<0b00100, mincode, opasm>;
+multiclass ArcBinaryGEN4Inst<bits<6> mincode, string opasm, bit Commutable = 0> :
+ ArcBinaryInst<0b00100, mincode, opasm, Commutable>;
multiclass ArcBinaryEXT5Inst<bits<6> mincode, string opasm> :
- ArcBinaryInst<0b00101, mincode, opasm>;
+ ArcBinaryInst<0b00101, mincode, opasm, 0>;
multiclass ArcUnaryGEN4Inst<bits<6> mincode, string opasm> :
ArcUnaryInst<0b00100, mincode, opasm>;
+multiclass ArcUnaryEXT5Inst<bits<6> mincode, string opasm> :
+ ArcUnaryInst<0b00101, mincode, opasm>;
// Pattern generation for different instruction variants.
multiclass MultiPat<SDPatternOperator InFrag,
@@ -219,24 +257,25 @@
// ---------------------------------------------------------------------------
// Definitions for 3 operand binary instructions.
-defm ADD : ArcBinaryGEN4Inst<0b000000, "add">;
+defm ADD : ArcBinaryGEN4Inst<0b000000, "add", 1>;
defm SUB : ArcBinaryGEN4Inst<0b000010, "sub">;
defm SUB1 : ArcBinaryGEN4Inst<0b010111, "sub1">;
defm SUB2 : ArcBinaryGEN4Inst<0b011000, "sub2">;
defm SUB3 : ArcBinaryGEN4Inst<0b011001, "sub3">;
-defm OR : ArcBinaryGEN4Inst<0b000101, "or">;
-defm AND : ArcBinaryGEN4Inst<0b000100, "and">;
-defm XOR : ArcBinaryGEN4Inst<0b000111, "xor">;
-defm MAX : ArcBinaryGEN4Inst<0b001000, "max">;
-defm MIN : ArcBinaryGEN4Inst<0b001001, "min">;
+defm RSUB : ArcBinaryGEN4Inst<0b001110, "rsub">;
+defm OR : ArcBinaryGEN4Inst<0b000101, "or", 1>;
+defm AND : ArcBinaryGEN4Inst<0b000100, "and", 1>;
+defm XOR : ArcBinaryGEN4Inst<0b000111, "xor", 1>;
+defm MAX : ArcBinaryGEN4Inst<0b001000, "max", 1>;
+defm MIN : ArcBinaryGEN4Inst<0b001001, "min", 1>;
defm ASL : ArcBinaryEXT5Inst<0b000000, "asl">;
defm LSR : ArcBinaryEXT5Inst<0b000001, "lsr">;
defm ASR : ArcBinaryEXT5Inst<0b000010, "asr">;
defm ROR : ArcBinaryEXT5Inst<0b000011, "ror">;
-defm MPY : ArcBinaryGEN4Inst<0b011010, "mpy">;
-defm MPYM : ArcBinaryGEN4Inst<0b011011, "mpym">;
-defm MPYMU : ArcBinaryGEN4Inst<0b011100, "mpymu">;
-defm SETEQ : ArcBinaryGEN4Inst<0b111000, "seteq">;
+defm MPY : ArcBinaryGEN4Inst<0b011010, "mpy", 1>;
+defm MPYM : ArcBinaryGEN4Inst<0b011011, "mpym", 1>;
+defm MPYMU : ArcBinaryGEN4Inst<0b011100, "mpymu", 1>;
+defm SETEQ : ArcBinaryGEN4Inst<0b111000, "seteq", 1>;
// Patterns for 3 operand binary instructions.
defm : MultiPat<add, ADD_rrr, ADD_rru6, ADD_rrlimm>;
@@ -261,6 +300,9 @@
defm SEXB : ArcUnaryGEN4Inst<0b000101, "sexb">;
defm SEXH : ArcUnaryGEN4Inst<0b000110, "sexh">;
+// Extension unary instruction definitions.
+defm FLS : ArcUnaryEXT5Inst<0b010011, "fls">;
+
// General Unary Instruction fragments.
def : Pat<(sext_inreg i32:$a, i8), (SEXB_rr i32:$a)>;
def : Pat<(sext_inreg i32:$a, i16), (SEXH_rr i32:$a)>;
@@ -298,14 +340,24 @@
def cmov : PatFrag<(ops node:$op1, node:$op2, node:$cc),
(ARCcmov $op1, $op2, $cc)>;
let Uses = [STATUS32] in {
-def MOVcc : F32_DOP_CC_RR<0b00100, 0b001010, 0,
- (outs GPR32:$B),
- (ins GPR32:$C, GPR32:$fval, cmovpred:$cc),
- !strconcat("mov.", "$cc\t$B, $C"),
- [(set GPR32:$B, (cmov i32:$C, i32:$fval, cmovpred:$cc))]> {
- let Constraints = "$B = $fval";
+ def MOVcc : F32_DOP_CC_RR<0b00100, 0b001010, 0,
+ (outs GPR32:$B),
+ (ins GPR32:$C, GPR32:$fval, cmovpred:$cc),
+ !strconcat("mov.", "$cc\t$B, $C"),
+ [(set GPR32:$B, (cmov i32:$C, i32:$fval, cmovpred:$cc))]> {
+ let Constraints = "$B = $fval";
+ }
+
+ def MOVcc_ru6 : F32_SOP_CC_RU6<0b00100, 0b001010, 0,
+ (outs GPR32:$b), (ins u6:$c, CCOp:$cc, GPR32:$b2),
+ "mov.$cc\t$b, $c", []> {
+    let isAsCheapAsAMove = 0;
+    let isPredicable = 1;
+    let isReMaterializable = 0;
+    let Constraints = "$b2 = $b";
+ }
}
-}
+
def : Pat<(ARCGAWrapper tglobaladdr:$addr),
(MOV_rlimm tglobaladdr:$addr)>;
diff --git a/src/llvm-project/llvm/lib/Target/ARC/ARCRegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/ARC/ARCRegisterInfo.cpp
index c49af8a..fb84dd9 100644
--- a/src/llvm-project/llvm/lib/Target/ARC/ARCRegisterInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARC/ARCRegisterInfo.cpp
@@ -22,9 +22,9 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
-#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/src/llvm-project/llvm/lib/Target/ARC/ARCRegisterInfo.td b/src/llvm-project/llvm/lib/Target/ARC/ARCRegisterInfo.td
index 82fdccc..5f2bc79 100644
--- a/src/llvm-project/llvm/lib/Target/ARC/ARCRegisterInfo.td
+++ b/src/llvm-project/llvm/lib/Target/ARC/ARCRegisterInfo.td
@@ -29,7 +29,7 @@
def R1 : Core< 1, "%r1">, DwarfRegNum<[1]>;
def R2 : Core< 2, "%r2">, DwarfRegNum<[2]>;
def R3 : Core< 3, "%r3">, DwarfRegNum<[3]>;
-let CostPerUse=1 in {
+let CostPerUse=[1] in {
def R4 : Core< 4, "%r4">, DwarfRegNum<[4]>;
def R5 : Core< 5, "%r5">, DwarfRegNum<[5]>;
def R6 : Core< 6, "%r6">, DwarfRegNum<[6]>;
@@ -44,7 +44,7 @@
def R14 : Core<14, "%r14">, DwarfRegNum<[14]>;
def R15 : Core<15, "%r15">, DwarfRegNum<[15]>;
-let CostPerUse=1 in {
+let CostPerUse=[1] in {
def R16 : Core<16, "%r16">, DwarfRegNum<[16]>;
def R17 : Core<17, "%r17">, DwarfRegNum<[17]>;
def R18 : Core<18, "%r18">, DwarfRegNum<[18]>;
diff --git a/src/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.h b/src/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.h
index 1f1b27f..6a48562 100644
--- a/src/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.h
+++ b/src/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.h
@@ -29,6 +29,8 @@
class TargetMachine;
class ARCSubtarget : public ARCGenSubtargetInfo {
+ bool Xnorm = false;
+
virtual void anchor();
ARCInstrInfo InstrInfo;
ARCFrameLowering FrameLowering;
@@ -58,6 +60,8 @@
const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
+
+ bool hasNorm() const { return Xnorm; }
};
} // end namespace llvm
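A minimal stubbed sketch (the class below is a stand-in, not the real ARCSubtarget) of the plumbing this hunk and the FeatureNORM def add: parsing "+norm" sets Xnorm, and hasNorm() lets selection code gate NORM-based sequences on it.

```cpp
#include <string>

// Stand-in for the generated subtarget: FeatureNORM flips Xnorm, which
// hasNorm() exposes to the rest of the backend.
class StubARCSubtarget {
  bool Xnorm = false;

public:
  void applyFeature(const std::string &Feature) {
    if (Feature == "+norm")
      Xnorm = true;
  }
  bool hasNorm() const { return Xnorm; }
};

// e.g. a lowering hook would only pick a NORM-based sequence when the
// feature is present, otherwise fall back to a generic expansion.
bool shouldUseNorm(const StubARCSubtarget &ST) { return ST.hasNorm(); }
```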
diff --git a/src/llvm-project/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp b/src/llvm-project/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
index 611fd0e..b7033d0 100644
--- a/src/llvm-project/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
@@ -107,6 +107,9 @@
static DecodeStatus DecodeLdRLImmInstruction(MCInst &, uint64_t, uint64_t,
const void *);
+static DecodeStatus DecodeCCRU6Instruction(MCInst &, uint64_t, uint64_t,
+ const void *);
+
static DecodeStatus DecodeMoveHRegInstruction(MCInst &Inst, uint64_t, uint64_t,
const void *);
@@ -167,19 +170,19 @@
static bool DecodeSymbolicOperand(MCInst &Inst, uint64_t Address,
uint64_t Value, const void *Decoder) {
- static const uint64_t atLeast = 2;
+ static const uint64_t AtLeast = 2;
// TODO: Try to force emitter to use MCDisassembler* instead of void*.
auto Disassembler = static_cast<const MCDisassembler *>(Decoder);
return (nullptr != Disassembler &&
Disassembler->tryAddingSymbolicOperand(Inst, Value, Address, true, 0,
- atLeast));
+ AtLeast));
}
static void DecodeSymbolicOperandOff(MCInst &Inst, uint64_t Address,
uint64_t Offset, const void *Decoder) {
- uint64_t nextAddress = Address + Offset;
+ uint64_t NextAddress = Address + Offset;
- if (!DecodeSymbolicOperand(Inst, Address, nextAddress, Decoder))
+ if (!DecodeSymbolicOperand(Inst, Address, NextAddress, Decoder))
Inst.addOperand(MCOperand::createImm(Offset));
}
@@ -272,9 +275,9 @@
const void *Decoder) {
LLVM_DEBUG(dbgs() << "Decoding MOV_S h-register\n");
using Field = decltype(Insn);
- Field h = fieldFromInstruction(Insn, 5, 3) |
+ Field H = fieldFromInstruction(Insn, 5, 3) |
(fieldFromInstruction(Insn, 0, 2) << 3);
- Field g = fieldFromInstruction(Insn, 8, 3) |
+ Field G = fieldFromInstruction(Insn, 8, 3) |
(fieldFromInstruction(Insn, 3, 2) << 3);
auto DecodeRegisterOrImm = [&Inst, Address, Decoder](Field RegNum,
@@ -287,10 +290,25 @@
return DecodeGPR32RegisterClass(Inst, RegNum, Address, Decoder);
};
- if (MCDisassembler::Success != DecodeRegisterOrImm(g, 0))
+ if (MCDisassembler::Success != DecodeRegisterOrImm(G, 0))
return MCDisassembler::Fail;
- return DecodeRegisterOrImm(h, Insn >> 16u);
+ return DecodeRegisterOrImm(H, Insn >> 16u);
+}
+
+static DecodeStatus DecodeCCRU6Instruction(MCInst &Inst, uint64_t Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ unsigned DstB;
+ LLVM_DEBUG(dbgs() << "Decoding CCRU6 instruction:\n");
+ DstB = decodeBField(Insn);
+ DecodeGPR32RegisterClass(Inst, DstB, Address, Decoder);
+ using Field = decltype(Insn);
+ Field U6Field = fieldFromInstruction(Insn, 6, 11);
+ Inst.addOperand(MCOperand::createImm(U6Field));
+ Field CCField = fieldFromInstruction(Insn, 0, 4);
+ Inst.addOperand(MCOperand::createImm(CCField));
+ return MCDisassembler::Success;
}
DecodeStatus ARCDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
diff --git a/src/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp b/src/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp
index 8eefae5..f34b698 100644
--- a/src/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp
@@ -178,3 +178,30 @@
assert(Op.isImm() && "Predicate operand is immediate.");
O << ARCBRCondCodeToString((ARCCC::BRCondCode)Op.getImm());
}
+
+void ARCInstPrinter::printCCOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ O << ARCCondCodeToString((ARCCC::CondCode)MI->getOperand(OpNum).getImm());
+}
+
+void ARCInstPrinter::printU6ShiftedBy(unsigned ShiftBy, const MCInst *MI,
+ int OpNum, raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ if (MO.isImm()) {
+ unsigned Value = MO.getImm();
+ unsigned Value2 = Value >> ShiftBy;
+ if (Value2 > 0x3F || (Value2 << ShiftBy != Value)) {
+ errs() << "!!! Instruction has out-of-range U6 immediate operand:\n"
+ << " Opcode is " << MI->getOpcode() << "; operand value is "
+ << Value;
+ if (ShiftBy)
+ errs() << " scaled by " << (1 << ShiftBy) << "\n";
+ assert(false && "instruction has wrong format");
+ }
+ }
+ printOperand(MI, OpNum, O);
+}
+
+void ARCInstPrinter::printU6(const MCInst *MI, int OpNum, raw_ostream &O) {
+ printU6ShiftedBy(0, MI, OpNum, O);
+}
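A standalone restatement (illustrative, not part of the diff) of the check printU6ShiftedBy performs: the unscaled value must fit in 6 bits and shifting it back must not lose low bits.

```cpp
#include <cassert>

// Mirrors the error condition in printU6ShiftedBy above:
// invalid iff (Value >> ShiftBy) > 0x3F or the shift drops low bits.
static bool isValidScaledU6(unsigned Value, unsigned ShiftBy) {
  unsigned Unscaled = Value >> ShiftBy;
  return Unscaled <= 0x3F && (Unscaled << ShiftBy) == Value;
}

int main() {
  assert(isValidScaledU6(252, 2));  // 63 << 2, in range
  assert(!isValidScaledU6(253, 2)); // not a multiple of 4
  assert(!isValidScaledU6(256, 2)); // 64 does not fit in 6 bits
  assert(isValidScaledU6(63, 0));   // the plain printU6 case
  return 0;
}
```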
diff --git a/src/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h b/src/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h
index f6f8f9d..6f52e8f 100644
--- a/src/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h
+++ b/src/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h
@@ -33,6 +33,8 @@
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
const MCSubtargetInfo &STI, raw_ostream &O) override;
+ void printCCOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU6(const MCInst *MI, int OpNum, raw_ostream &O);
private:
void printMemOperandRI(const MCInst *MI, unsigned OpNum, raw_ostream &O);
@@ -44,6 +46,8 @@
void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printBRCCPredicateOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
+ void printU6ShiftedBy(unsigned ShiftBy, const MCInst *MI, int OpNum,
+ raw_ostream &O);
};
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARM.h b/src/llvm-project/llvm/lib/Target/ARM/ARM.h
index f4fdc98..5500783 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARM.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARM.h
@@ -48,7 +48,7 @@
FunctionPass *createMLxExpansionPass();
FunctionPass *createThumb2ITBlockPass();
FunctionPass *createMVEVPTBlockPass();
-FunctionPass *createMVEVPTOptimisationsPass();
+FunctionPass *createMVETPAndVPTOptimisationsPass();
FunctionPass *createARMOptimizeBarriersPass();
FunctionPass *createThumb2SizeReductionPass(
std::function<bool(const Function &)> Ftor = nullptr);
@@ -58,6 +58,7 @@
Pass *createMVEGatherScatterLoweringPass();
FunctionPass *createARMSLSHardeningPass();
FunctionPass *createARMIndirectThunks();
+Pass *createMVELaneInterleavingPass();
void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
ARMAsmPrinter &AP);
@@ -70,12 +71,13 @@
void initializeThumb2SizeReducePass(PassRegistry &);
void initializeThumb2ITBlockPass(PassRegistry &);
void initializeMVEVPTBlockPass(PassRegistry &);
-void initializeMVEVPTOptimisationsPass(PassRegistry &);
+void initializeMVETPAndVPTOptimisationsPass(PassRegistry &);
void initializeARMLowOverheadLoopsPass(PassRegistry &);
void initializeARMBlockPlacementPass(PassRegistry &);
void initializeMVETailPredicationPass(PassRegistry &);
void initializeMVEGatherScatterLoweringPass(PassRegistry &);
void initializeARMSLSHardeningPass(PassRegistry &);
+void initializeMVELaneInterleavingPass(PassRegistry &);
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARM.td b/src/llvm-project/llvm/lib/Target/ARM/ARM.td
index 3d0a0bf..5c1bed1 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARM.td
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARM.td
@@ -573,8 +573,9 @@
def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr",
"HardenSlsBlr", "true",
"Harden against straight line speculation across indirect calls">;
-
-
+def FeatureHardenSlsNoComdat : SubtargetFeature<"harden-sls-nocomdat",
+ "HardenSlsNoComdat", "true",
+ "Generate thunk code for SLS mitigation in the normal text section">;
//===----------------------------------------------------------------------===//
// ARM Processor subtarget features.
@@ -852,7 +853,7 @@
FeatureCRC,
FeatureRAS,
FeatureDotProd]>;
-def ARMv87a : Architecture<"armv8.7-a", "ARMv86a", [HasV8_7aOps,
+def ARMv87a : Architecture<"armv8.7-a", "ARMv87a", [HasV8_7aOps,
FeatureAClass,
FeatureDB,
FeatureFPARMv8,
@@ -1000,12 +1001,15 @@
FeatureVFP2,
FeatureHasSlowFPVMLx]>;
-def : Processor<"cortex-m0", ARMV6Itineraries, [ARMv6m]>;
-def : Processor<"cortex-m0plus", ARMV6Itineraries, [ARMv6m]>;
-def : Processor<"cortex-m1", ARMV6Itineraries, [ARMv6m]>;
-def : Processor<"sc000", ARMV6Itineraries, [ARMv6m]>;
+def : Processor<"cortex-m0", ARMV6Itineraries, [ARMv6m,
+ FeatureHasNoBranchPredictor]>;
+def : Processor<"cortex-m0plus", ARMV6Itineraries, [ARMv6m,
+ FeatureHasNoBranchPredictor]>;
+def : Processor<"cortex-m1", ARMV6Itineraries, [ARMv6m,
+ FeatureHasNoBranchPredictor]>;
+def : Processor<"sc000", ARMV6Itineraries, [ARMv6m,
+ FeatureHasNoBranchPredictor]>;
-def : Processor<"arm1176j-s", ARMV6Itineraries, [ARMv6kz]>;
def : Processor<"arm1176jz-s", ARMV6Itineraries, [ARMv6kz]>;
def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ARMv6kz,
FeatureVFP2,
@@ -1199,7 +1203,8 @@
FeatureUseMISched]>;
def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
- FeatureNoMovt]>;
+ FeatureNoMovt,
+ FeatureHasNoBranchPredictor]>;
def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline,
FeatureDSP,
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 04e2186..ba594b7 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1293,7 +1293,6 @@
const MachineFunction &MF = *MI->getParent()->getParent();
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
- unsigned FramePtr = STI.useR7AsFramePointer() ? ARM::R7 : ARM::R11;
// If we just ended a constant pool, mark it as such.
if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
@@ -2039,12 +2038,12 @@
if (STI.isTargetDarwin() || STI.isTargetWindows()) {
// These platforms always use the same frame register
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12)
- .addReg(FramePtr)
- .addReg(SrcReg)
- .addImm(0)
- // Predicate.
- .addImm(ARMCC::AL)
- .addReg(0));
+ .addReg(STI.getFramePointerReg())
+ .addReg(SrcReg)
+ .addImm(0)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
} else {
// If the calling code might use either R7 or R11 as
// frame pointer register, restore it into both.
@@ -2109,12 +2108,12 @@
if (STI.isTargetDarwin() || STI.isTargetWindows()) {
// These platforms always use the same frame register
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
- .addReg(FramePtr)
- .addReg(SrcReg)
- .addImm(0)
- // Predicate.
- .addImm(ARMCC::AL)
- .addReg(0));
+ .addReg(STI.getFramePointerReg())
+ .addReg(SrcReg)
+ .addImm(0)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
} else {
// If the calling code might use either R7 or R11 as
// frame pointer register, restore it into both.
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index e418d53..9b058ff 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1306,19 +1306,14 @@
case ARM::tSTRspi:
case ARM::VSTRD:
case ARM::VSTRS:
+ case ARM::VSTR_P0_off:
+ case ARM::MVE_VSTRWU32:
if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
MI.getOperand(2).getImm() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
return MI.getOperand(0).getReg();
}
break;
- case ARM::VSTR_P0_off:
- if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
- MI.getOperand(1).getImm() == 0) {
- FrameIndex = MI.getOperand(0).getIndex();
- return ARM::P0;
- }
- break;
case ARM::VST1q64:
case ARM::VST1d64TPseudo:
case ARM::VST1d64QPseudo:
@@ -1543,19 +1538,14 @@
case ARM::tLDRspi:
case ARM::VLDRD:
case ARM::VLDRS:
+ case ARM::VLDR_P0_off:
+ case ARM::MVE_VLDRWU32:
if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
MI.getOperand(2).getImm() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
return MI.getOperand(0).getReg();
}
break;
- case ARM::VLDR_P0_off:
- if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
- MI.getOperand(1).getImm() == 0) {
- FrameIndex = MI.getOperand(0).getIndex();
- return ARM::P0;
- }
- break;
case ARM::VLD1q64:
case ARM::VLD1d8TPseudo:
case ARM::VLD1d16TPseudo:
@@ -5391,7 +5381,9 @@
switch (MI.getOpcode()) {
case ARM::VSETLNi32:
+ case ARM::MVE_VMOV_to_lane_32:
// dX = VSETLNi32 dY, rZ, imm
+ // qX = MVE_VMOV_to_lane_32 qY, rZ, imm
const MachineOperand &MOBaseReg = MI.getOperand(1);
const MachineOperand &MOInsertedReg = MI.getOperand(2);
if (MOInsertedReg.isUndef())
@@ -5402,7 +5394,7 @@
InsertedReg.Reg = MOInsertedReg.getReg();
InsertedReg.SubReg = MOInsertedReg.getSubReg();
- InsertedReg.SubIdx = MOIndex.getImm() == 0 ? ARM::ssub_0 : ARM::ssub_1;
+ InsertedReg.SubIdx = ARM::ssub_0 + MOIndex.getImm();
return true;
}
llvm_unreachable("Target dependent opcode missing");
@@ -6120,6 +6112,7 @@
// Be conservative with ARMv8.1 MVE instructions.
if (Opc == ARM::t2BF_LabelPseudo || Opc == ARM::t2DoLoopStart ||
Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart ||
+ Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2WhileLoopStartTP ||
Opc == ARM::t2LoopDec || Opc == ARM::t2LoopEnd ||
Opc == ARM::t2LoopEndDec)
return outliner::InstrType::Illegal;
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/src/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 1b843c4..0ebba0d 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -366,7 +366,9 @@
bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override {
return MI->getOpcode() == ARM::t2LoopEndDec ||
- MI->getOpcode() == ARM::t2DoLoopStartTP;
+ MI->getOpcode() == ARM::t2DoLoopStartTP ||
+ MI->getOpcode() == ARM::t2WhileLoopStartLR ||
+ MI->getOpcode() == ARM::t2WhileLoopStartTP;
}
private:
@@ -644,11 +646,6 @@
Opc == ARM::t2BR_JT;
}
-static inline bool isLowOverheadTerminatorOpcode(int Opc) {
- return Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart ||
- Opc == ARM::t2LoopEnd || Opc == ARM::t2LoopEndDec;
-}
-
static inline
bool isIndirectBranchOpcode(int Opc) {
return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
@@ -888,8 +885,12 @@
return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0;
case ARMII::AddrModeT2_i8:
return std::abs(Imm) < (((1 << 8) * 1) - 1);
+ case ARMII::AddrMode2:
+ return std::abs(Imm) < (((1 << 12) * 1) - 1);
case ARMII::AddrModeT2_i12:
return Imm >= 0 && Imm < (((1 << 12) * 1) - 1);
+ case ARMII::AddrModeT2_i8s4:
+ return std::abs(Imm) < (((1 << 8) * 4) - 1) && Imm % 4 == 0;
default:
llvm_unreachable("Unhandled Addressing mode");
}
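As a worked example (not part of the diff), the two new addressing-mode cases accept the ranges below; the helper names are made up here, only the constants mirror the switch.

```cpp
#include <cassert>
#include <cstdlib>

// ARMII::AddrMode2: 12-bit unscaled offset, so |Imm| <= 4094 here.
static bool fitsAddrMode2(int Imm) {
  return std::abs(Imm) < (((1 << 12) * 1) - 1);
}

// ARMII::AddrModeT2_i8s4: 8-bit offset scaled by 4, word-aligned,
// so |Imm| <= 1020 and Imm % 4 == 0.
static bool fitsAddrModeT2_i8s4(int Imm) {
  return std::abs(Imm) < (((1 << 8) * 4) - 1) && Imm % 4 == 0;
}

int main() {
  assert(fitsAddrMode2(4094) && !fitsAddrMode2(4095));
  assert(fitsAddrModeT2_i8s4(-1020) && !fitsAddrModeT2_i8s4(1021));
  return 0;
}
```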
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 1a264da..4883e56 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -59,10 +59,6 @@
ARM_MC::initLLVMToCVRegMapping(this);
}
-static unsigned getFramePointerReg(const ARMSubtarget &STI) {
- return STI.useR7AsFramePointer() ? ARM::R7 : ARM::R11;
-}
-
const MCPhysReg*
ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const ARMSubtarget &STI = MF->getSubtarget<ARMSubtarget>();
@@ -79,6 +75,11 @@
return CSR_NoRegs_SaveList;
} else if (F.getCallingConv() == CallingConv::CFGuard_Check) {
return CSR_Win_AAPCS_CFGuard_Check_SaveList;
+ } else if (F.getCallingConv() == CallingConv::SwiftTail) {
+ return STI.isTargetDarwin()
+ ? CSR_iOS_SwiftTail_SaveList
+ : (UseSplitPush ? CSR_AAPCS_SplitPush_SwiftTail_SaveList
+ : CSR_AAPCS_SwiftTail_SaveList);
} else if (F.hasFnAttribute("interrupt")) {
if (STI.isMClass()) {
// M-class CPUs have hardware which saves the registers needed to allow a
@@ -129,6 +130,10 @@
return CSR_NoRegs_RegMask;
if (CC == CallingConv::CFGuard_Check)
return CSR_Win_AAPCS_CFGuard_Check_RegMask;
+ if (CC == CallingConv::SwiftTail) {
+ return STI.isTargetDarwin() ? CSR_iOS_SwiftTail_RegMask
+ : CSR_AAPCS_SwiftTail_RegMask;
+ }
if (STI.getTargetLowering()->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
return STI.isTargetDarwin() ? CSR_iOS_SwiftError_RegMask
@@ -197,7 +202,7 @@
markSuperRegs(Reserved, ARM::FPSCR);
markSuperRegs(Reserved, ARM::APSR_NZCV);
if (TFI->hasFP(MF))
- markSuperRegs(Reserved, getFramePointerReg(STI));
+ markSuperRegs(Reserved, STI.getFramePointerReg());
if (hasBasePointer(MF))
markSuperRegs(Reserved, BasePtr);
// Some targets reserve R9.
@@ -234,7 +239,7 @@
BitVector Reserved(getNumRegs());
markSuperRegs(Reserved, ARM::PC);
if (TFI->hasFP(MF))
- markSuperRegs(Reserved, getFramePointerReg(STI));
+ markSuperRegs(Reserved, STI.getFramePointerReg());
if (hasBasePointer(MF))
markSuperRegs(Reserved, BasePtr);
assert(checkAllSuperRegsMarked(Reserved));
@@ -403,7 +408,7 @@
// If we have stack realignment and VLAs, we have no pointer to use to
// access the stack. If we have stack realignment, and a large call frame,
// we have no place to allocate the emergency spill slot.
- if (needsStackRealignment(MF) && !TFI->hasReservedCallFrame(MF))
+ if (hasStackRealignment(MF) && !TFI->hasReservedCallFrame(MF))
return true;
// Thumb has trouble with negative offsets from the FP. Thumb2 has a limited
@@ -435,6 +440,7 @@
bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const ARMFrameLowering *TFI = getFrameLowering(MF);
+ const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
// We can't realign the stack if:
// 1. Dynamic stack realignment is explicitly disabled,
// 2. There are VLAs in the function and the base pointer is disabled.
@@ -442,7 +448,7 @@
return false;
// Stack realignment requires a frame pointer. If we already started
// register allocation with frame pointer elimination, it is too late now.
- if (!MRI->canReserveReg(getFramePointerReg(MF.getSubtarget<ARMSubtarget>())))
+ if (!MRI->canReserveReg(STI.getFramePointerReg()))
return false;
// We may also need a base pointer if there are dynamic allocas or stack
// pointer adjustments around calls.
@@ -458,8 +464,8 @@
const MachineFrameInfo &MFI = MF.getFrameInfo();
if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI.adjustsStack())
return true;
- return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken()
- || needsStackRealignment(MF);
+ return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+ hasStackRealignment(MF);
}
Register
@@ -468,7 +474,7 @@
const ARMFrameLowering *TFI = getFrameLowering(MF);
if (TFI->hasFP(MF))
- return getFramePointerReg(STI);
+ return STI.getFramePointerReg();
return ARM::SP;
}
@@ -909,3 +915,17 @@
}
return false;
}
+
+bool ARMBaseRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const {
+  // We can't extract an SPR from an arbitrary DPR (as opposed to a DPR_VFP2).
+ if (DefRC == &ARM::SPRRegClass && DefSubReg == 0 &&
+ SrcRC == &ARM::DPRRegClass &&
+ (SrcSubReg == ARM::ssub_0 || SrcSubReg == ARM::ssub_1))
+ return false;
+
+ return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg,
+ SrcRC, SrcSubReg);
+}
\ No newline at end of file
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/src/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
index 5afb6c6..57d7842 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -209,6 +209,11 @@
unsigned DstSubReg,
const TargetRegisterClass *NewRC,
LiveIntervals &LIS) const override;
+
+ bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const override;
};
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
index 9ba1600..5ea47f5 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
@@ -15,6 +15,7 @@
#include "ARMBaseInstrInfo.h"
#include "ARMBasicBlockInfo.h"
#include "ARMSubtarget.h"
+#include "MVETailPredUtils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
@@ -36,8 +37,10 @@
ARMBlockPlacement() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
- void moveBasicBlock(MachineBasicBlock *BB, MachineBasicBlock *After);
+ void moveBasicBlock(MachineBasicBlock *BB, MachineBasicBlock *Before);
bool blockIsBefore(MachineBasicBlock *BB, MachineBasicBlock *Other);
+ bool fixBackwardsWLS(MachineLoop *ML);
+ bool processPostOrderLoops(MachineLoop *ML);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -57,9 +60,97 @@
INITIALIZE_PASS(ARMBlockPlacement, DEBUG_TYPE, "ARM block placement", false,
false)
+static MachineInstr *findWLSInBlock(MachineBasicBlock *MBB) {
+ for (auto &Terminator : MBB->terminators()) {
+ if (isWhileLoopStart(Terminator))
+ return &Terminator;
+ }
+ return nullptr;
+}
+
+/// Find WhileLoopStart in the loop predecessor BB or otherwise in its only
+/// predecessor. If found, returns the WLS instruction, otherwise nullptr.
+static MachineInstr *findWLS(MachineLoop *ML) {
+ MachineBasicBlock *Predecessor = ML->getLoopPredecessor();
+ if (!Predecessor)
+ return nullptr;
+ MachineInstr *WlsInstr = findWLSInBlock(Predecessor);
+ if (WlsInstr)
+ return WlsInstr;
+ if (Predecessor->pred_size() == 1)
+ return findWLSInBlock(*Predecessor->pred_begin());
+ return nullptr;
+}
+
+/// Checks if loop has a backwards branching WLS, and if possible, fixes it.
+/// This requires checking the predecessor (i.e. the preheader or its
+/// predecessor) for a WLS and whether its loopExit/target is before it.
+/// If moving the predecessor won't convert a WLS (to the predecessor) from
+/// a forward to a backward branching WLS, then move the predecessor block
+/// to before the loopExit/target.
+bool ARMBlockPlacement::fixBackwardsWLS(MachineLoop *ML) {
+ MachineInstr *WlsInstr = findWLS(ML);
+ if (!WlsInstr)
+ return false;
+
+ MachineBasicBlock *Predecessor = WlsInstr->getParent();
+ MachineBasicBlock *LoopExit = getWhileLoopStartTargetBB(*WlsInstr);
+
+ // We don't want to move Preheader to before the function's entry block.
+ if (!LoopExit->getPrevNode())
+ return false;
+ if (blockIsBefore(Predecessor, LoopExit))
+ return false;
+ LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Found a backwards WLS from "
+ << Predecessor->getFullName() << " to "
+ << LoopExit->getFullName() << "\n");
+
+ // Make sure no forward branching WLSs to the Predecessor become backwards
+ // branching. An example loop structure where the Predecessor can't be moved,
+  // since bb2's WLS will become backwards once bb3 is moved before/above bb1.
+ //
+ // bb1: - LoopExit
+ // bb2:
+ // WLS bb3
+ // bb3: - Predecessor
+ // WLS bb1
+ // bb4: - Header
+ for (auto It = ++LoopExit->getIterator(); It != Predecessor->getIterator();
+ ++It) {
+ MachineBasicBlock *MBB = &*It;
+ for (auto &Terminator : MBB->terminators()) {
+ if (!isWhileLoopStart(Terminator))
+ continue;
+ MachineBasicBlock *WLSTarget = getWhileLoopStartTargetBB(Terminator);
+ // TODO: Analyse the blocks to make a decision if it would be worth
+ // moving Preheader even if we'd introduce a backwards WLS
+ if (WLSTarget == Predecessor) {
+ LLVM_DEBUG(
+ dbgs() << DEBUG_PREFIX
+ << "Can't move Predecessor"
+ "block as it would convert a WLS from forward to a "
+ "backwards branching WLS\n");
+ return false;
+ }
+ }
+ }
+
+ moveBasicBlock(Predecessor, LoopExit);
+ return true;
+}
+
+/// Updates ordering (of WLS BB and their loopExits) in inner loops first
+/// Returns true if any change was made in any of the loops
+bool ARMBlockPlacement::processPostOrderLoops(MachineLoop *ML) {
+ bool Changed = false;
+ for (auto *InnerML : *ML)
+ Changed |= processPostOrderLoops(InnerML);
+ return Changed | fixBackwardsWLS(ML);
+}
+
bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
- return false;
+ return false;
const ARMSubtarget &ST = static_cast<const ARMSubtarget &>(MF.getSubtarget());
if (!ST.hasLOB())
return false;
@@ -72,108 +163,9 @@
BBUtils->adjustBBOffsetsAfter(&MF.front());
bool Changed = false;
- // Find loops with a backwards branching WLS.
- // This requires looping over the loops in the function, checking each
- // preheader for a WLS and if its target is before the preheader. If moving
- // the target block wouldn't produce another backwards WLS or a new forwards
- // LE branch then move the target block after the preheader.
- for (auto *ML : *MLI) {
- MachineBasicBlock *Preheader = ML->getLoopPredecessor();
- if (!Preheader)
- continue;
-
- for (auto &Terminator : Preheader->terminators()) {
- if (Terminator.getOpcode() != ARM::t2WhileLoopStart)
- continue;
- MachineBasicBlock *LoopExit = Terminator.getOperand(1).getMBB();
- // We don't want to move the function's entry block.
- if (!LoopExit->getPrevNode())
- continue;
- if (blockIsBefore(Preheader, LoopExit))
- continue;
- LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Found a backwards WLS from "
- << Preheader->getFullName() << " to "
- << LoopExit->getFullName() << "\n");
-
- // Make sure that moving the target block doesn't cause any of its WLSs
- // that were previously not backwards to become backwards
- bool CanMove = true;
- for (auto &LoopExitTerminator : LoopExit->terminators()) {
- if (LoopExitTerminator.getOpcode() != ARM::t2WhileLoopStart)
- continue;
- // An example loop structure where the LoopExit can't be moved, since
- // bb1's WLS will become backwards once it's moved after bb3 bb1: -
- // LoopExit
- // WLS bb2 - LoopExit2
- // bb2:
- // ...
- // bb3: - Preheader
- // WLS bb1
- // bb4: - Header
- MachineBasicBlock *LoopExit2 =
- LoopExitTerminator.getOperand(1).getMBB();
- // If the WLS from LoopExit to LoopExit2 is already backwards then
- // moving LoopExit won't affect it, so it can be moved. If LoopExit2 is
- // after the Preheader then moving will keep it as a forward branch, so
- // it can be moved. If LoopExit2 is between the Preheader and LoopExit
- // then moving LoopExit will make it a backwards branch, so it can't be
- // moved since we'd fix one and introduce one backwards branch.
- // TODO: Analyse the blocks to make a decision if it would be worth
- // moving LoopExit even if LoopExit2 is between the Preheader and
- // LoopExit.
- if (!blockIsBefore(LoopExit2, LoopExit) &&
- (LoopExit2 == Preheader || blockIsBefore(LoopExit2, Preheader))) {
- LLVM_DEBUG(dbgs() << DEBUG_PREFIX
- << "Can't move the target block as it would "
- "introduce a new backwards WLS branch\n");
- CanMove = false;
- break;
- }
- }
-
- if (CanMove) {
- // Make sure no LEs become forwards.
- // An example loop structure where the LoopExit can't be moved, since
- // bb2's LE will become forwards once bb1 is moved after bb3.
- // bb1: - LoopExit
- // bb2:
- // LE bb1 - Terminator
- // bb3: - Preheader
- // WLS bb1
- // bb4: - Header
- for (auto It = LoopExit->getIterator(); It != Preheader->getIterator();
- It++) {
- MachineBasicBlock *MBB = &*It;
- for (auto &Terminator : MBB->terminators()) {
- if (Terminator.getOpcode() != ARM::t2LoopEndDec)
- continue;
- MachineBasicBlock *LETarget = Terminator.getOperand(2).getMBB();
- // The LE will become forwards branching if it branches to LoopExit
- // which isn't allowed by the architecture, so we should avoid
- // introducing these.
- // TODO: Analyse the blocks to make a decision if it would be worth
- // moving LoopExit even if we'd introduce a forwards LE
- if (LETarget == LoopExit) {
- LLVM_DEBUG(dbgs() << DEBUG_PREFIX
- << "Can't move the target block as it would "
- "introduce a new forwards LE branch\n");
- CanMove = false;
- break;
- }
- }
- }
-
- if (!CanMove)
- break;
- }
-
- if (CanMove) {
- moveBasicBlock(LoopExit, Preheader);
- Changed = true;
- break;
- }
- }
- }
+ // Find loops with a backwards-branching WLS and fix them if possible.
+ for (auto *ML : *MLI)
+ Changed |= processPostOrderLoops(ML);
return Changed;
}
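For orientation, the condition the removed code was testing (and which the new processPostOrderLoops helper presumably still tests per loop) reduces to comparing the layout offsets of the branch's source and target blocks. A minimal standalone sketch, where BlockInfo and isBackwardsBranch are invented stand-ins for the BasicBlockUtils data rather than anything from the pass:

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical stand-in for the per-block layout data kept by BasicBlockUtils.
struct BlockInfo {
  const char *Name;
  uint64_t Offset; // layout offset of the block within the function
};

// A branch is "backwards" when its target is laid out at a lower offset than
// the block containing it -- the condition blockIsBefore() tests.
static bool isBackwardsBranch(const BlockInfo &From, const BlockInfo &To) {
  return To.Offset < From.Offset;
}

int main() {
  BlockInfo LoopExit{"bb1", 0x20};  // WLS target
  BlockInfo Preheader{"bb3", 0x60}; // block containing the WLS
  std::cout << "WLS " << Preheader.Name << " -> " << LoopExit.Name
            << (isBackwardsBranch(Preheader, LoopExit) ? " is backwards\n"
                                                       : " is forwards\n");
  return 0;
}
```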
@@ -183,17 +175,24 @@
return BBUtils->getOffsetOf(Other) > BBUtils->getOffsetOf(BB);
}
+// Moves a BasicBlock before another, without changing the control flow
void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB,
- MachineBasicBlock *After) {
- LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Moving " << BB->getName() << " after "
- << After->getName() << "\n");
+ MachineBasicBlock *Before) {
+ LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Moving " << BB->getName() << " before "
+ << Before->getName() << "\n");
MachineBasicBlock *BBPrevious = BB->getPrevNode();
assert(BBPrevious && "Cannot move the function entry basic block");
- MachineBasicBlock *AfterNext = After->getNextNode();
MachineBasicBlock *BBNext = BB->getNextNode();
- BB->moveAfter(After);
+ MachineBasicBlock *BeforePrev = Before->getPrevNode();
+ assert(BeforePrev &&
+ "Cannot move the given block to before the function entry block");
+ MachineFunction *F = BB->getParent();
+ BB->moveBefore(Before);
+ // Since only the blocks are being moved around (the control flow must not
+ // change), any fall-through to/from an adjacent block is replaced with an
+ // unconditional branch to the fall-through block.
auto FixFallthrough = [&](MachineBasicBlock *From, MachineBasicBlock *To) {
LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Checking for fallthrough from "
<< From->getName() << " to " << To->getName() << "\n");
@@ -217,12 +216,14 @@
// Fix fall-through to the moved BB from the one that used to be before it.
if (BBPrevious->isSuccessor(BB))
FixFallthrough(BBPrevious, BB);
- // Fix fall through from the destination BB to the one that used to follow.
- if (AfterNext && After->isSuccessor(AfterNext))
- FixFallthrough(After, AfterNext);
+ // Fix fall through to the destination BB from the one that used to be before it.
+ if (BeforePrev->isSuccessor(Before))
+ FixFallthrough(BeforePrev, Before);
// Fix fall through from the moved BB to the one that used to follow.
if (BBNext && BB->isSuccessor(BBNext))
FixFallthrough(BB, BBNext);
- BBUtils->adjustBBOffsetsAfter(After);
+ F->RenumberBlocks();
+ BBUtils->computeAllBlockSizes();
+ BBUtils->adjustBBOffsetsAfter(&F->front());
}
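As a rough illustration of the fall-through repair described above, here is a toy model; Block, fixFallthroughs and the layout list are invented for this sketch, and MachineBasicBlock of course works quite differently:

```cpp
#include <iostream>
#include <iterator>
#include <list>
#include <string>

// Toy model of a block: it either falls through to a named block or ends in
// an explicit unconditional branch.
struct Block {
  std::string Name;
  std::string FallthroughTo; // empty if the block already ends in a branch
  std::string BranchTo;      // explicit unconditional branch target, if any
};

// After reordering, any fall-through whose target is no longer the very next
// block in the layout must become an explicit unconditional branch.
static void fixFallthroughs(std::list<Block> &Layout) {
  for (auto It = Layout.begin(); It != Layout.end(); ++It) {
    if (It->FallthroughTo.empty())
      continue;
    auto Next = std::next(It);
    if (Next != Layout.end() && Next->Name == It->FallthroughTo)
      continue; // still adjacent, fall-through remains valid
    It->BranchTo = It->FallthroughTo; // materialize the branch
    It->FallthroughTo.clear();
  }
}

int main() {
  // bb1 originally fell through to bb2; bb2 has been moved to the end.
  std::list<Block> Layout = {{"bb1", "bb2", ""}, {"bb3", "", ""}, {"bb2", "", ""}};
  fixFallthroughs(Layout);
  for (const Block &B : Layout)
    std::cout << B.Name << (B.BranchTo.empty() ? "" : " -> B " + B.BranchTo)
              << "\n";
  return 0;
}
```

Running it prints an explicit branch from bb1 to bb2, since bb2 is no longer bb1's layout successor.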
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp
index 6feed82..aff7ec8 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -87,12 +87,12 @@
/// function return values and call parameters).
struct ARMOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
ARMOutgoingValueHandler(MachineIRBuilder &MIRBuilder,
- MachineRegisterInfo &MRI, MachineInstrBuilder &MIB,
- CCAssignFn *AssignFn)
- : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+ MachineRegisterInfo &MRI, MachineInstrBuilder &MIB)
+ : OutgoingValueHandler(MIRBuilder, MRI), MIB(MIB) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
- MachinePointerInfo &MPO) override {
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
"Unsupported size");
@@ -121,19 +121,15 @@
MIB.addUse(PhysReg, RegState::Implicit);
}
- void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
- "Unsupported size");
-
Register ExtReg = extendRegister(ValVReg, VA);
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
- MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(),
- Align(1));
+ MPO, MachineMemOperand::MOStore, MemTy, Align(1));
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
- unsigned assignCustomValue(const CallLowering::ArgInfo &Arg,
+ unsigned assignCustomValue(CallLowering::ArgInfo &Arg,
ArrayRef<CCValAssign> VAs) override {
assert(Arg.Regs.size() == 1 && "Can't handle multiple regs yet");
@@ -168,69 +164,11 @@
return 1;
}
- bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
- CCState &State) override {
- if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State))
- return true;
-
- StackSize =
- std::max(StackSize, static_cast<uint64_t>(State.getNextStackOffset()));
- return false;
- }
-
- MachineInstrBuilder &MIB;
- uint64_t StackSize = 0;
+ MachineInstrBuilder MIB;
};
} // end anonymous namespace
-void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- MachineFunction &MF) const {
- const ARMTargetLowering &TLI = *getTLI<ARMTargetLowering>();
- LLVMContext &Ctx = OrigArg.Ty->getContext();
- const DataLayout &DL = MF.getDataLayout();
- const Function &F = MF.getFunction();
-
- SmallVector<EVT, 4> SplitVTs;
- ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, nullptr, nullptr, 0);
- assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch");
-
- if (SplitVTs.size() == 1) {
- // Even if there is no splitting to do, we still want to replace the
- // original type (e.g. pointer type -> integer).
- auto Flags = OrigArg.Flags[0];
- Flags.setOrigAlign(DL.getABITypeAlign(OrigArg.Ty));
- SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
- Flags, OrigArg.IsFixed);
- return;
- }
-
- // Create one ArgInfo for each virtual register.
- for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) {
- EVT SplitVT = SplitVTs[i];
- Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
- auto Flags = OrigArg.Flags[0];
-
- Flags.setOrigAlign(DL.getABITypeAlign(SplitTy));
-
- bool NeedsConsecutiveRegisters =
- TLI.functionArgumentNeedsConsecutiveRegisters(
- SplitTy, F.getCallingConv(), F.isVarArg());
- if (NeedsConsecutiveRegisters) {
- Flags.setInConsecutiveRegs();
- if (i == e - 1)
- Flags.setInConsecutiveRegsLast();
- }
-
- // FIXME: We also want to split SplitTy further.
- Register PartReg = OrigArg.Regs[i];
- SplitArgs.emplace_back(PartReg, SplitTy, Flags, OrigArg.IsFixed);
- }
-}
-
/// Lower the return value for the already existing \p Ret. This assumes that
/// \p MIRBuilder's insertion point is correct.
bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
@@ -243,23 +181,25 @@
auto &MF = MIRBuilder.getMF();
const auto &F = MF.getFunction();
- auto DL = MF.getDataLayout();
+ const auto &DL = MF.getDataLayout();
auto &TLI = *getTLI<ARMTargetLowering>();
if (!isSupportedType(DL, TLI, Val->getType()))
return false;
- ArgInfo OrigRetInfo(VRegs, Val->getType());
+ ArgInfo OrigRetInfo(VRegs, Val->getType(), 0);
setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
SmallVector<ArgInfo, 4> SplitRetInfos;
- splitToValueTypes(OrigRetInfo, SplitRetInfos, MF);
+ splitToValueTypes(OrigRetInfo, SplitRetInfos, DL, F.getCallingConv());
CCAssignFn *AssignFn =
TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg());
- ARMOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret,
- AssignFn);
- return handleAssignments(MIRBuilder, SplitRetInfos, RetHandler);
+ OutgoingValueAssigner RetAssigner(AssignFn);
+ ARMOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret);
+ return determineAndHandleAssignments(RetHandler, RetAssigner, SplitRetInfos,
+ MIRBuilder, F.getCallingConv(),
+ F.isVarArg());
}
bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -284,48 +224,50 @@
/// formal arguments and call return values).
struct ARMIncomingValueHandler : public CallLowering::IncomingValueHandler {
ARMIncomingValueHandler(MachineIRBuilder &MIRBuilder,
- MachineRegisterInfo &MRI, CCAssignFn AssignFn)
- : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
+ MachineRegisterInfo &MRI)
+ : IncomingValueHandler(MIRBuilder, MRI) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
- MachinePointerInfo &MPO) override {
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
"Unsupported size");
auto &MFI = MIRBuilder.getMF().getFrameInfo();
- int FI = MFI.CreateFixedObject(Size, Offset, true);
+ // Byval is assumed to be writable memory, but other stack passed arguments
+ // are not.
+ const bool IsImmutable = !Flags.isByVal();
+
+ int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
return MIRBuilder.buildFrameIndex(LLT::pointer(MPO.getAddrSpace(), 32), FI)
.getReg(0);
}
- void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
- "Unsupported size");
-
if (VA.getLocInfo() == CCValAssign::SExt ||
VA.getLocInfo() == CCValAssign::ZExt) {
// If the value is zero- or sign-extended, its size becomes 4 bytes, so
// that's what we should load.
- Size = 4;
+ MemTy = LLT::scalar(32);
assert(MRI.getType(ValVReg).isScalar() && "Only scalars supported atm");
- auto LoadVReg = buildLoad(LLT::scalar(32), Addr, Size, MPO);
+ auto LoadVReg = buildLoad(LLT::scalar(32), Addr, MemTy, MPO);
MIRBuilder.buildTrunc(ValVReg, LoadVReg);
} else {
// If the value is not extended, a simple load will suffice.
- buildLoad(ValVReg, Addr, Size, MPO);
+ buildLoad(ValVReg, Addr, MemTy, MPO);
}
}
- MachineInstrBuilder buildLoad(const DstOp &Res, Register Addr, uint64_t Size,
+ MachineInstrBuilder buildLoad(const DstOp &Res, Register Addr, LLT MemTy,
MachinePointerInfo &MPO) {
MachineFunction &MF = MIRBuilder.getMF();
- auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size,
+ auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, MemTy,
inferAlignFromPtrInfo(MF, MPO));
return MIRBuilder.buildLoad(Res, Addr, *MMO);
}
@@ -355,7 +297,7 @@
}
}
- unsigned assignCustomValue(const ARMCallLowering::ArgInfo &Arg,
+ unsigned assignCustomValue(ARMCallLowering::ArgInfo &Arg,
ArrayRef<CCValAssign> VAs) override {
assert(Arg.Regs.size() == 1 && "Can't handle multiple regs yet");
@@ -398,9 +340,8 @@
};
struct FormalArgHandler : public ARMIncomingValueHandler {
- FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- CCAssignFn AssignFn)
- : ARMIncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+ : ARMIncomingValueHandler(MIRBuilder, MRI) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIRBuilder.getMRI()->addLiveIn(PhysReg);
@@ -429,7 +370,7 @@
auto &MF = MIRBuilder.getMF();
auto &MBB = MIRBuilder.getMBB();
- auto DL = MF.getDataLayout();
+ const auto &DL = MF.getDataLayout();
for (auto &Arg : F.args()) {
if (!isSupportedType(DL, TLI, Arg.getType()))
@@ -441,16 +382,16 @@
CCAssignFn *AssignFn =
TLI.CCAssignFnForCall(F.getCallingConv(), F.isVarArg());
- FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(),
- AssignFn);
+ OutgoingValueAssigner ArgAssigner(AssignFn);
+ FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo());
SmallVector<ArgInfo, 8> SplitArgInfos;
unsigned Idx = 0;
for (auto &Arg : F.args()) {
- ArgInfo OrigArgInfo(VRegs[Idx], Arg.getType());
+ ArgInfo OrigArgInfo(VRegs[Idx], Arg.getType(), Idx);
setArgFlags(OrigArgInfo, Idx + AttributeList::FirstArgIndex, DL, F);
- splitToValueTypes(OrigArgInfo, SplitArgInfos, MF);
+ splitToValueTypes(OrigArgInfo, SplitArgInfos, DL, F.getCallingConv());
Idx++;
}
@@ -458,7 +399,9 @@
if (!MBB.empty())
MIRBuilder.setInstr(*MBB.begin());
- if (!handleAssignments(MIRBuilder, SplitArgInfos, ArgHandler))
+ if (!determineAndHandleAssignments(ArgHandler, ArgAssigner, SplitArgInfos,
+ MIRBuilder, F.getCallingConv(),
+ F.isVarArg()))
return false;
// Move back to the end of the basic block.
@@ -470,8 +413,8 @@
struct CallReturnHandler : public ARMIncomingValueHandler {
CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB, CCAssignFn *AssignFn)
- : ARMIncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+ MachineInstrBuilder MIB)
+ : ARMIncomingValueHandler(MIRBuilder, MRI), MIB(MIB) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIB.addDef(PhysReg, RegState::Implicit);
@@ -546,12 +489,14 @@
if (Arg.Flags[0].isByVal())
return false;
- splitToValueTypes(Arg, ArgInfos, MF);
+ splitToValueTypes(Arg, ArgInfos, DL, Info.CallConv);
}
auto ArgAssignFn = TLI.CCAssignFnForCall(Info.CallConv, Info.IsVarArg);
- ARMOutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn);
- if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler))
+ OutgoingValueAssigner ArgAssigner(ArgAssignFn);
+ ARMOutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB);
+ if (!determineAndHandleAssignments(ArgHandler, ArgAssigner, ArgInfos,
+ MIRBuilder, Info.CallConv, Info.IsVarArg))
return false;
// Now we can add the actual call instruction to the correct basic block.
@@ -562,19 +507,24 @@
return false;
ArgInfos.clear();
- splitToValueTypes(Info.OrigRet, ArgInfos, MF);
+ splitToValueTypes(Info.OrigRet, ArgInfos, DL, Info.CallConv);
auto RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv, Info.IsVarArg);
- CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn);
- if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler))
+ OutgoingValueAssigner Assigner(RetAssignFn);
+ CallReturnHandler RetHandler(MIRBuilder, MRI, MIB);
+ if (!determineAndHandleAssignments(RetHandler, Assigner, ArgInfos,
+ MIRBuilder, Info.CallConv,
+ Info.IsVarArg))
return false;
}
// We now know the size of the stack - update the ADJCALLSTACKDOWN
// accordingly.
- CallSeqStart.addImm(ArgHandler.StackSize).addImm(0).add(predOps(ARMCC::AL));
+ CallSeqStart.addImm(ArgAssigner.StackOffset)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
MIRBuilder.buildInstr(ARM::ADJCALLSTACKUP)
- .addImm(ArgHandler.StackSize)
+ .addImm(ArgAssigner.StackOffset)
.addImm(0)
.add(predOps(ARMCC::AL));
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.h b/src/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.h
index 3be73d4..87b18f81 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.h
@@ -47,12 +47,6 @@
bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val,
ArrayRef<Register> VRegs,
MachineInstrBuilder &Ret) const;
-
- /// Split an argument into one or more arguments that the CC lowering can cope
- /// with.
- void splitToValueTypes(const ArgInfo &OrigArg,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp
index 67c822a..d8d9ca3 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp
@@ -256,22 +256,26 @@
}
PendingMembers.clear();
return true;
- } else if (LocVT != MVT::i32)
+ }
+
+ if (LocVT != MVT::i32)
RegList = SRegList;
// Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core)
for (auto Reg : RegList)
State.AllocateReg(Reg);
+ // Clamp the alignment between 4 and 8.
+ if (State.getMachineFunction().getSubtarget<ARMSubtarget>().isTargetAEABI())
+ Alignment = ArgFlags.getNonZeroMemAlign() <= 4 ? Align(4) : Align(8);
+
// After the first item has been allocated, the rest are packed as tightly as
// possible. (E.g. an incoming i64 would have starting Align of 8, but we'll
// be allocating a bunch of i32 slots).
- const Align RestAlign = std::min(Alignment, Align(Size));
-
for (auto &It : PendingMembers) {
It.convertToMem(State.AllocateStack(Size, Alignment));
State.addLoc(It);
- Alignment = RestAlign;
+ Alignment = Align(1);
}
// All pending members have now been allocated
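To make the packing rule concrete, a small worked sketch with assumed numbers (alignTo and the offsets are for illustration only; the real allocation goes through CCState::AllocateStack): only the first pending member is placed at the clamped alignment, the rest are packed immediately after it.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

static uint64_t alignTo(uint64_t Offset, uint64_t Alignment) {
  return (Offset + Alignment - 1) / Alignment * Alignment;
}

int main() {
  uint64_t MemAlign = 16;                   // requested alignment of the aggregate
  uint64_t Clamped = MemAlign <= 4 ? 4 : 8; // the AAPCS clamp to [4, 8]
  uint64_t SlotSize = 4;                    // each pending member is an i32 slot
  uint64_t NextStackOffset = 4;             // whatever was already allocated

  std::vector<uint64_t> Offsets;
  uint64_t Alignment = Clamped;
  for (int i = 0; i < 3; ++i) {             // three pending i32 members
    uint64_t Off = alignTo(NextStackOffset, Alignment);
    Offsets.push_back(Off);
    NextStackOffset = Off + SlotSize;
    Alignment = 1;                          // the rest are packed tightly
  }
  for (uint64_t Off : Offsets)
    std::cout << "member at stack offset " << Off << "\n";
  return 0;
}
```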
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td b/src/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td
index 3517274..a6dbe56 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -278,6 +278,9 @@
// R8 is used to pass swifterror, remove it from CSR.
def CSR_AAPCS_SwiftError : CalleeSavedRegs<(sub CSR_AAPCS, R8)>;
+// R10 is used to pass swiftself, remove it from CSR.
+def CSR_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_AAPCS, R10)>;
+
// The order of callee-saved registers needs to match the order we actually push
// them in FrameLowering, because this order is what's used by
// PrologEpilogInserter to allocate frame index slots. So when R7 is the frame
@@ -290,6 +293,10 @@
def CSR_AAPCS_SplitPush_SwiftError : CalleeSavedRegs<(sub CSR_AAPCS_SplitPush,
R8)>;
+// R10 is used to pass swiftself, remove it from CSR.
+def CSR_AAPCS_SplitPush_SwiftTail : CalleeSavedRegs<(sub CSR_AAPCS_SplitPush,
+ R10)>;
+
// Constructors and destructors return 'this' in the ARM C++ ABI; since 'this'
// and the pointer return value are both passed in R0 in these cases, this can
// be partially modelled by treating R0 as a callee-saved register
@@ -305,6 +312,9 @@
// R8 is used to pass swifterror, remove it from CSR.
def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R8)>;
+// R10 is used to pass swiftself, remove it from CSR.
+def CSR_iOS_SwiftTail : CalleeSavedRegs<(sub CSR_iOS, R10)>;
+
def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
(sub CSR_AAPCS_ThisReturn, R9))>;
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index 630490f..e15826f 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -297,12 +297,11 @@
void ARMConstantIslands::verify() {
#ifndef NDEBUG
BBInfoVector &BBInfo = BBUtils->getBBInfo();
- assert(std::is_sorted(MF->begin(), MF->end(),
- [&BBInfo](const MachineBasicBlock &LHS,
+ assert(is_sorted(*MF, [&BBInfo](const MachineBasicBlock &LHS,
const MachineBasicBlock &RHS) {
- return BBInfo[LHS.getNumber()].postOffset() <
- BBInfo[RHS.getNumber()].postOffset();
- }));
+ return BBInfo[LHS.getNumber()].postOffset() <
+ BBInfo[RHS.getNumber()].postOffset();
+ }));
LLVM_DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n");
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
CPUser &U = CPUsers[i];
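Restated in standalone form (an assumed simplification in which BBInfoVector is reduced to a plain vector of offsets), the layout invariant that verify() asserts amounts to:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

struct BasicBlockInfo {
  uint64_t PostOffset; // offset just past the end of the block
};

int main() {
  // Offsets in layout order; equal adjacent offsets (empty blocks) are fine.
  std::vector<BasicBlockInfo> BBInfo = {{0}, {16}, {48}, {48}, {96}};
  assert(std::is_sorted(BBInfo.begin(), BBInfo.end(),
                        [](const BasicBlockInfo &LHS, const BasicBlockInfo &RHS) {
                          return LHS.PostOffset < RHS.PostOffset;
                        }) &&
         "basic blocks must appear in non-decreasing offset order");
  return 0;
}
```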
@@ -851,7 +850,9 @@
case ARM::LDRcp:
case ARM::t2LDRpci:
case ARM::t2LDRHpci:
+ case ARM::t2LDRSHpci:
case ARM::t2LDRBpci:
+ case ARM::t2LDRSBpci:
Bits = 12; // +-offset_12
NegOk = true;
break;
@@ -1985,7 +1986,8 @@
LLVM_DEBUG(dbgs() << "Fold: " << *Cmp.MI << " and: " << *Br.MI);
MachineInstr *NewBR =
BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(Cmp.NewOpc))
- .addReg(Reg, getKillRegState(RegKilled))
+ .addReg(Reg, getKillRegState(RegKilled) |
+ getRegState(Cmp.MI->getOperand(0)))
.addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags());
Cmp.MI->eraseFromParent();
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index a7f1765..2167ad5 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -190,40 +190,73 @@
{ ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, true, true, EvenDblSpc, 1, 8 ,true},
{ ARM::VLD1d16QPseudo, ARM::VLD1d16Q, true, false, false, SingleSpc, 4, 4 ,false},
+{ ARM::VLD1d16QPseudoWB_fixed, ARM::VLD1d16Qwb_fixed, true, true, false, SingleSpc, 4, 4 ,false},
+{ ARM::VLD1d16QPseudoWB_register, ARM::VLD1d16Qwb_register, true, true, true, SingleSpc, 4, 4 ,false},
{ ARM::VLD1d16TPseudo, ARM::VLD1d16T, true, false, false, SingleSpc, 3, 4 ,false},
+{ ARM::VLD1d16TPseudoWB_fixed, ARM::VLD1d16Twb_fixed, true, true, false, SingleSpc, 3, 4 ,false},
+{ ARM::VLD1d16TPseudoWB_register, ARM::VLD1d16Twb_register, true, true, true, SingleSpc, 3, 4 ,false},
+
{ ARM::VLD1d32QPseudo, ARM::VLD1d32Q, true, false, false, SingleSpc, 4, 2 ,false},
+{ ARM::VLD1d32QPseudoWB_fixed, ARM::VLD1d32Qwb_fixed, true, true, false, SingleSpc, 4, 2 ,false},
+{ ARM::VLD1d32QPseudoWB_register, ARM::VLD1d32Qwb_register, true, true, true, SingleSpc, 4, 2 ,false},
{ ARM::VLD1d32TPseudo, ARM::VLD1d32T, true, false, false, SingleSpc, 3, 2 ,false},
+{ ARM::VLD1d32TPseudoWB_fixed, ARM::VLD1d32Twb_fixed, true, true, false, SingleSpc, 3, 2 ,false},
+{ ARM::VLD1d32TPseudoWB_register, ARM::VLD1d32Twb_register, true, true, true, SingleSpc, 3, 2 ,false},
+
{ ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, false, SingleSpc, 4, 1 ,false},
{ ARM::VLD1d64QPseudoWB_fixed, ARM::VLD1d64Qwb_fixed, true, true, false, SingleSpc, 4, 1 ,false},
{ ARM::VLD1d64QPseudoWB_register, ARM::VLD1d64Qwb_register, true, true, true, SingleSpc, 4, 1 ,false},
{ ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, false, SingleSpc, 3, 1 ,false},
{ ARM::VLD1d64TPseudoWB_fixed, ARM::VLD1d64Twb_fixed, true, true, false, SingleSpc, 3, 1 ,false},
{ ARM::VLD1d64TPseudoWB_register, ARM::VLD1d64Twb_register, true, true, true, SingleSpc, 3, 1 ,false},
+
{ ARM::VLD1d8QPseudo, ARM::VLD1d8Q, true, false, false, SingleSpc, 4, 8 ,false},
+{ ARM::VLD1d8QPseudoWB_fixed, ARM::VLD1d8Qwb_fixed, true, true, false, SingleSpc, 4, 8 ,false},
+{ ARM::VLD1d8QPseudoWB_register, ARM::VLD1d8Qwb_register, true, true, true, SingleSpc, 4, 8 ,false},
{ ARM::VLD1d8TPseudo, ARM::VLD1d8T, true, false, false, SingleSpc, 3, 8 ,false},
+{ ARM::VLD1d8TPseudoWB_fixed, ARM::VLD1d8Twb_fixed, true, true, false, SingleSpc, 3, 8 ,false},
+{ ARM::VLD1d8TPseudoWB_register, ARM::VLD1d8Twb_register, true, true, true, SingleSpc, 3, 8 ,false},
+
{ ARM::VLD1q16HighQPseudo, ARM::VLD1d16Q, true, false, false, SingleHighQSpc, 4, 4 ,false},
+{ ARM::VLD1q16HighQPseudo_UPD, ARM::VLD1d16Qwb_fixed, true, true, true, SingleHighQSpc, 4, 4 ,false},
{ ARM::VLD1q16HighTPseudo, ARM::VLD1d16T, true, false, false, SingleHighTSpc, 3, 4 ,false},
+{ ARM::VLD1q16HighTPseudo_UPD, ARM::VLD1d16Twb_fixed, true, true, true, SingleHighTSpc, 3, 4 ,false},
{ ARM::VLD1q16LowQPseudo_UPD, ARM::VLD1d16Qwb_fixed, true, true, true, SingleLowSpc, 4, 4 ,false},
{ ARM::VLD1q16LowTPseudo_UPD, ARM::VLD1d16Twb_fixed, true, true, true, SingleLowSpc, 3, 4 ,false},
+
{ ARM::VLD1q32HighQPseudo, ARM::VLD1d32Q, true, false, false, SingleHighQSpc, 4, 2 ,false},
+{ ARM::VLD1q32HighQPseudo_UPD, ARM::VLD1d32Qwb_fixed, true, true, true, SingleHighQSpc, 4, 2 ,false},
{ ARM::VLD1q32HighTPseudo, ARM::VLD1d32T, true, false, false, SingleHighTSpc, 3, 2 ,false},
+{ ARM::VLD1q32HighTPseudo_UPD, ARM::VLD1d32Twb_fixed, true, true, true, SingleHighTSpc, 3, 2 ,false},
{ ARM::VLD1q32LowQPseudo_UPD, ARM::VLD1d32Qwb_fixed, true, true, true, SingleLowSpc, 4, 2 ,false},
{ ARM::VLD1q32LowTPseudo_UPD, ARM::VLD1d32Twb_fixed, true, true, true, SingleLowSpc, 3, 2 ,false},
+
{ ARM::VLD1q64HighQPseudo, ARM::VLD1d64Q, true, false, false, SingleHighQSpc, 4, 1 ,false},
+{ ARM::VLD1q64HighQPseudo_UPD, ARM::VLD1d64Qwb_fixed, true, true, true, SingleHighQSpc, 4, 1 ,false},
{ ARM::VLD1q64HighTPseudo, ARM::VLD1d64T, true, false, false, SingleHighTSpc, 3, 1 ,false},
+{ ARM::VLD1q64HighTPseudo_UPD, ARM::VLD1d64Twb_fixed, true, true, true, SingleHighTSpc, 3, 1 ,false},
{ ARM::VLD1q64LowQPseudo_UPD, ARM::VLD1d64Qwb_fixed, true, true, true, SingleLowSpc, 4, 1 ,false},
{ ARM::VLD1q64LowTPseudo_UPD, ARM::VLD1d64Twb_fixed, true, true, true, SingleLowSpc, 3, 1 ,false},
+
{ ARM::VLD1q8HighQPseudo, ARM::VLD1d8Q, true, false, false, SingleHighQSpc, 4, 8 ,false},
+{ ARM::VLD1q8HighQPseudo_UPD, ARM::VLD1d8Qwb_fixed, true, true, true, SingleHighQSpc, 4, 8 ,false},
{ ARM::VLD1q8HighTPseudo, ARM::VLD1d8T, true, false, false, SingleHighTSpc, 3, 8 ,false},
+{ ARM::VLD1q8HighTPseudo_UPD, ARM::VLD1d8Twb_fixed, true, true, true, SingleHighTSpc, 3, 8 ,false},
{ ARM::VLD1q8LowQPseudo_UPD, ARM::VLD1d8Qwb_fixed, true, true, true, SingleLowSpc, 4, 8 ,false},
{ ARM::VLD1q8LowTPseudo_UPD, ARM::VLD1d8Twb_fixed, true, true, true, SingleLowSpc, 3, 8 ,false},
{ ARM::VLD2DUPq16EvenPseudo, ARM::VLD2DUPd16x2, true, false, false, EvenDblSpc, 2, 4 ,false},
{ ARM::VLD2DUPq16OddPseudo, ARM::VLD2DUPd16x2, true, false, false, OddDblSpc, 2, 4 ,false},
+{ ARM::VLD2DUPq16OddPseudoWB_fixed, ARM::VLD2DUPd16x2wb_fixed, true, true, false, OddDblSpc, 2, 4 ,false},
+{ ARM::VLD2DUPq16OddPseudoWB_register, ARM::VLD2DUPd16x2wb_register, true, true, true, OddDblSpc, 2, 4 ,false},
{ ARM::VLD2DUPq32EvenPseudo, ARM::VLD2DUPd32x2, true, false, false, EvenDblSpc, 2, 2 ,false},
{ ARM::VLD2DUPq32OddPseudo, ARM::VLD2DUPd32x2, true, false, false, OddDblSpc, 2, 2 ,false},
+{ ARM::VLD2DUPq32OddPseudoWB_fixed, ARM::VLD2DUPd32x2wb_fixed, true, true, false, OddDblSpc, 2, 2 ,false},
+{ ARM::VLD2DUPq32OddPseudoWB_register, ARM::VLD2DUPd32x2wb_register, true, true, true, OddDblSpc, 2, 2 ,false},
{ ARM::VLD2DUPq8EvenPseudo, ARM::VLD2DUPd8x2, true, false, false, EvenDblSpc, 2, 8 ,false},
{ ARM::VLD2DUPq8OddPseudo, ARM::VLD2DUPd8x2, true, false, false, OddDblSpc, 2, 8 ,false},
+{ ARM::VLD2DUPq8OddPseudoWB_fixed, ARM::VLD2DUPd8x2wb_fixed, true, true, false, OddDblSpc, 2, 8 ,false},
+{ ARM::VLD2DUPq8OddPseudoWB_register, ARM::VLD2DUPd8x2wb_register, true, true, true, OddDblSpc, 2, 8 ,false},
{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true},
{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true},
@@ -254,10 +287,13 @@
{ ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, true, SingleSpc, 3, 8,true},
{ ARM::VLD3DUPq16EvenPseudo, ARM::VLD3DUPq16, true, false, false, EvenDblSpc, 3, 4 ,true},
{ ARM::VLD3DUPq16OddPseudo, ARM::VLD3DUPq16, true, false, false, OddDblSpc, 3, 4 ,true},
+{ ARM::VLD3DUPq16OddPseudo_UPD, ARM::VLD3DUPq16_UPD, true, true, true, OddDblSpc, 3, 4 ,true},
{ ARM::VLD3DUPq32EvenPseudo, ARM::VLD3DUPq32, true, false, false, EvenDblSpc, 3, 2 ,true},
{ ARM::VLD3DUPq32OddPseudo, ARM::VLD3DUPq32, true, false, false, OddDblSpc, 3, 2 ,true},
+{ ARM::VLD3DUPq32OddPseudo_UPD, ARM::VLD3DUPq32_UPD, true, true, true, OddDblSpc, 3, 2 ,true},
{ ARM::VLD3DUPq8EvenPseudo, ARM::VLD3DUPq8, true, false, false, EvenDblSpc, 3, 8 ,true},
{ ARM::VLD3DUPq8OddPseudo, ARM::VLD3DUPq8, true, false, false, OddDblSpc, 3, 8 ,true},
+{ ARM::VLD3DUPq8OddPseudo_UPD, ARM::VLD3DUPq8_UPD, true, true, true, OddDblSpc, 3, 8 ,true},
{ ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, false, SingleSpc, 3, 4 ,true},
{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true, SingleSpc, 3, 4 ,true},
@@ -295,10 +331,13 @@
{ ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, true, SingleSpc, 4, 8,true},
{ ARM::VLD4DUPq16EvenPseudo, ARM::VLD4DUPq16, true, false, false, EvenDblSpc, 4, 4 ,true},
{ ARM::VLD4DUPq16OddPseudo, ARM::VLD4DUPq16, true, false, false, OddDblSpc, 4, 4 ,true},
+{ ARM::VLD4DUPq16OddPseudo_UPD, ARM::VLD4DUPq16_UPD, true, true, true, OddDblSpc, 4, 4 ,true},
{ ARM::VLD4DUPq32EvenPseudo, ARM::VLD4DUPq32, true, false, false, EvenDblSpc, 4, 2 ,true},
{ ARM::VLD4DUPq32OddPseudo, ARM::VLD4DUPq32, true, false, false, OddDblSpc, 4, 2 ,true},
+{ ARM::VLD4DUPq32OddPseudo_UPD, ARM::VLD4DUPq32_UPD, true, true, true, OddDblSpc, 4, 2 ,true},
{ ARM::VLD4DUPq8EvenPseudo, ARM::VLD4DUPq8, true, false, false, EvenDblSpc, 4, 8 ,true},
{ ARM::VLD4DUPq8OddPseudo, ARM::VLD4DUPq8, true, false, false, OddDblSpc, 4, 8 ,true},
+{ ARM::VLD4DUPq8OddPseudo_UPD, ARM::VLD4DUPq8_UPD, true, true, true, OddDblSpc, 4, 8 ,true},
{ ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, false, SingleSpc, 4, 4 ,true},
{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true, SingleSpc, 4, 4 ,true},
@@ -336,31 +375,58 @@
{ ARM::VST1LNq8Pseudo_UPD, ARM::VST1LNd8_UPD, false, true, true, EvenDblSpc, 1, 8 ,true},
{ ARM::VST1d16QPseudo, ARM::VST1d16Q, false, false, false, SingleSpc, 4, 4 ,false},
+{ ARM::VST1d16QPseudoWB_fixed, ARM::VST1d16Qwb_fixed, false, true, false, SingleSpc, 4, 4 ,false},
+{ ARM::VST1d16QPseudoWB_register, ARM::VST1d16Qwb_register, false, true, true, SingleSpc, 4, 4 ,false},
{ ARM::VST1d16TPseudo, ARM::VST1d16T, false, false, false, SingleSpc, 3, 4 ,false},
+{ ARM::VST1d16TPseudoWB_fixed, ARM::VST1d16Twb_fixed, false, true, false, SingleSpc, 3, 4 ,false},
+{ ARM::VST1d16TPseudoWB_register, ARM::VST1d16Twb_register, false, true, true, SingleSpc, 3, 4 ,false},
+
{ ARM::VST1d32QPseudo, ARM::VST1d32Q, false, false, false, SingleSpc, 4, 2 ,false},
+{ ARM::VST1d32QPseudoWB_fixed, ARM::VST1d32Qwb_fixed, false, true, false, SingleSpc, 4, 2 ,false},
+{ ARM::VST1d32QPseudoWB_register, ARM::VST1d32Qwb_register, false, true, true, SingleSpc, 4, 2 ,false},
{ ARM::VST1d32TPseudo, ARM::VST1d32T, false, false, false, SingleSpc, 3, 2 ,false},
+{ ARM::VST1d32TPseudoWB_fixed, ARM::VST1d32Twb_fixed, false, true, false, SingleSpc, 3, 2 ,false},
+{ ARM::VST1d32TPseudoWB_register, ARM::VST1d32Twb_register, false, true, true, SingleSpc, 3, 2 ,false},
+
{ ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, false, SingleSpc, 4, 1 ,false},
{ ARM::VST1d64QPseudoWB_fixed, ARM::VST1d64Qwb_fixed, false, true, false, SingleSpc, 4, 1 ,false},
{ ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true, SingleSpc, 4, 1 ,false},
{ ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, false, SingleSpc, 3, 1 ,false},
{ ARM::VST1d64TPseudoWB_fixed, ARM::VST1d64Twb_fixed, false, true, false, SingleSpc, 3, 1 ,false},
-{ ARM::VST1d64TPseudoWB_register, ARM::VST1d64Twb_register, false, true, true, SingleSpc, 3, 1 ,false},
+{ ARM::VST1d64TPseudoWB_register, ARM::VST1d64Twb_register, false, true, true, SingleSpc, 3, 1 ,false},
+
{ ARM::VST1d8QPseudo, ARM::VST1d8Q, false, false, false, SingleSpc, 4, 8 ,false},
+{ ARM::VST1d8QPseudoWB_fixed, ARM::VST1d8Qwb_fixed, false, true, false, SingleSpc, 4, 8 ,false},
+{ ARM::VST1d8QPseudoWB_register, ARM::VST1d8Qwb_register, false, true, true, SingleSpc, 4, 8 ,false},
{ ARM::VST1d8TPseudo, ARM::VST1d8T, false, false, false, SingleSpc, 3, 8 ,false},
-{ ARM::VST1q16HighQPseudo, ARM::VST1d16Q, false, false, false, SingleHighQSpc, 4, 4 ,false},
-{ ARM::VST1q16HighTPseudo, ARM::VST1d16T, false, false, false, SingleHighTSpc, 3, 4 ,false},
+{ ARM::VST1d8TPseudoWB_fixed, ARM::VST1d8Twb_fixed, false, true, false, SingleSpc, 3, 8 ,false},
+{ ARM::VST1d8TPseudoWB_register, ARM::VST1d8Twb_register, false, true, true, SingleSpc, 3, 8 ,false},
+
+{ ARM::VST1q16HighQPseudo, ARM::VST1d16Q, false, false, false, SingleHighQSpc, 4, 4 ,false},
+{ ARM::VST1q16HighQPseudo_UPD, ARM::VST1d16Qwb_fixed, false, true, true, SingleHighQSpc, 4, 8 ,false},
+{ ARM::VST1q16HighTPseudo, ARM::VST1d16T, false, false, false, SingleHighTSpc, 3, 4 ,false},
+{ ARM::VST1q16HighTPseudo_UPD, ARM::VST1d16Twb_fixed, false, true, true, SingleHighTSpc, 3, 4 ,false},
{ ARM::VST1q16LowQPseudo_UPD, ARM::VST1d16Qwb_fixed, false, true, true, SingleLowSpc, 4, 4 ,false},
{ ARM::VST1q16LowTPseudo_UPD, ARM::VST1d16Twb_fixed, false, true, true, SingleLowSpc, 3, 4 ,false},
-{ ARM::VST1q32HighQPseudo, ARM::VST1d32Q, false, false, false, SingleHighQSpc, 4, 2 ,false},
-{ ARM::VST1q32HighTPseudo, ARM::VST1d32T, false, false, false, SingleHighTSpc, 3, 2 ,false},
+
+{ ARM::VST1q32HighQPseudo, ARM::VST1d32Q, false, false, false, SingleHighQSpc, 4, 2 ,false},
+{ ARM::VST1q32HighQPseudo_UPD, ARM::VST1d32Qwb_fixed, false, true, true, SingleHighQSpc, 4, 8 ,false},
+{ ARM::VST1q32HighTPseudo, ARM::VST1d32T, false, false, false, SingleHighTSpc, 3, 2 ,false},
+{ ARM::VST1q32HighTPseudo_UPD, ARM::VST1d32Twb_fixed, false, true, true, SingleHighTSpc, 3, 2 ,false},
{ ARM::VST1q32LowQPseudo_UPD, ARM::VST1d32Qwb_fixed, false, true, true, SingleLowSpc, 4, 2 ,false},
{ ARM::VST1q32LowTPseudo_UPD, ARM::VST1d32Twb_fixed, false, true, true, SingleLowSpc, 3, 2 ,false},
-{ ARM::VST1q64HighQPseudo, ARM::VST1d64Q, false, false, false, SingleHighQSpc, 4, 1 ,false},
-{ ARM::VST1q64HighTPseudo, ARM::VST1d64T, false, false, false, SingleHighTSpc, 3, 1 ,false},
+
+{ ARM::VST1q64HighQPseudo, ARM::VST1d64Q, false, false, false, SingleHighQSpc, 4, 1 ,false},
+{ ARM::VST1q64HighQPseudo_UPD, ARM::VST1d64Qwb_fixed, false, true, true, SingleHighQSpc, 4, 8 ,false},
+{ ARM::VST1q64HighTPseudo, ARM::VST1d64T, false, false, false, SingleHighTSpc, 3, 1 ,false},
+{ ARM::VST1q64HighTPseudo_UPD, ARM::VST1d64Twb_fixed, false, true, true, SingleHighTSpc, 3, 1 ,false},
{ ARM::VST1q64LowQPseudo_UPD, ARM::VST1d64Qwb_fixed, false, true, true, SingleLowSpc, 4, 1 ,false},
{ ARM::VST1q64LowTPseudo_UPD, ARM::VST1d64Twb_fixed, false, true, true, SingleLowSpc, 3, 1 ,false},
+
{ ARM::VST1q8HighQPseudo, ARM::VST1d8Q, false, false, false, SingleHighQSpc, 4, 8 ,false},
+{ ARM::VST1q8HighQPseudo_UPD, ARM::VST1d8Qwb_fixed, false, true, true, SingleHighQSpc, 4, 8 ,false},
{ ARM::VST1q8HighTPseudo, ARM::VST1d8T, false, false, false, SingleHighTSpc, 3, 8 ,false},
+{ ARM::VST1q8HighTPseudo_UPD, ARM::VST1d8Twb_fixed, false, true, true, SingleHighTSpc, 3, 8 ,false},
{ ARM::VST1q8LowQPseudo_UPD, ARM::VST1d8Qwb_fixed, false, true, true, SingleLowSpc, 4, 8 ,false},
{ ARM::VST1q8LowTPseudo_UPD, ARM::VST1d8Twb_fixed, false, true, true, SingleLowSpc, 3, 8 ,false},
@@ -513,9 +579,18 @@
bool DstIsDead = MI.getOperand(OpIdx).isDead();
Register DstReg = MI.getOperand(OpIdx++).getReg();
- if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 ||
- TableEntry->RealOpc == ARM::VLD2DUPd16x2 ||
- TableEntry->RealOpc == ARM::VLD2DUPd32x2) {
+
+ bool IsVLD2DUP = TableEntry->RealOpc == ARM::VLD2DUPd8x2 ||
+ TableEntry->RealOpc == ARM::VLD2DUPd16x2 ||
+ TableEntry->RealOpc == ARM::VLD2DUPd32x2 ||
+ TableEntry->RealOpc == ARM::VLD2DUPd8x2wb_fixed ||
+ TableEntry->RealOpc == ARM::VLD2DUPd16x2wb_fixed ||
+ TableEntry->RealOpc == ARM::VLD2DUPd32x2wb_fixed ||
+ TableEntry->RealOpc == ARM::VLD2DUPd8x2wb_register ||
+ TableEntry->RealOpc == ARM::VLD2DUPd16x2wb_register ||
+ TableEntry->RealOpc == ARM::VLD2DUPd32x2wb_register;
+
+ if (IsVLD2DUP) {
unsigned SubRegIndex;
if (RegSpc == EvenDblSpc) {
SubRegIndex = ARM::dsub_0;
@@ -563,7 +638,10 @@
TableEntry->RealOpc == ARM::VLD1d8Twb_fixed ||
TableEntry->RealOpc == ARM::VLD1d16Twb_fixed ||
TableEntry->RealOpc == ARM::VLD1d32Twb_fixed ||
- TableEntry->RealOpc == ARM::VLD1d64Twb_fixed) {
+ TableEntry->RealOpc == ARM::VLD1d64Twb_fixed ||
+ TableEntry->RealOpc == ARM::VLD2DUPd8x2wb_fixed ||
+ TableEntry->RealOpc == ARM::VLD2DUPd16x2wb_fixed ||
+ TableEntry->RealOpc == ARM::VLD2DUPd32x2wb_fixed) {
assert(AM6Offset.getReg() == 0 &&
"A fixed writing-back pseudo instruction provides an offset "
"register!");
@@ -576,9 +654,7 @@
// has an extra operand that is a use of the super-register. Record the
// operand index and skip over it.
unsigned SrcOpIdx = 0;
- if(TableEntry->RealOpc != ARM::VLD2DUPd8x2 &&
- TableEntry->RealOpc != ARM::VLD2DUPd16x2 &&
- TableEntry->RealOpc != ARM::VLD2DUPd32x2) {
+ if (!IsVLD2DUP) {
if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
RegSpc == SingleHighTSpc)
@@ -1566,6 +1642,15 @@
Register DesiredReg = MI.getOperand(3).getReg();
Register NewReg = MI.getOperand(4).getReg();
+ if (IsThumb) {
+ assert(STI->hasV8MBaselineOps() &&
+ "CMP_SWAP not expected to be custom expanded for Thumb1");
+ assert((UxtOp == 0 || UxtOp == ARM::tUXTB || UxtOp == ARM::tUXTH) &&
+ "ARMv8-M.baseline does not have t2UXTB/t2UXTH");
+ assert(ARM::tGPRRegClass.contains(DesiredReg) &&
+ "DesiredReg used for UXT op must be tGPR");
+ }
+
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
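For readers less familiar with the pseudo, CMP_SWAP_8 / tCMP_SWAP_8 compute an 8-bit compare-and-swap; the expansion builds a load-exclusive/store-exclusive retry loop (LDREXB/STREXB or t2LDREXB/t2STREXB) across LoadCmpBB and StoreBB. A portable analogue of the observable behaviour only, not of the MI-level expansion:

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

int main() {
  std::atomic<uint8_t> Mem{42};
  uint8_t Desired = 42; // value we expect to find
  uint8_t New = 7;      // value to store if the expectation holds
  bool Swapped = Mem.compare_exchange_strong(Desired, New);
  std::printf("swapped=%d value=%u\n", Swapped, (unsigned)Mem.load());
  return 0;
}
```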
@@ -1974,7 +2059,7 @@
}
auto NewMI = std::prev(MBBI);
- for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
+ for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
NewMI->addOperand(MBBI->getOperand(i));
@@ -2234,7 +2319,7 @@
*TII);
}
// If there's dynamic realignment, adjust for it.
- if (RI.needsStackRealignment(MF)) {
+ if (RI.hasStackRealignment(MF)) {
MachineFrameInfo &MFI = MF.getFrameInfo();
Align MaxAlign = MFI.getMaxAlign();
assert (!AFI->isThumb1OnlyFunction());
@@ -2251,7 +2336,6 @@
.add(predOps(ARMCC::AL))
.add(condCodeOp());
}
-
}
MI.eraseFromParent();
return true;
@@ -2543,8 +2627,14 @@
case ARM::VLD3d16Pseudo:
case ARM::VLD3d32Pseudo:
case ARM::VLD1d8TPseudo:
+ case ARM::VLD1d8TPseudoWB_fixed:
+ case ARM::VLD1d8TPseudoWB_register:
case ARM::VLD1d16TPseudo:
+ case ARM::VLD1d16TPseudoWB_fixed:
+ case ARM::VLD1d16TPseudoWB_register:
case ARM::VLD1d32TPseudo:
+ case ARM::VLD1d32TPseudoWB_fixed:
+ case ARM::VLD1d32TPseudoWB_register:
case ARM::VLD1d64TPseudo:
case ARM::VLD1d64TPseudoWB_fixed:
case ARM::VLD1d64TPseudoWB_register:
@@ -2564,26 +2654,40 @@
case ARM::VLD4d16Pseudo:
case ARM::VLD4d32Pseudo:
case ARM::VLD1d8QPseudo:
+ case ARM::VLD1d8QPseudoWB_fixed:
+ case ARM::VLD1d8QPseudoWB_register:
case ARM::VLD1d16QPseudo:
+ case ARM::VLD1d16QPseudoWB_fixed:
+ case ARM::VLD1d16QPseudoWB_register:
case ARM::VLD1d32QPseudo:
+ case ARM::VLD1d32QPseudoWB_fixed:
+ case ARM::VLD1d32QPseudoWB_register:
case ARM::VLD1d64QPseudo:
case ARM::VLD1d64QPseudoWB_fixed:
case ARM::VLD1d64QPseudoWB_register:
case ARM::VLD1q8HighQPseudo:
+ case ARM::VLD1q8HighQPseudo_UPD:
case ARM::VLD1q8LowQPseudo_UPD:
case ARM::VLD1q8HighTPseudo:
+ case ARM::VLD1q8HighTPseudo_UPD:
case ARM::VLD1q8LowTPseudo_UPD:
case ARM::VLD1q16HighQPseudo:
+ case ARM::VLD1q16HighQPseudo_UPD:
case ARM::VLD1q16LowQPseudo_UPD:
case ARM::VLD1q16HighTPseudo:
+ case ARM::VLD1q16HighTPseudo_UPD:
case ARM::VLD1q16LowTPseudo_UPD:
case ARM::VLD1q32HighQPseudo:
+ case ARM::VLD1q32HighQPseudo_UPD:
case ARM::VLD1q32LowQPseudo_UPD:
case ARM::VLD1q32HighTPseudo:
+ case ARM::VLD1q32HighTPseudo_UPD:
case ARM::VLD1q32LowTPseudo_UPD:
case ARM::VLD1q64HighQPseudo:
+ case ARM::VLD1q64HighQPseudo_UPD:
case ARM::VLD1q64LowQPseudo_UPD:
case ARM::VLD1q64HighTPseudo:
+ case ARM::VLD1q64HighTPseudo_UPD:
case ARM::VLD1q64LowTPseudo_UPD:
case ARM::VLD4d8Pseudo_UPD:
case ARM::VLD4d16Pseudo_UPD:
@@ -2615,18 +2719,30 @@
case ARM::VLD2DUPq16OddPseudo:
case ARM::VLD2DUPq32EvenPseudo:
case ARM::VLD2DUPq32OddPseudo:
+ case ARM::VLD2DUPq8OddPseudoWB_fixed:
+ case ARM::VLD2DUPq8OddPseudoWB_register:
+ case ARM::VLD2DUPq16OddPseudoWB_fixed:
+ case ARM::VLD2DUPq16OddPseudoWB_register:
+ case ARM::VLD2DUPq32OddPseudoWB_fixed:
+ case ARM::VLD2DUPq32OddPseudoWB_register:
case ARM::VLD3DUPq8EvenPseudo:
case ARM::VLD3DUPq8OddPseudo:
case ARM::VLD3DUPq16EvenPseudo:
case ARM::VLD3DUPq16OddPseudo:
case ARM::VLD3DUPq32EvenPseudo:
case ARM::VLD3DUPq32OddPseudo:
+ case ARM::VLD3DUPq8OddPseudo_UPD:
+ case ARM::VLD3DUPq16OddPseudo_UPD:
+ case ARM::VLD3DUPq32OddPseudo_UPD:
case ARM::VLD4DUPq8EvenPseudo:
case ARM::VLD4DUPq8OddPseudo:
case ARM::VLD4DUPq16EvenPseudo:
case ARM::VLD4DUPq16OddPseudo:
case ARM::VLD4DUPq32EvenPseudo:
case ARM::VLD4DUPq32OddPseudo:
+ case ARM::VLD4DUPq8OddPseudo_UPD:
+ case ARM::VLD4DUPq16OddPseudo_UPD:
+ case ARM::VLD4DUPq32OddPseudo_UPD:
ExpandVLD(MBBI);
return true;
@@ -2643,14 +2759,20 @@
case ARM::VST3d16Pseudo:
case ARM::VST3d32Pseudo:
case ARM::VST1d8TPseudo:
+ case ARM::VST1d8TPseudoWB_fixed:
+ case ARM::VST1d8TPseudoWB_register:
case ARM::VST1d16TPseudo:
+ case ARM::VST1d16TPseudoWB_fixed:
+ case ARM::VST1d16TPseudoWB_register:
case ARM::VST1d32TPseudo:
+ case ARM::VST1d32TPseudoWB_fixed:
+ case ARM::VST1d32TPseudoWB_register:
case ARM::VST1d64TPseudo:
+ case ARM::VST1d64TPseudoWB_fixed:
+ case ARM::VST1d64TPseudoWB_register:
case ARM::VST3d8Pseudo_UPD:
case ARM::VST3d16Pseudo_UPD:
case ARM::VST3d32Pseudo_UPD:
- case ARM::VST1d64TPseudoWB_fixed:
- case ARM::VST1d64TPseudoWB_register:
case ARM::VST3q8Pseudo_UPD:
case ARM::VST3q16Pseudo_UPD:
case ARM::VST3q32Pseudo_UPD:
@@ -2664,14 +2786,20 @@
case ARM::VST4d16Pseudo:
case ARM::VST4d32Pseudo:
case ARM::VST1d8QPseudo:
+ case ARM::VST1d8QPseudoWB_fixed:
+ case ARM::VST1d8QPseudoWB_register:
case ARM::VST1d16QPseudo:
+ case ARM::VST1d16QPseudoWB_fixed:
+ case ARM::VST1d16QPseudoWB_register:
case ARM::VST1d32QPseudo:
+ case ARM::VST1d32QPseudoWB_fixed:
+ case ARM::VST1d32QPseudoWB_register:
case ARM::VST1d64QPseudo:
+ case ARM::VST1d64QPseudoWB_fixed:
+ case ARM::VST1d64QPseudoWB_register:
case ARM::VST4d8Pseudo_UPD:
case ARM::VST4d16Pseudo_UPD:
case ARM::VST4d32Pseudo_UPD:
- case ARM::VST1d64QPseudoWB_fixed:
- case ARM::VST1d64QPseudoWB_register:
case ARM::VST1q8HighQPseudo:
case ARM::VST1q8LowQPseudo_UPD:
case ARM::VST1q8HighTPseudo:
@@ -2688,6 +2816,14 @@
case ARM::VST1q64LowQPseudo_UPD:
case ARM::VST1q64HighTPseudo:
case ARM::VST1q64LowTPseudo_UPD:
+ case ARM::VST1q8HighTPseudo_UPD:
+ case ARM::VST1q16HighTPseudo_UPD:
+ case ARM::VST1q32HighTPseudo_UPD:
+ case ARM::VST1q64HighTPseudo_UPD:
+ case ARM::VST1q8HighQPseudo_UPD:
+ case ARM::VST1q16HighQPseudo_UPD:
+ case ARM::VST1q32HighQPseudo_UPD:
+ case ARM::VST1q64HighQPseudo_UPD:
case ARM::VST4q8Pseudo_UPD:
case ARM::VST4q16Pseudo_UPD:
case ARM::VST4q32Pseudo_UPD:
@@ -2780,20 +2916,23 @@
case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true;
+ case ARM::tCMP_SWAP_8:
+ assert(STI->isThumb());
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB, ARM::tUXTB,
+ NextMBBI);
+ case ARM::tCMP_SWAP_16:
+ assert(STI->isThumb());
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH, ARM::tUXTH,
+ NextMBBI);
+
case ARM::CMP_SWAP_8:
- if (STI->isThumb())
- return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB,
- ARM::tUXTB, NextMBBI);
- else
- return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB,
- ARM::UXTB, NextMBBI);
+ assert(!STI->isThumb());
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB, ARM::UXTB,
+ NextMBBI);
case ARM::CMP_SWAP_16:
- if (STI->isThumb())
- return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH,
- ARM::tUXTH, NextMBBI);
- else
- return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH,
- ARM::UXTH, NextMBBI);
+ assert(!STI->isThumb());
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH, ARM::UXTH,
+ NextMBBI);
case ARM::CMP_SWAP_32:
if (STI->isThumb())
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0,
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp
index da1d9af..28a076e 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -136,16 +136,13 @@
// Code from FastISel.cpp.
unsigned fastEmitInst_r(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill);
+ const TargetRegisterClass *RC, unsigned Op0);
unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill);
+ unsigned Op0, unsigned Op1);
unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- uint64_t Imm);
+ unsigned Op0, uint64_t Imm);
unsigned fastEmitInst_i(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
uint64_t Imm);
@@ -299,7 +296,7 @@
unsigned ARMFastISel::fastEmitInst_r(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill) {
+ unsigned Op0) {
Register ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
@@ -308,10 +305,10 @@
Op0 = constrainOperandRegClass(II, Op0, 1);
if (II.getNumDefs() >= 1) {
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II,
- ResultReg).addReg(Op0, Op0IsKill * RegState::Kill));
+ ResultReg).addReg(Op0));
} else {
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
- .addReg(Op0, Op0IsKill * RegState::Kill));
+ .addReg(Op0));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(II.ImplicitDefs[0]));
@@ -321,8 +318,7 @@
unsigned ARMFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill) {
+ unsigned Op0, unsigned Op1) {
unsigned ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
@@ -334,12 +330,12 @@
if (II.getNumDefs() >= 1) {
AddOptionalDefs(
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
- .addReg(Op0, Op0IsKill * RegState::Kill)
- .addReg(Op1, Op1IsKill * RegState::Kill));
+ .addReg(Op0)
+ .addReg(Op1));
} else {
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
- .addReg(Op0, Op0IsKill * RegState::Kill)
- .addReg(Op1, Op1IsKill * RegState::Kill));
+ .addReg(Op0)
+ .addReg(Op1));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(II.ImplicitDefs[0]));
@@ -349,8 +345,7 @@
unsigned ARMFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- uint64_t Imm) {
+ unsigned Op0, uint64_t Imm) {
unsigned ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
@@ -360,11 +355,11 @@
if (II.getNumDefs() >= 1) {
AddOptionalDefs(
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
- .addReg(Op0, Op0IsKill * RegState::Kill)
+ .addReg(Op0)
.addImm(Imm));
} else {
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
- .addReg(Op0, Op0IsKill * RegState::Kill)
+ .addReg(Op0)
.addImm(Imm));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
@@ -851,7 +846,7 @@
// get the reg+offset into a register.
if (needsLowering) {
Addr.Base.Reg = fastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg,
- /*Op0IsKill*/false, Addr.Offset, MVT::i32);
+ Addr.Offset, MVT::i32);
Addr.Offset = 0;
}
}
@@ -1854,6 +1849,7 @@
}
case CallingConv::ARM_AAPCS_VFP:
case CallingConv::Swift:
+ case CallingConv::SwiftTail:
if (!isVarArg)
return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
// Fall through to soft float variant, variadic functions don't
@@ -1967,8 +1963,7 @@
break;
}
case CCValAssign::BCvt: {
- unsigned BC = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, Arg,
- /*TODO: Kill=*/false);
+ unsigned BC = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, Arg);
assert(BC != 0 && "Failed to emit a bitcast!");
Arg = BC;
ArgVT = VA.getLocVT();
@@ -2186,10 +2181,11 @@
EVT LCREVT = TLI.getValueType(DL, GVTy);
if (!LCREVT.isSimple()) return 0;
- GlobalValue *GV = new GlobalVariable(M, Type::getInt32Ty(*Context), false,
- GlobalValue::ExternalLinkage, nullptr,
- Name);
- assert(GV->getType() == GVTy && "We miscomputed the type for the global!");
+ GlobalValue *GV = M.getNamedGlobal(Name.str());
+ if (!GV)
+ GV = new GlobalVariable(M, Type::getInt32Ty(*Context), false,
+ GlobalValue::ExternalLinkage, nullptr, Name);
+
return ARMMaterializeGV(GV, LCREVT.getSimpleVT());
}
@@ -3020,6 +3016,7 @@
case CallingConv::ARM_AAPCS:
case CallingConv::ARM_APCS:
case CallingConv::Swift:
+ case CallingConv::SwiftTail:
break;
}
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 9eeb7f2..025e434 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -9,6 +9,102 @@
// This file contains the ARM implementation of TargetFrameLowering class.
//
//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of TargetFrameLowering class.
+//
+// On ARM, stack frames are structured as follows:
+//
+// The stack grows downward.
+//
+// All of the individual frame areas on the frame below are optional, i.e. it's
+// possible to create a function so that the particular area isn't present
+// in the frame.
+//
+// At function entry, the "frame" looks as follows:
+//
+// | | Higher address
+// |-----------------------------------|
+// | |
+// | arguments passed on the stack |
+// | |
+// |-----------------------------------| <- sp
+// | | Lower address
+//
+//
+// After the prologue has run, the frame has the following general structure.
+// Technically the last frame area (VLAs) doesn't get created until in the
+// main function body, after the prologue is run. However, it's depicted here
+// for completeness.
+//
+// | | Higher address
+// |-----------------------------------|
+// | |
+// | arguments passed on the stack |
+// | |
+// |-----------------------------------| <- (sp at function entry)
+// | |
+// | varargs from registers |
+// | |
+// |-----------------------------------|
+// | |
+// | prev_fp, prev_lr |
+// | (a.k.a. "frame record") |
+// | |
+// |- - - - - - - - - - - - - - - - - -| <- fp (r7 or r11)
+// | |
+// | callee-saved gpr registers |
+// | |
+// |-----------------------------------|
+// | |
+// | callee-saved fp/simd regs |
+// | |
+// |-----------------------------------|
+// |.empty.space.to.make.part.below....|
+// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
+// |.the.standard.8-byte.alignment.....| compile time; if present)
+// |-----------------------------------|
+// | |
+// | local variables of fixed size |
+// | including spill slots |
+// |-----------------------------------| <- base pointer (not defined by ABI,
+// |.variable-sized.local.variables....| LLVM chooses r6)
+// |.(VLAs)............................| (size of this area is unknown at
+// |...................................| compile time)
+// |-----------------------------------| <- sp
+// | | Lower address
+//
+//
+// To access data in a frame, a constant offset to it must be computable at
+// compile time from one of the pointers (fp, bp, sp). The sizes of the areas
+// with a dotted background cannot be computed at compile time if they are
+// present, so all three of fp, bp and sp must be set up in order to access
+// all contents in the frame areas, assuming all of the frame areas are
+// non-empty.
+//
+// For most functions, some of the frame areas are empty. For those functions,
+// it may not be necessary to set up fp or bp:
+// * A base pointer is definitely needed when there are both VLAs and local
+// variables with more-than-default alignment requirements.
+// * A frame pointer is definitely needed when there are local variables with
+// more-than-default alignment requirements.
+//
+// In some cases when a base pointer is not strictly needed, it is generated
+// anyway when offsets from the frame pointer to access local variables become
+// so large that the offset can't be encoded in the immediate fields of loads
+// or stores.
+//
+// The frame pointer might be chosen to be r7 or r11, depending on the target
+// architecture and operating system. See ARMSubtarget::getFramePointerReg for
+// details.
+//
+// Outgoing function arguments must be at the bottom of the stack frame when
+// calling another function. If we do not have variable-sized stack objects, we
+// can allocate a "reserved call frame" area at the bottom of the local
+// variable area, large enough for all outgoing calls. If we do have VLAs, then
+// the stack pointer must be decremented and incremented around each call to
+// make space for the arguments below the VLAs.
+//
+//===----------------------------------------------------------------------===//
#include "ARMFrameLowering.h"
#include "ARMBaseInstrInfo.h"
@@ -110,8 +206,7 @@
return true;
// Frame pointer required for use within this function.
- return (RegInfo->needsStackRealignment(MF) ||
- MFI.hasVarSizedObjects() ||
+ return (RegInfo->hasStackRealignment(MF) || MFI.hasVarSizedObjects() ||
MFI.isFrameAddressTaken());
}
@@ -142,6 +237,41 @@
return hasReservedCallFrame(MF) || MF.getFrameInfo().hasVarSizedObjects();
}
+// Returns how much of the incoming argument stack area we should clean up in
+// an epilogue. For the C calling convention this will be 0; for guaranteed
+// tail-call conventions it can be positive (a normal return or a tail call to
+// a function that uses less stack space for arguments) or negative (for a
+// tail call to a function that needs more stack space than we do).
+static int getArgumentStackToRestore(MachineFunction &MF,
+ MachineBasicBlock &MBB) {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ bool IsTailCallReturn = false;
+ if (MBB.end() != MBBI) {
+ unsigned RetOpcode = MBBI->getOpcode();
+ IsTailCallReturn = RetOpcode == ARM::TCRETURNdi ||
+ RetOpcode == ARM::TCRETURNri;
+ }
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+ int ArgumentPopSize = 0;
+ if (IsTailCallReturn) {
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+ // For a tail-call in a callee-pops-arguments environment, some or all of
+ // the stack may actually be in use for the call's arguments; this is
+ // calculated during LowerCall and consumed here...
+ ArgumentPopSize = StackAdjust.getImm();
+ } else {
+ // ... otherwise the amount to pop is *all* of the argument space,
+ // conveniently stored in the MachineFunctionInfo by
+ // LowerFormalArguments. This will, of course, be zero for the C calling
+ // convention.
+ ArgumentPopSize = AFI->getArgumentStackToRestore();
+ }
+
+ return ArgumentPopSize;
+}
+
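A small worked illustration of the sign convention just described, using made-up byte counts; argumentStackToRestore below is a plain restatement of the selection, not the function above:

```cpp
#include <cstdio>

// Plain restatement of the selection logic, with the two inputs spelled out.
static int argumentStackToRestore(bool IsTailCallReturn, int TailCallStackAdjust,
                                  int CalleeSideArgStack) {
  return IsTailCallReturn ? TailCallStackAdjust : CalleeSideArgStack;
}

int main() {
  // Normal return in a callee-pops convention: pop all 16 bytes we received.
  std::printf("%d\n", argumentStackToRestore(false, 0, 16));
  // Tail call to a function that needs 8 bytes less than we received.
  std::printf("%d\n", argumentStackToRestore(true, 8, 16));
  // Tail call to a function that needs 8 bytes more: negative adjustment.
  std::printf("%d\n", argumentStackToRestore(true, -8, 16));
  return 0;
}
```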
static void emitRegPlusImmediate(
bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg,
@@ -711,7 +841,7 @@
// sure if we also have VLAs, we have a base pointer for frame access.
// If aligned NEON registers were spilled, the stack has already been
// realigned.
- if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) {
+ if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->hasStackRealignment(MF)) {
Align MaxAlign = MFI.getMaxAlign();
assert(!AFI->isThumb1OnlyFunction());
if (!AFI->isThumbFunction()) {
@@ -773,7 +903,13 @@
"This emitEpilogue does not support Thumb1!");
bool isARM = !AFI->isThumbFunction();
- unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+ // Amount of stack space we reserved next to incoming args for either
+ // varargs registers or stack arguments in tail calls made by this function.
+ unsigned ReservedArgStack = AFI->getArgRegsSaveSize();
+
+ // How much of the stack used by incoming arguments this function is expected
+ // to restore in this particular epilogue.
+ int IncomingArgStackToRestore = getArgumentStackToRestore(MF, MBB);
int NumBytes = (int)MFI.getStackSize();
Register FramePtr = RegInfo->getFrameRegister(MF);
@@ -787,8 +923,8 @@
DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
if (!AFI->hasStackFrame()) {
- if (NumBytes - ArgRegsSaveSize != 0)
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize,
+ if (NumBytes - ReservedArgStack != 0)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ReservedArgStack,
MachineInstr::FrameDestroy);
} else {
// Unwind MBBI to point to first LDR / VLDRD.
@@ -802,7 +938,7 @@
}
// Move SP to start of FP callee save spill area.
- NumBytes -= (ArgRegsSaveSize +
+ NumBytes -= (ReservedArgStack +
AFI->getFPCXTSaveAreaSize() +
AFI->getGPRCalleeSavedArea1Size() +
AFI->getGPRCalleeSavedArea2Size() +
@@ -874,9 +1010,13 @@
if (AFI->getFPCXTSaveAreaSize()) MBBI++;
}
- if (ArgRegsSaveSize)
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize,
+ if (ReservedArgStack || IncomingArgStackToRestore) {
+ assert((int)ReservedArgStack + IncomingArgStackToRestore >= 0 &&
+ "attempting to restore negative stack amount");
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII,
+ ReservedArgStack + IncomingArgStackToRestore,
MachineInstr::FrameDestroy);
+ }
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -909,7 +1049,7 @@
// When dynamically realigning the stack, use the frame pointer for
// parameters, and the stack/base pointer for locals.
- if (RegInfo->needsStackRealignment(MF)) {
+ if (RegInfo->hasStackRealignment(MF)) {
assert(hasFP(MF) && "dynamic stack realignment without a FP!");
if (isFixed) {
FrameReg = RegInfo->getFrameRegister(MF);
@@ -1089,19 +1229,16 @@
// The aligned reloads from area DPRCS2 are not inserted here.
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
continue;
-
if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt &&
- !isCmseEntry && !isTrap && STI.hasV5TOps()) {
- if (MBB.succ_empty()) {
- Reg = ARM::PC;
- // Fold the return instruction into the LDM.
- DeleteRet = true;
- LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
- // We 'restore' LR into PC so it is not live out of the return block:
- // Clear Restored bit.
- Info.setRestored(false);
- } else
- LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
+ !isCmseEntry && !isTrap && AFI->getArgumentStackToRestore() == 0 &&
+ STI.hasV5TOps() && MBB.succ_empty()) {
+ Reg = ARM::PC;
+ // Fold the return instruction into the LDM.
+ DeleteRet = true;
+ LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
+ // We 'restore' LR into PC so it is not live out of the return block:
+ // Clear Restored bit.
+ Info.setRestored(false);
}
// If NoGap is true, pop consecutive registers and then leave the rest
@@ -1687,7 +1824,7 @@
// instruction.
// FIXME: It will be better just to find spare register here.
if (AFI->isThumb2Function() &&
- (MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
+ (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF)))
SavedRegs.set(ARM::R4);
// If a stack probe will be emitted, spill R4 and LR, since they are
@@ -1712,7 +1849,7 @@
// changes it, it'll be a spill, which implies we've used all the registers
// and so R4 is already used, so not marking it here will be OK.
// FIXME: It will be better just to find spare register here.
- if (MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF) ||
+ if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF) ||
MFI.estimateStackSize(MF) > 508)
SavedRegs.set(ARM::R4);
}
@@ -2193,31 +2330,37 @@
MachineBasicBlock::iterator I) const {
const ARMBaseInstrInfo &TII =
*static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ bool isARM = !AFI->isThumbFunction();
+ DebugLoc dl = I->getDebugLoc();
+ unsigned Opc = I->getOpcode();
+ bool IsDestroy = Opc == TII.getCallFrameDestroyOpcode();
+ unsigned CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This eliminateCallFramePseudoInstr does not support Thumb1!");
+
+ int PIdx = I->findFirstPredOperandIdx();
+ ARMCC::CondCodes Pred = (PIdx == -1)
+ ? ARMCC::AL
+ : (ARMCC::CondCodes)I->getOperand(PIdx).getImm();
+ unsigned PredReg = TII.getFramePred(*I);
+
if (!hasReservedCallFrame(MF)) {
+ // Bail early if the callee is expected to do the adjustment.
+ if (IsDestroy && CalleePopAmount != -1U)
+ return MBB.erase(I);
+
// If we have alloca, convert as follows:
// ADJCALLSTACKDOWN -> sub, sp, sp, amount
// ADJCALLSTACKUP -> add, sp, sp, amount
- MachineInstr &Old = *I;
- DebugLoc dl = Old.getDebugLoc();
- unsigned Amount = TII.getFrameSize(Old);
+ unsigned Amount = TII.getFrameSize(*I);
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
Amount = alignSPAdjust(Amount);
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- assert(!AFI->isThumb1OnlyFunction() &&
- "This eliminateCallFramePseudoInstr does not support Thumb1!");
- bool isARM = !AFI->isThumbFunction();
-
- // Replace the pseudo instruction with a new instruction...
- unsigned Opc = Old.getOpcode();
- int PIdx = Old.findFirstPredOperandIdx();
- ARMCC::CondCodes Pred =
- (PIdx == -1) ? ARMCC::AL
- : (ARMCC::CondCodes)Old.getOperand(PIdx).getImm();
- unsigned PredReg = TII.getFramePred(Old);
if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags,
Pred, PredReg);
@@ -2227,6 +2370,11 @@
Pred, PredReg);
}
}
+ } else if (CalleePopAmount != -1U) {
+ // If the calling convention demands that the callee pops arguments from the
+ // stack, we want to add it back if we have a reserved call frame.
+ emitSPUpdate(isARM, MBB, I, dl, TII, -CalleePopAmount,
+ MachineInstr::NoFlags, Pred, PredReg);
}
return MBB.erase(I);
}
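
The rewritten pseudo elimination distinguishes three situations: no reserved call frame (ADJCALLSTACKDOWN/UP become explicit SP updates), a reserved frame whose callee pops its own arguments (SP must be re-grown by CalleePopAmount), and everything else (the pseudo is simply erased). A loose, hedged model of that decision, using plain integers for the SP deltas the real code emits through emitSPUpdate; alignment of Amount via alignSPAdjust is omitted:

#include <cstdio>

// Positive result = "add sp, #delta", negative = "sub sp, #-delta",
// zero = no SP update is emitted.
static int callFrameDelta(bool HasReservedCallFrame, bool IsDestroy,
                          int Amount, unsigned CalleePopAmount) {
  if (!HasReservedCallFrame) {
    if (IsDestroy && CalleePopAmount != -1U)
      return 0;                          // callee already did the adjustment
    return IsDestroy ? Amount : -Amount; // ADJCALLSTACKUP / ADJCALLSTACKDOWN
  }
  if (CalleePopAmount != -1U)
    return -(int)CalleePopAmount;        // re-reserve what the callee popped
  return 0;
}

int main() {
  std::printf("%d\n", callFrameDelta(false, true, 16, -1U)); // 16
  std::printf("%d\n", callFrameDelta(true, true, 16, 8));    // -8
}
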
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 2a9a31d..9c7055d 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -15,6 +15,7 @@
#include "ARMTargetMachine.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "Utils/ARMBaseInfo.h"
+#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -86,16 +87,20 @@
bool CheckProfitability = true);
bool SelectImmShifterOperand(SDValue N, SDValue &A,
SDValue &B, bool CheckProfitability = true);
- bool SelectShiftRegShifterOperand(SDValue N, SDValue &A,
- SDValue &B, SDValue &C) {
+ bool SelectShiftRegShifterOperand(SDValue N, SDValue &A, SDValue &B,
+ SDValue &C) {
// Don't apply the profitability check
return SelectRegShifterOperand(N, A, B, C, false);
}
- bool SelectShiftImmShifterOperand(SDValue N, SDValue &A,
- SDValue &B) {
+ bool SelectShiftImmShifterOperand(SDValue N, SDValue &A, SDValue &B) {
// Don't apply the profitability check
return SelectImmShifterOperand(N, A, B, false);
}
+ bool SelectShiftImmShifterOperandOneUse(SDValue N, SDValue &A, SDValue &B) {
+ if (!N.hasOneUse())
+ return false;
+ return SelectImmShifterOperand(N, A, B, false);
+ }
bool SelectAddLikeOr(SDNode *Parent, SDValue N, SDValue &Out);
@@ -191,6 +196,11 @@
bool tryT1IndexedLoad(SDNode *N);
bool tryT2IndexedLoad(SDNode *N);
bool tryMVEIndexedLoad(SDNode *N);
+ bool tryFMULFixed(SDNode *N, SDLoc dl);
+ bool tryFP_TO_INT(SDNode *N, SDLoc dl);
+ bool transformFixedFloatingPointConversion(SDNode *N, SDNode *FMul,
+ bool IsUnsigned,
+ bool FixedToFloat);
/// SelectVLD - Select NEON load intrinsics. NumVecs should be
/// 1, 2, 3 or 4. The opcode arrays specify the instructions used for
@@ -297,6 +307,8 @@
/// Try to select SBFX/UBFX instructions for ARM.
bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
+ bool tryInsertVectorElt(SDNode *N);
+
// Select special operations if node forms integer ABS pattern
bool tryABSOp(SDNode *N);
@@ -1939,7 +1951,13 @@
case ARM::VLD1d64Qwb_fixed : return true;
case ARM::VLD1d32wb_fixed : return true;
case ARM::VLD1d64wb_fixed : return true;
+ case ARM::VLD1d8TPseudoWB_fixed : return true;
+ case ARM::VLD1d16TPseudoWB_fixed : return true;
+ case ARM::VLD1d32TPseudoWB_fixed : return true;
case ARM::VLD1d64TPseudoWB_fixed : return true;
+ case ARM::VLD1d8QPseudoWB_fixed : return true;
+ case ARM::VLD1d16QPseudoWB_fixed : return true;
+ case ARM::VLD1d32QPseudoWB_fixed : return true;
case ARM::VLD1d64QPseudoWB_fixed : return true;
case ARM::VLD1q8wb_fixed : return true;
case ARM::VLD1q16wb_fixed : return true;
@@ -1960,6 +1978,9 @@
case ARM::VLD2DUPd8wb_fixed : return true;
case ARM::VLD2DUPd16wb_fixed : return true;
case ARM::VLD2DUPd32wb_fixed : return true;
+ case ARM::VLD2DUPq8OddPseudoWB_fixed: return true;
+ case ARM::VLD2DUPq16OddPseudoWB_fixed: return true;
+ case ARM::VLD2DUPq32OddPseudoWB_fixed: return true;
}
}
@@ -1975,7 +1996,13 @@
case ARM::VST1q16wb_fixed : return true;
case ARM::VST1q32wb_fixed : return true;
case ARM::VST1q64wb_fixed : return true;
+ case ARM::VST1d8TPseudoWB_fixed : return true;
+ case ARM::VST1d16TPseudoWB_fixed : return true;
+ case ARM::VST1d32TPseudoWB_fixed : return true;
case ARM::VST1d64TPseudoWB_fixed : return true;
+ case ARM::VST1d8QPseudoWB_fixed : return true;
+ case ARM::VST1d16QPseudoWB_fixed : return true;
+ case ARM::VST1d32QPseudoWB_fixed : return true;
case ARM::VST1d64QPseudoWB_fixed : return true;
case ARM::VST2d8wb_fixed : return true;
case ARM::VST2d16wb_fixed : return true;
@@ -2003,7 +2030,13 @@
case ARM::VLD1q64wb_fixed: return ARM::VLD1q64wb_register;
case ARM::VLD1d64Twb_fixed: return ARM::VLD1d64Twb_register;
case ARM::VLD1d64Qwb_fixed: return ARM::VLD1d64Qwb_register;
+ case ARM::VLD1d8TPseudoWB_fixed: return ARM::VLD1d8TPseudoWB_register;
+ case ARM::VLD1d16TPseudoWB_fixed: return ARM::VLD1d16TPseudoWB_register;
+ case ARM::VLD1d32TPseudoWB_fixed: return ARM::VLD1d32TPseudoWB_register;
case ARM::VLD1d64TPseudoWB_fixed: return ARM::VLD1d64TPseudoWB_register;
+ case ARM::VLD1d8QPseudoWB_fixed: return ARM::VLD1d8QPseudoWB_register;
+ case ARM::VLD1d16QPseudoWB_fixed: return ARM::VLD1d16QPseudoWB_register;
+ case ARM::VLD1d32QPseudoWB_fixed: return ARM::VLD1d32QPseudoWB_register;
case ARM::VLD1d64QPseudoWB_fixed: return ARM::VLD1d64QPseudoWB_register;
case ARM::VLD1DUPd8wb_fixed : return ARM::VLD1DUPd8wb_register;
case ARM::VLD1DUPd16wb_fixed : return ARM::VLD1DUPd16wb_register;
@@ -2011,6 +2044,9 @@
case ARM::VLD1DUPq8wb_fixed : return ARM::VLD1DUPq8wb_register;
case ARM::VLD1DUPq16wb_fixed : return ARM::VLD1DUPq16wb_register;
case ARM::VLD1DUPq32wb_fixed : return ARM::VLD1DUPq32wb_register;
+ case ARM::VLD2DUPq8OddPseudoWB_fixed: return ARM::VLD2DUPq8OddPseudoWB_register;
+ case ARM::VLD2DUPq16OddPseudoWB_fixed: return ARM::VLD2DUPq16OddPseudoWB_register;
+ case ARM::VLD2DUPq32OddPseudoWB_fixed: return ARM::VLD2DUPq32OddPseudoWB_register;
case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register;
case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register;
@@ -2020,7 +2056,13 @@
case ARM::VST1q16wb_fixed: return ARM::VST1q16wb_register;
case ARM::VST1q32wb_fixed: return ARM::VST1q32wb_register;
case ARM::VST1q64wb_fixed: return ARM::VST1q64wb_register;
+ case ARM::VST1d8TPseudoWB_fixed: return ARM::VST1d8TPseudoWB_register;
+ case ARM::VST1d16TPseudoWB_fixed: return ARM::VST1d16TPseudoWB_register;
+ case ARM::VST1d32TPseudoWB_fixed: return ARM::VST1d32TPseudoWB_register;
case ARM::VST1d64TPseudoWB_fixed: return ARM::VST1d64TPseudoWB_register;
+ case ARM::VST1d8QPseudoWB_fixed: return ARM::VST1d8QPseudoWB_register;
+ case ARM::VST1d16QPseudoWB_fixed: return ARM::VST1d16QPseudoWB_register;
+ case ARM::VST1d32QPseudoWB_fixed: return ARM::VST1d32QPseudoWB_register;
case ARM::VST1d64QPseudoWB_fixed: return ARM::VST1d64QPseudoWB_register;
case ARM::VLD2d8wb_fixed: return ARM::VLD2d8wb_register;
@@ -2547,6 +2589,7 @@
ReplaceUses(SDValue(N, 0), SDValue(New, 1));
ReplaceUses(SDValue(N, 1), SDValue(New, 0));
ReplaceUses(SDValue(N, 2), SDValue(New, 2));
+ transferMemOperands(N, New);
CurDAG->RemoveDeadNode(N);
}
@@ -2762,6 +2805,7 @@
CurDAG->getMachineNode(OurOpcodes[Stage], Loc, ResultTys, Ops);
Data = SDValue(LoadInst, 0);
Chain = SDValue(LoadInst, 1);
+ transferMemOperands(N, LoadInst);
}
// The last may need a writeback on it
if (HasWriteback)
@@ -2769,6 +2813,7 @@
SDValue Ops[] = {Data, N->getOperand(PtrOperand), Chain};
auto LoadInst =
CurDAG->getMachineNode(OurOpcodes[NumVecs - 1], Loc, ResultTys, Ops);
+ transferMemOperands(N, LoadInst);
unsigned i;
for (i = 0; i < NumVecs; i++)
@@ -2954,51 +2999,47 @@
SDValue Pred = getAL(CurDAG, dl);
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
- SDNode *VLdDup;
- if (is64BitVector || NumVecs == 1) {
- SmallVector<SDValue, 6> Ops;
- Ops.push_back(MemAddr);
- Ops.push_back(Align);
- unsigned Opc = is64BitVector ? DOpcodes[OpcodeIndex] :
- QOpcodes0[OpcodeIndex];
- if (isUpdating) {
- // fixed-stride update instructions don't have an explicit writeback
- // operand. It's implicit in the opcode itself.
- SDValue Inc = N->getOperand(2);
- bool IsImmUpdate =
- isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
- if (NumVecs <= 2 && !IsImmUpdate)
- Opc = getVLDSTRegisterUpdateOpcode(Opc);
- if (!IsImmUpdate)
- Ops.push_back(Inc);
- // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
- else if (NumVecs > 2)
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(MemAddr);
+ Ops.push_back(Align);
+ unsigned Opc = is64BitVector ? DOpcodes[OpcodeIndex]
+ : (NumVecs == 1) ? QOpcodes0[OpcodeIndex]
+ : QOpcodes1[OpcodeIndex];
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(2);
+ bool IsImmUpdate =
+ isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
+ if (IsImmUpdate) {
+ if (!isVLDfixed(Opc))
Ops.push_back(Reg0);
+ } else {
+ if (isVLDfixed(Opc))
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
+ Ops.push_back(Inc);
}
- Ops.push_back(Pred);
- Ops.push_back(Reg0);
- Ops.push_back(Chain);
- VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
- } else if (NumVecs == 2) {
- const SDValue OpsA[] = { MemAddr, Align, Pred, Reg0, Chain };
- SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
- dl, ResTys, OpsA);
-
- Chain = SDValue(VLdA, 1);
- const SDValue OpsB[] = { MemAddr, Align, Pred, Reg0, Chain };
- VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
- } else {
- SDValue ImplDef =
- SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
- const SDValue OpsA[] = { MemAddr, Align, ImplDef, Pred, Reg0, Chain };
- SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
- dl, ResTys, OpsA);
-
- SDValue SuperReg = SDValue(VLdA, 0);
- Chain = SDValue(VLdA, 1);
- const SDValue OpsB[] = { MemAddr, Align, SuperReg, Pred, Reg0, Chain };
- VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
}
+ if (is64BitVector || NumVecs == 1) {
+ // Double registers and VLD1 quad registers are directly supported.
+ } else if (NumVecs == 2) {
+ const SDValue OpsA[] = {MemAddr, Align, Pred, Reg0, Chain};
+ SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, ResTy,
+ MVT::Other, OpsA);
+ Chain = SDValue(VLdA, 1);
+ } else {
+ SDValue ImplDef = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
+ const SDValue OpsA[] = {MemAddr, Align, ImplDef, Pred, Reg0, Chain};
+ SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, ResTy,
+ MVT::Other, OpsA);
+ Ops.push_back(SDValue(VLdA, 0));
+ Chain = SDValue(VLdA, 1);
+ }
+
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+
+ SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
// Transfer memoperands.
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -3022,6 +3063,261 @@
CurDAG->RemoveDeadNode(N);
}
+bool ARMDAGToDAGISel::tryInsertVectorElt(SDNode *N) {
+ if (!Subtarget->hasMVEIntegerOps())
+ return false;
+
+ SDLoc dl(N);
+
+  // We are trying to use VMOV/VMOVX/VINS to more efficiently lower inserts
+  // and extracts of v8f16 and v8i16 vectors. Check that we have two adjacent
+ // inserts of the correct type:
+ SDValue Ins1 = SDValue(N, 0);
+ SDValue Ins2 = N->getOperand(0);
+ EVT VT = Ins1.getValueType();
+ if (Ins2.getOpcode() != ISD::INSERT_VECTOR_ELT || !Ins2.hasOneUse() ||
+ !isa<ConstantSDNode>(Ins1.getOperand(2)) ||
+ !isa<ConstantSDNode>(Ins2.getOperand(2)) ||
+ (VT != MVT::v8f16 && VT != MVT::v8i16) || (Ins2.getValueType() != VT))
+ return false;
+
+ unsigned Lane1 = Ins1.getConstantOperandVal(2);
+ unsigned Lane2 = Ins2.getConstantOperandVal(2);
+ if (Lane2 % 2 != 0 || Lane1 != Lane2 + 1)
+ return false;
+
+ // If the inserted values will be able to use T/B already, leave it to the
+  // existing tablegen patterns. For example, VCVTT/VCVTB.
+ SDValue Val1 = Ins1.getOperand(1);
+ SDValue Val2 = Ins2.getOperand(1);
+ if (Val1.getOpcode() == ISD::FP_ROUND || Val2.getOpcode() == ISD::FP_ROUND)
+ return false;
+
+ // Check if the inserted values are both extracts.
+ if ((Val1.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Val1.getOpcode() == ARMISD::VGETLANEu) &&
+ (Val2.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Val2.getOpcode() == ARMISD::VGETLANEu) &&
+ isa<ConstantSDNode>(Val1.getOperand(1)) &&
+ isa<ConstantSDNode>(Val2.getOperand(1)) &&
+ (Val1.getOperand(0).getValueType() == MVT::v8f16 ||
+ Val1.getOperand(0).getValueType() == MVT::v8i16) &&
+ (Val2.getOperand(0).getValueType() == MVT::v8f16 ||
+ Val2.getOperand(0).getValueType() == MVT::v8i16)) {
+ unsigned ExtractLane1 = Val1.getConstantOperandVal(1);
+ unsigned ExtractLane2 = Val2.getConstantOperandVal(1);
+
+ // If the two extracted lanes are from the same place and adjacent, this
+    // simplifies into an f32 lane move.
+ if (Val1.getOperand(0) == Val2.getOperand(0) && ExtractLane2 % 2 == 0 &&
+ ExtractLane1 == ExtractLane2 + 1) {
+ SDValue NewExt = CurDAG->getTargetExtractSubreg(
+ ARM::ssub_0 + ExtractLane2 / 2, dl, MVT::f32, Val1.getOperand(0));
+ SDValue NewIns = CurDAG->getTargetInsertSubreg(
+ ARM::ssub_0 + Lane2 / 2, dl, VT, Ins2.getOperand(0),
+ NewExt);
+ ReplaceUses(Ins1, NewIns);
+ return true;
+ }
+
+    // Otherwise, for v8i16, use a pattern of an extract and an insert, with an
+    // optional vmovx for extracting odd lanes.
+ if (VT == MVT::v8i16) {
+ SDValue Inp1 = CurDAG->getTargetExtractSubreg(
+ ARM::ssub_0 + ExtractLane1 / 2, dl, MVT::f32, Val1.getOperand(0));
+ SDValue Inp2 = CurDAG->getTargetExtractSubreg(
+ ARM::ssub_0 + ExtractLane2 / 2, dl, MVT::f32, Val2.getOperand(0));
+ if (ExtractLane1 % 2 != 0)
+ Inp1 = SDValue(CurDAG->getMachineNode(ARM::VMOVH, dl, MVT::f32, Inp1), 0);
+ if (ExtractLane2 % 2 != 0)
+ Inp2 = SDValue(CurDAG->getMachineNode(ARM::VMOVH, dl, MVT::f32, Inp2), 0);
+ SDNode *VINS = CurDAG->getMachineNode(ARM::VINSH, dl, MVT::f32, Inp2, Inp1);
+ SDValue NewIns =
+ CurDAG->getTargetInsertSubreg(ARM::ssub_0 + Lane2 / 2, dl, MVT::v4f32,
+ Ins2.getOperand(0), SDValue(VINS, 0));
+ ReplaceUses(Ins1, NewIns);
+ return true;
+ }
+ }
+
+  // The inserted values are not extracted from another vector. If they are
+  // f16, insert them directly using a VINS.
+ if (VT == MVT::v8f16) {
+ SDNode *VINS = CurDAG->getMachineNode(ARM::VINSH, dl, MVT::f32, Val2, Val1);
+ SDValue NewIns =
+ CurDAG->getTargetInsertSubreg(ARM::ssub_0 + Lane2 / 2, dl, MVT::v4f32,
+ Ins2.getOperand(0), SDValue(VINS, 0));
+ ReplaceUses(Ins1, NewIns);
+ return true;
+ }
+
+ return false;
+}
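
The combine above leans on the Q-register sub-register layout: 16-bit lanes 2k and 2k+1 of a v8f16/v8i16 value occupy the 32-bit S sub-register k, so a pair of adjacent inserts collapses into one f32 lane move (plus VMOVX/VINS when odd lanes are involved). A small standalone sketch of the lane-pairing arithmetic, with illustrative helper names:

#include <cassert>
#include <cstdio>

// For a v8f16/v8i16 value in a Q register, 16-bit lanes (2k, 2k+1) occupy the
// 32-bit sub-register ssub_k. The combine only fires for such a pair, where
// Lane1 is the outer insert's lane and Lane2 the inner one.
static bool isAdjacentPair(unsigned Lane1, unsigned Lane2) {
  return Lane2 % 2 == 0 && Lane1 == Lane2 + 1;
}
static unsigned subRegIndexForLane(unsigned EvenLane) {
  assert(EvenLane % 2 == 0 && "expected the even lane of a pair");
  return EvenLane / 2; // ARM::ssub_0 + EvenLane / 2 in the real code
}

int main() {
  std::printf("%d\n", isAdjacentPair(5, 4));  // 1: lanes 4 and 5 pair up
  std::printf("%u\n", subRegIndexForLane(4)); // 2: the pair maps to ssub_2
  std::printf("%d\n", isAdjacentPair(4, 5));  // 0: wrong order, no combine
}
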
+
+bool ARMDAGToDAGISel::transformFixedFloatingPointConversion(SDNode *N,
+ SDNode *FMul,
+ bool IsUnsigned,
+ bool FixedToFloat) {
+ auto Type = N->getValueType(0);
+ unsigned ScalarBits = Type.getScalarSizeInBits();
+ if (ScalarBits > 32)
+ return false;
+
+ SDNodeFlags FMulFlags = FMul->getFlags();
+  // The fixed-point vcvt and vcvt+vmul are not always equivalent if inf is
+  // allowed in 16-bit unsigned floats.
+ if (ScalarBits == 16 && !FMulFlags.hasNoInfs() && IsUnsigned)
+ return false;
+
+ SDValue ImmNode = FMul->getOperand(1);
+ SDValue VecVal = FMul->getOperand(0);
+ if (VecVal->getOpcode() == ISD::UINT_TO_FP ||
+ VecVal->getOpcode() == ISD::SINT_TO_FP)
+ VecVal = VecVal->getOperand(0);
+
+ if (VecVal.getValueType().getScalarSizeInBits() != ScalarBits)
+ return false;
+
+ if (ImmNode.getOpcode() == ISD::BITCAST) {
+ if (ImmNode.getValueType().getScalarSizeInBits() != ScalarBits)
+ return false;
+ ImmNode = ImmNode.getOperand(0);
+ }
+
+ if (ImmNode.getValueType().getScalarSizeInBits() != ScalarBits)
+ return false;
+
+ APFloat ImmAPF(0.0f);
+ switch (ImmNode.getOpcode()) {
+ case ARMISD::VMOVIMM:
+ case ARMISD::VDUP: {
+ if (!isa<ConstantSDNode>(ImmNode.getOperand(0)))
+ return false;
+ unsigned Imm = ImmNode.getConstantOperandVal(0);
+ if (ImmNode.getOpcode() == ARMISD::VMOVIMM)
+ Imm = ARM_AM::decodeVMOVModImm(Imm, ScalarBits);
+ ImmAPF =
+ APFloat(ScalarBits == 32 ? APFloat::IEEEsingle() : APFloat::IEEEhalf(),
+ APInt(ScalarBits, Imm));
+ break;
+ }
+ case ARMISD::VMOVFPIMM: {
+ ImmAPF = APFloat(ARM_AM::getFPImmFloat(ImmNode.getConstantOperandVal(0)));
+ break;
+ }
+ default:
+ return false;
+ }
+
+  // Where n is the number of fractional bits, multiplying by 2^n converts from
+  // float to fixed and multiplying by 2^-n converts from fixed to float.
+  // Taking log2 of the factor (after taking the inverse in the fixed-to-float
+  // case, where the factor is 2^-n) gives n.
+ APFloat ToConvert = ImmAPF;
+ if (FixedToFloat) {
+ if (!ImmAPF.getExactInverse(&ToConvert))
+ return false;
+ }
+ APSInt Converted(64, 0);
+ bool IsExact;
+ ToConvert.convertToInteger(Converted, llvm::RoundingMode::NearestTiesToEven,
+ &IsExact);
+ if (!IsExact || !Converted.isPowerOf2())
+ return false;
+
+ unsigned FracBits = Converted.logBase2();
+ if (FracBits > ScalarBits)
+ return false;
+
+ SmallVector<SDValue, 3> Ops{
+ VecVal, CurDAG->getConstant(FracBits, SDLoc(N), MVT::i32)};
+ AddEmptyMVEPredicateToOps(Ops, SDLoc(N), Type);
+
+ unsigned int Opcode;
+ switch (ScalarBits) {
+ case 16:
+ if (FixedToFloat)
+ Opcode = IsUnsigned ? ARM::MVE_VCVTf16u16_fix : ARM::MVE_VCVTf16s16_fix;
+ else
+ Opcode = IsUnsigned ? ARM::MVE_VCVTu16f16_fix : ARM::MVE_VCVTs16f16_fix;
+ break;
+ case 32:
+ if (FixedToFloat)
+ Opcode = IsUnsigned ? ARM::MVE_VCVTf32u32_fix : ARM::MVE_VCVTf32s32_fix;
+ else
+ Opcode = IsUnsigned ? ARM::MVE_VCVTu32f32_fix : ARM::MVE_VCVTs32f32_fix;
+ break;
+ default:
+ llvm_unreachable("unexpected number of scalar bits");
+ break;
+ }
+
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), Type, Ops));
+ return true;
+}
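
The constant check above reduces to: the multiplier must be an exact power of two, 2^n for float-to-fixed or 2^-n for fixed-to-float, with n no larger than the scalar width. A standalone sketch of that check using plain doubles; the real code performs it with APFloat/APSInt, and the helper name here is illustrative:

#include <cmath>
#include <cstdio>
#include <optional>

// Returns the number of fractional bits implied by the multiplier, or nullopt
// if it is not usable for a fixed-point VCVT.
static std::optional<unsigned>
fracBitsFromMultiplier(double Factor, bool FixedToFloat, unsigned ScalarBits) {
  // Fixed-to-float multiplies by 2^-n, so invert before taking log2.
  double ToConvert = FixedToFloat ? 1.0 / Factor : Factor;
  double Log = std::log2(ToConvert);
  if (Log < 0 || Log != std::floor(Log)) // must be an exact power of two
    return std::nullopt;
  unsigned FracBits = (unsigned)Log;
  if (FracBits > ScalarBits)
    return std::nullopt;
  return FracBits;
}

int main() {
  // float -> fixed: fptosi(x * 256.0) is an 8-fractional-bit VCVT.
  if (auto N = fracBitsFromMultiplier(256.0, /*FixedToFloat=*/false, 32))
    std::printf("float->fixed, n = %u\n", *N); // 8
  // fixed -> float: sitofp(x) * 0.0625 is a 4-fractional-bit VCVT.
  if (auto N = fracBitsFromMultiplier(0.0625, /*FixedToFloat=*/true, 16))
    std::printf("fixed->float, n = %u\n", *N); // 4
}
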
+
+bool ARMDAGToDAGISel::tryFP_TO_INT(SDNode *N, SDLoc dl) {
+ // Transform a floating-point to fixed-point conversion to a VCVT
+ if (!Subtarget->hasMVEFloatOps())
+ return false;
+ EVT Type = N->getValueType(0);
+ if (!Type.isVector())
+ return false;
+ unsigned int ScalarBits = Type.getScalarSizeInBits();
+
+ bool IsUnsigned = N->getOpcode() == ISD::FP_TO_UINT;
+ SDNode *Node = N->getOperand(0).getNode();
+
+  // A float-to-fixed conversion with one fractional bit gets turned into an
+  // FP_TO_[U|S]INT(FADD(x, x)) rather than an FP_TO_[U|S]INT(FMUL(x, y)).
+ if (Node->getOpcode() == ISD::FADD) {
+ if (Node->getOperand(0) != Node->getOperand(1))
+ return false;
+ SDNodeFlags Flags = Node->getFlags();
+    // The fixed-point vcvt and vcvt+vmul are not always equivalent if inf is
+    // allowed in 16-bit unsigned floats.
+ if (ScalarBits == 16 && !Flags.hasNoInfs() && IsUnsigned)
+ return false;
+
+ unsigned Opcode;
+ switch (ScalarBits) {
+ case 16:
+ Opcode = IsUnsigned ? ARM::MVE_VCVTu16f16_fix : ARM::MVE_VCVTs16f16_fix;
+ break;
+ case 32:
+ Opcode = IsUnsigned ? ARM::MVE_VCVTu32f32_fix : ARM::MVE_VCVTs32f32_fix;
+ break;
+ }
+ SmallVector<SDValue, 3> Ops{Node->getOperand(0),
+ CurDAG->getConstant(1, dl, MVT::i32)};
+ AddEmptyMVEPredicateToOps(Ops, dl, Type);
+
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, dl, Type, Ops));
+ return true;
+ }
+
+ if (Node->getOpcode() != ISD::FMUL)
+ return false;
+
+ return transformFixedFloatingPointConversion(N, Node, IsUnsigned, false);
+}
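
The FADD special case exists because earlier combines rewrite a multiply by 2.0 as x + x, so the one-fractional-bit conversion never arrives as an FMUL. A trivial illustration of the equivalence in plain C++ (not LLVM IR):

#include <cstdio>

int main() {
  float x = 3.3f;
  // A 1-fractional-bit float->fixed conversion is fptoui(x * 2^1); earlier
  // combines rewrite the multiply as an add, which is the shape matched above.
  unsigned viaMul = (unsigned)(x * 2.0f);
  unsigned viaAdd = (unsigned)(x + x);
  std::printf("%u %u\n", viaMul, viaAdd); // 6 6
}
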
+
+bool ARMDAGToDAGISel::tryFMULFixed(SDNode *N, SDLoc dl) {
+ // Transform a fixed-point to floating-point conversion to a VCVT
+ if (!Subtarget->hasMVEFloatOps())
+ return false;
+ auto Type = N->getValueType(0);
+ if (!Type.isVector())
+ return false;
+
+ auto LHS = N->getOperand(0);
+ if (LHS.getOpcode() != ISD::SINT_TO_FP && LHS.getOpcode() != ISD::UINT_TO_FP)
+ return false;
+
+ return transformFixedFloatingPointConversion(
+ N, N, LHS.getOpcode() == ISD::UINT_TO_FP, true);
+}
+
bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
if (!Subtarget->hasV6T2Ops())
return false;
@@ -3203,9 +3499,9 @@
unsigned Opcode;
EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
if (MemTy == MVT::i8)
- Opcode = ARM::CMP_SWAP_8;
+ Opcode = Subtarget->isThumb() ? ARM::tCMP_SWAP_8 : ARM::CMP_SWAP_8;
else if (MemTy == MVT::i16)
- Opcode = ARM::CMP_SWAP_16;
+ Opcode = Subtarget->isThumb() ? ARM::tCMP_SWAP_16 : ARM::CMP_SWAP_16;
else if (MemTy == MVT::i32)
Opcode = ARM::CMP_SWAP_32;
else
@@ -3443,6 +3739,11 @@
return;
}
}
+ case ISD::INSERT_VECTOR_ELT: {
+ if (tryInsertVectorElt(N))
+ return;
+ break;
+ }
case ISD::SRL:
if (tryV6T2BitfieldExtractOp(N, false))
return;
@@ -3452,6 +3753,15 @@
if (tryV6T2BitfieldExtractOp(N, true))
return;
break;
+ case ISD::FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ if (tryFP_TO_INT(N, dl))
+ return;
+ break;
+ case ISD::FMUL:
+ if (tryFMULFixed(N, dl))
+ return;
+ break;
case ISD::MUL:
if (Subtarget->isThumb1Only())
break;
@@ -3680,13 +3990,26 @@
return;
// Other cases are autogenerated.
break;
- case ARMISD::WLS:
+ case ARMISD::WLSSETUP: {
+ SDNode *New = CurDAG->getMachineNode(ARM::t2WhileLoopSetup, dl, MVT::i32,
+ N->getOperand(0));
+ ReplaceUses(N, New);
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
+ case ARMISD::WLS: {
+ SDNode *New = CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl, MVT::Other,
+ N->getOperand(1), N->getOperand(2),
+ N->getOperand(0));
+ ReplaceUses(N, New);
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
case ARMISD::LE: {
SDValue Ops[] = { N->getOperand(1),
N->getOperand(2),
N->getOperand(0) };
- unsigned Opc = N->getOpcode() == ARMISD::WLS ?
- ARM::t2WhileLoopStart : ARM::t2LoopEnd;
+ unsigned Opc = ARM::t2LoopEnd;
SDNode *New = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
ReplaceUses(N, New);
CurDAG->RemoveDeadNode(N);
@@ -4050,26 +4373,47 @@
}
case ARMISD::VLD2DUP_UPD: {
- static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
- ARM::VLD2DUPd16wb_fixed,
- ARM::VLD2DUPd32wb_fixed };
- SelectVLDDup(N, /* IsIntrinsic= */ false, true, 2, Opcodes);
+ static const uint16_t DOpcodes[] = { ARM::VLD2DUPd8wb_fixed,
+ ARM::VLD2DUPd16wb_fixed,
+ ARM::VLD2DUPd32wb_fixed,
+ ARM::VLD1q64wb_fixed };
+ static const uint16_t QOpcodes0[] = { ARM::VLD2DUPq8EvenPseudo,
+ ARM::VLD2DUPq16EvenPseudo,
+ ARM::VLD2DUPq32EvenPseudo };
+ static const uint16_t QOpcodes1[] = { ARM::VLD2DUPq8OddPseudoWB_fixed,
+ ARM::VLD2DUPq16OddPseudoWB_fixed,
+ ARM::VLD2DUPq32OddPseudoWB_fixed };
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 2, DOpcodes, QOpcodes0,
+                 QOpcodes1);
return;
}
case ARMISD::VLD3DUP_UPD: {
- static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
- ARM::VLD3DUPd16Pseudo_UPD,
- ARM::VLD3DUPd32Pseudo_UPD };
- SelectVLDDup(N, /* IsIntrinsic= */ false, true, 3, Opcodes);
+ static const uint16_t DOpcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
+ ARM::VLD3DUPd16Pseudo_UPD,
+ ARM::VLD3DUPd32Pseudo_UPD,
+ ARM::VLD1d64TPseudoWB_fixed };
+ static const uint16_t QOpcodes0[] = { ARM::VLD3DUPq8EvenPseudo,
+ ARM::VLD3DUPq16EvenPseudo,
+ ARM::VLD3DUPq32EvenPseudo };
+ static const uint16_t QOpcodes1[] = { ARM::VLD3DUPq8OddPseudo_UPD,
+ ARM::VLD3DUPq16OddPseudo_UPD,
+ ARM::VLD3DUPq32OddPseudo_UPD };
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 3, DOpcodes, QOpcodes0,
+                 QOpcodes1);
return;
}
case ARMISD::VLD4DUP_UPD: {
- static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
- ARM::VLD4DUPd16Pseudo_UPD,
- ARM::VLD4DUPd32Pseudo_UPD };
- SelectVLDDup(N, /* IsIntrinsic= */ false, true, 4, Opcodes);
+ static const uint16_t DOpcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
+ ARM::VLD4DUPd16Pseudo_UPD,
+ ARM::VLD4DUPd32Pseudo_UPD,
+ ARM::VLD1d64QPseudoWB_fixed };
+ static const uint16_t QOpcodes0[] = { ARM::VLD4DUPq8EvenPseudo,
+ ARM::VLD4DUPq16EvenPseudo,
+ ARM::VLD4DUPq32EvenPseudo };
+ static const uint16_t QOpcodes1[] = { ARM::VLD4DUPq8OddPseudo_UPD,
+ ARM::VLD4DUPq16OddPseudo_UPD,
+ ARM::VLD4DUPq32OddPseudo_UPD };
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 4, DOpcodes, QOpcodes0,
+                 QOpcodes1);
return;
}
@@ -4151,6 +4495,54 @@
return;
}
+ case ARMISD::VLD1x2_UPD: {
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = {
+ ARM::VLD1q8wb_fixed, ARM::VLD1q16wb_fixed, ARM::VLD1q32wb_fixed,
+ ARM::VLD1q64wb_fixed};
+ static const uint16_t QOpcodes[] = {
+ ARM::VLD1d8QPseudoWB_fixed, ARM::VLD1d16QPseudoWB_fixed,
+ ARM::VLD1d32QPseudoWB_fixed, ARM::VLD1d64QPseudoWB_fixed};
+ SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+ break;
+ }
+
+ case ARMISD::VLD1x3_UPD: {
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = {
+ ARM::VLD1d8TPseudoWB_fixed, ARM::VLD1d16TPseudoWB_fixed,
+ ARM::VLD1d32TPseudoWB_fixed, ARM::VLD1d64TPseudoWB_fixed};
+ static const uint16_t QOpcodes0[] = {
+ ARM::VLD1q8LowTPseudo_UPD, ARM::VLD1q16LowTPseudo_UPD,
+ ARM::VLD1q32LowTPseudo_UPD, ARM::VLD1q64LowTPseudo_UPD};
+ static const uint16_t QOpcodes1[] = {
+ ARM::VLD1q8HighTPseudo_UPD, ARM::VLD1q16HighTPseudo_UPD,
+ ARM::VLD1q32HighTPseudo_UPD, ARM::VLD1q64HighTPseudo_UPD};
+ SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+ break;
+ }
+
+ case ARMISD::VLD1x4_UPD: {
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = {
+ ARM::VLD1d8QPseudoWB_fixed, ARM::VLD1d16QPseudoWB_fixed,
+ ARM::VLD1d32QPseudoWB_fixed, ARM::VLD1d64QPseudoWB_fixed};
+ static const uint16_t QOpcodes0[] = {
+ ARM::VLD1q8LowQPseudo_UPD, ARM::VLD1q16LowQPseudo_UPD,
+ ARM::VLD1q32LowQPseudo_UPD, ARM::VLD1q64LowQPseudo_UPD};
+ static const uint16_t QOpcodes1[] = {
+ ARM::VLD1q8HighQPseudo_UPD, ARM::VLD1q16HighQPseudo_UPD,
+ ARM::VLD1q32HighQPseudo_UPD, ARM::VLD1q64HighQPseudo_UPD};
+ SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+ break;
+ }
+
case ARMISD::VLD2LN_UPD: {
static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD,
ARM::VLD2LNd16Pseudo_UPD,
@@ -4240,6 +4632,61 @@
break;
}
+ case ARMISD::VST1x2_UPD: {
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = { ARM::VST1q8wb_fixed,
+ ARM::VST1q16wb_fixed,
+ ARM::VST1q32wb_fixed,
+ ARM::VST1q64wb_fixed};
+ static const uint16_t QOpcodes[] = { ARM::VST1d8QPseudoWB_fixed,
+ ARM::VST1d16QPseudoWB_fixed,
+ ARM::VST1d32QPseudoWB_fixed,
+ ARM::VST1d64QPseudoWB_fixed };
+ SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+ break;
+ }
+
+ case ARMISD::VST1x3_UPD: {
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = { ARM::VST1d8TPseudoWB_fixed,
+ ARM::VST1d16TPseudoWB_fixed,
+ ARM::VST1d32TPseudoWB_fixed,
+ ARM::VST1d64TPseudoWB_fixed };
+ static const uint16_t QOpcodes0[] = { ARM::VST1q8LowTPseudo_UPD,
+ ARM::VST1q16LowTPseudo_UPD,
+ ARM::VST1q32LowTPseudo_UPD,
+ ARM::VST1q64LowTPseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST1q8HighTPseudo_UPD,
+ ARM::VST1q16HighTPseudo_UPD,
+ ARM::VST1q32HighTPseudo_UPD,
+ ARM::VST1q64HighTPseudo_UPD };
+ SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+ break;
+ }
+
+ case ARMISD::VST1x4_UPD: {
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = { ARM::VST1d8QPseudoWB_fixed,
+ ARM::VST1d16QPseudoWB_fixed,
+ ARM::VST1d32QPseudoWB_fixed,
+ ARM::VST1d64QPseudoWB_fixed };
+ static const uint16_t QOpcodes0[] = { ARM::VST1q8LowQPseudo_UPD,
+ ARM::VST1q16LowQPseudo_UPD,
+ ARM::VST1q32LowQPseudo_UPD,
+ ARM::VST1q64LowQPseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST1q8HighQPseudo_UPD,
+ ARM::VST1q16HighQPseudo_UPD,
+ ARM::VST1q32HighQPseudo_UPD,
+ ARM::VST1q64HighQPseudo_UPD };
+ SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+ break;
+ }
case ARMISD::VST2LN_UPD: {
static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD,
ARM::VST2LNd16Pseudo_UPD,
@@ -4946,6 +5393,7 @@
assert(AllIntFields &&
"Unexpected non-integer value in special register string.");
+ (void)AllIntFields;
}
}
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 5980626..9001132 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21,6 +21,7 @@
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
+#include "ARMTargetTransformInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
@@ -153,8 +154,7 @@
ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
-void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
- MVT PromotedBitwiseVT) {
+void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
if (VT != PromotedLdStVT) {
setOperationAction(ISD::LOAD, VT, Promote);
AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
@@ -193,16 +193,6 @@
setOperationAction(ISD::SRL, VT, Custom);
}
- // Promote all bit-wise operations.
- if (VT.isInteger() && VT != PromotedBitwiseVT) {
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
- }
-
// Neon does not support vector divide/remainder operations.
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
@@ -224,12 +214,12 @@
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
addRegisterClass(VT, &ARM::DPRRegClass);
- addTypeForNEON(VT, MVT::f64, MVT::v2i32);
+ addTypeForNEON(VT, MVT::f64);
}
void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
addRegisterClass(VT, &ARM::DPairRegClass);
- addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
+ addTypeForNEON(VT, MVT::v2f64);
}
void ARMTargetLowering::setAllExpand(MVT VT) {
@@ -280,6 +270,8 @@
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
+ setOperationAction(ISD::ABDS, VT, Legal);
+ setOperationAction(ISD::ABDU, VT, Legal);
// No native support for these.
setOperationAction(ISD::UDIV, VT, Expand);
@@ -396,6 +388,8 @@
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
}
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
+
// We can do bitwise operations on v2i64 vectors
setOperationAction(ISD::AND, MVT::v2i64, Legal);
setOperationAction(ISD::OR, MVT::v2i64, Legal);
@@ -447,6 +441,14 @@
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
}
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
}
ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
@@ -541,6 +543,7 @@
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
+ setLibcallName(RTLIB::MUL_I128, nullptr);
// RTLIB
if (Subtarget->isAAPCS_ABI() &&
@@ -766,9 +769,7 @@
addAllExtLoads(VT, InnerVT, Expand);
}
- setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
- setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
@@ -947,6 +948,11 @@
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ }
+
// NEON only has FMA instructions as of VFP4.
if (!Subtarget->hasVFP4Base()) {
setOperationAction(ISD::FMA, MVT::v2f32, Expand);
@@ -977,6 +983,7 @@
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
@@ -1103,6 +1110,10 @@
setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::i8, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::i8, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::i16, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::i16, Custom);
}
if (Subtarget->hasBaseDSP()) {
setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
@@ -1340,6 +1351,7 @@
// iff target supports vfp2.
setOperationAction(ISD::BITCAST, MVT::i64, Custom);
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+ setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
}
// We want to custom lower some of our intrinsics.
@@ -1604,209 +1616,215 @@
}
const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
+#define MAKE_CASE(V) \
+ case V: \
+ return #V;
switch ((ARMISD::NodeType)Opcode) {
- case ARMISD::FIRST_NUMBER: break;
- case ARMISD::Wrapper: return "ARMISD::Wrapper";
- case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC";
- case ARMISD::WrapperJT: return "ARMISD::WrapperJT";
- case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
- case ARMISD::CALL: return "ARMISD::CALL";
- case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
- case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
- case ARMISD::tSECALL: return "ARMISD::tSECALL";
- case ARMISD::BRCOND: return "ARMISD::BRCOND";
- case ARMISD::BR_JT: return "ARMISD::BR_JT";
- case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
- case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
- case ARMISD::SERET_FLAG: return "ARMISD::SERET_FLAG";
- case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG";
- case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
- case ARMISD::CMP: return "ARMISD::CMP";
- case ARMISD::CMN: return "ARMISD::CMN";
- case ARMISD::CMPZ: return "ARMISD::CMPZ";
- case ARMISD::CMPFP: return "ARMISD::CMPFP";
- case ARMISD::CMPFPE: return "ARMISD::CMPFPE";
- case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0";
- case ARMISD::CMPFPEw0: return "ARMISD::CMPFPEw0";
- case ARMISD::BCC_i64: return "ARMISD::BCC_i64";
- case ARMISD::FMSTAT: return "ARMISD::FMSTAT";
-
- case ARMISD::CMOV: return "ARMISD::CMOV";
- case ARMISD::SUBS: return "ARMISD::SUBS";
-
- case ARMISD::SSAT: return "ARMISD::SSAT";
- case ARMISD::USAT: return "ARMISD::USAT";
-
- case ARMISD::ASRL: return "ARMISD::ASRL";
- case ARMISD::LSRL: return "ARMISD::LSRL";
- case ARMISD::LSLL: return "ARMISD::LSLL";
-
- case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
- case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
- case ARMISD::RRX: return "ARMISD::RRX";
-
- case ARMISD::ADDC: return "ARMISD::ADDC";
- case ARMISD::ADDE: return "ARMISD::ADDE";
- case ARMISD::SUBC: return "ARMISD::SUBC";
- case ARMISD::SUBE: return "ARMISD::SUBE";
- case ARMISD::LSLS: return "ARMISD::LSLS";
-
- case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
- case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
- case ARMISD::VMOVhr: return "ARMISD::VMOVhr";
- case ARMISD::VMOVrh: return "ARMISD::VMOVrh";
- case ARMISD::VMOVSR: return "ARMISD::VMOVSR";
-
- case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
- case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
- case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
-
- case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN";
-
- case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
-
- case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC";
-
- case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
-
- case ARMISD::PRELOAD: return "ARMISD::PRELOAD";
-
- case ARMISD::LDRD: return "ARMISD::LDRD";
- case ARMISD::STRD: return "ARMISD::STRD";
-
- case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
- case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";
-
- case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
- case ARMISD::VECTOR_REG_CAST: return "ARMISD::VECTOR_REG_CAST";
- case ARMISD::VCMP: return "ARMISD::VCMP";
- case ARMISD::VCMPZ: return "ARMISD::VCMPZ";
- case ARMISD::VTST: return "ARMISD::VTST";
-
- case ARMISD::VSHLs: return "ARMISD::VSHLs";
- case ARMISD::VSHLu: return "ARMISD::VSHLu";
- case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM";
- case ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM";
- case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM";
- case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM";
- case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM";
- case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM";
- case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM";
- case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM";
- case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM";
- case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM";
- case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM";
- case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM";
- case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM";
- case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM";
- case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM";
- case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM";
- case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM";
- case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu";
- case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs";
- case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM";
- case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM";
- case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM";
- case ARMISD::VDUP: return "ARMISD::VDUP";
- case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE";
- case ARMISD::VEXT: return "ARMISD::VEXT";
- case ARMISD::VREV64: return "ARMISD::VREV64";
- case ARMISD::VREV32: return "ARMISD::VREV32";
- case ARMISD::VREV16: return "ARMISD::VREV16";
- case ARMISD::VZIP: return "ARMISD::VZIP";
- case ARMISD::VUZP: return "ARMISD::VUZP";
- case ARMISD::VTRN: return "ARMISD::VTRN";
- case ARMISD::VTBL1: return "ARMISD::VTBL1";
- case ARMISD::VTBL2: return "ARMISD::VTBL2";
- case ARMISD::VMOVN: return "ARMISD::VMOVN";
- case ARMISD::VQMOVNs: return "ARMISD::VQMOVNs";
- case ARMISD::VQMOVNu: return "ARMISD::VQMOVNu";
- case ARMISD::VCVTN: return "ARMISD::VCVTN";
- case ARMISD::VCVTL: return "ARMISD::VCVTL";
- case ARMISD::VMULLs: return "ARMISD::VMULLs";
- case ARMISD::VMULLu: return "ARMISD::VMULLu";
- case ARMISD::VQDMULH: return "ARMISD::VQDMULH";
- case ARMISD::VADDVs: return "ARMISD::VADDVs";
- case ARMISD::VADDVu: return "ARMISD::VADDVu";
- case ARMISD::VADDVps: return "ARMISD::VADDVps";
- case ARMISD::VADDVpu: return "ARMISD::VADDVpu";
- case ARMISD::VADDLVs: return "ARMISD::VADDLVs";
- case ARMISD::VADDLVu: return "ARMISD::VADDLVu";
- case ARMISD::VADDLVAs: return "ARMISD::VADDLVAs";
- case ARMISD::VADDLVAu: return "ARMISD::VADDLVAu";
- case ARMISD::VADDLVps: return "ARMISD::VADDLVps";
- case ARMISD::VADDLVpu: return "ARMISD::VADDLVpu";
- case ARMISD::VADDLVAps: return "ARMISD::VADDLVAps";
- case ARMISD::VADDLVApu: return "ARMISD::VADDLVApu";
- case ARMISD::VMLAVs: return "ARMISD::VMLAVs";
- case ARMISD::VMLAVu: return "ARMISD::VMLAVu";
- case ARMISD::VMLAVps: return "ARMISD::VMLAVps";
- case ARMISD::VMLAVpu: return "ARMISD::VMLAVpu";
- case ARMISD::VMLALVs: return "ARMISD::VMLALVs";
- case ARMISD::VMLALVu: return "ARMISD::VMLALVu";
- case ARMISD::VMLALVps: return "ARMISD::VMLALVps";
- case ARMISD::VMLALVpu: return "ARMISD::VMLALVpu";
- case ARMISD::VMLALVAs: return "ARMISD::VMLALVAs";
- case ARMISD::VMLALVAu: return "ARMISD::VMLALVAu";
- case ARMISD::VMLALVAps: return "ARMISD::VMLALVAps";
- case ARMISD::VMLALVApu: return "ARMISD::VMLALVApu";
- case ARMISD::VMINVu: return "ARMISD::VMINVu";
- case ARMISD::VMINVs: return "ARMISD::VMINVs";
- case ARMISD::VMAXVu: return "ARMISD::VMAXVu";
- case ARMISD::VMAXVs: return "ARMISD::VMAXVs";
- case ARMISD::UMAAL: return "ARMISD::UMAAL";
- case ARMISD::UMLAL: return "ARMISD::UMLAL";
- case ARMISD::SMLAL: return "ARMISD::SMLAL";
- case ARMISD::SMLALBB: return "ARMISD::SMLALBB";
- case ARMISD::SMLALBT: return "ARMISD::SMLALBT";
- case ARMISD::SMLALTB: return "ARMISD::SMLALTB";
- case ARMISD::SMLALTT: return "ARMISD::SMLALTT";
- case ARMISD::SMULWB: return "ARMISD::SMULWB";
- case ARMISD::SMULWT: return "ARMISD::SMULWT";
- case ARMISD::SMLALD: return "ARMISD::SMLALD";
- case ARMISD::SMLALDX: return "ARMISD::SMLALDX";
- case ARMISD::SMLSLD: return "ARMISD::SMLSLD";
- case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX";
- case ARMISD::SMMLAR: return "ARMISD::SMMLAR";
- case ARMISD::SMMLSR: return "ARMISD::SMMLSR";
- case ARMISD::QADD16b: return "ARMISD::QADD16b";
- case ARMISD::QSUB16b: return "ARMISD::QSUB16b";
- case ARMISD::QADD8b: return "ARMISD::QADD8b";
- case ARMISD::QSUB8b: return "ARMISD::QSUB8b";
- case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
- case ARMISD::BFI: return "ARMISD::BFI";
- case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
- case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
- case ARMISD::VBSP: return "ARMISD::VBSP";
- case ARMISD::MEMCPY: return "ARMISD::MEMCPY";
- case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP";
- case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
- case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
- case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
- case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD";
- case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD";
- case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD";
- case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD";
- case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD";
- case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD";
- case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD";
- case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD";
- case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD";
- case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD";
- case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD";
- case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD";
- case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD";
- case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD";
- case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD";
- case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
- case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
- case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
- case ARMISD::WLS: return "ARMISD::WLS";
- case ARMISD::LE: return "ARMISD::LE";
- case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC";
- case ARMISD::CSINV: return "ARMISD::CSINV";
- case ARMISD::CSNEG: return "ARMISD::CSNEG";
- case ARMISD::CSINC: return "ARMISD::CSINC";
+ case ARMISD::FIRST_NUMBER:
+ break;
+ MAKE_CASE(ARMISD::Wrapper)
+ MAKE_CASE(ARMISD::WrapperPIC)
+ MAKE_CASE(ARMISD::WrapperJT)
+ MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL)
+ MAKE_CASE(ARMISD::CALL)
+ MAKE_CASE(ARMISD::CALL_PRED)
+ MAKE_CASE(ARMISD::CALL_NOLINK)
+ MAKE_CASE(ARMISD::tSECALL)
+ MAKE_CASE(ARMISD::BRCOND)
+ MAKE_CASE(ARMISD::BR_JT)
+ MAKE_CASE(ARMISD::BR2_JT)
+ MAKE_CASE(ARMISD::RET_FLAG)
+ MAKE_CASE(ARMISD::SERET_FLAG)
+ MAKE_CASE(ARMISD::INTRET_FLAG)
+ MAKE_CASE(ARMISD::PIC_ADD)
+ MAKE_CASE(ARMISD::CMP)
+ MAKE_CASE(ARMISD::CMN)
+ MAKE_CASE(ARMISD::CMPZ)
+ MAKE_CASE(ARMISD::CMPFP)
+ MAKE_CASE(ARMISD::CMPFPE)
+ MAKE_CASE(ARMISD::CMPFPw0)
+ MAKE_CASE(ARMISD::CMPFPEw0)
+ MAKE_CASE(ARMISD::BCC_i64)
+ MAKE_CASE(ARMISD::FMSTAT)
+ MAKE_CASE(ARMISD::CMOV)
+ MAKE_CASE(ARMISD::SUBS)
+ MAKE_CASE(ARMISD::SSAT)
+ MAKE_CASE(ARMISD::USAT)
+ MAKE_CASE(ARMISD::ASRL)
+ MAKE_CASE(ARMISD::LSRL)
+ MAKE_CASE(ARMISD::LSLL)
+ MAKE_CASE(ARMISD::SRL_FLAG)
+ MAKE_CASE(ARMISD::SRA_FLAG)
+ MAKE_CASE(ARMISD::RRX)
+ MAKE_CASE(ARMISD::ADDC)
+ MAKE_CASE(ARMISD::ADDE)
+ MAKE_CASE(ARMISD::SUBC)
+ MAKE_CASE(ARMISD::SUBE)
+ MAKE_CASE(ARMISD::LSLS)
+ MAKE_CASE(ARMISD::VMOVRRD)
+ MAKE_CASE(ARMISD::VMOVDRR)
+ MAKE_CASE(ARMISD::VMOVhr)
+ MAKE_CASE(ARMISD::VMOVrh)
+ MAKE_CASE(ARMISD::VMOVSR)
+ MAKE_CASE(ARMISD::EH_SJLJ_SETJMP)
+ MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP)
+ MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH)
+ MAKE_CASE(ARMISD::TC_RETURN)
+ MAKE_CASE(ARMISD::THREAD_POINTER)
+ MAKE_CASE(ARMISD::DYN_ALLOC)
+ MAKE_CASE(ARMISD::MEMBARRIER_MCR)
+ MAKE_CASE(ARMISD::PRELOAD)
+ MAKE_CASE(ARMISD::LDRD)
+ MAKE_CASE(ARMISD::STRD)
+ MAKE_CASE(ARMISD::WIN__CHKSTK)
+ MAKE_CASE(ARMISD::WIN__DBZCHK)
+ MAKE_CASE(ARMISD::PREDICATE_CAST)
+ MAKE_CASE(ARMISD::VECTOR_REG_CAST)
+ MAKE_CASE(ARMISD::MVESEXT)
+ MAKE_CASE(ARMISD::MVEZEXT)
+ MAKE_CASE(ARMISD::MVETRUNC)
+ MAKE_CASE(ARMISD::VCMP)
+ MAKE_CASE(ARMISD::VCMPZ)
+ MAKE_CASE(ARMISD::VTST)
+ MAKE_CASE(ARMISD::VSHLs)
+ MAKE_CASE(ARMISD::VSHLu)
+ MAKE_CASE(ARMISD::VSHLIMM)
+ MAKE_CASE(ARMISD::VSHRsIMM)
+ MAKE_CASE(ARMISD::VSHRuIMM)
+ MAKE_CASE(ARMISD::VRSHRsIMM)
+ MAKE_CASE(ARMISD::VRSHRuIMM)
+ MAKE_CASE(ARMISD::VRSHRNIMM)
+ MAKE_CASE(ARMISD::VQSHLsIMM)
+ MAKE_CASE(ARMISD::VQSHLuIMM)
+ MAKE_CASE(ARMISD::VQSHLsuIMM)
+ MAKE_CASE(ARMISD::VQSHRNsIMM)
+ MAKE_CASE(ARMISD::VQSHRNuIMM)
+ MAKE_CASE(ARMISD::VQSHRNsuIMM)
+ MAKE_CASE(ARMISD::VQRSHRNsIMM)
+ MAKE_CASE(ARMISD::VQRSHRNuIMM)
+ MAKE_CASE(ARMISD::VQRSHRNsuIMM)
+ MAKE_CASE(ARMISD::VSLIIMM)
+ MAKE_CASE(ARMISD::VSRIIMM)
+ MAKE_CASE(ARMISD::VGETLANEu)
+ MAKE_CASE(ARMISD::VGETLANEs)
+ MAKE_CASE(ARMISD::VMOVIMM)
+ MAKE_CASE(ARMISD::VMVNIMM)
+ MAKE_CASE(ARMISD::VMOVFPIMM)
+ MAKE_CASE(ARMISD::VDUP)
+ MAKE_CASE(ARMISD::VDUPLANE)
+ MAKE_CASE(ARMISD::VEXT)
+ MAKE_CASE(ARMISD::VREV64)
+ MAKE_CASE(ARMISD::VREV32)
+ MAKE_CASE(ARMISD::VREV16)
+ MAKE_CASE(ARMISD::VZIP)
+ MAKE_CASE(ARMISD::VUZP)
+ MAKE_CASE(ARMISD::VTRN)
+ MAKE_CASE(ARMISD::VTBL1)
+ MAKE_CASE(ARMISD::VTBL2)
+ MAKE_CASE(ARMISD::VMOVN)
+ MAKE_CASE(ARMISD::VQMOVNs)
+ MAKE_CASE(ARMISD::VQMOVNu)
+ MAKE_CASE(ARMISD::VCVTN)
+ MAKE_CASE(ARMISD::VCVTL)
+ MAKE_CASE(ARMISD::VIDUP)
+ MAKE_CASE(ARMISD::VMULLs)
+ MAKE_CASE(ARMISD::VMULLu)
+ MAKE_CASE(ARMISD::VQDMULH)
+ MAKE_CASE(ARMISD::VADDVs)
+ MAKE_CASE(ARMISD::VADDVu)
+ MAKE_CASE(ARMISD::VADDVps)
+ MAKE_CASE(ARMISD::VADDVpu)
+ MAKE_CASE(ARMISD::VADDLVs)
+ MAKE_CASE(ARMISD::VADDLVu)
+ MAKE_CASE(ARMISD::VADDLVAs)
+ MAKE_CASE(ARMISD::VADDLVAu)
+ MAKE_CASE(ARMISD::VADDLVps)
+ MAKE_CASE(ARMISD::VADDLVpu)
+ MAKE_CASE(ARMISD::VADDLVAps)
+ MAKE_CASE(ARMISD::VADDLVApu)
+ MAKE_CASE(ARMISD::VMLAVs)
+ MAKE_CASE(ARMISD::VMLAVu)
+ MAKE_CASE(ARMISD::VMLAVps)
+ MAKE_CASE(ARMISD::VMLAVpu)
+ MAKE_CASE(ARMISD::VMLALVs)
+ MAKE_CASE(ARMISD::VMLALVu)
+ MAKE_CASE(ARMISD::VMLALVps)
+ MAKE_CASE(ARMISD::VMLALVpu)
+ MAKE_CASE(ARMISD::VMLALVAs)
+ MAKE_CASE(ARMISD::VMLALVAu)
+ MAKE_CASE(ARMISD::VMLALVAps)
+ MAKE_CASE(ARMISD::VMLALVApu)
+ MAKE_CASE(ARMISD::VMINVu)
+ MAKE_CASE(ARMISD::VMINVs)
+ MAKE_CASE(ARMISD::VMAXVu)
+ MAKE_CASE(ARMISD::VMAXVs)
+ MAKE_CASE(ARMISD::UMAAL)
+ MAKE_CASE(ARMISD::UMLAL)
+ MAKE_CASE(ARMISD::SMLAL)
+ MAKE_CASE(ARMISD::SMLALBB)
+ MAKE_CASE(ARMISD::SMLALBT)
+ MAKE_CASE(ARMISD::SMLALTB)
+ MAKE_CASE(ARMISD::SMLALTT)
+ MAKE_CASE(ARMISD::SMULWB)
+ MAKE_CASE(ARMISD::SMULWT)
+ MAKE_CASE(ARMISD::SMLALD)
+ MAKE_CASE(ARMISD::SMLALDX)
+ MAKE_CASE(ARMISD::SMLSLD)
+ MAKE_CASE(ARMISD::SMLSLDX)
+ MAKE_CASE(ARMISD::SMMLAR)
+ MAKE_CASE(ARMISD::SMMLSR)
+ MAKE_CASE(ARMISD::QADD16b)
+ MAKE_CASE(ARMISD::QSUB16b)
+ MAKE_CASE(ARMISD::QADD8b)
+ MAKE_CASE(ARMISD::QSUB8b)
+ MAKE_CASE(ARMISD::UQADD16b)
+ MAKE_CASE(ARMISD::UQSUB16b)
+ MAKE_CASE(ARMISD::UQADD8b)
+ MAKE_CASE(ARMISD::UQSUB8b)
+ MAKE_CASE(ARMISD::BUILD_VECTOR)
+ MAKE_CASE(ARMISD::BFI)
+ MAKE_CASE(ARMISD::VORRIMM)
+ MAKE_CASE(ARMISD::VBICIMM)
+ MAKE_CASE(ARMISD::VBSP)
+ MAKE_CASE(ARMISD::MEMCPY)
+ MAKE_CASE(ARMISD::VLD1DUP)
+ MAKE_CASE(ARMISD::VLD2DUP)
+ MAKE_CASE(ARMISD::VLD3DUP)
+ MAKE_CASE(ARMISD::VLD4DUP)
+ MAKE_CASE(ARMISD::VLD1_UPD)
+ MAKE_CASE(ARMISD::VLD2_UPD)
+ MAKE_CASE(ARMISD::VLD3_UPD)
+ MAKE_CASE(ARMISD::VLD4_UPD)
+ MAKE_CASE(ARMISD::VLD1x2_UPD)
+ MAKE_CASE(ARMISD::VLD1x3_UPD)
+ MAKE_CASE(ARMISD::VLD1x4_UPD)
+ MAKE_CASE(ARMISD::VLD2LN_UPD)
+ MAKE_CASE(ARMISD::VLD3LN_UPD)
+ MAKE_CASE(ARMISD::VLD4LN_UPD)
+ MAKE_CASE(ARMISD::VLD1DUP_UPD)
+ MAKE_CASE(ARMISD::VLD2DUP_UPD)
+ MAKE_CASE(ARMISD::VLD3DUP_UPD)
+ MAKE_CASE(ARMISD::VLD4DUP_UPD)
+ MAKE_CASE(ARMISD::VST1_UPD)
+ MAKE_CASE(ARMISD::VST2_UPD)
+ MAKE_CASE(ARMISD::VST3_UPD)
+ MAKE_CASE(ARMISD::VST4_UPD)
+ MAKE_CASE(ARMISD::VST1x2_UPD)
+ MAKE_CASE(ARMISD::VST1x3_UPD)
+ MAKE_CASE(ARMISD::VST1x4_UPD)
+ MAKE_CASE(ARMISD::VST2LN_UPD)
+ MAKE_CASE(ARMISD::VST3LN_UPD)
+ MAKE_CASE(ARMISD::VST4LN_UPD)
+ MAKE_CASE(ARMISD::WLS)
+ MAKE_CASE(ARMISD::WLSSETUP)
+ MAKE_CASE(ARMISD::LE)
+ MAKE_CASE(ARMISD::LOOP_DEC)
+ MAKE_CASE(ARMISD::CSINV)
+ MAKE_CASE(ARMISD::CSNEG)
+ MAKE_CASE(ARMISD::CSINC)
+ MAKE_CASE(ARMISD::MEMCPYLOOP)
+ MAKE_CASE(ARMISD::MEMSETLOOP)
+#undef MAKE_CASE
}
return nullptr;
}
@@ -1817,8 +1835,9 @@
return getPointerTy(DL);
// MVE has a predicate register.
- if (Subtarget->hasMVEIntegerOps() &&
- (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8))
+ if ((Subtarget->hasMVEIntegerOps() &&
+ (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) ||
+ (Subtarget->hasMVEFloatOps() && (VT == MVT::v4f32 || VT == MVT::v8f16)))
return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
return VT.changeVectorElementTypeToInteger();
}
@@ -1998,8 +2017,10 @@
return CallingConv::PreserveMost;
case CallingConv::ARM_AAPCS_VFP:
case CallingConv::Swift:
+ case CallingConv::SwiftTail:
return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
case CallingConv::C:
+ case CallingConv::Tail:
if (!Subtarget->isAAPCS_ABI())
return CallingConv::ARM_APCS;
else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
@@ -2176,19 +2197,31 @@
return Chain;
}
-/// LowerMemOpCallTo - Store the argument to the stack.
-SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
- SDValue Arg, const SDLoc &dl,
- SelectionDAG &DAG,
- const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const {
- unsigned LocMemOffset = VA.getLocMemOffset();
- SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
- PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
- StackPtr, PtrOff);
- return DAG.getStore(
- Chain, dl, Arg, PtrOff,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
+std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
+ const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
+ bool IsTailCall, int SPDiff) const {
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+ int32_t Offset = VA.getLocMemOffset();
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ if (IsTailCall) {
+ Offset += SPDiff;
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ int Size = VA.getLocVT().getFixedSizeInBits() / 8;
+ int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
+ DstAddr = DAG.getFrameIndex(FI, PtrVT);
+ DstInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+ } else {
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
+ DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, PtrOff);
+ DstInfo =
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);
+ }
+
+ return std::make_pair(DstAddr, DstInfo);
}
void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
@@ -2197,7 +2230,8 @@
CCValAssign &VA, CCValAssign &NextVA,
SDValue &StackPtr,
SmallVectorImpl<SDValue> &MemOpChains,
- ISD::ArgFlagsTy Flags) const {
+ bool IsTailCall,
+ int SPDiff) const {
SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), Arg);
unsigned id = Subtarget->isLittle() ? 0 : 1;
@@ -2211,12 +2245,20 @@
StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
getPointerTy(DAG.getDataLayout()));
- MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
- dl, DAG, NextVA,
- Flags));
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+ std::tie(DstAddr, DstInfo) =
+ computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
}
}
+static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
+ return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
+ CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
+}
+
/// LowerCall - Lowering a call into a callseq_start <-
/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
@@ -2241,6 +2283,7 @@
bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool isThisReturn = false;
bool isCmseNSCall = false;
+ bool isSibCall = false;
bool PreferIndirect = false;
// Determine whether this is a non-secure function call.
@@ -2277,15 +2320,20 @@
Callee, CallConv, isVarArg, isStructRet,
MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
PreferIndirect);
- if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
- report_fatal_error("failed to perform tail call elimination on a call "
- "site marked musttail");
+
+ if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
+ isSibCall = true;
+
// We don't support GuaranteedTailCallOpt for ARM, only automatically
// detected sibcalls.
if (isTailCall)
++NumTailCalls;
}
+ if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
@@ -2295,13 +2343,40 @@
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
- if (isTailCall) {
- // For tail calls, memory operands are available in our caller's stack.
+ // SPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0. Completely unused for non-tail calls.
+ int SPDiff = 0;
+
+ if (isTailCall && !isSibCall) {
+ auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
+ unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
+
+    // Since the callee pops the argument stack as part of the tail call, we
+    // must keep the popped size 16-byte aligned.
+ Align StackAlign = DAG.getDataLayout().getStackAlignment();
+ NumBytes = alignTo(NumBytes, StackAlign);
+
+ // SPDiff will be negative if this tail call requires more space than we
+ // would automatically have in our incoming argument space. Positive if we
+ // can actually shrink the stack.
+ SPDiff = NumReusableBytes - NumBytes;
+
+ // If this call requires more stack than we have available from
+ // LowerFormalArguments, tell FrameLowering to reserve space for it.
+ if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
+ AFI->setArgRegsSaveSize(-SPDiff);
+ }
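The SPDiff computation above reduces to simple byte arithmetic. A minimal standalone sketch (not part of the patch), assuming plain integer byte counts and a 16-byte stack alignment in place of the real CCState/ARMFunctionInfo objects:

#include <cassert>

static int computeSPDiffSketch(unsigned NumReusableBytes, unsigned NumBytes,
                               unsigned StackAlign) {
  // Keep the popped size aligned, mirroring the alignTo() call above.
  NumBytes = (NumBytes + StackAlign - 1) / StackAlign * StackAlign;
  // Negative: the tail call needs more argument space than the caller has.
  return static_cast<int>(NumReusableBytes) - static_cast<int>(NumBytes);
}

int main() {
  assert(computeSPDiffSketch(/*NumReusableBytes=*/32, /*NumBytes=*/16, 16) == 16);
  assert(computeSPDiffSketch(/*NumReusableBytes=*/16, /*NumBytes=*/36, 16) == -32);
}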
+
+ if (isSibCall) {
+ // For sibling tail calls, memory operands are available in our caller's stack.
NumBytes = 0;
} else {
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
+ Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
}
SDValue StackPtr =
@@ -2310,6 +2385,13 @@
RegsToPassVector RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
+ // During a tail call, stores to the argument area must happen after all of
+ // the function's incoming arguments have been loaded because they may alias.
+ // This is done by folding in a TokenFactor from LowerFormalArguments, but
+ // there's no point in doing so repeatedly so this tracks whether that's
+ // happened yet.
+ bool AfterFormalArgLoads = false;
+
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization, arguments are handled later.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
@@ -2338,6 +2420,11 @@
break;
}
+ if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
+ Chain = DAG.getStackArgumentTokenFactor(Chain);
+ AfterFormalArgLoads = true;
+ }
+
// f16 arguments have their size extended to 4 bytes and passed as if they
// had been copied to the LSBs of a 32-bit register.
// For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
@@ -2367,21 +2454,23 @@
DAG.getConstant(1, dl, MVT::i32));
PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
- StackPtr, MemOpChains, Flags);
+ StackPtr, MemOpChains, isTailCall, SPDiff);
VA = ArgLocs[++i]; // skip ahead to next loc
if (VA.isRegLoc()) {
PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
- StackPtr, MemOpChains, Flags);
+ StackPtr, MemOpChains, isTailCall, SPDiff);
} else {
assert(VA.isMemLoc());
-
- MemOpChains.push_back(
- LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags));
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+ std::tie(DstAddr, DstInfo) =
+ computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
}
} else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
- StackPtr, MemOpChains, Flags);
+ StackPtr, MemOpChains, isTailCall, SPDiff);
} else if (VA.isRegLoc()) {
if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
Outs[0].VT == MVT::i32) {
@@ -2431,9 +2520,10 @@
if (Flags.getByValSize() > 4*offset) {
auto PtrVT = getPointerTy(DAG.getDataLayout());
- unsigned LocMemOffset = VA.getLocMemOffset();
- SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
- SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
+ SDValue Dst;
+ MachinePointerInfo DstInfo;
+ std::tie(Dst, DstInfo) =
+ computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
@@ -2446,11 +2536,15 @@
MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
Ops));
}
- } else if (!isTailCall) {
+ } else {
assert(VA.isMemLoc());
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+ std::tie(DstAddr, DstInfo) =
+ computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
- MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
- dl, DAG, VA, Flags));
+ SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
+ MemOpChains.push_back(Store);
}
}
@@ -2614,10 +2708,24 @@
CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
}
+ // We don't usually want to end the call-sequence here because we would tidy
+ // the frame up *after* the call, however in the ABI-changing tail-call case
+ // we've carefully laid out the parameters so that when sp is reset they'll be
+ // in the correct location.
+ if (isTailCall && !isSibCall) {
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+ InFlag = Chain.getValue(1);
+ }
+
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
+ if (isTailCall) {
+ Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
+ }
+
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
@@ -2662,8 +2770,16 @@
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
+ // If we're guaranteeing tail-calls will be honoured, the callee must
+ // pop its own argument stack on return. But this call is *not* a tail call so
+ // we need to undo that after it returns to restore the status-quo.
+ bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
+ uint64_t CalleePopBytes =
+ canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
+
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
- DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+ DAG.getIntPtrConstant(CalleePopBytes, dl, true),
+ InFlag, dl);
if (!Ins.empty())
InFlag = Chain.getValue(1);
@@ -2804,6 +2920,9 @@
if (CallerF.hasFnAttribute("interrupt"))
return false;
+ if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
+ return CalleeCC == CallerCC;
+
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
@@ -3550,9 +3669,7 @@
/// Return true if all users of V are within function F, looking through
/// ConstantExprs.
static bool allUsersAreInFunction(const Value *V, const Function *F) {
- SmallVector<const User*,4> Worklist;
- for (auto *U : V->users())
- Worklist.push_back(U);
+ SmallVector<const User*,4> Worklist(V->users());
while (!Worklist.empty()) {
auto *U = Worklist.pop_back_val();
if (isa<ConstantExpr>(U)) {
@@ -3599,7 +3716,7 @@
// from .data to .text. This is not allowed in position-independent code.
auto *Init = GVar->getInitializer();
if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
- Init->needsRelocation())
+ Init->needsDynamicRelocation())
return SDValue();
// The constant islands pass can only really deal with alignment requests
@@ -4454,7 +4571,17 @@
}
}
- AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
+ unsigned StackArgSize = CCInfo.getNextStackOffset();
+ bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+ if (canGuaranteeTCO(CallConv, TailCallOpt)) {
+ // The only way to guarantee a tail call is if the callee restores its
+ // argument area, but it must also keep the stack aligned when doing so.
+ const DataLayout &DL = DAG.getDataLayout();
+ StackArgSize = alignTo(StackArgSize, DL.getStackAlignment());
+
+ AFI->setArgumentStackToRestore(StackArgSize);
+ }
+ AFI->setArgumentStackSize(StackArgSize);
if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) {
DiagnosticInfoUnsupported Diag(
@@ -4826,8 +4953,8 @@
return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}
-static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG,
- const ARMSubtarget *Subtarget) {
+static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
EVT VT = Op.getValueType();
if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
return SDValue();
@@ -4835,15 +4962,40 @@
return SDValue();
unsigned NewOpcode;
- bool IsAdd = Op->getOpcode() == ISD::SADDSAT;
switch (VT.getSimpleVT().SimpleTy) {
default:
return SDValue();
case MVT::i8:
- NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b;
+ switch (Op->getOpcode()) {
+ case ISD::UADDSAT:
+ NewOpcode = ARMISD::UQADD8b;
+ break;
+ case ISD::SADDSAT:
+ NewOpcode = ARMISD::QADD8b;
+ break;
+ case ISD::USUBSAT:
+ NewOpcode = ARMISD::UQSUB8b;
+ break;
+ case ISD::SSUBSAT:
+ NewOpcode = ARMISD::QSUB8b;
+ break;
+ }
break;
case MVT::i16:
- NewOpcode = IsAdd ? ARMISD::QADD16b : ARMISD::QSUB16b;
+ switch (Op->getOpcode()) {
+ case ISD::UADDSAT:
+ NewOpcode = ARMISD::UQADD16b;
+ break;
+ case ISD::SADDSAT:
+ NewOpcode = ARMISD::QADD16b;
+ break;
+ case ISD::USUBSAT:
+ NewOpcode = ARMISD::UQSUB16b;
+ break;
+ case ISD::SSUBSAT:
+ NewOpcode = ARMISD::QSUB16b;
+ break;
+ }
break;
}
@@ -5224,8 +5376,6 @@
std::swap(TVal, FVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
- if (TVal == 0)
- TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);
// Drops F's value because we can get it by inverting/negating TVal.
FalseVal = TrueVal;
@@ -6117,6 +6267,48 @@
return DAG.getMergeValues({And, Chain}, dl);
}
+SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue RMValue = Op->getOperand(1);
+
+ // The rounding mode is in bits 23:22 of the FPSCR.
+ // The llvm.set.rounding argument value to ARM rounding mode value mapping
+ // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
+  // (((arg - 1) & 3) << 22).
+ //
+ // It is expected that the argument of llvm.set.rounding is within the
+  // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is the
+  // responsibility of the code that generates llvm.set.rounding to ensure
+  // this condition.
+
+ // Calculate new value of FPSCR[23:22].
+ RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
+ DAG.getConstant(1, DL, MVT::i32));
+ RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
+ DAG.getConstant(0x3, DL, MVT::i32));
+ RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
+ DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
+
+ // Get current value of FPSCR.
+ SDValue Ops[] = {Chain,
+ DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
+ SDValue FPSCR =
+ DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
+ Chain = FPSCR.getValue(1);
+ FPSCR = FPSCR.getValue(0);
+
+ // Put new rounding mode into FPSCR[23:22].
+ const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
+ FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
+ DAG.getConstant(RMMask, DL, MVT::i32));
+ FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
+ SDValue Ops2[] = {
+ Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
+ return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
+}
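A quick way to see that ((arg - 1) & 3) << 22 realizes the 0->3, 1->0, 2->1, 3->2 mapping described in the comments; the check below is illustrative only, with RoundingBitsPos hard-coded to 22:

#include <cassert>
#include <cstdint>

int main() {
  constexpr unsigned RoundingBitsPos = 22;       // FPSCR RMode field position
  constexpr unsigned Expected[4] = {3, 0, 1, 2}; // llvm.set.rounding arg -> RMode
  for (uint32_t Arg = 0; Arg < 4; ++Arg) {
    uint32_t Field = ((Arg - 1u) & 3u) << RoundingBitsPos;
    assert((Field >> RoundingBitsPos) == Expected[Arg]);
  }
}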
+
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDLoc dl(N);
@@ -6706,12 +6898,10 @@
return SDValue();
// NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
uint64_t BitMask = 0xff;
- uint64_t Val = 0;
unsigned ImmMask = 1;
Imm = 0;
for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
- Val |= BitMask;
Imm |= ImmMask;
} else if ((SplatBits & BitMask) != 0) {
return SDValue();
@@ -6923,35 +7113,6 @@
return true;
}
-/// isVREVMask - Check if a vector shuffle corresponds to a VREV
-/// instruction with the specified blocksize. (The order of the elements
-/// within each block of the vector is reversed.)
-static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
- assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
- "Only possible block sizes for VREV are: 16, 32, 64");
-
- unsigned EltSz = VT.getScalarSizeInBits();
- if (EltSz == 64)
- return false;
-
- unsigned NumElts = VT.getVectorNumElements();
- unsigned BlockElts = M[0] + 1;
- // If the first shuffle index is UNDEF, be optimistic.
- if (M[0] < 0)
- BlockElts = BlockSize / EltSz;
-
- if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
- return false;
-
- for (unsigned i = 0; i < NumElts; ++i) {
- if (M[i] < 0) continue; // ignore UNDEF indices
- if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
- return false;
- }
-
- return true;
-}
-
static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
// We can handle <8 x i8> vector shuffles. If the index in the mask is out of
// range, then 0 is placed into the resulting vector. So pretty much any mask
@@ -7224,11 +7385,11 @@
return true;
}
-static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) {
+static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
unsigned NumElts = VT.getVectorNumElements();
// Make sure the mask has the right size.
if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
- return false;
+ return false;
// If Top
// Look for <0, N, 2, N+2, 4, N+4, ..>.
@@ -7237,10 +7398,33 @@
// Look for <0, N+1, 2, N+3, 4, N+5, ..>
// This inserts Input1 into Input2
unsigned Offset = Top ? 0 : 1;
- for (unsigned i = 0; i < NumElts; i+=2) {
+ unsigned N = SingleSource ? 0 : NumElts;
+ for (unsigned i = 0; i < NumElts; i += 2) {
if (M[i] >= 0 && M[i] != (int)i)
return false;
- if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset))
+ if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
+ return false;
+ }
+
+ return true;
+}
+
+static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
+ unsigned NumElts = ToVT.getVectorNumElements();
+ if (NumElts != M.size())
+ return false;
+
+  // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
+ // looking for patterns of:
+ // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
+ // rev: N/2 0 N/2+1 1 N/2+2 2 ...
+
+ unsigned Off0 = rev ? NumElts / 2 : 0;
+ unsigned Off1 = rev ? 0 : NumElts / 2;
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
+ return false;
+ if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
return false;
}
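For concreteness, the !rev pattern above for an 8-element mask is {0, 4, 1, 5, 2, 6, 3, 7}. A small sketch (illustration only) that rebuilds it from the Off0/Off1 formulas:

#include <cassert>
#include <vector>

int main() {
  const unsigned NumElts = 8;
  const unsigned Off0 = 0, Off1 = NumElts / 2; // the !rev case
  std::vector<int> Mask;
  for (unsigned i = 0; i < NumElts; i += 2) {
    Mask.push_back(static_cast<int>(Off0 + i / 2));
    Mask.push_back(static_cast<int>(Off1 + i / 2));
  }
  assert((Mask == std::vector<int>{0, 4, 1, 5, 2, 6, 3, 7}));
}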
@@ -7425,6 +7609,39 @@
return Base;
}
+static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ if (!ST->hasMVEIntegerOps())
+ return SDValue();
+
+ // We are looking for a buildvector where each element is Op[0] + i*N
+ EVT VT = Op.getValueType();
+ SDValue Op0 = Op.getOperand(0);
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Get the increment value from operand 1
+ SDValue Op1 = Op.getOperand(1);
+ if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
+ !isa<ConstantSDNode>(Op1.getOperand(1)))
+ return SDValue();
+ unsigned N = Op1.getConstantOperandVal(1);
+ if (N != 1 && N != 2 && N != 4 && N != 8)
+ return SDValue();
+
+ // Check that each other operand matches
+ for (unsigned I = 2; I < NumElts; I++) {
+ SDValue OpI = Op.getOperand(I);
+ if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
+ !isa<ConstantSDNode>(OpI.getOperand(1)) ||
+ OpI.getConstantOperandVal(1) != I * N)
+ return SDValue();
+ }
+
+ SDLoc DL(Op);
+ return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
+ DAG.getConstant(N, DL, MVT::i32));
+}
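A scalar model of the pattern this routine matches: every element equals Base + I * N with N restricted to 1, 2, 4 or 8. The helper below is a made-up illustration, not LLVM API:

#include <cassert>
#include <vector>

static bool looksLikeVIDUP(const std::vector<int> &Elts, int Base, unsigned N) {
  if (N != 1 && N != 2 && N != 4 && N != 8)
    return false;
  for (unsigned I = 0; I < Elts.size(); ++I)
    if (Elts[I] != Base + static_cast<int>(I * N))
      return false;
  return true;
}

int main() {
  assert(looksLikeVIDUP({10, 12, 14, 16}, 10, 2));  // VIDUP base=10, step=2
  assert(!looksLikeVIDUP({10, 13, 16, 19}, 10, 3)); // step 3 is not encodable
}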
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
@@ -7436,6 +7653,9 @@
if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
return LowerBUILD_VECTOR_i1(Op, DAG, ST);
+ if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
+ return R;
+
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
@@ -7474,6 +7694,18 @@
return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
}
}
+
+ // If we are under MVE, generate a VDUP(constant), bitcast to the original
+ // type.
+ if (ST->hasMVEIntegerOps() &&
+ (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
+ EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
+ : SplatBitSize == 16 ? MVT::v8i16
+ : MVT::v16i8;
+ SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
+ SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
+ return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
+ }
}
}
@@ -7947,7 +8179,8 @@
isReverseMask(M, VT))
return true;
else if (Subtarget->hasMVEIntegerOps() &&
- (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1)))
+ (isVMOVNMask(M, VT, true, false) ||
+ isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
return true;
else
return false;
@@ -7981,7 +8214,8 @@
VT.getVectorElementType() == MVT::f32)
return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
// vrev <4 x i16> -> VREV32
- if (VT.getVectorElementType() == MVT::i16)
+ if (VT.getVectorElementType() == MVT::i16 ||
+ VT.getVectorElementType() == MVT::f16)
return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
// vrev <4 x i8> -> VREV16
assert(VT.getVectorElementType() == MVT::i8);
@@ -8192,8 +8426,8 @@
Input = Op->getOperand(1);
Elt -= 4;
}
- SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input);
- Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast,
+ SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
+ Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
DAG.getConstant(Elt, dl, MVT::i32));
}
}
@@ -8212,19 +8446,70 @@
Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
SDValue NewShuffle = DAG.getVectorShuffle(
VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
- SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle);
+ SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
for (int Part = 0; Part < 4; ++Part)
if (!Parts[Part])
- Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
BitCast, DAG.getConstant(Part, dl, MVT::i32));
}
// Build a vector out of the various parts and bitcast it back to the original
// type.
- SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts);
+ SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
return DAG.getBitcast(VT, NewVec);
}
+static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
+ ArrayRef<int> ShuffleMask,
+ SelectionDAG &DAG) {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ EVT VT = Op.getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+  // A One-Off Identity mask is one that is mostly an identity mask from a
+  // single source but contains a single element out-of-place, either from a
+  // different vector or from another position in the same vector. As opposed to
+  // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
+ // pair directly.
+ auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
+ int &OffElement) {
+ OffElement = -1;
+ int NonUndef = 0;
+ for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
+ if (Mask[i] == -1)
+ continue;
+ NonUndef++;
+ if (Mask[i] != i + BaseOffset) {
+ if (OffElement == -1)
+ OffElement = i;
+ else
+ return false;
+ }
+ }
+ return NonUndef > 2 && OffElement != -1;
+ };
+ int OffElement;
+ SDValue VInput;
+ if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
+ VInput = V1;
+ else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
+ VInput = V2;
+ else
+ return SDValue();
+
+ SDLoc dl(Op);
+ EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
+ ? MVT::i32
+ : VT.getScalarType();
+ SDValue Elt = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, SVT,
+ ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
+ DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
+ DAG.getVectorIdxConstant(OffElement % NumElts, dl));
+}
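The same mask test, pulled out as a standalone sketch with a couple of example masks (illustration only, mirroring the lambda above):

#include <cassert>
#include <vector>

static bool isOneOffIdentity(const std::vector<int> &Mask, int BaseOffset,
                             int &OffElement) {
  OffElement = -1;
  int NonUndef = 0;
  for (int i = 0, e = static_cast<int>(Mask.size()); i < e; ++i) {
    if (Mask[i] == -1)
      continue;
    ++NonUndef;
    if (Mask[i] != i + BaseOffset) {
      if (OffElement != -1)
        return false; // more than one element out of place
      OffElement = i;
    }
  }
  return NonUndef > 2 && OffElement != -1;
}

int main() {
  int Off;
  assert(isOneOffIdentity({0, 1, 6, 3}, 0, Off) && Off == 2); // one element off
  assert(!isOneOffIdentity({0, 5, 6, 3}, 0, Off));            // two elements off
}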
+
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDValue V1 = Op.getOperand(0);
@@ -8311,12 +8596,15 @@
}
}
if (ST->hasMVEIntegerOps()) {
- if (isVMOVNMask(ShuffleMask, VT, 0))
+ if (isVMOVNMask(ShuffleMask, VT, false, false))
return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
DAG.getConstant(0, dl, MVT::i32));
- if (isVMOVNMask(ShuffleMask, VT, 1))
+ if (isVMOVNMask(ShuffleMask, VT, true, false))
return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
DAG.getConstant(1, dl, MVT::i32));
+ if (isVMOVNMask(ShuffleMask, VT, true, true))
+ return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
+ DAG.getConstant(1, dl, MVT::i32));
}
// Also check for these shuffles through CONCAT_VECTORS: we canonicalize
@@ -8358,6 +8646,10 @@
}
}
+ if (ST->hasMVEIntegerOps() && EltSize <= 32)
+ if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
+ return V;
+
// If the shuffle is not directly supported and it has 4 elements, use
// the PerfectShuffle-generated table to synthesize it from other shuffles.
unsigned NumElts = VT.getVectorNumElements();
@@ -8643,13 +8935,13 @@
}
// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
-static SDValue LowerTruncatei1(SDValue N, SelectionDAG &DAG,
+static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
assert(ST->hasMVEIntegerOps() && "Expected MVE!");
- EVT VT = N.getValueType();
+ EVT VT = N->getValueType(0);
assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
"Expected a vector i1 type!");
- SDValue Op = N.getOperand(0);
+ SDValue Op = N->getOperand(0);
EVT FromVT = Op.getValueType();
SDLoc DL(N);
@@ -8659,6 +8951,99 @@
DAG.getCondCode(ISD::SETNE));
}
+static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasMVEIntegerOps())
+ return SDValue();
+
+ EVT ToVT = N->getValueType(0);
+ if (ToVT.getScalarType() == MVT::i1)
+ return LowerTruncatei1(N, DAG, Subtarget);
+
+ // MVE does not have a single instruction to perform the truncation of a v4i32
+ // into the lower half of a v8i16, in the same way that a NEON vmovn would.
+ // Most of the instructions in MVE follow the 'Beats' system, where moving
+ // values from different lanes is usually something that the instructions
+ // avoid.
+ //
+ // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
+  // which take the top/bottom half of a larger lane and extend it (or do the
+ // opposite, truncating into the top/bottom lane from a larger lane). Note
+ // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
+  // bottom 16 bits from each vector lane. This works really well with T/B
+  // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
+  // to be reordered.
+ //
+ // But truncates and sext/zext are always going to be fairly common from llvm.
+ // We have several options for how to deal with them:
+ // - Wherever possible combine them into an instruction that makes them
+ // "free". This includes loads/stores, which can perform the trunc as part
+ // of the memory operation. Or certain shuffles that can be turned into
+ // VMOVN/VMOVL.
+ // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
+ // trunc(mul(sext(a), sext(b))) may become
+ // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
+ // this case can use VMULL). This is performed in the
+ // MVELaneInterleavingPass.
+ // - Otherwise we have an option. By default we would expand the
+ // zext/sext/trunc into a series of lane extract/inserts going via GPR
+ // registers. One for each vector lane in the vector. This can obviously be
+ // very expensive.
+ // - The other option is to use the fact that loads/store can extend/truncate
+ // to turn a trunc into two truncating stack stores and a stack reload. This
+ // becomes 3 back-to-back memory operations, but at least that is less than
+ // all the insert/extracts.
+ //
+ // In order to do the last, we convert certain trunc's into MVETRUNC, which
+ // are either optimized where they can be, or eventually lowered into stack
+ // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
+ // two early, where other instructions would be better, and stops us from
+ // having to reconstruct multiple buildvector shuffles into loads/stores.
+ if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
+ return SDValue();
+ EVT FromVT = N->getOperand(0).getValueType();
+ if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
+ return SDValue();
+
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ SDLoc DL(N);
+ return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
+}
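A scalar model of what the MVETRUNC node built here is expected to compute, on my reading of the comments above: the elementwise truncation of the concatenation of the two half-width operands (a sketch of the intended semantics, not the ISel node itself):

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint16_t> mveTruncModel(const std::vector<uint32_t> &Lo,
                                           const std::vector<uint32_t> &Hi) {
  std::vector<uint16_t> Out;
  for (uint32_t V : Lo)
    Out.push_back(static_cast<uint16_t>(V)); // drop the high bits of each lane
  for (uint32_t V : Hi)
    Out.push_back(static_cast<uint16_t>(V));
  return Out;
}

int main() {
  std::vector<uint16_t> R = mveTruncModel({0x10001, 2, 3, 4}, {5, 6, 7, 0x20008});
  assert(R[0] == 1 && R[7] == 8); // v8i32 -> v8i16 via two v4i32 halves
}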
+
+static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasMVEIntegerOps())
+ return SDValue();
+
+ // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
+
+ EVT ToVT = N->getValueType(0);
+ if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
+ return SDValue();
+ SDValue Op = N->getOperand(0);
+ EVT FromVT = Op.getValueType();
+ if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
+ if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
+ ExtVT = MVT::v8i16;
+
+ unsigned Opcode =
+ N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
+ SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
+ SDValue Ext1 = Ext.getValue(1);
+
+ if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
+ Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
+ Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
+}
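And the corresponding sketch for the extension direction, assuming MVESEXT/MVEZEXT return the extensions of the low and high halves of the input, which the CONCAT_VECTORS above then joins (again a model under that assumption, not the real node):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

static std::pair<std::vector<int32_t>, std::vector<int32_t>>
mveSExtModel(const std::vector<int16_t> &In) {
  std::vector<int32_t> Lo, Hi;
  for (size_t I = 0; I < In.size() / 2; ++I)
    Lo.push_back(In[I]); // implicit sign extension i16 -> i32
  for (size_t I = In.size() / 2; I < In.size(); ++I)
    Hi.push_back(In[I]);
  return {Lo, Hi};
}

int main() {
  auto Parts = mveSExtModel({-1, 2, -3, 4, -5, 6, -7, 8}); // v8i16 -> 2 x v4i32
  assert(Parts.first[0] == -1 && Parts.second[3] == 8);
}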
+
/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
/// element has been zero/sign-extended, depending on the isSigned parameter,
/// from an integer type half its size.
@@ -9394,13 +9779,20 @@
// the bottom bits of the predicate.
// Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
// for BE).
+  // Speaking of BE, apparently the rest of llvm assumes a reverse order to a
+  // natural VMSR(load), so the value needs to be reversed.
SDLoc dl(Op);
SDValue Load = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
LD->getMemOperand());
- SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load);
+ SDValue Val = Load;
+ if (DAG.getDataLayout().isBigEndian())
+ Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
+ DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
+ DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
+ SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
if (MemVT != MVT::v16i1)
Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
DAG.getConstant(0, dl, MVT::i32));
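The big-endian fix-up above amounts to "reverse the low MemBits bits": a 32-bit bit-reverse followed by a right shift by 32 - MemBits. A small check for an 8-bit (v8i1) predicate, with a hand-rolled bitreverse standing in for the ISD node:

#include <cassert>
#include <cstdint>

static uint32_t bitreverse32(uint32_t X) {
  uint32_t R = 0;
  for (int I = 0; I < 32; ++I)
    R |= ((X >> I) & 1u) << (31 - I);
  return R;
}

int main() {
  const unsigned MemBits = 8;  // e.g. a v8i1 predicate
  uint32_t Load = 0b10110001;  // bits as loaded
  uint32_t Val = bitreverse32(Load) >> (32 - MemBits);
  assert(Val == 0b10001101);   // same 8 bits, in reversed order
}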
@@ -9441,14 +9833,22 @@
SDValue Build = ST->getValue();
if (MemVT != MVT::v16i1) {
SmallVector<SDValue, 16> Ops;
- for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++)
+ for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
+ unsigned Elt = DAG.getDataLayout().isBigEndian()
+ ? MemVT.getVectorNumElements() - I - 1
+ : I;
Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
- DAG.getConstant(I, dl, MVT::i32)));
+ DAG.getConstant(Elt, dl, MVT::i32)));
+ }
for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
Ops.push_back(DAG.getUNDEF(MVT::i32));
Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
}
SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
+ if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
+ GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
+ DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
+ DAG.getConstant(16, dl, MVT::i32));
return DAG.getTruncStore(
ST->getChain(), dl, GRP, ST->getBasePtr(),
EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
@@ -9594,7 +9994,7 @@
}
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
- if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
+ if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
// Acquire/Release load/store is not legal for targets without a dmb or
// equivalent available.
return SDValue();
@@ -9770,8 +10170,11 @@
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
- case ISD::TRUNCATE: return LowerTruncatei1(Op, DAG, Subtarget);
+ case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+ case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::SDIV:
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
@@ -9791,7 +10194,9 @@
return LowerUnsignedALUO(Op, DAG);
case ISD::SADDSAT:
case ISD::SSUBSAT:
- return LowerSADDSUBSAT(Op, DAG, Subtarget);
+ case ISD::UADDSAT:
+ case ISD::USUBSAT:
+ return LowerADDSUBSAT(Op, DAG, Subtarget);
case ISD::LOAD:
return LowerPredicateLoad(Op, DAG);
case ISD::STORE:
@@ -9891,7 +10296,9 @@
return;
case ISD::SADDSAT:
case ISD::SSUBSAT:
- Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
+ case ISD::UADDSAT:
+ case ISD::USUBSAT:
+ Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
break;
case ISD::READCYCLECOUNTER:
ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
@@ -9912,6 +10319,13 @@
case ISD::LOAD:
LowerLOAD(N, Results, DAG);
break;
+ case ISD::TRUNCATE:
+ Res = LowerTruncate(N, DAG, Subtarget);
+ break;
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ Res = LowerVectorExtend(N, DAG, Subtarget);
+ break;
}
if (Res.getNode())
Results.push_back(Res);
@@ -10979,6 +11393,145 @@
return true;
}
+/// Adds logic in the loop entry MBB to calculate the loop iteration count and
+/// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop.
+static Register genTPEntry(MachineBasicBlock *TpEntry,
+ MachineBasicBlock *TpLoopBody,
+ MachineBasicBlock *TpExit, Register OpSizeReg,
+ const TargetInstrInfo *TII, DebugLoc Dl,
+ MachineRegisterInfo &MRI) {
+ // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
+ Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+ BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
+ .addUse(OpSizeReg)
+ .addImm(15)
+ .add(predOps(ARMCC::AL))
+ .addReg(0);
+
+ Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+ BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
+ .addUse(AddDestReg, RegState::Kill)
+ .addImm(4)
+ .add(predOps(ARMCC::AL))
+ .addReg(0);
+
+ Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
+ BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
+ .addUse(LsrDestReg, RegState::Kill);
+
+ BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
+ .addUse(TotalIterationsReg)
+ .addMBB(TpExit);
+
+ BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
+ .addMBB(TpLoopBody)
+ .add(predOps(ARMCC::AL));
+
+ return TotalIterationsReg;
+}
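The iteration-count identity used in genTPEntry, ceil(n/16) == (n + 15) >> 4, checked over a small range (illustration only):

#include <cassert>

int main() {
  for (unsigned N = 0; N < 1024; ++N) {
    unsigned Ceil = (N == 0) ? 0 : (N - 1) / 16 + 1; // ceil(N/16)
    assert(((N + 15) >> 4) == Ceil);
  }
}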
+
+/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
+/// t2DoLoopEnd. These are used by later passes to generate tail predicated
+/// loops.
+static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
+ MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
+ const TargetInstrInfo *TII, DebugLoc Dl,
+ MachineRegisterInfo &MRI, Register OpSrcReg,
+ Register OpDestReg, Register ElementCountReg,
+ Register TotalIterationsReg, bool IsMemcpy) {
+ // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
+ // array, loop iteration counter, predication counter.
+
+ Register SrcPhiReg, CurrSrcReg;
+ if (IsMemcpy) {
+ // Current position in the src array
+ SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+ CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
+ .addUse(OpSrcReg)
+ .addMBB(TpEntry)
+ .addUse(CurrSrcReg)
+ .addMBB(TpLoopBody);
+ }
+
+ // Current position in the dest array
+ Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+ Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
+ .addUse(OpDestReg)
+ .addMBB(TpEntry)
+ .addUse(CurrDestReg)
+ .addMBB(TpLoopBody);
+
+ // Current loop counter
+ Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
+ Register RemainingLoopIterationsReg =
+ MRI.createVirtualRegister(&ARM::GPRlrRegClass);
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
+ .addUse(TotalIterationsReg)
+ .addMBB(TpEntry)
+ .addUse(RemainingLoopIterationsReg)
+ .addMBB(TpLoopBody);
+
+ // Predication counter
+ Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+ Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
+ .addUse(ElementCountReg)
+ .addMBB(TpEntry)
+ .addUse(RemainingElementsReg)
+ .addMBB(TpLoopBody);
+
+ // Pass predication counter to VCTP
+ Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
+ .addUse(PredCounterPhiReg)
+ .addImm(ARMVCC::None)
+ .addReg(0);
+
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
+ .addUse(PredCounterPhiReg)
+ .addImm(16)
+ .add(predOps(ARMCC::AL))
+ .addReg(0);
+
+ // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
+ Register SrcValueReg;
+ if (IsMemcpy) {
+ SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
+ .addDef(CurrSrcReg)
+ .addDef(SrcValueReg)
+ .addReg(SrcPhiReg)
+ .addImm(16)
+ .addImm(ARMVCC::Then)
+ .addUse(VccrReg);
+ } else
+ SrcValueReg = OpSrcReg;
+
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
+ .addDef(CurrDestReg)
+ .addUse(SrcValueReg)
+ .addReg(DestPhiReg)
+ .addImm(16)
+ .addImm(ARMVCC::Then)
+ .addUse(VccrReg);
+
+ // Add the pseudoInstrs for decrementing the loop counter and marking the
+ // end:t2DoLoopDec and t2DoLoopEnd
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
+ .addUse(LoopCounterPhiReg)
+ .addImm(1);
+
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
+ .addUse(RemainingLoopIterationsReg)
+ .addMBB(TpLoopBody);
+
+ BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
+ .addMBB(TpExit)
+ .add(predOps(ARMCC::AL));
+}
+
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -11005,6 +11558,98 @@
return BB;
}
+ case ARM::MVE_MEMCPYLOOPINST:
+ case ARM::MVE_MEMSETLOOPINST: {
+
+ // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
+ // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
+    // the iteration count = ceil(size_in_bytes/16) in the TP entry block and
+ // adds the relevant instructions in the TP loop Body for generation of a
+ // WLSTP loop.
+
+ // Below is relevant portion of the CFG after the transformation.
+ // The Machine Basic Blocks are shown along with branch conditions (in
+ // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
+ // portion of the CFG and may not necessarily be the entry/exit of the
+ // function.
+
+ // (Relevant) CFG after transformation:
+ // TP entry MBB
+ // |
+ // |-----------------|
+ // (n <= 0) (n > 0)
+ // | |
+ // | TP loop Body MBB<--|
+ // | | |
+ // \ |___________|
+ // \ /
+ // TP exit MBB
+
+ MachineFunction *MF = BB->getParent();
+ MachineFunctionProperties &Properties = MF->getProperties();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ Register OpDestReg = MI.getOperand(0).getReg();
+ Register OpSrcReg = MI.getOperand(1).getReg();
+ Register OpSizeReg = MI.getOperand(2).getReg();
+
+ // Allocate the required MBBs and add to parent function.
+ MachineBasicBlock *TpEntry = BB;
+ MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *TpExit;
+
+ MF->push_back(TpLoopBody);
+
+ // If any instructions are present in the current block after
+ // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
+ // move the instructions into the newly created exit block. If there are no
+ // instructions add an explicit branch to the FallThrough block and then
+ // split.
+ //
+ // The split is required for two reasons:
+ // 1) A terminator(t2WhileLoopStart) will be placed at that site.
+ // 2) Since a TPLoopBody will be added later, any phis in successive blocks
+ // need to be updated. splitAt() already handles this.
+ TpExit = BB->splitAt(MI, false);
+ if (TpExit == BB) {
+ assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
+ "block containing memcpy/memset Pseudo");
+ TpExit = BB->getFallThrough();
+ BuildMI(BB, dl, TII->get(ARM::t2B))
+ .addMBB(TpExit)
+ .add(predOps(ARMCC::AL));
+ TpExit = BB->splitAt(MI, false);
+ }
+
+ // Add logic for iteration count
+ Register TotalIterationsReg =
+ genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
+
+ // Add the vectorized (and predicated) loads/store instructions
+ bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
+ genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
+ OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
+
+ // Required to avoid conflict with the MachineVerifier during testing.
+ Properties.reset(MachineFunctionProperties::Property::NoPHIs);
+
+ // Connect the blocks
+ TpEntry->addSuccessor(TpLoopBody);
+ TpLoopBody->addSuccessor(TpLoopBody);
+ TpLoopBody->addSuccessor(TpExit);
+
+ // Reorder for a more natural layout
+ TpLoopBody->moveAfter(TpEntry);
+ TpExit->moveAfter(TpLoopBody);
+
+    // Finally, remove the memcpy Pseudo Instruction
+ MI.eraseFromParent();
+
+ // Return the exit block as it may contain other instructions requiring a
+ // custom inserter
+ return TpExit;
+ }
+
// The Thumb2 pre-indexed stores have the same MI operands, they just
// define them differently in the .td files from the isel patterns, so
// they need pseudos.
@@ -11266,14 +11911,6 @@
return EmitLowered__chkstk(MI, BB);
case ARM::WIN__DBZCHK:
return EmitLowered__dbzchk(MI, BB);
- case ARM::t2DoLoopStart:
- // We are just here to set a register allocation hint, prefering lr for the
- // input register to make it more likely to be movable and removable, later
- // in the pipeline.
- Register R = MI.getOperand(1).getReg();
- MachineFunction *MF = MI.getParent()->getParent();
- MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
- return BB;
}
}
@@ -12285,7 +12922,7 @@
Ext1.getOpcode() != ISD::SIGN_EXTEND)
return SDValue();
EVT VecVT = Ext0.getOperand(0).getValueType();
- if (VecVT != MVT::v4i32 && VecVT != MVT::v8i16 && VecVT != MVT::v16i8)
+ if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
return SDValue();
if (Ext1.getOperand(0).getValueType() != VecVT ||
VecVT.getScalarType() != ScalarType ||
@@ -12293,9 +12930,42 @@
return SDValue();
SDLoc DL(Mul);
- SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, VecVT, Ext0.getOperand(0),
- Ext1.getOperand(0));
- return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, VQDMULH);
+ unsigned LegalLanes = 128 / (ShftAmt + 1);
+ EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
+ // For types smaller than legal vectors extend to be legal and only use needed
+ // lanes.
+ if (VecVT.getSizeInBits() < 128) {
+ EVT ExtVecVT =
+ MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
+ VecVT.getVectorNumElements());
+ SDValue Inp0 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
+ SDValue Inp1 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
+ Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
+ Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
+ SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
+ SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
+ Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
+ }
+
+ // For larger types, split into legal sized chunks.
+ assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
+ unsigned NumParts = VecVT.getSizeInBits() / 128;
+ SmallVector<SDValue> Parts;
+ for (unsigned I = 0; I < NumParts; ++I) {
+ SDValue Inp0 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
+ DAG.getVectorIdxConstant(I * LegalLanes, DL));
+ SDValue Inp1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
+ DAG.getVectorIdxConstant(I * LegalLanes, DL));
+ SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
+ Parts.push_back(VQDMULH);
+ }
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
}
static SDValue PerformVSELECTCombine(SDNode *N,
@@ -12394,8 +13064,7 @@
return SDValue();
}
-static SDValue PerformADDVecReduce(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
+static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64)
return SDValue();
@@ -12408,28 +13077,39 @@
// t1: i32,i32 = ARMISD::VADDLVs x
// t2: i64 = build_pair t1, t1:1
// t3: i64 = add t2, y
+ // Otherwise we try to push the add up above VADDLVAx, to potentially allow
+  // the add to be simplified separately.
// We also need to check for sext / zext and commutitive adds.
auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
SDValue NB) {
if (NB->getOpcode() != ISD::BUILD_PAIR)
return SDValue();
SDValue VecRed = NB->getOperand(0);
- if (VecRed->getOpcode() != Opcode || VecRed.getResNo() != 0 ||
+ if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
+ VecRed.getResNo() != 0 ||
NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
return SDValue();
SDLoc dl(N);
+ if (VecRed->getOpcode() == OpcodeA) {
+ // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
+ SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
+ VecRed.getOperand(0), VecRed.getOperand(1));
+ NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
+ }
+
SmallVector<SDValue, 4> Ops;
- Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
- DCI.DAG.getConstant(0, dl, MVT::i32)));
- Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
- DCI.DAG.getConstant(1, dl, MVT::i32)));
- for (unsigned i = 0, e = VecRed.getNumOperands(); i < e; i++)
- Ops.push_back(VecRed->getOperand(i));
- SDValue Red = DCI.DAG.getNode(OpcodeA, dl,
- DCI.DAG.getVTList({MVT::i32, MVT::i32}), Ops);
- return DCI.DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
- SDValue(Red.getNode(), 1));
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
+ DAG.getConstant(0, dl, MVT::i32)));
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
+ DAG.getConstant(1, dl, MVT::i32)));
+ unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
+ for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
+ Ops.push_back(VecRed->getOperand(I));
+ SDValue Red =
+ DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
+ SDValue(Red.getNode(), 1));
};
if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
@@ -12638,7 +13318,7 @@
if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
return Result;
- if (SDValue Result = PerformADDVecReduce(N, DCI, Subtarget))
+ if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
return Result;
// First try with the default operand order.
@@ -12649,6 +13329,26 @@
return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
}
+// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
+// providing -X is as cheap as X (currently, just a constant).
+static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
+ if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
+ return SDValue();
+ SDValue CSINC = N->getOperand(1);
+ if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
+ return SDValue();
+
+ ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
+ if (!X)
+ return SDValue();
+
+ return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
+ DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
+ CSINC.getOperand(0)),
+ CSINC.getOperand(1), CSINC.getOperand(2),
+ CSINC.getOperand(3));
+}
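This fold relies on the two's-complement identity -(y + 1) == ~y: negating the "increment" arm of a CSINC yields the "invert" arm of a CSINV. A scalar sketch, with csinc/csinv modelled as select-style helpers rather than the real ISD nodes:

#include <cassert>
#include <cstdint>

static int32_t csinc(int32_t X, int32_t Y, bool CC) { return CC ? X : Y + 1; }
static int32_t csinv(int32_t X, int32_t Y, bool CC) { return CC ? X : ~Y; }

int main() {
  const int32_t X = 42; // the constant operand the combine requires
  for (int32_t Y : {-5, 0, 7, 123456})
    for (bool CC : {false, true})
      assert(0 - csinc(X, Y, CC) == csinv(-X, Y, CC));
}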
+
/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
///
static SDValue PerformSUBCombine(SDNode *N,
@@ -12662,6 +13362,9 @@
if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
return Result;
+ if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
+ return R;
+
if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
return SDValue();
@@ -13254,8 +13957,7 @@
return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
}
-static SDValue PerformORCombine_i1(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
+static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
// Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
// together with predicates
@@ -13274,10 +13976,10 @@
if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
return SDValue();
- SDValue NewN0 = DCI.DAG.getLogicalNOT(DL, N0, VT);
- SDValue NewN1 = DCI.DAG.getLogicalNOT(DL, N1, VT);
- SDValue And = DCI.DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
- return DCI.DAG.getLogicalNOT(DL, And, VT);
+ SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
+ SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
+ return DAG.getLogicalNOT(DL, And, VT);
}
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
@@ -13295,7 +13997,7 @@
if (Subtarget->hasMVEIntegerOps() &&
(VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
- return PerformORCombine_i1(N, DCI, Subtarget);
+ return PerformORCombine_i1(N, DAG, Subtarget);
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
@@ -13414,8 +14116,8 @@
Ops.push_back(N0->getOperand(0));
if (N0->getOpcode() == ARMISD::VCMP)
Ops.push_back(N0->getOperand(1));
- Ops.push_back(DCI.DAG.getConstant(CC, DL, MVT::i32));
- return DCI.DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
+ Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
+ return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
}
}
}
@@ -13456,52 +14158,40 @@
}
static SDValue FindBFIToCombineWith(SDNode *N) {
- // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
- // if one exists.
+ // We have a BFI in N. Find a BFI it can combine with, if one exists.
APInt ToMask, FromMask;
SDValue From = ParseBFI(N, ToMask, FromMask);
SDValue To = N->getOperand(0);
- // Now check for a compatible BFI to merge with. We can pass through BFIs that
- // aren't compatible, but not if they set the same bit in their destination as
- // we do (or that of any BFI we're going to combine with).
SDValue V = To;
- APInt CombinedToMask = ToMask;
- while (V.getOpcode() == ARMISD::BFI) {
- APInt NewToMask, NewFromMask;
- SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
- if (NewFrom != From) {
- // This BFI has a different base. Keep going.
- CombinedToMask |= NewToMask;
- V = V.getOperand(0);
- continue;
- }
+ if (V.getOpcode() != ARMISD::BFI)
+ return SDValue();
- // Do the written bits conflict with any we've seen so far?
- if ((NewToMask & CombinedToMask).getBoolValue())
- // Conflicting bits - bail out because going further is unsafe.
- return SDValue();
+ APInt NewToMask, NewFromMask;
+ SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
+ if (NewFrom != From)
+ return SDValue();
- // Are the new bits contiguous when combined with the old bits?
- if (BitsProperlyConcatenate(ToMask, NewToMask) &&
- BitsProperlyConcatenate(FromMask, NewFromMask))
- return V;
- if (BitsProperlyConcatenate(NewToMask, ToMask) &&
- BitsProperlyConcatenate(NewFromMask, FromMask))
- return V;
+ // Do the written bits conflict with any we've seen so far?
+ if ((NewToMask & ToMask).getBoolValue())
+ // Conflicting bits.
+ return SDValue();
- // We've seen a write to some bits, so track it.
- CombinedToMask |= NewToMask;
- // Keep going...
- V = V.getOperand(0);
- }
+ // Are the new bits contiguous when combined with the old bits?
+ if (BitsProperlyConcatenate(ToMask, NewToMask) &&
+ BitsProperlyConcatenate(FromMask, NewFromMask))
+ return V;
+ if (BitsProperlyConcatenate(NewToMask, ToMask) &&
+ BitsProperlyConcatenate(NewFromMask, FromMask))
+ return V;
return SDValue();
}
-static SDValue PerformBFICombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+
if (N1.getOpcode() == ISD::AND) {
// (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
// the bits being cleared by the AND are not demanded by the BFI.
@@ -13517,17 +14207,13 @@
unsigned Mask = (1u << Width) - 1;
unsigned Mask2 = N11C->getZExtValue();
if ((Mask & (~Mask2)) == 0)
- return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
- N->getOperand(0), N1.getOperand(0),
- N->getOperand(2));
- } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
- // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
- // Keep track of any consecutive bits set that all come from the same base
- // value. We can combine these together into a single BFI.
- SDValue CombineBFI = FindBFIToCombineWith(N);
- if (CombineBFI == SDValue())
- return SDValue();
+ return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N1.getOperand(0), N->getOperand(2));
+ return SDValue();
+ }
+ // Look for another BFI to combine with.
+ if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
// We've found a BFI.
APInt ToMask1, FromMask1;
SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
@@ -13537,9 +14223,7 @@
assert(From1 == From2);
(void)From2;
- // First, unlink CombineBFI.
- DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
- // Then create a new BFI, combining the two together.
+ // Create a new BFI, combining the two together.
APInt NewFromMask = FromMask1 | FromMask2;
APInt NewToMask = ToMask1 | ToMask2;
@@ -13547,12 +14231,33 @@
SDLoc dl(N);
if (NewFromMask[0] == 0)
- From1 = DCI.DAG.getNode(
- ISD::SRL, dl, VT, From1,
- DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
- return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
- DCI.DAG.getConstant(~NewToMask, dl, VT));
+ From1 = DAG.getNode(
+ ISD::SRL, dl, VT, From1,
+ DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
+ return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
+ DAG.getConstant(~NewToMask, dl, VT));
}
+
+ // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
+ // that lower bit insertions are performed first, providing that M1 and M2
+  // do not overlap. This can allow multiple BFI instructions to be combined
+ // together by the other folds above.
+ if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
+ APInt ToMask1 = ~N->getConstantOperandAPInt(2);
+ APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
+
+ if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
+ ToMask1.countLeadingZeros() < ToMask2.countLeadingZeros())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
+ N->getOperand(1), N->getOperand(2));
+ return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
+ N0.getOperand(2));
+ }
+
return SDValue();
}
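A simplified model of why the reassociation is sound when the masks are disjoint: bit-field inserts into non-overlapping masks commute. The insertBits helper below is a generic model, not ARMISD::BFI's exact operand encoding:

#include <cassert>
#include <cstdint>

static uint32_t insertBits(uint32_t X, uint32_t Y, uint32_t M) {
  return (X & ~M) | (Y & M); // replace the bits of X selected by M with Y's
}

int main() {
  uint32_t A = 0xAAAA5555, B = 0x12345678, C = 0x9ABCDEF0;
  uint32_t M1 = 0x0000FF00, M2 = 0x00FF0000; // disjoint masks
  assert((M1 & M2) == 0);
  assert(insertBits(insertBits(A, B, M1), C, M2) ==
         insertBits(insertBits(A, C, M2), B, M1));
}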
@@ -13597,6 +14302,54 @@
return Result;
}
+ // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
+ // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
+ if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(InDouble.getOperand(1))) {
+ SDValue BV = InDouble.getOperand(0);
+ // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
+ // change lane order under big endian.
+ bool BVSwap = BV.getOpcode() == ISD::BITCAST;
+ while (
+ (BV.getOpcode() == ISD::BITCAST ||
+ BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
+ (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
+ BVSwap = BV.getOpcode() == ISD::BITCAST;
+ BV = BV.getOperand(0);
+ }
+ if (BV.getValueType() != MVT::v4i32)
+ return SDValue();
+
+ // Handle buildvectors, pulling out the correct lane depending on
+ // endianness.
+ unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
+ if (BV.getOpcode() == ISD::BUILD_VECTOR) {
+ SDValue Op0 = BV.getOperand(Offset);
+ SDValue Op1 = BV.getOperand(Offset + 1);
+ if (!Subtarget->isLittle() && BVSwap)
+ std::swap(Op0, Op1);
+
+ return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
+ }
+
+ // A chain of insert_vectors, grabbing the correct value of the chain of
+ // inserts.
+ SDValue Op0, Op1;
+ while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ if (isa<ConstantSDNode>(BV.getOperand(2))) {
+ if (BV.getConstantOperandVal(2) == Offset)
+ Op0 = BV.getOperand(1);
+ if (BV.getConstantOperandVal(2) == Offset + 1)
+ Op1 = BV.getOperand(1);
+ }
+ BV = BV.getOperand(0);
+ }
+ if (!Subtarget->isLittle() && BVSwap)
+ std::swap(Op0, Op1);
+ if (Op0 && Op1)
+ return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
+ }
+
return SDValue();
}
@@ -13618,7 +14371,8 @@
return SDValue();
}
-static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue PerformVMOVhrCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
SDValue Op0 = N->getOperand(0);
// VMOVhr (VMOVrh (X)) -> X
@@ -13664,15 +14418,14 @@
return SDValue();
}
-static SDValue PerformVMOVrhCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (VMOVrh (fpconst x)) -> const x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
APFloat V = C->getValueAPF();
- return DCI.DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
+ return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
}
// fold (VMOVrh (load x)) -> (zextload (i16*)x)
@@ -13680,18 +14433,18 @@
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue Load =
- DCI.DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
- LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
- DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
- DCI.DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
+ DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
+ LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
return Load;
}
// Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(N0->getOperand(1)))
- return DCI.DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
- N0->getOperand(1));
+ return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
+ N0->getOperand(1));
return SDValue();
}
@@ -13869,30 +14622,32 @@
return SDValue();
}
-static SDValue
-PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
- const ARMSubtarget *ST) {
+static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
EVT VT = N->getValueType(0);
SDValue Op = N->getOperand(0);
SDLoc dl(N);
// Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
if (ST->isLittle())
- return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+ // VECTOR_REG_CAST undef -> undef
+ if (Op.isUndef())
+ return DAG.getUNDEF(VT);
// VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
// If the valuetypes are the same, we can remove the cast entirely.
if (Op->getOperand(0).getValueType() == VT)
return Op->getOperand(0);
- return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
+ return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
}
return SDValue();
}
-static SDValue PerformVCMPCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
+static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
if (!Subtarget->hasMVEIntegerOps())
return SDValue();
@@ -13906,19 +14661,18 @@
// vcmp X, 0, cc -> vcmpz X, cc
if (isZeroVector(Op1))
- return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0,
- N->getOperand(2));
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
unsigned SwappedCond = getSwappedCondition(Cond);
if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
// vcmp 0, X, cc -> vcmpz X, reversed(cc)
if (isZeroVector(Op0))
- return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
- DCI.DAG.getConstant(SwappedCond, dl, MVT::i32));
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
+ DAG.getConstant(SwappedCond, dl, MVT::i32));
// vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
- return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
- DCI.DAG.getConstant(SwappedCond, dl, MVT::i32));
+ return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
+ DAG.getConstant(SwappedCond, dl, MVT::i32));
}
return SDValue();
@@ -13950,8 +14704,73 @@
return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
}
+// Convert a pair of extracts from the same base vector to a VMOVRRD, either
+// directly or bitcast to an integer if the original is a float vector.
+// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
+// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
+static SDValue
+PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
+ !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
+ return SDValue();
+
+ SDValue Ext = SDValue(N, 0);
+ if (Ext.getOpcode() == ISD::BITCAST &&
+ Ext.getOperand(0).getValueType() == MVT::f32)
+ Ext = Ext.getOperand(0);
+ if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Ext.getOperand(1)) ||
+ Ext.getConstantOperandVal(1) % 2 != 0)
+ return SDValue();
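+  // Skip if the extract's only use is an int-to-fp conversion; leaving it as a
+  // vector lane extract is likely better in that case.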
+ if (Ext->use_size() == 1 &&
+ (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP ||
+ Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP))
+ return SDValue();
+
+ SDValue Op0 = Ext.getOperand(0);
+ EVT VecVT = Op0.getValueType();
+ unsigned Lane = Ext.getConstantOperandVal(1);
+ if (VecVT.getVectorNumElements() != 4)
+ return SDValue();
+
+ // Find another extract, of Lane + 1
+ auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) {
+ return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(V->getOperand(1)) &&
+ V->getConstantOperandVal(1) == Lane + 1;
+ });
+ if (OtherIt == Op0->uses().end())
+ return SDValue();
+
+  // For float extracts, we need to be converting to an i32 for both vector
+  // lanes.
+ SDValue OtherExt(*OtherIt, 0);
+ if (OtherExt.getValueType() != MVT::i32) {
+ if (OtherExt->use_size() != 1 ||
+ OtherExt->use_begin()->getOpcode() != ISD::BITCAST ||
+ OtherExt->use_begin()->getValueType(0) != MVT::i32)
+ return SDValue();
+ OtherExt = SDValue(*OtherExt->use_begin(), 0);
+ }
+
+ // Convert the type to a f64 and extract with a VMOVRRD.
+ SDValue F64 = DCI.DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
+ DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
+ SDValue VMOVRRD =
+ DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
+
+ DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
+ return VMOVRRD;
+}
+
static SDValue PerformExtractEltCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *ST) {
SDValue Op0 = N->getOperand(0);
EVT VT = N->getValueType(0);
SDLoc dl(N);
@@ -13963,6 +14782,8 @@
return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
if (VT == MVT::i32 && X.getValueType() == MVT::f16)
return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
+ if (VT == MVT::f32 && X.getValueType() == MVT::i32)
+ return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
X = X->getOperand(0);
@@ -13970,12 +14791,131 @@
return X;
}
+ // extract ARM_BUILD_VECTOR -> x
+ if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
+ isa<ConstantSDNode>(N->getOperand(1)) &&
+ N->getConstantOperandVal(1) < Op0.getNumOperands()) {
+ return Op0.getOperand(N->getConstantOperandVal(1));
+ }
+
+ // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
+ if (Op0.getValueType() == MVT::v4i32 &&
+ isa<ConstantSDNode>(N->getOperand(1)) &&
+ Op0.getOpcode() == ISD::BITCAST &&
+ Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+ Op0.getOperand(0).getValueType() == MVT::v2f64) {
+ SDValue BV = Op0.getOperand(0);
+ unsigned Offset = N->getConstantOperandVal(1);
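+    // Each v2f64 element of the BUILD_VECTOR comes from a VMOVDRR of two i32s;
+    // select the element, then the i32 half, accounting for endianness.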
+ SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
+ if (MOV.getOpcode() == ARMISD::VMOVDRR)
+ return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
+ }
+
+ // extract x, n; extract x, n+1 -> VMOVRRD x
+ if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
+ return R;
+
+ // extract (MVETrunc(x)) -> extract x
+ if (Op0->getOpcode() == ARMISD::MVETRUNC) {
+ unsigned Idx = N->getConstantOperandVal(1);
+ unsigned Vec =
+ Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
+ unsigned SubIdx =
+ Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
+ return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
+ DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
+ }
+
+ return SDValue();
+}
+
+static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // sext_inreg(VGETLANEu) -> VGETLANEs
+ if (Op.getOpcode() == ARMISD::VGETLANEu &&
+ cast<VTSDNode>(N->getOperand(1))->getVT() ==
+ Op.getOperand(0).getValueType().getScalarType())
+ return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
+ Op.getOperand(1));
+
+ return SDValue();
+}
+
+// When lowering complex nodes that we recognize, like VQDMULH and MULH, we
+// can end up with shuffle(binop(shuffle, shuffle)), which can be simplified
+// to the binop as the shuffles cancel out.
+static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if (!N->getOperand(1).isUndef() || N->getOperand(0).getValueType() != VT)
+ return SDValue();
+ SDValue Op = N->getOperand(0);
+
+ // Looking for binary operators that will have been folded from
+ // truncates/extends.
+ switch (Op.getOpcode()) {
+ case ARMISD::VQDMULH:
+ case ISD::MULHS:
+ case ISD::MULHU:
+ case ISD::ABDS:
+ case ISD::ABDU:
+ break;
+ default:
+ return SDValue();
+ }
+
+ ShuffleVectorSDNode *Op0 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0));
+ ShuffleVectorSDNode *Op1 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1));
+ if (!Op0 || !Op1 || !Op0->getOperand(1).isUndef() ||
+ !Op1->getOperand(1).isUndef() || Op0->getMask() != Op1->getMask() ||
+ Op0->getOperand(0).getValueType() != VT)
+ return SDValue();
+
+ // Check the mask turns into an identity shuffle.
+ ArrayRef<int> NMask = N->getMask();
+ ArrayRef<int> OpMask = Op0->getMask();
+ for (int i = 0, e = NMask.size(); i != e; i++) {
+ if (NMask[i] > 0 && OpMask[NMask[i]] > 0 && OpMask[NMask[i]] != i)
+ return SDValue();
+ }
+
+ return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
+ Op0->getOperand(0), Op1->getOperand(0));
+}
+
+// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
+static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
+ SelectionDAG &DAG) {
+ SDValue Trunc = N->getOperand(0);
+ EVT VT = Trunc.getValueType();
+ if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
+ return SDValue();
+
+ SDLoc DL(Trunc);
+ if (isVMOVNTruncMask(N->getMask(), VT, 0))
+ return DAG.getNode(
+ ARMISD::VMOVN, DL, VT,
+ DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
+ DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
+ DAG.getConstant(1, DL, MVT::i32));
+ else if (isVMOVNTruncMask(N->getMask(), VT, 1))
+ return DAG.getNode(
+ ARMISD::VMOVN, DL, VT,
+ DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
+ DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
+ DAG.getConstant(1, DL, MVT::i32));
return SDValue();
}
/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
/// ISD::VECTOR_SHUFFLE.
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
+ if (SDValue R = FlattenVectorShuffle(cast<ShuffleVectorSDNode>(N), DAG))
+ return R;
+ if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
+ return R;
+
// The LLVM shufflevector instruction does not require the shuffle mask
// length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
// have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
@@ -14064,6 +15004,9 @@
// Find the new opcode for the updating load/store.
bool isLoadOp = true;
bool isLaneOp = false;
+ // Workaround for vst1x and vld1x intrinsics which do not have alignment
+ // as an operand.
+ bool hasAlignment = true;
unsigned NewOpc = 0;
unsigned NumVecs = 0;
if (isIntrinsic) {
@@ -14078,15 +15021,18 @@
NumVecs = 3; break;
case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
NumVecs = 4; break;
- case Intrinsic::arm_neon_vld1x2:
- case Intrinsic::arm_neon_vld1x3:
- case Intrinsic::arm_neon_vld1x4:
- case Intrinsic::arm_neon_vld2dup:
- case Intrinsic::arm_neon_vld3dup:
- case Intrinsic::arm_neon_vld4dup:
- // TODO: Support updating VLD1x and VLDxDUP nodes. For now, we just skip
- // combining base updates for such intrinsics.
- continue;
+ case Intrinsic::arm_neon_vld1x2: NewOpc = ARMISD::VLD1x2_UPD;
+ NumVecs = 2; hasAlignment = false; break;
+ case Intrinsic::arm_neon_vld1x3: NewOpc = ARMISD::VLD1x3_UPD;
+ NumVecs = 3; hasAlignment = false; break;
+ case Intrinsic::arm_neon_vld1x4: NewOpc = ARMISD::VLD1x4_UPD;
+ NumVecs = 4; hasAlignment = false; break;
+ case Intrinsic::arm_neon_vld2dup: NewOpc = ARMISD::VLD2DUP_UPD;
+ NumVecs = 2; break;
+ case Intrinsic::arm_neon_vld3dup: NewOpc = ARMISD::VLD3DUP_UPD;
+ NumVecs = 3; break;
+ case Intrinsic::arm_neon_vld4dup: NewOpc = ARMISD::VLD4DUP_UPD;
+ NumVecs = 4; break;
case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
NumVecs = 2; isLaneOp = true; break;
case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
@@ -14107,6 +15053,12 @@
NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst1x2: NewOpc = ARMISD::VST1x2_UPD;
+ NumVecs = 2; isLoadOp = false; hasAlignment = false; break;
+ case Intrinsic::arm_neon_vst1x3: NewOpc = ARMISD::VST1x3_UPD;
+ NumVecs = 3; isLoadOp = false; hasAlignment = false; break;
+ case Intrinsic::arm_neon_vst1x4: NewOpc = ARMISD::VST1x4_UPD;
+ NumVecs = 4; isLoadOp = false; hasAlignment = false; break;
}
} else {
isLaneOp = true;
@@ -14134,8 +15086,12 @@
VecTy = N->getOperand(1).getValueType();
}
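+  // VLDxDUP only loads a single element per vector from memory (the lanes are
+  // duplicates), so like lane ops the post-increment is per element.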
+ bool isVLDDUPOp =
+ NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
+ NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
+
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
- if (isLaneOp)
+ if (isLaneOp || isVLDDUPOp)
NumBytes /= VecTy.getVectorNumElements();
// If the increment is a constant, it must match the memory ref size.
@@ -14210,7 +15166,9 @@
} else {
// Loads (and of course intrinsics) match the intrinsics' signature,
// so just add all but the alignment operand.
- for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
+ unsigned LastOperand =
+ hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
+ for (unsigned i = AddrOpIdx + 1; i < LastOperand; ++i)
Ops.push_back(N->getOperand(i));
}
@@ -14505,10 +15463,8 @@
}
/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
-static SDValue PerformVDUPCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
+static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
- SelectionDAG &DAG = DCI.DAG;
SDValue Op = N->getOperand(0);
SDLoc dl(N);
@@ -14516,11 +15472,11 @@
// Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
// need to come from a GPR.
if (Op.getValueType() == MVT::f32)
- return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
- DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
+ return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
+ DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
else if (Op.getValueType() == MVT::f16)
- return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
- DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
+ return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
+ DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
}
if (!Subtarget->hasNEON())
@@ -14532,12 +15488,12 @@
LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
if (LD && Op.hasOneUse() && LD->isUnindexed() &&
LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
- SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
- DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
+ SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
+ DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32)};
SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
- SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
- Ops, LD->getMemoryVT(),
- LD->getMemOperand());
+ SDValue VLDDup =
+ DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
+ LD->getMemoryVT(), LD->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
return VLDDup;
}
@@ -14642,7 +15598,7 @@
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
-// Try taking a single vector store from an truncate (which would otherwise turn
+// Try taking a single vector store from an fpround (which would otherwise turn
// into an expensive buildvector) and splitting it into a series of narrowing
// stores.
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
@@ -14650,7 +15606,7 @@
if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
return SDValue();
SDValue Trunc = St->getValue();
- if (Trunc->getOpcode() != ISD::TRUNCATE && Trunc->getOpcode() != ISD::FP_ROUND)
+ if (Trunc->getOpcode() != ISD::FP_ROUND)
return SDValue();
EVT FromVT = Trunc->getOperand(0).getValueType();
EVT ToVT = Trunc.getValueType();
@@ -14660,16 +15616,11 @@
EVT ToEltVT = ToVT.getVectorElementType();
EVT FromEltVT = FromVT.getVectorElementType();
- unsigned NumElements = 0;
- if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
- NumElements = 4;
- if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
- NumElements = 8;
- if (FromEltVT == MVT::f32 && ToEltVT == MVT::f16)
- NumElements = 4;
- if (NumElements == 0 ||
- (FromEltVT != MVT::f32 && FromVT.getVectorNumElements() == NumElements) ||
- FromVT.getVectorNumElements() % NumElements != 0)
+ if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
+ return SDValue();
+
+ unsigned NumElements = 4;
+ if (FromVT.getVectorNumElements() % NumElements != 0)
return SDValue();
// Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so
@@ -14698,14 +15649,6 @@
return true;
};
- // It may be preferable to keep the store unsplit as the trunc may end up
- // being removed. Check that here.
- if (Trunc.getOperand(0).getOpcode() == ISD::SMIN) {
- if (SDValue U = PerformVQDMULHCombine(Trunc.getOperand(0).getNode(), DAG)) {
- DAG.ReplaceAllUsesWith(Trunc.getOperand(0), U);
- return SDValue();
- }
- }
if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
return SDValue();
@@ -14735,12 +15678,10 @@
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
DAG.getConstant(i * NumElements, DL, MVT::i32));
- if (ToEltVT == MVT::f16) {
- SDValue FPTrunc =
- DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
- Extract, DAG.getConstant(0, DL, MVT::i32));
- Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
- }
+ SDValue FPTrunc =
+ DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
+ Extract, DAG.getConstant(0, DL, MVT::i32));
+ Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
SDValue Store = DAG.getTruncStore(
Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
@@ -14750,6 +15691,83 @@
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}
+// Try taking a single vector store from an MVETRUNC (which would otherwise turn
+// into an expensive buildvector) and splitting it into a series of narrowing
+// stores.
+static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
+ SelectionDAG &DAG) {
+ if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
+ return SDValue();
+ SDValue Trunc = St->getValue();
+ if (Trunc->getOpcode() != ARMISD::MVETRUNC)
+ return SDValue();
+ EVT FromVT = Trunc->getOperand(0).getValueType();
+ EVT ToVT = Trunc.getValueType();
+
+ LLVMContext &C = *DAG.getContext();
+ SDLoc DL(St);
+ // Details about the old store
+ SDValue Ch = St->getChain();
+ SDValue BasePtr = St->getBasePtr();
+ Align Alignment = St->getOriginalAlign();
+ MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = St->getAAInfo();
+
+ EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
+ FromVT.getVectorNumElements());
+
+ SmallVector<SDValue, 4> Stores;
+ for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
+ unsigned NewOffset =
+ i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
+ SDValue NewPtr =
+ DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
+
+ SDValue Extract = Trunc.getOperand(i);
+ SDValue Store = DAG.getTruncStore(
+ Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
+ NewToVT, Alignment.value(), MMOFlags, AAInfo);
+ Stores.push_back(Store);
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
+// Given a floating point store from an extracted vector, with an integer
+// VGETLANE that already exists, store the existing VGETLANEu directly. This can
+// help reduce fp register pressure, avoids the fp extract and allows the use of
+// more integer post-inc stores, which are not available with vstr.
+static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
+ if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
+ return SDValue();
+ SDValue Extract = St->getValue();
+ EVT VT = Extract.getValueType();
+  // For now this only handles f16. It may be useful for f32 too, but that will
+  // be bitcast(extract), not the VGETLANEu we currently check here.
+ if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
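+  // Only do this when an i32 VGETLANEu of the same lane already exists;
+  // getNodeIfExists will not create a new node.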
+ SDNode *GetLane =
+ DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
+ {Extract.getOperand(0), Extract.getOperand(1)});
+ if (!GetLane)
+ return SDValue();
+
+ LLVMContext &C = *DAG.getContext();
+ SDLoc DL(St);
+ // Create a new integer store to replace the existing floating point version.
+ SDValue Ch = St->getChain();
+ SDValue BasePtr = St->getBasePtr();
+ Align Alignment = St->getOriginalAlign();
+ MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = St->getAAInfo();
+ EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
+ SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
+ St->getPointerInfo(), NewToVT,
+ Alignment.value(), MMOFlags, AAInfo);
+
+ return Store;
+}
+
/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
static SDValue PerformSTORECombine(SDNode *N,
@@ -14765,9 +15783,15 @@
if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
return Store;
- if (Subtarget->hasMVEIntegerOps())
+ if (Subtarget->hasMVEIntegerOps()) {
if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
return NewToken;
+ if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
+ return NewChain;
+ if (SDValue NewToken =
+ PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
+ return NewToken;
+ }
if (!ISD::isNormalStore(St))
return SDValue();
@@ -14953,12 +15977,12 @@
SDLoc dl(N);
// We are looking for something that will have illegal types if left alone,
- // but that we can convert to a single instruction undef MVE. For example
+ // but that we can convert to a single instruction under MVE. For example
// vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
// or
// vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
- // Cases:
+ // The legal cases are:
// VADDV u/s 8/16/32
// VMLAV u/s 8/16/32
// VADDLV u/s 32
@@ -14966,6 +15990,13 @@
// If the input vector is smaller than legal (v4i8/v4i16 for example) we can
// extend it and use v4i32 instead.
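+  // Returns true if A matches one of the expected ExtTypes: same number of
+  // elements and no wider, so that it can be extended to that type if needed.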
+ auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
+ EVT AVT = A.getValueType();
+ return any_of(ExtTypes, [&](MVT Ty) {
+ return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
+ AVT.bitsLE(Ty);
+ });
+ };
auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
EVT AVT = A.getValueType();
if (!AVT.is128BitVector())
@@ -14979,7 +16010,7 @@
if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
return SDValue();
SDValue A = N0->getOperand(0);
- if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
+ if (ExtTypeMatches(A, ExtTypes))
return ExtendIfNeeded(A, ExtendCode);
return SDValue();
};
@@ -14993,7 +16024,7 @@
if (Ext->getOpcode() != ExtendCode)
return SDValue();
SDValue A = Ext->getOperand(0);
- if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
+ if (ExtTypeMatches(A, ExtTypes))
return ExtendIfNeeded(A, ExtendCode);
return SDValue();
};
@@ -15018,13 +16049,11 @@
return false;
SDValue ExtA = Mul->getOperand(0);
SDValue ExtB = Mul->getOperand(1);
- if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode)
+ if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
return false;
A = ExtA->getOperand(0);
B = ExtB->getOperand(0);
- if (A.getValueType() == B.getValueType() &&
- llvm::any_of(ExtTypes,
- [&A](MVT Ty) { return A.getValueType() == Ty; })) {
+ if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
A = ExtendIfNeeded(A, ExtendCode);
B = ExtendIfNeeded(B, ExtendCode);
return true;
@@ -15052,13 +16081,11 @@
return false;
SDValue ExtA = Mul->getOperand(0);
SDValue ExtB = Mul->getOperand(1);
- if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode)
+ if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
return false;
A = ExtA->getOperand(0);
B = ExtB->getOperand(0);
- if (A.getValueType() == B.getValueType() &&
- llvm::any_of(ExtTypes,
- [&A](MVT Ty) { return A.getValueType() == Ty; })) {
+ if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
A = ExtendIfNeeded(A, ExtendCode);
B = ExtendIfNeeded(B, ExtendCode);
return true;
@@ -15066,6 +16093,32 @@
return false;
};
auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
+ // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
+ // reductions. The operands are extended with MVEEXT, but as they are
+ // reductions the lane orders do not matter. MVEEXT may be combined with
+ // loads to produce two extending loads, or else they will be expanded to
+ // VREV/VMOVL.
+ EVT VT = Ops[0].getValueType();
+ if (VT == MVT::v16i8) {
+ assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
+ "Unexpected illegal long reduction opcode");
+ bool IsUnsigned = Opcode == ARMISD::VMLALVu;
+
+ SDValue Ext0 =
+ DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
+ DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
+ SDValue Ext1 =
+ DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
+ DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
+
+ SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+ Ext0, Ext1);
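+      // Accumulate the high halves of the extends onto the first result using
+      // the accumulating VMLALVA form.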
+ SDValue MLA1 =
+ DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
+ DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
+ Ext0.getValue(1), Ext1.getValue(1));
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
+ }
SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
SDValue(Node.getNode(), 1));
@@ -15075,11 +16128,9 @@
return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
- if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND,
- {MVT::v4i8, MVT::v4i16, MVT::v4i32}))
+ if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
return Create64bitNode(ARMISD::VADDLVs, {A});
- if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND,
- {MVT::v4i8, MVT::v4i16, MVT::v4i32}))
+ if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
return Create64bitNode(ARMISD::VADDLVu, {A});
if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
@@ -15093,11 +16144,9 @@
return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
- if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND,
- {MVT::v4i8, MVT::v4i16, MVT::v4i32}, Mask))
+ if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
- if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND,
- {MVT::v4i8, MVT::v4i16, MVT::v4i32}, Mask))
+ if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
@@ -15111,11 +16160,11 @@
return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
- if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND,
- {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, B))
+ if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
+ A, B))
return Create64bitNode(ARMISD::VMLALVs, {A, B});
- if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND,
- {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, B))
+ if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
+ A, B))
return Create64bitNode(ARMISD::VMLALVu, {A, B});
if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
@@ -15124,17 +16173,17 @@
return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
- if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, Mask))
+ if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
+ Mask))
return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
- if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, Mask))
+ if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
+ Mask))
return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
- if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND,
- {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A,
- B, Mask))
+ if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
+ Mask))
return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
- if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND,
- {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A,
- B, Mask))
+ if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
+ Mask))
return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
@@ -15171,6 +16220,14 @@
SDValue Op1 = N->getOperand(1);
unsigned IsTop = N->getConstantOperandVal(2);
+ // VMOVNT a undef -> a
+ // VMOVNB a undef -> a
+ // VMOVNB undef a -> a
+ if (Op1->isUndef())
+ return Op0;
+ if (Op0->isUndef() && !IsTop)
+ return Op1;
+
// VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
// VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
@@ -15580,10 +16637,8 @@
EVT FromEltVT = FromVT.getVectorElementType();
unsigned NumElements = 0;
- if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
+ if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
NumElements = 4;
- if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
- NumElements = 8;
if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
NumElements = 4;
if (NumElements == 0 ||
@@ -15934,7 +16989,7 @@
}
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
- if (IntOp != Intrinsic::test_set_loop_iterations &&
+ if (IntOp != Intrinsic::test_start_loop_iterations &&
IntOp != Intrinsic::loop_decrement_reg)
return SDValue();
return N;
@@ -15949,7 +17004,7 @@
// The hwloop intrinsics that we're interested are used for control-flow,
// either for entering or exiting the loop:
- // - test.set.loop.iterations will test whether its operand is zero. If it
+ // - test.start.loop.iterations will test whether its operand is zero. If it
// is zero, the proceeding branch should not enter the loop.
// - loop.decrement.reg also tests whether its operand is zero. If it is
// zero, the proceeding branch should not branch back to the beginning of
@@ -16024,21 +17079,25 @@
DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
};
- if (IntOp == Intrinsic::test_set_loop_iterations) {
+ if (IntOp == Intrinsic::test_start_loop_iterations) {
SDValue Res;
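+      // WLSSETUP produces the loop count that the WLS branch consumes and that
+      // replaces the intrinsic's LR count value below.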
+ SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
// We expect this 'instruction' to branch when the counter is zero.
if (IsTrueIfZero(CC, Imm)) {
- SDValue Ops[] = { Chain, Elements, Dest };
+ SDValue Ops[] = {Chain, Setup, Dest};
Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
} else {
// The logic is the reverse of what we need for WLS, so find the other
// basic block target: the target of the proceeding br.
UpdateUncondBr(Br, Dest, DAG);
- SDValue Ops[] = { Chain, Elements, OtherTarget };
+ SDValue Ops[] = {Chain, Setup, OtherTarget};
Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
}
- DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
+ // Update LR count to the new value
+ DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
+ // Update chain
+ DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
return Res;
} else {
SDValue Size = DAG.getTargetConstant(
@@ -16283,8 +17342,10 @@
return Res;
}
-static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *ST) {
+static SDValue PerformBITCASTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *ST) {
+ SelectionDAG &DAG = DCI.DAG;
SDValue Src = N->getOperand(0);
EVT DstVT = N->getValueType(0);
@@ -16310,9 +17371,288 @@
DAG.getDataLayout().isBigEndian())
return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
+ // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
+ if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
+ return R;
+
return SDValue();
}
+// Some combines for the MVETrunc truncation legalizer helper. Also lowers the
+// node into stack operations after legalizeOps.
+SDValue ARMTargetLowering::PerformMVETruncCombine(
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // MVETrunc(Undef, Undef) -> Undef
+ if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
+ return DAG.getUNDEF(VT);
+
+  // MVETrunc(MVETrunc(a, b), MVETrunc(c, d)) -> MVETrunc(a, b, c, d)
+ if (N->getNumOperands() == 2 &&
+ N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
+ N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
+ return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
+ N->getOperand(0).getOperand(1),
+ N->getOperand(1).getOperand(0),
+ N->getOperand(1).getOperand(1));
+
+ // MVETrunc(shuffle, shuffle) -> VMOVN
+ if (N->getNumOperands() == 2 &&
+ N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
+ N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
+ auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
+ auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
+
+ if (S0->getOperand(0) == S1->getOperand(0) &&
+ S0->getOperand(1) == S1->getOperand(1)) {
+ // Construct complete shuffle mask
+ SmallVector<int, 8> Mask(S0->getMask().begin(), S0->getMask().end());
+ Mask.append(S1->getMask().begin(), S1->getMask().end());
+
+ if (isVMOVNTruncMask(Mask, VT, 0))
+ return DAG.getNode(
+ ARMISD::VMOVN, DL, VT,
+ DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
+ DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
+ DAG.getConstant(1, DL, MVT::i32));
+ if (isVMOVNTruncMask(Mask, VT, 1))
+ return DAG.getNode(
+ ARMISD::VMOVN, DL, VT,
+ DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
+ DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
+ DAG.getConstant(1, DL, MVT::i32));
+ }
+ }
+
+ // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
+ // truncate to a buildvector to allow the generic optimisations to kick in.
+ if (all_of(N->ops(), [](SDValue Op) {
+ return Op.getOpcode() == ISD::BUILD_VECTOR ||
+ Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
+ (Op.getOpcode() == ISD::BITCAST &&
+ Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
+ })) {
+ SmallVector<SDValue, 8> Extracts;
+ for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
+ SDValue O = N->getOperand(Op);
+ for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
+ DAG.getConstant(i, DL, MVT::i32));
+ Extracts.push_back(Ext);
+ }
+ }
+ return DAG.getBuildVector(VT, DL, Extracts);
+ }
+
+ // If we are late in the legalization process and nothing has optimised
+ // the trunc to anything better, lower it to a stack store and reload,
+ // performing the truncation whilst keeping the lanes in the correct order:
+ // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
+ if (!DCI.isAfterLegalizeDAG())
+ return SDValue();
+
+ SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ int NumIns = N->getNumOperands();
+ assert((NumIns == 2 || NumIns == 4) &&
+ "Expected 2 or 4 inputs to an MVETrunc");
+ EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+ if (N->getNumOperands() == 4)
+ StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+ SmallVector<SDValue> Chains;
+ for (int I = 0; I < NumIns; I++) {
+ SDValue Ptr = DAG.getNode(
+ ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
+ DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
+ SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
+ Ptr, MPI, StoreVT, Align(4));
+ Chains.push_back(Ch);
+ }
+
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
+ return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
+}
+
+// Take an MVEEXT(load x) and split it into (extload x, extload x+8)
+static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
+ SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
+ if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
+ return SDValue();
+
+ EVT FromVT = LD->getMemoryVT();
+ EVT ToVT = N->getValueType(0);
+ if (!ToVT.isVector())
+ return SDValue();
+ assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
+ EVT ToEltVT = ToVT.getVectorElementType();
+ EVT FromEltVT = FromVT.getVectorElementType();
+
+ unsigned NumElements = 0;
+ if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
+ NumElements = 4;
+ if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
+ NumElements = 8;
+ assert(NumElements != 0);
+
+ ISD::LoadExtType NewExtType =
+ N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
+ LD->getExtensionType() != ISD::EXTLOAD &&
+ LD->getExtensionType() != NewExtType)
+ return SDValue();
+
+ LLVMContext &C = *DAG.getContext();
+ SDLoc DL(LD);
+ // Details about the old load
+ SDValue Ch = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ Align Alignment = LD->getOriginalAlign();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
+ EVT NewFromVT = EVT::getVectorVT(
+ C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
+ EVT NewToVT = EVT::getVectorVT(
+ C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
+
+ SmallVector<SDValue, 4> Loads;
+ SmallVector<SDValue, 4> Chains;
+ for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
+ unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
+ SDValue NewPtr =
+ DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
+
+ SDValue NewLoad =
+ DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
+ LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
+ Alignment, MMOFlags, AAInfo);
+ Loads.push_back(NewLoad);
+ Chains.push_back(SDValue(NewLoad.getNode(), 1));
+ }
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
+ return DAG.getMergeValues(Loads, DL);
+}
+
+// Perform combines for MVEEXT. If it has not been optimized to anything better
+// before lowering, it gets converted to a stack store and extloads performing
+// the extend whilst still keeping the same lane ordering.
+SDValue ARMTargetLowering::PerformMVEExtCombine(
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
+ assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
+
+ EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
+ *DAG.getContext());
+ auto Extend = [&](SDValue V) {
+ SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
+ return N->getOpcode() == ARMISD::MVESEXT
+ ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
+ DAG.getValueType(ExtVT))
+ : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
+ };
+
+ // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
+ if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
+ SDValue Ext = Extend(N->getOperand(0));
+ return DAG.getMergeValues({Ext, Ext}, DL);
+ }
+
+ // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
+ ArrayRef<int> Mask = SVN->getMask();
+ assert(Mask.size() == 2 * VT.getVectorNumElements());
+ assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
+ unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
+ SDValue Op0 = SVN->getOperand(0);
+ SDValue Op1 = SVN->getOperand(1);
+
+ auto CheckInregMask = [&](int Start, int Offset) {
+ for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
+ if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
+ return false;
+ return true;
+ };
+ SDValue V0 = SDValue(N, 0);
+ SDValue V1 = SDValue(N, 1);
+ if (CheckInregMask(0, 0))
+ V0 = Extend(Op0);
+ else if (CheckInregMask(0, 1))
+ V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
+ else if (CheckInregMask(0, Mask.size()))
+ V0 = Extend(Op1);
+ else if (CheckInregMask(0, Mask.size() + 1))
+ V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
+
+ if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
+ V1 = Extend(Op1);
+ else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
+ V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
+ else if (CheckInregMask(VT.getVectorNumElements(), 0))
+ V1 = Extend(Op0);
+ else if (CheckInregMask(VT.getVectorNumElements(), 1))
+ V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
+
+ if (V0.getNode() != N || V1.getNode() != N)
+ return DAG.getMergeValues({V0, V1}, DL);
+ }
+
+ // MVEEXT(load) -> extload, extload
+ if (N->getOperand(0)->getOpcode() == ISD::LOAD)
+ if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
+ return L;
+
+ if (!DCI.isAfterLegalizeDAG())
+ return SDValue();
+
+ // Lower to a stack store and reload:
+ // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
+ SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ int NumOuts = N->getNumValues();
+ assert((NumOuts == 2 || NumOuts == 4) &&
+ "Expected 2 or 4 outputs to an MVEEXT");
+ EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
+ *DAG.getContext());
+ if (N->getNumOperands() == 4)
+ LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
+ SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
+ StackPtr, MPI, Align(4));
+
+ SmallVector<SDValue> Loads;
+ for (int I = 0; I < NumOuts; I++) {
+ SDValue Ptr = DAG.getNode(
+ ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
+ DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
+ SDValue Load = DAG.getExtLoad(
+ N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
+ VT, Chain, Ptr, MPI, LoadVT, Align(4));
+ Loads.push_back(Load);
+ }
+
+ return DAG.getMergeValues(Loads, DL);
+}
+
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
@@ -16334,18 +17674,20 @@
case ARMISD::ADDC:
case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
- case ARMISD::BFI: return PerformBFICombine(N, DCI);
+ case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
- case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI);
+ case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
- case ISD::EXTRACT_VECTOR_ELT: return PerformExtractEltCombine(N, DCI);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return PerformExtractEltCombine(N, DCI, Subtarget);
+ case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG);
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
- case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
+ case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return PerformVCVTCombine(N, DCI.DAG, Subtarget);
@@ -16379,13 +17721,18 @@
case ARMISD::BUILD_VECTOR:
return PerformARMBUILD_VECTORCombine(N, DCI);
case ISD::BITCAST:
- return PerformBITCASTCombine(N, DCI.DAG, Subtarget);
+ return PerformBITCASTCombine(N, DCI, Subtarget);
case ARMISD::PREDICATE_CAST:
return PerformPREDICATE_CASTCombine(N, DCI);
case ARMISD::VECTOR_REG_CAST:
- return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
+ return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
+ case ARMISD::MVETRUNC:
+ return PerformMVETruncCombine(N, DCI);
+ case ARMISD::MVESEXT:
+ case ARMISD::MVEZEXT:
+ return PerformMVEExtCombine(N, DCI);
case ARMISD::VCMP:
- return PerformVCMPCombine(N, DCI, Subtarget);
+ return PerformVCMPCombine(N, DCI.DAG, Subtarget);
case ISD::VECREDUCE_ADD:
return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
case ARMISD::VMOVN:
@@ -16413,7 +17760,9 @@
}
case ARMISD::SMLALBB:
case ARMISD::QADD16b:
- case ARMISD::QSUB16b: {
+ case ARMISD::QSUB16b:
+ case ARMISD::UQADD16b:
+ case ARMISD::UQSUB16b: {
unsigned BitWidth = N->getValueType(0).getSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
@@ -16450,7 +17799,9 @@
break;
}
case ARMISD::QADD8b:
- case ARMISD::QSUB8b: {
+ case ARMISD::QSUB8b:
+ case ARMISD::UQADD8b:
+ case ARMISD::UQSUB8b: {
unsigned BitWidth = N->getValueType(0).getSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
@@ -16503,7 +17854,7 @@
}
bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
- unsigned Alignment,
+ Align Alignment,
MachineMemOperand::Flags,
bool *Fast) const {
// Depends what it gets converted into if the type is weird.
@@ -16582,14 +17933,14 @@
bool Fast;
if (Op.size() >= 16 &&
(Op.isAligned(Align(16)) ||
- (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
+ (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
MachineMemOperand::MONone, &Fast) &&
Fast))) {
return MVT::v2f64;
} else if (Op.size() >= 8 &&
(Op.isAligned(Align(8)) ||
(allowsMisalignedMemoryAccesses(
- MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
+ MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
Fast))) {
return MVT::f64;
}
@@ -16849,9 +18200,10 @@
return true;
}
-int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
- const AddrMode &AM, Type *Ty,
- unsigned AS) const {
+InstructionCost ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
+ const AddrMode &AM,
+ Type *Ty,
+ unsigned AS) const {
if (isLegalAddressingMode(DL, AM, Ty, AS)) {
if (Subtarget->hasFPAO())
return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
@@ -17403,6 +18755,8 @@
auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!RHS || RHS->getZExtValue() != 4)
return false;
+ if (Alignment < Align(4))
+ return false;
Offset = Op->getOperand(1);
Base = Op->getOperand(0);
@@ -17539,6 +18893,28 @@
Known = KnownOp.zext(32);
break;
}
+ case ARMISD::CSINC:
+ case ARMISD::CSINV:
+ case ARMISD::CSNEG: {
+ KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+ KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+
+ // The result is either:
+ // CSINC: KnownOp0 or KnownOp1 + 1
+ // CSINV: KnownOp0 or ~KnownOp1
+ // CSNEG: KnownOp0 or KnownOp1 * -1
+ if (Op.getOpcode() == ARMISD::CSINC)
+ KnownOp1 = KnownBits::computeForAddSub(
+ true, false, KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
+ else if (Op.getOpcode() == ARMISD::CSINV)
+ std::swap(KnownOp1.Zero, KnownOp1.One);
+ else if (Op.getOpcode() == ARMISD::CSNEG)
+ KnownOp1 = KnownBits::mul(
+ KnownOp1, KnownBits::makeConstant(APInt(32, -1)));
+
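+    // Either operand may be selected, so only the bits known in common are
+    // known in the result.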
+ Known = KnownBits::commonBits(KnownOp0, KnownOp1);
+ break;
+ }
}
}
@@ -17842,7 +19218,7 @@
break;
}
- if (StringRef("{cc}").equals_lower(Constraint))
+ if (StringRef("{cc}").equals_insensitive(Constraint))
return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
@@ -18480,6 +19856,66 @@
Info.flags = MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::arm_mve_vldr_gather_base:
+ case Intrinsic::arm_mve_vldr_gather_base_predicated: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = nullptr;
+ Info.memVT = MVT::getVT(I.getType());
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ }
+ case Intrinsic::arm_mve_vldr_gather_base_wb:
+ case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = nullptr;
+ Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ }
+ case Intrinsic::arm_mve_vldr_gather_offset:
+ case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = nullptr;
+ MVT DataVT = MVT::getVT(I.getType());
+ unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
+ Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
+ DataVT.getVectorNumElements());
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ }
+ case Intrinsic::arm_mve_vstr_scatter_base:
+ case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.ptrVal = nullptr;
+ Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOStore;
+ return true;
+ }
+ case Intrinsic::arm_mve_vstr_scatter_base_wb:
+ case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = nullptr;
+ Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOStore;
+ return true;
+ }
+ case Intrinsic::arm_mve_vstr_scatter_offset:
+ case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.ptrVal = nullptr;
+ MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
+ unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+ Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
+ DataVT.getVectorNumElements());
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::arm_ldaex:
case Intrinsic::arm_ldrex: {
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
@@ -18551,7 +19987,7 @@
return (Index == 0 || Index == ResVT.getVectorNumElements());
}
-Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
+Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
ARM_MB::MemBOpt Domain) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -18581,7 +20017,7 @@
}
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
-Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
switch (Ord) {
@@ -18606,7 +20042,7 @@
llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}
-Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
switch (Ord) {
@@ -18654,6 +20090,14 @@
if (AI->isFloatingPointOperation())
return AtomicExpansionKind::CmpXChg;
+ // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+ // implement atomicrmw without spilling. If the target address is also on the
+ // stack and close enough to the spill slot, this can lead to a situation
+ // where the monitor always gets cleared and the atomic operation can never
+ // succeed. So at -O0 lower this operation to a CAS loop.
+ if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+ return AtomicExpansionKind::CmpXChg;
+
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
@@ -18760,16 +20204,16 @@
return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
}
-Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
+ Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
bool IsAcquire = isAcquireOrStronger(Ord);
// Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
// intrinsic must return {i32, i32} and we have to recombine them into a
// single i64 here.
- if (ValTy->getPrimitiveSizeInBits() == 64) {
+ if (ValueTy->getPrimitiveSizeInBits() == 64) {
Intrinsic::ID Int =
IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
Function *Ldrex = Intrinsic::getDeclaration(M, Int);
@@ -18781,31 +20225,29 @@
Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
if (!Subtarget->isLittle())
std::swap (Lo, Hi);
- Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
- Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+ Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
+ Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
return Builder.CreateOr(
- Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
+ Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
}
Type *Tys[] = { Addr->getType() };
Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);
- return Builder.CreateTruncOrBitCast(
- Builder.CreateCall(Ldrex, Addr),
- cast<PointerType>(Addr->getType())->getElementType());
+ return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldrex, Addr), ValueTy);
}
void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
- IRBuilder<> &Builder) const {
+ IRBuilderBase &Builder) const {
if (!Subtarget->hasV7Ops())
return;
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}
-Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
- Value *Addr,
+Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
+ Value *Val, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
bool IsRelease = isReleaseOrStronger(Ord);
@@ -18851,7 +20293,8 @@
}
bool ARMTargetLowering::isLegalInterleavedAccessType(
- unsigned Factor, FixedVectorType *VecTy, const DataLayout &DL) const {
+ unsigned Factor, FixedVectorType *VecTy, Align Alignment,
+ const DataLayout &DL) const {
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
@@ -18874,6 +20317,9 @@
// Ensure the element type is legal.
if (ElSize != 8 && ElSize != 16 && ElSize != 32)
return false;
+  // And that the alignment is high enough under MVE.
+ if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
+ return false;
// Ensure the total vector size is 64 or a multiple of 128. Types larger than
// 128 will be split into multiple interleaved accesses.
@@ -18914,11 +20360,12 @@
Type *EltTy = VecTy->getElementType();
const DataLayout &DL = LI->getModule()->getDataLayout();
+ Align Alignment = LI->getAlign();
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!isLegalInterleavedAccessType(Factor, VecTy, DL))
+ if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
return false;
unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
@@ -19067,11 +20514,12 @@
auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
+ Align Alignment = SI->getAlign();
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL))
+ if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
return false;
unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
@@ -19254,8 +20702,8 @@
}
/// Return the correct alignment for the current calling convention.
-Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
- DataLayout DL) const {
+Align ARMTargetLowering::getABIAlignmentForCallingConv(
+ Type *ArgTy, const DataLayout &DL) const {
const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
if (!ArgTy->isVectorTy())
return ABITypeAlign;
@@ -19269,7 +20717,8 @@
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
- Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg,
+ const DataLayout &DL) const {
if (getEffectiveCallingConv(CallConv, isVarArg) !=
CallingConv::ARM_AAPCS_VFP)
return false;
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h b/src/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h
index 61a127a..844b7d4 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -51,287 +51,331 @@
namespace ARMISD {
- // ARM Specific DAG Nodes
- enum NodeType : unsigned {
- // Start the numbering where the builtin ops and target ops leave off.
- FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ // ARM Specific DAG Nodes
+ enum NodeType : unsigned {
+ // Start the numbering where the builtin ops and target ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
- Wrapper, // Wrapper - A wrapper node for TargetConstantPool,
- // TargetExternalSymbol, and TargetGlobalAddress.
- WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in
- // PIC mode.
- WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable
+ Wrapper, // Wrapper - A wrapper node for TargetConstantPool,
+ // TargetExternalSymbol, and TargetGlobalAddress.
+ WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in
+ // PIC mode.
+ WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable
- // Add pseudo op to model memcpy for struct byval.
- COPY_STRUCT_BYVAL,
+ // Add pseudo op to model memcpy for struct byval.
+ COPY_STRUCT_BYVAL,
- CALL, // Function call.
- CALL_PRED, // Function call that's predicable.
- CALL_NOLINK, // Function call with branch not branch-and-link.
- tSECALL, // CMSE non-secure function call.
- BRCOND, // Conditional branch.
- BR_JT, // Jumptable branch.
- BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump).
- RET_FLAG, // Return with a flag operand.
- SERET_FLAG, // CMSE Entry function return with a flag operand.
- INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand.
+ CALL, // Function call.
+ CALL_PRED, // Function call that's predicable.
+ CALL_NOLINK, // Function call with branch not branch-and-link.
+ tSECALL, // CMSE non-secure function call.
+ BRCOND, // Conditional branch.
+ BR_JT, // Jumptable branch.
+ BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump).
+ RET_FLAG, // Return with a flag operand.
+ SERET_FLAG, // CMSE Entry function return with a flag operand.
+ INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand.
- PIC_ADD, // Add with a PC operand and a PIC label.
+ PIC_ADD, // Add with a PC operand and a PIC label.
- ASRL, // MVE long arithmetic shift right.
- LSRL, // MVE long shift right.
- LSLL, // MVE long shift left.
+ ASRL, // MVE long arithmetic shift right.
+ LSRL, // MVE long shift right.
+ LSLL, // MVE long shift left.
- CMP, // ARM compare instructions.
- CMN, // ARM CMN instructions.
- CMPZ, // ARM compare that sets only Z flag.
- CMPFP, // ARM VFP compare instruction, sets FPSCR.
- CMPFPE, // ARM VFP signalling compare instruction, sets FPSCR.
- CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR.
- CMPFPEw0, // ARM VFP signalling compare against zero instruction, sets FPSCR.
- FMSTAT, // ARM fmstat instruction.
+ CMP, // ARM compare instructions.
+ CMN, // ARM CMN instructions.
+ CMPZ, // ARM compare that sets only Z flag.
+ CMPFP, // ARM VFP compare instruction, sets FPSCR.
+ CMPFPE, // ARM VFP signalling compare instruction, sets FPSCR.
+ CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR.
+ CMPFPEw0, // ARM VFP signalling compare against zero instruction, sets
+ // FPSCR.
+ FMSTAT, // ARM fmstat instruction.
- CMOV, // ARM conditional move instructions.
- SUBS, // Flag-setting subtraction.
+ CMOV, // ARM conditional move instructions.
+ SUBS, // Flag-setting subtraction.
- SSAT, // Signed saturation
- USAT, // Unsigned saturation
+ SSAT, // Signed saturation
+ USAT, // Unsigned saturation
- BCC_i64,
+ BCC_i64,
- SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out.
- SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out.
- RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag.
+ SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out.
+ SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out.
+ RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag.
- ADDC, // Add with carry
- ADDE, // Add using carry
- SUBC, // Sub with carry
- SUBE, // Sub using carry
- LSLS, // Shift left producing carry
+ ADDC, // Add with carry
+ ADDE, // Add using carry
+ SUBC, // Sub with carry
+ SUBE, // Sub using carry
+ LSLS, // Shift left producing carry
- VMOVRRD, // double to two gprs.
- VMOVDRR, // Two gprs to double.
- VMOVSR, // move gpr to single, used for f32 literal constructed in a gpr
+ VMOVRRD, // double to two gprs.
+ VMOVDRR, // Two gprs to double.
+ VMOVSR, // move gpr to single, used for f32 literal constructed in a gpr
- EH_SJLJ_SETJMP, // SjLj exception handling setjmp.
- EH_SJLJ_LONGJMP, // SjLj exception handling longjmp.
- EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch.
+ EH_SJLJ_SETJMP, // SjLj exception handling setjmp.
+ EH_SJLJ_LONGJMP, // SjLj exception handling longjmp.
+ EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch.
- TC_RETURN, // Tail call return pseudo.
+ TC_RETURN, // Tail call return pseudo.
- THREAD_POINTER,
+ THREAD_POINTER,
- DYN_ALLOC, // Dynamic allocation on the stack.
+ DYN_ALLOC, // Dynamic allocation on the stack.
- MEMBARRIER_MCR, // Memory barrier (MCR)
+ MEMBARRIER_MCR, // Memory barrier (MCR)
- PRELOAD, // Preload
+ PRELOAD, // Preload
- WIN__CHKSTK, // Windows' __chkstk call to do stack probing.
- WIN__DBZCHK, // Windows' divide by zero check
+ WIN__CHKSTK, // Windows' __chkstk call to do stack probing.
+ WIN__DBZCHK, // Windows' divide by zero check
- WLS, // Low-overhead loops, While Loop Start
- LOOP_DEC, // Really a part of LE, performs the sub
- LE, // Low-overhead loops, Loop End
+ WLS, // Low-overhead loops, While Loop Start branch. See t2WhileLoopStart
+ WLSSETUP, // Setup for the iteration count of a WLS. See t2WhileLoopSetup.
+ LOOP_DEC, // Really a part of LE, performs the sub
+ LE, // Low-overhead loops, Loop End
- PREDICATE_CAST, // Predicate cast for MVE i1 types
- VECTOR_REG_CAST, // Reinterpret the current contents of a vector register
+ PREDICATE_CAST, // Predicate cast for MVE i1 types
+ VECTOR_REG_CAST, // Reinterpret the current contents of a vector register
- VCMP, // Vector compare.
- VCMPZ, // Vector compare to zero.
- VTST, // Vector test bits.
+ MVESEXT, // Legalization aids for extending a vector into two/four vectors.
+ MVEZEXT, // or truncating two/four vectors into one. Eventually becomes
+ MVETRUNC, // stack store/load sequence, if not optimized to anything else.
- // Vector shift by vector
- VSHLs, // ...left/right by signed
- VSHLu, // ...left/right by unsigned
+ VCMP, // Vector compare.
+ VCMPZ, // Vector compare to zero.
+ VTST, // Vector test bits.
- // Vector shift by immediate:
- VSHLIMM, // ...left
- VSHRsIMM, // ...right (signed)
- VSHRuIMM, // ...right (unsigned)
+ // Vector shift by vector
+ VSHLs, // ...left/right by signed
+ VSHLu, // ...left/right by unsigned
- // Vector rounding shift by immediate:
- VRSHRsIMM, // ...right (signed)
- VRSHRuIMM, // ...right (unsigned)
- VRSHRNIMM, // ...right narrow
+ // Vector shift by immediate:
+ VSHLIMM, // ...left
+ VSHRsIMM, // ...right (signed)
+ VSHRuIMM, // ...right (unsigned)
- // Vector saturating shift by immediate:
- VQSHLsIMM, // ...left (signed)
- VQSHLuIMM, // ...left (unsigned)
- VQSHLsuIMM, // ...left (signed to unsigned)
- VQSHRNsIMM, // ...right narrow (signed)
- VQSHRNuIMM, // ...right narrow (unsigned)
- VQSHRNsuIMM, // ...right narrow (signed to unsigned)
+ // Vector rounding shift by immediate:
+ VRSHRsIMM, // ...right (signed)
+ VRSHRuIMM, // ...right (unsigned)
+ VRSHRNIMM, // ...right narrow
- // Vector saturating rounding shift by immediate:
- VQRSHRNsIMM, // ...right narrow (signed)
- VQRSHRNuIMM, // ...right narrow (unsigned)
- VQRSHRNsuIMM, // ...right narrow (signed to unsigned)
+ // Vector saturating shift by immediate:
+ VQSHLsIMM, // ...left (signed)
+ VQSHLuIMM, // ...left (unsigned)
+ VQSHLsuIMM, // ...left (signed to unsigned)
+ VQSHRNsIMM, // ...right narrow (signed)
+ VQSHRNuIMM, // ...right narrow (unsigned)
+ VQSHRNsuIMM, // ...right narrow (signed to unsigned)
- // Vector shift and insert:
- VSLIIMM, // ...left
- VSRIIMM, // ...right
+ // Vector saturating rounding shift by immediate:
+ VQRSHRNsIMM, // ...right narrow (signed)
+ VQRSHRNuIMM, // ...right narrow (unsigned)
+ VQRSHRNsuIMM, // ...right narrow (signed to unsigned)
- // Vector get lane (VMOV scalar to ARM core register)
- // (These are used for 8- and 16-bit element types only.)
- VGETLANEu, // zero-extend vector extract element
- VGETLANEs, // sign-extend vector extract element
+ // Vector shift and insert:
+ VSLIIMM, // ...left
+ VSRIIMM, // ...right
- // Vector move immediate and move negated immediate:
- VMOVIMM,
- VMVNIMM,
+ // Vector get lane (VMOV scalar to ARM core register)
+ // (These are used for 8- and 16-bit element types only.)
+ VGETLANEu, // zero-extend vector extract element
+ VGETLANEs, // sign-extend vector extract element
- // Vector move f32 immediate:
- VMOVFPIMM,
+ // Vector move immediate and move negated immediate:
+ VMOVIMM,
+ VMVNIMM,
- // Move H <-> R, clearing top 16 bits
- VMOVrh,
- VMOVhr,
+ // Vector move f32 immediate:
+ VMOVFPIMM,
- // Vector duplicate:
- VDUP,
- VDUPLANE,
+ // Move H <-> R, clearing top 16 bits
+ VMOVrh,
+ VMOVhr,
- // Vector shuffles:
- VEXT, // extract
- VREV64, // reverse elements within 64-bit doublewords
- VREV32, // reverse elements within 32-bit words
- VREV16, // reverse elements within 16-bit halfwords
- VZIP, // zip (interleave)
- VUZP, // unzip (deinterleave)
- VTRN, // transpose
- VTBL1, // 1-register shuffle with mask
- VTBL2, // 2-register shuffle with mask
- VMOVN, // MVE vmovn
+ // Vector duplicate:
+ VDUP,
+ VDUPLANE,
- // MVE Saturating truncates
- VQMOVNs, // Vector (V) Saturating (Q) Move and Narrow (N), signed (s)
- VQMOVNu, // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u)
+ // Vector shuffles:
+ VEXT, // extract
+ VREV64, // reverse elements within 64-bit doublewords
+ VREV32, // reverse elements within 32-bit words
+ VREV16, // reverse elements within 16-bit halfwords
+ VZIP, // zip (interleave)
+ VUZP, // unzip (deinterleave)
+ VTRN, // transpose
+ VTBL1, // 1-register shuffle with mask
+ VTBL2, // 2-register shuffle with mask
+ VMOVN, // MVE vmovn
- // MVE float <> half converts
- VCVTN, // MVE vcvt f32 -> f16, truncating into either the bottom or top lanes
- VCVTL, // MVE vcvt f16 -> f32, extending from either the bottom or top lanes
+ // MVE Saturating truncates
+ VQMOVNs, // Vector (V) Saturating (Q) Move and Narrow (N), signed (s)
+ VQMOVNu, // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u)
- // Vector multiply long:
- VMULLs, // ...signed
- VMULLu, // ...unsigned
+ // MVE float <> half converts
+ VCVTN, // MVE vcvt f32 -> f16, truncating into either the bottom or top
+ // lanes
+ VCVTL, // MVE vcvt f16 -> f32, extending from either the bottom or top lanes
- VQDMULH, // MVE vqdmulh instruction
+ // MVE VIDUP instruction, taking a start value and increment.
+ VIDUP,
- // MVE reductions
- VADDVs, // sign- or zero-extend the elements of a vector to i32,
- VADDVu, // add them all together, and return an i32 of their sum
- VADDVps, // Same as VADDV[su] but with a v4i1 predicate mask
- VADDVpu,
- VADDLVs, // sign- or zero-extend elements to i64 and sum, returning
- VADDLVu, // the low and high 32-bit halves of the sum
- VADDLVAs, // Same as VADDLV[su] but also add an input accumulator
- VADDLVAu, // provided as low and high halves
- VADDLVps, // Same as VADDLV[su] but with a v4i1 predicate mask
- VADDLVpu,
- VADDLVAps, // Same as VADDLVp[su] but with a v4i1 predicate mask
- VADDLVApu,
- VMLAVs, // sign- or zero-extend the elements of two vectors to i32, multiply them
- VMLAVu, // and add the results together, returning an i32 of their sum
- VMLAVps, // Same as VMLAV[su] with a v4i1 predicate mask
- VMLAVpu,
- VMLALVs, // Same as VMLAV but with i64, returning the low and
- VMLALVu, // high 32-bit halves of the sum
- VMLALVps, // Same as VMLALV[su] with a v4i1 predicate mask
- VMLALVpu,
- VMLALVAs, // Same as VMLALV but also add an input accumulator
- VMLALVAu, // provided as low and high halves
- VMLALVAps, // Same as VMLALVA[su] with a v4i1 predicate mask
- VMLALVApu,
- VMINVu, // Find minimum unsigned value of a vector and register
- VMINVs, // Find minimum signed value of a vector and register
- VMAXVu, // Find maximum unsigned value of a vector and register
- VMAXVs, // Find maximum signed value of a vector and register
+ // Vector multiply long:
+ VMULLs, // ...signed
+ VMULLu, // ...unsigned
- SMULWB, // Signed multiply word by half word, bottom
- SMULWT, // Signed multiply word by half word, top
- UMLAL, // 64bit Unsigned Accumulate Multiply
- SMLAL, // 64bit Signed Accumulate Multiply
- UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply
- SMLALBB, // 64-bit signed accumulate multiply bottom, bottom 16
- SMLALBT, // 64-bit signed accumulate multiply bottom, top 16
- SMLALTB, // 64-bit signed accumulate multiply top, bottom 16
- SMLALTT, // 64-bit signed accumulate multiply top, top 16
- SMLALD, // Signed multiply accumulate long dual
- SMLALDX, // Signed multiply accumulate long dual exchange
- SMLSLD, // Signed multiply subtract long dual
- SMLSLDX, // Signed multiply subtract long dual exchange
- SMMLAR, // Signed multiply long, round and add
- SMMLSR, // Signed multiply long, subtract and round
+ VQDMULH, // MVE vqdmulh instruction
- // Single Lane QADD8 and QADD16. Only the bottom lane. That's what the b stands for.
- QADD8b,
- QSUB8b,
- QADD16b,
- QSUB16b,
+ // MVE reductions
+ VADDVs, // sign- or zero-extend the elements of a vector to i32,
+ VADDVu, // add them all together, and return an i32 of their sum
+ VADDVps, // Same as VADDV[su] but with a v4i1 predicate mask
+ VADDVpu,
+ VADDLVs, // sign- or zero-extend elements to i64 and sum, returning
+ VADDLVu, // the low and high 32-bit halves of the sum
+ VADDLVAs, // Same as VADDLV[su] but also add an input accumulator
+ VADDLVAu, // provided as low and high halves
+ VADDLVps, // Same as VADDLV[su] but with a v4i1 predicate mask
+ VADDLVpu,
+ VADDLVAps, // Same as VADDLVp[su] but with a v4i1 predicate mask
+ VADDLVApu,
+ VMLAVs, // sign- or zero-extend the elements of two vectors to i32, multiply
+ // them
+ VMLAVu, // and add the results together, returning an i32 of their sum
+ VMLAVps, // Same as VMLAV[su] with a v4i1 predicate mask
+ VMLAVpu,
+ VMLALVs, // Same as VMLAV but with i64, returning the low and
+ VMLALVu, // high 32-bit halves of the sum
+ VMLALVps, // Same as VMLALV[su] with a v4i1 predicate mask
+ VMLALVpu,
+ VMLALVAs, // Same as VMLALV but also add an input accumulator
+ VMLALVAu, // provided as low and high halves
+ VMLALVAps, // Same as VMLALVA[su] with a v4i1 predicate mask
+ VMLALVApu,
+ VMINVu, // Find minimum unsigned value of a vector and register
+ VMINVs, // Find minimum signed value of a vector and register
+ VMAXVu, // Find maximum unsigned value of a vector and register
+ VMAXVs, // Find maximum signed value of a vector and register
- // Operands of the standard BUILD_VECTOR node are not legalized, which
- // is fine if BUILD_VECTORs are always lowered to shuffles or other
- // operations, but for ARM some BUILD_VECTORs are legal as-is and their
- // operands need to be legalized. Define an ARM-specific version of
- // BUILD_VECTOR for this purpose.
- BUILD_VECTOR,
+ SMULWB, // Signed multiply word by half word, bottom
+ SMULWT, // Signed multiply word by half word, top
+ UMLAL, // 64bit Unsigned Accumulate Multiply
+ SMLAL, // 64bit Signed Accumulate Multiply
+ UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply
+ SMLALBB, // 64-bit signed accumulate multiply bottom, bottom 16
+ SMLALBT, // 64-bit signed accumulate multiply bottom, top 16
+ SMLALTB, // 64-bit signed accumulate multiply top, bottom 16
+ SMLALTT, // 64-bit signed accumulate multiply top, top 16
+ SMLALD, // Signed multiply accumulate long dual
+ SMLALDX, // Signed multiply accumulate long dual exchange
+ SMLSLD, // Signed multiply subtract long dual
+ SMLSLDX, // Signed multiply subtract long dual exchange
+ SMMLAR, // Signed multiply long, round and add
+ SMMLSR, // Signed multiply long, subtract and round
- // Bit-field insert
- BFI,
+ // Single Lane QADD8 and QADD16. Only the bottom lane. That's what the b
+ // stands for.
+ QADD8b,
+ QSUB8b,
+ QADD16b,
+ QSUB16b,
+ UQADD8b,
+ UQSUB8b,
+ UQADD16b,
+ UQSUB16b,
- // Vector OR with immediate
- VORRIMM,
- // Vector AND with NOT of immediate
- VBICIMM,
+ // Operands of the standard BUILD_VECTOR node are not legalized, which
+ // is fine if BUILD_VECTORs are always lowered to shuffles or other
+ // operations, but for ARM some BUILD_VECTORs are legal as-is and their
+ // operands need to be legalized. Define an ARM-specific version of
+ // BUILD_VECTOR for this purpose.
+ BUILD_VECTOR,
- // Pseudo vector bitwise select
- VBSP,
+ // Bit-field insert
+ BFI,
- // Pseudo-instruction representing a memory copy using ldm/stm
- // instructions.
- MEMCPY,
+ // Vector OR with immediate
+ VORRIMM,
+ // Vector AND with NOT of immediate
+ VBICIMM,
- // V8.1MMainline condition select
- CSINV, // Conditional select invert.
- CSNEG, // Conditional select negate.
- CSINC, // Conditional select increment.
+ // Pseudo vector bitwise select
+ VBSP,
- // Vector load N-element structure to all lanes:
- VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
- VLD2DUP,
- VLD3DUP,
- VLD4DUP,
+ // Pseudo-instruction representing a memory copy using ldm/stm
+ // instructions.
+ MEMCPY,
- // NEON loads with post-increment base updates:
- VLD1_UPD,
- VLD2_UPD,
- VLD3_UPD,
- VLD4_UPD,
- VLD2LN_UPD,
- VLD3LN_UPD,
- VLD4LN_UPD,
- VLD1DUP_UPD,
- VLD2DUP_UPD,
- VLD3DUP_UPD,
- VLD4DUP_UPD,
+ // Pseudo-instruction representing a memory copy using a tail predicated
+ // loop
+ MEMCPYLOOP,
+ // Pseudo-instruction representing a memset using a tail predicated
+ // loop
+ MEMSETLOOP,
- // NEON stores with post-increment base updates:
- VST1_UPD,
- VST2_UPD,
- VST3_UPD,
- VST4_UPD,
- VST2LN_UPD,
- VST3LN_UPD,
- VST4LN_UPD,
+ // V8.1MMainline condition select
+ CSINV, // Conditional select invert.
+ CSNEG, // Conditional select negate.
+ CSINC, // Conditional select increment.
- // Load/Store of dual registers
- LDRD,
- STRD
- };
+ // Vector load N-element structure to all lanes:
+ VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ VLD2DUP,
+ VLD3DUP,
+ VLD4DUP,
+
+ // NEON loads with post-increment base updates:
+ VLD1_UPD,
+ VLD2_UPD,
+ VLD3_UPD,
+ VLD4_UPD,
+ VLD2LN_UPD,
+ VLD3LN_UPD,
+ VLD4LN_UPD,
+ VLD1DUP_UPD,
+ VLD2DUP_UPD,
+ VLD3DUP_UPD,
+ VLD4DUP_UPD,
+ VLD1x2_UPD,
+ VLD1x3_UPD,
+ VLD1x4_UPD,
+
+ // NEON stores with post-increment base updates:
+ VST1_UPD,
+ VST2_UPD,
+ VST3_UPD,
+ VST4_UPD,
+ VST2LN_UPD,
+ VST3LN_UPD,
+ VST4LN_UPD,
+ VST1x2_UPD,
+ VST1x3_UPD,
+ VST1x4_UPD,
+
+ // Load/Store of dual registers
+ LDRD,
+ STRD
+ };
} // end namespace ARMISD
+ namespace ARM {
+ /// Possible values of current rounding mode, which is specified in bits
+ /// 23:22 of FPSCR.
+ enum Rounding {
+ RN = 0, // Round to Nearest
+ RP = 1, // Round towards Plus infinity
+ RM = 2, // Round towards Minus infinity
+ RZ = 3, // Round towards Zero
+ rmMask = 3 // Bit mask selecting rounding mode
+ };
+
+ // Bit position of rounding mode bits in FPSCR.
+ const unsigned RoundingBitsPos = 22;
+ } // namespace ARM
+
/// Define some predicates that are used for node matching.
namespace ARM {
@@ -381,6 +425,8 @@
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
bool SimplifyDemandedBitsForTargetNode(SDValue Op,
@@ -396,7 +442,7 @@
/// unaligned memory accesses of the specified type. Returns whether it
/// is "fast" by reference in the second argument.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
- unsigned Align,
+ Align Alignment,
MachineMemOperand::Flags Flags,
bool *Fast) const override;
@@ -427,8 +473,9 @@
/// addressing mode represented by AM.
/// If the AM is supported, the return value must be >= 0.
/// If the AM is not supported, the return value must be negative.
- int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
- unsigned AS) const override;
+ InstructionCost getScalingFactorCost(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
@@ -498,8 +545,6 @@
getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "Q")
return InlineAsm::Constraint_Q;
- else if (ConstraintCode == "o")
- return InlineAsm::Constraint_o;
else if (ConstraintCode.size() == 2) {
if (ConstraintCode[0] == 'U') {
switch(ConstraintCode[1]) {
@@ -544,6 +589,8 @@
Sched::Preference getSchedulingPreference(SDNode *N) const override;
+ bool preferZeroCompareBranch() const override { return true; }
+
bool
isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
@@ -578,7 +625,8 @@
/// Returns true if an argument of type Ty needs to be passed in a
/// contiguous block of registers in calling convention CallConv.
bool functionArgumentNeedsConsecutiveRegisters(
- Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg,
+ const DataLayout &DL) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
@@ -590,17 +638,18 @@
Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
- Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const;
- Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+ Instruction *makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const;
+ Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr,
AtomicOrdering Ord) const override;
- Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
- Value *Addr, AtomicOrdering Ord) const override;
+ Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr,
+ AtomicOrdering Ord) const override;
- void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
+ void
+ emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
- Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
unsigned getMaxSupportedInterleaveFactor() const override;
@@ -660,6 +709,7 @@
/// function checks the vector element type and the overall width of the
/// vector.
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy,
+ Align Alignment,
const DataLayout &DL) const;
bool alignLoopsWithOptSize() const override;
@@ -673,7 +723,7 @@
/// Return the correct alignment for the current calling convention.
Align getABIAlignmentForCallingConv(Type *ArgTy,
- DataLayout DL) const override;
+ const DataLayout &DL) const override;
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
@@ -706,7 +756,7 @@
bool HasStandaloneRem = true;
- void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
+ void addTypeForNEON(MVT VT, MVT PromotedLdStVT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const;
@@ -718,7 +768,8 @@
CCValAssign &VA, CCValAssign &NextVA,
SDValue &StackPtr,
SmallVectorImpl<SDValue> &MemOpChains,
- ISD::ArgFlagsTy Flags) const;
+ bool IsTailCall,
+ int SPDiff) const;
SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
const SDLoc &dl) const;
@@ -727,10 +778,10 @@
bool isVarArg) const;
CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
bool isVarArg) const;
- SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
- const SDLoc &dl, SelectionDAG &DAG,
- const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const;
+ std::pair<SDValue, MachinePointerInfo>
+ computeAddrForCallArg(const SDLoc &dl, SelectionDAG &DAG,
+ const CCValAssign &VA, SDValue StackPtr,
+ bool IsTailCall, int SPDiff) const;
SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
@@ -766,6 +817,7 @@
SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
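The ARM::Rounding enum and RoundingBitsPos constant introduced in the header hunk above describe how the rounding mode sits in FPSCR bits 23:22. Below is a minimal standalone sketch of decoding that field; the decodeFPSCRRounding helper is an illustration only and does not exist in the patch.

#include <cstdint>

namespace ARM {
enum Rounding {
  RN = 0,    // Round to Nearest
  RP = 1,    // Round towards Plus infinity
  RM = 2,    // Round towards Minus infinity
  RZ = 3,    // Round towards Zero
  rmMask = 3 // Bit mask selecting rounding mode
};
const unsigned RoundingBitsPos = 22;
} // namespace ARM

// Hypothetical helper: pull the rounding mode out of a raw FPSCR value by
// shifting bits 23:22 down to the bottom and masking.
inline ARM::Rounding decodeFPSCRRounding(uint32_t FPSCR) {
  return static_cast<ARM::Rounding>((FPSCR >> ARM::RoundingBitsPos) &
                                    ARM::rmMask);
}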
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp
index 2790ac2..3c6c696 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -32,7 +32,8 @@
: ARMBaseInstrInfo(STI), RI() {}
/// Return the noop instruction to use for a noop.
-void ARMInstrInfo::getNoop(MCInst &NopInst) const {
+MCInst ARMInstrInfo::getNop() const {
+ MCInst NopInst;
if (hasNOP()) {
NopInst.setOpcode(ARM::HINT);
NopInst.addOperand(MCOperand::createImm(0));
@@ -46,6 +47,7 @@
NopInst.addOperand(MCOperand::createReg(0));
NopInst.addOperand(MCOperand::createReg(0));
}
+ return NopInst;
}
unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.h b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.h
index 042b53f..178d7a2 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.h
@@ -25,7 +25,7 @@
explicit ARMInstrInfo(const ARMSubtarget &STI);
/// Return the noop instruction to use for a noop.
- void getNoop(MCInst &NopInst) const override;
+ MCInst getNop() const override;
// Return the non-pre/post incrementing version of 'Opc'. Return 0
// if there is not such an opcode.
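The getNoop(MCInst&) hook is replaced by a value-returning getNop() in the ARMInstrInfo changes above. A hedged sketch of the caller-side difference; emitNopExample and its parameters are assumptions used only to frame the call.

#include "ARMInstrInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"

void emitNopExample(const llvm::ARMInstrInfo &TII, llvm::MCStreamer &OS,
                    const llvm::MCSubtargetInfo &STI) {
  // Before this patch a caller filled in the instruction by reference:
  //   MCInst Nop; TII.getNoop(Nop);
  // With this patch the nop is built and returned by value instead:
  llvm::MCInst Nop = TII.getNop();
  OS.emitInstruction(Nop, STI);
}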
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td
index 8dcb319..7466cec 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -66,7 +66,7 @@
def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>,
SDTCisInt<1>]>;
-def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDT_ARMTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
@@ -240,6 +240,11 @@
def ARMqadd16b : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>;
def ARMqsub16b : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>;
+def ARMuqadd8b : SDNode<"ARMISD::UQADD8b", SDT_ARMAnd, []>;
+def ARMuqsub8b : SDNode<"ARMISD::UQSUB8b", SDT_ARMAnd, []>;
+def ARMuqadd16b : SDNode<"ARMISD::UQADD16b", SDT_ARMAnd, []>;
+def ARMuqsub16b : SDNode<"ARMISD::UQSUB16b", SDT_ARMAnd, []>;
+
def SDT_ARMldrd : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def ARMldrd : SDNode<"ARMISD::LDRD", SDT_ARMldrd, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -256,6 +261,10 @@
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisVT<2, i32>]>>;
+def SDTARMVIDUP : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def ARMvidup : SDNode<"ARMISD::VIDUP", SDTARMVIDUP>;
+
def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
def ARMvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
@@ -2619,15 +2628,16 @@
let Inst{7-4} = 0b0010;
let Inst{3-0} = func;
let isBranch = 1;
+ let isIndirectBranch = 1;
}
// Tail calls.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
- def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>,
+ def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, i32imm:$SPDiff), IIC_Br, []>,
Sched<[WriteBr]>;
- def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>,
+ def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, i32imm:$SPDiff), IIC_Br, []>,
Sched<[WriteBr]>;
def TAILJMPd : ARMPseudoExpand<(outs), (ins arm_br_target:$dst),
@@ -3940,6 +3950,7 @@
(QDADD rGPR:$Rm, rGPR:$Rn)>;
def : ARMV5TEPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
(QDSUB rGPR:$Rm, rGPR:$Rn)>;
+
def : ARMV6Pat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn),
(QADD8 rGPR:$Rm, rGPR:$Rn)>;
def : ARMV6Pat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn),
@@ -3958,6 +3969,16 @@
def UQASX : AAIIntrinsic<0b01100110, 0b11110011, "uqasx", int_arm_uqasx>;
def UQSAX : AAIIntrinsic<0b01100110, 0b11110101, "uqsax", int_arm_uqsax>;
+def : ARMV6Pat<(ARMuqadd8b rGPR:$Rm, rGPR:$Rn),
+ (UQADD8 rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMuqsub8b rGPR:$Rm, rGPR:$Rn),
+ (UQSUB8 rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMuqadd16b rGPR:$Rm, rGPR:$Rn),
+ (UQADD16 rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMuqsub16b rGPR:$Rm, rGPR:$Rn),
+ (UQSUB16 rGPR:$Rm, rGPR:$Rn)>;
+
+
// Signed/Unsigned add/subtract
def SASX : AAIIntrinsic<0b01100001, 0b11110011, "sasx", int_arm_sasx>;
@@ -5278,7 +5299,7 @@
class acquiring_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
- AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getSuccessOrdering();
return isAcquireOrStronger(Ordering);
}]>;
@@ -5288,7 +5309,7 @@
class releasing_store<PatFrag base>
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
- AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getSuccessOrdering();
return isReleaseOrStronger(Ordering);
}]>;
@@ -5998,9 +6019,12 @@
// TODO: add,sub,and, 3-instr forms?
// Tail calls. These patterns also apply to Thumb mode.
-def : Pat<(ARMtcret tcGPR:$dst), (TCRETURNri tcGPR:$dst)>;
-def : Pat<(ARMtcret (i32 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
-def : Pat<(ARMtcret (i32 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
+def : Pat<(ARMtcret tcGPR:$dst, (i32 timm:$SPDiff)),
+ (TCRETURNri tcGPR:$dst, timm:$SPDiff)>;
+def : Pat<(ARMtcret (i32 tglobaladdr:$dst), (i32 timm:$SPDiff)),
+ (TCRETURNdi texternalsym:$dst, (i32 timm:$SPDiff))>;
+def : Pat<(ARMtcret (i32 texternalsym:$dst), (i32 timm:$SPDiff)),
+ (TCRETURNdi texternalsym:$dst, i32imm:$SPDiff)>;
// Direct calls
def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>;
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td
index 0dfea68..3728938 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -318,7 +318,7 @@
def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b11, "p", 0b0>;
def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b11, "p", 0b1>;
-multiclass MVE_TwoOpPattern<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredInt,
+multiclass MVE_TwoOpPattern<MVEVectorVTInfo VTI, SDPatternOperator Op, Intrinsic PredInt,
dag PredOperands, Instruction Inst,
SDPatternOperator IdentityVec = null_frag> {
// Unpredicated
@@ -354,7 +354,7 @@
(VTI.Vec MQPR:$inactive)))>;
}
-multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredInt,
+multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, SDPatternOperator Op, Intrinsic PredInt,
dag PredOperands, Instruction Inst,
SDPatternOperator IdentityVec = null_frag> {
// Unpredicated
@@ -1825,14 +1825,22 @@
}
def MVE_VMOV_from_lane_32 : MVE_VMOV_lane_32< MVE_VMOV_from_lane>;
-def MVE_VMOV_to_lane_32 : MVE_VMOV_lane_32< MVE_VMOV_to_lane>;
def MVE_VMOV_from_lane_s16 : MVE_VMOV_lane_16<"s16", 0b0, MVE_VMOV_from_lane>;
def MVE_VMOV_from_lane_u16 : MVE_VMOV_lane_16<"u16", 0b1, MVE_VMOV_from_lane>;
-def MVE_VMOV_to_lane_16 : MVE_VMOV_lane_16< "16", 0b0, MVE_VMOV_to_lane>;
def MVE_VMOV_from_lane_s8 : MVE_VMOV_lane_8 < "s8", 0b0, MVE_VMOV_from_lane>;
def MVE_VMOV_from_lane_u8 : MVE_VMOV_lane_8 < "u8", 0b1, MVE_VMOV_from_lane>;
+let isInsertSubreg = 1 in
+def MVE_VMOV_to_lane_32 : MVE_VMOV_lane_32< MVE_VMOV_to_lane>;
+def MVE_VMOV_to_lane_16 : MVE_VMOV_lane_16< "16", 0b0, MVE_VMOV_to_lane>;
def MVE_VMOV_to_lane_8 : MVE_VMOV_lane_8 < "8", 0b0, MVE_VMOV_to_lane>;
+// This is the same as insertelt but allows the inserted value to be an i32,
+// as will be the case when i32 is the only legal type.
+def ARMVecInsert : SDTypeProfile<1, 3, [
+ SDTCisVT<2, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<3>
+]>;
+def ARMinsertelt : SDNode<"ISD::INSERT_VECTOR_ELT", ARMVecInsert>;
+
let Predicates = [HasMVEInt] in {
def : Pat<(extractelt (v2f64 MQPR:$src), imm:$lane),
(f64 (EXTRACT_SUBREG MQPR:$src, (DSubReg_f64_reg imm:$lane)))>;
@@ -1844,6 +1852,14 @@
(i32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))), rGPR)>;
def : Pat<(insertelt (v4i32 MQPR:$src1), rGPR:$src2, imm:$lane),
(MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$src2, imm:$lane)>;
+ // This tries to copy from one lane to another, without going via GPR regs
+ def : Pat<(insertelt (v4i32 MQPR:$src1), (extractelt (v4i32 MQPR:$src2), imm:$extlane), imm:$inslane),
+ (v4i32 (COPY_TO_REGCLASS
+ (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4i32 MQPR:$src1), MQPR)),
+ (f32 (EXTRACT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4i32 MQPR:$src2), MQPR)),
+ (SSubReg_f32_reg imm:$extlane))),
+ (SSubReg_f32_reg imm:$inslane)),
+ MQPR))>;
def : Pat<(vector_insert (v16i8 MQPR:$src1), rGPR:$src2, imm:$lane),
(MVE_VMOV_to_lane_8 MQPR:$src1, rGPR:$src2, imm:$lane)>;
@@ -1862,6 +1878,14 @@
(MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
def : Pat<(ARMvgetlaneu (v8f16 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
+ // For i16 inserts whose value is extracted from an even (low) lane, we may use VINS.
+ def : Pat<(ARMinsertelt (v8i16 MQPR:$src1),
+ (ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$extlane),
+ imm_odd:$inslane),
+ (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
+ (VINSH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_odd:$inslane)),
+ (EXTRACT_SUBREG MQPR:$src2, (SSubReg_f16_reg imm_even:$extlane))),
+ (SSubReg_f16_reg imm_odd:$inslane)), MQPR)>;
def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
@@ -1876,8 +1900,13 @@
def : Pat<(insertelt (v4f32 MQPR:$src1), (f32 SPR:$src2), imm:$lane),
(INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), SPR:$src2, (SSubReg_f32_reg imm:$lane))>;
- def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm:$lane),
+ def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm_even:$lane),
(MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS (f16 HPR:$src2), rGPR), imm:$lane)>;
+ def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm_odd:$lane),
+ (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
+ (VINSH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_odd:$lane)),
+ (COPY_TO_REGCLASS HPR:$src2, SPR)),
+ (SSubReg_f16_reg imm_odd:$lane)), MQPR)>;
def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane),
(EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>;
def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane),
@@ -1885,6 +1914,8 @@
(VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))),
HPR)>;
+ def : Pat<(v2f64 (scalar_to_vector (f64 DPR:$src))),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, dsub_0)>;
def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
def : Pat<(v4f32 (scalar_to_vector GPR:$src)),
@@ -2100,36 +2131,31 @@
let validForTailPredication = 1;
}
-multiclass MVE_VABD_m<MVEVectorVTInfo VTI,
- Intrinsic unpred_int, Intrinsic pred_int> {
+multiclass MVE_VABD_m<MVEVectorVTInfo VTI, SDNode Op,
+ Intrinsic unpred_int, Intrinsic PredInt> {
def "" : MVE_VABD_int<VTI.Suffix, VTI.Unsigned, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEInt] in {
+ defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)),
+ !cast<Instruction>(NAME)>;
+
// Unpredicated absolute difference
def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-
- // Predicated absolute difference
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
}
}
-multiclass MVE_VABD<MVEVectorVTInfo VTI>
- : MVE_VABD_m<VTI, int_arm_mve_vabd, int_arm_mve_abd_predicated>;
+multiclass MVE_VABD<MVEVectorVTInfo VTI, SDNode Op>
+ : MVE_VABD_m<VTI, Op, int_arm_mve_vabd, int_arm_mve_abd_predicated>;
-defm MVE_VABDs8 : MVE_VABD<MVE_v16s8>;
-defm MVE_VABDs16 : MVE_VABD<MVE_v8s16>;
-defm MVE_VABDs32 : MVE_VABD<MVE_v4s32>;
-defm MVE_VABDu8 : MVE_VABD<MVE_v16u8>;
-defm MVE_VABDu16 : MVE_VABD<MVE_v8u16>;
-defm MVE_VABDu32 : MVE_VABD<MVE_v4u32>;
+defm MVE_VABDs8 : MVE_VABD<MVE_v16s8, abds>;
+defm MVE_VABDs16 : MVE_VABD<MVE_v8s16, abds>;
+defm MVE_VABDs32 : MVE_VABD<MVE_v4s32, abds>;
+defm MVE_VABDu8 : MVE_VABD<MVE_v16u8, abdu>;
+defm MVE_VABDu16 : MVE_VABD<MVE_v8u16, abdu>;
+defm MVE_VABDu32 : MVE_VABD<MVE_v4u32, abdu>;
class MVE_VRHADD_Base<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
: MVE_int<"vrhadd", suffix, size, pattern> {
@@ -2422,7 +2448,7 @@
}
multiclass MVE_VCLSCLZ_p<string opname, bit opcode, MVEVectorVTInfo VTI,
- SDNode unpred_op> {
+ SDPatternOperator unpred_op> {
def "": MVE_VCLSCLZ<"v"#opname, VTI.Suffix, VTI.Size, opcode>;
defvar Inst = !cast<Instruction>(NAME);
@@ -2466,7 +2492,7 @@
}
multiclass MVE_VABSNEG_int_m<string iname, bit negate, bit saturate,
- SDNode unpred_op, Intrinsic pred_int,
+ SDPatternOperator unpred_op, Intrinsic pred_int,
MVEVectorVTInfo VTI> {
def "" : MVE_VABSNEG_int<iname, VTI.Suffix, VTI.Size, negate, saturate>;
defvar Inst = !cast<Instruction>(NAME);
@@ -3531,7 +3557,7 @@
}
multiclass MVE_VRINT_m<MVEVectorVTInfo VTI, string suffix, bits<3> opcode,
- SDNode unpred_op> {
+ SDPatternOperator unpred_op> {
def "": MVE_VRINT<suffix, opcode, VTI.Suffix, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
defvar pred_int = !cast<Intrinsic>("int_arm_mve_vrint"#suffix#"_predicated");
@@ -4320,7 +4346,7 @@
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
}
-multiclass unpred_vcmpf_r<int fc> {
+multiclass unpred_vcmpf_r<PatLeaf fc> {
def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)),
(v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)),
@@ -4587,7 +4613,7 @@
}
multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
- SDNode unpred_op, Intrinsic pred_int,
+ SDPatternOperator unpred_op, Intrinsic pred_int,
bit Top, string cstr=""> {
def "" : MVE_VMULL<"vmull" # !if(Top, "t", "b"), VTI.Suffix, VTI.Unsigned,
VTI.Size, Top, cstr>;
@@ -4714,26 +4740,33 @@
let Inst{8} = 0b0;
let Inst{7} = Qn{3};
let Inst{0} = 0b1;
+ let validForTailPredication = 1;
}
multiclass MVE_VxMULH_m<string iname, MVEVectorVTInfo VTI, SDNode unpred_op,
- Intrinsic pred_int, bit round> {
+ Intrinsic PredInt, bit round> {
def "" : MVE_VxMULH<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, round>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEInt] in {
- // Unpredicated multiply returning high bits
+ if !eq(round, 0b0) then {
+ defvar mulh = !if(VTI.Unsigned, mulhu, mulhs);
+ defm : MVE_TwoOpPattern<VTI, mulh, PredInt, (? (i32 VTI.Unsigned)),
+ !cast<Instruction>(NAME)>;
+ } else {
+ // Predicated multiply returning high bits
+ def : Pat<(VTI.Vec (PredInt (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+
+ // Unpredicated intrinsic
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-
- // Predicated multiply returning high bits
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
}
}
@@ -5134,7 +5167,8 @@
// Patterns for vector-scalar instructions with integer operands
multiclass MVE_vec_scalar_int_pat_m<Instruction inst, MVEVectorVTInfo VTI,
- SDNode unpred_op, SDNode pred_op,
+ SDPatternOperator unpred_op,
+ SDPatternOperator pred_op,
bit unpred_has_sign = 0,
bit pred_has_sign = 0> {
defvar UnpredSign = !if(unpred_has_sign, (? (i32 VTI.Unsigned)), (?));
@@ -5673,11 +5707,12 @@
defm MVE_VQRDMLASH_qr : MVE_VQDMLAH_qr_types<"vqrdmlash", 0b0, 0b1>;
class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12,
- list<dag> pattern=[]>
+ ValueType VT, SDPatternOperator vxdup>
: MVE_p<(outs MQPR:$Qd, tGPREven:$Rn),
(ins tGPREven:$Rn_src, MVE_VIDUP_imm:$imm), NoItinerary,
iname, suffix, "$Qd, $Rn, $imm", vpred_r, "$Rn = $Rn_src",
- pattern> {
+ [(set (VT MQPR:$Qd), (i32 tGPREven:$Rn),
+ (vxdup (i32 tGPREven:$Rn_src), (i32 imm:$imm)))]> {
bits<4> Qd;
bits<4> Rn;
bits<2> imm;
@@ -5698,13 +5733,13 @@
let hasSideEffects = 0;
}
-def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>;
-def MVE_VIDUPu16 : MVE_VxDUP<"vidup", "u16", 0b01, 0b0>;
-def MVE_VIDUPu32 : MVE_VxDUP<"vidup", "u32", 0b10, 0b0>;
+def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0, v16i8, ARMvidup>;
+def MVE_VIDUPu16 : MVE_VxDUP<"vidup", "u16", 0b01, 0b0, v8i16, ARMvidup>;
+def MVE_VIDUPu32 : MVE_VxDUP<"vidup", "u32", 0b10, 0b0, v4i32, ARMvidup>;
-def MVE_VDDUPu8 : MVE_VxDUP<"vddup", "u8", 0b00, 0b1>;
-def MVE_VDDUPu16 : MVE_VxDUP<"vddup", "u16", 0b01, 0b1>;
-def MVE_VDDUPu32 : MVE_VxDUP<"vddup", "u32", 0b10, 0b1>;
+def MVE_VDDUPu8 : MVE_VxDUP<"vddup", "u8", 0b00, 0b1, v16i8, null_frag>;
+def MVE_VDDUPu16 : MVE_VxDUP<"vddup", "u16", 0b01, 0b1, v8i16, null_frag>;
+def MVE_VDDUPu32 : MVE_VxDUP<"vddup", "u32", 0b10, 0b1, v4i32, null_frag>;
class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12,
list<dag> pattern=[]>
@@ -5994,8 +6029,8 @@
def SDTARMVST4 : SDTypeProfile<1, 7, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVec<3>,
SDTCisSameAs<3, 4>, SDTCisSameAs<3, 5>,
SDTCisSameAs<3, 6>, SDTCisVT<7, i32>]>;
-def MVEVST2UPD : SDNode<"ARMISD::VST2_UPD", SDTARMVST2, [SDNPHasChain]>;
-def MVEVST4UPD : SDNode<"ARMISD::VST4_UPD", SDTARMVST4, [SDNPHasChain]>;
+def MVEVST2UPD : SDNode<"ARMISD::VST2_UPD", SDTARMVST2, [SDNPHasChain, SDNPMemOperand]>;
+def MVEVST4UPD : SDNode<"ARMISD::VST4_UPD", SDTARMVST4, [SDNPHasChain, SDNPMemOperand]>;
multiclass MVE_vst24_patterns<int lanesize, ValueType VT> {
foreach stage = [0,1] in
@@ -6199,11 +6234,7 @@
def _post : MVE_VLDRSTR_cs<
dir, memsz, 0, 1,
!con((outs rGPR:$wb), dir.Oops),
- // We need an !if here to select the base register class,
- // because it's legal to write back to SP in a load of this
- // type, but not in a store.
- !con(dir.Iops, (ins !if(dir.load, t2_addr_offset_none,
- t2_nosp_addr_offset_none):$Rn,
+ !con(dir.Iops, (ins t2_nosp_addr_offset_none:$Rn,
t2am_imm7_offset<memsz.shift>:$addr)),
asm, suffix, IndexModePost, "$Qd, $Rn$addr", "$Rn.base = $wb"> {
bits<4> Rn;
@@ -6836,6 +6867,30 @@
let isTerminator = 1;
}
+def SDT_MVEMEMCPYLOOPNODE
+ : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
+def MVE_MEMCPYLOOPNODE : SDNode<"ARMISD::MEMCPYLOOP", SDT_MVEMEMCPYLOOPNODE,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CPSR] in {
+ def MVE_MEMCPYLOOPINST : PseudoInst<(outs),
+ (ins rGPR:$dst, rGPR:$src, rGPR:$sz),
+ NoItinerary,
+ [(MVE_MEMCPYLOOPNODE rGPR:$dst, rGPR:$src, rGPR:$sz)]>;
+}
+
+def SDT_MVEMEMSETLOOPNODE
+ : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisVT<1, v16i8>, SDTCisVT<2, i32>]>;
+def MVE_MEMSETLOOPNODE : SDNode<"ARMISD::MEMSETLOOP", SDT_MVEMEMSETLOOPNODE,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CPSR] in {
+ def MVE_MEMSETLOOPINST : PseudoInst<(outs),
+ (ins rGPR:$dst, MQPR:$src, rGPR:$sz),
+ NoItinerary,
+ [(MVE_MEMSETLOOPNODE rGPR:$dst, MQPR:$src, rGPR:$sz)]>;
+}
+
def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>;
def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>;
def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>;
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td
index a8c0d05..3ca6704 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -705,21 +705,31 @@
defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>;
defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>;
-def VLD1d8TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
-def VLD1d16TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
-def VLD1d32TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
-def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
-def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d8TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d8TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d8TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d16TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d16TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d16TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d32TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d32TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d32TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
-def VLD1q8HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
-def VLD1q8LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
-def VLD1q16HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
-def VLD1q16LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
-def VLD1q32HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
-def VLD1q32LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
-def VLD1q64HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
-def VLD1q64LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q8HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q8HighTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q8LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q16HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q16HighTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q16LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q32HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q32HighTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q32LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q64HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q64HighTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q64LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
// ...with 4 registers
class VLD1D4<bits<4> op7_4, string Dt, Operand AddrMode>
@@ -758,21 +768,31 @@
defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
-def VLD1d8QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
-def VLD1d16QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
-def VLD1d32QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
-def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
-def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d8QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d8QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d8QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d16QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d16QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d16QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d32QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d32QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d32QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
-def VLD1q8LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
-def VLD1q8HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
-def VLD1q16LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
-def VLD1q16HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
-def VLD1q32LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
-def VLD1q32HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
-def VLD1q64LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
-def VLD1q64HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q8LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q8HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q8HighQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q16LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q16HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q16HighQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q32LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q32HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q32HighQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q64LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q64HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q64HighQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
// VLD2 : Vector Load (multiple 2-element structures)
class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
@@ -1514,6 +1534,13 @@
defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
addrmode6dupalign64>;
+def VLD2DUPq8OddPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+
// VLD3DUP : Vector Load (single 3-element structure to all lanes)
class VLD3DUP<bits<4> op7_4, string Dt>
: NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
@@ -1567,6 +1594,10 @@
def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
+def VLD3DUPq8OddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
+def VLD3DUPq16OddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
+def VLD3DUPq32OddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
+
// VLD4DUP : Vector Load (single 4-element structure to all lanes)
class VLD4DUP<bits<4> op7_4, string Dt>
: NLdSt<1, 0b10, 0b1111, op7_4,
@@ -1621,6 +1652,10 @@
def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
+def VLD4DUPq8OddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
+def VLD4DUPq16OddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
+def VLD4DUPq32OddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
+
} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
@@ -1776,19 +1811,31 @@
defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>;
def VST1d8TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1d8TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
+def VST1d8TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
def VST1d16TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1d16TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
+def VST1d16TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
def VST1d32TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1d32TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
+def VST1d32TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
def VST1q8HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
-def VST1q8LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
def VST1q16HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
-def VST1q16LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
def VST1q32HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
-def VST1q32LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
def VST1q64HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+
+def VST1q8HighTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q16HighTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q32HighTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q64HighTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+
+def VST1q8LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q16LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q32LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
def VST1q64LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
// ...with 4 registers
@@ -1831,19 +1878,31 @@
defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
def VST1d8QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1d8QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
+def VST1d8QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
def VST1d16QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1d16QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
+def VST1d16QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
def VST1d32QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1d32QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
+def VST1d32QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
def VST1q8HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
-def VST1q8LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
def VST1q16HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
-def VST1q16LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
def VST1q32HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
-def VST1q32LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
def VST1q64HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+
+def VST1q8HighQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q16HighQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q32HighQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q64HighQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+
+def VST1q8LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q16LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q32LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
def VST1q64LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
// VST2 : Vector Store (multiple 2-element structures)
@@ -3018,7 +3077,7 @@
// Long 3-register operations with explicitly extended operands.
class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
+ ValueType TyQ, ValueType TyD, SDNode OpNode, SDPatternOperator ExtOp,
bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
@@ -3085,7 +3144,7 @@
// Wide 3-register operations.
class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD,
- SDNode OpNode, SDNode ExtOp, bit Commutable>
+ SDNode OpNode, SDPatternOperator ExtOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs QPR:$Vd), (ins QPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VSUBiD,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
@@ -3649,7 +3708,7 @@
multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin16, InstrItinClass itin32,
string OpcodeStr, string Dt,
- SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+ SDNode OpNode, SDPatternOperator ExtOp, bit Commutable = 0> {
def v8i16 : N3VLExt<op24, op23, 0b00, op11_8, op4, itin16,
OpcodeStr, !strconcat(Dt, "8"),
v8i16, v8i8, OpNode, ExtOp, Commutable>;
@@ -3717,7 +3776,7 @@
// source operand element sizes of 8, 16 and 32 bits:
multiclass N3VW_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, string Dt,
- SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+ SDNode OpNode, SDPatternOperator ExtOp, bit Commutable = 0> {
def v8i16 : N3VW<op24, op23, 0b00, op11_8, op4,
OpcodeStr, !strconcat(Dt, "8"),
v8i16, v8i8, OpNode, ExtOp, Commutable>;
@@ -4371,7 +4430,7 @@
v8i16, v8i8, int_arm_neon_vmullp, 1>;
def VMULLp64 : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary,
"vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>,
- Requires<[HasV8, HasCrypto]>;
+ Requires<[HasV8, HasAES]>;
}
defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", ARMvmulls>;
defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", ARMvmullu>;
@@ -5282,6 +5341,29 @@
def VORRq : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr",
v4i32, v4i32, or, 1>;
+multiclass BitwisePatterns<string Name, SDPatternOperator OpNodeD,
+ SDPatternOperator OpNodeQ> {
+ def : Pat<(v8i8 (OpNodeD DPR:$LHS, DPR:$RHS)),
+ (!cast<Instruction>(Name#"d") DPR:$LHS, DPR:$RHS)>;
+ def : Pat<(v4i16 (OpNodeD DPR:$LHS, DPR:$RHS)),
+ (!cast<Instruction>(Name#"d") DPR:$LHS, DPR:$RHS)>;
+ def : Pat<(v1i64 (OpNodeD DPR:$LHS, DPR:$RHS)),
+ (!cast<Instruction>(Name#"d") DPR:$LHS, DPR:$RHS)>;
+
+ def : Pat<(v16i8 (OpNodeQ QPR:$LHS, QPR:$RHS)),
+ (!cast<Instruction>(Name#"q") QPR:$LHS, QPR:$RHS)>;
+ def : Pat<(v8i16 (OpNodeQ QPR:$LHS, QPR:$RHS)),
+ (!cast<Instruction>(Name#"q") QPR:$LHS, QPR:$RHS)>;
+ def : Pat<(v2i64 (OpNodeQ QPR:$LHS, QPR:$RHS)),
+ (!cast<Instruction>(Name#"q") QPR:$LHS, QPR:$RHS)>;
+}
+
+let Predicates = [HasNEON] in {
+ defm : BitwisePatterns<"VAND", and, and>;
+ defm : BitwisePatterns<"VORR", or, or>;
+ defm : BitwisePatterns<"VEOR", xor, xor>;
+}
+
def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1,
(outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
IIC_VMOVImm,
@@ -5333,6 +5415,11 @@
(vnotq QPR:$Vm))))]>;
}
+let Predicates = [HasNEON] in {
+ defm : BitwisePatterns<"VBIC", BinOpFrag<(and node:$LHS, (vnotd node:$RHS))>,
+ BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>>;
+}
+
def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1,
(outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
IIC_VMOVImm,
@@ -5381,6 +5468,11 @@
[(set QPR:$Vd, (v4i32 (or QPR:$Vn,
(vnotq QPR:$Vm))))]>;
+let Predicates = [HasNEON] in {
+ defm : BitwisePatterns<"VORN", BinOpFrag<(or node:$LHS, (vnotd node:$RHS))>,
+ BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>>;
+}
+
// VMVN : Vector Bitwise NOT (Immediate)
let isReMaterializable = 1 in {
@@ -5424,8 +5516,18 @@
"vmvn", "$Vd, $Vm", "",
[(set QPR:$Vd, (v4i32 (vnotq QPR:$Vm)))]>;
let Predicates = [HasNEON] in {
-def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>;
-def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>;
+def : Pat<(v1i64 (vnotd DPR:$src)),
+ (VMVNd DPR:$src)>;
+def : Pat<(v4i16 (vnotd DPR:$src)),
+ (VMVNd DPR:$src)>;
+def : Pat<(v8i8 (vnotd DPR:$src)),
+ (VMVNd DPR:$src)>;
+def : Pat<(v2i64 (vnotq QPR:$src)),
+ (VMVNq QPR:$src)>;
+def : Pat<(v8i16 (vnotq QPR:$src)),
+ (VMVNq QPR:$src)>;
+def : Pat<(v16i8 (vnotq QPR:$src)),
+ (VMVNq QPR:$src)>;
}
// The TwoAddress pass will not go looking for equivalent operations
@@ -5454,10 +5556,15 @@
(v1i64 DPR:$Vn), (v1i64 DPR:$Vm))),
(VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v8i8 (or (and DPR:$Vn, DPR:$Vd),
+ (and DPR:$Vm, (vnotd DPR:$Vd)))),
+ (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i16 (or (and DPR:$Vn, DPR:$Vd),
+ (and DPR:$Vm, (vnotd DPR:$Vd)))),
+ (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
(and DPR:$Vm, (vnotd DPR:$Vd)))),
(VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
-
def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd),
(and DPR:$Vm, (vnotd DPR:$Vd)))),
(VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
@@ -5485,6 +5592,12 @@
(v2i64 QPR:$Vn), (v2i64 QPR:$Vm))),
(VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v16i8 (or (and QPR:$Vn, QPR:$Vd),
+ (and QPR:$Vm, (vnotq QPR:$Vd)))),
+ (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v8i16 (or (and QPR:$Vn, QPR:$Vd),
+ (and QPR:$Vm, (vnotq QPR:$Vd)))),
+ (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
(and QPR:$Vm, (vnotq QPR:$Vd)))),
(VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
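Every one of these VBSP patterns matches the same bitwise-select dag, (or (and Vn, Vd), (and Vm, (vnot Vd))), just at a different lane type. A minimal standalone C++ sketch of that identity on plain 64-bit words (the names below are illustrative stand-ins, not the NEON registers):

    #include <cassert>
    #include <cstdint>

    // Bitwise select: for each bit, take Vn where the mask Vd is 1, else Vm.
    // This is exactly (Vn & Vd) | (Vm & ~Vd), the dag the VBSP patterns match.
    static uint64_t bitwiseSelect(uint64_t Vd, uint64_t Vn, uint64_t Vm) {
      return (Vn & Vd) | (Vm & ~Vd);
    }

    int main() {
      uint64_t mask = 0x00FF00FF00FF00FFULL;
      uint64_t a = 0x1111111111111111ULL;
      uint64_t b = 0x2222222222222222ULL;
      assert(bitwiseSelect(mask, a, b) == 0x2211221122112211ULL);
      // An all-ones mask yields the first input; an all-zeros mask the second.
      assert(bitwiseSelect(~0ULL, a, b) == a);
      assert(bitwiseSelect(0, a, b) == b);
      return 0;
    }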
@@ -5574,10 +5687,10 @@
(zext node:$in2)), (i32 $shift))>;
let Predicates = [HasNEON] in {
-def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))),
- (v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)),
- (zext (v2i32 DPR:$opB))),
- (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))))),
+def : Pat<(xor (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)),
+ (v2i64 (add (sub (zext (v2i32 DPR:$opA)),
+ (zext (v2i32 DPR:$opB))),
+ (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))),
(VABDLuv2i64 DPR:$opA, DPR:$opB)>;
}
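The rewritten pattern recognizes |a - b| computed via the sign-mask trick: with m = (a - b) >> 63 (arithmetic shift, the abd_shr fragment), |a - b| = ((a - b) + m) ^ m. A small standalone check of that identity, assuming the operands were zero-extended from 32 bits so the subtraction cannot overflow, as in the pattern; the function name is illustrative:

    #include <cassert>
    #include <cstdint>

    // |d| via the sign-mask trick matched by the VABDLu pattern:
    // m = d >> 63 (arithmetic, all-ones when d < 0), and |d| = (d + m) ^ m.
    static uint64_t absViaSignMask(int64_t a, int64_t b) {
      int64_t d = a - b;
      int64_t m = d >> 63;            // abd_shr: arithmetic shift of the difference
      return (uint64_t)((d + m) ^ m); // subtracts 1 then flips bits when d < 0
    }

    int main() {
      assert(absViaSignMask(5, 9) == 4);
      assert(absViaSignMask(9, 5) == 4);
      assert(absViaSignMask(7, 7) == 0);
      return 0;
    }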
@@ -6482,8 +6595,6 @@
defm : InsertEltF16<f16, v4f16, v8f16>;
-//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
-// (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
(INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
@@ -6494,6 +6605,11 @@
def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
+def : Pat<(v4f16 (scalar_to_vector (f16 HPR:$src))),
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$src, ssub_0)>;
+def : Pat<(v8f16 (scalar_to_vector (f16 HPR:$src))),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), HPR:$src, ssub_0)>;
+
def : Pat<(v8i8 (scalar_to_vector GPR:$src)),
(VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
def : Pat<(v4i16 (scalar_to_vector GPR:$src)),
@@ -7207,33 +7323,31 @@
DecoderNamespace = "v8Crypto", hasSideEffects = 0 in {
class AES<string op, bit op7, bit op6, SDPatternOperator Int>
: N2VQIntXnp<0b00, 0b00, 0b011, op6, op7, NoItinerary,
- !strconcat("aes", op), "8", v16i8, v16i8, Int>,
- Requires<[HasV8, HasCrypto]>;
+ !strconcat("aes", op), "8", v16i8, v16i8, Int>;
class AES2Op<string op, bit op7, bit op6, SDPatternOperator Int>
: N2VQIntX2np<0b00, 0b00, 0b011, op6, op7, NoItinerary,
- !strconcat("aes", op), "8", v16i8, v16i8, Int>,
- Requires<[HasV8, HasCrypto]>;
+ !strconcat("aes", op), "8", v16i8, v16i8, Int>;
class N2SHA<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
SDPatternOperator Int>
: N2VQIntXnp<0b10, op17_16, op10_8, op6, op7, NoItinerary,
- !strconcat("sha", op), "32", v4i32, v4i32, Int>,
- Requires<[HasV8, HasCrypto]>;
+ !strconcat("sha", op), "32", v4i32, v4i32, Int>;
class N2SHA2Op<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
SDPatternOperator Int>
: N2VQIntX2np<0b10, op17_16, op10_8, op6, op7, NoItinerary,
- !strconcat("sha", op), "32", v4i32, v4i32, Int>,
- Requires<[HasV8, HasCrypto]>;
+ !strconcat("sha", op), "32", v4i32, v4i32, Int>;
class N3SHA3Op<string op, bits<5> op27_23, bits<2> op21_20, SDPatternOperator Int>
: N3VQInt3np<op27_23, op21_20, 0b1100, 1, 0, N3RegFrm, NoItinerary,
- !strconcat("sha", op), "32", v4i32, v4i32, Int, 0>,
- Requires<[HasV8, HasCrypto]>;
+ !strconcat("sha", op), "32", v4i32, v4i32, Int, 0>;
}
+let Predicates = [HasV8, HasAES] in {
def AESD : AES2Op<"d", 0, 1, int_arm_neon_aesd>;
def AESE : AES2Op<"e", 0, 0, int_arm_neon_aese>;
def AESIMC : AES<"imc", 1, 1, int_arm_neon_aesimc>;
def AESMC : AES<"mc", 1, 0, int_arm_neon_aesmc>;
+}
+let Predicates = [HasV8, HasSHA2] in {
def SHA1H : N2SHA<"1h", 0b01, 0b010, 1, 1, null_frag>;
def SHA1SU1 : N2SHA2Op<"1su1", 0b10, 0b011, 1, 0, int_arm_neon_sha1su1>;
def SHA256SU0 : N2SHA2Op<"256su0", 0b10, 0b011, 1, 1, int_arm_neon_sha256su0>;
@@ -7244,6 +7358,7 @@
def SHA256H : N3SHA3Op<"256h", 0b00110, 0b00, int_arm_neon_sha256h>;
def SHA256H2 : N3SHA3Op<"256h2", 0b00110, 0b01, int_arm_neon_sha256h2>;
def SHA256SU1 : N3SHA3Op<"256su1", 0b00110, 0b10, int_arm_neon_sha256su1>;
+}
let Predicates = [HasNEON] in {
def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)),
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td
index 3a33dfe..ef07b28 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -1659,19 +1659,16 @@
(tSTMIA_UPD tGPR:$Rn, tGPR:$Rt)>;
// If it's impossible to use [r,r] address mode for sextload, select to
-// ldr{b|h} + sxt{b|h} instead.
-def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
- (tSXTB (tLDRBi t_addrmode_is1:$addr))>,
+// ldrs{b|h} r, 0 instead, in the hope that the mov 0 will be more likely to be
+// commoned out than a sxth.
+let AddedComplexity = 10 in {
+def : T1Pat<(sextloadi8 tGPR:$Rn),
+ (tLDRSB tGPR:$Rn, (tMOVi8 0))>,
Requires<[IsThumb, IsThumb1Only, HasV6]>;
-def : T1Pat<(sextloadi8 t_addrmode_rr:$addr),
- (tSXTB (tLDRBr t_addrmode_rr:$addr))>,
+def : T1Pat<(sextloadi16 tGPR:$Rn),
+ (tLDRSH tGPR:$Rn, (tMOVi8 0))>,
Requires<[IsThumb, IsThumb1Only, HasV6]>;
-def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
- (tSXTH (tLDRHi t_addrmode_is2:$addr))>,
- Requires<[IsThumb, IsThumb1Only, HasV6]>;
-def : T1Pat<(sextloadi16 t_addrmode_rr:$addr),
- (tSXTH (tLDRHr t_addrmode_rr:$addr))>,
- Requires<[IsThumb, IsThumb1Only, HasV6]>;
+}
def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
(tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>;
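The fallback pattern just above selects an i8 sextload as a plain byte load followed by lsl #24 / asr #24. A tiny standalone sketch of why that shift pair sign-extends (the helper name is illustrative):

    #include <cassert>
    #include <cstdint>

    // Sign-extend the low 8 bits of a loaded byte, the way the
    // (tASRri (tLSLri ..., 24), 24) fallback pattern does it.
    static int32_t signExtend8(uint32_t loadedByte) {
      uint32_t shifted = loadedByte << 24;   // lsl #24: move bit 7 into bit 31
      return (int32_t)shifted >> 24;         // asr #24: replicate the sign bit down
    }

    int main() {
      assert(signExtend8(0x7F) == 127);
      assert(signExtend8(0x80) == -128);
      assert(signExtend8(0xFF) == -1);
      return 0;
    }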
@@ -1769,3 +1766,21 @@
def tLDRConstPool
: tAsmPseudo<"ldr${p} $Rt, $immediate",
(ins tGPR:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
+
+//===----------------------------------
+// Atomic cmpxchg for -O0
+//===----------------------------------
+
+// See ARMInstrInfo.td. These two Thumb-specific pseudos are required to
+// restrict the register class for the UXTB/UXTH ops used in the expansion.
+
+let Constraints = "@earlyclobber $Rd,@earlyclobber $temp",
+ mayLoad = 1, mayStore = 1 in {
+def tCMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
+ (ins GPR:$addr, tGPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+
+def tCMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
+ (ins GPR:$addr, tGPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+}
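These pseudos are expanded later (outside this diff) into an exclusive load/store retry loop, with UXTB/UXTH zero-extending the loaded value for the comparison. As a rough behavioural model only, using standard C++ atomics rather than the actual expansion; the helper name is illustrative:

    #include <atomic>
    #include <cassert>
    #include <cstdint>

    // Behavioural model of a byte-sized compare-and-swap: returns the value that
    // was observed, which equals 'desired' exactly when the swap succeeded.
    static uint8_t cmpSwap8(std::atomic<uint8_t> &addr, uint8_t desired,
                            uint8_t newVal) {
      uint8_t observed = desired;
      addr.compare_exchange_strong(observed, newVal);
      return observed;
    }

    int main() {
      std::atomic<uint8_t> v{42};
      assert(cmpSwap8(v, 42, 7) == 42 && v.load() == 7);  // matched: swapped
      assert(cmpSwap8(v, 42, 9) == 7 && v.load() == 7);   // mismatched: unchanged
      return 0;
    }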
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 5642cab..e7eed2a 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -63,6 +63,12 @@
let MIOperandInfo = (ops rGPR, i32imm);
}
+// Same as above, but only matching on a single use node.
+def t2_so_reg_oneuse : Operand<i32>,
+ ComplexPattern<i32, 2,
+ "SelectShiftImmShifterOperandOneUse",
+ [shl,srl,sra,rotr]>;
+
// t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value
def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), SDLoc(N),
@@ -1563,6 +1569,14 @@
Sched<[WriteLd]>;
} // mayLoad = 1, hasSideEffects = 0
+// F5.1.72 LDR (immediate) T4
+// .w suffixes; Constraints can't be used on t2InstAlias to describe
+// "$Rn = $Rn_wb" on POST or "$addr.base = $Rn_wb" on PRE.
+def t2LDR_PRE_imm : t2AsmPseudo<"ldr${p}.w $Rt, $addr!",
+ (ins GPR:$Rt, t2addrmode_imm8_pre:$addr, pred:$p)>;
+def t2LDR_POST_imm : t2AsmPseudo<"ldr${p}.w $Rt, $Rn, $imm",
+ (ins GPR:$Rt, addr_offset_none:$Rn, t2am_imm8_offset:$imm, pred:$p)>;
+
// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110).
// Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
@@ -1720,6 +1734,15 @@
Sched<[WriteST]>;
}
+// F5.1.229 STR (immediate) T4
+// .w suffixes; Constraints can't be used on t2InstAlias to describe
+// "$Rn = $Rn_wb,@earlyclobber $Rn_wb" on POST or
+// "$addr.base = $Rn_wb,@earlyclobber $Rn_wb" on PRE.
+def t2STR_PRE_imm : t2AsmPseudo<"str${p}.w $Rt, $addr!",
+ (ins GPR:$Rt, t2addrmode_imm8_pre:$addr, pred:$p)>;
+def t2STR_POST_imm : t2AsmPseudo<"str${p}.w $Rt, $Rn, $imm",
+ (ins GPR:$Rt, addr_offset_none:$Rn, t2am_imm8_offset:$imm, pred:$p)>;
+
// STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly
// only.
// Ref: A8.6.193 STR (immediate, Thumb) Encoding T4
@@ -2498,6 +2521,7 @@
(t2QDADD rGPR:$Rm, rGPR:$Rn)>;
def : Thumb2DSPPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
(t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
+
def : Thumb2DSPPat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn),
(t2QADD8 rGPR:$Rm, rGPR:$Rn)>;
def : Thumb2DSPPat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn),
@@ -2507,6 +2531,15 @@
def : Thumb2DSPPat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn),
(t2QSUB16 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMuqadd8b rGPR:$Rm, rGPR:$Rn),
+ (t2UQADD8 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMuqsub8b rGPR:$Rm, rGPR:$Rn),
+ (t2UQSUB8 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMuqadd16b rGPR:$Rm, rGPR:$Rn),
+ (t2UQADD16 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMuqsub16b rGPR:$Rm, rGPR:$Rn),
+ (t2UQSUB16 rGPR:$Rm, rGPR:$Rn)>;
+
// Signed/Unsigned add/subtract
def t2SASX : T2I_pam_intrinsics<0b010, 0b0000, "sasx", int_arm_sasx>;
@@ -2660,7 +2693,7 @@
defm t2LSL : T2I_sh_ir<0b00, "lsl", imm1_31, shl>;
defm t2LSR : T2I_sh_ir<0b01, "lsr", imm_sr, srl>;
defm t2ASR : T2I_sh_ir<0b10, "asr", imm_sr, sra>;
-defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31, rotr>;
+defm t2ROR : T2I_sh_ir<0b11, "ror", imm1_31, rotr>;
// LSL #0 is actually MOV, and has slightly different permitted registers to
// LSL with non-zero shift
@@ -2832,6 +2865,12 @@
defm t2ORN : T2I_bin_irs<0b0011, "orn",
IIC_iBITi, IIC_iBITr, IIC_iBITsi,
BinOpFrag<(or node:$LHS, (not node:$RHS))>, 0, "">;
+def : t2InstAlias<"orn${s}${p}.w $Rd, $Rn, $imm",
+ (t2ORNri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"orn${s}${p}.w $Rd, $Rn, $Rm",
+ (t2ORNrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"orn${s}${p}.w $Rd, $Rn, $ShiftedRm",
+ (t2ORNrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>;
/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
/// unary operation that produces a value. These are predicable and can be
@@ -3354,8 +3393,8 @@
(t2CMPri GPRnopc:$lhs, t2_so_imm:$imm)>;
def : T2Pat<(ARMcmpZ GPRnopc:$lhs, rGPR:$rhs),
(t2CMPrr GPRnopc:$lhs, rGPR:$rhs)>;
-def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_reg:$rhs),
- (t2CMPrs GPRnopc:$lhs, t2_so_reg:$rhs)>;
+def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_reg_oneuse:$rhs),
+ (t2CMPrs GPRnopc:$lhs, t2_so_reg_oneuse:$rhs)>;
let isCompare = 1, Defs = [CPSR] in {
// shifted imm
@@ -3950,6 +3989,8 @@
let Inst{15-0} = 0b1000111100000000;
}
+def : t2InstAlias<"bl${p}.w $func", (tBL pred:$p, thumb_bl_target:$func), 0>;
+
// Compare and branch on zero / non-zero
let isBranch = 1, isTerminator = 1 in {
def tCBZ : T1I<(outs), (ins tGPR:$Rn, thumb_cb_target:$target), IIC_Br,
@@ -4047,6 +4088,7 @@
let Inst{7-4} = 0b1111;
let Inst{3-0} = opt;
}
+def : t2InstAlias<"dbg${p}.w $opt", (t2DBG imm0_15:$opt, pred:$p), 0>;
// Secure Monitor Call is a system instruction.
// Option = Inst{19-16}
@@ -4145,6 +4187,18 @@
let Inst{7-0} = imm;
}
+// B9.3.19 SUBS PC, LR (Thumb)
+// In the Thumb instruction set, MOVS{<c>}{<q>} PC, LR is a pseudo-instruction
+// for SUBS{<c>}{<q>} PC, LR, #0.
+def : t2InstAlias<"movs${p}\tpc, lr", (t2SUBS_PC_LR 0, pred:$p)>;
+def : t2InstAlias<"movs${p}.w\tpc, lr", (t2SUBS_PC_LR 0, pred:$p)>;
+
+// ERET - Return from exception in Hypervisor mode.
+// B9.3.3, B9.3.20: ERET is an alias for "SUBS PC, LR, #0" in an implementation that
+// includes virtualization extensions.
+def t2ERET : InstAlias<"eret${p}", (t2SUBS_PC_LR 0, pred:$p), 1>,
+ Requires<[IsThumb2, HasVirtualization]>;
+
// Hypervisor Call is a system instruction.
let isCall = 1 in {
def t2HVC : T2XI <(outs), (ins imm0_65535:$imm16), IIC_Br, "hvc.w\t$imm16", []>,
@@ -4160,12 +4214,6 @@
// Alias for HVC without the ".w" optional width specifier
def : t2InstAlias<"hvc\t$imm16", (t2HVC imm0_65535:$imm16)>;
-// ERET - Return from exception in Hypervisor mode.
-// B9.3.3, B9.3.20: ERET is an alias for "SUBS PC, LR, #0" in an implementation that
-// includes virtualization extensions.
-def t2ERET : InstAlias<"eret${p}", (t2SUBS_PC_LR 0, pred:$p), 1>,
- Requires<[IsThumb2, HasVirtualization]>;
-
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//
@@ -5032,8 +5080,8 @@
def : t2InstAlias<"revsh${p} $Rd, $Rm", (t2REVSH rGPR:$Rd, rGPR:$Rm, pred:$p)>;
-// Alias for RSB without the ".w" optional width specifier, and with optional
-// implied destination register.
+// Alias for RSB with and without the ".w" optional width specifier, with and
+// without explicit destination register.
def : t2InstAlias<"rsb${s}${p} $Rd, $Rn, $imm",
(t2RSBri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
def : t2InstAlias<"rsb${s}${p} $Rdn, $imm",
@@ -5043,6 +5091,13 @@
def : t2InstAlias<"rsb${s}${p} $Rdn, $ShiftedRm",
(t2RSBrs rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$ShiftedRm, pred:$p,
cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p}.w $Rdn, $Rm",
+ (t2RSBrr rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p}.w $Rd, $Rn, $Rm",
+ (t2RSBrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p}.w $Rd, $Rn, $ShiftedRm",
+ (t2RSBrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p,
+ cc_out:$s)>;
// SSAT/USAT optional shift operand.
def : t2InstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
@@ -5431,35 +5486,74 @@
let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB] in {
-let usesCustomInserter = 1 in
+// t2DoLoopStart is a pseudo for DLS hardware loops. It is lowered into a DLS
+// in ARMLowOverheadLoops if possible, or reverted to a Mov if not.
def t2DoLoopStart :
- t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br,
- [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>;
+ t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$tc), 4, IIC_Br,
+ [(set GPRlr:$X, (int_start_loop_iterations rGPR:$tc))]>;
+// A pseudo for a DLSTP, created in the MVETPAndVPTOptimizationPass from a
+// t2DoLoopStart if the loop is tail predicated. Holds both the element
+// count and trip count of the loop, picking the correct one during
+// ARMLowOverheadLoops when it is converted to a DLSTP or DLS as required.
let isTerminator = 1, hasSideEffects = 1 in
def t2DoLoopStartTP :
- t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts, rGPR:$count), 4, IIC_Br, []>;
+ t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$tc, rGPR:$elts), 4, IIC_Br, []>;
+// Setup for a t2WhileLoopStart. A pair of t2WhileLoopSetup and t2WhileLoopStart
+// will be created post-ISel from an llvm.test.start.loop.iterations: the
+// t2WhileLoopSetup sets up LR and the t2WhileLoopStart performs the branch.
+// It is not valid after reg alloc, as it should be lowered during
+// MVETPAndVPTOptimisations into a t2WhileLoopStartLR (or expanded).
+def t2WhileLoopSetup :
+ t2PseudoInst<(outs GPRlr:$lr), (ins rGPR:$tc), 4, IIC_Br, []>;
+
+// A pseudo to represent the decrement in a low overhead loop. A t2LoopDec and
+// t2LoopEnd together represent a LE instruction. Ideally these are converted
+// to a t2LoopEndDec which is lowered as a single instruction.
let hasSideEffects = 0 in
def t2LoopDec :
t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size),
4, IIC_Br, []>, Sched<[WriteBr]>;
let isBranch = 1, isTerminator = 1, hasSideEffects = 1, Defs = [CPSR] in {
-// Set WhileLoopStart and LoopEnd to occupy 8 bytes because they may
-// get converted into t2CMP and t2Bcc.
+// The branch in a t2WhileLoopSetup/t2WhileLoopStart pair, eventually turned
+// into a t2WhileLoopStartLR that does both the LR setup and branch.
def t2WhileLoopStart :
t2PseudoInst<(outs),
- (ins rGPR:$elts, brtarget:$target),
+ (ins GPRlr:$tc, brtarget:$target),
+ 4, IIC_Br, []>,
+ Sched<[WriteBr]>;
+
+// WhileLoopStartLR that sets up LR and branches on zero, equivalent to WLS. It
+// is lowered in the ARMLowOverheadLoops pass provided the branches are within
+// range. WhileLoopStartLR and LoopEnd are set to occupy 8 bytes because they
+// may get converted into t2CMP and t2Bcc.
+def t2WhileLoopStartLR :
+ t2PseudoInst<(outs GPRlr:$lr),
+ (ins rGPR:$tc, brtarget:$target),
8, IIC_Br, []>,
Sched<[WriteBr]>;
+// Similar to a t2DoLoopStartTP, a t2WhileLoopStartTP is a pseudo for a WLSTP
+// holding both the element count and the tripcount of the loop.
+def t2WhileLoopStartTP :
+ t2PseudoInst<(outs GPRlr:$lr),
+ (ins rGPR:$tc, rGPR:$elts, brtarget:$target),
+ 8, IIC_Br, []>,
+ Sched<[WriteBr]>;
+
+// t2LoopEnd - the branch half of a t2LoopDec/t2LoopEnd pair.
def t2LoopEnd :
- t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target),
+ t2PseudoInst<(outs), (ins GPRlr:$tc, brtarget:$target),
8, IIC_Br, []>, Sched<[WriteBr]>;
+// The combination of a t2LoopDec and t2LoopEnd, performing both the LR
+// decrement and branch as a single instruction. It is lowered to an LE or
+// LETP in ARMLowOverheadLoops as appropriate, or converted to t2CMP/t2Bcc
+// if the branches are out of range.
def t2LoopEndDec :
- t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$elts, brtarget:$target),
+ t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$tc, brtarget:$target),
8, IIC_Br, []>, Sched<[WriteBr]>;
} // end isBranch, isTerminator, hasSideEffects
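As a rough behavioural model of these pseudos (a sketch, not the lowering itself): the while-loop start sets LR to the trip count and branches past the loop when it is zero, and the combined decrement/end subtracts one from LR and branches back while it is non-zero.

    #include <cassert>
    #include <cstdint>

    // Rough scalar model of a WLS/LE low-overhead loop: the body runs exactly
    // 'tripCount' times, and a zero trip count branches straight past the loop.
    static uint32_t runLowOverheadLoop(uint32_t tripCount) {
      uint32_t bodyExecutions = 0;
      uint32_t lr = tripCount;       // t2WhileLoopStartLR: LR := trip count,
      if (lr == 0)                   // branch over the loop when it is zero.
        return bodyExecutions;
      do {
        ++bodyExecutions;            // loop body
        lr -= 1;                     // t2LoopEndDec: decrement LR and
      } while (lr != 0);             // branch back while it is non-zero.
      return bodyExecutions;
    }

    int main() {
      assert(runLowOverheadLoop(0) == 0);
      assert(runLowOverheadLoop(1) == 1);
      assert(runLowOverheadLoop(5) == 5);
      return 0;
    }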
@@ -5492,13 +5586,22 @@
def t2CSINV : CS<"csinv", 0b1010>;
def t2CSNEG : CS<"csneg", 0b1011>;
+
let Predicates = [HasV8_1MMainline] in {
- def : T2Pat<(ARMcsinc GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
- (t2CSINC GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
- def : T2Pat<(ARMcsinv GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
- (t2CSINV GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
- def : T2Pat<(ARMcsneg GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
- (t2CSNEG GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+ multiclass CSPats<SDNode Node, Instruction Insn> {
+ def : T2Pat<(Node GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
+ (Insn GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+ def : T2Pat<(Node (i32 0), GPRwithZR:$fval, imm0_31:$imm),
+ (Insn ZR, GPRwithZR:$fval, imm0_31:$imm)>;
+ def : T2Pat<(Node GPRwithZR:$tval, (i32 0), imm0_31:$imm),
+ (Insn GPRwithZR:$tval, ZR, imm0_31:$imm)>;
+ def : T2Pat<(Node (i32 0), (i32 0), imm0_31:$imm),
+ (Insn ZR, ZR, imm0_31:$imm)>;
+ }
+
+ defm : CSPats<ARMcsinc, t2CSINC>;
+ defm : CSPats<ARMcsinv, t2CSINV>;
+ defm : CSPats<ARMcsneg, t2CSNEG>;
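The extra patterns simply map an i32 0 operand onto the zero register ZR. For reference, a standalone model of what the three conditional-select instructions compute, per my reading of the Armv8.1-M CSEL family, with ZR modelled as a plain zero:

    #include <cassert>
    #include <cstdint>

    // cond is the already-evaluated condition; t/f model the two source
    // registers, with the zero register ZR passed as plain 0.
    static uint32_t csinc(bool cond, uint32_t t, uint32_t f) { return cond ? t : f + 1; }
    static uint32_t csinv(bool cond, uint32_t t, uint32_t f) { return cond ? t : ~f; }
    static uint32_t csneg(bool cond, uint32_t t, uint32_t f) { return cond ? t : 0u - f; }

    int main() {
      // CSINC with both operands ZR is the classic "materialize 0 or 1" idiom.
      assert(csinc(true, 0, 0) == 0 && csinc(false, 0, 0) == 1);
      // CSINV with ZR gives 0 or all-ones; CSNEG gives x or -x.
      assert(csinv(false, 0, 0) == 0xFFFFFFFFu);
      assert(csneg(false, 0, 5) == 0xFFFFFFFBu);
      return 0;
    }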
multiclass ModifiedV8_1CSEL<Instruction Insn, dag modvalue> {
def : T2Pat<(ARMcmov modvalue, GPRwithZR:$tval, cmovpred:$imm),
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td
index 2be58d7..bcd6433 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -1126,9 +1126,12 @@
Requires<[HasFullFP16]>;
def VINSH : ASuInp<0b11101, 0b11, 0b0000, 0b11, 0,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs SPR:$Sd), (ins SPR:$Sda, SPR:$Sm),
IIC_fpUNA16, "vins.f16\t$Sd, $Sm", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]> {
+ let Constraints = "$Sd = $Sda";
+}
+
} // PostEncoderMethod
} // hasSideEffects
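The new tie reflects that vins.f16 writes only the top half of Sd and must preserve the bottom half, so the destination is also an input. A rough bit-level model with a plain 32-bit word standing in for the S register (helper name illustrative):

    #include <cassert>
    #include <cstdint>

    // vins.f16 Sd, Sm inserts the low half of Sm into the high half of Sd while
    // keeping Sd's low half, which is why $Sd has to be tied to an input ($Sda).
    static uint32_t vinsF16(uint32_t sda, uint32_t sm) {
      return (sda & 0x0000FFFFu) | (sm << 16);
    }

    int main() {
      assert(vinsF16(0xAAAABBBBu, 0xCCCCDDDDu) == 0xDDDDBBBBu);
      return 0;
    }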
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
index 09a94cc..8be4e3f 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -1096,24 +1096,6 @@
if (NewOpc == G_LOAD || NewOpc == G_STORE)
return false;
- if (ValSize == 1 && NewOpc == Opcodes.STORE8) {
- // Before storing a 1-bit value, make sure to clear out any unneeded bits.
- Register OriginalValue = I.getOperand(0).getReg();
-
- Register ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass);
- I.getOperand(0).setReg(ValueToStore);
-
- auto InsertBefore = I.getIterator();
- auto AndI = BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(Opcodes.AND))
- .addDef(ValueToStore)
- .addUse(OriginalValue)
- .addImm(1)
- .add(predOps(ARMCC::AL))
- .add(condCodeOp());
- if (!constrainSelectedInstRegOperands(*AndI, TII, TRI, RBI))
- return false;
- }
-
I.setDesc(TII.get(NewOpc));
if (NewOpc == ARM::LDRH || NewOpc == ARM::STRH)
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index d9b60f4..de88ffa 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -33,29 +33,30 @@
/// In practice, not specifying those isn't a problem, and the below functions
/// should disappear quickly as we add support for legalizing non-power-of-2
/// sized types further.
-static void
-addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
- const LegalizerInfo::SizeAndActionsVec &v) {
+static void addAndInterleaveWithUnsupported(
+ LegacyLegalizerInfo::SizeAndActionsVec &result,
+ const LegacyLegalizerInfo::SizeAndActionsVec &v) {
for (unsigned i = 0; i < v.size(); ++i) {
result.push_back(v[i]);
if (i + 1 < v[i].first && i + 1 < v.size() &&
v[i + 1].first != v[i].first + 1)
- result.push_back({v[i].first + 1, Unsupported});
+ result.push_back({v[i].first + 1, LegacyLegalizeActions::Unsupported});
}
}
-static LegalizerInfo::SizeAndActionsVec
-widen_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
+static LegacyLegalizerInfo::SizeAndActionsVec
+widen_8_16(const LegacyLegalizerInfo::SizeAndActionsVec &v) {
assert(v.size() >= 1);
assert(v[0].first > 17);
- LegalizerInfo::SizeAndActionsVec result = {{1, Unsupported},
- {8, WidenScalar},
- {9, Unsupported},
- {16, WidenScalar},
- {17, Unsupported}};
+ LegacyLegalizerInfo::SizeAndActionsVec result = {
+ {1, LegacyLegalizeActions::Unsupported},
+ {8, LegacyLegalizeActions::WidenScalar},
+ {9, LegacyLegalizeActions::Unsupported},
+ {16, LegacyLegalizeActions::WidenScalar},
+ {17, LegacyLegalizeActions::Unsupported}};
addAndInterleaveWithUnsupported(result, v);
auto Largest = result.back().first;
- result.push_back({Largest + 1, Unsupported});
+ result.push_back({Largest + 1, LegacyLegalizeActions::Unsupported});
return result;
}
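Assuming the usual reading of a legacy SizeAndActionsVec, each {Size, Action} entry covers every bit width from Size up to the next entry, so the prefix built here marks s8 and s16 for widening and everything in between as unsupported until the legal sizes from v are merged in. An illustrative lookup sketch over that prefix (plain STL types, not the LLVM classes):

    #include <cassert>
    #include <string>
    #include <utility>
    #include <vector>

    // Illustrative model of a legacy SizeAndActionsVec lookup (assumption: each
    // {Size, Action} entry covers all widths from Size up to the next entry).
    using SizeAndAction = std::pair<unsigned, std::string>;

    static std::string actionFor(const std::vector<SizeAndAction> &Table,
                                 unsigned Bits) {
      std::string Action = "Unsupported";
      for (const auto &Entry : Table)
        if (Entry.first <= Bits)
          Action = Entry.second;   // last entry whose size is <= Bits wins
      return Action;
    }

    int main() {
      // The fixed prefix built by widen_8_16 before the legal sizes are merged in.
      std::vector<SizeAndAction> Table = {{1, "Unsupported"},
                                          {8, "WidenScalar"},
                                          {9, "Unsupported"},
                                          {16, "WidenScalar"},
                                          {17, "Unsupported"}};
      assert(actionFor(Table, 7) == "Unsupported");   // 1-7: nothing defined yet
      assert(actionFor(Table, 8) == "WidenScalar");   // s8 is widened
      assert(actionFor(Table, 16) == "WidenScalar");  // s16 is widened
      assert(actionFor(Table, 21) == "Unsupported");  // until the legal sizes from v
      return 0;
    }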
@@ -74,9 +75,10 @@
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
+ auto &LegacyInfo = getLegacyLegalizerInfo();
if (ST.isThumb1Only()) {
// Thumb1 is not supported yet.
- computeTables();
+ LegacyInfo.computeTables();
verify(*ST.getInstrInfo());
return;
}
@@ -116,13 +118,13 @@
.clampScalar(0, s32, s32);
for (unsigned Op : {G_SREM, G_UREM}) {
- setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16);
+ LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16);
if (HasHWDivide)
- setAction({Op, s32}, Lower);
+ LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Lower);
else if (AEABI(ST))
- setAction({Op, s32}, Custom);
+ LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Custom);
else
- setAction({Op, s32}, Libcall);
+ LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Libcall);
}
getActionDefinitionsBuilder(G_INTTOPTR)
@@ -147,11 +149,10 @@
// We're keeping these builders around because we'll want to add support for
// floating point to them.
auto &LoadStoreBuilder = getActionDefinitionsBuilder({G_LOAD, G_STORE})
- .legalForTypesWithMemDesc({{s1, p0, 8, 8},
- {s8, p0, 8, 8},
- {s16, p0, 16, 8},
- {s32, p0, 32, 8},
- {p0, p0, 32, 8}})
+ .legalForTypesWithMemDesc({{s8, p0, s8, 8},
+ {s16, p0, s16, 8},
+ {s32, p0, s32, 8},
+ {p0, p0, p0, 8}})
.unsupportedIfMemSizeNotPow2();
getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
@@ -174,7 +175,7 @@
.legalFor({s32, s64});
LoadStoreBuilder
- .legalForTypesWithMemDesc({{s64, p0, 64, 32}})
+ .legalForTypesWithMemDesc({{s64, p0, s64, 32}})
.maxScalar(0, s32);
PhiBuilder.legalFor({s64});
@@ -198,7 +199,7 @@
LoadStoreBuilder.maxScalar(0, s32);
for (auto Ty : {s32, s64})
- setAction({G_FNEG, Ty}, Lower);
+ LegacyInfo.setAction({G_FNEG, Ty}, LegacyLegalizeActions::Lower);
getActionDefinitionsBuilder(G_FCONSTANT).customFor({s32, s64});
@@ -219,6 +220,9 @@
.libcallForCartesianProduct({s32, s64}, {s32});
}
+ // Just expand whatever loads and stores are left.
+ LoadStoreBuilder.lower();
+
if (!ST.useSoftFloat() && ST.hasVFP4Base())
getActionDefinitionsBuilder(G_FMA).legalFor({s32, s64});
else
@@ -246,7 +250,7 @@
.clampScalar(0, s32, s32);
}
- computeTables();
+ LegacyInfo.computeTables();
verify(*ST.getInstrInfo());
}
@@ -385,9 +389,9 @@
StructType *RetTy = StructType::get(Ctx, {ArgTy, ArgTy}, /* Packed */ true);
Register RetRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)),
OriginalResult};
- auto Status = createLibcall(MIRBuilder, Libcall, {RetRegs, RetTy},
- {{MI.getOperand(1).getReg(), ArgTy},
- {MI.getOperand(2).getReg(), ArgTy}});
+ auto Status = createLibcall(MIRBuilder, Libcall, {RetRegs, RetTy, 0},
+ {{MI.getOperand(1).getReg(), ArgTy, 0},
+ {MI.getOperand(2).getReg(), ArgTy, 0}});
if (Status != LegalizerHelper::Legalized)
return false;
break;
@@ -420,10 +424,10 @@
SmallVector<Register, 2> Results;
for (auto Libcall : Libcalls) {
auto LibcallResult = MRI.createGenericVirtualRegister(LLT::scalar(32));
- auto Status =
- createLibcall(MIRBuilder, Libcall.LibcallID, {LibcallResult, RetTy},
- {{MI.getOperand(2).getReg(), ArgTy},
- {MI.getOperand(3).getReg(), ArgTy}});
+ auto Status = createLibcall(MIRBuilder, Libcall.LibcallID,
+ {LibcallResult, RetTy, 0},
+ {{MI.getOperand(2).getReg(), ArgTy, 0},
+ {MI.getOperand(3).getReg(), ArgTy, 0}});
if (Status != LegalizerHelper::Legalized)
return false;
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index aa1fe4e..fd06bfd 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -1238,19 +1238,37 @@
/// Searches for an increment or decrement of \p Reg after \p MBBI.
static MachineBasicBlock::iterator
findIncDecAfter(MachineBasicBlock::iterator MBBI, Register Reg,
- ARMCC::CondCodes Pred, Register PredReg, int &Offset) {
+ ARMCC::CondCodes Pred, Register PredReg, int &Offset,
+ const TargetRegisterInfo *TRI) {
Offset = 0;
MachineBasicBlock &MBB = *MBBI->getParent();
MachineBasicBlock::iterator EndMBBI = MBB.end();
MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
- // Skip debug values.
- while (NextMBBI != EndMBBI && NextMBBI->isDebugInstr())
- ++NextMBBI;
- if (NextMBBI == EndMBBI)
- return EndMBBI;
+ while (NextMBBI != EndMBBI) {
+ // Skip debug values.
+ while (NextMBBI != EndMBBI && NextMBBI->isDebugInstr())
+ ++NextMBBI;
+ if (NextMBBI == EndMBBI)
+ return EndMBBI;
- Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
- return Offset == 0 ? EndMBBI : NextMBBI;
+ unsigned Off = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
+ if (Off) {
+ Offset = Off;
+ return NextMBBI;
+ }
+
+ // An increment of SP can only be combined if it is the instruction immediately
+ // after the original MBBI, otherwise we may be incrementing the stack pointer
+ // (invalidating anything below the new pointer) when its frame elements are
+ // still in use. Other registers can attempt to look further, until a different
+ // use or def of the register is found.
+ if (Reg == ARM::SP || NextMBBI->readsRegister(Reg, TRI) ||
+ NextMBBI->definesRegister(Reg, TRI))
+ return EndMBBI;
+
+ ++NextMBBI;
+ }
+ return EndMBBI;
}
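The reworked loop now scans forward past unrelated instructions rather than inspecting only the next one, and it gives up as soon as the base register is SP (only the adjacent instruction may be folded) or is read or defined by something else. A standalone sketch of that scan over a simplified instruction list (all types and fields are illustrative stand-ins, not LLVM classes):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct Inst {
      bool IsDebug = false;        // debug values are skipped entirely
      int IncDecOf = -1;           // register this instruction increments, if any
      int Offset = 0;              // the increment amount when IncDecOf matches
      bool ReadsOrDefsReg = false; // any other use/def of the base register
    };

    // Mirrors the control flow of findIncDecAfter: scan forward from Start + 1.
    static int findIncDecAfter(const std::vector<Inst> &Block, std::size_t Start,
                               int Reg, bool RegIsSP, int &Offset) {
      Offset = 0;
      for (std::size_t I = Start + 1; I < Block.size(); ++I) {
        if (Block[I].IsDebug)
          continue;                     // skip debug values
        if (Block[I].IncDecOf == Reg) {
          Offset = Block[I].Offset;     // found a foldable increment/decrement
          return (int)I;
        }
        // SP may only be folded when adjacent; other regs stop at any use/def.
        if (RegIsSP || Block[I].ReadsOrDefsReg)
          return -1;
      }
      return -1;
    }

    int main() {
      std::vector<Inst> BB(4);
      BB[1].IsDebug = true;                 // ignored
      BB[3].IncDecOf = 5; BB[3].Offset = 8; // increment of r5 two insts later
      int Off = 0;
      assert(findIncDecAfter(BB, 0, /*Reg=*/5, /*RegIsSP=*/false, Off) == 3 &&
             Off == 8);
      assert(findIncDecAfter(BB, 0, /*Reg=*/5, /*RegIsSP=*/true, Off) == -1);
      return 0;
    }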
/// Fold proceeding/trailing inc/dec of base register into the
@@ -1296,7 +1314,7 @@
} else if (Mode == ARM_AM::ib && Offset == -Bytes) {
Mode = ARM_AM::da;
} else {
- MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset, TRI);
if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) {
@@ -1483,13 +1501,17 @@
} else if (Offset == -Bytes) {
NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
} else {
- MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
- if (Offset == Bytes) {
- NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
- } else if (!isAM5 && Offset == -Bytes) {
- NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
- } else
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset, TRI);
+ if (MergeInstr == MBB.end())
return false;
+
+ NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+ if ((isAM5 && Offset != Bytes) ||
+ (!isAM5 && !isLegalAddressImm(NewOpc, Offset, TII))) {
+ NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+ if (isAM5 || !isLegalAddressImm(NewOpc, Offset, TII))
+ return false;
+ }
}
LLVM_DEBUG(dbgs() << " Erasing old increment: " << *MergeInstr);
MBB.erase(MergeInstr);
@@ -1528,7 +1550,7 @@
(void)MIB;
LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB);
} else {
- int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ int Imm = ARM_AM::getAM2Opc(AddSub, abs(Offset), ARM_AM::no_shift);
auto MIB =
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
@@ -1558,7 +1580,7 @@
// the vestigal zero-reg offset register. When that's fixed, this clause
// can be removed entirely.
if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
- int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ int Imm = ARM_AM::getAM2Opc(AddSub, abs(Offset), ARM_AM::no_shift);
// STR_PRE, STR_POST
auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
.addReg(MO.getReg(), getKillRegState(MO.isKill()))
@@ -1614,10 +1636,11 @@
if (Offset == 8 || Offset == -8) {
NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
} else {
- MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
- if (Offset == 8 || Offset == -8) {
- NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
- } else
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset, TRI);
+ if (MergeInstr == MBB.end())
+ return false;
+ NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
+ if (!isLegalAddressImm(NewOpc, Offset, TII))
return false;
}
LLVM_DEBUG(dbgs() << " Erasing old increment: " << *MergeInstr);
@@ -2703,9 +2726,18 @@
// by -Offset. This can either happen in-place or be a replacement as MI is
// converted to another instruction type.
static void AdjustBaseAndOffset(MachineInstr *MI, Register NewBaseReg,
- int Offset, const TargetInstrInfo *TII) {
+ int Offset, const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ // Set the Base reg
unsigned BaseOp = getBaseOperandIndex(*MI);
MI->getOperand(BaseOp).setReg(NewBaseReg);
+ // and constrain the reg class to that required by the instruction.
+ MachineFunction *MF = MI->getMF();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const MCInstrDesc &MCID = TII->get(MI->getOpcode());
+ const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp, TRI, *MF);
+ MRI.constrainRegClass(NewBaseReg, TRC);
+
int OldOffset = MI->getOperand(BaseOp + 1).getImm();
if (isLegalAddressImm(MI->getOpcode(), OldOffset - Offset, TII))
MI->getOperand(BaseOp + 1).setImm(OldOffset - Offset);
@@ -2948,7 +2980,7 @@
for (auto *Use : SuccessorAccesses) {
LLVM_DEBUG(dbgs() << "Changing: "; Use->dump());
- AdjustBaseAndOffset(Use, NewBaseReg, IncrementOffset, TII);
+ AdjustBaseAndOffset(Use, NewBaseReg, IncrementOffset, TII, TRI);
LLVM_DEBUG(dbgs() << " To : "; Use->dump());
}
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 8dc5320..ea41442 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -31,7 +31,7 @@
/// during the transform and pseudo instructions are replaced by real ones. In
/// some cases, when we have to revert to a 'normal' loop, we have to introduce
/// multiple instructions for a single pseudo (see RevertWhile and
-/// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd
+/// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
/// are defined to be as large as this maximum sequence of replacement
/// instructions.
///
@@ -101,10 +101,6 @@
return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
}
-static bool isDo(MachineInstr *MI) {
- return MI->getOpcode() != ARM::t2WhileLoopStart;
-}
-
namespace {
using InstSet = SmallPtrSetImpl<MachineInstr *>;
@@ -158,7 +154,7 @@
if (auto *Preheader = ML.getLoopPreheader())
GetPredecessor(Preheader);
- else if (auto *Preheader = MLI.findLoopPreheader(&ML, true))
+ else if (auto *Preheader = MLI.findLoopPreheader(&ML, true, true))
GetPredecessor(Preheader);
}
};
@@ -386,7 +382,7 @@
MF = ML.getHeader()->getParent();
if (auto *MBB = ML.getLoopPreheader())
Preheader = MBB;
- else if (auto *MBB = MLI.findLoopPreheader(&ML, true))
+ else if (auto *MBB = MLI.findLoopPreheader(&ML, true, true))
Preheader = MBB;
VPTState::reset();
}
@@ -442,11 +438,11 @@
MachineOperand &getLoopStartOperand() {
if (IsTailPredicationLegal())
return TPNumElements;
- return isDo(Start) ? Start->getOperand(1) : Start->getOperand(0);
+ return Start->getOperand(1);
}
unsigned getStartOpcode() const {
- bool IsDo = isDo(Start);
+ bool IsDo = isDoLoopStart(*Start);
if (!IsTailPredicationLegal())
return IsDo ? ARM::t2DLS : ARM::t2WLS;
@@ -630,31 +626,13 @@
return false;
}
- // Check that creating a [W|D]LSTP, which will define LR with an element
- // count instead of iteration count, won't affect any other instructions
- // than the LoopStart and LoopDec.
- // TODO: We should try to insert the [W|D]LSTP after any of the other uses.
- Register StartReg = isDo(Start) ? Start->getOperand(1).getReg()
- : Start->getOperand(0).getReg();
- if (StartInsertPt == Start && StartReg == ARM::LR) {
- if (auto *IterCount = RDA.getMIOperand(Start, isDo(Start) ? 1 : 0)) {
- SmallPtrSet<MachineInstr *, 2> Uses;
- RDA.getGlobalUses(IterCount, MCRegister::from(ARM::LR), Uses);
- for (auto *Use : Uses) {
- if (Use != Start && Use != Dec) {
- LLVM_DEBUG(dbgs() << " ARM Loops: Found LR use: " << *Use);
- return false;
- }
- }
- }
- }
-
// For tail predication, we need to provide the number of elements, instead
// of the iteration count, to the loop start instruction. The number of
// elements is provided to the vctp instruction, so we need to check that
// we can use this register at InsertPt.
MachineInstr *VCTP = VCTPs.back();
- if (Start->getOpcode() == ARM::t2DoLoopStartTP) {
+ if (Start->getOpcode() == ARM::t2DoLoopStartTP ||
+ Start->getOpcode() == ARM::t2WhileLoopStartTP) {
TPNumElements = Start->getOperand(2);
StartInsertPt = Start;
StartInsertBB = Start->getParent();
@@ -796,6 +774,22 @@
ToRemove.insert(ElementChain.begin(), ElementChain.end());
}
}
+
+ // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
+ // can also remove any extra instructions in the preheader, which often
+ // includes a now unused MOV.
+ if ((Start->getOpcode() == ARM::t2DoLoopStartTP ||
+ Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
+ Preheader && !Preheader->empty() &&
+ !RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) {
+ if (auto *Def = RDA.getUniqueReachingMIDef(
+ &Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) {
+ SmallPtrSet<MachineInstr*, 2> Ignore;
+ Ignore.insert(VCTPs.begin(), VCTPs.end());
+ TryRemove(Def, RDA, ToRemove, Ignore);
+ }
+ }
+
return true;
}
@@ -1050,53 +1044,21 @@
return false;
}
- if (Start->getOpcode() == ARM::t2WhileLoopStart &&
- (BBUtils->getOffsetOf(Start) >
- BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
- !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
- LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
- return false;
+ if (isWhileLoopStart(*Start)) {
+ MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start);
+ if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) ||
+ !BBUtils->isBBInRange(Start, TargetBB, 4094)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
+ return false;
+ }
}
return true;
};
- // Find a suitable position to insert the loop start instruction. It needs to
- // be able to safely define LR.
- auto FindStartInsertionPoint = [](MachineInstr *Start, MachineInstr *Dec,
- MachineBasicBlock::iterator &InsertPt,
- MachineBasicBlock *&InsertBB,
- ReachingDefAnalysis &RDA,
- InstSet &ToRemove) {
- // For a t2DoLoopStart it is always valid to use the start insertion point.
- // For WLS we can define LR if LR already contains the same value.
- if (isDo(Start) || Start->getOperand(0).getReg() == ARM::LR) {
- InsertPt = MachineBasicBlock::iterator(Start);
- InsertBB = Start->getParent();
- return true;
- }
-
- // We've found no suitable LR def and Start doesn't use LR directly. Can we
- // just define LR anyway?
- if (!RDA.isSafeToDefRegAt(Start, MCRegister::from(ARM::LR)))
- return false;
-
- InsertPt = MachineBasicBlock::iterator(Start);
- InsertBB = Start->getParent();
- return true;
- };
-
- if (!FindStartInsertionPoint(Start, Dec, StartInsertPt, StartInsertBB, RDA,
- ToRemove)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
- Revert = true;
- return;
- }
- LLVM_DEBUG(if (StartInsertPt == StartInsertBB->end())
- dbgs() << "ARM Loops: Will insert LoopStart at end of block\n";
- else
- dbgs() << "ARM Loops: Will insert LoopStart at "
- << *StartInsertPt
- );
+ StartInsertPt = MachineBasicBlock::iterator(Start);
+ StartInsertBB = Start->getParent();
+ LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
+ << *StartInsertPt);
Revert = !ValidateRanges(Start, End, BBUtils, ML);
CannotTailPredicate = !ValidateTailPredicate();
@@ -1122,7 +1084,85 @@
return true;
}
-bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
+static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {
+
+ auto GetFrameIndex = [](MachineMemOperand *Operand) {
+ const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
+ if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
+ if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) {
+ return FS->getFrameIndex();
+ }
+ }
+ return -1;
+ };
+
+ auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
+ switch (I->getOpcode()) {
+ case ARM::MVE_VSTRWU32:
+ case ARM::MVE_VLDRWU32: {
+ return I->getOperand(1).getReg() == ARM::SP &&
+ I->memoperands().size() == 1 &&
+ GetFrameIndex(I->memoperands().front()) >= 0;
+ }
+ default:
+ return false;
+ }
+ };
+
+ // An unpredicated vector register spill is allowed if all of the uses of the
+ // stack slot are within the loop
+ if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
+ return false;
+
+ // Search all blocks after the loop for accesses to the same stack slot.
+ // ReachingDefAnalysis doesn't work for sp as it relies on registers being
+ // live-out (which sp never is) to know what blocks to look in
+ if (MI->memoperands().size() == 0)
+ return false;
+ int FI = GetFrameIndex(MI->memoperands().front());
+
+ MachineFrameInfo FrameInfo = MI->getParent()->getParent()->getFrameInfo();
+ if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))
+ return false;
+
+ SmallVector<MachineBasicBlock *> Frontier;
+ ML->getExitBlocks(Frontier);
+ SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};
+ unsigned Idx = 0;
+ while (Idx < Frontier.size()) {
+ MachineBasicBlock *BB = Frontier[Idx];
+ bool LookAtSuccessors = true;
+ for (auto &I : *BB) {
+ if (!IsStackOp(&I) || I.memoperands().size() == 0)
+ continue;
+ if (GetFrameIndex(I.memoperands().front()) != FI)
+ continue;
+ // If this block has a store to the stack slot before any loads then we
+ // can ignore the block
+ if (I.getOpcode() == ARM::MVE_VSTRWU32) {
+ LookAtSuccessors = false;
+ break;
+ }
+ // If the store and the load are using the same stack slot then the
+ // store isn't valid for tail predication
+ if (I.getOpcode() == ARM::MVE_VLDRWU32)
+ return false;
+ }
+
+ if (LookAtSuccessors) {
+ for (auto Succ : BB->successors()) {
+ if (!Visited.contains(Succ) && !is_contained(Frontier, Succ))
+ Frontier.push_back(Succ);
+ }
+ }
+ Visited.insert(BB);
+ Idx++;
+ }
+
+ return true;
+}
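The search above is a worklist walk over the loop's exit blocks and their successors: a path stops at the first block that overwrites the spill slot, and the spill is rejected if any path can reach a reload from the slot first. A standalone sketch with a toy CFG (types and names are illustrative, not LLVM's):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <set>
    #include <vector>

    enum class SlotAccess { None, StoreFirst, LoadFirst };

    struct Block {
      std::vector<int> Succs;
      SlotAccess Access = SlotAccess::None; // first access to the spill slot
    };

    // Returns true if no path out of the loop can reload the spill slot before
    // it has been overwritten, mirroring ValidateMVEStore's frontier walk.
    static bool spillOnlyUsedInLoop(const std::vector<Block> &CFG,
                                    std::vector<int> Frontier) {
      std::set<int> Visited;
      for (std::size_t Idx = 0; Idx < Frontier.size(); ++Idx) {
        const Block &BB = CFG[Frontier[Idx]];
        if (BB.Access == SlotAccess::LoadFirst)
          return false;                        // reload outside the loop: reject
        bool LookAtSuccessors = BB.Access != SlotAccess::StoreFirst;
        if (LookAtSuccessors)                  // a store shadows everything below
          for (int Succ : BB.Succs)
            if (!Visited.count(Succ) &&
                std::find(Frontier.begin(), Frontier.end(), Succ) == Frontier.end())
              Frontier.push_back(Succ);
        Visited.insert(Frontier[Idx]);
      }
      return true;
    }

    int main() {
      // Exit block 0 -> 1 (overwrites the slot) -> 2 (reloads it).
      std::vector<Block> CFG(3);
      CFG[0].Succs = {1};
      CFG[1].Succs = {2};
      CFG[1].Access = SlotAccess::StoreFirst;
      CFG[2].Access = SlotAccess::LoadFirst;
      assert(spillOnlyUsedInLoop(CFG, {0}));   // reload is shadowed by the store
      CFG[1].Access = SlotAccess::None;
      assert(!spillOnlyUsedInLoop(CFG, {0}));  // reload is now reachable
      return 0;
    }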
+
+bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
if (CannotTailPredicate)
return false;
@@ -1178,7 +1218,7 @@
// If the instruction is already explicitly predicated, then the conversion
// will be fine, but ensure that all store operations are predicated.
- if (MI->mayStore())
+ if (MI->mayStore() && !ValidateMVEStore(MI, &ML))
return IsUse;
// If this instruction defines the VPR, update the predicate for the
@@ -1235,16 +1275,15 @@
for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
Changed |= ProcessLoop(*I);
- LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n";
- if (auto *Preheader = ML->getLoopPreheader())
- dbgs() << " - " << Preheader->getName() << "\n";
- else if (auto *Preheader = MLI->findLoopPreheader(ML))
- dbgs() << " - " << Preheader->getName() << "\n";
- else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
- dbgs() << " - " << Preheader->getName() << "\n";
- for (auto *MBB : ML->getBlocks())
- dbgs() << " - " << MBB->getName() << "\n";
- );
+ LLVM_DEBUG({
+ dbgs() << "ARM Loops: Processing loop containing:\n";
+ if (auto *Preheader = ML->getLoopPreheader())
+ dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
+ else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true))
+ dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
+ for (auto *MBB : ML->getBlocks())
+ dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";
+ });
// Search the given block for a loop start instruction. If one isn't found,
// and there's only one predecessor block, search that one too.
@@ -1266,7 +1305,7 @@
if (LoLoop.Preheader)
LoLoop.Start = SearchForStart(LoLoop.Preheader);
else
- return false;
+ return Changed;
// Find the low-overhead loop components and decide whether or not to fall
// back to a normal loop. Also look for a vctp instructions and decide
@@ -1300,9 +1339,12 @@
LLVM_DEBUG(LoLoop.dump());
if (!LoLoop.FoundAllComponents()) {
LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
- return false;
+ return Changed;
}
+ assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
+ "Expected t2WhileLoopStart to be removed before regalloc!");
+
// Check that the only instruction using LoopDec is LoopEnd. This can only
// happen when the Dec and End are separate, not a single t2LoopEndDec.
// TODO: Check for copy chains that really have no effect.
@@ -1325,11 +1367,11 @@
// another low register.
void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
- MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
+ MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(*MI);
unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
ARM::tBcc : ARM::t2Bcc;
- RevertWhileLoopStart(MI, TII, BrOpc);
+ RevertWhileLoopStartLR(MI, TII, BrOpc);
}
void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
@@ -1426,8 +1468,7 @@
LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
- MachineInstr *Def =
- RDA->getMIOperand(LoLoop.Start, isDo(LoLoop.Start) ? 1 : 0);
+ MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 1);
if (!Def) {
LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
return;
@@ -1452,17 +1493,26 @@
unsigned Opc = LoLoop.getStartOpcode();
MachineOperand &Count = LoLoop.getLoopStartOperand();
- MachineInstrBuilder MIB =
- BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));
+ // We need not emit a 'DLS lr, lr'; LR already contains the count.
+ MachineInstr* NewStart;
+ if (Opc == ARM::t2DLS && Count.isReg() && Count.getReg() == ARM::LR) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr");
+ NewStart = nullptr;
+ } else {
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));
- MIB.addDef(ARM::LR);
- MIB.add(Count);
- if (!isDo(Start))
- MIB.add(Start->getOperand(1));
+ MIB.addDef(ARM::LR);
+ MIB.add(Count);
+ if (isWhileLoopStart(*Start))
+ MIB.addMBB(getWhileLoopStartTargetBB(*Start));
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
+ NewStart = &*MIB;
+ }
LoLoop.ToRemove.insert(Start);
- LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
- return &*MIB;
+ return NewStart;
}
void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
@@ -1640,7 +1690,7 @@
};
if (LoLoop.Revert) {
- if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart)
+ if (isWhileLoopStart(*LoLoop.Start))
RevertWhile(LoLoop.Start);
else
RevertDo(LoLoop.Start);
@@ -1650,7 +1700,8 @@
RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
} else {
LoLoop.Start = ExpandLoopStart(LoLoop);
- RemoveDeadBranch(LoLoop.Start);
+ if (LoLoop.Start)
+ RemoveDeadBranch(LoLoop.Start);
LoLoop.End = ExpandLoopEnd(LoLoop);
RemoveDeadBranch(LoLoop.End);
if (LoLoop.IsTailPredicationLegal())
@@ -1710,7 +1761,7 @@
Changed = true;
for (auto *Start : Starts) {
- if (Start->getOpcode() == ARM::t2WhileLoopStart)
+ if (isWhileLoopStart(*Start))
RevertWhile(Start);
else
RevertDo(Start);
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp
index f893faa..e4b0229 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -110,7 +110,7 @@
APFloat Val = MO.getFPImm()->getValueAPF();
bool ignored;
Val.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &ignored);
- MCOp = MCOperand::createFPImm(Val.convertToDouble());
+ MCOp = MCOperand::createDFPImm(bit_cast<uint64_t>(Val.convertToDouble()));
break;
}
case MachineOperand::MO_RegisterMask:
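The lowering now carries the FP immediate as its raw IEEE-754 bit pattern rather than as a double value. A tiny example of that bit-cast step, with std::bit_cast (C++20) standing in for llvm::bit_cast:

    #include <bit>
    #include <cassert>
    #include <cstdint>

    int main() {
      // 1.0 in IEEE-754 binary64 is 0x3FF0000000000000; bit_cast preserves
      // exactly those bits, unlike a value conversion to an integer type.
      static_assert(sizeof(double) == sizeof(uint64_t));
      uint64_t Bits = std::bit_cast<uint64_t>(1.0);
      assert(Bits == 0x3FF0000000000000ULL);
      return 0;
    }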
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/src/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
index 298c8a2..8516552 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -43,7 +43,9 @@
/// "attach" GPR-part to the part that was passed via stack.
unsigned StByValParamsPadding = 0;
- /// VarArgsRegSaveSize - Size of the register save area for vararg functions.
+ /// ArgRegsSaveSize - Size of the register save area for vararg functions or
+ /// those making guaranteed tail calls that need more stack argument space
+ /// than is provided by this function's incoming parameters.
///
unsigned ArgRegsSaveSize = 0;
@@ -118,6 +120,10 @@
/// being passed on the stack
unsigned ArgumentStackSize = 0;
+ /// ArgumentStackToRestore - the number of bytes of stack consumed that we
+ /// must restore on return.
+ unsigned ArgumentStackToRestore = 0;
+
/// CoalescedWeights - mapping of basic blocks to the rolling counter of
/// coalesced weights.
DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights;
@@ -195,6 +201,9 @@
unsigned getArgumentStackSize() const { return ArgumentStackSize; }
void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+ unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
+ void setArgumentStackToRestore(unsigned v) { ArgumentStackToRestore = v; }
+
void initPICLabelUId(unsigned UId) {
PICLabelUId = UId;
}
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp
index 9a7c1f5..46baf89 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -299,14 +299,6 @@
};
}
-template<typename MemInst>
-static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
- const DataLayout &DL, ScalarEvolution &SE) {
- if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE))
- return true;
- return false;
-}
-
bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
MemInstList &VecMem) {
if (!Ld0 || !Ld1)
@@ -414,7 +406,7 @@
if (Base == Offset || OffsetLoads.count(Offset))
continue;
- if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) &&
+ if (isConsecutiveAccess(Base, Offset, *DL, *SE) &&
SafeToPair(Base, Offset)) {
LoadPairs[Base] = Offset;
OffsetLoads.insert(Offset);
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td b/src/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td
index fe32433..b379882 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -83,7 +83,7 @@
def R6 : ARMReg< 6, "r6">, DwarfRegNum<[6]>;
def R7 : ARMReg< 7, "r7">, DwarfRegNum<[7]>;
// These require 32-bit instructions.
-let CostPerUse = 1 in {
+let CostPerUse = [1] in {
def R8 : ARMReg< 8, "r8">, DwarfRegNum<[8]>;
def R9 : ARMReg< 9, "r9">, DwarfRegNum<[9]>;
def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMSLSHardening.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMSLSHardening.cpp
index cfcc7d5..332acb4 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMSLSHardening.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMSLSHardening.cpp
@@ -165,12 +165,16 @@
struct SLSBLRThunkInserter : ThunkInserter<SLSBLRThunkInserter> {
const char *getThunkPrefix() { return SLSBLRNamePrefix; }
bool mayUseThunk(const MachineFunction &MF) {
+ ComdatThunks &= !MF.getSubtarget<ARMSubtarget>().hardenSlsNoComdat();
// FIXME: This could also check if there are any indirect calls in the
// function to more accurately reflect if a thunk will be needed.
return MF.getSubtarget<ARMSubtarget>().hardenSlsBlr();
}
void insertThunks(MachineModuleInfo &MMI);
void populateThunk(MachineFunction &MF);
+
+private:
+ bool ComdatThunks = true;
};
} // namespace
@@ -179,7 +183,7 @@
// based on which registers are actually used in indirect calls in this
// function. But would that be a worthwhile optimization?
for (auto T : SLSBLRThunks)
- createThunkFunction(MMI, T.Name);
+ createThunkFunction(MMI, T.Name, ComdatThunks);
}
void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) {
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMScheduleM7.td b/src/llvm-project/llvm/lib/Target/ARM/ARMScheduleM7.td
index 12296ad..25bc840 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMScheduleM7.td
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMScheduleM7.td
@@ -19,6 +19,8 @@
let CompleteModel = 0;
}
+let SchedModel = CortexM7Model in {
+
//===--------------------------------------------------------------------===//
// The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP
// pipe. The stages relevant to scheduling are as follows:
@@ -33,7 +35,9 @@
// for scheduling, so simple ALU operations executing in EX2 will have
// ReadAdvance<0> (the default) for their source operands and Latency = 1.
-def M7UnitLoad : ProcResource<2> { let BufferSize = 0; }
+def M7UnitLoadL : ProcResource<1> { let BufferSize = 0; }
+def M7UnitLoadH : ProcResource<1> { let BufferSize = 0; }
+def M7UnitLoad : ProcResGroup<[M7UnitLoadL,M7UnitLoadH]> { let BufferSize = 0; }
def M7UnitStore : ProcResource<1> { let BufferSize = 0; }
def M7UnitALU : ProcResource<2>;
def M7UnitShift1 : ProcResource<1> { let BufferSize = 0; }
@@ -41,14 +45,14 @@
def M7UnitMAC : ProcResource<1> { let BufferSize = 0; }
def M7UnitBranch : ProcResource<1> { let BufferSize = 0; }
def M7UnitVFP : ProcResource<1> { let BufferSize = 0; }
-def M7UnitVPort : ProcResource<2> { let BufferSize = 0; }
+def M7UnitVPortL : ProcResource<1> { let BufferSize = 0; }
+def M7UnitVPortH : ProcResource<1> { let BufferSize = 0; }
+def M7UnitVPort : ProcResGroup<[M7UnitVPortL,M7UnitVPortH]> { let BufferSize = 0; }
def M7UnitSIMD : ProcResource<1> { let BufferSize = 0; }
//===---------------------------------------------------------------------===//
// Subtarget-specific SchedWrite types which map ProcResources and set latency.
-let SchedModel = CortexM7Model in {
-
def : WriteRes<WriteALU, [M7UnitALU]> { let Latency = 1; }
// Basic ALU with shifts.
@@ -105,39 +109,42 @@
// Floating point conversions.
def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
def : WriteRes<WriteFPMOV, [M7UnitVPort]> { let Latency = 3; }
+def M7WriteFPMOV64 : SchedWriteRes<[M7UnitVPortL, M7UnitVPortH]> {
+ let Latency = 3;
+}
// The FP pipeline has a latency of 3 cycles.
// ALU operations (32/64-bit). These go down the FP pipeline.
def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
-def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
let Latency = 4;
let BeginGroup = 1;
}
// Multiplication
def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
-def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
let Latency = 7;
let BeginGroup = 1;
}
// Multiply-accumulate. FPMAC goes down the FP Pipeline.
def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]> { let Latency = 6; }
-def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
let Latency = 11;
let BeginGroup = 1;
}
// Division. Effective scheduling latency is 3, though real latency is larger
def : WriteRes<WriteFPDIV32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
-def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
let Latency = 30;
let BeginGroup = 1;
}
// Square-root. Effective scheduling latency is 3; real latency is larger
def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
-def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
let Latency = 30;
let BeginGroup = 1;
}
@@ -283,12 +290,12 @@
// VFP loads and stores
def M7LoadSP : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { let Latency = 1; }
-def M7LoadDP : SchedWriteRes<[M7UnitLoad, M7UnitVPort, M7UnitVPort]> {
+def M7LoadDP : SchedWriteRes<[M7UnitLoadL, M7UnitLoadH, M7UnitVPortL, M7UnitVPortH]> {
let Latency = 2;
let SingleIssue = 1;
}
def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>;
-def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPort, M7UnitVPort]> {
+def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPortL, M7UnitVPortH]> {
let SingleIssue = 1;
}
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 7e06229..12d4ad8 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -11,12 +11,27 @@
//===----------------------------------------------------------------------===//
#include "ARMTargetMachine.h"
+#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "arm-selectiondag-info"
+cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
+ "arm-memtransfer-tploop", cl::Hidden,
+ cl::desc("Control conversion of memcpy to "
+ "Tail predicated loops (WLSTP)"),
+ cl::init(TPLoop::ForceDisabled),
+ cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
+ "Don't convert memcpy to TP loop."),
+ clEnumValN(TPLoop::ForceEnabled, "force-enabled",
+ "Always convert memcpy to TP loop."),
+ clEnumValN(TPLoop::Allow, "allow",
+ "Allow (may be subject to certain conditions) "
+ "conversion of memcpy to TP loop.")));
+
// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
@@ -124,19 +139,52 @@
return CallResult.second;
}
+static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
+ const SelectionDAG &DAG,
+ ConstantSDNode *ConstantSize,
+ Align Alignment, bool IsMemcpy) {
+ auto &F = DAG.getMachineFunction().getFunction();
+ if (!EnableMemtransferTPLoop)
+ return false;
+ if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
+ return true;
+ // Do not generate inline TP loop if optimizations are disabled,
+ // or if optimization for size (-Os or -Oz) is on.
+ if (F.hasOptNone() || F.hasOptSize())
+ return false;
+ // If the CLI option is unset, always generate an inline TP loop for memset;
+ // for memcpy, check the conditions below.
+ if (!IsMemcpy)
+ return true;
+ if (!ConstantSize && Alignment >= Align(4))
+ return true;
+ if (ConstantSize &&
+ ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
+ ConstantSize->getZExtValue() <
+ Subtarget.getMaxMemcpyTPInlineSizeThreshold())
+ return true;
+ return false;
+}
+
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+ if (Subtarget.hasMVEIntegerOps() &&
+ shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
+ return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
+ DAG.getZExtOrTrunc(Size, dl, MVT::i32));
+
// Do repeated 4-byte loads and stores. To be improved.
// This requires 4-byte alignment.
if (Alignment < Align(4))
return SDValue();
// This requires the copy size to be a constant, preferably
// within a subtarget-specific limit.
- ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (!ConstantSize)
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
Alignment.value(), RTLIB::MEMCPY);
@@ -250,6 +298,22 @@
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
+
+ const ARMSubtarget &Subtarget =
+ DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
+
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+ // Generate TP loop for llvm.memset
+ if (Subtarget.hasMVEIntegerOps() &&
+ shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
+ false)) {
+ Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
+ DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
+ return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
+ DAG.getZExtOrTrunc(Size, dl, MVT::i32));
+ }
+
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
Alignment.value(), RTLIB::MEMSET);
}
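The hunk above gates the new tail-predicated (WLSTP) lowering on the -arm-memtransfer-tploop setting and, for a constant-size memcpy, on a size window between the subtarget's plain-inline threshold and the new TP-loop threshold. Below is a minimal standalone sketch of that window, assuming the default values visible later in this patch (64 and 128 bytes); it is illustrative only and not part of the patch.

#include <cstdint>

// Thresholds as returned by ARMSubtarget::getMaxInlineSizeThreshold() (64) and
// the new ARMSubtarget::getMaxMemcpyTPInlineSizeThreshold() (128) in this patch.
constexpr uint64_t MaxInlineSize = 64;
constexpr uint64_t MaxMemcpyTPInlineSize = 128;

// True when a constant-size memcpy falls into the "emit an MVE TP loop" window
// under -arm-memtransfer-tploop=allow: too big for plain inlined load/store
// expansion, but still small enough to avoid a libcall.
bool constantSizeWantsTPLoop(uint64_t Size) {
  return Size > MaxInlineSize && Size < MaxMemcpyTPInlineSize;
}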
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 5cb608b..90f1b69 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -230,7 +230,7 @@
// registers are the 4 used for parameters. We don't currently do this
// case.
- SupportsTailCall = !isThumb() || hasV8MBaselineOps();
+ SupportsTailCall = !isThumb1Only() || hasV8MBaselineOps();
if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0))
SupportsTailCall = false;
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h b/src/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h
index fd9b94f..a8a9ae6 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -20,6 +20,7 @@
#include "ARMISelLowering.h"
#include "ARMSelectionDAGInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
@@ -117,6 +118,7 @@
ARMv84a,
ARMv85a,
ARMv86a,
+ ARMv87a,
ARMv8a,
ARMv8mBaseline,
ARMv8mMainline,
@@ -473,6 +475,9 @@
/// Harden against Straight Line Speculation for indirect calls.
bool HardenSlsBlr = false;
+ /// Generate thunk code for SLS mitigation in the normal text section.
+ bool HardenSlsNoComdat = false;
+
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
Align stackAlignment = Align(4);
@@ -536,6 +541,12 @@
return 64;
}
+ /// getMaxMemcpyTPInlineSizeThreshold - Returns the maximum size
+ /// that still makes it profitable to inline a llvm.memcpy as a Tail
+ /// Predicated loop.
+ /// This threshold should only be used for constant size inputs.
+ unsigned getMaxMemcpyTPInlineSizeThreshold() const { return 128; }
+
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
@@ -809,8 +820,10 @@
return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9;
}
- bool useR7AsFramePointer() const {
- return isTargetDarwin() || (!isTargetWindows() && isThumb());
+ MCPhysReg getFramePointerReg() const {
+ if (isTargetDarwin() || (!isTargetWindows() && isThumb()))
+ return ARM::R7;
+ return ARM::R11;
}
/// Returns true if the frame setup is split into two separate pushes (first
@@ -818,7 +831,7 @@
/// to lr. This is always required on Thumb1-only targets, as the push and
/// pop instructions can't access the high registers.
bool splitFramePushPop(const MachineFunction &MF) const {
- return (useR7AsFramePointer() &&
+ return (getFramePointerReg() == ARM::R7 &&
MF.getTarget().Options.DisableFramePointerElim(MF)) ||
isThumb1Only();
}
@@ -910,7 +923,12 @@
unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; }
- unsigned getMVEVectorCostFactor() const { return MVEVectorCostFactor; }
+ unsigned
+ getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const {
+ if (CostKind == TargetTransformInfo::TCK_CodeSize)
+ return 1;
+ return MVEVectorCostFactor;
+ }
bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
unsigned PhysReg) const override;
@@ -918,6 +936,7 @@
bool hardenSlsRetBr() const { return HardenSlsRetBr; }
bool hardenSlsBlr() const { return HardenSlsBlr; }
+ bool hardenSlsNoComdat() const { return HardenSlsNoComdat; }
};
} // end namespace llvm
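getMVEVectorCostFactor now takes the cost kind so that code-size queries are not inflated by the MVE "beats" factor. A rough sketch of the intended call pattern follows; the concrete factor is left as a parameter because its value is a subtarget detail not shown in this hunk.

// Hypothetical helper mirroring how the TTI hooks later in this patch consume
// the factor: throughput-style costs are scaled, code-size costs are not.
unsigned scaleMVEVectorCost(unsigned BaseCost, bool IsCodeSizeQuery,
                            unsigned MVEVectorCostFactor) {
  unsigned Factor = IsCodeSizeQuery ? 1 : MVEVectorCostFactor;
  return BaseCost * Factor;
}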
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 237ef54..ae7ea7c 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -47,6 +47,7 @@
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/CFGuard.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
#include <cassert>
#include <memory>
@@ -96,12 +97,13 @@
initializeARMExpandPseudoPass(Registry);
initializeThumb2SizeReducePass(Registry);
initializeMVEVPTBlockPass(Registry);
- initializeMVEVPTOptimisationsPass(Registry);
+ initializeMVETPAndVPTOptimisationsPass(Registry);
initializeMVETailPredicationPass(Registry);
initializeARMLowOverheadLoopsPass(Registry);
initializeARMBlockPlacementPass(Registry);
initializeMVEGatherScatterLoweringPass(Registry);
initializeARMSLSHardeningPass(Registry);
+ initializeMVELaneInterleavingPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -273,8 +275,7 @@
// function before we can generate a subtarget. We also need to use
// it as a key for the subtarget since that can be the only difference
// between two functions.
- bool SoftFloat =
- F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ bool SoftFloat = F.getFnAttribute("use-soft-float").getValueAsBool();
// If the soft float attribute is set on the function, turn on the soft float
// subtarget feature.
if (SoftFloat)
@@ -416,6 +417,7 @@
}));
addPass(createMVEGatherScatterLoweringPass());
+ addPass(createMVELaneInterleavingPass());
TargetPassConfig::addIRPasses();
@@ -461,6 +463,14 @@
if (TM->getOptLevel() != CodeGenOpt::None) {
addPass(createHardwareLoopsPass());
addPass(createMVETailPredicationPass());
+ // FIXME: IR passes can delete address-taken basic blocks, deleting
+ // corresponding blockaddresses. ARMConstantPoolConstant holds references to
+ // address-taken basic blocks which can be invalidated if the function
+ // containing the blockaddress has already been codegen'd and the basic
+ // block is removed. Work around this by forcing all IR passes to run before
+ // any ISel takes place. We should have a more principled way of handling
+ // this. See D99707 for more details.
+ addPass(createBarrierNoopPass());
}
return false;
@@ -487,13 +497,13 @@
}
bool ARMPassConfig::addGlobalInstructionSelect() {
- addPass(new InstructionSelect());
+ addPass(new InstructionSelect(getOptLevel()));
return false;
}
void ARMPassConfig::addPreRegAlloc() {
if (getOptLevel() != CodeGenOpt::None) {
- addPass(createMVEVPTOptimisationsPass());
+ addPass(createMVETPAndVPTOptimisationsPass());
addPass(createMLxExpansionPass());
@@ -564,7 +574,10 @@
addPass(createARMConstantIslandPass());
addPass(createARMLowOverheadLoopsPass());
- // Identify valid longjmp targets for Windows Control Flow Guard.
- if (TM->getTargetTriple().isOSWindows())
+ if (TM->getTargetTriple().isOSWindows()) {
+ // Identify valid longjmp targets for Windows Control Flow Guard.
addPass(createCFGuardLongjmpPass());
+ // Identify valid eh continuation targets for Windows EHCont Guard.
+ addPass(createEHContGuardCatchretPass());
+ }
}
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index 3f0e336..b03bff9 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -49,7 +49,8 @@
// Since we cannot modify flags for an existing section, we create a new
// section with the right flags, and use 0 as the unique ID for
// execute-only text
- TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U, nullptr);
+ TextSection =
+ Ctx.getELFSection(".text", Type, Flags, 0, "", false, 0U, nullptr);
}
}
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/src/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 8901934..cf7456e 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -100,18 +100,20 @@
return MatchExact && MatchSubset;
}
-bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const {
- if (L->getHeader()->getParent()->hasOptSize())
- return false;
+TTI::AddressingModeKind
+ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
+ ScalarEvolution *SE) const {
if (ST->hasMVEIntegerOps())
- return false;
- return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
-}
+ return TTI::AMK_PostIndexed;
-bool ARMTTIImpl::shouldFavorPostInc() const {
- if (ST->hasMVEIntegerOps())
- return true;
- return false;
+ if (L->getHeader()->getParent()->hasOptSize())
+ return TTI::AMK_None;
+
+ if (ST->isMClass() && ST->isThumb2() &&
+ L->getNumBlocks() == 1)
+ return TTI::AMK_PreIndexed;
+
+ return TTI::AMK_None;
}
Optional<Instruction *>
@@ -199,7 +201,7 @@
Type *IntTy32 = Type::getInt32Ty(II.getContext());
Metadata *M[] = {
ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
- ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))};
+ ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
return &II;
}
@@ -246,8 +248,8 @@
return None;
}
-int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned Bits = Ty->getPrimitiveSizeInBits();
@@ -281,8 +283,8 @@
// Constants smaller than 256 fit in the immediate field of
// Thumb1 instructions so we return a zero cost and 1 otherwise.
-int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
return 0;
@@ -322,10 +324,10 @@
return false;
}
-int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind,
- Instruction *Inst) {
+InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
// Division by a constant can be turned into multiplication, but only if we
// know it's constant. So it's not so much that the immediate is cheap (it's
// not), but that the alternative is worse.
@@ -335,6 +337,11 @@
Idx == 1)
return 0;
+ // Leave any gep offsets for CodeGenPrepare, which will do a better job at
+ // splitting any large offsets.
+ if (Opcode == Instruction::GetElementPtr && Idx != 0)
+ return 0;
+
if (Opcode == Instruction::And) {
// UXTB/UXTH
if (Imm == 255 || Imm == 65535)
@@ -377,7 +384,9 @@
return getIntImmCost(Imm, Ty, CostKind);
}
-int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (CostKind == TTI::TCK_RecipThroughput &&
(ST->hasNEON() || ST->hasMVEIntegerOps())) {
// FIXME: The vectorizer is highly sensitive to the cost of these
@@ -386,18 +395,19 @@
// vector targets.
return 0;
}
- return BaseT::getCFInstrCost(Opcode, CostKind);
+ return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
-int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
// TODO: Allow non-throughput costs that aren't binary.
- auto AdjustCost = [&CostKind](int Cost) {
+ auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
if (CostKind != TTI::TCK_RecipThroughput)
return Cost == 0 ? 0 : 1;
return Cost;
@@ -426,7 +436,8 @@
(Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
- return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor();
+ return 2 * DstTy.getVectorNumElements() *
+ ST->getMVEVectorCostFactor(CostKind);
// The extend of other kinds of load is free
if (CCH == TTI::CastContextHint::Normal ||
@@ -470,7 +481,7 @@
if (const auto *Entry =
ConvertCostTableLookup(MVELoadConversionTbl, ISD,
DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
- return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
}
static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
@@ -482,7 +493,7 @@
if (const auto *Entry =
ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
- return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
}
// The truncate of a store is free. This is the mirror of extends above.
@@ -499,7 +510,7 @@
if (const auto *Entry =
ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
- return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
}
static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
@@ -510,7 +521,7 @@
if (const auto *Entry =
ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
- return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
}
}
@@ -553,7 +564,7 @@
{ISD::FP_EXTEND, MVT::v2f32, 2},
{ISD::FP_EXTEND, MVT::v4f32, 4}};
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
return AdjustCost(LT.first * Entry->Cost);
}
@@ -734,14 +745,15 @@
if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
ISD, DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
}
if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
// As a general rule, fp converts that were not matched above are scalarized
// and cost 1 vcvt for each lane, so long as the instruction is available.
// If not it will become a series of function calls.
- const int CallCost = getCallInstrCost(nullptr, Dst, {Src}, CostKind);
+ const InstructionCost CallCost =
+ getCallInstrCost(nullptr, Dst, {Src}, CostKind);
int Lanes = 1;
if (SrcTy.isFixedLengthVector())
Lanes = SrcTy.getVectorNumElements();
@@ -784,14 +796,14 @@
}
int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
- ? ST->getMVEVectorCostFactor()
+ ? ST->getMVEVectorCostFactor(CostKind)
: 1;
return AdjustCost(
BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
-int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
- unsigned Index) {
+InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index) {
// Penalize inserting into a D-subregister. We end up with a three times
// lower estimated throughput on Swift.
if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
@@ -809,27 +821,28 @@
// of NEON and VFP code and should therefore be penalized.
if (ValTy->isVectorTy() &&
ValTy->getScalarSizeInBits() <= 32)
- return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
+ return std::max<InstructionCost>(
+ BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
}
if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
Opcode == Instruction::ExtractElement)) {
- // We say MVE moves costs at least the MVEVectorCostFactor, even though
- // they are scalar instructions. This helps prevent mixing scalar and
- // vector, to prevent vectorising where we end up just scalarising the
- // result anyway.
- return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
- ST->getMVEVectorCostFactor()) *
- cast<FixedVectorType>(ValTy)->getNumElements() / 2;
+ // Integer cross-lane moves are more expensive than float, which can
+ // sometimes just be vmovs. Integer moves involve being passed to GPR
+ // registers, causing more of a delay.
+ std::pair<InstructionCost, MVT> LT =
+ getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
+ return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
}
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
-int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// Thumb scalar code size cost for select.
@@ -843,7 +856,7 @@
// - may require one or more conditional mov (including an IT),
// - can't operate directly on immediates,
// - require live flags, which we can't copy around easily.
- int Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
+ InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
// Possible IT instruction for Thumb2, or more for Thumb1.
++Cost;
@@ -856,6 +869,52 @@
return Cost;
}
+ // If this is a vector min/max/abs, use the cost of that intrinsic directly
+ // instead. Hopefully when min/max intrinsics are more prevalent this code
+ // will not be needed.
+ const Instruction *Sel = I;
+ if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
+ Sel->hasOneUse())
+ Sel = cast<Instruction>(Sel->user_back());
+ if (Sel && ValTy->isVectorTy() &&
+ (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
+ const Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
+ unsigned IID = 0;
+ switch (SPF) {
+ case SPF_ABS:
+ IID = Intrinsic::abs;
+ break;
+ case SPF_SMIN:
+ IID = Intrinsic::smin;
+ break;
+ case SPF_SMAX:
+ IID = Intrinsic::smax;
+ break;
+ case SPF_UMIN:
+ IID = Intrinsic::umin;
+ break;
+ case SPF_UMAX:
+ IID = Intrinsic::umax;
+ break;
+ case SPF_FMINNUM:
+ IID = Intrinsic::minnum;
+ break;
+ case SPF_FMAXNUM:
+ IID = Intrinsic::maxnum;
+ break;
+ default:
+ break;
+ }
+ if (IID) {
+ // The ICmp is free; the select gets the cost of the min/max/etc.
+ if (Sel != I)
+ return 0;
+ IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
+ return getIntrinsicInstrCost(CostAttrs, CostKind);
+ }
+ }
+
// On NEON a vector select gets lowered to vbsl.
if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
// Lowering of some vector selects is currently far from perfect.
@@ -874,23 +933,60 @@
return Entry->Cost;
}
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ std::pair<InstructionCost, MVT> LT =
+ TLI->getTypeLegalizationCost(DL, ValTy);
return LT.first;
}
+ if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
+ (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+ cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
+ FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
+ FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
+ if (!VecCondTy)
+ VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
+
+ // If we don't have mve.fp, any fp operations will need to be scalarized.
+ if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
+ // One scalarization insert, one scalarization extract and the cost of the
+ // fcmps.
+ return BaseT::getScalarizationOverhead(VecValTy, false, true) +
+ BaseT::getScalarizationOverhead(VecCondTy, true, false) +
+ VecValTy->getNumElements() *
+ getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
+ VecCondTy->getScalarType(), VecPred, CostKind,
+ I);
+ }
+
+ std::pair<InstructionCost, MVT> LT =
+ TLI->getTypeLegalizationCost(DL, ValTy);
+ int BaseCost = ST->getMVEVectorCostFactor(CostKind);
+ // There are two types - the input that specifies the type of the compare
+ // and the output vXi1 type. Because we don't know how the output will be
+ // split, we may need an expensive shuffle to get the two in sync. This has
+ // the effect of making larger-than-legal compares (v8i32 for example)
+ // expensive.
+ if (LT.second.getVectorNumElements() > 2) {
+ if (LT.first > 1)
+ return LT.first * BaseCost +
+ BaseT::getScalarizationOverhead(VecCondTy, true, false);
+ return BaseCost;
+ }
+ }
+
// Default to cheap (throughput/size of 1 instruction) but adjust throughput
// for "multiple beats" potentially needed by MVE instructions.
int BaseCost = 1;
- if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() &&
- ValTy->isVectorTy())
- BaseCost = ST->getMVEVectorCostFactor();
+ if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
+ BaseCost = ST->getMVEVectorCostFactor(CostKind);
return BaseCost *
BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
-int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
- const SCEV *Ptr) {
+InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
+ ScalarEvolution *SE,
+ const SCEV *Ptr) {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -1037,7 +1133,7 @@
return -1;
}
-int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
+InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
// To model the cost of a library call, we assume 1 for the call, and
@@ -1047,8 +1143,10 @@
return NumOps;
}
-int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
- int Index, VectorType *SubTp) {
+InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
+ VectorType *Tp, ArrayRef<int> Mask,
+ int Index, VectorType *SubTp) {
+ Kind = improveShuffleKindFromMask(Kind, Mask);
if (ST->hasNEON()) {
if (Kind == TTI::SK_Broadcast) {
static const CostTblEntry NEONDupTbl[] = {
@@ -1065,8 +1163,7 @@
{ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
-
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
if (const auto *Entry =
CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
return LT.first * Entry->Cost;
@@ -1087,8 +1184,7 @@
{ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
-
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
if (const auto *Entry =
CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
return LT.first * Entry->Cost;
@@ -1112,7 +1208,7 @@
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
ISD::VECTOR_SHUFFLE, LT.second))
return LT.first * Entry->Cost;
@@ -1128,27 +1224,34 @@
{ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
{ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
-
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
LT.second))
- return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
+ return LT.first * Entry->Cost *
+ ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
+ }
+
+ if (!Mask.empty()) {
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ if (Mask.size() <= LT.second.getVectorNumElements() &&
+ (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
+ isVREVMask(Mask, LT.second, 64)))
+ return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
}
}
+
int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
- ? ST->getMVEVectorCostFactor()
+ ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
: 1;
- return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
-int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind,
- TTI::OperandValueKind Op1Info,
- TTI::OperandValueKind Op2Info,
- TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo,
- ArrayRef<const Value *> Args,
- const Instruction *CxtI) {
+InstructionCost ARMTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
// Make operations on i1 relatively expensive as this often involves
@@ -1165,7 +1268,7 @@
}
}
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
if (ST->hasNEON()) {
const unsigned FunctionCallDivCost = 20;
@@ -1214,9 +1317,8 @@
if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
return LT.first * Entry->Cost;
- int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
- Op2Info,
- Opd1PropInfo, Opd2PropInfo);
+ InstructionCost Cost = BaseT::getArithmeticInstrCost(
+ Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
// This is somewhat of a hack. The problem that we are facing is that SROA
// creates a sequence of shift, and, or instructions to construct values.
@@ -1262,9 +1364,8 @@
// Default to cheap (throughput/size of 1 instruction) but adjust throughput
// for "multiple beats" potentially needed by MVE instructions.
int BaseCost = 1;
- if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() &&
- Ty->isVectorTy())
- BaseCost = ST->getMVEVectorCostFactor();
+ if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
+ BaseCost = ST->getMVEVectorCostFactor(CostKind);
// The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
// without treating floats as more expensive than scalars or increasing the
@@ -1276,20 +1377,22 @@
// Else this is expand, assume that we need to scalarize this op.
if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
unsigned Num = VTy->getNumElements();
- unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType(),
- CostKind);
+ InstructionCost Cost =
+ getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
// Return the cost of multiple scalar invocations plus the cost of
// inserting and extracting the values.
- return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost;
+ SmallVector<Type *> Tys(Args.size(), Ty);
+ return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
}
return BaseCost;
}
-int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
- MaybeAlign Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
return 1;
@@ -1304,7 +1407,7 @@
cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
// Unaligned loads/stores are extremely inefficient.
// We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
return LT.first * 4;
}
@@ -1321,25 +1424,25 @@
: cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
DstTy->getScalarType()->isFloatTy())
- return ST->getMVEVectorCostFactor();
+ return ST->getMVEVectorCostFactor(CostKind);
}
int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
- ? ST->getMVEVectorCostFactor()
+ ? ST->getMVEVectorCostFactor(CostKind)
: 1;
return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
CostKind, I);
}
-unsigned ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
- Align Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
if (ST->hasMVEIntegerOps()) {
if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
- return ST->getMVEVectorCostFactor();
+ return ST->getMVEVectorCostFactor(CostKind);
if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
- return ST->getMVEVectorCostFactor();
+ return ST->getMVEVectorCostFactor(CostKind);
}
if (!isa<FixedVectorType>(Src))
return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
@@ -1349,7 +1452,7 @@
return cast<FixedVectorType>(Src)->getNumElements() * 8;
}
-int ARMTTIImpl::getInterleavedMemoryOpCost(
+InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) {
@@ -1368,9 +1471,10 @@
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
// matched to more than one vldN/vstN instruction.
- int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
+ int BaseCost =
+ ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
if (NumElts % Factor == 0 &&
- TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
+ TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
// Some smaller than legal interleaved patterns are cheap as we can make
@@ -1389,11 +1493,9 @@
UseMaskForCond, UseMaskForGaps);
}
-unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
- const Value *Ptr, bool VariableMask,
- Align Alignment,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost ARMTTIImpl::getGatherScatterOpCost(
+ unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
+ Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
using namespace PatternMatch;
if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
@@ -1406,18 +1508,20 @@
unsigned NumElems = VTy->getNumElements();
unsigned EltSize = VTy->getScalarSizeInBits();
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
// For now, it is assumed that for the MVE gather instructions the loads are
// all effectively serialised. This means the cost is the scalar cost
// multiplied by the number of elements being loaded. This is possibly very
// conservative, but even so we still end up vectorising loops because the
// cost per iteration for many loops is lower than for scalar loops.
- unsigned VectorCost = NumElems * LT.first * ST->getMVEVectorCostFactor();
+ InstructionCost VectorCost =
+ NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
// The scalarization cost should be a lot higher. We use the number of vector
// elements plus the scalarization overhead.
- unsigned ScalarCost =
- NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {});
+ InstructionCost ScalarCost =
+ NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
+ BaseT::getScalarizationOverhead(VTy, false, true);
if (EltSize < 8 || Alignment < EltSize / 8)
return ScalarCost;
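The comment above models MVE gathers as fully serialised loads, so the vector cost scales with the lane count while the scalar alternative pays per-element loads plus insert/extract overhead. A worked version of that comparison, under the simplifying assumption that the type legalises in one step (LT.first == 1); the scalarization overhead and the MVE factor are left as parameters because their values come from TTI and the subtarget.

// Sketch of the two costs compared above for a gather/scatter of NumElems lanes.
unsigned serialisedVectorCost(unsigned NumElems, unsigned MVEFactor) {
  return NumElems * MVEFactor; // NumElems * LT.first * factor, with LT.first == 1
}
unsigned scalarisedCost(unsigned NumElems, unsigned ScalarizationOverhead) {
  return NumElems + ScalarizationOverhead; // NumElems * LT.first + overhead
}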
@@ -1488,16 +1592,19 @@
return ScalarCost;
}
-int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
- bool IsPairwiseForm,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind) {
+ if (TTI::requiresOrderedReduction(FMF))
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
+
EVT ValVT = TLI->getValueType(DL, ValTy);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
- CostKind);
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
static const CostTblEntry CostTblAdd[]{
{ISD::ADD, MVT::v16i8, 1},
@@ -1505,10 +1612,9 @@
{ISD::ADD, MVT::v4i32, 1},
};
if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
- return Entry->Cost * ST->getMVEVectorCostFactor() * LT.first;
+ return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
- CostKind);
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}
InstructionCost
@@ -1518,20 +1624,22 @@
EVT ValVT = TLI->getValueType(DL, ValTy);
EVT ResVT = TLI->getValueType(DL, ResTy);
if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ std::pair<InstructionCost, MVT> LT =
+ TLI->getTypeLegalizationCost(DL, ValTy);
if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
(LT.second == MVT::v8i16 &&
ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
(LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
- return ST->getMVEVectorCostFactor() * LT.first;
+ return ST->getMVEVectorCostFactor(CostKind) * LT.first;
}
return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
CostKind);
}
-int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
switch (ICA.getID()) {
case Intrinsic::get_active_lane_mask:
// Currently we make a somewhat optimistic assumption that
@@ -1550,25 +1658,44 @@
case Intrinsic::usub_sat: {
if (!ST->hasMVEIntegerOps())
break;
- // Get the Return type, either directly of from ICA.ReturnType and ICA.VF.
Type *VT = ICA.getReturnType();
- if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar())
- VT = VectorType::get(VT, ICA.getVectorFactor());
- std::pair<int, MVT> LT =
- TLI->getTypeLegalizationCost(DL, VT);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
LT.second == MVT::v16i8) {
- // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
+ // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
// need to extend the type, as it uses shr(qadd(shl, shl)).
- unsigned Instrs = LT.second.getScalarSizeInBits() ==
- ICA.getReturnType()->getScalarSizeInBits()
- ? 1
- : 4;
- return LT.first * ST->getMVEVectorCostFactor() * Instrs;
+ unsigned Instrs =
+ LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
+ return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
}
break;
}
+ case Intrinsic::abs:
+ case Intrinsic::smin:
+ case Intrinsic::smax:
+ case Intrinsic::umin:
+ case Intrinsic::umax: {
+ if (!ST->hasMVEIntegerOps())
+ break;
+ Type *VT = ICA.getReturnType();
+
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
+ if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
+ LT.second == MVT::v16i8)
+ return LT.first * ST->getMVEVectorCostFactor(CostKind);
+ break;
+ }
+ case Intrinsic::minnum:
+ case Intrinsic::maxnum: {
+ if (!ST->hasMVEFloatOps())
+ break;
+ Type *VT = ICA.getReturnType();
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
+ if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
+ return LT.first * ST->getMVEVectorCostFactor(CostKind);
+ break;
+ }
}
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
@@ -1763,7 +1890,7 @@
default:
break;
case Intrinsic::start_loop_iterations:
- case Intrinsic::test_set_loop_iterations:
+ case Intrinsic::test_start_loop_iterations:
case Intrinsic::loop_decrement:
case Intrinsic::loop_decrement_reg:
return true;
@@ -2009,6 +2136,10 @@
}
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
+ // Enable upper-bound unrolling universally, not dependent upon the conditions
+ // below.
+ UP.UpperBound = true;
+
// Only currently enable these preferences for M-Class cores.
if (!ST->isMClass())
return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
@@ -2019,10 +2150,6 @@
if (L->getHeader()->getParent()->hasOptSize())
return;
- // Only enable on Thumb-2 targets.
- if (!ST->isThumb2())
- return;
-
SmallVector<BasicBlock*, 4> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
LLVM_DEBUG(dbgs() << "Loop has:\n"
@@ -2045,7 +2172,7 @@
// Scan the loop: don't unroll loops with calls as this could prevent
// inlining.
- unsigned Cost = 0;
+ InstructionCost Cost = 0;
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
// Don't unroll vectorised loops. MVE does not benefit from it as much as
@@ -2067,13 +2194,39 @@
}
}
+ // On v6m cores, there are very few registers available. We can easily end up
+ // spilling and reloading more registers in an unrolled loop. Look at the
+ // number of LCSSA phis as a rough measure of how many registers will need to
+ // be live out of the loop, reducing the default unroll count if more than 1
+ // value is needed. In the long run, all of this should be learnt by a
+ // machine.
+ unsigned UnrollCount = 4;
+ if (ST->isThumb1Only()) {
+ unsigned ExitingValues = 0;
+ SmallVector<BasicBlock *, 4> ExitBlocks;
+ L->getExitBlocks(ExitBlocks);
+ for (auto *Exit : ExitBlocks) {
+ // Count the number of LCSSA phis. Exclude values coming from GEPs, as
+ // only the last is expected to be needed for address operands.
+ unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
+ return PH.getNumOperands() != 1 ||
+ !isa<GetElementPtrInst>(PH.getOperand(0));
+ });
+ ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
+ }
+ if (ExitingValues)
+ UnrollCount /= ExitingValues;
+ if (UnrollCount <= 1)
+ return;
+ }
+
LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
UP.Partial = true;
UP.Runtime = true;
- UP.UpperBound = true;
UP.UnrollRemainder = true;
- UP.DefaultUnrollRuntimeCount = 4;
+ UP.DefaultUnrollRuntimeCount = UnrollCount;
UP.UnrollAndJam = true;
UP.UnrollAndJamInnerLoopThreshold = 60;
@@ -2088,11 +2241,6 @@
BaseT::getPeelingPreferences(L, SE, PP);
}
-bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
- TTI::ReductionFlags Flags) const {
- return ST->hasMVEIntegerOps();
-}
-
bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
if (!ST->hasMVEIntegerOps())
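The Thumb1 branch of getUnrollingPreferences above derives the runtime unroll count from the number of loop live-outs. A minimal sketch of that arithmetic, assuming the LCSSA-phi scan has already been reduced to a plain count of exiting values:

// Start from the default runtime unroll count of 4 and divide by the number of
// values live out of the loop; returning 0 here stands for "give up and keep
// the default preferences", matching the early return in the patch.
unsigned thumb1RuntimeUnrollCount(unsigned ExitingValues) {
  unsigned UnrollCount = 4; // UP.DefaultUnrollRuntimeCount
  if (ExitingValues)
    UnrollCount /= ExitingValues;
  return UnrollCount <= 1 ? 0 : UnrollCount;
}
// 0 or 1 live-out -> 4, 2 -> 2, 3 or more -> no runtime unrolling.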
diff --git a/src/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 7f04508..8899405 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -48,6 +48,11 @@
};
}
+// For controlling conversion of memcpy into a tail-predicated loop.
+namespace TPLoop {
+enum MemTransfer { ForceDisabled = 0, ForceEnabled, Allow };
+}
+
class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
using BaseT = BasicTTIImplBase<ARMTTIImpl>;
using TTI = TargetTransformInfo;
@@ -103,8 +108,8 @@
bool enableInterleavedAccessVectorization() { return true; }
- bool shouldFavorBackedgeIndex(const Loop *L) const;
- bool shouldFavorPostInc() const;
+ TTI::AddressingModeKind
+ getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const;
/// Floating-point computation using ARMv8 AArch32 Advanced
/// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
@@ -119,15 +124,17 @@
/// \name Scalar TTI Implementations
/// @{
- int getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty);
using BaseT::getIntImmCost;
- int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
+ InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind,
- Instruction *Inst = nullptr);
+ InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
/// @}
@@ -149,16 +156,20 @@
return 13;
}
- unsigned getRegisterBitWidth(bool Vector) const {
- if (Vector) {
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(32);
+ case TargetTransformInfo::RGK_FixedWidthVector:
if (ST->hasNEON())
- return 128;
+ return TypeSize::getFixed(128);
if (ST->hasMVEIntegerOps())
- return 128;
- return 0;
+ return TypeSize::getFixed(128);
+ return TypeSize::getFixed(0);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(0);
}
-
- return 32;
+ llvm_unreachable("Unsupported register kind");
}
unsigned getMaxInterleaveFactor(unsigned VF) {
@@ -179,15 +190,13 @@
return isLegalMaskedGather(Ty, Alignment);
}
- int getMemcpyCost(const Instruction *I);
+ InstructionCost getMemcpyCost(const Instruction *I);
int getNumMemOps(const IntrinsicInst *I) const;
- int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
- VectorType *SubTp);
-
- bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
- TTI::ReductionFlags Flags) const;
+ InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ ArrayRef<int> Mask, int Index,
+ VectorType *SubTp);
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
@@ -197,24 +206,26 @@
bool shouldExpandReduction(const IntrinsicInst *II) const { return false; }
- int getCFInstrCost(unsigned Opcode,
- TTI::TargetCostKind CostKind);
+ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
+ InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
+ InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index);
- int getAddressComputationCost(Type *Val, ScalarEvolution *SE,
- const SCEV *Ptr);
+ InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE,
+ const SCEV *Ptr);
- int getArithmeticInstrCost(
+ InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
@@ -224,35 +235,36 @@
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
+ InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind);
+ InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind);
- int getInterleavedMemoryOpCost(
+ InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
- unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
- const Value *Ptr, bool VariableMask,
- Align Alignment, TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
+ InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- int getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
- bool IsPairwiseForm,
- TTI::TargetCostKind CostKind);
+ InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind);
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
Type *ResTy, VectorType *ValTy,
TTI::TargetCostKind CostKind);
- int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind);
+ InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
bool maybeLoweredToCall(Instruction &I);
bool isLoweredToCall(const Function *F);
@@ -278,13 +290,42 @@
// variables or functions in constant data, so don't convert switches to
// lookup tables if any of the values would need relocation.
if (ST->isROPI() || ST->isRWPI())
- return !C->needsRelocation();
+ return !C->needsDynamicRelocation();
return true;
}
/// @}
};
+/// isVREVMask - Check if a vector shuffle corresponds to a VREV
+/// instruction with the specified blocksize. (The order of the elements
+/// within each block of the vector is reversed.)
+inline bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+ assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+ "Only possible block sizes for VREV are: 16, 32, 64");
+
+ unsigned EltSz = VT.getScalarSizeInBits();
+ if (EltSz != 8 && EltSz != 16 && EltSz != 32)
+ return false;
+
+ unsigned BlockElts = M[0] + 1;
+ // If the first shuffle index is UNDEF, be optimistic.
+ if (M[0] < 0)
+ BlockElts = BlockSize / EltSz;
+
+ if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+ return false;
+
+ for (unsigned i = 0, e = M.size(); i < e; ++i) {
+ if (M[i] < 0)
+ continue; // ignore UNDEF indices
+ if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+ return false;
+ }
+
+ return true;
+}
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H
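For reference, a self-contained specialisation of the isVREVMask check added above, fixed to 16-bit elements and a 32-bit block so it can be exercised without the LLVM EVT machinery; this is an illustrative sketch, not part of the patch. The mask <1,0,3,2,5,4,7,6> models vrev32.16 on a v8i16 and is accepted, while the identity mask is not.

#include <array>
#include <cstddef>

// Same index formula as isVREVMask, with EltSz == 16 and BlockSize == 32,
// giving BlockElts == 2 (each pair of 16-bit lanes is swapped).
bool isVREV32x16Mask(const std::array<int, 8> &M) {
  const unsigned BlockElts = 2;
  for (std::size_t i = 0; i < M.size(); ++i) {
    if (M[i] < 0)
      continue; // UNDEF lanes are ignored, as in the real check
    if (static_cast<unsigned>(M[i]) !=
        (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
      return false;
  }
  return true;
}
// isVREV32x16Mask({1, 0, 3, 2, 5, 4, 7, 6}) == true
// isVREV32x16Mask({0, 1, 2, 3, 4, 5, 6, 7}) == false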
diff --git a/src/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/src/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 52577d7..e410fe0 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -32,7 +32,6 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCAsmParserExtension.h"
@@ -500,6 +499,7 @@
StringRef FullInst, bool &CanAcceptCarrySet,
bool &CanAcceptPredicationCode,
bool &CanAcceptVPTPredicationCode);
+ bool enableArchExtFeature(StringRef Name, SMLoc &ExtLoc);
void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting,
OperandVector &Operands);
@@ -847,7 +847,7 @@
unsigned BaseRegNum;
// Offset is in OffsetReg or OffsetImm. If both are zero, no offset
// was specified.
- const MCConstantExpr *OffsetImm; // Offset immediate value
+ const MCExpr *OffsetImm; // Offset immediate value
unsigned OffsetRegNum; // Offset register num, when OffsetImm == NULL
ARM_AM::ShiftOpc ShiftType; // Shift type for OffsetReg
unsigned ShiftImm; // shift for OffsetReg.
@@ -1107,7 +1107,10 @@
else if (isGPRMem()) {
if(!Memory.OffsetImm || Memory.OffsetRegNum) return false;
if(Memory.BaseRegNum != ARM::PC) return false;
- Val = Memory.OffsetImm->getValue();
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm))
+ Val = CE->getValue();
+ else
+ return false;
}
else return false;
return ((Val % 4) == 0) && (Val >= 0) && (Val <= 1020);
@@ -1496,9 +1499,12 @@
return false;
// Immediate offset in range [-4095, 4095].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return (Val > -4096 && Val < 4096) ||
- (Val == std::numeric_limits<int32_t>::min());
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return (Val > -4096 && Val < 4096) ||
+ (Val == std::numeric_limits<int32_t>::min());
+ }
+ return false;
}
bool isAlignedMemory() const {
@@ -1581,8 +1587,11 @@
if (Memory.OffsetRegNum) return true;
// Immediate offset in range [-4095, 4095].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return Val > -4096 && Val < 4096;
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return Val > -4096 && Val < 4096;
+ }
+ return false;
}
bool isAM2OffsetImm() const {
@@ -1608,11 +1617,14 @@
if (Memory.OffsetRegNum) return true;
// Immediate offset in range [-255, 255].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- // The #-0 offset is encoded as std::numeric_limits<int32_t>::min(), and we
- // have to check for this too.
- return (Val > -256 && Val < 256) ||
- Val == std::numeric_limits<int32_t>::min();
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ // The #-0 offset is encoded as std::numeric_limits<int32_t>::min(), and
+ // we have to check for this too.
+ return (Val > -256 && Val < 256) ||
+ Val == std::numeric_limits<int32_t>::min();
+ }
+ return false;
}
bool isAM3Offset() const {
@@ -1640,9 +1652,12 @@
if (Memory.OffsetRegNum) return false;
// Immediate offset in range [-1020, 1020] and a multiple of 4.
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return (Val >= -1020 && Val <= 1020 && ((Val & 3) == 0)) ||
- Val == std::numeric_limits<int32_t>::min();
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return (Val >= -1020 && Val <= 1020 && ((Val & 3) == 0)) ||
+ Val == std::numeric_limits<int32_t>::min();
+ }
+ return false;
}
bool isAddrMode5FP16() const {
@@ -1656,9 +1671,12 @@
if (Memory.OffsetRegNum) return false;
// Immediate offset in range [-510, 510] and a multiple of 2.
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return (Val >= -510 && Val <= 510 && ((Val & 1) == 0)) ||
- Val == std::numeric_limits<int32_t>::min();
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return (Val >= -510 && Val <= 510 && ((Val & 1) == 0)) ||
+ Val == std::numeric_limits<int32_t>::min();
+ }
+ return false;
}
bool isMemTBB() const {
@@ -1710,8 +1728,11 @@
return false;
// Immediate offset, multiple of 4 in range [0, 124].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return Val >= 0 && Val <= 124 && (Val % 4) == 0;
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return Val >= 0 && Val <= 124 && (Val % 4) == 0;
+ }
+ return false;
}
bool isMemThumbRIs2() const {
@@ -1720,8 +1741,11 @@
return false;
// Immediate offset, multiple of 4 in range [0, 62].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return Val >= 0 && Val <= 62 && (Val % 2) == 0;
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return Val >= 0 && Val <= 62 && (Val % 2) == 0;
+ }
+ return false;
}
bool isMemThumbRIs1() const {
@@ -1730,8 +1754,11 @@
return false;
// Immediate offset in range [0, 31].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return Val >= 0 && Val <= 31;
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return Val >= 0 && Val <= 31;
+ }
+ return false;
}
bool isMemThumbSPI() const {
@@ -1740,8 +1767,11 @@
return false;
// Immediate offset, multiple of 4 in range [0, 1020].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return Val >= 0 && Val <= 1020 && (Val % 4) == 0;
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return Val >= 0 && Val <= 1020 && (Val % 4) == 0;
+ }
+ return false;
}
bool isMemImm8s4Offset() const {
@@ -1754,11 +1784,15 @@
return false;
// Immediate offset a multiple of 4 in range [-1020, 1020].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- // Special case, #-0 is std::numeric_limits<int32_t>::min().
- return (Val >= -1020 && Val <= 1020 && (Val & 3) == 0) ||
- Val == std::numeric_limits<int32_t>::min();
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ // Special case, #-0 is std::numeric_limits<int32_t>::min().
+ return (Val >= -1020 && Val <= 1020 && (Val & 3) == 0) ||
+ Val == std::numeric_limits<int32_t>::min();
+ }
+ return false;
}
+
bool isMemImm7s4Offset() const {
// If we have an immediate that's not a constant, treat it as a label
// reference needing a fixup. If it is a constant, it's something else
@@ -1771,17 +1805,24 @@
return false;
// Immediate offset a multiple of 4 in range [-508, 508].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- // Special case, #-0 is INT32_MIN.
- return (Val >= -508 && Val <= 508 && (Val & 3) == 0) || Val == INT32_MIN;
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ // Special case, #-0 is INT32_MIN.
+ return (Val >= -508 && Val <= 508 && (Val & 3) == 0) || Val == INT32_MIN;
+ }
+ return false;
}
+
bool isMemImm0_1020s4Offset() const {
if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset a multiple of 4 in range [0, 1020].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return Val >= 0 && Val <= 1020 && (Val & 3) == 0;
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return Val >= 0 && Val <= 1020 && (Val & 3) == 0;
+ }
+ return false;
}
bool isMemImm8Offset() const {
@@ -1791,9 +1832,12 @@
if (Memory.BaseRegNum == ARM::PC) return false;
// Immediate offset in range [-255, 255].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return (Val == std::numeric_limits<int32_t>::min()) ||
- (Val > -256 && Val < 256);
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return (Val == std::numeric_limits<int32_t>::min()) ||
+ (Val > -256 && Val < 256);
+ }
+ return false;
}
template<unsigned Bits, unsigned RegClassID>
@@ -1806,22 +1850,25 @@
// [-127, 127], shifted left by Bits.
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
- // INT32_MIN is a special-case value (indicating the encoding with
- // zero offset and the subtract bit set)
- if (Val == INT32_MIN)
- return true;
+ // INT32_MIN is a special-case value (indicating the encoding with
+ // zero offset and the subtract bit set)
+ if (Val == INT32_MIN)
+ return true;
- unsigned Divisor = 1U << Bits;
+ unsigned Divisor = 1U << Bits;
- // Check that the low bits are zero
- if (Val % Divisor != 0)
- return false;
+ // Check that the low bits are zero
+ if (Val % Divisor != 0)
+ return false;
- // Check that the remaining offset is within range.
- Val /= Divisor;
- return (Val >= -127 && Val <= 127);
+ // Check that the remaining offset is within range.
+ Val /= Divisor;
+ return (Val >= -127 && Val <= 127);
+ }
+ return false;
}
template <int shift> bool isMemRegRQOffset() const {
@@ -1853,20 +1900,24 @@
Memory.BaseRegNum))
return false;
- if(!Memory.OffsetImm) return true;
+ if (!Memory.OffsetImm)
+ return true;
static_assert(shift < 56,
"Such that we dont shift by a value higher than 62");
- int64_t Val = Memory.OffsetImm->getValue();
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
- // The value must be a multiple of (1 << shift)
- if ((Val & ((1U << shift) - 1)) != 0)
- return false;
+ // The value must be a multiple of (1 << shift)
+ if ((Val & ((1U << shift) - 1)) != 0)
+ return false;
- // And be in the right range, depending on the amount that it is shifted
- // by. Shift 0, is equal to 7 unsigned bits, the sign bit is set
- // separately.
- int64_t Range = (1U << (7+shift)) - 1;
- return (Val == INT32_MIN) || (Val > -Range && Val < Range);
+ // And be in the right range, depending on the amount that it is shifted
+ // by. Shift 0 is equal to 7 unsigned bits; the sign bit is set
+ // separately.
+ int64_t Range = (1U << (7 + shift)) - 1;
+ return (Val == INT32_MIN) || (Val > -Range && Val < Range);
+ }
+ return false;
}
bool isMemPosImm8Offset() const {
@@ -1874,8 +1925,11 @@
return false;
// Immediate offset in range [0, 255].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return Val >= 0 && Val < 256;
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return Val >= 0 && Val < 256;
+ }
+ return false;
}
bool isMemNegImm8Offset() const {
@@ -1885,9 +1939,12 @@
if (Memory.BaseRegNum == ARM::PC) return false;
// Immediate offset in range [-255, -1].
if (!Memory.OffsetImm) return false;
- int64_t Val = Memory.OffsetImm->getValue();
- return (Val == std::numeric_limits<int32_t>::min()) ||
- (Val > -256 && Val < 0);
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return (Val == std::numeric_limits<int32_t>::min()) ||
+ (Val > -256 && Val < 0);
+ }
+ return false;
}
bool isMemUImm12Offset() const {
@@ -1895,8 +1952,11 @@
return false;
// Immediate offset in range [0, 4095].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return (Val >= 0 && Val < 4096);
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return (Val >= 0 && Val < 4096);
+ }
+ return false;
}
bool isMemImm12Offset() const {
@@ -1911,9 +1971,14 @@
return false;
// Immediate offset in range [-4095, 4095].
if (!Memory.OffsetImm) return true;
- int64_t Val = Memory.OffsetImm->getValue();
- return (Val > -4096 && Val < 4096) ||
- (Val == std::numeric_limits<int32_t>::min());
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int64_t Val = CE->getValue();
+ return (Val > -4096 && Val < 4096) ||
+ (Val == std::numeric_limits<int32_t>::min());
+ }
+ // If we have an immediate that's not a constant, treat it as a
+ // symbolic expression needing a fixup.
+ return true;
}
bool isConstPoolAsmImm() const {
@@ -2760,7 +2825,10 @@
assert(isGPRMem() && "Unknown value type!");
assert(isa<MCConstantExpr>(Memory.OffsetImm) && "Unknown value type!");
- Inst.addOperand(MCOperand::createImm(Memory.OffsetImm->getValue()));
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm))
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::createExpr(Memory.OffsetImm));
}
void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const {
@@ -2800,8 +2868,10 @@
void addMemPCRelImm12Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- int32_t Imm = Memory.OffsetImm->getValue();
- Inst.addOperand(MCOperand::createImm(Imm));
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm))
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::createExpr(Memory.OffsetImm));
}
void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
@@ -2872,22 +2942,31 @@
void addAddrMode2Operands(MCInst &Inst, unsigned N) const {
assert(N == 3 && "Invalid number of operands!");
- int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+ Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
if (!Memory.OffsetRegNum) {
- ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
- // Special case for #-0
- if (Val == std::numeric_limits<int32_t>::min()) Val = 0;
- if (Val < 0) Val = -Val;
- Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift);
+ if (!Memory.OffsetImm)
+ Inst.addOperand(MCOperand::createImm(0));
+ else if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int32_t Val = CE->getValue();
+ ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+ // Special case for #-0
+ if (Val == std::numeric_limits<int32_t>::min())
+ Val = 0;
+ if (Val < 0)
+ Val = -Val;
+ Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift);
+ Inst.addOperand(MCOperand::createImm(Val));
+ } else
+ Inst.addOperand(MCOperand::createExpr(Memory.OffsetImm));
} else {
// For register offset, we encode the shift type and negation flag
// here.
- Val = ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add,
- Memory.ShiftImm, Memory.ShiftType);
+ int32_t Val =
+ ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add,
+ Memory.ShiftImm, Memory.ShiftType);
+ Inst.addOperand(MCOperand::createImm(Val));
}
- Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
}
void addAM2OffsetImmOperands(MCInst &Inst, unsigned N) const {
@@ -2916,21 +2995,30 @@
return;
}
- int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+ Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
if (!Memory.OffsetRegNum) {
- ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
- // Special case for #-0
- if (Val == std::numeric_limits<int32_t>::min()) Val = 0;
- if (Val < 0) Val = -Val;
- Val = ARM_AM::getAM3Opc(AddSub, Val);
+ if (!Memory.OffsetImm)
+ Inst.addOperand(MCOperand::createImm(0));
+ else if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int32_t Val = CE->getValue();
+ ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+ // Special case for #-0
+ if (Val == std::numeric_limits<int32_t>::min())
+ Val = 0;
+ if (Val < 0)
+ Val = -Val;
+ Val = ARM_AM::getAM3Opc(AddSub, Val);
+ Inst.addOperand(MCOperand::createImm(Val));
+ } else
+ Inst.addOperand(MCOperand::createExpr(Memory.OffsetImm));
} else {
// For register offset, we encode the shift type and negation flag
// here.
- Val = ARM_AM::getAM3Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, 0);
+ int32_t Val =
+ ARM_AM::getAM3Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, 0);
+ Inst.addOperand(MCOperand::createImm(Val));
}
- Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
}
void addAM3OffsetOperands(MCInst &Inst, unsigned N) const {
@@ -2966,15 +3054,22 @@
return;
}
- // The lower two bits are always zero and as such are not encoded.
- int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0;
- ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
- // Special case for #-0
- if (Val == std::numeric_limits<int32_t>::min()) Val = 0;
- if (Val < 0) Val = -Val;
- Val = ARM_AM::getAM5Opc(AddSub, Val);
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
+ if (!Memory.OffsetImm)
+ Inst.addOperand(MCOperand::createImm(0));
+ else if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ // The lower two bits are always zero and as such are not encoded.
+ int32_t Val = CE->getValue() / 4;
+ ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+ // Special case for #-0
+ if (Val == std::numeric_limits<int32_t>::min())
+ Val = 0;
+ if (Val < 0)
+ Val = -Val;
+ Val = ARM_AM::getAM5Opc(AddSub, Val);
+ Inst.addOperand(MCOperand::createImm(Val));
+ } else
+ Inst.addOperand(MCOperand::createExpr(Memory.OffsetImm));
}
void addAddrMode5FP16Operands(MCInst &Inst, unsigned N) const {
@@ -2988,15 +3083,22 @@
return;
}
- // The lower bit is always zero and as such is not encoded.
- int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 2 : 0;
- ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
- // Special case for #-0
- if (Val == std::numeric_limits<int32_t>::min()) Val = 0;
- if (Val < 0) Val = -Val;
- Val = ARM_AM::getAM5FP16Opc(AddSub, Val);
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
+ // The lower bit is always zero and as such is not encoded.
+ if (!Memory.OffsetImm)
+ Inst.addOperand(MCOperand::createImm(0));
+ else if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm)) {
+ int32_t Val = CE->getValue() / 2;
+ ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+ // Special case for #-0
+ if (Val == std::numeric_limits<int32_t>::min())
+ Val = 0;
+ if (Val < 0)
+ Val = -Val;
+ Val = ARM_AM::getAM5FP16Opc(AddSub, Val);
+ Inst.addOperand(MCOperand::createImm(Val));
+ } else
+ Inst.addOperand(MCOperand::createExpr(Memory.OffsetImm));
}
void addMemImm8s4OffsetOperands(MCInst &Inst, unsigned N) const {
@@ -3010,9 +3112,8 @@
return;
}
- int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
+ addExpr(Inst, Memory.OffsetImm);
}
void addMemImm7s4OffsetOperands(MCInst &Inst, unsigned N) const {
@@ -3026,24 +3127,26 @@
return;
}
- int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
+ addExpr(Inst, Memory.OffsetImm);
}
void addMemImm0_1020s4OffsetOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
- // The lower two bits are always zero and as such are not encoded.
- int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0;
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
+ if (!Memory.OffsetImm)
+ Inst.addOperand(MCOperand::createImm(0));
+ else if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm))
+ // The lower two bits are always zero and as such are not encoded.
+ Inst.addOperand(MCOperand::createImm(CE->getValue() / 4));
+ else
+ Inst.addOperand(MCOperand::createExpr(Memory.OffsetImm));
}
void addMemImmOffsetOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
- int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
+ addExpr(Inst, Memory.OffsetImm);
}
void addMemRegRQOffsetOperands(MCInst &Inst, unsigned N) const {
@@ -3062,9 +3165,8 @@
}
// Otherwise, it's a normal memory reg+offset.
- int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
+ addExpr(Inst, Memory.OffsetImm);
}
void addMemImm12OffsetOperands(MCInst &Inst, unsigned N) const {
@@ -3077,9 +3179,8 @@
}
// Otherwise, it's a normal memory reg+offset.
- int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
+ addExpr(Inst, Memory.OffsetImm);
}
void addConstPoolAsmImmOperands(MCInst &Inst, unsigned N) const {
@@ -3126,30 +3227,43 @@
void addMemThumbRIs4Operands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
- int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0;
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
+ if (!Memory.OffsetImm)
+ Inst.addOperand(MCOperand::createImm(0));
+ else if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm))
+ // The lower two bits are always zero and as such are not encoded.
+ Inst.addOperand(MCOperand::createImm(CE->getValue() / 4));
+ else
+ Inst.addOperand(MCOperand::createExpr(Memory.OffsetImm));
}
void addMemThumbRIs2Operands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
- int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 2) : 0;
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
+ if (!Memory.OffsetImm)
+ Inst.addOperand(MCOperand::createImm(0));
+ else if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm))
+ Inst.addOperand(MCOperand::createImm(CE->getValue() / 2));
+ else
+ Inst.addOperand(MCOperand::createExpr(Memory.OffsetImm));
}
void addMemThumbRIs1Operands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
- int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue()) : 0;
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
+ addExpr(Inst, Memory.OffsetImm);
}
void addMemThumbSPIOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
- int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0;
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
- Inst.addOperand(MCOperand::createImm(Val));
+ if (!Memory.OffsetImm)
+ Inst.addOperand(MCOperand::createImm(0));
+ else if (const auto *CE = dyn_cast<MCConstantExpr>(Memory.OffsetImm))
+ // The lower two bits are always zero and as such are not encoded.
+ Inst.addOperand(MCOperand::createImm(CE->getValue() / 4));
+ else
+ Inst.addOperand(MCOperand::createExpr(Memory.OffsetImm));
}
void addPostIdxImm8Operands(MCInst &Inst, unsigned N) const {
@@ -3686,10 +3800,9 @@
}
static std::unique_ptr<ARMOperand>
- CreateMem(unsigned BaseRegNum, const MCConstantExpr *OffsetImm,
- unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType,
- unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S,
- SMLoc E, SMLoc AlignmentLoc = SMLoc()) {
+ CreateMem(unsigned BaseRegNum, const MCExpr *OffsetImm, unsigned OffsetRegNum,
+ ARM_AM::ShiftOpc ShiftType, unsigned ShiftImm, unsigned Alignment,
+ bool isNegative, SMLoc S, SMLoc E, SMLoc AlignmentLoc = SMLoc()) {
auto Op = std::make_unique<ARMOperand>(k_Memory);
Op->Memory.BaseRegNum = BaseRegNum;
Op->Memory.OffsetImm = OffsetImm;
@@ -4899,7 +5012,7 @@
if (Tok.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
- if (!Tok.getString().equals_lower("csync"))
+ if (!Tok.getString().equals_insensitive("csync"))
return MatchOperand_NoMatch;
Parser.Lex(); // Eat identifier token.
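The equals_lower/startswith_lower calls in this file are renamed to their *_insensitive counterparts; the comparison semantics are unchanged. A small illustration, assuming an LLVM build that provides the renamed StringRef helpers:

// The renamed StringRef helpers used above; comparisons are ASCII
// case-insensitive, exactly like the old *_lower names.
#include "llvm/ADT/StringRef.h"
#include <cassert>

int main() {
  llvm::StringRef Tok("CSync");
  assert(Tok.equals_insensitive("csync"));
  assert(llvm::StringRef("NoCrypto").startswith_insensitive("no"));
  return 0;
}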
@@ -4919,7 +5032,7 @@
if (Tok.is(AsmToken::Identifier)) {
StringRef OptStr = Tok.getString();
- if (OptStr.equals_lower("sy"))
+ if (OptStr.equals_insensitive("sy"))
Opt = ARM_ISB::SY;
else
return MatchOperand_NoMatch;
@@ -5818,23 +5931,24 @@
E = Parser.getTok().getLoc();
bool isNegative = getParser().getTok().is(AsmToken::Minus);
- const MCExpr *Offset;
+ const MCExpr *Offset, *AdjustedOffset;
if (getParser().parseExpression(Offset))
return true;
- // The expression has to be a constant. Memory references with relocations
- // don't come through here, as they use the <label> forms of the relevant
- // instructions.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
- if (!CE)
- return Error (E, "constant expression expected");
-
- // If the constant was #-0, represent it as
- // std::numeric_limits<int32_t>::min().
- int32_t Val = CE->getValue();
- if (isNegative && Val == 0)
- CE = MCConstantExpr::create(std::numeric_limits<int32_t>::min(),
- getContext());
+ if (const auto *CE = dyn_cast<MCConstantExpr>(Offset)) {
+ // If the constant was #-0, represent it as
+ // std::numeric_limits<int32_t>::min().
+ int32_t Val = CE->getValue();
+ if (isNegative && Val == 0)
+ CE = MCConstantExpr::create(std::numeric_limits<int32_t>::min(),
+ getContext());
+ // Don't worry about range checking the value here. That's handled by
+ // the is*() predicates.
+ AdjustedOffset = CE;
+ } else
+ AdjustedOffset = Offset;
+ Operands.push_back(ARMOperand::CreateMem(
+ BaseRegNum, AdjustedOffset, 0, ARM_AM::no_shift, 0, 0, false, S, E));
// Now we should have the closing ']'
if (Parser.getTok().isNot(AsmToken::RBrac))
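A recurring convention in these predicates and operand adders is that a written "#-0" offset is stored as std::numeric_limits<int32_t>::min(), so later code can tell it apart from a plain "#0" and still select the subtract form with a zero magnitude. A tiny standalone sketch of that normalisation (the function name is illustrative):

// Standalone sketch of the "#-0" normalisation used above: a parsed offset of
// zero that was written with a leading minus is stored as INT32_MIN so that
// the encoder can still emit the subtract form. Names are illustrative.
#include <cstdint>
#include <limits>

static int32_t normalizeParsedOffset(bool IsNegative, int32_t Val) {
  if (IsNegative && Val == 0)
    return std::numeric_limits<int32_t>::min(); // remember "#-0"
  return Val;
}

// normalizeParsedOffset(true, 0)  -> INT32_MIN (later encoded as -0)
// normalizeParsedOffset(false, 0) -> 0
// normalizeParsedOffset(true, -8) -> -8 (the minus is already in the value)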
@@ -5842,12 +5956,6 @@
E = Parser.getTok().getEndLoc();
Parser.Lex(); // Eat right bracket token.
- // Don't worry about range checking the value here. That's handled by
- // the is*() predicates.
- Operands.push_back(ARMOperand::CreateMem(BaseRegNum, CE, 0,
- ARM_AM::no_shift, 0, 0,
- false, S, E));
-
// If there's a pre-indexing writeback marker, '!', just add it as a token
// operand.
if (Parser.getTok().is(AsmToken::Exclaim)) {
@@ -6086,7 +6194,7 @@
return true;
// If this is VMRS, check for the apsr_nzcv operand.
if (Mnemonic == "vmrs" &&
- Parser.getTok().getString().equals_lower("apsr_nzcv")) {
+ Parser.getTok().getString().equals_insensitive("apsr_nzcv")) {
S = Parser.getTok().getLoc();
Parser.Lex();
Operands.push_back(ARMOperand::CreateToken("APSR_nzcv", S));
@@ -6222,10 +6330,10 @@
}
enum {
- COFF = (1 << MCObjectFileInfo::IsCOFF),
- ELF = (1 << MCObjectFileInfo::IsELF),
- MACHO = (1 << MCObjectFileInfo::IsMachO),
- WASM = (1 << MCObjectFileInfo::IsWasm),
+ COFF = (1 << MCContext::IsCOFF),
+ ELF = (1 << MCContext::IsELF),
+ MACHO = (1 << MCContext::IsMachO),
+ WASM = (1 << MCContext::IsWasm),
};
static const struct PrefixEntry {
const char *Spelling;
@@ -6248,20 +6356,21 @@
}
uint8_t CurrentFormat;
- switch (getContext().getObjectFileInfo()->getObjectFileType()) {
- case MCObjectFileInfo::IsMachO:
+ switch (getContext().getObjectFileType()) {
+ case MCContext::IsMachO:
CurrentFormat = MACHO;
break;
- case MCObjectFileInfo::IsELF:
+ case MCContext::IsELF:
CurrentFormat = ELF;
break;
- case MCObjectFileInfo::IsCOFF:
+ case MCContext::IsCOFF:
CurrentFormat = COFF;
break;
- case MCObjectFileInfo::IsWasm:
+ case MCContext::IsWasm:
CurrentFormat = WASM;
break;
- case MCObjectFileInfo::IsXCOFF:
+ case MCContext::IsGOFF:
+ case MCContext::IsXCOFF:
llvm_unreachable("unexpected object format");
break;
}
@@ -7658,6 +7767,33 @@
"source register and base register can't be identical");
return false;
}
+ case ARM::t2LDR_PRE_imm:
+ case ARM::t2LDR_POST_imm:
+ case ARM::t2STR_PRE_imm:
+ case ARM::t2STR_POST_imm: {
+ // Rt must be different from Rn.
+ const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+
+ if (Rt == Rn)
+ return Error(Operands[3]->getStartLoc(),
+ "destination register and base register can't be identical");
+ if (Inst.getOpcode() == ARM::t2LDR_POST_imm ||
+ Inst.getOpcode() == ARM::t2STR_POST_imm) {
+ int Imm = Inst.getOperand(2).getImm();
+ if (Imm > 255 || Imm < -255)
+ return Error(Operands[5]->getStartLoc(),
+ "operand must be in range [-255, 255]");
+ }
+ if (Inst.getOpcode() == ARM::t2STR_PRE_imm ||
+ Inst.getOpcode() == ARM::t2STR_POST_imm) {
+ if (Inst.getOperand(0).getReg() == ARM::PC) {
+ return Error(Operands[3]->getStartLoc(),
+ "operand must be a register in range [r0, r14]");
+ }
+ }
+ return false;
+ }
case ARM::LDR_PRE_IMM:
case ARM::LDR_PRE_REG:
case ARM::t2LDR_PRE:
@@ -7923,7 +8059,10 @@
break;
case ARM::t2B: {
int op = (Operands[2]->isImm()) ? 2 : 3;
- if (!static_cast<ARMOperand &>(*Operands[op]).isSignedOffset<24, 1>())
+ ARMOperand &Operand = static_cast<ARMOperand &>(*Operands[op]);
+ // Delay the checks of symbolic expressions until they are resolved.
+ if (!isa<MCBinaryExpr>(Operand.getImm()) &&
+ !Operand.isSignedOffset<24, 1>())
return Error(Operands[op]->getStartLoc(), "branch target out of range");
break;
}
@@ -8625,6 +8764,34 @@
Inst = TmpInst;
return true;
}
+ // Aliases for imm syntax of LDR instructions.
+ case ARM::t2LDR_PRE_imm:
+ case ARM::t2LDR_POST_imm: {
+ MCInst TmpInst;
+ TmpInst.setOpcode(Inst.getOpcode() == ARM::t2LDR_PRE_imm ? ARM::t2LDR_PRE
+ : ARM::t2LDR_POST);
+ TmpInst.addOperand(Inst.getOperand(0)); // Rt
+ TmpInst.addOperand(Inst.getOperand(4)); // Rt_wb
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // imm
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ Inst = TmpInst;
+ return true;
+ }
+ // Aliases for imm syntax of STR instructions.
+ case ARM::t2STR_PRE_imm:
+ case ARM::t2STR_POST_imm: {
+ MCInst TmpInst;
+ TmpInst.setOpcode(Inst.getOpcode() == ARM::t2STR_PRE_imm ? ARM::t2STR_PRE
+ : ARM::t2STR_POST);
+ TmpInst.addOperand(Inst.getOperand(4)); // Rt_wb
+ TmpInst.addOperand(Inst.getOperand(0)); // Rt
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // imm
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ Inst = TmpInst;
+ return true;
+ }
// Aliases for alternate PC+imm syntax of LDR instructions.
case ARM::t2LDRpcrel:
// Select the narrow version if the immediate will fit.
@@ -10835,10 +11002,9 @@
/// parseDirective parses the arm specific directives
bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
- const MCObjectFileInfo::Environment Format =
- getContext().getObjectFileInfo()->getObjectFileType();
- bool IsMachO = Format == MCObjectFileInfo::IsMachO;
- bool IsCOFF = Format == MCObjectFileInfo::IsCOFF;
+ const MCContext::Environment Format = getContext().getObjectFileType();
+ bool IsMachO = Format == MCContext::IsMachO;
+ bool IsCOFF = Format == MCContext::IsCOFF;
std::string IDVal = DirectiveID.getIdentifier().lower();
if (IDVal == ".word")
@@ -10976,8 +11142,8 @@
/// ::= .thumbfunc symbol_name
bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
MCAsmParser &Parser = getParser();
- const auto Format = getContext().getObjectFileInfo()->getObjectFileType();
- bool IsMachO = Format == MCObjectFileInfo::IsMachO;
+ const auto Format = getContext().getObjectFileType();
+ bool IsMachO = Format == MCContext::IsMachO;
// Darwin asm has (optionally) a function name after the .thumb_func directive
// ELF doesn't
@@ -11000,6 +11166,12 @@
"unexpected token in '.thumb_func' directive"))
return true;
+ // .thumb_func implies .thumb
+ if (!isThumb())
+ SwitchMode();
+
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
+
NextSymbolIsThumb = true;
return false;
}
@@ -11154,8 +11326,8 @@
TagLoc = Parser.getTok().getLoc();
if (Parser.getTok().is(AsmToken::Identifier)) {
StringRef Name = Parser.getTok().getIdentifier();
- Optional<unsigned> Ret =
- ELFAttrs::attrTypeFromString(Name, ARMBuildAttrs::ARMAttributeTags);
+ Optional<unsigned> Ret = ELFAttrs::attrTypeFromString(
+ Name, ARMBuildAttrs::getARMAttributeTags());
if (!Ret.hasValue()) {
Error(TagLoc, "attribute name not recognised: " + Name);
return false;
@@ -12060,9 +12232,7 @@
}
}
-/// parseDirectiveArchExtension
-/// ::= .arch_extension [no]feature
-bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
+bool ARMAsmParser::enableArchExtFeature(StringRef Name, SMLoc &ExtLoc) {
// FIXME: This structure should be moved inside ARMTargetParser
// when we start to table-generate them, and we can use the ARM
// flags below, that were generated by table-gen.
@@ -12071,48 +12241,45 @@
const FeatureBitset ArchCheck;
const FeatureBitset Features;
} Extensions[] = {
- { ARM::AEK_CRC, {Feature_HasV8Bit}, {ARM::FeatureCRC} },
- { ARM::AEK_CRYPTO, {Feature_HasV8Bit},
- {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} },
- { ARM::AEK_FP, {Feature_HasV8Bit},
- {ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8} },
- { (ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM),
- {Feature_HasV7Bit, Feature_IsNotMClassBit},
- {ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM} },
- { ARM::AEK_MP, {Feature_HasV7Bit, Feature_IsNotMClassBit},
- {ARM::FeatureMP} },
- { ARM::AEK_SIMD, {Feature_HasV8Bit},
- {ARM::FeatureNEON, ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8} },
- { ARM::AEK_SEC, {Feature_HasV6KBit}, {ARM::FeatureTrustZone} },
- // FIXME: Only available in A-class, isel not predicated
- { ARM::AEK_VIRT, {Feature_HasV7Bit}, {ARM::FeatureVirtualization} },
- { ARM::AEK_FP16, {Feature_HasV8_2aBit},
- {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} },
- { ARM::AEK_RAS, {Feature_HasV8Bit}, {ARM::FeatureRAS} },
- { ARM::AEK_LOB, {Feature_HasV8_1MMainlineBit}, {ARM::FeatureLOB} },
- // FIXME: Unsupported extensions.
- { ARM::AEK_OS, {}, {} },
- { ARM::AEK_IWMMXT, {}, {} },
- { ARM::AEK_IWMMXT2, {}, {} },
- { ARM::AEK_MAVERICK, {}, {} },
- { ARM::AEK_XSCALE, {}, {} },
+ {ARM::AEK_CRC, {Feature_HasV8Bit}, {ARM::FeatureCRC}},
+ {ARM::AEK_AES,
+ {Feature_HasV8Bit},
+ {ARM::FeatureAES, ARM::FeatureNEON, ARM::FeatureFPARMv8}},
+ {ARM::AEK_SHA2,
+ {Feature_HasV8Bit},
+ {ARM::FeatureSHA2, ARM::FeatureNEON, ARM::FeatureFPARMv8}},
+ {ARM::AEK_CRYPTO,
+ {Feature_HasV8Bit},
+ {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8}},
+ {ARM::AEK_FP,
+ {Feature_HasV8Bit},
+ {ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8}},
+ {(ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM),
+ {Feature_HasV7Bit, Feature_IsNotMClassBit},
+ {ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM}},
+ {ARM::AEK_MP,
+ {Feature_HasV7Bit, Feature_IsNotMClassBit},
+ {ARM::FeatureMP}},
+ {ARM::AEK_SIMD,
+ {Feature_HasV8Bit},
+ {ARM::FeatureNEON, ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8}},
+ {ARM::AEK_SEC, {Feature_HasV6KBit}, {ARM::FeatureTrustZone}},
+ // FIXME: Only available in A-class, isel not predicated
+ {ARM::AEK_VIRT, {Feature_HasV7Bit}, {ARM::FeatureVirtualization}},
+ {ARM::AEK_FP16,
+ {Feature_HasV8_2aBit},
+ {ARM::FeatureFPARMv8, ARM::FeatureFullFP16}},
+ {ARM::AEK_RAS, {Feature_HasV8Bit}, {ARM::FeatureRAS}},
+ {ARM::AEK_LOB, {Feature_HasV8_1MMainlineBit}, {ARM::FeatureLOB}},
+ // FIXME: Unsupported extensions.
+ {ARM::AEK_OS, {}, {}},
+ {ARM::AEK_IWMMXT, {}, {}},
+ {ARM::AEK_IWMMXT2, {}, {}},
+ {ARM::AEK_MAVERICK, {}, {}},
+ {ARM::AEK_XSCALE, {}, {}},
};
-
- MCAsmParser &Parser = getParser();
-
- if (getLexer().isNot(AsmToken::Identifier))
- return Error(getLexer().getLoc(), "expected architecture extension name");
-
- StringRef Name = Parser.getTok().getString();
- SMLoc ExtLoc = Parser.getTok().getLoc();
- Lex();
-
- if (parseToken(AsmToken::EndOfStatement,
- "unexpected token in '.arch_extension' directive"))
- return true;
-
bool EnableFeature = true;
- if (Name.startswith_lower("no")) {
+ if (Name.startswith_insensitive("no")) {
EnableFeature = false;
Name = Name.substr(2);
}
@@ -12140,8 +12307,35 @@
}
FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits());
setAvailableFeatures(Features);
- return false;
+ return true;
}
+ return false;
+}
+
+/// parseDirectiveArchExtension
+/// ::= .arch_extension [no]feature
+bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
+
+ MCAsmParser &Parser = getParser();
+
+ if (getLexer().isNot(AsmToken::Identifier))
+ return Error(getLexer().getLoc(), "expected architecture extension name");
+
+ StringRef Name = Parser.getTok().getString();
+ SMLoc ExtLoc = Parser.getTok().getLoc();
+ Lex();
+
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.arch_extension' directive"))
+ return true;
+
+ if (Name == "nocrypto") {
+ enableArchExtFeature("nosha2", ExtLoc);
+ enableArchExtFeature("noaes", ExtLoc);
+ }
+
+ if (enableArchExtFeature(Name, ExtLoc))
+ return false;
return Error(ExtLoc, "unknown architectural extension: " + Name);
}
diff --git a/src/llvm-project/llvm/lib/Target/ARM/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/ARM/CMakeLists.txt
index e6b12ae..a3ec9e9 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/ARM/CMakeLists.txt
@@ -56,9 +56,10 @@
ARMTargetTransformInfo.cpp
MLxExpansionPass.cpp
MVEGatherScatterLowering.cpp
+ MVELaneInterleavingPass.cpp
MVETailPredication.cpp
MVEVPTBlockPass.cpp
- MVEVPTOptimisationsPass.cpp
+ MVETPAndVPTOptimisationsPass.cpp
Thumb1FrameLowering.cpp
Thumb1InstrInfo.cpp
ThumbRegisterInfo.cpp
@@ -73,6 +74,7 @@
AsmPrinter
CodeGen
Core
+ IPO
MC
Scalar
SelectionDAG
diff --git a/src/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/src/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 8ea323a..51fd450 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -2676,8 +2676,12 @@
if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8,
true, 4, Inst, Decoder))
Inst.addOperand(MCOperand::createImm(SignExtend32<26>(imm)));
- if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
- return MCDisassembler::Fail;
+
+ // We already have BL_pred for BL w/ predicate, no need to add additional
+ // predicate operands for BL
+ if (Inst.getOpcode() != ARM::BL)
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
return S;
}
@@ -6670,17 +6674,14 @@
return MCDisassembler::Fail;
if (TypeT3) {
Inst.setOpcode(sign1 ? ARM::t2SUBspImm12 : ARM::t2ADDspImm12);
- S = 0;
Inst.addOperand(MCOperand::createImm(Imm12)); // zext imm12
} else {
Inst.setOpcode(sign1 ? ARM::t2SUBspImm : ARM::t2ADDspImm);
if (!Check(DS, DecodeT2SOImm(Inst, Imm12, Address, Decoder))) // imm12
return MCDisassembler::Fail;
+ if (!Check(DS, DecodeCCOutOperand(Inst, S, Address, Decoder))) // cc_out
+ return MCDisassembler::Fail;
}
- if (!Check(DS, DecodeCCOutOperand(Inst, S, Address, Decoder))) // cc_out
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::createReg(0)); // pred
return DS;
}
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index b02aef3..9f7327f 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -55,6 +55,10 @@
#define ELF_RELOC(X, Y) .Case(#X, Y)
#include "llvm/BinaryFormat/ELFRelocs/ARM.def"
#undef ELF_RELOC
+ .Case("BFD_RELOC_NONE", ELF::R_ARM_NONE)
+ .Case("BFD_RELOC_8", ELF::R_ARM_ABS8)
+ .Case("BFD_RELOC_16", ELF::R_ARM_ABS16)
+ .Case("BFD_RELOC_32", ELF::R_ARM_ABS32)
.Default(-1u);
if (Type == -1u)
return None;
@@ -80,6 +84,7 @@
{"fixup_arm_pcrel_9", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_t2_pcrel_9", 0, 32,
IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_ldst_abs_12", 0, 32, 0},
{"fixup_thumb_adr_pcrel_10", 0, 8,
IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_arm_adr_pcrel_12", 0, 32, IsPCRelConstant},
@@ -116,8 +121,7 @@
{"fixup_bfc_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_bfcsel_else_target", 0, 32, 0},
{"fixup_wls", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_le", 0, 32, MCFixupKindInfo::FKF_IsPCRel}
- };
+ {"fixup_le", 0, 32, MCFixupKindInfo::FKF_IsPCRel}};
const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = {
// This table *must* be in the order that the fixup_* kinds are defined in
// ARMFixupKinds.h.
@@ -134,6 +138,7 @@
{"fixup_arm_pcrel_9", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_t2_pcrel_9", 0, 32,
IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_ldst_abs_12", 0, 32, 0},
{"fixup_thumb_adr_pcrel_10", 8, 8,
IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_arm_adr_pcrel_12", 0, 32, IsPCRelConstant},
@@ -170,8 +175,7 @@
{"fixup_bfc_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_bfcsel_else_target", 0, 32, 0},
{"fixup_wls", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_le", 0, 32, MCFixupKindInfo::FKF_IsPCRel}
- };
+ {"fixup_le", 0, 32, MCFixupKindInfo::FKF_IsPCRel}};
// Fixup kinds from .reloc directive are like R_ARM_NONE. They do not require
// any extra processing.
@@ -486,9 +490,11 @@
// ARM PC-relative values are offset by 8.
Value -= 4;
LLVM_FALLTHROUGH;
- case ARM::fixup_t2_ldst_pcrel_12: {
+ case ARM::fixup_t2_ldst_pcrel_12:
// Offset by 4, adjusted by two due to the half-word ordering of thumb.
Value -= 4;
+ LLVM_FALLTHROUGH;
+ case ARM::fixup_arm_ldst_abs_12: {
bool isAdd = true;
if ((int64_t)Value < 0) {
Value = -Value;
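The new fixup_arm_ldst_abs_12 reuses the sign-magnitude handling of the 12-bit PC-relative load/store fixups: a negative value is negated and the add/sub choice is tracked separately. A hedged standalone sketch of that step; placing the flag in bit 23 (the U bit of the A32 LDR/STR immediate encoding) is stated as an assumption here, since the hunk above ends before the final packing:

// Sketch of the sign/magnitude split shown above for 12-bit load/store
// offsets. The bit position of the add/sub flag is an assumption made for
// illustration; the patch itself only shows the normalisation.
#include <cassert>
#include <cstdint>

static uint32_t encodeLdSt12(int64_t Value) {
  bool IsAdd = true;
  if (Value < 0) {
    Value = -Value;
    IsAdd = false;
  }
  assert(Value < 4096 && "out of range 12-bit load/store offset");
  return static_cast<uint32_t>(Value) | (IsAdd ? (1u << 23) : 0);
}

// encodeLdSt12(20)  -> 0x00800014 (add form, offset 20)
// encodeLdSt12(-20) -> 0x00000014 (subtract form, offset 20)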
@@ -936,6 +942,7 @@
case ARM::fixup_arm_ldst_pcrel_12:
case ARM::fixup_arm_pcrel_10:
case ARM::fixup_arm_pcrel_9:
+ case ARM::fixup_arm_ldst_abs_12:
case ARM::fixup_arm_adr_pcrel_12:
case ARM::fixup_arm_uncondbl:
case ARM::fixup_arm_condbl:
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
index 8cd7a4a..62eb1d7 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
@@ -20,7 +20,7 @@
: ARMAsmBackend(T, STI, support::little) {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
- return createARMWinCOFFObjectWriter(/*Is64Bit=*/false);
+ return createARMWinCOFFObjectWriter();
}
};
}
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 07ca5c2..12076b8 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -46,7 +46,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/LEB128.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -177,8 +176,8 @@
void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value);
if (IsVerboseAsm) {
- StringRef Name =
- ELFAttrs::attrTypeAsString(Attribute, ARMBuildAttrs::ARMAttributeTags);
+ StringRef Name = ELFAttrs::attrTypeAsString(
+ Attribute, ARMBuildAttrs::getARMAttributeTags());
if (!Name.empty())
OS << "\t@ " << Name;
}
@@ -195,7 +194,7 @@
OS << "\t.eabi_attribute\t" << Attribute << ", \"" << String << "\"";
if (IsVerboseAsm) {
StringRef Name = ELFAttrs::attrTypeAsString(
- Attribute, ARMBuildAttrs::ARMAttributeTags);
+ Attribute, ARMBuildAttrs::getARMAttributeTags());
if (!Name.empty())
OS << "\t@ " << Name;
}
@@ -216,7 +215,7 @@
if (IsVerboseAsm)
OS << "\t@ "
<< ELFAttrs::attrTypeAsString(Attribute,
- ARMBuildAttrs::ARMAttributeTags);
+ ARMBuildAttrs::getARMAttributeTags());
break;
}
OS << "\n";
@@ -274,104 +273,13 @@
class ARMTargetELFStreamer : public ARMTargetStreamer {
private:
- // This structure holds all attributes, accounting for
- // their string/numeric value, so we can later emit them
- // in declaration order, keeping all in the same vector
- struct AttributeItem {
- enum {
- HiddenAttribute = 0,
- NumericAttribute,
- TextAttribute,
- NumericAndTextAttributes
- } Type;
- unsigned Tag;
- unsigned IntValue;
- std::string StringValue;
-
- static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) {
- // The conformance tag must be emitted first when serialised
- // into an object file. Specifically, the addenda to the ARM ABI
- // states that (2.3.7.4):
- //
- // "To simplify recognition by consumers in the common case of
- // claiming conformity for the whole file, this tag should be
- // emitted first in a file-scope sub-subsection of the first
- // public subsection of the attributes section."
- //
- // So it is special-cased in this comparison predicate when the
- // attributes are sorted in finishAttributeSection().
- return (RHS.Tag != ARMBuildAttrs::conformance) &&
- ((LHS.Tag == ARMBuildAttrs::conformance) || (LHS.Tag < RHS.Tag));
- }
- };
-
StringRef CurrentVendor;
unsigned FPU = ARM::FK_INVALID;
ARM::ArchKind Arch = ARM::ArchKind::INVALID;
ARM::ArchKind EmittedArch = ARM::ArchKind::INVALID;
- SmallVector<AttributeItem, 64> Contents;
MCSection *AttributeSection = nullptr;
- AttributeItem *getAttributeItem(unsigned Attribute) {
- for (size_t i = 0; i < Contents.size(); ++i)
- if (Contents[i].Tag == Attribute)
- return &Contents[i];
- return nullptr;
- }
-
- void setAttributeItem(unsigned Attribute, unsigned Value,
- bool OverwriteExisting) {
- // Look for existing attribute item
- if (AttributeItem *Item = getAttributeItem(Attribute)) {
- if (!OverwriteExisting)
- return;
- Item->Type = AttributeItem::NumericAttribute;
- Item->IntValue = Value;
- return;
- }
-
- // Create new attribute item
- AttributeItem Item = {AttributeItem::NumericAttribute, Attribute, Value,
- std::string(StringRef(""))};
- Contents.push_back(Item);
- }
-
- void setAttributeItem(unsigned Attribute, StringRef Value,
- bool OverwriteExisting) {
- // Look for existing attribute item
- if (AttributeItem *Item = getAttributeItem(Attribute)) {
- if (!OverwriteExisting)
- return;
- Item->Type = AttributeItem::TextAttribute;
- Item->StringValue = std::string(Value);
- return;
- }
-
- // Create new attribute item
- AttributeItem Item = {AttributeItem::TextAttribute, Attribute, 0,
- std::string(Value)};
- Contents.push_back(Item);
- }
-
- void setAttributeItems(unsigned Attribute, unsigned IntValue,
- StringRef StringValue, bool OverwriteExisting) {
- // Look for existing attribute item
- if (AttributeItem *Item = getAttributeItem(Attribute)) {
- if (!OverwriteExisting)
- return;
- Item->Type = AttributeItem::NumericAndTextAttributes;
- Item->IntValue = IntValue;
- Item->StringValue = std::string(StringValue);
- return;
- }
-
- // Create new attribute item
- AttributeItem Item = {AttributeItem::NumericAndTextAttributes, Attribute,
- IntValue, std::string(StringValue)};
- Contents.push_back(Item);
- }
-
void emitArchDefaultAttributes();
void emitFPUDefaultAttributes();
@@ -406,8 +314,6 @@
void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override;
void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override;
- size_t calculateContentSize() const;
-
// Reset state between object emissions
void reset() override;
@@ -579,6 +485,28 @@
}
}
+ /// If a label is defined before the .type directive sets the label's type
+ /// then the label can't be recorded as a Thumb function when the label is
+ /// defined. We override emitSymbolAttribute(), which is called as part of
+ /// parsing .type, so that if the symbol has already been defined we can
+ /// record the label as Thumb. FIXME: there is a corner case where the state
+ /// is changed in between the label definition and the .type directive; this
+ /// is not expected to occur in practice and handling it would require the
+ /// backend to track IsThumb for every label.
+ bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override {
+ bool Val = MCELFStreamer::emitSymbolAttribute(Symbol, Attribute);
+
+ if (!IsThumb)
+ return Val;
+
+ unsigned Type = cast<MCSymbolELF>(Symbol)->getType();
+ if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC) &&
+ Symbol->isDefined())
+ getAssembler().setIsThumbFunc(Symbol);
+
+ return Val;
+ };
+
private:
enum ElfMappingSymbol {
EMS_None,
@@ -755,26 +683,28 @@
if (!CurrentVendor.empty())
finishAttributeSection();
- assert(Contents.empty() &&
+ assert(getStreamer().Contents.empty() &&
".ARM.attributes should be flushed before changing vendor");
CurrentVendor = Vendor;
}
void ARMTargetELFStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
- setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+ getStreamer().setAttributeItem(Attribute, Value,
+ /* OverwriteExisting= */ true);
}
void ARMTargetELFStreamer::emitTextAttribute(unsigned Attribute,
StringRef Value) {
- setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+ getStreamer().setAttributeItem(Attribute, Value,
+ /* OverwriteExisting= */ true);
}
void ARMTargetELFStreamer::emitIntTextAttribute(unsigned Attribute,
unsigned IntValue,
StringRef StringValue) {
- setAttributeItems(Attribute, IntValue, StringValue,
- /* OverwriteExisting= */ true);
+ getStreamer().setAttributeItems(Attribute, IntValue, StringValue,
+ /* OverwriteExisting= */ true);
}
void ARMTargetELFStreamer::emitArch(ARM::ArchKind Value) {
@@ -787,19 +717,14 @@
void ARMTargetELFStreamer::emitArchDefaultAttributes() {
using namespace ARMBuildAttrs;
+ ARMELFStreamer &S = getStreamer();
- setAttributeItem(CPU_name,
- ARM::getCPUAttr(Arch),
- false);
+ S.setAttributeItem(CPU_name, ARM::getCPUAttr(Arch), false);
if (EmittedArch == ARM::ArchKind::INVALID)
- setAttributeItem(CPU_arch,
- ARM::getArchAttr(Arch),
- false);
+ S.setAttributeItem(CPU_arch, ARM::getArchAttr(Arch), false);
else
- setAttributeItem(CPU_arch,
- ARM::getArchAttr(EmittedArch),
- false);
+ S.setAttributeItem(CPU_arch, ARM::getArchAttr(EmittedArch), false);
switch (Arch) {
case ARM::ArchKind::ARMV2:
@@ -807,49 +732,50 @@
case ARM::ArchKind::ARMV3:
case ARM::ArchKind::ARMV3M:
case ARM::ArchKind::ARMV4:
- setAttributeItem(ARM_ISA_use, Allowed, false);
+ S.setAttributeItem(ARM_ISA_use, Allowed, false);
break;
case ARM::ArchKind::ARMV4T:
case ARM::ArchKind::ARMV5T:
+ case ARM::ArchKind::XSCALE:
case ARM::ArchKind::ARMV5TE:
case ARM::ArchKind::ARMV6:
- setAttributeItem(ARM_ISA_use, Allowed, false);
- setAttributeItem(THUMB_ISA_use, Allowed, false);
+ S.setAttributeItem(ARM_ISA_use, Allowed, false);
+ S.setAttributeItem(THUMB_ISA_use, Allowed, false);
break;
case ARM::ArchKind::ARMV6T2:
- setAttributeItem(ARM_ISA_use, Allowed, false);
- setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ S.setAttributeItem(ARM_ISA_use, Allowed, false);
+ S.setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
break;
case ARM::ArchKind::ARMV6K:
case ARM::ArchKind::ARMV6KZ:
- setAttributeItem(ARM_ISA_use, Allowed, false);
- setAttributeItem(THUMB_ISA_use, Allowed, false);
- setAttributeItem(Virtualization_use, AllowTZ, false);
+ S.setAttributeItem(ARM_ISA_use, Allowed, false);
+ S.setAttributeItem(THUMB_ISA_use, Allowed, false);
+ S.setAttributeItem(Virtualization_use, AllowTZ, false);
break;
case ARM::ArchKind::ARMV6M:
- setAttributeItem(THUMB_ISA_use, Allowed, false);
+ S.setAttributeItem(THUMB_ISA_use, Allowed, false);
break;
case ARM::ArchKind::ARMV7A:
- setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
- setAttributeItem(ARM_ISA_use, Allowed, false);
- setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ S.setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
+ S.setAttributeItem(ARM_ISA_use, Allowed, false);
+ S.setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
break;
case ARM::ArchKind::ARMV7R:
- setAttributeItem(CPU_arch_profile, RealTimeProfile, false);
- setAttributeItem(ARM_ISA_use, Allowed, false);
- setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ S.setAttributeItem(CPU_arch_profile, RealTimeProfile, false);
+ S.setAttributeItem(ARM_ISA_use, Allowed, false);
+ S.setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
break;
case ARM::ArchKind::ARMV7EM:
case ARM::ArchKind::ARMV7M:
- setAttributeItem(CPU_arch_profile, MicroControllerProfile, false);
- setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ S.setAttributeItem(CPU_arch_profile, MicroControllerProfile, false);
+ S.setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
break;
case ARM::ArchKind::ARMV8A:
@@ -859,29 +785,29 @@
case ARM::ArchKind::ARMV8_4A:
case ARM::ArchKind::ARMV8_5A:
case ARM::ArchKind::ARMV8_6A:
- setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
- setAttributeItem(ARM_ISA_use, Allowed, false);
- setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
- setAttributeItem(MPextension_use, Allowed, false);
- setAttributeItem(Virtualization_use, AllowTZVirtualization, false);
+ S.setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
+ S.setAttributeItem(ARM_ISA_use, Allowed, false);
+ S.setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ S.setAttributeItem(MPextension_use, Allowed, false);
+ S.setAttributeItem(Virtualization_use, AllowTZVirtualization, false);
break;
case ARM::ArchKind::ARMV8MBaseline:
case ARM::ArchKind::ARMV8MMainline:
- setAttributeItem(THUMB_ISA_use, AllowThumbDerived, false);
- setAttributeItem(CPU_arch_profile, MicroControllerProfile, false);
+ S.setAttributeItem(THUMB_ISA_use, AllowThumbDerived, false);
+ S.setAttributeItem(CPU_arch_profile, MicroControllerProfile, false);
break;
case ARM::ArchKind::IWMMXT:
- setAttributeItem(ARM_ISA_use, Allowed, false);
- setAttributeItem(THUMB_ISA_use, Allowed, false);
- setAttributeItem(WMMX_arch, AllowWMMXv1, false);
+ S.setAttributeItem(ARM_ISA_use, Allowed, false);
+ S.setAttributeItem(THUMB_ISA_use, Allowed, false);
+ S.setAttributeItem(WMMX_arch, AllowWMMXv1, false);
break;
case ARM::ArchKind::IWMMXT2:
- setAttributeItem(ARM_ISA_use, Allowed, false);
- setAttributeItem(THUMB_ISA_use, Allowed, false);
- setAttributeItem(WMMX_arch, AllowWMMXv2, false);
+ S.setAttributeItem(ARM_ISA_use, Allowed, false);
+ S.setAttributeItem(THUMB_ISA_use, Allowed, false);
+ S.setAttributeItem(WMMX_arch, AllowWMMXv2, false);
break;
default:
@@ -895,123 +821,106 @@
}
void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
+ ARMELFStreamer &S = getStreamer();
+
switch (FPU) {
case ARM::FK_VFP:
case ARM::FK_VFPV2:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPv2,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPv2,
+ /* OverwriteExisting= */ false);
break;
case ARM::FK_VFPV3:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPv3A,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPv3A,
+ /* OverwriteExisting= */ false);
break;
case ARM::FK_VFPV3_FP16:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPv3A,
- /* OverwriteExisting= */ false);
- setAttributeItem(ARMBuildAttrs::FP_HP_extension,
- ARMBuildAttrs::AllowHPFP,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPv3A,
+ /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP,
+ /* OverwriteExisting= */ false);
break;
case ARM::FK_VFPV3_D16:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPv3B,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPv3B,
+ /* OverwriteExisting= */ false);
break;
case ARM::FK_VFPV3_D16_FP16:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPv3B,
- /* OverwriteExisting= */ false);
- setAttributeItem(ARMBuildAttrs::FP_HP_extension,
- ARMBuildAttrs::AllowHPFP,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPv3B,
+ /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP,
+ /* OverwriteExisting= */ false);
break;
case ARM::FK_VFPV3XD:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPv3B,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPv3B,
+ /* OverwriteExisting= */ false);
break;
case ARM::FK_VFPV3XD_FP16:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPv3B,
- /* OverwriteExisting= */ false);
- setAttributeItem(ARMBuildAttrs::FP_HP_extension,
- ARMBuildAttrs::AllowHPFP,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPv3B,
+ /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP,
+ /* OverwriteExisting= */ false);
break;
case ARM::FK_VFPV4:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPv4A,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPv4A,
+ /* OverwriteExisting= */ false);
break;
// ABI_HardFP_use is handled in ARMAsmPrinter, so _SP_D16 is treated the same
// as _D16 here.
case ARM::FK_FPV4_SP_D16:
case ARM::FK_VFPV4_D16:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPv4B,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPv4B,
+ /* OverwriteExisting= */ false);
break;
case ARM::FK_FP_ARMV8:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPARMv8A,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPARMv8A,
+ /* OverwriteExisting= */ false);
break;
// FPV5_D16 is identical to FP_ARMV8 except for the number of D registers, so
// uses the FP_ARMV8_D16 build attribute.
case ARM::FK_FPV5_SP_D16:
case ARM::FK_FPV5_D16:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPARMv8B,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPARMv8B,
+ /* OverwriteExisting= */ false);
break;
case ARM::FK_NEON:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPv3A,
- /* OverwriteExisting= */ false);
- setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
- ARMBuildAttrs::AllowNeon,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPv3A,
+ /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+ ARMBuildAttrs::AllowNeon,
+ /* OverwriteExisting= */ false);
break;
case ARM::FK_NEON_FP16:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPv3A,
- /* OverwriteExisting= */ false);
- setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
- ARMBuildAttrs::AllowNeon,
- /* OverwriteExisting= */ false);
- setAttributeItem(ARMBuildAttrs::FP_HP_extension,
- ARMBuildAttrs::AllowHPFP,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPv3A,
+ /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+ ARMBuildAttrs::AllowNeon,
+ /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP,
+ /* OverwriteExisting= */ false);
break;
case ARM::FK_NEON_VFPV4:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPv4A,
- /* OverwriteExisting= */ false);
- setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
- ARMBuildAttrs::AllowNeon2,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPv4A,
+ /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+ ARMBuildAttrs::AllowNeon2,
+ /* OverwriteExisting= */ false);
break;
case ARM::FK_NEON_FP_ARMV8:
case ARM::FK_CRYPTO_NEON_FP_ARMV8:
- setAttributeItem(ARMBuildAttrs::FP_arch,
- ARMBuildAttrs::AllowFPARMv8A,
- /* OverwriteExisting= */ false);
+ S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPARMv8A,
+ /* OverwriteExisting= */ false);
// 'Advanced_SIMD_arch' must be emitted not here, but within
// ARMAsmPrinter::emitAttributes(), depending on hasV8Ops() and hasV8_1a()
break;
@@ -1026,39 +935,8 @@
}
}
-size_t ARMTargetELFStreamer::calculateContentSize() const {
- size_t Result = 0;
- for (size_t i = 0; i < Contents.size(); ++i) {
- AttributeItem item = Contents[i];
- switch (item.Type) {
- case AttributeItem::HiddenAttribute:
- break;
- case AttributeItem::NumericAttribute:
- Result += getULEB128Size(item.Tag);
- Result += getULEB128Size(item.IntValue);
- break;
- case AttributeItem::TextAttribute:
- Result += getULEB128Size(item.Tag);
- Result += item.StringValue.size() + 1; // string + '\0'
- break;
- case AttributeItem::NumericAndTextAttributes:
- Result += getULEB128Size(item.Tag);
- Result += getULEB128Size(item.IntValue);
- Result += item.StringValue.size() + 1; // string + '\0';
- break;
- }
- }
- return Result;
-}
-
void ARMTargetELFStreamer::finishAttributeSection() {
- // <format-version>
- // [ <section-length> "vendor-name"
- // [ <file-tag> <size> <attribute>*
- // | <section-tag> <size> <section-number>* 0 <attribute>*
- // | <symbol-tag> <size> <symbol-number>* 0 <attribute>*
- // ]+
- // ]*
+ ARMELFStreamer &S = getStreamer();
if (FPU != ARM::FK_INVALID)
emitFPUDefaultAttributes();
@@ -1066,63 +944,30 @@
if (Arch != ARM::ArchKind::INVALID)
emitArchDefaultAttributes();
- if (Contents.empty())
+ if (S.Contents.empty())
return;
- llvm::sort(Contents, AttributeItem::LessTag);
+ auto LessTag = [](const MCELFStreamer::AttributeItem &LHS,
+ const MCELFStreamer::AttributeItem &RHS) -> bool {
+ // The conformance tag must be emitted first when serialised into an
+ // object file. Specifically, the addenda to the ARM ABI states that
+ // (2.3.7.4):
+ //
+ // "To simplify recognition by consumers in the common case of claiming
+ // conformity for the whole file, this tag should be emitted first in a
+ // file-scope sub-subsection of the first public subsection of the
+ // attributes section."
+ //
+ // So it is special-cased in this comparison predicate when the
+ // attributes are sorted in finishAttributeSection().
+ return (RHS.Tag != ARMBuildAttrs::conformance) &&
+ ((LHS.Tag == ARMBuildAttrs::conformance) || (LHS.Tag < RHS.Tag));
+ };
+ llvm::sort(S.Contents, LessTag);
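(Editor's illustration, not part of the patch.) The same ordering rule can be reproduced standalone with a stand-in Item struct and plain std::sort instead of MCELFStreamer::AttributeItem and llvm::sort, assuming the usual ARM build-attribute tag numbers (Tag_conformance = 67, Tag_CPU_name = 5, Tag_CPU_arch = 6):

#include <algorithm>
#include <cstdio>
#include <vector>

struct Item { unsigned Tag; };
constexpr unsigned Conformance = 67; // Tag_conformance

int main() {
  std::vector<Item> Items = {{6 /*CPU_arch*/}, {67 /*conformance*/}, {5 /*CPU_name*/}};
  auto LessTag = [](const Item &LHS, const Item &RHS) {
    // conformance sorts before everything else; otherwise ascending by tag.
    return (RHS.Tag != Conformance) &&
           ((LHS.Tag == Conformance) || (LHS.Tag < RHS.Tag));
  };
  std::sort(Items.begin(), Items.end(), LessTag);
  for (const Item &I : Items)
    printf("%u ", I.Tag); // prints: 67 5 6
  printf("\n");
  return 0;
}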
- ARMELFStreamer &Streamer = getStreamer();
+ S.emitAttributesSection(CurrentVendor, ".ARM.attributes",
+ ELF::SHT_ARM_ATTRIBUTES, AttributeSection);
- // Switch to .ARM.attributes section
- if (AttributeSection) {
- Streamer.SwitchSection(AttributeSection);
- } else {
- AttributeSection = Streamer.getContext().getELFSection(
- ".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0);
- Streamer.SwitchSection(AttributeSection);
-
- // Format version
- Streamer.emitInt8(0x41);
- }
-
- // Vendor size + Vendor name + '\0'
- const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
-
- // Tag + Tag Size
- const size_t TagHeaderSize = 1 + 4;
-
- const size_t ContentsSize = calculateContentSize();
-
- Streamer.emitInt32(VendorHeaderSize + TagHeaderSize + ContentsSize);
- Streamer.emitBytes(CurrentVendor);
- Streamer.emitInt8(0); // '\0'
-
- Streamer.emitInt8(ARMBuildAttrs::File);
- Streamer.emitInt32(TagHeaderSize + ContentsSize);
-
- // Size should have been accounted for already, now
- // emit each field as its type (ULEB or String)
- for (size_t i = 0; i < Contents.size(); ++i) {
- AttributeItem item = Contents[i];
- Streamer.emitULEB128IntValue(item.Tag);
- switch (item.Type) {
- default: llvm_unreachable("Invalid attribute type");
- case AttributeItem::NumericAttribute:
- Streamer.emitULEB128IntValue(item.IntValue);
- break;
- case AttributeItem::TextAttribute:
- Streamer.emitBytes(item.StringValue);
- Streamer.emitInt8(0); // '\0'
- break;
- case AttributeItem::NumericAndTextAttributes:
- Streamer.emitULEB128IntValue(item.IntValue);
- Streamer.emitBytes(item.StringValue);
- Streamer.emitInt8(0); // '\0'
- break;
- }
- }
-
- Contents.clear();
FPU = ARM::FK_INVALID;
}
@@ -1203,7 +1048,8 @@
if (Group)
Flags |= ELF::SHF_GROUP;
MCSectionELF *EHSection = getContext().getELFSection(
- EHSecName, Type, Flags, 0, Group, FnSection.getUniqueID(),
+ EHSecName, Type, Flags, 0, Group, /*IsComdat=*/true,
+ FnSection.getUniqueID(),
static_cast<const MCSymbolELF *>(FnSection.getBeginSymbol()));
assert(EHSection && "Failed to get the required EH section");
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
index bdf04a2..53258a8 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
@@ -36,6 +36,8 @@
// Equivalent to fixup_arm_pcrel_9, accounting for the short-swapped encoding
// of Thumb2 instructions.
fixup_t2_pcrel_9,
+ // 12-bit immediate value.
+ fixup_arm_ldst_abs_12,
// 10-bit PC relative relocation for symbol addresses where the lower 2 bits
// are not encoded (so it's encoded as an 8-bit immediate).
fixup_thumb_adr_pcrel_10,
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index 744d919..15bbc7d 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -17,6 +17,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -348,6 +349,20 @@
}
}
+void ARMInstPrinter::printOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpNum, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNum);
+ if (!Op.isImm() || !PrintBranchImmAsAddress || getUseMarkup())
+ return printOperand(MI, OpNum, STI, O);
+ uint64_t Target = ARM_MC::evaluateBranchTarget(MII.get(MI->getOpcode()),
+ Address, Op.getImm());
+ Target &= 0xffffffff;
+ O << formatHex(Target);
+ if (CommentStream)
+ *CommentStream << "imm = #" << formatImm(Op.getImm()) << '\n';
+}
+
void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -622,22 +637,6 @@
O << "]" << markup(">");
}
-void ARMInstPrinter::printMveAddrModeQOperand(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- const MCOperand &MO1 = MI->getOperand(OpNum);
- const MCOperand &MO2 = MI->getOperand(OpNum + 1);
-
- O << markup("<mem:") << "[";
- printRegName(O, MO1.getReg());
-
- int64_t Imm = MO2.getImm();
- if (Imm != 0)
- O << ", " << markup("<imm:") << '#' << Imm << markup(">");
-
- O << "]" << markup(">");
-}
-
void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -807,11 +806,11 @@
const MCSubtargetInfo &STI,
raw_ostream &O) {
if (MI->getOpcode() != ARM::t2CLRM) {
- assert(std::is_sorted(MI->begin() + OpNum, MI->end(),
- [&](const MCOperand &LHS, const MCOperand &RHS) {
- return MRI.getEncodingValue(LHS.getReg()) <
- MRI.getEncodingValue(RHS.getReg());
- }));
+ assert(is_sorted(drop_begin(*MI, OpNum),
+ [&](const MCOperand &LHS, const MCOperand &RHS) {
+ return MRI.getEncodingValue(LHS.getReg()) <
+ MRI.getEncodingValue(RHS.getReg());
+ }));
}
O << "{";
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
index d975d79..aab5e13 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
@@ -44,10 +44,8 @@
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- printOperand(MI, OpNum, STI, O);
- }
+ void printOperand(const MCInst *MI, uint64_t Address, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printSORegRegOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -274,8 +272,6 @@
template<int shift>
void printMveAddrModeRQOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printMveAddrModeQOperand(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI, raw_ostream &O);
void printMveSaturateOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
private:
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 1cb9953..ced48cc 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -576,9 +576,11 @@
}
} else if (MO.isImm()) {
return static_cast<unsigned>(MO.getImm());
- } else if (MO.isFPImm()) {
- return static_cast<unsigned>(APFloat(MO.getFPImm())
- .bitcastToAPInt().getHiBits(32).getLimitedValue());
+ } else if (MO.isDFPImm()) {
+ return static_cast<unsigned>(APFloat(bit_cast<double>(MO.getDFPImm()))
+ .bitcastToAPInt()
+ .getHiBits(32)
+ .getLimitedValue());
}
llvm_unreachable("Unable to encode MCOperand!");
@@ -975,41 +977,45 @@
// {17-13} = reg
// {12} = (U)nsigned (add == '1', sub == '0')
// {11-0} = imm12
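(Editor's illustration, not part of the patch.) A hypothetical standalone helper showing how the fields described above pack together; packAddrModeImm12 is not an LLVM API, and the INT32_MIN special case handled by the encoder below is omitted for brevity:

#include <cstdint>

uint32_t packAddrModeImm12(uint32_t RegEncoding, int32_t Offset) {
  bool IsAdd = Offset >= 0;                             // {12} = U bit
  uint32_t Imm12 = (IsAdd ? Offset : -Offset) & 0xfff;  // {11-0} = imm12
  uint32_t Binary = Imm12;
  if (IsAdd)
    Binary |= 1 << 12;
  Binary |= RegEncoding << 13;                          // {17-13} = Rn
  return Binary;
}
// Example: packAddrModeImm12(/*r3=*/3, -8) == (3 << 13) | 8  (U = 0, imm12 = 8)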
- unsigned Reg, Imm12;
+ unsigned Reg = 0, Imm12 = 0;
bool isAdd = true;
// If The first operand isn't a register, we have a label reference.
const MCOperand &MO = MI.getOperand(OpIdx);
- if (!MO.isReg()) {
- Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC.
- Imm12 = 0;
-
- if (MO.isExpr()) {
- const MCExpr *Expr = MO.getExpr();
- isAdd = false ; // 'U' bit is set as part of the fixup.
-
- MCFixupKind Kind;
- if (isThumb2(STI))
- Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12);
- else
- Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12);
- Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
-
- ++MCNumCPRelocations;
- } else {
- Reg = ARM::PC;
- int32_t Offset = MO.getImm();
- if (Offset == INT32_MIN) {
- Offset = 0;
- isAdd = false;
- } else if (Offset < 0) {
- Offset *= -1;
- isAdd = false;
- }
- Imm12 = Offset;
+ if (MO.isReg()) {
+ const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+ if (MO1.isImm()) {
+ isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups, STI);
+ } else if (MO1.isExpr()) {
+ assert(!isThumb(STI) && !isThumb2(STI) &&
+ "Thumb mode requires different encoding");
+ Reg = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+ isAdd = false; // 'U' bit is set as part of the fixup.
+ MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_ldst_abs_12);
+ Fixups.push_back(MCFixup::create(0, MO1.getExpr(), Kind, MI.getLoc()));
}
- } else
- isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups, STI);
+ } else if (MO.isExpr()) {
+ Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC.
+ isAdd = false; // 'U' bit is set as part of the fixup.
+ MCFixupKind Kind;
+ if (isThumb2(STI))
+ Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12);
+ else
+ Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12);
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+ ++MCNumCPRelocations;
+ } else {
+ Reg = ARM::PC;
+ int32_t Offset = MO.getImm();
+ if (Offset == INT32_MIN) {
+ Offset = 0;
+ isAdd = false;
+ } else if (Offset < 0) {
+ Offset *= -1;
+ isAdd = false;
+ }
+ Imm12 = Offset;
+ }
uint32_t Binary = Imm12 & 0xfff;
// Immediate is always encoded as positive. The 'U' bit controls add vs sub.
if (isAdd)
@@ -1730,11 +1736,11 @@
Binary |= NumRegs * 2;
} else {
const MCRegisterInfo &MRI = *CTX.getRegisterInfo();
- assert(std::is_sorted(MI.begin() + Op, MI.end(),
- [&](const MCOperand &LHS, const MCOperand &RHS) {
- return MRI.getEncodingValue(LHS.getReg()) <
+ assert(is_sorted(drop_begin(MI, Op),
+ [&](const MCOperand &LHS, const MCOperand &RHS) {
+ return MRI.getEncodingValue(LHS.getReg()) <
MRI.getEncodingValue(RHS.getReg());
- }));
+ }));
for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
unsigned RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg());
Binary |= 1 << RegNo;
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 774f250..87cce08 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -107,9 +107,8 @@
assert(MI.getNumOperands() >= 4 && "expected >= 4 arguments");
for (unsigned OI = 4, OE = MI.getNumOperands(); OI < OE; ++OI) {
assert(MI.getOperand(OI).isReg() && "expected register");
- if (MI.getOperand(OI).getReg() == ARM::SP ||
- MI.getOperand(OI).getReg() == ARM::PC) {
- Info = "use of SP or PC in the list is deprecated";
+ if (MI.getOperand(OI).getReg() == ARM::PC) {
+ Info = "use of PC in the list is deprecated";
return true;
}
}
@@ -134,9 +133,6 @@
case ARM::PC:
ListContainsPC = true;
break;
- case ARM::SP:
- Info = "use of SP in the list is deprecated";
- return true;
}
}
@@ -199,6 +195,24 @@
return false;
}
+uint64_t ARM_MC::evaluateBranchTarget(const MCInstrDesc &InstDesc,
+ uint64_t Addr, int64_t Imm) {
+ // For ARM instructions the PC offset is 8 bytes, for Thumb instructions it
+ // is 4 bytes.
+ uint64_t Offset =
+ ((InstDesc.TSFlags & ARMII::FormMask) == ARMII::ThumbFrm) ? 4 : 8;
+
+ // A Thumb instruction BLX(i) can be 16-bit aligned while it targets Arm code,
+ // which is 32-bit aligned. The target address in that case is calculated as
+ // targetAddress = Align(PC,4) + imm32;
+ // where
+ // Align(x, y) = y * (x DIV y);
+ if (InstDesc.getOpcode() == ARM::tBLXi)
+ Addr &= ~0x3;
+
+ return Addr + Imm + Offset;
+}
+
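(Editor's illustration, not part of the patch.) A minimal standalone sketch of the same arithmetic; branchTarget, IsThumb and IsBLXi are hypothetical stand-ins for the TSFlags and opcode checks above:

#include <cstdint>
#include <cstdio>

uint64_t branchTarget(uint64_t Addr, int64_t Imm, bool IsThumb, bool IsBLXi) {
  uint64_t Offset = IsThumb ? 4 : 8; // the PC reads ahead of the instruction
  if (IsBLXi)
    Addr &= ~UINT64_C(3);            // matches Align(PC, 4) for Thumb BLX(i)
  return Addr + Imm + Offset;
}

int main() {
  // Thumb BLX(i) at 0x8002 with imm32 = 0x100: Align(0x8002, 4) + 0x100 + 4 = 0x8104
  printf("0x%llx\n", (unsigned long long)branchTarget(0x8002, 0x100, true, true));
  // An ARM-mode branch to itself encodes imm = -8, since the PC is 8 bytes ahead
  printf("0x%llx\n", (unsigned long long)branchTarget(0x1000, -8, false, false));
  return 0;
}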
MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU);
@@ -412,55 +426,20 @@
return MCInstrAnalysis::isConditionalBranch(Inst);
}
- bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
- uint64_t Size, uint64_t &Target) const override {
- // We only handle PCRel branches for now.
- if (Inst.getNumOperands() == 0 ||
- Info->get(Inst.getOpcode()).OpInfo[0].OperandType !=
- MCOI::OPERAND_PCREL)
- return false;
-
- int64_t Imm = Inst.getOperand(0).getImm();
- Target = Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
- return true;
- }
-};
-
-class ThumbMCInstrAnalysis : public ARMMCInstrAnalysis {
-public:
- ThumbMCInstrAnalysis(const MCInstrInfo *Info) : ARMMCInstrAnalysis(Info) {}
-
bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
uint64_t &Target) const override {
- unsigned OpId;
- switch (Inst.getOpcode()) {
- default:
- OpId = 0;
- if (Inst.getNumOperands() == 0)
- return false;
- break;
- case ARM::MVE_WLSTP_8:
- case ARM::MVE_WLSTP_16:
- case ARM::MVE_WLSTP_32:
- case ARM::MVE_WLSTP_64:
- case ARM::t2WLS:
- case ARM::MVE_LETP:
- case ARM::t2LEUpdate:
- OpId = 2;
- break;
- case ARM::t2LE:
- OpId = 1;
- break;
+ const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+
+ // Find the PC-relative immediate operand in the instruction.
+ for (unsigned OpNum = 0; OpNum < Desc.getNumOperands(); ++OpNum) {
+ if (Inst.getOperand(OpNum).isImm() &&
+ Desc.OpInfo[OpNum].OperandType == MCOI::OPERAND_PCREL) {
+ int64_t Imm = Inst.getOperand(OpNum).getImm();
+ Target = ARM_MC::evaluateBranchTarget(Desc, Addr, Imm);
+ return true;
+ }
}
-
- // We only handle PCRel branches for now.
- if (Info->get(Inst.getOpcode()).OpInfo[OpId].OperandType !=
- MCOI::OPERAND_PCREL)
- return false;
-
- // In Thumb mode the PC is always off by 4 bytes.
- Target = Addr + Inst.getOperand(OpId).getImm() + 4;
- return true;
+ return false;
}
};
@@ -470,10 +449,6 @@
return new ARMMCInstrAnalysis(Info);
}
-static MCInstrAnalysis *createThumbMCInstrAnalysis(const MCInstrInfo *Info) {
- return new ThumbMCInstrAnalysis(Info);
-}
-
bool ARM::isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI) {
// Unfortunately we don't have ARMTargetInfo in the disassembler, so we have
// to rely on feature bits.
@@ -521,10 +496,9 @@
}
// Register the MC instruction analyzer.
- for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget()})
+ for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(),
+ &getTheThumbLETarget(), &getTheThumbBETarget()})
TargetRegistry::RegisterMCInstrAnalysis(*T, createARMMCInstrAnalysis);
- for (Target *T : {&getTheThumbLETarget(), &getTheThumbBETarget()})
- TargetRegistry::RegisterMCInstrAnalysis(*T, createThumbMCInstrAnalysis);
for (Target *T : {&getTheARMLETarget(), &getTheThumbLETarget()}) {
TargetRegistry::RegisterMCCodeEmitter(*T, createARMLEMCCodeEmitter);
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 5a0874f..7ccdc6f 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -57,6 +57,9 @@
return false;
}
+uint64_t evaluateBranchTarget(const MCInstrDesc &InstDesc, uint64_t Addr,
+ int64_t Imm);
+
/// Create a ARM MCSubtargetInfo instance. This is exposed so Asm parser, etc.
/// do not need to go through TargetRegistry.
MCSubtargetInfo *createARMMCSubtargetInfo(const Triple &TT, StringRef CPU,
@@ -106,7 +109,7 @@
/// Construct an ARM PE/COFF object writer.
std::unique_ptr<MCObjectTargetWriter>
-createARMWinCOFFObjectWriter(bool Is64Bit);
+createARMWinCOFFObjectWriter();
/// Construct ARM Mach-O relocation info.
MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx);
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index 900c5fe..31a8149 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -10,6 +10,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCFixupKindInfo.h"
@@ -26,9 +27,8 @@
class ARMWinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
public:
- ARMWinCOFFObjectWriter(bool Is64Bit)
+ ARMWinCOFFObjectWriter()
: MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARMNT) {
- assert(!Is64Bit && "AArch64 support not yet implemented");
}
~ARMWinCOFFObjectWriter() override = default;
@@ -47,13 +47,20 @@
const MCFixup &Fixup,
bool IsCrossSection,
const MCAsmBackend &MAB) const {
- assert(getMachine() == COFF::IMAGE_FILE_MACHINE_ARMNT &&
- "AArch64 support not yet implemented");
-
MCSymbolRefExpr::VariantKind Modifier =
Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
- switch (static_cast<unsigned>(Fixup.getKind())) {
+ unsigned FixupKind = Fixup.getKind();
+ if (IsCrossSection) {
+ if (FixupKind != FK_Data_4) {
+ Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression");
+ return COFF::IMAGE_REL_ARM_ADDR32;
+ }
+ FixupKind = FK_PCRel_4;
+ }
+
+ switch (FixupKind) {
default: {
const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind());
report_fatal_error(Twine("unsupported relocation type: ") + Info.Name);
@@ -67,6 +74,8 @@
default:
return COFF::IMAGE_REL_ARM_ADDR32;
}
+ case FK_PCRel_4:
+ return COFF::IMAGE_REL_ARM_REL32;
case FK_SecRel_2:
return COFF::IMAGE_REL_ARM_SECTION;
case FK_SecRel_4:
@@ -91,8 +100,8 @@
namespace llvm {
std::unique_ptr<MCObjectTargetWriter>
-createARMWinCOFFObjectWriter(bool Is64Bit) {
- return std::make_unique<ARMWinCOFFObjectWriter>(Is64Bit);
+createARMWinCOFFObjectWriter() {
+ return std::make_unique<ARMWinCOFFObjectWriter>();
}
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/src/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 5682373..4981b80 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -17,6 +17,7 @@
#include "ARMSubtarget.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -81,11 +82,17 @@
Align Alignment);
// Check whether Ptr is hidden behind a bitcast and look through it
void lookThroughBitcast(Value *&Ptr);
+ // Decompose a ptr into Base and Offsets, potentially using a GEP to return a
+ // scalar base and vector offsets, or else fall back to using a base of 0 and
+ // offset of Ptr where possible.
+ Value *decomposePtr(Value *Ptr, Value *&Offsets, int &Scale,
+ FixedVectorType *Ty, Type *MemoryTy,
+ IRBuilder<> &Builder);
// Check for a getelementptr and deduce base and offsets from it, on success
// returning the base directly and the offsets indirectly using the Offsets
// argument
- Value *checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP,
- IRBuilder<> &Builder);
+ Value *decomposeGEP(Value *&Offsets, FixedVectorType *Ty,
+ GetElementPtrInst *GEP, IRBuilder<> &Builder);
// Compute the scale of this gather/scatter instruction
int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
// If the value is a constant, or derived from constants via additions
@@ -96,42 +103,43 @@
// the other summand.
std::pair<Value *, int64_t> getVarAndConst(Value *Inst, int TypeScale);
- Value *lowerGather(IntrinsicInst *I);
+ Instruction *lowerGather(IntrinsicInst *I);
// Create a gather from a base + vector of offsets
- Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr,
- Instruction *&Root, IRBuilder<> &Builder);
+ Instruction *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr,
+ Instruction *&Root,
+ IRBuilder<> &Builder);
// Create a gather from a vector of pointers
- Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
- IRBuilder<> &Builder, int64_t Increment = 0);
+ Instruction *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> &Builder,
+ int64_t Increment = 0);
// Create an incrementing gather from a vector of pointers
- Value *tryCreateMaskedGatherBaseWB(IntrinsicInst *I, Value *Ptr,
- IRBuilder<> &Builder,
- int64_t Increment = 0);
+ Instruction *tryCreateMaskedGatherBaseWB(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> &Builder,
+ int64_t Increment = 0);
- Value *lowerScatter(IntrinsicInst *I);
+ Instruction *lowerScatter(IntrinsicInst *I);
// Create a scatter to a base + vector of offsets
- Value *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Offsets,
- IRBuilder<> &Builder);
+ Instruction *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Offsets,
+ IRBuilder<> &Builder);
// Create a scatter to a vector of pointers
- Value *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr,
- IRBuilder<> &Builder,
- int64_t Increment = 0);
+ Instruction *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> &Builder,
+ int64_t Increment = 0);
// Create an incrementing scatter from a vector of pointers
- Value *tryCreateMaskedScatterBaseWB(IntrinsicInst *I, Value *Ptr,
- IRBuilder<> &Builder,
- int64_t Increment = 0);
+ Instruction *tryCreateMaskedScatterBaseWB(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> &Builder,
+ int64_t Increment = 0);
// QI gathers and scatters can increment their offsets on their own if
// the increment is a constant value (digit)
- Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *BasePtr,
- Value *Ptr, GetElementPtrInst *GEP,
- IRBuilder<> &Builder);
+ Instruction *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> &Builder);
// QI gathers/scatters can increment their offsets on their own if the
// increment is a constant value (digit) - this creates a writeback QI
// gather/scatter
- Value *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr,
- Value *Ptr, unsigned TypeScale,
- IRBuilder<> &Builder);
+ Instruction *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr,
+ Value *Ptr, unsigned TypeScale,
+ IRBuilder<> &Builder);
// Optimise the base and offsets of the given address
bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI);
@@ -214,12 +222,40 @@
return true;
}
-Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty,
- GetElementPtrInst *GEP,
- IRBuilder<> &Builder) {
+Value *MVEGatherScatterLowering::decomposePtr(Value *Ptr, Value *&Offsets,
+ int &Scale, FixedVectorType *Ty,
+ Type *MemoryTy,
+ IRBuilder<> &Builder) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (Value *V = decomposeGEP(Offsets, Ty, GEP, Builder)) {
+ Scale =
+ computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
+ MemoryTy->getScalarSizeInBits());
+ return Scale == -1 ? nullptr : V;
+ }
+ }
+
+ // If we couldn't use the GEP (or it doesn't exist), attempt to use a
+ // BasePtr of 0 with Ptr as the Offsets, so long as there are only 4
+ // elements.
+ FixedVectorType *PtrTy = cast<FixedVectorType>(Ptr->getType());
+ if (PtrTy->getNumElements() != 4 || MemoryTy->getScalarSizeInBits() == 32)
+ return nullptr;
+ Value *Zero = ConstantInt::get(Builder.getInt32Ty(), 0);
+ Value *BasePtr = Builder.CreateIntToPtr(Zero, Builder.getInt8PtrTy());
+ Offsets = Builder.CreatePtrToInt(
+ Ptr, FixedVectorType::get(Builder.getInt32Ty(), 4));
+ Scale = 0;
+ return BasePtr;
+}
+
+Value *MVEGatherScatterLowering::decomposeGEP(Value *&Offsets,
+ FixedVectorType *Ty,
+ GetElementPtrInst *GEP,
+ IRBuilder<> &Builder) {
if (!GEP) {
- LLVM_DEBUG(
- dbgs() << "masked gathers/scatters: no getelementpointer found\n");
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: no getelementpointer "
+ << "found\n");
return nullptr;
}
LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found."
@@ -275,8 +311,8 @@
auto *BCTy = cast<FixedVectorType>(BitCast->getType());
auto *BCSrcTy = cast<FixedVectorType>(BitCast->getOperand(0)->getType());
if (BCTy->getNumElements() == BCSrcTy->getNumElements()) {
- LLVM_DEBUG(
- dbgs() << "masked gathers/scatters: looking through bitcast\n");
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: looking through "
+ << "bitcast\n");
Ptr = BitCast->getOperand(0);
}
}
@@ -347,9 +383,10 @@
return std::pair<Value *, int64_t>(Summand, Immediate);
}
-Value *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
+Instruction *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
using namespace PatternMatch;
- LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n");
+ LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n"
+ << *I << "\n");
// @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0)
// Attempt to turn the masked gather in I into a MVE intrinsic
@@ -371,7 +408,10 @@
Builder.SetCurrentDebugLocation(I->getDebugLoc());
Instruction *Root = I;
- Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
+
+ Instruction *Load = tryCreateIncrementingGatScat(I, Ptr, Builder);
+ if (!Load)
+ Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
if (!Load)
Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
if (!Load)
@@ -380,7 +420,8 @@
if (!isa<UndefValue>(PassThru) && !match(PassThru, m_Zero())) {
LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - "
<< "creating select\n");
- Load = Builder.CreateSelect(Mask, Load, PassThru);
+ Load = SelectInst::Create(Mask, Load, PassThru);
+ Builder.Insert(Load);
}
Root->replaceAllUsesWith(Load);
@@ -390,14 +431,13 @@
// sext/zext as well as of the gather itself
I->eraseFromParent();
- LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n");
+ LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n"
+ << *Load << "\n");
return Load;
}
-Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(IntrinsicInst *I,
- Value *Ptr,
- IRBuilder<> &Builder,
- int64_t Increment) {
+Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
using namespace PatternMatch;
auto *Ty = cast<FixedVectorType>(I->getType());
LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
@@ -416,13 +456,12 @@
{Ptr, Builder.getInt32(Increment), Mask});
}
-Value *MVEGatherScatterLowering::tryCreateMaskedGatherBaseWB(
+Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherBaseWB(
IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
using namespace PatternMatch;
auto *Ty = cast<FixedVectorType>(I->getType());
- LLVM_DEBUG(
- dbgs()
- << "masked gathers: loading from vector of pointers with writeback\n");
+ LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers with "
+ << "writeback\n");
if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
// Can't build an intrinsic for this
return nullptr;
@@ -438,79 +477,93 @@
{Ptr, Builder.getInt32(Increment), Mask});
}
-Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
+Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
IntrinsicInst *I, Value *Ptr, Instruction *&Root, IRBuilder<> &Builder) {
using namespace PatternMatch;
- Type *OriginalTy = I->getType();
- Type *ResultTy = OriginalTy;
+ Type *MemoryTy = I->getType();
+ Type *ResultTy = MemoryTy;
unsigned Unsigned = 1;
// The size of the gather was already checked in isLegalTypeAndAlignment;
// if it was not a full vector width an appropriate extend should follow.
auto *Extend = Root;
- if (OriginalTy->getPrimitiveSizeInBits() < 128) {
- // Only transform gathers with exactly one use
- if (!I->hasOneUse())
- return nullptr;
-
- // The correct root to replace is not the CallInst itself, but the
- // instruction which extends it
- Extend = cast<Instruction>(*I->users().begin());
- if (isa<SExtInst>(Extend)) {
- Unsigned = 0;
- } else if (!isa<ZExtInst>(Extend)) {
- LLVM_DEBUG(dbgs() << "masked gathers: extend needed but not provided. "
- << "Expanding\n");
- return nullptr;
+ bool TruncResult = false;
+ if (MemoryTy->getPrimitiveSizeInBits() < 128) {
+ if (I->hasOneUse()) {
+ // If the gather has a single extend of the correct type, use an extending
+ // gather and replace the ext, in which case the correct root to replace
+ // is not the CallInst itself, but the instruction which extends it.
+ Instruction *User = cast<Instruction>(*I->users().begin());
+ if (isa<SExtInst>(User) &&
+ User->getType()->getPrimitiveSizeInBits() == 128) {
+ LLVM_DEBUG(dbgs() << "masked gathers: Incorporating extend: "
+ << *User << "\n");
+ Extend = User;
+ ResultTy = User->getType();
+ Unsigned = 0;
+ } else if (isa<ZExtInst>(User) &&
+ User->getType()->getPrimitiveSizeInBits() == 128) {
+ LLVM_DEBUG(dbgs() << "masked gathers: Incorporating extend: "
+ << *ResultTy << "\n");
+ Extend = User;
+ ResultTy = User->getType();
+ }
}
- LLVM_DEBUG(dbgs() << "masked gathers: found an extending gather\n");
- ResultTy = Extend->getType();
+
+ // If an extend hasn't been found and the type is an integer, create an
+ // extending gather and truncate back to the original type.
+ if (ResultTy->getPrimitiveSizeInBits() < 128 &&
+ ResultTy->isIntOrIntVectorTy()) {
+ ResultTy = ResultTy->getWithNewBitWidth(
+ 128 / cast<FixedVectorType>(ResultTy)->getNumElements());
+ TruncResult = true;
+ LLVM_DEBUG(dbgs() << "masked gathers: Small input type, truncing to: "
+ << *ResultTy << "\n");
+ }
+
// The final size of the gather must be a full vector width
if (ResultTy->getPrimitiveSizeInBits() != 128) {
- LLVM_DEBUG(dbgs() << "masked gathers: extending from the wrong type. "
- << "Expanding\n");
+ LLVM_DEBUG(dbgs() << "masked gathers: Extend needed but not provided "
+ "from the correct type. Expanding\n");
return nullptr;
}
}
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
Value *Offsets;
- Value *BasePtr =
- checkGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
+ int Scale;
+ Value *BasePtr = decomposePtr(
+ Ptr, Offsets, Scale, cast<FixedVectorType>(ResultTy), MemoryTy, Builder);
if (!BasePtr)
return nullptr;
- // Check whether the offset is a constant increment that could be merged into
- // a QI gather
- Value *Load = tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
- if (Load)
- return Load;
- int Scale = computeScale(
- BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
- OriginalTy->getScalarSizeInBits());
- if (Scale == -1)
- return nullptr;
Root = Extend;
-
Value *Mask = I->getArgOperand(2);
+ Instruction *Load = nullptr;
if (!match(Mask, m_One()))
- return Builder.CreateIntrinsic(
+ Load = Builder.CreateIntrinsic(
Intrinsic::arm_mve_vldr_gather_offset_predicated,
{ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()},
- {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+ {BasePtr, Offsets, Builder.getInt32(MemoryTy->getScalarSizeInBits()),
Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask});
else
- return Builder.CreateIntrinsic(
+ Load = Builder.CreateIntrinsic(
Intrinsic::arm_mve_vldr_gather_offset,
{ResultTy, BasePtr->getType(), Offsets->getType()},
- {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+ {BasePtr, Offsets, Builder.getInt32(MemoryTy->getScalarSizeInBits()),
Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
+
+ if (TruncResult) {
+ Load = TruncInst::Create(Instruction::Trunc, Load, MemoryTy);
+ Builder.Insert(Load);
+ }
+ return Load;
}
-Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
+Instruction *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
using namespace PatternMatch;
- LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
+ LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n"
+ << *I << "\n");
// @llvm.masked.scatter.*(data, ptrs, alignment, mask)
// Attempt to turn the masked scatter in I into a MVE intrinsic
@@ -531,18 +584,21 @@
Builder.SetInsertPoint(I);
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
+ Instruction *Store = tryCreateIncrementingGatScat(I, Ptr, Builder);
+ if (!Store)
+ Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
if (!Store)
Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
if (!Store)
return nullptr;
- LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n");
+ LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n"
+ << *Store << "\n");
I->eraseFromParent();
return Store;
}
-Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
+Instruction *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
using namespace PatternMatch;
Value *Input = I->getArgOperand(0);
@@ -566,14 +622,13 @@
{Ptr, Builder.getInt32(Increment), Input, Mask});
}
-Value *MVEGatherScatterLowering::tryCreateMaskedScatterBaseWB(
+Instruction *MVEGatherScatterLowering::tryCreateMaskedScatterBaseWB(
IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
using namespace PatternMatch;
Value *Input = I->getArgOperand(0);
auto *Ty = cast<FixedVectorType>(Input->getType());
- LLVM_DEBUG(
- dbgs()
- << "masked scatters: storing to a vector of pointers with writeback\n");
+ LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers "
+ << "with writeback\n");
if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
// Can't build an intrinsic for this
return nullptr;
@@ -589,13 +644,14 @@
{Ptr, Builder.getInt32(Increment), Input, Mask});
}
-Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
+Instruction *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
using namespace PatternMatch;
Value *Input = I->getArgOperand(0);
Value *Mask = I->getArgOperand(3);
Type *InputTy = Input->getType();
Type *MemoryTy = InputTy;
+
LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
<< " to base + vector of offsets\n");
// If the input has been truncated, try to integrate that trunc into the
@@ -608,31 +664,34 @@
InputTy = PreTruncTy;
}
}
+ bool ExtendInput = false;
+ if (InputTy->getPrimitiveSizeInBits() < 128 &&
+ InputTy->isIntOrIntVectorTy()) {
+ // If we can't find a trunc to incorporate into the instruction, create an
+ // implicit one with a zext, so that we can still create a scatter. We know
+ // that the input type is 4x/8x/16x and of type i8/i16/i32, so any type
+ // smaller than 128 bits will divide evenly into a 128-bit vector.
+ InputTy = InputTy->getWithNewBitWidth(
+ 128 / cast<FixedVectorType>(InputTy)->getNumElements());
+ ExtendInput = true;
+ LLVM_DEBUG(dbgs() << "masked scatters: Small input type, will extend:\n"
+ << *Input << "\n");
+ }
if (InputTy->getPrimitiveSizeInBits() != 128) {
- LLVM_DEBUG(
- dbgs() << "masked scatters: cannot create scatters for non-standard"
- << " input types. Expanding.\n");
+ LLVM_DEBUG(dbgs() << "masked scatters: cannot create scatters for "
+ "non-standard input types. Expanding.\n");
return nullptr;
}
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
Value *Offsets;
- Value *BasePtr =
- checkGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
+ int Scale;
+ Value *BasePtr = decomposePtr(
+ Ptr, Offsets, Scale, cast<FixedVectorType>(InputTy), MemoryTy, Builder);
if (!BasePtr)
return nullptr;
- // Check whether the offset is a constant increment that could be merged into
- // a QI gather
- Value *Store =
- tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
- if (Store)
- return Store;
- int Scale = computeScale(
- BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
- MemoryTy->getScalarSizeInBits());
- if (Scale == -1)
- return nullptr;
+ if (ExtendInput)
+ Input = Builder.CreateZExt(Input, InputTy);
if (!match(Mask, m_One()))
return Builder.CreateIntrinsic(
Intrinsic::arm_mve_vstr_scatter_offset_predicated,
@@ -650,22 +709,29 @@
Builder.getInt32(Scale)});
}
-Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
- IntrinsicInst *I, Value *BasePtr, Value *Offsets, GetElementPtrInst *GEP,
- IRBuilder<> &Builder) {
+Instruction *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
FixedVectorType *Ty;
if (I->getIntrinsicID() == Intrinsic::masked_gather)
Ty = cast<FixedVectorType>(I->getType());
else
Ty = cast<FixedVectorType>(I->getArgOperand(0)->getType());
+
// Incrementing gathers only exist for v4i32
- if (Ty->getNumElements() != 4 ||
- Ty->getScalarSizeInBits() != 32)
+ if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
return nullptr;
+ // Incrementing gathers are not beneficial outside of a loop
Loop *L = LI->getLoopFor(I->getParent());
if (L == nullptr)
- // Incrementing gathers are not beneficial outside of a loop
return nullptr;
+
+ // Decompose the GEP into Base and Offsets
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ Value *Offsets;
+ Value *BasePtr = decomposeGEP(Offsets, Ty, GEP, Builder);
+ if (!BasePtr)
+ return nullptr;
+
LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
"wb gather/scatter\n");
@@ -683,11 +749,11 @@
// Only in this case do we want to build a wb gather, because the wb will
// change the phi which does affect other users of the gep (which will still
// be using the phi in the old way)
- Value *Load =
- tryCreateIncrementingWBGatScat(I, BasePtr, Offsets, TypeScale, Builder);
- if (Load != nullptr)
+ if (auto *Load = tryCreateIncrementingWBGatScat(I, BasePtr, Offsets,
+ TypeScale, Builder))
return Load;
}
+
LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
"non-wb gather/scatter\n");
@@ -713,14 +779,12 @@
"StartIndex", I);
if (I->getIntrinsicID() == Intrinsic::masked_gather)
- return cast<IntrinsicInst>(
- tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate));
+ return tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate);
else
- return cast<IntrinsicInst>(
- tryCreateMaskedScatterBase(I, OffsetsIncoming, Builder, Immediate));
+ return tryCreateMaskedScatterBase(I, OffsetsIncoming, Builder, Immediate);
}
-Value *MVEGatherScatterLowering::tryCreateIncrementingWBGatScat(
+Instruction *MVEGatherScatterLowering::tryCreateIncrementingWBGatScat(
IntrinsicInst *I, Value *BasePtr, Value *Offsets, unsigned TypeScale,
IRBuilder<> &Builder) {
// Check whether this gather's offset is incremented by a constant - if so,
@@ -780,19 +844,21 @@
Builder.SetInsertPoint(I);
- Value *EndResult;
- Value *NewInduction;
+ Instruction *EndResult;
+ Instruction *NewInduction;
if (I->getIntrinsicID() == Intrinsic::masked_gather) {
// Build the incrementing gather
Value *Load = tryCreateMaskedGatherBaseWB(I, Phi, Builder, Immediate);
// One value to be handed to whoever uses the gather, one is the loop
// increment
- EndResult = Builder.CreateExtractValue(Load, 0, "Gather");
- NewInduction = Builder.CreateExtractValue(Load, 1, "GatherIncrement");
+ EndResult = ExtractValueInst::Create(Load, 0, "Gather");
+ NewInduction = ExtractValueInst::Create(Load, 1, "GatherIncrement");
+ Builder.Insert(EndResult);
+ Builder.Insert(NewInduction);
} else {
// Build the incrementing scatter
- NewInduction = tryCreateMaskedScatterBaseWB(I, Phi, Builder, Immediate);
- EndResult = NewInduction;
+ EndResult = NewInduction =
+ tryCreateMaskedScatterBaseWB(I, Phi, Builder, Immediate);
}
Instruction *AddInst = cast<Instruction>(Offsets);
AddInst->replaceAllUsesWith(NewInduction);
@@ -882,7 +948,8 @@
bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
LoopInfo *LI) {
- LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to optimize\n");
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to optimize\n"
+ << *Offsets << "\n");
// Optimise the addresses of gathers/scatters by moving invariant
// calculations out of the loop
if (!isa<Instruction>(Offsets))
@@ -910,52 +977,38 @@
Phi = cast<PHINode>(Offs->getOperand(1));
OffsSecondOp = 0;
} else {
- bool Changed = true;
+ bool Changed = false;
if (isa<Instruction>(Offs->getOperand(0)) &&
L->contains(cast<Instruction>(Offs->getOperand(0))))
Changed |= optimiseOffsets(Offs->getOperand(0), BB, LI);
if (isa<Instruction>(Offs->getOperand(1)) &&
L->contains(cast<Instruction>(Offs->getOperand(1))))
Changed |= optimiseOffsets(Offs->getOperand(1), BB, LI);
- if (!Changed) {
+ if (!Changed)
return false;
+ if (isa<PHINode>(Offs->getOperand(0))) {
+ Phi = cast<PHINode>(Offs->getOperand(0));
+ OffsSecondOp = 1;
+ } else if (isa<PHINode>(Offs->getOperand(1))) {
+ Phi = cast<PHINode>(Offs->getOperand(1));
+ OffsSecondOp = 0;
} else {
- if (isa<PHINode>(Offs->getOperand(0))) {
- Phi = cast<PHINode>(Offs->getOperand(0));
- OffsSecondOp = 1;
- } else if (isa<PHINode>(Offs->getOperand(1))) {
- Phi = cast<PHINode>(Offs->getOperand(1));
- OffsSecondOp = 0;
- } else {
- return false;
- }
+ return false;
}
}
// A phi node we want to perform this function on should be from the
- // loop header, and shouldn't have more than 2 incoming values
- if (Phi->getParent() != L->getHeader() ||
- Phi->getNumIncomingValues() != 2)
+ // loop header.
+ if (Phi->getParent() != L->getHeader())
return false;
- // The phi must be an induction variable
- int IncrementingBlock = -1;
-
- for (int i = 0; i < 2; i++)
- if (auto *Op = dyn_cast<Instruction>(Phi->getIncomingValue(i)))
- if (Op->getOpcode() == Instruction::Add &&
- (Op->getOperand(0) == Phi || Op->getOperand(1) == Phi))
- IncrementingBlock = i;
- if (IncrementingBlock == -1)
+ // We're looking for a simple add recurrence.
+ BinaryOperator *IncInstruction;
+ Value *Start, *IncrementPerRound;
+ if (!matchSimpleRecurrence(Phi, IncInstruction, Start, IncrementPerRound) ||
+ IncInstruction->getOpcode() != Instruction::Add)
return false;
- Instruction *IncInstruction =
- cast<Instruction>(Phi->getIncomingValue(IncrementingBlock));
-
- // If the phi is not used by anything else, we can just adapt it when
- // replacing the instruction; if it is, we'll have to duplicate it
- PHINode *NewPhi;
- Value *IncrementPerRound = IncInstruction->getOperand(
- (IncInstruction->getOperand(0) == Phi) ? 1 : 0);
+ int IncrementingBlock = Phi->getIncomingValue(0) == IncInstruction ? 0 : 1;
// Get the value that is added to/multiplied with the phi
Value *OffsSecondOperand = Offs->getOperand(OffsSecondOp);
@@ -972,6 +1025,9 @@
!L->contains(cast<Instruction>(IncrementPerRound))))
return false;
+ // If the phi is not used by anything else, we can just adapt it when
+ // replacing the instruction; if it is, we'll have to duplicate it
+ PHINode *NewPhi;
if (Phi->getNumUses() == 2) {
// No other users -> reuse existing phi (One user is the instruction
// we're looking at, the other is the phi increment)
@@ -986,8 +1042,7 @@
NewPhi = Phi;
} else {
// There are other users -> create a new phi
- NewPhi = PHINode::Create(Phi->getType(), 0, "NewPhi", Phi);
- std::vector<Value *> Increases;
+ NewPhi = PHINode::Create(Phi->getType(), 2, "NewPhi", Phi);
// Copy the incoming values of the old phi
NewPhi->addIncoming(Phi->getIncomingValue(IncrementingBlock == 1 ? 0 : 1),
Phi->getIncomingBlock(IncrementingBlock == 1 ? 0 : 1));
@@ -1014,8 +1069,8 @@
default:
return false;
}
- LLVM_DEBUG(
- dbgs() << "masked gathers/scatters: simplified loop variable add/mul\n");
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: simplified loop variable "
+ << "add/mul\n");
// The instruction has now been "absorbed" into the phi value
Offs->replaceAllUsesWith(NewPhi);
@@ -1139,9 +1194,8 @@
// (always i32 if it is not of vector type) and the base has to be a
// pointer.
if (Offsets && Base && Base != GEP) {
- PointerType *BaseType = cast<PointerType>(Base->getType());
GetElementPtrInst *NewAddress = GetElementPtrInst::Create(
- BaseType->getPointerElementType(), Base, Offsets, "gep.merged", GEP);
+ GEP->getSourceElementType(), Base, Offsets, "gep.merged", GEP);
GEP->replaceAllUsesWith(NewAddress);
GEP = NewAddress;
Changed = true;
@@ -1183,23 +1237,23 @@
}
for (unsigned i = 0; i < Gathers.size(); i++) {
IntrinsicInst *I = Gathers[i];
- Value *L = lowerGather(I);
+ Instruction *L = lowerGather(I);
if (L == nullptr)
continue;
// Get rid of any now dead instructions
- SimplifyInstructionsInBlock(cast<Instruction>(L)->getParent());
+ SimplifyInstructionsInBlock(L->getParent());
Changed = true;
}
for (unsigned i = 0; i < Scatters.size(); i++) {
IntrinsicInst *I = Scatters[i];
- Value *S = lowerScatter(I);
+ Instruction *S = lowerScatter(I);
if (S == nullptr)
continue;
// Get rid of any now dead instructions
- SimplifyInstructionsInBlock(cast<Instruction>(S)->getParent());
+ SimplifyInstructionsInBlock(S->getParent());
Changed = true;
}
return Changed;
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp b/src/llvm-project/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
new file mode 100644
index 0000000..538bd10
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
@@ -0,0 +1,390 @@
+//===- MVELaneInterleaving.cpp - Interleave for MVE instructions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass interleaves around sext/zext/trunc instructions. MVE does not have
+// a single sext/zext or trunc instruction that takes the bottom half of a
+// vector and extends to a full width, like NEON has with MOVL. Instead it is
+// expected that this happens through top/bottom instructions. So the MVE
+// equivalent VMOVLT/B instructions take either the even or odd elements of the
+// input and extend them to the larger type, producing a vector with half the
+// number of elements, each of double the bitwidth. As there is no simple
+// instruction, we often have to turn sext/zext/trunc into a series of lane
+// moves (or stack loads/stores, which we do not do yet).
+//
+// This pass takes vector code that starts at truncs, looks for interconnected
+// blobs of operations that end with sext/zext (or constants/splats) of the
+// form:
+// %sa = sext v8i16 %a to v8i32
+// %sb = sext v8i16 %b to v8i32
+// %add = add v8i32 %sa, %sb
+// %r = trunc %add to v8i16
+// And adds shuffles to allow the use of VMOVL/VMOVN instructions:
+// %sha = shuffle v8i16 %a, undef, <0, 2, 4, 6, 1, 3, 5, 7>
+// %sa = sext v8i16 %sha to v8i32
+// %shb = shuffle v8i16 %b, undef, <0, 2, 4, 6, 1, 3, 5, 7>
+// %sb = sext v8i16 %shb to v8i32
+// %add = add v8i32 %sa, %sb
+// %r = trunc %add to v8i16
+// %shr = shuffle v8i16 %r, undef, <0, 4, 1, 5, 2, 6, 3, 7>
+// Which can then be split and lowered to MVE instructions efficiently:
+// %sa_b = VMOVLB.s16 %a
+// %sa_t = VMOVLT.s16 %a
+// %sb_b = VMOVLB.s16 %b
+// %sb_t = VMOVLT.s16 %b
+// %add_b = VADD.i32 %sa_b, %sb_b
+// %add_t = VADD.i32 %sa_t, %sb_t
+// %r = VMOVNT.i16 %add_b, %add_t
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mve-laneinterleave"
+
+cl::opt<bool> EnableInterleave(
+ "enable-mve-interleave", cl::Hidden, cl::init(true),
+ cl::desc("Enable interleave MVE vector operation lowering"));
+
+namespace {
+
+class MVELaneInterleaving : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ explicit MVELaneInterleaving() : FunctionPass(ID) {
+ initializeMVELaneInterleavingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override { return "MVE lane interleaving"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetPassConfig>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char MVELaneInterleaving::ID = 0;
+
+INITIALIZE_PASS(MVELaneInterleaving, DEBUG_TYPE, "MVE lane interleaving", false,
+ false)
+
+Pass *llvm::createMVELaneInterleavingPass() {
+ return new MVELaneInterleaving();
+}
+
+static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
+ SmallSetVector<Instruction *, 4> &Truncs) {
+ // This is not always beneficial to transform. Exts can be incorporated into
+ // loads, Truncs can be folded into stores.
+ // Truncs are usually the same number of instructions,
+ // VSTRH.32(A);VSTRH.32(B) vs VSTRH.16(VMOVNT A, B) with interleaving
+ // Exts are unfortunately more instructions in the general case:
+ // A=VLDRH.32; B=VLDRH.32;
+ // vs with interleaving:
+ // T=VLDRH.16; A=VMOVLB T; B=VMOVLT T
+ // But those VMOVL may be folded into a VMULL.
+
+ // But expensive extends/truncs are always good to remove. FPExts always
+ // involve extra VCVT's so are always considered to be beneficial to convert.
+ for (auto *E : Exts) {
+ if (isa<FPExtInst>(E) || !isa<LoadInst>(E->getOperand(0))) {
+ LLVM_DEBUG(dbgs() << "Beneficial due to " << *E << "\n");
+ return true;
+ }
+ }
+ for (auto *T : Truncs) {
+ if (T->hasOneUse() && !isa<StoreInst>(*T->user_begin())) {
+ LLVM_DEBUG(dbgs() << "Beneficial due to " << *T << "\n");
+ return true;
+ }
+ }
+
+ // Otherwise, we know we have a load(ext), see if any of the Extends are a
+ // vmull. This is a simple heuristic and certainly not perfect.
+ for (auto *E : Exts) {
+ if (!E->hasOneUse() ||
+ cast<Instruction>(*E->user_begin())->getOpcode() != Instruction::Mul) {
+ LLVM_DEBUG(dbgs() << "Not beneficial due to " << *E << "\n");
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool tryInterleave(Instruction *Start,
+ SmallPtrSetImpl<Instruction *> &Visited) {
+ LLVM_DEBUG(dbgs() << "tryInterleave from " << *Start << "\n");
+ auto *VT = cast<FixedVectorType>(Start->getType());
+
+ if (!isa<Instruction>(Start->getOperand(0)))
+ return false;
+
+ // Look for connected operations starting from Ext's, terminating at Truncs.
+ std::vector<Instruction *> Worklist;
+ Worklist.push_back(Start);
+ Worklist.push_back(cast<Instruction>(Start->getOperand(0)));
+
+ SmallSetVector<Instruction *, 4> Truncs;
+ SmallSetVector<Instruction *, 4> Exts;
+ SmallSetVector<Use *, 4> OtherLeafs;
+ SmallSetVector<Instruction *, 4> Ops;
+
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+
+ switch (I->getOpcode()) {
+ // Truncs
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ if (Truncs.count(I))
+ continue;
+ Truncs.insert(I);
+ Visited.insert(I);
+ break;
+
+ // Extend leafs
+ case Instruction::SExt:
+ case Instruction::ZExt:
+ case Instruction::FPExt:
+ if (Exts.count(I))
+ continue;
+ for (auto *Use : I->users())
+ Worklist.push_back(cast<Instruction>(Use));
+ Exts.insert(I);
+ break;
+
+ case Instruction::Call: {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ return false;
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::abs:
+ case Intrinsic::smin:
+ case Intrinsic::smax:
+ case Intrinsic::umin:
+ case Intrinsic::umax:
+ case Intrinsic::sadd_sat:
+ case Intrinsic::ssub_sat:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::usub_sat:
+ case Intrinsic::minnum:
+ case Intrinsic::maxnum:
+ case Intrinsic::fabs:
+ case Intrinsic::fma:
+ case Intrinsic::ceil:
+ case Intrinsic::floor:
+ case Intrinsic::rint:
+ case Intrinsic::round:
+ case Intrinsic::trunc:
+ break;
+ default:
+ return false;
+ }
+ LLVM_FALLTHROUGH; // Fall through to treating these like an operator below.
+ }
+ // Binary/ternary ops
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::Shl:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ case Instruction::Select:
+ if (Ops.count(I))
+ continue;
+ Ops.insert(I);
+
+ for (Use &Op : I->operands()) {
+ if (!isa<FixedVectorType>(Op->getType()))
+ continue;
+ if (isa<Instruction>(Op))
+ Worklist.push_back(cast<Instruction>(&Op));
+ else
+ OtherLeafs.insert(&Op);
+ }
+
+ for (auto *Use : I->users())
+ Worklist.push_back(cast<Instruction>(Use));
+ break;
+
+ case Instruction::ShuffleVector:
+ // A shuffle of a splat is a splat.
+ if (cast<ShuffleVectorInst>(I)->isZeroEltSplat())
+ continue;
+ LLVM_FALLTHROUGH;
+
+ default:
+ LLVM_DEBUG(dbgs() << " Unhandled instruction: " << *I << "\n");
+ return false;
+ }
+ }
+
+ if (Exts.empty() && OtherLeafs.empty())
+ return false;
+
+ LLVM_DEBUG({
+ dbgs() << "Found group:\n Exts:";
+ for (auto *I : Exts)
+ dbgs() << " " << *I << "\n";
+ dbgs() << " Ops:";
+ for (auto *I : Ops)
+ dbgs() << " " << *I << "\n";
+ dbgs() << " OtherLeafs:";
+ for (auto *I : OtherLeafs)
+ dbgs() << " " << *I->get() << " of " << *I->getUser() << "\n";
+ dbgs() << "Truncs:";
+ for (auto *I : Truncs)
+ dbgs() << " " << *I << "\n";
+ });
+
+ assert(!Truncs.empty() && "Expected some truncs");
+
+ // Check types
+ unsigned NumElts = VT->getNumElements();
+ unsigned BaseElts = VT->getScalarSizeInBits() == 16
+ ? 8
+ : (VT->getScalarSizeInBits() == 8 ? 16 : 0);
+ if (BaseElts == 0 || NumElts % BaseElts != 0) {
+ LLVM_DEBUG(dbgs() << " Type is unsupported\n");
+ return false;
+ }
+ if (Start->getOperand(0)->getType()->getScalarSizeInBits() !=
+ VT->getScalarSizeInBits() * 2) {
+ LLVM_DEBUG(dbgs() << " Type not double sized\n");
+ return false;
+ }
+ for (Instruction *I : Exts)
+ if (I->getOperand(0)->getType() != VT) {
+ LLVM_DEBUG(dbgs() << " Wrong type on " << *I << "\n");
+ return false;
+ }
+ for (Instruction *I : Truncs)
+ if (I->getType() != VT) {
+ LLVM_DEBUG(dbgs() << " Wrong type on " << *I << "\n");
+ return false;
+ }
+
+ // Check that it looks beneficial
+ if (!isProfitableToInterleave(Exts, Truncs))
+ return false;
+
+ // Create new shuffles around the extends / truncs / other leaves.
+ IRBuilder<> Builder(Start);
+
+ SmallVector<int, 16> LeafMask;
+ SmallVector<int, 16> TruncMask;
+ // LeafMask : 0, 2, 4, 6, 1, 3, 5, 7    8, 10, 12, 14, 9, 11, 13, 15
+ // TruncMask: 0, 4, 1, 5, 2, 6, 3, 7    8, 12, 9, 13, 10, 14, 11, 15
+ for (unsigned Base = 0; Base < NumElts; Base += BaseElts) {
+ for (unsigned i = 0; i < BaseElts / 2; i++)
+ LeafMask.push_back(Base + i * 2);
+ for (unsigned i = 0; i < BaseElts / 2; i++)
+ LeafMask.push_back(Base + i * 2 + 1);
+ }
+ for (unsigned Base = 0; Base < NumElts; Base += BaseElts) {
+ for (unsigned i = 0; i < BaseElts / 2; i++) {
+ TruncMask.push_back(Base + i);
+ TruncMask.push_back(Base + i + BaseElts / 2);
+ }
+ }
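+ // As an illustration (derived from the loops above): for a v8i16 trunc of a
+ // v8i32 (NumElts == 8, BaseElts == 8) this produces
+ // LeafMask = <0,2,4,6,1,3,5,7> and TruncMask = <0,4,1,5,2,6,3,7>.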
+
+ for (Instruction *I : Exts) {
+ LLVM_DEBUG(dbgs() << "Replacing ext " << *I << "\n");
+ Builder.SetInsertPoint(I);
+ Value *Shuffle = Builder.CreateShuffleVector(I->getOperand(0), LeafMask);
+ bool FPext = isa<FPExtInst>(I);
+ bool Sext = isa<SExtInst>(I);
+ Value *Ext = FPext ? Builder.CreateFPExt(Shuffle, I->getType())
+ : Sext ? Builder.CreateSExt(Shuffle, I->getType())
+ : Builder.CreateZExt(Shuffle, I->getType());
+ I->replaceAllUsesWith(Ext);
+ LLVM_DEBUG(dbgs() << " with " << *Shuffle << "\n");
+ }
+
+ for (Use *I : OtherLeafs) {
+ LLVM_DEBUG(dbgs() << "Replacing leaf " << *I << "\n");
+ Builder.SetInsertPoint(cast<Instruction>(I->getUser()));
+ Value *Shuffle = Builder.CreateShuffleVector(I->get(), LeafMask);
+ I->getUser()->setOperand(I->getOperandNo(), Shuffle);
+ LLVM_DEBUG(dbgs() << " with " << *Shuffle << "\n");
+ }
+
+ for (Instruction *I : Truncs) {
+ LLVM_DEBUG(dbgs() << "Replacing trunc " << *I << "\n");
+
+ Builder.SetInsertPoint(I->getParent(), ++I->getIterator());
+ Value *Shuf = Builder.CreateShuffleVector(I, TruncMask);
+ I->replaceAllUsesWith(Shuf);
+ cast<Instruction>(Shuf)->setOperand(0, I);
+
+ LLVM_DEBUG(dbgs() << " with " << *Shuf << "\n");
+ }
+
+ return true;
+}
+
+bool MVELaneInterleaving::runOnFunction(Function &F) {
+ if (!EnableInterleave)
+ return false;
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ auto &TM = TPC.getTM<TargetMachine>();
+ auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+ if (!ST->hasMVEIntegerOps())
+ return false;
+
+ bool Changed = false;
+
+ SmallPtrSet<Instruction *, 16> Visited;
+ for (Instruction &I : reverse(instructions(F))) {
+ if (I.getType()->isVectorTy() &&
+ (isa<TruncInst>(I) || isa<FPTruncInst>(I)) && !Visited.count(&I))
+ Changed |= tryInterleave(&I, Visited);
+ }
+
+ return Changed;
+}
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp b/src/llvm-project/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
similarity index 79%
rename from src/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
rename to src/llvm-project/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
index 00e4449..6fa5402 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -1,4 +1,4 @@
-//===-- MVEVPTOptimisationsPass.cpp ---------------------------------------===//
+//===-- MVETPAndVPTOptimisationsPass.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -41,13 +41,13 @@
cl::init(true));
namespace {
-class MVEVPTOptimisations : public MachineFunctionPass {
+class MVETPAndVPTOptimisations : public MachineFunctionPass {
public:
static char ID;
const Thumb2InstrInfo *TII;
MachineRegisterInfo *MRI;
- MVEVPTOptimisations() : MachineFunctionPass(ID) {}
+ MVETPAndVPTOptimisations() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -64,6 +64,7 @@
}
private:
+ bool LowerWhileLoopStart(MachineLoop *ML);
bool MergeLoopEnd(MachineLoop *ML);
bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);
MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
@@ -74,18 +75,21 @@
bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
bool ConvertVPSEL(MachineBasicBlock &MBB);
+ bool HintDoLoopStartReg(MachineBasicBlock &MBB);
+ MachineInstr *CheckForLRUseInPredecessors(MachineBasicBlock *PreHeader,
+ MachineInstr *LoopStart);
};
-char MVEVPTOptimisations::ID = 0;
+char MVETPAndVPTOptimisations::ID = 0;
} // end anonymous namespace
-INITIALIZE_PASS_BEGIN(MVEVPTOptimisations, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE,
"ARM MVE TailPred and VPT Optimisations pass", false,
false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(MVEVPTOptimisations, DEBUG_TYPE,
+INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE,
"ARM MVE TailPred and VPT Optimisations pass", false, false)
static MachineInstr *LookThroughCOPY(MachineInstr *MI,
@@ -164,7 +168,9 @@
? LoopPhi->getOperand(3).getReg()
: LoopPhi->getOperand(1).getReg();
LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI);
- if (!LoopStart || LoopStart->getOpcode() != ARM::t2DoLoopStart) {
+ if (!LoopStart || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
+ LoopStart->getOpcode() != ARM::t2WhileLoopSetup &&
+ LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) {
LLVM_DEBUG(dbgs() << " didn't find Start where we expected!\n");
return false;
}
@@ -173,6 +179,132 @@
return true;
}
+static void RevertWhileLoopSetup(MachineInstr *MI, const TargetInstrInfo *TII) {
+ MachineBasicBlock *MBB = MI->getParent();
+ assert(MI->getOpcode() == ARM::t2WhileLoopSetup &&
+ "Only expected a t2WhileLoopSetup in RevertWhileLoopSetup!");
+
+ // Subs
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
+ MIB.add(MI->getOperand(0));
+ MIB.add(MI->getOperand(1));
+ MIB.addImm(0);
+ MIB.addImm(ARMCC::AL);
+ MIB.addReg(ARM::NoRegister);
+ MIB.addReg(ARM::CPSR, RegState::Define);
+
+ // Attempt to find a t2WhileLoopStart and revert to a t2Bcc.
+ for (MachineInstr &I : MBB->terminators()) {
+ if (I.getOpcode() == ARM::t2WhileLoopStart) {
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, &I, I.getDebugLoc(), TII->get(ARM::t2Bcc));
+ MIB.add(MI->getOperand(1)); // branch target
+ MIB.addImm(ARMCC::EQ);
+ MIB.addReg(ARM::CPSR);
+ I.eraseFromParent();
+ break;
+ }
+ }
+
+ MI->eraseFromParent();
+}
+
+// The Hardware Loop insertion and ISel Lowering produce the pseudos for the
+// start of a while loop:
+// %a:gprlr = t2WhileLoopSetup %Cnt
+// t2WhileLoopStart %a, %BB
+// We want to convert those to a single instruction which, like t2LoopEndDec and
+// t2DoLoopStartTP, is both a terminator and produces a value:
+// %a:gprlr = t2WhileLoopStartLR %Cnt, %BB
+//
+// Otherwise if we can't, we revert the loop. t2WhileLoopSetup and
+// t2WhileLoopStart are not valid past regalloc.
+bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) {
+ LLVM_DEBUG(dbgs() << "LowerWhileLoopStart on loop "
+ << ML->getHeader()->getName() << "\n");
+
+ MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
+ if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
+ return false;
+
+ if (LoopStart->getOpcode() != ARM::t2WhileLoopSetup)
+ return false;
+
+ Register LR = LoopStart->getOperand(0).getReg();
+ auto WLSIt = find_if(MRI->use_nodbg_instructions(LR), [](auto &MI) {
+ return MI.getOpcode() == ARM::t2WhileLoopStart;
+ });
+ if (!MergeEndDec || WLSIt == MRI->use_instr_nodbg_end()) {
+ RevertWhileLoopSetup(LoopStart, TII);
+ RevertLoopDec(LoopDec, TII);
+ RevertLoopEnd(LoopEnd, TII);
+ return true;
+ }
+
+ MachineInstrBuilder MI =
+ BuildMI(*WLSIt->getParent(), *WLSIt, WLSIt->getDebugLoc(),
+ TII->get(ARM::t2WhileLoopStartLR), LR)
+ .add(LoopStart->getOperand(1))
+ .add(WLSIt->getOperand(1));
+ (void)MI;
+ LLVM_DEBUG(dbgs() << "Lowered WhileLoopStart into: " << *MI.getInstr());
+
+ WLSIt->eraseFromParent();
+ LoopStart->eraseFromParent();
+ return true;
+}
+
+// Return true if this instruction is invalid in a low overhead loop, usually
+// because it clobbers LR.
+static bool IsInvalidTPInstruction(MachineInstr &MI) {
+ return MI.isCall() || isLoopStart(MI);
+}
+
+// Starting from PreHeader, search backwards for invalid instructions until the
+// LoopStart block is reached. If invalid instructions are found, the loop start
+// is reverted from a WhileLoopStart to a DoLoopStart on the same loop, and the
+// new DLS LoopStart is returned if updated.
+MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors(
+ MachineBasicBlock *PreHeader, MachineInstr *LoopStart) {
+ SmallVector<MachineBasicBlock *> Worklist;
+ SmallPtrSet<MachineBasicBlock *, 4> Visited;
+ Worklist.push_back(PreHeader);
+ Visited.insert(LoopStart->getParent());
+
+ while (!Worklist.empty()) {
+ MachineBasicBlock *MBB = Worklist.pop_back_val();
+ if (Visited.count(MBB))
+ continue;
+
+ for (MachineInstr &MI : *MBB) {
+ if (!IsInvalidTPInstruction(MI))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Found LR use in predecessors, reverting: " << MI);
+
+ // Create a t2DoLoopStart at the end of the preheader.
+ MachineInstrBuilder MIB =
+ BuildMI(*PreHeader, PreHeader->getFirstTerminator(),
+ LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStart));
+ MIB.add(LoopStart->getOperand(0));
+ MIB.add(LoopStart->getOperand(1));
+
+ // Make sure to remove the kill flags, to prevent them from being invalid.
+ LoopStart->getOperand(1).setIsKill(false);
+
+ // Revert the t2WhileLoopStartLR to a CMP and Br.
+ RevertWhileLoopStartLR(LoopStart, TII, ARM::t2Bcc, true);
+ return MIB;
+ }
+
+ Visited.insert(MBB);
+ for (auto *Pred : MBB->predecessors())
+ Worklist.push_back(Pred);
+ }
+ return LoopStart;
+}
+
// This function converts loops with t2LoopDec and t2LoopEnd instructions into
// a single t2LoopEndDec instruction. To do that it needs to make sure that LR
// will be valid to be used for the low overhead loop, which means nothing else
@@ -180,7 +312,7 @@
// loop. The t2LoopEndDec is a branching terminator that produces a value (the
// decrement) around the loop edge, which means we need to be careful that they
// will be valid to allocate without any spilling.
-bool MVEVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
+bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
if (!MergeEndDec)
return false;
@@ -192,12 +324,20 @@
return false;
// Check if there is an illegal instruction (a call) in the low overhead loop
- // and if so revert it now before we get any further.
+ // and if so revert it now before we get any further. While loops also need to
+ // check the preheaders, but can be reverted to a DLS loop if needed.
+ auto *PreHeader = ML->getLoopPreheader();
+ if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader)
+ LoopStart = CheckForLRUseInPredecessors(PreHeader, LoopStart);
+
for (MachineBasicBlock *MBB : ML->blocks()) {
for (MachineInstr &MI : *MBB) {
- if (MI.isCall()) {
- LLVM_DEBUG(dbgs() << "Found call in loop, reverting: " << MI);
- RevertDoLoopStart(LoopStart, TII);
+ if (IsInvalidTPInstruction(MI)) {
+ LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI);
+ if (LoopStart->getOpcode() == ARM::t2DoLoopStart)
+ RevertDoLoopStart(LoopStart, TII);
+ else
+ RevertWhileLoopStartLR(LoopStart, TII);
RevertLoopDec(LoopDec, TII);
RevertLoopEnd(LoopEnd, TII);
return true;
@@ -236,8 +376,16 @@
};
if (!CheckUsers(PhiReg, {LoopDec}, MRI) ||
!CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) ||
- !CheckUsers(StartReg, {LoopPhi}, MRI))
+ !CheckUsers(StartReg, {LoopPhi}, MRI)) {
+ // Don't leave a t2WhileLoopStartLR without the LoopDecEnd.
+ if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR) {
+ RevertWhileLoopStartLR(LoopStart, TII);
+ RevertLoopDec(LoopDec, TII);
+ RevertLoopEnd(LoopEnd, TII);
+ return true;
+ }
return false;
+ }
MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass);
MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass);
@@ -271,7 +419,7 @@
// instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP
// instruction, making the backend ARMLowOverheadLoops pass's job of finding the
// VCTP operand much simpler.
-bool MVEVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
+bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
MachineDominatorTree *DT) {
LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop "
<< ML->getHeader()->getName() << "\n");
@@ -281,7 +429,8 @@
MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
return false;
- if (LoopDec != LoopEnd)
+ if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
+ LoopStart->getOpcode() != ARM::t2WhileLoopStartLR))
return false;
SmallVector<MachineInstr *, 4> VCTPs;
@@ -346,12 +495,16 @@
return false;
}
- MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(),
- TII->get(ARM::t2DoLoopStartTP))
- .add(LoopStart->getOperand(0))
- .add(LoopStart->getOperand(1))
- .addReg(CountReg);
- (void)MI;
+ unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart
+ ? ARM::t2DoLoopStartTP
+ : ARM::t2WhileLoopStartTP;
+ MachineInstrBuilder MI =
+ BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(NewOpc))
+ .add(LoopStart->getOperand(0))
+ .add(LoopStart->getOperand(1))
+ .addReg(CountReg);
+ if (NewOpc == ARM::t2WhileLoopStartTP)
+ MI.add(LoopStart->getOperand(2));
LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with "
<< *MI.getInstr());
MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);
@@ -443,7 +596,7 @@
// And returns the newly inserted VPNOT.
// This optimization is done in the hopes of preventing spills/reloads of VPR by
// reducing the number of VCCR values with overlapping lifetimes.
-MachineInstr &MVEVPTOptimisations::ReplaceRegisterUseWithVPNOT(
+MachineInstr &MVETPAndVPTOptimisations::ReplaceRegisterUseWithVPNOT(
MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User,
Register Target) {
Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target));
@@ -528,7 +681,7 @@
// %Foo = (some op that uses %B)
// %TMP2:vccr = VPNOT %B
// %Bar = (some op that uses %A)
-bool MVEVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
+bool MVETPAndVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
SmallVector<MachineInstr *, 4> DeadInstructions;
bool Modified = false;
@@ -656,7 +809,7 @@
}
// This optimisation replaces VCMPs with VPNOTs when they are equivalent.
-bool MVEVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
+bool MVETPAndVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
SmallVector<MachineInstr *, 4> DeadInstructions;
// The last VCMP that we have seen and that couldn't be replaced.
@@ -729,7 +882,7 @@
return !DeadInstructions.empty();
}
-bool MVEVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
+bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
MachineDominatorTree *DT) {
// Scan through the block, looking for instructions that use constant moves
// into VPR that are the negative of one another. These are expected to be
@@ -818,7 +971,7 @@
// instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly
// different semantics under tail predication. Until that is modelled we just
// convert to a VMOVT (via a predicated VORR) instead.
-bool MVEVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
+bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
bool HasVCTP = false;
SmallVector<MachineInstr *, 4> DeadInstructions;
@@ -852,7 +1005,22 @@
return !DeadInstructions.empty();
}
-bool MVEVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
+// Add a register allocation hint for t2DoLoopStart to hint it towards LR, as
+// the instruction may be removable as a noop.
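+// (For illustration: if the count register is allocated to LR, the start
+// instruction effectively becomes a "mov lr, lr" and can be dropped.)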
+bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ for (MachineInstr &MI : MBB.instrs()) {
+ if (MI.getOpcode() != ARM::t2DoLoopStart)
+ continue;
+ Register R = MI.getOperand(1).getReg();
+ MachineFunction *MF = MI.getParent()->getParent();
+ MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
+ Changed = true;
+ }
+ return Changed;
+}
+
+bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
const ARMSubtarget &STI =
static_cast<const ARMSubtarget &>(Fn.getSubtarget());
@@ -869,11 +1037,13 @@
bool Modified = false;
for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) {
+ Modified |= LowerWhileLoopStart(ML);
Modified |= MergeLoopEnd(ML);
Modified |= ConvertTailPredLoop(ML, DT);
}
for (MachineBasicBlock &MBB : Fn) {
+ Modified |= HintDoLoopStartReg(MBB);
Modified |= ReplaceConstByVPNOTs(MBB, DT);
Modified |= ReplaceVCMPsByVPNOTs(MBB);
Modified |= ReduceOldVCCRValueUses(MBB);
@@ -884,7 +1054,7 @@
return Modified;
}
-/// createMVEVPTOptimisationsPass
-FunctionPass *llvm::createMVEVPTOptimisationsPass() {
- return new MVEVPTOptimisations();
+/// createMVETPAndVPTOptimisationsPass
+FunctionPass *llvm::createMVETPAndVPTOptimisationsPass() {
+ return new MVETPAndVPTOptimisations();
}
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MVETailPredUtils.h b/src/llvm-project/llvm/lib/Target/ARM/MVETailPredUtils.h
index 9ab5d92..8c64893 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MVETailPredUtils.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/MVETailPredUtils.h
@@ -68,30 +68,65 @@
return false;
}
-static inline bool isLoopStart(MachineInstr &MI) {
+static inline bool isDoLoopStart(const MachineInstr &MI) {
return MI.getOpcode() == ARM::t2DoLoopStart ||
- MI.getOpcode() == ARM::t2DoLoopStartTP ||
- MI.getOpcode() == ARM::t2WhileLoopStart;
+ MI.getOpcode() == ARM::t2DoLoopStartTP;
}
-// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
-// beq that branches to the exit branch.
-inline void RevertWhileLoopStart(MachineInstr *MI, const TargetInstrInfo *TII,
- unsigned BrOpc = ARM::t2Bcc) {
- MachineBasicBlock *MBB = MI->getParent();
+static inline bool isWhileLoopStart(const MachineInstr &MI) {
+ return MI.getOpcode() == ARM::t2WhileLoopStart ||
+ MI.getOpcode() == ARM::t2WhileLoopStartLR ||
+ MI.getOpcode() == ARM::t2WhileLoopStartTP;
+}
- // Cmp
- MachineInstrBuilder MIB =
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri));
- MIB.add(MI->getOperand(0));
- MIB.addImm(0);
- MIB.addImm(ARMCC::AL);
- MIB.addReg(ARM::NoRegister);
+static inline bool isLoopStart(const MachineInstr &MI) {
+ return isDoLoopStart(MI) || isWhileLoopStart(MI);
+}
+
+// Return the TargetBB stored in a t2WhileLoopStartLR/t2WhileLoopStartTP.
+inline MachineBasicBlock *getWhileLoopStartTargetBB(const MachineInstr &MI) {
+ assert(isWhileLoopStart(MI) && "Expected WhileLoopStart!");
+ unsigned Op = MI.getOpcode() == ARM::t2WhileLoopStartTP ? 3 : 2;
+ return MI.getOperand(Op).getMBB();
+}
+
+// WhileLoopStart holds the exit block, so produce a subs Op0, Op1, 0 and then a
+// beq that branches to the exit block.
+// If UseCmp is true, this will create a t2CMP instead of a t2SUBri, meaning the
+// value of LR going into the loop will not be set up. This is used if the LR
+// setup is done via another means (via a t2DoLoopStart, for example).
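+// For example (illustrative, %lr/%r2/%bb.end are placeholder names), reverting
+//   %lr = t2WhileLoopStartLR %r2, %bb.end
+// emits "subs lr, r2, #0; beq %bb.end" (UseCmp == false, LR still initialised)
+// or "cmp r2, #0; beq %bb.end" (UseCmp == true, LR set up by other means).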
+inline void RevertWhileLoopStartLR(MachineInstr *MI, const TargetInstrInfo *TII,
+ unsigned BrOpc = ARM::t2Bcc,
+ bool UseCmp = false) {
+ MachineBasicBlock *MBB = MI->getParent();
+ assert((MI->getOpcode() == ARM::t2WhileLoopStartLR ||
+ MI->getOpcode() == ARM::t2WhileLoopStartTP) &&
+ "Only expected a t2WhileLoopStartLR/TP in RevertWhileLoopStartLR!");
+
+ // Subs/Cmp
+ if (UseCmp) {
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri));
+ MIB.add(MI->getOperand(1));
+ MIB.addImm(0);
+ MIB.addImm(ARMCC::AL);
+ MIB.addReg(ARM::NoRegister);
+ } else {
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
+ MIB.add(MI->getOperand(0));
+ MIB.add(MI->getOperand(1));
+ MIB.addImm(0);
+ MIB.addImm(ARMCC::AL);
+ MIB.addReg(ARM::NoRegister);
+ MIB.addReg(ARM::CPSR, RegState::Define);
+ }
// Branch
- MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
- MIB.add(MI->getOperand(1)); // branch target
- MIB.addImm(ARMCC::EQ); // condition code
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
+ MIB.addMBB(getWhileLoopStartTargetBB(*MI)); // branch target
+ MIB.addImm(ARMCC::EQ); // condition code
MIB.addReg(ARM::CPSR);
MI->eraseFromParent();
diff --git a/src/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp b/src/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp
index cccac55..cf9e248 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -156,7 +156,7 @@
Intrinsic::ID ID = Call->getIntrinsicID();
if (ID == Intrinsic::start_loop_iterations ||
- ID == Intrinsic::test_set_loop_iterations)
+ ID == Intrinsic::test_start_loop_iterations)
return cast<IntrinsicInst>(&I);
}
return nullptr;
diff --git a/src/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/src/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index d568e9a..ccd272a 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -403,7 +403,7 @@
AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
- if (RegInfo->needsStackRealignment(MF)) {
+ if (RegInfo->hasStackRealignment(MF)) {
const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
// Emit the following sequence, using R4 as a temporary, since we cannot use
// SP as a source or destination register for the shifts:
@@ -681,7 +681,7 @@
// R7 may be used as a frame pointer, hence marked as not generally
// allocatable, however there's no reason to not use it as a temporary for
// restoring LR.
- if (STI.useR7AsFramePointer())
+ if (STI.getFramePointerReg() == ARM::R7)
PopFriendly.set(ARM::R7);
assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
diff --git a/src/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/src/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
index 79afa37..cf5eb4b 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -16,6 +16,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
using namespace llvm;
@@ -23,12 +24,12 @@
: ARMBaseInstrInfo(STI), RI() {}
/// Return the noop instruction to use for a noop.
-void Thumb1InstrInfo::getNoop(MCInst &NopInst) const {
- NopInst.setOpcode(ARM::tMOVr);
- NopInst.addOperand(MCOperand::createReg(ARM::R8));
- NopInst.addOperand(MCOperand::createReg(ARM::R8));
- NopInst.addOperand(MCOperand::createImm(ARMCC::AL));
- NopInst.addOperand(MCOperand::createReg(0));
+MCInst Thumb1InstrInfo::getNop() const {
+ return MCInstBuilder(ARM::tMOVr)
+ .addReg(ARM::R8)
+ .addReg(ARM::R8)
+ .addImm(ARMCC::AL)
+ .addReg(0);
}
unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
diff --git a/src/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/src/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h
index 017b722..0b8f3ae 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h
@@ -25,7 +25,7 @@
explicit Thumb1InstrInfo(const ARMSubtarget &STI);
/// Return the noop instruction to use for a noop.
- void getNoop(MCInst &NopInst) const override;
+ MCInst getNop() const override;
// Return the non-pre/post incrementing version of 'Opc'. Return 0
// if there is no such opcode.
diff --git a/src/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/src/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index d728572..5204e3b 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
@@ -48,11 +49,8 @@
: ARMBaseInstrInfo(STI) {}
/// Return the noop instruction to use for a noop.
-void Thumb2InstrInfo::getNoop(MCInst &NopInst) const {
- NopInst.setOpcode(ARM::tHINT);
- NopInst.addOperand(MCOperand::createImm(0));
- NopInst.addOperand(MCOperand::createImm(ARMCC::AL));
- NopInst.addOperand(MCOperand::createReg(0));
+MCInst Thumb2InstrInfo::getNop() const {
+ return MCInstBuilder(ARM::tHINT).addImm(0).addImm(ARMCC::AL).addReg(0);
}
unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const {
diff --git a/src/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/src/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index 808167b..e6d5179 100644
--- a/src/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/src/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -26,7 +26,7 @@
explicit Thumb2InstrInfo(const ARMSubtarget &STI);
/// Return the noop instruction to use for a noop.
- void getNoop(MCInst &NopInst) const override;
+ MCInst getNop() const override;
// Return the non-pre/post incrementing version of 'Opc'. Return 0
// if there is no such opcode.
diff --git a/src/llvm-project/llvm/lib/Target/AVR/AVR.h b/src/llvm-project/llvm/lib/Target/AVR/AVR.h
index f0746d7..7332307 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/AVR.h
+++ b/src/llvm-project/llvm/lib/Target/AVR/AVR.h
@@ -22,6 +22,7 @@
class AVRTargetMachine;
class FunctionPass;
+Pass *createAVRShiftExpandPass();
FunctionPass *createAVRISelDag(AVRTargetMachine &TM,
CodeGenOpt::Level OptLevel);
FunctionPass *createAVRExpandPseudoPass();
@@ -30,6 +31,7 @@
FunctionPass *createAVRDynAllocaSRPass();
FunctionPass *createAVRBranchSelectionPass();
+void initializeAVRShiftExpandPass(PassRegistry &);
void initializeAVRExpandPseudoPass(PassRegistry&);
void initializeAVRRelaxMemPass(PassRegistry&);
diff --git a/src/llvm-project/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/src/llvm-project/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index 722eecd..e8a13c7 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -15,6 +15,7 @@
#include "AVRMCInstLower.h"
#include "AVRSubtarget.h"
#include "MCTargetDesc/AVRInstPrinter.h"
+#include "MCTargetDesc/AVRMCExpr.h"
#include "TargetInfo/AVRTargetInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
@@ -53,6 +54,8 @@
void emitInstruction(const MachineInstr *MI) override;
+ const MCExpr *lowerConstant(const Constant *CV) override;
+
private:
const MCRegisterInfo &MRI;
};
@@ -176,6 +179,20 @@
EmitToStreamer(*OutStreamer, I);
}
+const MCExpr *AVRAsmPrinter::lowerConstant(const Constant *CV) {
+ MCContext &Ctx = OutContext;
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
+ bool IsProgMem = GV->getAddressSpace() == AVR::ProgramMemory;
+ if (IsProgMem) {
+ const MCExpr *Expr = MCSymbolRefExpr::create(getSymbol(GV), Ctx);
+ return AVRMCExpr::create(AVRMCExpr::VK_AVR_PM, Expr, false, Ctx);
+ }
+ }
+
+ return AsmPrinter::lowerConstant(CV);
+}
+
} // end of namespace llvm
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmPrinter() {
diff --git a/src/llvm-project/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/src/llvm-project/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
index a48d3d1..f9f91f5 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
+++ b/src/llvm-project/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -90,6 +90,18 @@
Block &MBB,
BlockIt MBBI);
+ /// Specific shift implementation.
+ bool expandLSLB7Rd(Block &MBB, BlockIt MBBI);
+ bool expandLSRB7Rd(Block &MBB, BlockIt MBBI);
+ bool expandASRB7Rd(Block &MBB, BlockIt MBBI);
+ bool expandLSLW4Rd(Block &MBB, BlockIt MBBI);
+ bool expandLSRW4Rd(Block &MBB, BlockIt MBBI);
+ bool expandLSLW8Rd(Block &MBB, BlockIt MBBI);
+ bool expandLSRW8Rd(Block &MBB, BlockIt MBBI);
+ bool expandASRW8Rd(Block &MBB, BlockIt MBBI);
+ bool expandLSLW12Rd(Block &MBB, BlockIt MBBI);
+ bool expandLSRW12Rd(Block &MBB, BlockIt MBBI);
+
/// Scavenges a free GPR8 register for use.
Register scavengeGPR8(MachineInstr &MI);
};
@@ -438,12 +450,12 @@
.addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstLoReg, getKillRegState(DstIsKill));
- // Do an extra SBCI.
+ // Do an extra SBC.
auto MISBCI =
- buildMI(MBB, MBBI, AVR::SBCIRdK)
+ buildMI(MBB, MBBI, AVR::SBCRdRr)
.addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstHiReg, getKillRegState(DstIsKill))
- .addImm(0);
+ .addReg(ZERO_REGISTER);
if (ImpIsDead)
MISBCI->getOperand(3).setIsDead();
// SREG is always implicitly killed
@@ -650,10 +662,10 @@
if (TmpReg) {
// Move the high byte into the final destination.
- buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstHiReg).addReg(TmpReg);
+ buildMI(MBB, MBBI, AVR::MOVRdRr, DstHiReg).addReg(TmpReg);
// Move the low byte from the scratch space into the final destination.
- buildMI(MBB, MBBI, AVR::POPRd).addReg(DstLoReg);
+ buildMI(MBB, MBBI, AVR::POPRd, DstLoReg);
}
MIBLO.setMemRefs(MI.memoperands());
@@ -767,10 +779,10 @@
if (TmpReg) {
// Move the high byte into the final destination.
- buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstHiReg).addReg(TmpReg);
+ buildMI(MBB, MBBI, AVR::MOVRdRr, DstHiReg).addReg(TmpReg);
// Move the low byte from the scratch space into the final destination.
- buildMI(MBB, MBBI, AVR::POPRd).addReg(DstLoReg);
+ buildMI(MBB, MBBI, AVR::POPRd, DstLoReg);
}
MIBLO.setMemRefs(MI.memoperands());
@@ -815,10 +827,10 @@
if (TmpReg) {
// Move the high byte into the final destination.
- buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstHiReg).addReg(TmpReg);
+ buildMI(MBB, MBBI, AVR::MOVRdRr, DstHiReg).addReg(TmpReg);
// Move the low byte from the scratch space into the final destination.
- buildMI(MBB, MBBI, AVR::POPRd).addReg(DstLoReg);
+ buildMI(MBB, MBBI, AVR::POPRd, DstLoReg);
}
MIBLO.setMemRefs(MI.memoperands());
@@ -883,20 +895,24 @@
Block &MBB,
BlockIt MBBI) {
return expandAtomic(MBB, MBBI, [&](MachineInstr &MI) {
- auto Op1 = MI.getOperand(0);
- auto Op2 = MI.getOperand(1);
+ auto DstReg = MI.getOperand(0).getReg();
+ auto PtrOp = MI.getOperand(1);
+ auto SrcReg = MI.getOperand(2).getReg();
unsigned LoadOpcode = (Width == 8) ? AVR::LDRdPtr : AVR::LDWRdPtr;
unsigned StoreOpcode = (Width == 8) ? AVR::STPtrRr : AVR::STWPtrRr;
+ // FIXME: this returns the new value (after the operation), not the old
+ // value as the atomicrmw instruction is supposed to do!
+
// Create the load
- buildMI(MBB, MBBI, LoadOpcode).add(Op1).add(Op2);
+ buildMI(MBB, MBBI, LoadOpcode, DstReg).addReg(PtrOp.getReg());
// Create the arithmetic op
- buildMI(MBB, MBBI, ArithOpcode).add(Op1).add(Op1).add(Op2);
+ buildMI(MBB, MBBI, ArithOpcode, DstReg).addReg(DstReg).addReg(SrcReg);
// Create the store
- buildMI(MBB, MBBI, StoreOpcode).add(Op2).add(Op1);
+ buildMI(MBB, MBBI, StoreOpcode).add(PtrOp).addReg(DstReg);
});
}
@@ -1055,6 +1071,7 @@
Register SrcLoReg, SrcHiReg;
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
+ bool DstIsUndef = MI.getOperand(0).isUndef();
bool SrcIsKill = MI.getOperand(1).isKill();
unsigned OpLo = AVR::STPtrRr;
unsigned OpHi = AVR::STDPtrQRr;
@@ -1062,11 +1079,11 @@
//:TODO: need to reverse this order like inw and stsw?
auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstReg)
+ .addReg(DstReg, getUndefRegState(DstIsUndef))
.addReg(SrcLoReg, getKillRegState(SrcIsKill));
auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstReg)
+ .addReg(DstReg, getUndefRegState(DstIsUndef))
.addImm(1)
.addReg(SrcHiReg, getKillRegState(SrcIsKill));
@@ -1328,42 +1345,20 @@
// to explicitly add the carry bit.
MachineInstr &MI = *MBBI;
- unsigned OpShiftOut, OpLoad, OpShiftIn, OpAdd;
Register DstReg = MI.getOperand(0).getReg();
- bool DstIsDead = MI.getOperand(0).isDead();
- OpShiftOut = AVR::LSRRd;
- OpLoad = AVR::LDIRdK;
- OpShiftIn = AVR::RORRd;
- OpAdd = AVR::ORRdRr;
- // lsr r16
- // ldi r0, 0
- // ror r0
- // or r16, r17
+ // bst r16, 0
+ // ror r16
+ // bld r16, 7
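+ // Net effect (for illustration): a rotate right by one, with bit 0 wrapping
+ // round to bit 7 (e.g. 0x01 becomes 0x80).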
- // Shift out
- buildMI(MBB, MBBI, OpShiftOut)
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg);
+ // Move the lowest bit from DstReg into the T bit
+ buildMI(MBB, MBBI, AVR::BST).addReg(DstReg).addImm(0);
- // Put 0 in temporary register
- buildMI(MBB, MBBI, OpLoad)
- .addReg(SCRATCH_REGISTER, RegState::Define | getDeadRegState(true))
- .addImm(0x00);
+ // Rotate to the right
+ buildMI(MBB, MBBI, AVR::RORRd, DstReg).addReg(DstReg);
- // Shift in
- buildMI(MBB, MBBI, OpShiftIn)
- .addReg(SCRATCH_REGISTER, RegState::Define | getDeadRegState(true))
- .addReg(SCRATCH_REGISTER);
-
- // Add the results together using an or-instruction
- auto MIB = buildMI(MBB, MBBI, OpAdd)
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg)
- .addReg(SCRATCH_REGISTER);
-
- // SREG is always implicitly killed
- MIB->getOperand(2).setIsKill();
+ // Move the T bit into the highest bit of DstReg.
+ buildMI(MBB, MBBI, AVR::BLD, DstReg).addReg(DstReg).addImm(7);
MI.eraseFromParent();
return true;
@@ -1402,6 +1397,149 @@
return true;
}
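+// Worked example of the expansion below (for illustration), a 16-bit logical
+// shift left by 4 of 0x1234:
+//   swap Rh ; swap Rl   -> Rh:Rl = 0x21:0x43
+//   andi Rh, 0xf0       -> Rh = 0x20
+//   eor  Rh, Rl         -> Rh = 0x63
+//   andi Rl, 0xf0       -> Rl = 0x40
+//   eor  Rh, Rl         -> Rh = 0x23
+// giving 0x2340 == 0x1234 << 4.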
+bool AVRExpandPseudo::expandLSLW4Rd(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(3).isDead();
+ TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+ // swap Rh
+ // swap Rl
+ buildMI(MBB, MBBI, AVR::SWAPRd)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
+ buildMI(MBB, MBBI, AVR::SWAPRd)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill));
+
+ // andi Rh, 0xf0
+ auto MI0 =
+ buildMI(MBB, MBBI, AVR::ANDIRdK)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addImm(0xf0);
+ // SREG is implicitly dead.
+ MI0->getOperand(3).setIsDead();
+
+ // eor Rh, Rl
+ auto MI1 =
+ buildMI(MBB, MBBI, AVR::EORRdRr)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addReg(DstLoReg);
+ // SREG is implicitly dead.
+ MI1->getOperand(3).setIsDead();
+
+ // andi Rl, 0xf0
+ auto MI2 =
+ buildMI(MBB, MBBI, AVR::ANDIRdK)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill))
+ .addImm(0xf0);
+ // SREG is implicitly dead.
+ MI2->getOperand(3).setIsDead();
+
+ // eor Rh, Rl
+ auto MI3 =
+ buildMI(MBB, MBBI, AVR::EORRdRr)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addReg(DstLoReg);
+ if (ImpIsDead)
+ MI3->getOperand(3).setIsDead();
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AVRExpandPseudo::expandLSLW8Rd(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(3).isDead();
+ TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+ // mov Rh, Rl
+ buildMI(MBB, MBBI, AVR::MOVRdRr)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg);
+
+ // clr Rl
+ auto MIBLO =
+ buildMI(MBB, MBBI, AVR::EORRdRr)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill))
+ .addReg(DstLoReg, getKillRegState(DstIsKill));
+ if (ImpIsDead)
+ MIBLO->getOperand(3).setIsDead();
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AVRExpandPseudo::expandLSLW12Rd(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(3).isDead();
+ TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+ // mov Rh, Rl
+ buildMI(MBB, MBBI, AVR::MOVRdRr)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg);
+
+ // swap Rh
+ buildMI(MBB, MBBI, AVR::SWAPRd)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
+
+ // andi Rh, 0xf0
+ auto MI0 =
+ buildMI(MBB, MBBI, AVR::ANDIRdK)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addImm(0xf0);
+ // SREG is implicitly dead.
+ MI0->getOperand(3).setIsDead();
+
+ // clr Rl
+ auto MI1 =
+ buildMI(MBB, MBBI, AVR::EORRdRr)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill))
+ .addReg(DstLoReg, getKillRegState(DstIsKill));
+ if (ImpIsDead)
+ MI1->getOperand(3).setIsDead();
+
+ MI.eraseFromParent();
+ return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LSLWNRd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Imm = MI.getOperand(2).getImm();
+ switch (Imm) {
+ case 4:
+ return expandLSLW4Rd(MBB, MBBI);
+ case 8:
+ return expandLSLW8Rd(MBB, MBBI);
+ case 12:
+ return expandLSLW12Rd(MBB, MBBI);
+ default:
+ llvm_unreachable("unimplemented lslwn");
+ return false;
+ }
+}
+
template <>
bool AVRExpandPseudo::expand<AVR::LSRWRd>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
@@ -1433,6 +1571,149 @@
return true;
}
+bool AVRExpandPseudo::expandLSRW4Rd(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(3).isDead();
+ TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+ // swap Rh
+ // swap Rl
+ buildMI(MBB, MBBI, AVR::SWAPRd)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
+ buildMI(MBB, MBBI, AVR::SWAPRd)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill));
+
+ // andi Rl, 0xf
+ auto MI0 =
+ buildMI(MBB, MBBI, AVR::ANDIRdK)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill))
+ .addImm(0xf);
+ // SREG is implicitly dead.
+ MI0->getOperand(3).setIsDead();
+
+ // eor Rl, Rh
+ auto MI1 =
+ buildMI(MBB, MBBI, AVR::EORRdRr)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill))
+ .addReg(DstHiReg);
+ // SREG is implicitly dead.
+ MI1->getOperand(3).setIsDead();
+
+ // andi Rh, 0xf
+ auto MI2 =
+ buildMI(MBB, MBBI, AVR::ANDIRdK)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addImm(0xf);
+ // SREG is implicitly dead.
+ MI2->getOperand(3).setIsDead();
+
+ // eor Rl, Rh
+ auto MI3 =
+ buildMI(MBB, MBBI, AVR::EORRdRr)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill))
+ .addReg(DstHiReg);
+ if (ImpIsDead)
+ MI3->getOperand(3).setIsDead();
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AVRExpandPseudo::expandLSRW8Rd(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(3).isDead();
+ TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+ // Move upper byte to lower byte.
+ buildMI(MBB, MBBI, AVR::MOVRdRr)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg);
+
+ // Clear upper byte.
+ auto MIBHI =
+ buildMI(MBB, MBBI, AVR::EORRdRr)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
+ if (ImpIsDead)
+ MIBHI->getOperand(3).setIsDead();
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AVRExpandPseudo::expandLSRW12Rd(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(3).isDead();
+ TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+ // Move upper byte to lower byte.
+ buildMI(MBB, MBBI, AVR::MOVRdRr)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg);
+
+ // swap Rl
+ buildMI(MBB, MBBI, AVR::SWAPRd)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill));
+
+ // andi Rl, 0xf
+ auto MI0 =
+ buildMI(MBB, MBBI, AVR::ANDIRdK)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill))
+ .addImm(0xf);
+ // SREG is implicitly dead.
+ MI0->getOperand(3).setIsDead();
+
+ // Clear upper byte.
+ auto MIBHI =
+ buildMI(MBB, MBBI, AVR::EORRdRr)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
+ if (ImpIsDead)
+ MIBHI->getOperand(3).setIsDead();
+
+ MI.eraseFromParent();
+ return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LSRWNRd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Imm = MI.getOperand(2).getImm();
+ switch (Imm) {
+ case 4:
+ return expandLSRW4Rd(MBB, MBBI);
+ case 8:
+ return expandLSRW8Rd(MBB, MBBI);
+ case 12:
+ return expandLSRW12Rd(MBB, MBBI);
+ default:
+ llvm_unreachable("unimplemented lsrwn");
+ return false;
+ }
+}
+
template <>
bool AVRExpandPseudo::expand<AVR::RORWRd>(Block &MBB, BlockIt MBBI) {
llvm_unreachable("RORW unimplemented");
@@ -1476,13 +1757,58 @@
return true;
}
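+// Illustrative trace of the expansion below, an arithmetic shift right by 8 of
+// 0x8034:
+//   mov Rl, Rh   -> Rl = 0x80
+//   add Rh, Rh   -> sets C from the old sign bit
+//   sbc Rh, Rh   -> Rh = 0xff (0x00 if the sign bit had been clear)
+// giving 0xff80 == 0x8034 asr 8.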
+bool AVRExpandPseudo::expandASRW8Rd(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(3).isDead();
+ TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+ // Move upper byte to lower byte.
+ buildMI(MBB, MBBI, AVR::MOVRdRr)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg);
+
+ // Move the sign bit to the C flag.
+ buildMI(MBB, MBBI, AVR::ADDRdRr)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill) | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
+
+ // Set upper byte to 0 or -1.
+ auto MIBHI =
+ buildMI(MBB, MBBI, AVR::SBCRdRr)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
+ if (ImpIsDead)
+ MIBHI->getOperand(3).setIsDead();
+
+ MI.eraseFromParent();
+ return true;
+}
+
template <>
-bool AVRExpandPseudo::expand<AVR::LSLB7Rd>(Block &MBB, BlockIt MBBI) {
+bool AVRExpandPseudo::expand<AVR::ASRWNRd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Imm = MI.getOperand(2).getImm();
+ switch (Imm) {
+ case 8:
+ return expandASRW8Rd(MBB, MBBI);
+ default:
+ llvm_unreachable("unimplemented asrwn");
+ return false;
+ }
+}
+
+bool AVRExpandPseudo::expandLSLB7Rd(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool DstIsKill = MI.getOperand(1).isKill();
- bool ImpIsDead = MI.getOperand(2).isDead();
+ bool ImpIsDead = MI.getOperand(3).isDead();
// ror r24
// clr r24
@@ -1490,7 +1816,8 @@
buildMI(MBB, MBBI, AVR::RORRd)
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg, getKillRegState(DstIsKill));
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ ->getOperand(3).setIsUndef(true);
buildMI(MBB, MBBI, AVR::EORRdRr)
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
@@ -1513,12 +1840,24 @@
}
template <>
-bool AVRExpandPseudo::expand<AVR::LSRB7Rd>(Block &MBB, BlockIt MBBI) {
+bool AVRExpandPseudo::expand<AVR::LSLBNRd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Imm = MI.getOperand(2).getImm();
+ switch (Imm) {
+ case 7:
+ return expandLSLB7Rd(MBB, MBBI);
+ default:
+ llvm_unreachable("unimplemented lslbn");
+ return false;
+ }
+}
+
+bool AVRExpandPseudo::expandLSRB7Rd(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool DstIsKill = MI.getOperand(1).isKill();
- bool ImpIsDead = MI.getOperand(2).isDead();
+ bool ImpIsDead = MI.getOperand(3).isDead();
// rol r24
// clr r24
@@ -1527,7 +1866,8 @@
buildMI(MBB, MBBI, AVR::ADCRdRr)
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstReg, getKillRegState(DstIsKill))
- .addReg(DstReg, getKillRegState(DstIsKill));
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ ->getOperand(4).setIsUndef(true);
buildMI(MBB, MBBI, AVR::EORRdRr)
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
@@ -1551,12 +1891,24 @@
}
template <>
-bool AVRExpandPseudo::expand<AVR::ASRB7Rd>(Block &MBB, BlockIt MBBI) {
+bool AVRExpandPseudo::expand<AVR::LSRBNRd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Imm = MI.getOperand(2).getImm();
+ switch (Imm) {
+ case 7:
+ return expandLSRB7Rd(MBB, MBBI);
+ default:
+ llvm_unreachable("unimplemented lsrbn");
+ return false;
+ }
+}
+
+bool AVRExpandPseudo::expandASRB7Rd(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool DstIsKill = MI.getOperand(1).isKill();
- bool ImpIsDead = MI.getOperand(2).isDead();
+ bool ImpIsDead = MI.getOperand(3).isDead();
// lsl r24
// sbc r24, r24
@@ -1581,6 +1933,19 @@
return true;
}
+template <>
+bool AVRExpandPseudo::expand<AVR::ASRBNRd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Imm = MI.getOperand(2).getImm();
+ switch (Imm) {
+ case 7:
+ return expandASRB7Rd(MBB, MBBI);
+ default:
+ llvm_unreachable("unimplemented asrbn");
+ return false;
+ }
+}
+
template <> bool AVRExpandPseudo::expand<AVR::SEXT>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register DstLoReg, DstHiReg;
@@ -1666,8 +2031,8 @@
auto EOR = buildMI(MBB, MBBI, AVR::EORRdRr)
.addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstHiReg, RegState::Kill)
- .addReg(DstHiReg, RegState::Kill);
+ .addReg(DstHiReg, RegState::Kill | RegState::Undef)
+ .addReg(DstHiReg, RegState::Kill | RegState::Undef);
if (ImpIsDead)
EOR->getOperand(3).setIsDead();
@@ -1802,9 +2167,12 @@
EXPAND(AVR::RORWRd);
EXPAND(AVR::ROLWRd);
EXPAND(AVR::ASRWRd);
- EXPAND(AVR::LSLB7Rd);
- EXPAND(AVR::LSRB7Rd);
- EXPAND(AVR::ASRB7Rd);
+ EXPAND(AVR::LSLWNRd);
+ EXPAND(AVR::LSRWNRd);
+ EXPAND(AVR::ASRWNRd);
+ EXPAND(AVR::LSLBNRd);
+ EXPAND(AVR::LSRBNRd);
+ EXPAND(AVR::ASRBNRd);
EXPAND(AVR::SEXT);
EXPAND(AVR::ZEXT);
EXPAND(AVR::SPREAD);
diff --git a/src/llvm-project/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/AVR/AVRFrameLowering.cpp
index 757b414..89ed30e 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/AVRFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/AVR/AVRFrameLowering.cpp
@@ -83,6 +83,11 @@
.addReg(AVR::R0, RegState::Kill)
.addReg(AVR::R0, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII.get(AVR::EORRdRr))
+ .addReg(AVR::R1, RegState::Define)
+ .addReg(AVR::R1, RegState::Kill)
+ .addReg(AVR::R1, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
}
// Early exit if the frame pointer is not needed in this function.
@@ -362,7 +367,7 @@
New->getOperand(3).setIsDead();
BuildMI(MBB, MI, DL, TII.get(AVR::SPWRITE), AVR::SP)
- .addReg(AVR::R31R30, RegState::Kill);
+ .addReg(AVR::R31R30);
// Make sure the remaining stack stores are converted to real store
// instructions.
diff --git a/src/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp b/src/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 3e7c298..58a7aed 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -334,7 +334,7 @@
llvm_unreachable("Invalid shift opcode");
}
- // Optimize int8 shifts.
+ // Optimize int8/int16 shifts.
if (VT.getSizeInBits() == 8) {
if (Op.getOpcode() == ISD::SHL && 4 <= ShiftAmount && ShiftAmount < 7) {
// Optimize LSL when 4 <= ShiftAmount <= 6.
@@ -351,17 +351,71 @@
ShiftAmount -= 4;
} else if (Op.getOpcode() == ISD::SHL && ShiftAmount == 7) {
// Optimize LSL when ShiftAmount == 7.
- Victim = DAG.getNode(AVRISD::LSL7, dl, VT, Victim);
+ Victim = DAG.getNode(AVRISD::LSLBN, dl, VT, Victim,
+ DAG.getConstant(7, dl, VT));
ShiftAmount = 0;
} else if (Op.getOpcode() == ISD::SRL && ShiftAmount == 7) {
// Optimize LSR when ShiftAmount == 7.
- Victim = DAG.getNode(AVRISD::LSR7, dl, VT, Victim);
+ Victim = DAG.getNode(AVRISD::LSRBN, dl, VT, Victim,
+ DAG.getConstant(7, dl, VT));
ShiftAmount = 0;
} else if (Op.getOpcode() == ISD::SRA && ShiftAmount == 7) {
// Optimize ASR when ShiftAmount == 7.
- Victim = DAG.getNode(AVRISD::ASR7, dl, VT, Victim);
+ Victim = DAG.getNode(AVRISD::ASRBN, dl, VT, Victim,
+ DAG.getConstant(7, dl, VT));
ShiftAmount = 0;
}
+ } else if (VT.getSizeInBits() == 16) {
+ if (4 <= ShiftAmount && ShiftAmount < 8)
+ switch (Op.getOpcode()) {
+ case ISD::SHL:
+ Victim = DAG.getNode(AVRISD::LSLWN, dl, VT, Victim,
+ DAG.getConstant(4, dl, VT));
+ ShiftAmount -= 4;
+ break;
+ case ISD::SRL:
+ Victim = DAG.getNode(AVRISD::LSRWN, dl, VT, Victim,
+ DAG.getConstant(4, dl, VT));
+ ShiftAmount -= 4;
+ break;
+ default:
+ break;
+ }
+ else if (8 <= ShiftAmount && ShiftAmount < 12)
+ switch (Op.getOpcode()) {
+ case ISD::SHL:
+ Victim = DAG.getNode(AVRISD::LSLWN, dl, VT, Victim,
+ DAG.getConstant(8, dl, VT));
+ ShiftAmount -= 8;
+ break;
+ case ISD::SRL:
+ Victim = DAG.getNode(AVRISD::LSRWN, dl, VT, Victim,
+ DAG.getConstant(8, dl, VT));
+ ShiftAmount -= 8;
+ break;
+ case ISD::SRA:
+ Victim = DAG.getNode(AVRISD::ASRWN, dl, VT, Victim,
+ DAG.getConstant(8, dl, VT));
+ ShiftAmount -= 8;
+ break;
+ default:
+ break;
+ }
+ else if (12 <= ShiftAmount)
+ switch (Op.getOpcode()) {
+ case ISD::SHL:
+ Victim = DAG.getNode(AVRISD::LSLWN, dl, VT, Victim,
+ DAG.getConstant(12, dl, VT));
+ ShiftAmount -= 12;
+ break;
+ case ISD::SRL:
+ Victim = DAG.getNode(AVRISD::LSRWN, dl, VT, Victim,
+ DAG.getConstant(12, dl, VT));
+ ShiftAmount -= 12;
+ break;
+ default:
+ break;
+ }
}
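+ // For example (illustrative), a 16-bit shift left by 13 is lowered to
+ // AVRISD::LSLWN(Victim, 12) plus a single 1-bit shift from the loop below.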
while (ShiftAmount--) {
@@ -477,7 +531,7 @@
SDValue Cmp;
- if (LHS.getSimpleValueType() == MVT::i16 && dyn_cast<ConstantSDNode>(RHS)) {
+ if (LHS.getSimpleValueType() == MVT::i16 && isa<ConstantSDNode>(RHS)) {
// Generate a CPI/CPC pair if RHS is a 16-bit constant.
SDValue LHSlo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHS,
DAG.getIntPtrConstant(0, DL));
@@ -1269,15 +1323,17 @@
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
}
- // Second, stack arguments have to walked in reverse order by inserting
- // chained stores, this ensures their order is not changed by the scheduler
- // and that the push instruction sequence generated is correct, otherwise they
- // can be freely intermixed.
+ // Second, stack arguments have to be walked.
+ // Previously this code created chained stores but those chained stores appear
+ // to be unchained in the legalization phase. Therefore, do not attempt to
+ // chain them here. In fact, chaining them here somehow causes the first and
+ // second store to be reversed which is the exact opposite of the intended
+ // effect.
if (HasStackArgs) {
- for (AE = AI, AI = ArgLocs.size(); AI != AE; --AI) {
- unsigned Loc = AI - 1;
- CCValAssign &VA = ArgLocs[Loc];
- SDValue Arg = OutVals[Loc];
+ SmallVector<SDValue, 8> MemOpChains;
+ for (; AI != AE; AI++) {
+ CCValAssign &VA = ArgLocs[AI];
+ SDValue Arg = OutVals[AI];
assert(VA.isMemLoc());
@@ -1287,10 +1343,13 @@
DAG.getRegister(AVR::SP, getPointerTy(DAG.getDataLayout())),
DAG.getIntPtrConstant(VA.getLocMemOffset() + 1, DL));
- Chain =
+ MemOpChains.push_back(
DAG.getStore(Chain, DL, Arg, PtrOff,
- MachinePointerInfo::getStack(MF, VA.getLocMemOffset()));
+ MachinePointerInfo::getStack(MF, VA.getLocMemOffset())));
}
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
}
// Build a sequence of copy-to-reg nodes chained together with token chain and
@@ -1871,44 +1930,65 @@
AVRTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
- // We only support i8 and i16.
- //
- //:FIXME: remove this assert for now since it gets sometimes executed
- // assert((VT == MVT::i16 || VT == MVT::i8) && "Wrong operand type.");
-
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'a': // Simple upper registers r16..r23.
- return std::make_pair(0U, &AVR::LD8loRegClass);
+ if (VT == MVT::i8)
+ return std::make_pair(0U, &AVR::LD8loRegClass);
+ else if (VT == MVT::i16)
+ return std::make_pair(0U, &AVR::DREGSLD8loRegClass);
+ break;
case 'b': // Base pointer registers: y, z.
- return std::make_pair(0U, &AVR::PTRDISPREGSRegClass);
+ if (VT == MVT::i8 || VT == MVT::i16)
+ return std::make_pair(0U, &AVR::PTRDISPREGSRegClass);
+ break;
case 'd': // Upper registers r16..r31.
- return std::make_pair(0U, &AVR::LD8RegClass);
+ if (VT == MVT::i8)
+ return std::make_pair(0U, &AVR::LD8RegClass);
+ else if (VT == MVT::i16)
+ return std::make_pair(0U, &AVR::DLDREGSRegClass);
+ break;
case 'l': // Lower registers r0..r15.
- return std::make_pair(0U, &AVR::GPR8loRegClass);
+ if (VT == MVT::i8)
+ return std::make_pair(0U, &AVR::GPR8loRegClass);
+ else if (VT == MVT::i16)
+ return std::make_pair(0U, &AVR::DREGSloRegClass);
+ break;
case 'e': // Pointer register pairs: x, y, z.
- return std::make_pair(0U, &AVR::PTRREGSRegClass);
+ if (VT == MVT::i8 || VT == MVT::i16)
+ return std::make_pair(0U, &AVR::PTRREGSRegClass);
+ break;
case 'q': // Stack pointer register: SPH:SPL.
return std::make_pair(0U, &AVR::GPRSPRegClass);
case 'r': // Any register: r0..r31.
if (VT == MVT::i8)
return std::make_pair(0U, &AVR::GPR8RegClass);
-
- assert(VT == MVT::i16 && "inline asm constraint too large");
- return std::make_pair(0U, &AVR::DREGSRegClass);
+ else if (VT == MVT::i16)
+ return std::make_pair(0U, &AVR::DREGSRegClass);
+ break;
case 't': // Temporary register: r0.
- return std::make_pair(unsigned(AVR::R0), &AVR::GPR8RegClass);
+ if (VT == MVT::i8)
+ return std::make_pair(unsigned(AVR::R0), &AVR::GPR8RegClass);
+ break;
case 'w': // Special upper register pairs: r24, r26, r28, r30.
- return std::make_pair(0U, &AVR::IWREGSRegClass);
+ if (VT == MVT::i8 || VT == MVT::i16)
+ return std::make_pair(0U, &AVR::IWREGSRegClass);
+ break;
case 'x': // Pointer register pair X: r27:r26.
case 'X':
- return std::make_pair(unsigned(AVR::R27R26), &AVR::PTRREGSRegClass);
+ if (VT == MVT::i8 || VT == MVT::i16)
+ return std::make_pair(unsigned(AVR::R27R26), &AVR::PTRREGSRegClass);
+ break;
case 'y': // Pointer register pair Y: r29:r28.
case 'Y':
- return std::make_pair(unsigned(AVR::R29R28), &AVR::PTRREGSRegClass);
+ if (VT == MVT::i8 || VT == MVT::i16)
+ return std::make_pair(unsigned(AVR::R29R28), &AVR::PTRREGSRegClass);
+ break;
case 'z': // Pointer register pair Z: r31:r30.
case 'Z':
- return std::make_pair(unsigned(AVR::R31R30), &AVR::PTRREGSRegClass);
+ if (VT == MVT::i8 || VT == MVT::i16)
+ return std::make_pair(unsigned(AVR::R31R30), &AVR::PTRREGSRegClass);
+ break;
default:
break;
}
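As a source-level illustration of the per-type constraint handling above, a hedged sketch using GNU extended inline asm (it assumes an AVR target and the %A/%B byte operand modifiers known from avr-gcc; the instructions chosen accept any register in the requested class):

#include <stdint.h>

// 'd' on an 8-bit operand resolves to the upper registers r16..r31.
uint8_t inc8(uint8_t x) {
  asm("inc %0" : "+d"(x));
  return x;
}

// The same 'd' letter on a 16-bit operand now resolves to an upper register
// pair (the DLDREGS class); %A0/%B0 address the low and high bytes.
uint16_t com16(uint16_t x) {
  asm("com %A0 \n\t com %B0" : "+d"(x));
  return x;
}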
@@ -2031,37 +2111,21 @@
if (VT == LLT::scalar(8)) {
Reg = StringSwitch<unsigned>(RegName)
- .Case("r0", AVR::R0).Case("r1", AVR::R1).Case("r2", AVR::R2)
- .Case("r3", AVR::R3).Case("r4", AVR::R4).Case("r5", AVR::R5)
- .Case("r6", AVR::R6).Case("r7", AVR::R7).Case("r8", AVR::R8)
- .Case("r9", AVR::R9).Case("r10", AVR::R10).Case("r11", AVR::R11)
- .Case("r12", AVR::R12).Case("r13", AVR::R13).Case("r14", AVR::R14)
- .Case("r15", AVR::R15).Case("r16", AVR::R16).Case("r17", AVR::R17)
- .Case("r18", AVR::R18).Case("r19", AVR::R19).Case("r20", AVR::R20)
- .Case("r21", AVR::R21).Case("r22", AVR::R22).Case("r23", AVR::R23)
- .Case("r24", AVR::R24).Case("r25", AVR::R25).Case("r26", AVR::R26)
- .Case("r27", AVR::R27).Case("r28", AVR::R28).Case("r29", AVR::R29)
- .Case("r30", AVR::R30).Case("r31", AVR::R31)
- .Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30)
- .Default(0);
+ .Case("r0", AVR::R0)
+ .Case("r1", AVR::R1)
+ .Default(0);
} else {
Reg = StringSwitch<unsigned>(RegName)
- .Case("r0", AVR::R1R0).Case("r2", AVR::R3R2)
- .Case("r4", AVR::R5R4).Case("r6", AVR::R7R6)
- .Case("r8", AVR::R9R8).Case("r10", AVR::R11R10)
- .Case("r12", AVR::R13R12).Case("r14", AVR::R15R14)
- .Case("r16", AVR::R17R16).Case("r18", AVR::R19R18)
- .Case("r20", AVR::R21R20).Case("r22", AVR::R23R22)
- .Case("r24", AVR::R25R24).Case("r26", AVR::R27R26)
- .Case("r28", AVR::R29R28).Case("r30", AVR::R31R30)
- .Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30)
- .Default(0);
+ .Case("r0", AVR::R1R0)
+ .Case("sp", AVR::SP)
+ .Default(0);
}
if (Reg)
return Reg;
- report_fatal_error("Invalid register name global variable");
+ report_fatal_error(
+ Twine("Invalid register name \"" + StringRef(RegName) + "\"."));
}
} // end of namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.h b/src/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.h
index 7aff415..8130cf0 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -36,11 +36,14 @@
/// TargetExternalSymbol, and TargetGlobalAddress.
WRAPPER,
LSL, ///< Logical shift left.
+ LSLBN, ///< Byte logical shift left N bits.
+ LSLWN, ///< Word logical shift left N bits.
LSR, ///< Logical shift right.
+ LSRBN, ///< Byte logical shift right N bits.
+ LSRWN, ///< Word logical shift right N bits.
ASR, ///< Arithmetic shift right.
- LSL7, ///< Logical shift left 7 bits.
- LSR7, ///< Logical shift right 7 bits.
- ASR7, ///< Arithmetic shift right 7 bits.
+ ASRBN, ///< Byte arithmetic shift right N bits.
+ ASRWN, ///< Word arithmetic shift right N bits.
ROR, ///< Bit rotate right.
ROL, ///< Bit rotate left.
LSLLOOP, ///< A loop of single logical shift left instructions.
diff --git a/src/llvm-project/llvm/lib/Target/AVR/AVRInstrInfo.td b/src/llvm-project/llvm/lib/Target/AVR/AVRInstrInfo.td
index 9f7c16f..c7c9656 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -59,9 +59,12 @@
def AVRrol : SDNode<"AVRISD::ROL", SDTIntUnaryOp>;
def AVRror : SDNode<"AVRISD::ROR", SDTIntUnaryOp>;
def AVRasr : SDNode<"AVRISD::ASR", SDTIntUnaryOp>;
-def AVRlsl7 : SDNode<"AVRISD::LSL7", SDTIntUnaryOp>;
-def AVRlsr7 : SDNode<"AVRISD::LSR7", SDTIntUnaryOp>;
-def AVRasr7 : SDNode<"AVRISD::ASR7", SDTIntUnaryOp>;
+def AVRlslbn : SDNode<"AVRISD::LSLBN", SDTIntBinOp>;
+def AVRlsrbn : SDNode<"AVRISD::LSRBN", SDTIntBinOp>;
+def AVRasrbn : SDNode<"AVRISD::ASRBN", SDTIntBinOp>;
+def AVRlslwn : SDNode<"AVRISD::LSLWN", SDTIntBinOp>;
+def AVRlsrwn : SDNode<"AVRISD::LSRWN", SDTIntBinOp>;
+def AVRasrwn : SDNode<"AVRISD::ASRWN", SDTIntBinOp>;
// Pseudo shift nodes for non-constant shift amounts.
def AVRlslLoop : SDNode<"AVRISD::LSLLOOP", SDTIntShiftOp>;
@@ -357,11 +360,13 @@
"#ADJCALLSTACKDOWN",
[(AVRcallseq_start timm:$amt, timm:$amt2)]>;
- // R31R30 is used to update SP, since it is a scratch reg and this instruction
- // is placed after the function call then R31R30 should be always free.
- //let Defs = [R31R30],
- //Uses = [R31R30] in
- //:TODO: if we enable this, the pseudo is killed because it looks dead
+ // R31R30 is used to update SP. It is normally free because it is a
+ // call-clobbered register, but it must still be marked as a def because the
+ // register allocator might use it in rare cases (for rematerialization, it
+ // seems). hasSideEffects needs to be set to true so this instruction isn't
+ // considered dead.
+ let Defs = [R31R30],
+ hasSideEffects=1 in
def ADJCALLSTACKUP : Pseudo<(outs),
(ins i16imm:$amt1, i16imm:$amt2),
"#ADJCALLSTACKUP",
@@ -750,7 +755,7 @@
// Expands to:
// neg Rd+1
// neg Rd
- // sbci Rd+1, 0
+ // sbc Rd+1, r1
def NEGWRd : Pseudo<(outs DREGS:$rd),
(ins DREGS:$src),
"negw\t$rd",
@@ -1292,6 +1297,7 @@
Pseudo<(outs), (ins PTRRC:$rd, DRC:$rr), "atomic_op",
[(Op i16:$rd, DRC:$rr)]>;
+let Constraints = "@earlyclobber $rd" in
class AtomicLoadOp<PatFrag Op, RegisterClass DRC,
RegisterClass PTRRC> :
Pseudo<(outs DRC:$rd), (ins PTRRC:$rr, DRC:$operand),
@@ -1669,10 +1675,17 @@
"lslw\t$rd",
[(set i16:$rd, (AVRlsl i16:$src)), (implicit SREG)]>;
- def LSLB7Rd : Pseudo<(outs GPR8:$rd),
- (ins GPR8:$src),
- "lslb7\t$rd",
- [(set i8:$rd, (AVRlsl7 i8:$src)), (implicit SREG)]>;
+ def LSLWNRd : Pseudo<(outs DLDREGS:$rd),
+ (ins DREGS:$src, imm16:$bits),
+ "lslwn\t$rd, $bits",
+ [(set i16:$rd, (AVRlslwn i16:$src, imm:$bits)),
+ (implicit SREG)]>;
+
+ def LSLBNRd : Pseudo<(outs LD8:$rd),
+ (ins GPR8:$src, imm_ldi8:$bits),
+ "lslbn\t$rd, $bits",
+ [(set i8:$rd, (AVRlslbn i8:$src, imm:$bits)),
+ (implicit SREG)]>;
def LSRRd : FRd<0b1001,
0b0100110,
@@ -1681,16 +1694,23 @@
"lsr\t$rd",
[(set i8:$rd, (AVRlsr i8:$src)), (implicit SREG)]>;
- def LSRB7Rd : Pseudo<(outs GPR8:$rd),
- (ins GPR8:$src),
- "lsrb7\t$rd",
- [(set i8:$rd, (AVRlsr7 i8:$src)), (implicit SREG)]>;
-
def LSRWRd : Pseudo<(outs DREGS:$rd),
(ins DREGS:$src),
"lsrw\t$rd",
[(set i16:$rd, (AVRlsr i16:$src)), (implicit SREG)]>;
+ def LSRWNRd : Pseudo<(outs DLDREGS:$rd),
+ (ins DREGS:$src, imm16:$bits),
+ "lsrwn\t$rd, $bits",
+ [(set i16:$rd, (AVRlsrwn i16:$src, imm:$bits)),
+ (implicit SREG)]>;
+
+ def LSRBNRd : Pseudo<(outs LD8:$rd),
+ (ins GPR8:$src, imm_ldi8:$bits),
+ "lsrbn\t$rd, $bits",
+ [(set i8:$rd, (AVRlsrbn i8:$src, imm:$bits)),
+ (implicit SREG)]>;
+
def ASRRd : FRd<0b1001,
0b0100101,
(outs GPR8:$rd),
@@ -1698,30 +1718,36 @@
"asr\t$rd",
[(set i8:$rd, (AVRasr i8:$src)), (implicit SREG)]>;
- def ASRB7Rd : Pseudo<(outs GPR8:$rd),
- (ins GPR8:$src),
- "asrb7\t$rd",
- [(set i8:$rd, (AVRasr7 i8:$src)), (implicit SREG)]>;
+ def ASRWNRd : Pseudo<(outs DLDREGS:$rd),
+ (ins DREGS:$src, imm16:$bits),
+ "asrwn\t$rd, $bits",
+ [(set i16:$rd, (AVRasrwn i16:$src, imm:$bits)),
+ (implicit SREG)]>;
+
+ def ASRBNRd : Pseudo<(outs LD8:$rd),
+ (ins GPR8:$src, imm_ldi8:$bits),
+ "asrbn\t$rd, $bits",
+ [(set i8:$rd, (AVRasrbn i8:$src, imm:$bits)),
+ (implicit SREG)]>;
def ASRWRd : Pseudo<(outs DREGS:$rd),
(ins DREGS:$src),
"asrw\t$rd",
[(set i16:$rd, (AVRasr i16:$src)), (implicit SREG)]>;
+ def ROLBRd : Pseudo<(outs GPR8:$rd),
+ (ins GPR8:$src),
+ "rolb\t$rd",
+ [(set i8:$rd, (AVRrol i8:$src)), (implicit SREG)]>;
+
+ def RORBRd : Pseudo<(outs GPR8:$rd),
+ (ins GPR8:$src),
+ "rorb\t$rd",
+ [(set i8:$rd, (AVRror i8:$src)), (implicit SREG)]>;
+
// Bit rotate operations.
let Uses = [SREG] in
{
- // 8-bit ROL is an alias of ADC Rd, Rd
-
- def ROLBRd : Pseudo<(outs GPR8:$rd),
- (ins GPR8:$src),
- "rolb\t$rd",
- [(set i8:$rd, (AVRrol i8:$src)), (implicit SREG)]>;
-
- def RORBRd : Pseudo<(outs GPR8:$rd),
- (ins GPR8:$src),
- "rorb\t$rd",
- [(set i8:$rd, (AVRror i8:$src)), (implicit SREG)]>;
def ROLWRd : Pseudo<(outs DREGS:$rd),
(ins DREGS:$src),
@@ -1777,10 +1803,11 @@
"bst\t$rd, $b",
[]>;
-let Uses = [SREG] in
+let Constraints = "$src = $rd",
+Uses = [SREG] in
def BLD : FRdB<0b00,
- (outs),
- (ins GPR8:$rd, i8imm:$b),
+ (outs GPR8:$rd),
+ (ins GPR8:$src, i8imm:$b),
"bld\t$rd, $b",
[]>;
@@ -2123,12 +2150,7 @@
def : Pat<(i16 (AVRWrapper tblockaddress:$dst)),
(LDIWRdK tblockaddress:$dst)>;
-// hi-reg truncation : trunc(int16 >> 8)
-//:FIXME: i think it's better to emit an extract subreg node in the DAG than
-// all this mess once we get optimal shift code
-// lol... I think so, too. [@agnat]
-def : Pat<(i8 (trunc (AVRlsr (AVRlsr (AVRlsr (AVRlsr (AVRlsr (AVRlsr (AVRlsr
- (AVRlsr DREGS:$src)))))))))),
+def : Pat<(i8 (trunc (AVRlsrwn DLDREGS:$src, (i16 8)))),
(EXTRACT_SUBREG DREGS:$src, sub_hi)>;
// :FIXME: DAGCombiner produces an shl node after legalization from these seq:
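A small source-level example (a sketch; plain C++ compiled for AVR) of the high-byte idiom the rewritten pattern above is meant to catch:

#include <stdint.h>

// trunc(x >> 8) on a 16-bit value: with the AVRlsrwn pattern above this
// should select to an extract of the high subregister rather than a chain
// of eight single-bit shifts.
uint8_t high_byte(uint16_t x) {
  return (uint8_t)(x >> 8);
}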
diff --git a/src/llvm-project/llvm/lib/Target/AVR/AVRRegisterInfo.td b/src/llvm-project/llvm/lib/Target/AVR/AVRRegisterInfo.td
index ab5d023..1948fcb 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/AVRRegisterInfo.td
+++ b/src/llvm-project/llvm/lib/Target/AVR/AVRRegisterInfo.td
@@ -67,12 +67,12 @@
def R23 : AVRReg<23, "r23">, DwarfRegNum<[23]>;
def R24 : AVRReg<24, "r24">, DwarfRegNum<[24]>;
def R25 : AVRReg<25, "r25">, DwarfRegNum<[25]>;
-def R26 : AVRReg<26, "r26">, DwarfRegNum<[26]>;
-def R27 : AVRReg<27, "r27">, DwarfRegNum<[27]>;
-def R28 : AVRReg<28, "r28">, DwarfRegNum<[28]>;
-def R29 : AVRReg<29, "r29">, DwarfRegNum<[29]>;
-def R30 : AVRReg<30, "r30">, DwarfRegNum<[30]>;
-def R31 : AVRReg<31, "r31">, DwarfRegNum<[31]>;
+def R26 : AVRReg<26, "r26", [], ["xl"]>, DwarfRegNum<[26]>;
+def R27 : AVRReg<27, "r27", [], ["xh"]>, DwarfRegNum<[27]>;
+def R28 : AVRReg<28, "r28", [], ["yl"]>, DwarfRegNum<[28]>;
+def R29 : AVRReg<29, "r29", [], ["yh"]>, DwarfRegNum<[29]>;
+def R30 : AVRReg<30, "r30", [], ["zl"]>, DwarfRegNum<[30]>;
+def R31 : AVRReg<31, "r31", [], ["zh"]>, DwarfRegNum<[31]>;
def SPL : AVRReg<32, "SPL">, DwarfRegNum<[32]>;
def SPH : AVRReg<33, "SPH">, DwarfRegNum<[33]>;
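A hedged usage sketch for the alternative register names added above (it assumes the integrated assembler resolves them when parsing the template; the clobber list keeps the canonical names):

#include <stdint.h>

// Hand-written assembly may now refer to the pointer-pair halves as
// xl/xh (r26/r27), yl/yh, zl/zh instead of the raw register numbers.
static inline void zero_x_pair(void) {
  asm volatile("clr xl \n\t clr xh" ::: "r26", "r27");
}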
@@ -171,6 +171,21 @@
R14R13, R12R11, R10R9
)>;
+// Lower 16-bit pair registers in R0..R15, only used in inline assembly.
+def DREGSlo : RegisterClass<"AVR", [i16], 8,
+ (
+ add R15R14, R13R12, R11R10, R9R8, R7R6, R5R4, R3R2, R1R0
+ )>;
+
+// Lower 16-bit pair registers in r16..r23, only used in inline assembly.
+def DREGSLD8lo : RegisterClass<"AVR", [i16], 8,
+ (
+ // Return value and arguments.
+ add R19R18, R21R20, R23R22,
+ // Callee saved registers.
+ R17R16
+ )>;
+
// 16-bit pair register class for movw
def DREGSMOVW : RegisterClass<"AVR", [i16], 8,
(
diff --git a/src/llvm-project/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp b/src/llvm-project/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp
index 6be9017..7d2d19d 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp
+++ b/src/llvm-project/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp
@@ -113,7 +113,7 @@
// Pop the original state of the pointer register.
buildMI(MBB, MBBI, AVR::POPWRd)
- .addReg(Ptr.getReg(), getKillRegState(Ptr.isKill()));
+ .addDef(Ptr.getReg(), getKillRegState(Ptr.isKill()));
MI.removeFromParent();
}
diff --git a/src/llvm-project/llvm/lib/Target/AVR/AVRShiftExpand.cpp b/src/llvm-project/llvm/lib/Target/AVR/AVRShiftExpand.cpp
new file mode 100644
index 0000000..b7dcd86
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/AVR/AVRShiftExpand.cpp
@@ -0,0 +1,147 @@
+//===- AVRShiftExpand.cpp - Shift Expansion Pass --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Expand 32-bit shift instructions (shl, lshr, ashr) to inline loops, just
+/// like avr-gcc. This must be done in IR because otherwise the type legalizer
+/// will turn 32-bit shifts into (non-existing) library calls such as __ashlsi3.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+
+using namespace llvm;
+
+namespace {
+
+class AVRShiftExpand : public FunctionPass {
+public:
+ static char ID;
+
+ AVRShiftExpand() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override { return "AVR Shift Expansion"; }
+
+private:
+ void expand(BinaryOperator *BI);
+};
+
+} // end of anonymous namespace
+
+char AVRShiftExpand::ID = 0;
+
+INITIALIZE_PASS(AVRShiftExpand, "avr-shift-expand", "AVR Shift Expansion",
+ false, false)
+
+Pass *llvm::createAVRShiftExpandPass() { return new AVRShiftExpand(); }
+
+bool AVRShiftExpand::runOnFunction(Function &F) {
+ SmallVector<BinaryOperator *, 1> ShiftInsts;
+ auto &Ctx = F.getContext();
+ for (Instruction &I : instructions(F)) {
+ if (!I.isShift())
+ // Only expand shift instructions (shl, lshr, ashr).
+ continue;
+ if (I.getType() != Type::getInt32Ty(Ctx))
+ // Only expand plain i32 types.
+ continue;
+ if (isa<ConstantInt>(I.getOperand(1)))
+ // Only expand when the shift amount is not known.
+ // Known shift amounts are (currently) better expanded inline.
+ continue;
+ ShiftInsts.push_back(cast<BinaryOperator>(&I));
+ }
+
+ // The expansion itself needs to be done separately because expand() will remove
+ // these instructions. Removing instructions while iterating over a basic
+ // block is not a great idea.
+ for (auto *I : ShiftInsts) {
+ expand(I);
+ }
+
+ // Return whether this function expanded any shift instructions.
+ return ShiftInsts.size() > 0;
+}
+
+void AVRShiftExpand::expand(BinaryOperator *BI) {
+ auto &Ctx = BI->getContext();
+ IRBuilder<> Builder(BI);
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+ Type *Int8Ty = Type::getInt8Ty(Ctx);
+ Value *Int8Zero = ConstantInt::get(Int8Ty, 0);
+
+ // Split the current basic block at the point of the existing shift
+ // instruction and insert a new basic block for the loop.
+ BasicBlock *BB = BI->getParent();
+ Function *F = BB->getParent();
+ BasicBlock *EndBB = BB->splitBasicBlock(BI, "shift.done");
+ BasicBlock *LoopBB = BasicBlock::Create(Ctx, "shift.loop", F, EndBB);
+
+ // Truncate the shift amount to i8, which is trivially lowered to a single
+ // AVR register.
+ Builder.SetInsertPoint(&BB->back());
+ Value *ShiftAmount = Builder.CreateTrunc(BI->getOperand(1), Int8Ty);
+
+ // Replace the unconditional branch that splitBasicBlock created with a
+ // conditional branch.
+ Value *Cmp1 = Builder.CreateICmpEQ(ShiftAmount, Int8Zero);
+ Builder.CreateCondBr(Cmp1, EndBB, LoopBB);
+ BB->back().eraseFromParent();
+
+ // Create the loop body starting with PHI nodes.
+ Builder.SetInsertPoint(LoopBB);
+ PHINode *ShiftAmountPHI = Builder.CreatePHI(Int8Ty, 2);
+ ShiftAmountPHI->addIncoming(ShiftAmount, BB);
+ PHINode *ValuePHI = Builder.CreatePHI(Int32Ty, 2);
+ ValuePHI->addIncoming(BI->getOperand(0), BB);
+
+ // Subtract one from the shift amount, as we shift by one bit in this loop
+ // iteration.
+ Value *ShiftAmountSub =
+ Builder.CreateSub(ShiftAmountPHI, ConstantInt::get(Int8Ty, 1));
+ ShiftAmountPHI->addIncoming(ShiftAmountSub, LoopBB);
+
+ // Emit the actual shift instruction. The difference is that this shift
+ // instruction has a constant shift amount, which can be emitted inline
+ // without a library call.
+ Value *ValueShifted;
+ switch (BI->getOpcode()) {
+ case Instruction::Shl:
+ ValueShifted = Builder.CreateShl(ValuePHI, ConstantInt::get(Int32Ty, 1));
+ break;
+ case Instruction::LShr:
+ ValueShifted = Builder.CreateLShr(ValuePHI, ConstantInt::get(Int32Ty, 1));
+ break;
+ case Instruction::AShr:
+ ValueShifted = Builder.CreateAShr(ValuePHI, ConstantInt::get(Int32Ty, 1));
+ break;
+ default:
+ llvm_unreachable("asked to expand an instruction that is not a shift");
+ }
+ ValuePHI->addIncoming(ValueShifted, LoopBB);
+
+ // Branch to either the loop again (if there is more to shift) or to the
+ // basic block after the loop (if all bits are shifted).
+ Value *Cmp2 = Builder.CreateICmpEQ(ShiftAmountSub, Int8Zero);
+ Builder.CreateCondBr(Cmp2, EndBB, LoopBB);
+
+ // Collect the resulting value. This is necessary in the IR but won't produce
+ // any actual instructions.
+ Builder.SetInsertPoint(BI);
+ PHINode *Result = Builder.CreatePHI(Int32Ty, 2);
+ Result->addIncoming(BI->getOperand(0), BB);
+ Result->addIncoming(ValueShifted, LoopBB);
+
+ // Replace the original shift instruction.
+ BI->replaceAllUsesWith(Result);
+ BI->eraseFromParent();
+}
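To make the effect of the pass concrete, a minimal source-level sketch (the function name is illustrative) of the kind of shift it expands: 32 bits wide with a non-constant amount:

#include <stdint.h>

// Shifting an i32 by a runtime amount. Without the expansion above, type
// legalization would try to call a helper such as __ashlsi3, which the file
// header notes does not exist for this target; with the pass, the shift
// becomes a small loop of constant one-bit shifts.
uint32_t shl32(uint32_t value, uint8_t amount) {
  return value << amount;
}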
diff --git a/src/llvm-project/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/AVR/AVRTargetMachine.cpp
index 0fa8623..5be4260 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -65,6 +65,7 @@
return getTM<AVRTargetMachine>();
}
+ void addIRPasses() override;
bool addInstSelector() override;
void addPreSched2() override;
void addPreEmitPass() override;
@@ -76,6 +77,15 @@
return new AVRPassConfig(*this, PM);
}
+void AVRPassConfig::addIRPasses() {
+ // Expand instructions like
+ // %result = shl i32 %n, %amount
+ // to a loop so that library calls are avoided.
+ addPass(createAVRShiftExpandPass());
+
+ TargetPassConfig::addIRPasses();
+}
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() {
// Register the target.
RegisterTargetMachine<AVRTargetMachine> X(getTheAVRTarget());
@@ -83,6 +93,7 @@
auto &PR = *PassRegistry::getPassRegistry();
initializeAVRExpandPseudoPass(PR);
initializeAVRRelaxMemPass(PR);
+ initializeAVRShiftExpandPass(PR);
}
const AVRSubtarget *AVRTargetMachine::getSubtargetImpl() const {
diff --git a/src/llvm-project/llvm/lib/Target/AVR/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/AVR/CMakeLists.txt
index f24bbd3..cde2138 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/AVR/CMakeLists.txt
@@ -24,6 +24,7 @@
AVRMCInstLower.cpp
AVRRelaxMemOperations.cpp
AVRRegisterInfo.cpp
+ AVRShiftExpand.cpp
AVRSubtarget.cpp
AVRTargetMachine.cpp
AVRTargetObjectFile.cpp
diff --git a/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
index 1c69fea..bedf68d 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/AVRFixupKinds.h"
+#include "MCTargetDesc/AVRMCExpr.h"
#include "MCTargetDesc/AVRMCTargetDesc.h"
#include "llvm/MC/MCAssembler.h"
@@ -72,6 +73,7 @@
case MCSymbolRefExpr::VK_None:
return ELF::R_AVR_16;
case MCSymbolRefExpr::VK_AVR_NONE:
+ case MCSymbolRefExpr::VK_AVR_PM:
return ELF::R_AVR_16_PM;
case MCSymbolRefExpr::VK_AVR_DIFF16:
return ELF::R_AVR_DIFF16;
diff --git a/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
index db995e2..50872d6 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
+++ b/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
@@ -254,11 +254,8 @@
if (MO.isReg()) return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
if (MO.isImm()) return static_cast<unsigned>(MO.getImm());
- if (MO.isFPImm())
- return static_cast<unsigned>(APFloat(MO.getFPImm())
- .bitcastToAPInt()
- .getHiBits(32)
- .getLimitedValue());
+ if (MO.isDFPImm())
+ return static_cast<unsigned>(bit_cast<double>(MO.getDFPImm()));
// MO must be an Expr.
assert(MO.isExpr());
diff --git a/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp b/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
index 77b4993..0743344 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
@@ -10,14 +10,14 @@
// instructions on to the real streamer.
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "avrmcelfstreamer"
-
#include "MCTargetDesc/AVRMCELFStreamer.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCObjectWriter.h"
+#define DEBUG_TYPE "avrmcelfstreamer"
+
using namespace llvm;
void AVRMCELFStreamer::emitValueForModiferKind(
diff --git a/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 9eff554..a4f8787 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -26,6 +26,7 @@
{"hh8", AVRMCExpr::VK_AVR_HH8}, // synonym with hlo8
{"hlo8", AVRMCExpr::VK_AVR_HH8}, {"hhi8", AVRMCExpr::VK_AVR_HHI8},
+ {"pm", AVRMCExpr::VK_AVR_PM},
{"pm_lo8", AVRMCExpr::VK_AVR_PM_LO8}, {"pm_hi8", AVRMCExpr::VK_AVR_PM_HI8},
{"pm_hh8", AVRMCExpr::VK_AVR_PM_HH8},
@@ -87,6 +88,9 @@
MCSymbolRefExpr::VariantKind Modifier = Sym->getKind();
if (Modifier != MCSymbolRefExpr::VK_None)
return false;
+ if (Kind == VK_AVR_PM) {
+ Modifier = MCSymbolRefExpr::VK_AVR_PM;
+ }
Sym = MCSymbolRefExpr::create(&Sym->getSymbol(), Modifier, Context);
Result = MCValue::get(Sym, Value.getSymB(), Value.getConstant());
@@ -131,6 +135,7 @@
Value &= 0xff0000;
Value >>= 16;
break;
+ case AVRMCExpr::VK_AVR_PM:
case AVRMCExpr::VK_AVR_GS:
Value >>= 1; // Program memory addresses must always be shifted by one.
break;
@@ -167,6 +172,7 @@
case VK_AVR_PM_HH8:
Kind = isNegated() ? AVR::fixup_hh8_ldi_pm_neg : AVR::fixup_hh8_ldi_pm;
break;
+ case VK_AVR_PM:
case VK_AVR_GS:
Kind = AVR::fixup_16_pm;
break;
diff --git a/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h b/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
index 3b696ba..e35385e 100644
--- a/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
+++ b/src/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
@@ -20,13 +20,14 @@
public:
/// Specifies the type of an expression.
enum VariantKind {
- VK_AVR_None,
+ VK_AVR_None = 0,
VK_AVR_HI8, ///< Corresponds to `hi8()`.
VK_AVR_LO8, ///< Corresponds to `lo8()`.
VK_AVR_HH8, ///< Corresponds to `hlo8() and hh8()`.
VK_AVR_HHI8, ///< Corresponds to `hhi8()`.
+ VK_AVR_PM, ///< Corresponds to `pm()`, reference to program memory.
VK_AVR_PM_LO8, ///< Corresponds to `pm_lo8()`.
VK_AVR_PM_HI8, ///< Corresponds to `pm_hi8()`.
VK_AVR_PM_HH8, ///< Corresponds to `pm_hh8()`.
diff --git a/src/llvm-project/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/src/llvm-project/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
index cd994a9..ab7e848 100644
--- a/src/llvm-project/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/src/llvm-project/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -268,6 +268,11 @@
return DimSize;
}
+static Type *getBaseElementType(const CallInst *Call) {
+ // Element type is stored in an elementtype() attribute on the first param.
+ return Call->getAttributes().getParamElementType(0);
+}
+
/// Check whether a call is a preserve_*_access_index intrinsic call or not.
bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
CallInfo &CInfo) {
@@ -284,8 +289,7 @@
report_fatal_error("Missing metadata for llvm.preserve.array.access.index intrinsic");
CInfo.AccessIndex = getConstant(Call->getArgOperand(2));
CInfo.Base = Call->getArgOperand(0);
- CInfo.RecordAlignment =
- DL->getABITypeAlign(CInfo.Base->getType()->getPointerElementType());
+ CInfo.RecordAlignment = DL->getABITypeAlign(getBaseElementType(Call));
return true;
}
if (GV->getName().startswith("llvm.preserve.union.access.index")) {
@@ -306,8 +310,7 @@
report_fatal_error("Missing metadata for llvm.preserve.struct.access.index intrinsic");
CInfo.AccessIndex = getConstant(Call->getArgOperand(2));
CInfo.Base = Call->getArgOperand(0);
- CInfo.RecordAlignment =
- DL->getABITypeAlign(CInfo.Base->getType()->getPointerElementType());
+ CInfo.RecordAlignment = DL->getABITypeAlign(getBaseElementType(Call));
return true;
}
if (GV->getName().startswith("llvm.bpf.preserve.field.info")) {
@@ -367,8 +370,8 @@
IdxList.push_back(Zero);
IdxList.push_back(Call->getArgOperand(GEPIndex));
- auto *GEP = GetElementPtrInst::CreateInBounds(Call->getArgOperand(0),
- IdxList, "", Call);
+ auto *GEP = GetElementPtrInst::CreateInBounds(
+ getBaseElementType(Call), Call->getArgOperand(0), IdxList, "", Call);
Call->replaceAllUsesWith(GEP);
Call->eraseFromParent();
}
@@ -872,6 +875,8 @@
if (CInfo.Kind == BPFPreserveFieldInfoAI) {
InfoKind = CInfo.AccessIndex;
+ if (InfoKind == BPFCoreSharedInfo::FIELD_EXISTENCE)
+ PatchImm = 1;
break;
}
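A consolidated sketch of the element-type plumbing used in this file, reusing only the APIs the change itself relies on; the helper name is illustrative:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Illustrative helper: rebuild a preserve.*.access.index call as an inbounds
// GEP, taking the element type from the elementtype() attribute on the call's
// first parameter rather than from the pointer operand's pointee type.
static Instruction *rebuildAsGEP(CallInst *Call, ArrayRef<Value *> IdxList) {
  Type *ElTy = Call->getAttributes().getParamElementType(0);
  return GetElementPtrInst::CreateInBounds(ElTy, Call->getArgOperand(0),
                                           IdxList, "", Call);
}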
diff --git a/src/llvm-project/llvm/lib/Target/BPF/BPFAdjustOpt.cpp b/src/llvm-project/llvm/lib/Target/BPF/BPFAdjustOpt.cpp
index da543e7..7088d55 100644
--- a/src/llvm-project/llvm/lib/Target/BPF/BPFAdjustOpt.cpp
+++ b/src/llvm-project/llvm/lib/Target/BPF/BPFAdjustOpt.cpp
@@ -16,6 +16,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -25,6 +26,7 @@
#define DEBUG_TYPE "bpf-adjust-opt"
using namespace llvm;
+using namespace llvm::PatternMatch;
static cl::opt<bool>
DisableBPFserializeICMP("bpf-disable-serialize-icmp", cl::Hidden,
@@ -115,12 +117,14 @@
// comp2 = icmp <opcode> ...;
// new_comp1 = __builtin_bpf_passthrough(seq_num, comp1)
// ... or new_comp1 comp2 ...
- if (I.getOpcode() != Instruction::Or)
+ Value *Op0, *Op1;
+ // Use LogicalOr (accept `or i1` as well as `select i1 Op0, true, Op1`)
+ if (!match(&I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))
return false;
- auto *Icmp1 = dyn_cast<ICmpInst>(I.getOperand(0));
+ auto *Icmp1 = dyn_cast<ICmpInst>(Op0);
if (!Icmp1)
return false;
- auto *Icmp2 = dyn_cast<ICmpInst>(I.getOperand(1));
+ auto *Icmp2 = dyn_cast<ICmpInst>(Op1);
if (!Icmp2)
return false;
@@ -268,9 +272,9 @@
// load/store insn before this instruction in this basic
// block. Most likely it cannot be hoisted out. Skip it.
for (auto &I2 : *Inst->getParent()) {
- if (dyn_cast<CallInst>(&I2))
+ if (isa<CallInst>(&I2))
return false;
- if (dyn_cast<LoadInst>(&I2) || dyn_cast<StoreInst>(&I2))
+ if (isa<LoadInst>(&I2) || isa<StoreInst>(&I2))
return false;
if (&I2 == Inst)
break;
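The switch to m_LogicalOr deserves a standalone illustration: it matches both the plain `or i1` form and the `select i1 %a, true, %b` form that instcombine may produce. A hedged sketch with the same PatternMatch API (the helper name is illustrative):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true for `or i1 %c1, %c2` as well as `select i1 %c1, true, %c2`
// when both operands are integer compares.
static bool isLogicalOrOfICmps(Instruction &I) {
  Value *Op0, *Op1;
  return match(&I, m_LogicalOr(m_Value(Op0), m_Value(Op1))) &&
         isa<ICmpInst>(Op0) && isa<ICmpInst>(Op1);
}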
diff --git a/src/llvm-project/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index f10a0d4..eb506d3 100644
--- a/src/llvm-project/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/src/llvm-project/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -100,7 +100,7 @@
bool BPFDAGToDAGISel::SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
// if Address is FI, get the TargetFrameIndex.
SDLoc DL(Addr);
- if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ if (auto *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
return true;
@@ -112,12 +112,10 @@
// Addresses of the form Addr+const or Addr|const
if (CurDAG->isBaseWithConstantOffset(Addr)) {
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+ auto *CN = cast<ConstantSDNode>(Addr.getOperand(1));
if (isInt<16>(CN->getSExtValue())) {
-
// If the first operand is a FI, get the TargetFI Node
- if (FrameIndexSDNode *FIN =
- dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+ if (auto *FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
else
Base = Addr.getOperand(0);
@@ -141,11 +139,10 @@
return false;
// Addresses of the form Addr+const or Addr|const
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+ auto *CN = cast<ConstantSDNode>(Addr.getOperand(1));
if (isInt<16>(CN->getSExtValue())) {
-
// If the first operand is a FI, get the TargetFI Node
- if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+ if (auto *FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
else
return false;
diff --git a/src/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp b/src/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp
index 20fc394..c543dfc 100644
--- a/src/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -78,6 +78,24 @@
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ // Mark unsupported atomic operations as Custom so we can emit better error
+ // messages than the generic fatal error from SelectionDAG.
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32}) {
+ if (VT == MVT::i32) {
+ if (STI.getHasAlu32())
+ continue;
+ } else {
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
+ }
+
+ setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
+ setOperationAction(ISD::ATOMIC_SWAP, VT, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
+ }
+
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i32 && !STI.getHasAlu32())
continue;
@@ -236,6 +254,30 @@
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
+void BPFTargetLowering::ReplaceNodeResults(
+ SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+ const char *err_msg;
+ uint32_t Opcode = N->getOpcode();
+ switch (Opcode) {
+ default:
+ report_fatal_error("Unhandled custom legalization");
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ if (HasAlu32 || Opcode == ISD::ATOMIC_LOAD_ADD)
+ err_msg = "Unsupported atomic operations, please use 32/64 bit version";
+ else
+ err_msg = "Unsupported atomic operations, please use 64 bit version";
+ break;
+ }
+
+ SDLoc DL(N);
+ fail(DL, DAG, err_msg);
+}
+
SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
case ISD::BR_CC:
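For a sense of what reaches the new diagnostic, a hedged source-level example using the standard __atomic builtins (how exactly the front end lowers it may vary): a 16-bit atomic OR has no BPF encoding, so the backend now reports the targeted message above instead of a generic SelectionDAG fatal error.

#include <stdint.h>

// A sub-word (16-bit) atomic RMW other than add: not lowerable on BPF, so
// custom legalization reports "Unsupported atomic operations, please use
// 32/64 bit version" (or the 64-bit-only variant without alu32).
uint16_t fetch_or16(uint16_t *p, uint16_t v) {
  return __atomic_fetch_or(p, v, __ATOMIC_SEQ_CST);
}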
diff --git a/src/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.h b/src/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.h
index 7fa3ce8..d500742 100644
--- a/src/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -102,11 +102,16 @@
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
EVT getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const override {
return Op.size() >= 8 ? MVT::i64 : MVT::i32;
}
+ bool isIntDivCheap(EVT VT, AttributeList Attr) const override { return true; }
+
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override {
return true;
diff --git a/src/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index a8fef25..5b04312 100644
--- a/src/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -122,11 +122,10 @@
});
}
-void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
- bool DebugPassManager) {
+void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerPipelineStartEPCallback(
[=](ModulePassManager &MPM, PassBuilder::OptimizationLevel) {
- FunctionPassManager FPM(DebugPassManager);
+ FunctionPassManager FPM;
FPM.addPass(BPFAbstractMemberAccessPass(this));
FPM.addPass(BPFPreserveDITypePass());
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
diff --git a/src/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h b/src/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h
index 61c8a44..98f64cc 100644
--- a/src/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h
+++ b/src/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h
@@ -41,8 +41,7 @@
}
void adjustPassManager(PassManagerBuilder &) override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool DebugPassManager) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
};
}
diff --git a/src/llvm-project/llvm/lib/Target/BPF/BPFTargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
index 6205549..417e8b6 100644
--- a/src/llvm-project/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
@@ -44,16 +44,34 @@
return TTI::TCC_Basic;
}
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const llvm::Instruction *I = nullptr) {
+ InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const llvm::Instruction *I = nullptr) {
if (Opcode == Instruction::Select)
- return SCEVCheapExpansionBudget;
+ return SCEVCheapExpansionBudget.getValue();
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
I);
}
+
+ InstructionCost getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+ const Instruction *CxtI = nullptr) {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ if (ISD == ISD::ADD && CostKind == TTI::TCK_RecipThroughput)
+ return SCEVCheapExpansionBudget.getValue() + 1;
+
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info, Opd1PropInfo,
+ Opd2PropInfo);
+ }
};
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp b/src/llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp
index 9249d67..c1f8ea9 100644
--- a/src/llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/src/llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -1030,13 +1030,16 @@
FieldRelocTable[SecNameOff].push_back(FieldReloc);
}
-void BTFDebug::processReloc(const MachineOperand &MO) {
+void BTFDebug::processGlobalValue(const MachineOperand &MO) {
// check whether this is a candidate or not
if (MO.isGlobal()) {
const GlobalValue *GVal = MO.getGlobal();
auto *GVar = dyn_cast<GlobalVariable>(GVal);
- if (!GVar)
+ if (!GVar) {
+ // Not a global variable. Maybe an extern function reference.
+ processFuncPrototypes(dyn_cast<Function>(GVal));
return;
+ }
if (!GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr) &&
!GVar->hasAttribute(BPFCoreSharedInfo::TypeIdAttr))
@@ -1087,12 +1090,12 @@
//
// If the insn is "r2 = LD_imm64 @<an TypeIdAttr global>",
// The LD_imm64 result will be replaced with a btf type id.
- processReloc(MI->getOperand(1));
+ processGlobalValue(MI->getOperand(1));
} else if (MI->getOpcode() == BPF::CORE_MEM ||
MI->getOpcode() == BPF::CORE_ALU32_MEM ||
MI->getOpcode() == BPF::CORE_SHIFT) {
// relocation insn is a load, store or shift insn.
- processReloc(MI->getOperand(3));
+ processGlobalValue(MI->getOperand(3));
} else if (MI->getOpcode() == BPF::JAL) {
// check extern function references
const MachineOperand &MO = MI->getOperand(0);
@@ -1146,10 +1149,6 @@
SecName = ".rodata";
else
SecName = Global.getInitializer()->isZeroValue() ? ".bss" : ".data";
- } else {
- // extern variables without explicit section,
- // put them into ".extern" section.
- SecName = ".extern";
}
if (ProcessingMapDef != SecName.startswith(".maps"))
@@ -1213,7 +1212,9 @@
std::make_unique<BTFKindVar>(Global.getName(), GVTypeId, GVarInfo);
uint32_t VarId = addType(std::move(VarEntry));
- assert(!SecName.empty());
+ // An empty SecName means an extern variable without a section attribute.
+ if (SecName.empty())
+ continue;
// Find or create a DataSec
if (DataSecEntries.find(std::string(SecName)) == DataSecEntries.end()) {
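A small example of the extern-variable case handled above (a sketch; names are illustrative): an extern declaration with no section attribute still gets a BTF VAR entry but is no longer placed in a synthetic ".extern" DataSec.

// Extern variable without a section attribute: SecName stays empty, the
// VAR type is still emitted, and the DataSec lookup below is skipped.
extern int shared_counter;

int read_counter(void) { return shared_counter; }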
diff --git a/src/llvm-project/llvm/lib/Target/BPF/BTFDebug.h b/src/llvm-project/llvm/lib/Target/BPF/BTFDebug.h
index 76f1901..2fdcf85 100644
--- a/src/llvm-project/llvm/lib/Target/BPF/BTFDebug.h
+++ b/src/llvm-project/llvm/lib/Target/BPF/BTFDebug.h
@@ -320,8 +320,9 @@
/// Populating unprocessed type on demand.
unsigned populateType(const DIType *Ty);
- /// Process relocation instructions.
- void processReloc(const MachineOperand &MO);
+ /// Process global variables referenced by relocation instructions
+ /// and extern function references.
+ void processGlobalValue(const MachineOperand &MO);
/// Emit common header of .BTF and .BTF.ext sections.
void emitCommonHeader();
diff --git a/src/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/src/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 29e9d5d..6687dbe 100644
--- a/src/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/src/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -63,7 +63,7 @@
MutableArrayRef<char> Data, uint64_t Value,
bool IsResolved,
const MCSubtargetInfo *STI) const {
- if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
+ if (Fixup.getKind() == FK_SecRel_8) {
// The Value is 0 for global variables, and the in-section offset
// for static variables. Write to the immediate field of the inst.
assert(Value <= UINT32_MAX);
diff --git a/src/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index ef4e324..7c0c23a 100644
--- a/src/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -43,12 +43,13 @@
default:
llvm_unreachable("invalid fixup kind!");
case FK_SecRel_8:
+ // LD_imm64 instruction.
return ELF::R_BPF_64_64;
case FK_PCRel_4:
- case FK_SecRel_4:
+ // CALL instruction.
return ELF::R_BPF_64_32;
case FK_Data_8:
- return ELF::R_BPF_64_64;
+ return ELF::R_BPF_64_ABS64;
case FK_Data_4:
if (const MCSymbolRefExpr *A = Target.getSymA()) {
const MCSymbol &Sym = A->getSymbol();
@@ -63,23 +64,22 @@
if (Sym.isTemporary()) {
// .BTF.ext generates FK_Data_4 relocations for
// insn offset by creating temporary labels.
- // The insn offset is within the code section and
- // already been fulfilled by applyFixup(). No
- // further relocation is needed.
// The reloc symbol should be in text section.
+ // Use a different relocation to instruct the ExecutionEngine's
+ // RuntimeDyld not to relocate it, while still allowing lld to do
+ // proper adjustment when merging sections.
if ((Flags & ELF::SHF_ALLOC) && (Flags & ELF::SHF_EXECINSTR))
- return ELF::R_BPF_NONE;
+ return ELF::R_BPF_64_NODYLD32;
} else {
// .BTF generates FK_Data_4 relocations for variable
- // offset in DataSec kind. Similar to the above .BTF.ext
- // insn offset, no further relocation is needed.
+ // offset in DataSec kind.
// The reloc symbol should be in data section.
if ((Flags & ELF::SHF_ALLOC) && (Flags & ELF::SHF_WRITE))
- return ELF::R_BPF_NONE;
+ return ELF::R_BPF_64_NODYLD32;
}
}
}
- return ELF::R_BPF_64_32;
+ return ELF::R_BPF_64_ABS32;
}
}
diff --git a/src/llvm-project/llvm/lib/Target/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/CMakeLists.txt
index e87ccde..417fd83 100644
--- a/src/llvm-project/llvm/lib/Target/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/CMakeLists.txt
@@ -21,7 +21,8 @@
# When building shared objects for each target there are some internal APIs
# that are used across shared objects which we can't hide.
-if (NOT BUILD_SHARED_LIBS AND NOT APPLE)
+if (NOT BUILD_SHARED_LIBS AND NOT APPLE AND
+ NOT DEFINED CMAKE_CXX_VISIBILITY_PRESET)
# Set default visibility to hidden, so we don't export all the Target classes
# in libLLVM.so.
set(CMAKE_CXX_VISIBILITY_PRESET hidden)
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/AsmParser/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/CSKY/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..e7b5cdb
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/CSKY/AsmParser/CMakeLists.txt
@@ -0,0 +1,13 @@
+add_llvm_component_library(LLVMCSKYAsmParser
+ CSKYAsmParser.cpp
+
+ LINK_COMPONENTS
+ CSKYDesc
+ CSKYInfo
+ MC
+ MCParser
+ Support
+
+ ADD_TO_COMPONENT
+ CSKY
+ )
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp b/src/llvm-project/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp
new file mode 100644
index 0000000..f2a3811
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp
@@ -0,0 +1,652 @@
+//===-- CSKYAsmParser.cpp - Parse CSKY assembly to MCInst instructions --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/CSKYMCExpr.h"
+#include "MCTargetDesc/CSKYMCTargetDesc.h"
+#include "TargetInfo/CSKYTargetInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+namespace {
+struct CSKYOperand;
+
+class CSKYAsmParser : public MCTargetAsmParser {
+
+ bool generateImmOutOfRangeError(OperandVector &Operands, uint64_t ErrorInfo,
+ int64_t Lower, int64_t Upper, Twine Msg);
+
+ SMLoc getLoc() const { return getParser().getTok().getLoc(); }
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+
+ bool ParseDirective(AsmToken DirectiveID) override;
+
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
+
+// Auto-generated instruction matching functions
+#define GET_ASSEMBLER_HEADER
+#include "CSKYGenAsmMatcher.inc"
+
+ OperandMatchResultTy parseImmediate(OperandVector &Operands);
+ OperandMatchResultTy parseRegister(OperandVector &Operands);
+ OperandMatchResultTy parseBaseRegImm(OperandVector &Operands);
+ OperandMatchResultTy parseCSKYSymbol(OperandVector &Operands);
+ OperandMatchResultTy parseConstpoolSymbol(OperandVector &Operands);
+
+ bool parseOperand(OperandVector &Operands, StringRef Mnemonic);
+
+public:
+ enum CSKYMatchResultTy {
+ Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "CSKYGenAsmMatcher.inc"
+#undef GET_OPERAND_DIAGNOSTIC_TYPES
+ };
+
+ CSKYAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, STI, MII) {
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ }
+};
+
+/// Instances of this class represent a parsed machine instruction.
+struct CSKYOperand : public MCParsedAsmOperand {
+ enum KindTy {
+ Token,
+ Register,
+ Immediate,
+ } Kind;
+
+ struct RegOp {
+ unsigned RegNum;
+ };
+
+ struct ImmOp {
+ const MCExpr *Val;
+ };
+
+ SMLoc StartLoc, EndLoc;
+ union {
+ StringRef Tok;
+ RegOp Reg;
+ ImmOp Imm;
+ };
+
+ CSKYOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+public:
+ CSKYOperand(const CSKYOperand &o) : MCParsedAsmOperand() {
+ Kind = o.Kind;
+ StartLoc = o.StartLoc;
+ EndLoc = o.EndLoc;
+ switch (Kind) {
+ case Register:
+ Reg = o.Reg;
+ break;
+ case Immediate:
+ Imm = o.Imm;
+ break;
+ case Token:
+ Tok = o.Tok;
+ break;
+ }
+ }
+
+ bool isToken() const override { return Kind == Token; }
+ bool isReg() const override { return Kind == Register; }
+ bool isImm() const override { return Kind == Immediate; }
+ bool isMem() const override { return false; }
+
+ static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm) {
+ if (auto CE = dyn_cast<MCConstantExpr>(Expr)) {
+ Imm = CE->getValue();
+ return true;
+ }
+
+ return false;
+ }
+
+ template <unsigned num, unsigned shift = 0> bool isUImm() const {
+ if (!isImm())
+ return false;
+
+ int64_t Imm;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm);
+ return IsConstantImm && isShiftedUInt<num, shift>(Imm);
+ }
+
+ template <unsigned num> bool isOImm() const {
+ if (!isImm())
+ return false;
+
+ int64_t Imm;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm);
+ return IsConstantImm && isUInt<num>(Imm - 1);
+ }
+
+ template <unsigned num, unsigned shift = 0> bool isSImm() const {
+ if (!isImm())
+ return false;
+
+ int64_t Imm;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm);
+ return IsConstantImm && isShiftedInt<num, shift>(Imm);
+ }
+
+ bool isUImm2() const { return isUImm<2>(); }
+ bool isUImm5() const { return isUImm<5>(); }
+ bool isUImm12() const { return isUImm<12>(); }
+ bool isUImm16() const { return isUImm<16>(); }
+
+ bool isOImm12() const { return isOImm<12>(); }
+ bool isOImm16() const { return isOImm<16>(); }
+
+ bool isUImm12Shift1() { return isUImm<12, 1>(); }
+ bool isUImm12Shift2() { return isUImm<12, 2>(); }
+
+ bool isSImm16Shift1() { return isSImm<16, 1>(); }
+
+ bool isCSKYSymbol() const {
+ int64_t Imm;
+ // Must be of 'immediate' type but not a constant.
+ return isImm() && !evaluateConstantImm(getImm(), Imm);
+ }
+
+ bool isConstpoolSymbol() const {
+ int64_t Imm;
+ // Must be of 'immediate' type but not a constant.
+ return isImm() && !evaluateConstantImm(getImm(), Imm);
+ }
+
+ /// Gets location of the first token of this operand.
+ SMLoc getStartLoc() const override { return StartLoc; }
+ /// Gets location of the last token of this operand.
+ SMLoc getEndLoc() const override { return EndLoc; }
+
+ unsigned getReg() const override {
+ assert(Kind == Register && "Invalid type access!");
+ return Reg.RegNum;
+ }
+
+ const MCExpr *getImm() const {
+ assert(Kind == Immediate && "Invalid type access!");
+ return Imm.Val;
+ }
+
+ StringRef getToken() const {
+ assert(Kind == Token && "Invalid type access!");
+ return Tok;
+ }
+
+ void print(raw_ostream &OS) const override {
+ switch (Kind) {
+ case Immediate:
+ OS << *getImm();
+ break;
+ case Register:
+ OS << "<register x" << getReg() << ">";
+ break;
+ case Token:
+ OS << "'" << getToken() << "'";
+ break;
+ }
+ }
+
+ static std::unique_ptr<CSKYOperand> createToken(StringRef Str, SMLoc S) {
+ auto Op = std::make_unique<CSKYOperand>(Token);
+ Op->Tok = Str;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static std::unique_ptr<CSKYOperand> createReg(unsigned RegNo, SMLoc S,
+ SMLoc E) {
+ auto Op = std::make_unique<CSKYOperand>(Register);
+ Op->Reg.RegNum = RegNo;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static std::unique_ptr<CSKYOperand> createImm(const MCExpr *Val, SMLoc S,
+ SMLoc E) {
+ auto Op = std::make_unique<CSKYOperand>(Immediate);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+ assert(Expr && "Expr shouldn't be null!");
+ if (auto *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ }
+
+ // Used by the TableGen Code.
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getReg()));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+};
+} // end anonymous namespace.
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#define GET_MNEMONIC_SPELL_CHECKER
+#include "CSKYGenAsmMatcher.inc"
+
+static std::string CSKYMnemonicSpellCheck(StringRef S, const FeatureBitset &FBS,
+ unsigned VariantID = 0);
+
+bool CSKYAsmParser::generateImmOutOfRangeError(
+ OperandVector &Operands, uint64_t ErrorInfo, int64_t Lower, int64_t Upper,
+ Twine Msg = "immediate must be an integer in the range") {
+ SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, Msg + " [" + Twine(Lower) + ", " + Twine(Upper) + "]");
+}
+
+bool CSKYAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ MCInst Inst;
+ FeatureBitset MissingFeatures;
+
+ auto Result = MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,
+ MatchingInlineAsm);
+ switch (Result) {
+ default:
+ break;
+ case Match_Success:
+ Inst.setLoc(IDLoc);
+ Out.emitInstruction(Inst, getSTI());
+ return false;
+ case Match_MissingFeature: {
+ assert(MissingFeatures.any() && "Unknown missing features!");
+ ListSeparator LS;
+ std::string Msg = "instruction requires the following: ";
+ for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i) {
+ if (MissingFeatures[i]) {
+ Msg += LS;
+ Msg += getSubtargetFeatureName(i);
+ }
+ }
+ return Error(IDLoc, Msg);
+ }
+ case Match_MnemonicFail: {
+ FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+ std::string Suggestion =
+ CSKYMnemonicSpellCheck(((CSKYOperand &)*Operands[0]).getToken(), FBS);
+ return Error(IDLoc, "unrecognized instruction mnemonic" + Suggestion);
+ }
+ case Match_InvalidTiedOperand:
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= Operands.size())
+ return Error(ErrorLoc, "too few operands for instruction");
+
+ ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+ }
+
+ // Handle the case when the error message is of specific type
+ // other than the generic Match_InvalidOperand, and the
+ // corresponding operand is missing.
+ if (Result > FIRST_TARGET_MATCH_RESULT_TY) {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0U && ErrorInfo >= Operands.size())
+ return Error(ErrorLoc, "too few operands for instruction");
+ }
+
+ switch (Result) {
+ default:
+ break;
+ case Match_InvalidOImm12:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 12));
+ case Match_InvalidOImm16:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 16));
+ case Match_InvalidUImm2:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 2) - 1);
+ case Match_InvalidUImm5:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
+ case Match_InvalidUImm12:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 12) - 1);
+ case Match_InvalidUImm12Shift1:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, 0, (1 << 12) - 2,
+ "immediate must be a multiple of 2 bytes in the range");
+ case Match_InvalidUImm12Shift2:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, 0, (1 << 12) - 4,
+ "immediate must be a multiple of 4 bytes in the range");
+ case Match_InvalidUImm16:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 16) - 1);
+ case Match_InvalidCSKYSymbol: {
+ SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "operand must be a symbol name");
+ }
+ case Match_InvalidConstpool: {
+ SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "operand must be a constpool symbol name");
+ }
+ }
+
+ llvm_unreachable("Unknown match type detected!");
+}
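The "multiple of 2/4 bytes" diagnostics above mirror the isShiftedUInt predicates in CSKYOperand. A quick hedged illustration using the same MathExtras helpers (the function name is illustrative):

#include "llvm/Support/MathExtras.h"
#include <cassert>

// isShiftedUInt<12, 1>(x): x must be a 12-bit value shifted left by 1, i.e.
// an even number; odd immediates are rejected, which is what the
// "multiple of 2 bytes" diagnostics above report.
static void shiftedImmExamples() {
  assert(llvm::isShiftedUInt<12, 1>(4094)); // even, representable
  assert(!llvm::isShiftedUInt<12, 1>(3));   // odd: not a multiple of 2
  assert(llvm::isUInt<12>(4095) && !llvm::isUInt<12>(4096)); // plain uimm12
}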
+
+// Attempts to match Name as a register (either using the default name or
+// alternative ABI names), setting RegNo to the matching register. Upon
+// failure, returns true and sets RegNo to 0.
+static bool matchRegisterNameHelper(MCRegister &RegNo, StringRef Name) {
+ RegNo = MatchRegisterName(Name);
+
+ if (RegNo == CSKY::NoRegister)
+ RegNo = MatchRegisterAltName(Name);
+
+ return RegNo == CSKY::NoRegister;
+}
+
+bool CSKYAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ const AsmToken &Tok = getParser().getTok();
+ StartLoc = Tok.getLoc();
+ EndLoc = Tok.getEndLoc();
+ StringRef Name = getLexer().getTok().getIdentifier();
+
+ if (!matchRegisterNameHelper((MCRegister &)RegNo, Name)) {
+ getParser().Lex(); // Eat identifier token.
+ return false;
+ }
+
+ return Error(StartLoc, "invalid register name");
+}
+
+OperandMatchResultTy CSKYAsmParser::parseRegister(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+
+ switch (getLexer().getKind()) {
+ default:
+ return MatchOperand_NoMatch;
+ case AsmToken::Identifier: {
+ StringRef Name = getLexer().getTok().getIdentifier();
+ MCRegister RegNo;
+
+ if (matchRegisterNameHelper((MCRegister &)RegNo, Name))
+ return MatchOperand_NoMatch;
+
+ getLexer().Lex();
+ Operands.push_back(CSKYOperand::createReg(RegNo, S, E));
+
+ return MatchOperand_Success;
+ }
+ }
+}
+
+OperandMatchResultTy CSKYAsmParser::parseBaseRegImm(OperandVector &Operands) {
+ assert(getLexer().is(AsmToken::LParen));
+
+ Operands.push_back(CSKYOperand::createToken("(", getLoc()));
+
+ auto Tok = getParser().Lex(); // Eat '('
+
+ if (parseRegister(Operands) != MatchOperand_Success) {
+ getLexer().UnLex(Tok);
+ Operands.pop_back();
+ return MatchOperand_ParseFail;
+ }
+
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(getLoc(), "expected ','");
+ return MatchOperand_ParseFail;
+ }
+
+ getParser().Lex(); // Eat ','
+
+ if (parseRegister(Operands) == MatchOperand_Success) {
+ if (getLexer().isNot(AsmToken::LessLess)) {
+ Error(getLoc(), "expected '<<'");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(CSKYOperand::createToken("<<", getLoc()));
+
+ getParser().Lex(); // Eat '<<'
+
+ if (parseImmediate(Operands) != MatchOperand_Success) {
+ Error(getLoc(), "expected imm");
+ return MatchOperand_ParseFail;
+ }
+
+ } else if (parseImmediate(Operands) != MatchOperand_Success) {
+ Error(getLoc(), "expected imm");
+ return MatchOperand_ParseFail;
+ }
+
+ if (getLexer().isNot(AsmToken::RParen)) {
+ Error(getLoc(), "expected ')'");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(CSKYOperand::createToken(")", getLoc()));
+
+ getParser().Lex(); // Eat ')'
+
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy CSKYAsmParser::parseImmediate(OperandVector &Operands) {
+ switch (getLexer().getKind()) {
+ default:
+ return MatchOperand_NoMatch;
+ case AsmToken::LParen:
+ case AsmToken::Minus:
+ case AsmToken::Plus:
+ case AsmToken::Integer:
+ case AsmToken::String:
+ break;
+ }
+
+ const MCExpr *IdVal;
+ SMLoc S = getLoc();
+ if (getParser().parseExpression(IdVal))
+ return MatchOperand_ParseFail;
+
+ SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+ Operands.push_back(CSKYOperand::createImm(IdVal, S, E));
+ return MatchOperand_Success;
+}
+
+/// Looks at a token type and creates the relevant operand from this
+/// information, adding it to Operands. Returns false if the operand was
+/// parsed successfully, true otherwise.
+bool CSKYAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+ // Check if the current operand has a custom associated parser, if so, try to
+ // custom parse the operand, or fallback to the general approach.
+ OperandMatchResultTy Result =
+ MatchOperandParserImpl(Operands, Mnemonic, /*ParseForAllFeatures=*/true);
+ if (Result == MatchOperand_Success)
+ return false;
+ if (Result == MatchOperand_ParseFail)
+ return true;
+
+ // Attempt to parse token as register
+ if (parseRegister(Operands) == MatchOperand_Success)
+ return false;
+
+ // Attempt to parse token as (register, imm)
+ if (getLexer().is(AsmToken::LParen))
+ if (parseBaseRegImm(Operands) == MatchOperand_Success)
+ return false;
+
+ // Attempt to parse token as an immediate.
+ if (parseImmediate(Operands) == MatchOperand_Success)
+ return false;
+
+ // Finally we have exhausted all options and must declare defeat.
+ Error(getLoc(), "unknown operand");
+ return true;
+}
+
+OperandMatchResultTy CSKYAsmParser::parseCSKYSymbol(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+
+ if (getLexer().getKind() != AsmToken::Identifier)
+ return MatchOperand_NoMatch;
+
+ StringRef Identifier;
+ if (getParser().parseIdentifier(Identifier))
+ return MatchOperand_ParseFail;
+
+ CSKYMCExpr::VariantKind Kind = CSKYMCExpr::VK_CSKY_None;
+
+ if (Identifier.consume_back("@GOT"))
+ Kind = CSKYMCExpr::VK_CSKY_GOT;
+ else if (Identifier.consume_back("@GOTOFF"))
+ Kind = CSKYMCExpr::VK_CSKY_GOTOFF;
+ else if (Identifier.consume_back("@PLT"))
+ Kind = CSKYMCExpr::VK_CSKY_PLT;
+ else if (Identifier.consume_back("@GOTPC"))
+ Kind = CSKYMCExpr::VK_CSKY_GOTPC;
+
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+ const MCExpr *Res =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+
+ if (Kind != CSKYMCExpr::VK_CSKY_None)
+ Res = CSKYMCExpr::create(Res, Kind, getContext());
+
+ Operands.push_back(CSKYOperand::createImm(Res, S, E));
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+CSKYAsmParser::parseConstpoolSymbol(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+
+ if (getLexer().getKind() != AsmToken::LBrac)
+ return MatchOperand_NoMatch;
+
+ getLexer().Lex(); // Eat '['.
+
+ if (getLexer().getKind() != AsmToken::Identifier)
+ return MatchOperand_NoMatch;
+
+ StringRef Identifier;
+ if (getParser().parseIdentifier(Identifier))
+ return MatchOperand_ParseFail;
+
+ if (getLexer().getKind() != AsmToken::RBrac)
+ return MatchOperand_NoMatch;
+
+ getLexer().Lex(); // Eat ']'.
+
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+ const MCExpr *Res =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ Operands.push_back(CSKYOperand::createImm(Res, S, E));
+ return MatchOperand_Success;
+}
+
+bool CSKYAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
+ // First operand is token for instruction.
+ Operands.push_back(CSKYOperand::createToken(Name, NameLoc));
+
+ // If there are no more operands, then finish.
+ if (getLexer().is(AsmToken::EndOfStatement))
+ return false;
+
+ // Parse first operand.
+ if (parseOperand(Operands, Name))
+ return true;
+
+ // Parse until end of statement, consuming commas between operands.
+ while (getLexer().is(AsmToken::Comma)) {
+ // Consume comma token.
+ getLexer().Lex();
+
+ // Parse next operand.
+ if (parseOperand(Operands, Name))
+ return true;
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = getLexer().getLoc();
+ getParser().eatToEndOfStatement();
+ return Error(Loc, "unexpected token");
+ }
+
+ getParser().Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
+OperandMatchResultTy CSKYAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ const AsmToken &Tok = getParser().getTok();
+ StartLoc = Tok.getLoc();
+ EndLoc = Tok.getEndLoc();
+
+ StringRef Name = getLexer().getTok().getIdentifier();
+
+ if (matchRegisterNameHelper((MCRegister &)RegNo, Name))
+ return MatchOperand_NoMatch;
+
+ getParser().Lex(); // Eat identifier token.
+ return MatchOperand_Success;
+}
+
+bool CSKYAsmParser::ParseDirective(AsmToken DirectiveID) { return true; }
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYAsmParser() {
+ RegisterMCAsmParser<CSKYAsmParser> X(getTheCSKYTarget());
+}
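The parser pieces above compose in a fixed order: custom operand parsers first, then a plain register, then a "(reg, ...)" memory form, then a bare immediate. The register step itself is a two-stage lookup, architectural name first and ABI alias second. A minimal standalone sketch of that lookup, outside the patch, with a made-up alias table ("sp" for r14 comes from the printer's option text later in this change; "lr" for r15 is an assumption):

  #include <cassert>
  #include <map>
  #include <string>

  // Toy stand-in for the generated MatchRegisterName / MatchRegisterAltName
  // pair used by matchRegisterNameHelper; 0 plays the role of CSKY::NoRegister.
  static unsigned lookupCSKYReg(const std::string &Name) {
    static const std::map<std::string, unsigned> ArchNames = {{"r14", 14},
                                                              {"r15", 15}};
    static const std::map<std::string, unsigned> AbiAliases = {{"sp", 14},
                                                               {"lr", 15}};
    auto It = ArchNames.find(Name);   // MatchRegisterName analogue
    if (It != ArchNames.end())
      return It->second;
    auto Alt = AbiAliases.find(Name); // MatchRegisterAltName analogue
    return Alt != AbiAliases.end() ? Alt->second : 0;
  }

  int main() {
    assert(lookupCSKYReg("r14") == 14); // architectural name matched first
    assert(lookupCSKYReg("sp") == 14);  // ABI alias resolved on the second try
    assert(lookupCSKYReg("xyz") == 0);  // parser reports "invalid register name"
    return 0;
  }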
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/CSKY/CMakeLists.txt
index ec487ed..c49c2a0 100644
--- a/src/llvm-project/llvm/lib/Target/CSKY/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/CSKY/CMakeLists.txt
@@ -2,9 +2,12 @@
set(LLVM_TARGET_DEFINITIONS CSKY.td)
-tablegen(LLVM CSKYGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM CSKYGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM CSKYGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM CSKYGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM CSKYGenMCCodeEmitter.inc -gen-emitter)
+tablegen(LLVM CSKYGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM CSKYGenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(CSKYCommonTableGen)
@@ -22,5 +25,6 @@
CSKY
)
-add_subdirectory(TargetInfo)
+add_subdirectory(AsmParser)
add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/CSKY.td b/src/llvm-project/llvm/lib/Target/CSKY/CSKY.td
index da6151b..854a8b5 100644
--- a/src/llvm-project/llvm/lib/Target/CSKY/CSKY.td
+++ b/src/llvm-project/llvm/lib/Target/CSKY/CSKY.td
@@ -19,7 +19,7 @@
// CSKY processors supported.
//===----------------------------------------------------------------------===//
-def : ProcessorModel<"generic-csky", NoSchedModel, []>;
+def : ProcessorModel<"generic", NoSchedModel, []>;
//===----------------------------------------------------------------------===//
// Define the CSKY target.
@@ -27,6 +27,19 @@
def CSKYInstrInfo : InstrInfo;
+
+def CSKYAsmParser : AsmParser {
+ let ShouldEmitMatchRegisterAltName = 1;
+ let AllowDuplicateRegisterNames = 1;
+}
+
+def CSKYAsmWriter : AsmWriter {
+ int PassSubtarget = 1;
+}
+
def CSKY : Target {
let InstructionSet = CSKYInstrInfo;
+ let AssemblyParsers = [CSKYAsmParser];
+ let AssemblyWriters = [CSKYAsmWriter];
+ let AllowRegisterRenaming = 1;
}
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/CSKYInstrFormats.td b/src/llvm-project/llvm/lib/Target/CSKY/CSKYInstrFormats.td
index 86f9dd0..dd71b69 100644
--- a/src/llvm-project/llvm/lib/Target/CSKY/CSKYInstrFormats.td
+++ b/src/llvm-project/llvm/lib/Target/CSKY/CSKYInstrFormats.td
@@ -54,13 +54,14 @@
pattern> {
bits<26> offset;
let Inst{25 - 0} = offset;
+ let isCall = 1;
+ let Defs = [ R15 ];
}
// Format< OP[6] | RZ[5] | SOP[3] | OFFSET[18] >
// Instructions(7): grs, lrs32.b, lrs32.h, lrs32.w, srs32.b, srs32.h, srs32.w
-class I_18_Z_L<bits<3> sop, string op, Operand operand, list<dag> pattern>
- : CSKY32Inst<AddrModeNone, 0x33, (outs GPR:$rz), (ins operand:$offset),
- !strconcat(op, "\t$rz, $offset"), pattern> {
+class I_18_Z_L<bits<3> sop, string asm, dag outs, dag ins, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x33, outs, ins, asm, pattern> {
bits<5> rz;
bits<18> offset;
let Inst{25 - 21} = rz;
@@ -100,10 +101,9 @@
// Format< OP[6] | SOP[5] | RZ[5] | OFFSET[16] >
// Instructions(1): lrw32
-class I_16_Z_L<bits<5> sop, string op, Operand operand, list<dag> pattern>
- : CSKY32Inst<AddrModeNone, 0x3a,
- (outs GPR:$rz), (ins operand:$imm16),
- !strconcat(op, "\t$rz, [$imm16]"), pattern> {
+class I_16_Z_L<bits<5> sop, string op, dag ins, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x3a, (outs GPR:$rz), ins,
+ !strconcat(op, "\t$rz, $imm16"), pattern> {
bits<5> rz;
bits<16> imm16;
let Inst{25 - 21} = sop;
@@ -113,22 +113,14 @@
// Format< OP[6] | SOP[5] | 00000[5] | OFFSET[16] >
// Instructions(5): bt32, bf32, br32, jmpi32, jsri32
-class I_16_L<bits<5> sop, dag outs, dag ins, string op, list<dag> pattern>
- : CSKY32Inst<AddrModeNone, 0x3a, outs, ins, !strconcat(op, "\t$imm16"),
- pattern> {
+class I_16_L<bits<5> sop, dag outs, dag ins, string asm, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x3a, outs, ins, asm, pattern> {
bits<16> imm16;
let Inst{25 - 21} = sop;
let Inst{20 - 16} = 0;
let Inst{15 - 0} = imm16;
}
-// bt32, bf32, br32, jmpi32
-class I_16_L_B<bits<5> sop, string op, Operand operand, list<dag> pattern>
- : I_16_L<sop, (outs), (ins operand:$imm16, CARRY:$ca), op, pattern> {
- let isBranch = 1;
- let isTerminator = 1;
-}
-
// Format< OP[6] | SOP[5] | RX[5] | 0000000000000000[16] >
// Instructions(2): jmp32, jsr32
class I_16_JX<bits<5> sop, string op, list<dag> pattern>
@@ -167,11 +159,24 @@
let isBarrier = 1;
}
+// Instructions(1): rte32
+class I_16_RET_I<bits<5> sop, bits<5> pcode, string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins), op, pattern> {
+ let Inst{25 - 21} = sop;
+ let Inst{20 - 16} = pcode;
+ let Inst{15 - 10} = 0x10;
+ let Inst{9 - 5} = 1;
+ let Inst{4 - 0} = 0;
+ let isTerminator = 1;
+ let isReturn = 1;
+ let isBarrier = 1;
+}
+
// Format< OP[6] | SOP[5] | RX[5] | IMM16[16] >
// Instructions(3): cmpnei32, cmphsi32, cmplti32
-class I_16_X<bits<5> sop, string op>
+class I_16_X<bits<5> sop, string op, Operand operand>
: CSKY32Inst<AddrModeNone, 0x3a, (outs CARRY:$ca),
- (ins GPR:$rx, i32imm:$imm16), !strconcat(op, "\t$rx, $imm16"), []> {
+ (ins GPR:$rx, operand:$imm16), !strconcat(op, "\t$rx, $imm16"), []> {
bits<16> imm16;
bits<5> rx;
let Inst{25 - 21} = sop;
@@ -211,7 +216,7 @@
class I_LDST<AddrMode am, bits<6> opcode, bits<4> sop, dag outs, dag ins,
string op, list<dag> pattern>
- : CSKY32Inst<am, opcode, outs, ins, !strconcat(op, "\t$rz, ($rx, $imm12)"),
+ : CSKY32Inst<am, opcode, outs, ins, !strconcat(op, "\t$rz, ($rx, ${imm12})"),
pattern> {
bits<5> rx;
bits<5> rz;
@@ -298,13 +303,13 @@
// Format< OP[6] | LSB[5] | RX[5] | SOP[6] | MSB[5] | RZ[5]>
// Instructions(6): zext32, zextb32, zexth32, sext32, sextb32, sexth32
-class I_5_XZ_U<bits<6> sop, bits<5> lsb, bits<5> msb, dag outs, dag ins,
- string op, list<dag> pattern>
- : CSKY32Inst<AddrModeNone, 0x31, outs, ins,
- op #"\t$rz, $rx, " #!cast<int>(msb) #", " #!cast<int>(lsb),
+class I_5_XZ_U<bits<6> sop, dag outs, dag ins, string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, outs, ins, op #"\t$rz, $rx, $msb, $lsb",
pattern> {
bits<5> rx;
bits<5> rz;
+ bits<5> msb;
+ bits<5> lsb;
let Inst{25 - 21} = lsb; // lsb
let Inst{20 - 16} = rx;
let Inst{15 - 10} = sop;
@@ -313,12 +318,12 @@
}
// sextb, sexth
-class I_5_XZ_US<bits<6> sop, bits<5> lsb, bits<5> msb, string op, SDNode opnode,
- ValueType type> : I_5_XZ_U<sop, lsb, msb,
- (outs GPR:$rz), (ins GPR:$rx),op, [(set GPR:$rz, (opnode GPR:$rx, type))]>;
+class I_5_XZ_US<bits<6> sop, string op, SDNode opnode,
+ ValueType type> : I_5_XZ_U<sop, (outs GPR:$rz), (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), op,
+ [(set GPR:$rz, (opnode GPR:$rx, type))]>;
-class I_5_XZ_UZ<bits<6> sop, bits<5> lsb, bits<5> msb, string op, int v>
- : I_5_XZ_U<sop, lsb, msb, (outs GPR:$rz), (ins GPR:$rx), op,
+class I_5_XZ_UZ<bits<6> sop, string op, int v>
+ : I_5_XZ_U<sop, (outs GPR:$rz), (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), op,
[(set GPR:$rz, (and GPR:$rx, (i32 v)))]>;
// Format< OP[6] | RZ[5] | RX[5] | SOP[6] | SIZE[5] | LSB[5]>
@@ -401,27 +406,26 @@
// Format< OP[6] | RY[5] | RX[5] | SOP[6] | PCODE[5] | RZ[5] >
// Instructions:(8) ldr32.b, ldr32.h, ldr32.bs, ldr32.hs, ldr32.w,
// str32.b, str32.h, str32.w
-class R_YXZ_LDST<bits<6> opcode, bits<6> sop, bits<5> pcode, int no, dag outs,
+class R_YXZ_LDST<bits<6> opcode, bits<6> sop, dag outs,
dag ins, string op, list<dag> pattern>
: CSKY32Inst<AddrModeNone, opcode, outs, ins,
- op #"\t$rz, ($rx, $ry << " #no #")", pattern> {
+ op # "\t$rz, ($rx, $ry << ${imm})", pattern> {
bits<5> rx;
bits<5> ry;
bits<5> rz;
+ bits<5> imm;
let Inst{25 - 21} = ry; // ry;
let Inst{20 - 16} = rx; // rx;
let Inst{15 - 10} = sop;
- let Inst{9 - 5} = pcode; // pcode;
+ let Inst{9 - 5} = imm; // pcode;
let Inst{4 - 0} = rz;
}
-class I_LDR<bits<6> sop, bits<5> pcode, string op, int no>
- : R_YXZ_LDST<0x34, sop, pcode, no,
- (outs GPR:$rz), (ins GPR:$rx, GPR:$ry), op, []>;
+class I_LDR<bits<6> sop, string op> : R_YXZ_LDST<0x34, sop,
+ (outs GPR:$rz), (ins GPR:$rx, GPR:$ry, uimm_shift:$imm), op, []>;
-class I_STR<bits<6> sop, bits<5> pcode, string op, int no>
- : R_YXZ_LDST<0x35, sop, pcode, no, (outs),
- (ins GPR:$rz, GPR:$rx, GPR:$ry), op, []>;
+class I_STR<bits<6> sop, string op> : R_YXZ_LDST<0x35, sop,
+ (outs), (ins GPR:$rz, GPR:$rx, GPR:$ry, uimm_shift:$imm), op, []>;
// Format< OP[6] | RX[5] | RX[5] | SOP[6] | PCODE[5] | RZ[5] >
// Instructions:(1) not32
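The reworked R_YXZ_LDST format turns the old hard-coded pcode into a real $imm operand occupying bits 9-5. A rough standalone sketch of the resulting field packing, assuming the 6-bit opcode of CSKY32Inst lands in bits 31-26 (the base class is outside this hunk) and using the one-hot shift encoding that getImmShiftOpValue produces later in this change:

  #include <cstdint>
  #include <cstdio>

  // Field packing per the Inst{...} assignments of R_YXZ_LDST; the opcode
  // position is an assumption about the CSKY32Inst base class.
  static uint32_t packRYXZ(uint32_t Opcode6, uint32_t Ry, uint32_t Rx,
                           uint32_t Sop6, uint32_t Pcode5, uint32_t Rz) {
    return (Opcode6 << 26) | (Ry << 21) | (Rx << 16) | (Sop6 << 10) |
           (Pcode5 << 5) | Rz;
  }

  int main() {
    // "ldr32.w r3, (r2, r1 << 2)": sop 0x2, shift 2 encoded one-hot as pcode 4.
    printf("%#010x\n", packRYXZ(0x34, /*ry*/ 1, /*rx*/ 2, 0x2, 1u << 2, /*rz*/ 3));
    return 0;
  }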
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/src/llvm-project/llvm/lib/Target/CSKY/CSKYInstrInfo.td
index 7add217..20adda4 100644
--- a/src/llvm-project/llvm/lib/Target/CSKY/CSKYInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/CSKY/CSKYInstrInfo.td
@@ -10,31 +10,55 @@
//
//===----------------------------------------------------------------------===//
-include "CSKYInstrFormats.td"
//===----------------------------------------------------------------------===//
// CSKY specific DAG Nodes.
//===----------------------------------------------------------------------===//
-// TODO: Add CSKY specific DAG Nodes.
+// Target-dependent nodes.
+def CSKY_RET : SDNode<"CSKYISD::RET", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
+class ImmAsmOperand<string prefix, int width, string suffix> : AsmOperandClass {
+ let Name = prefix # "Imm" # width # suffix;
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = !strconcat("Invalid", Name);
+}
+
+class SImmAsmOperand<int width, string suffix = "">
+ : ImmAsmOperand<"S", width, suffix> {
+}
+
+class UImmAsmOperand<int width, string suffix = "">
+ : ImmAsmOperand<"U", width, suffix> {
+}
+
+class OImmAsmOperand<int width, string suffix = "">
+ : ImmAsmOperand<"O", width, suffix> {
+}
class oimm<int num> : Operand<i32>,
ImmLeaf<i32, "return isUInt<"#num#">(Imm - 1);"> {
let EncoderMethod = "getOImmOpValue";
+ let ParserMatchClass = OImmAsmOperand<num>;
}
class uimm<int num, int shift = 0> : Operand<i32>,
ImmLeaf<i32, "return isShiftedUInt<"#num#", "#shift#">(Imm);"> {
let EncoderMethod = "getImmOpValue<"#shift#">";
+ let ParserMatchClass =
+ !if(!ne(shift, 0),
+ UImmAsmOperand<num, "Shift"#shift>,
+ UImmAsmOperand<num>);
}
class simm<int num, int shift = 0> : Operand<i32>,
ImmLeaf<i32, "return isShiftedInt<"#num#", "#shift#">(Imm);"> {
let EncoderMethod = "getImmOpValue<"#shift#">";
+ let ParserMatchClass = SImmAsmOperand<num>;
}
def nimm_XFORM : SDNodeXForm<imm, [{
@@ -42,15 +66,76 @@
}]>;
class nimm<int num> : Operand<i32>,
ImmLeaf<i32, "return isUInt<"#num#">(~Imm);", nimm_XFORM> {
+ let ParserMatchClass = UImmAsmOperand<num>;
}
+def uimm32_hi16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() >> 16) & 0xFFFF,
+ SDLoc(N), MVT::i32);
+}]>;
+def uimm16_16_xform : Operand<i32>,
+ ImmLeaf<i32, "return isShiftedUInt<16, 16>(Imm);", uimm32_hi16> {
+ let ParserMatchClass = UImmAsmOperand<16>;
+}
+
+def uimm_shift : Operand<i32>, ImmLeaf<i32, "return isUInt<2>(Imm);"> {
+ let EncoderMethod = "getImmShiftOpValue";
+ let ParserMatchClass = UImmAsmOperand<2>;
+}
+
+def CSKYSymbol : AsmOperandClass {
+ let Name = "CSKYSymbol";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidCSKYSymbol";
+ let ParserMethod = "parseCSKYSymbol";
+}
+
+def br_symbol : Operand<iPTR> {
+ let EncoderMethod =
+ "getBranchSymbolOpValue<CSKY::fixup_csky_pcrel_imm16_scale2>";
+ let ParserMatchClass = CSKYSymbol;
+}
+
+def call_symbol : Operand<iPTR> {
+ let ParserMatchClass = CSKYSymbol;
+ let EncoderMethod = "getCallSymbolOpValue";
+}
+
+def Constpool : AsmOperandClass {
+ let Name = "ConstpoolSymbol";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidConstpool";
+ let ParserMethod = "parseConstpoolSymbol";
+}
+
+def constpool_symbol : Operand<iPTR> {
+ let ParserMatchClass = Constpool;
+ let EncoderMethod =
+ "getConstpoolSymbolOpValue<CSKY::fixup_csky_pcrel_uimm16_scale4>";
+}
+
+def bare_symbol : Operand<iPTR> {
+ let ParserMatchClass = CSKYSymbol;
+ let EncoderMethod = "getBareSymbolOpValue";
+}
def oimm12 : oimm<12>;
+def oimm16 : oimm<16>;
def nimm12 : nimm<12>;
def uimm5 : uimm<5>;
def uimm12 : uimm<12>;
+def uimm12_1 : uimm<12, 1>;
+def uimm12_2 : uimm<12, 2>;
+def uimm16 : uimm<16>;
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Formats
+//===----------------------------------------------------------------------===//
+
+include "CSKYInstrFormats.td"
//===----------------------------------------------------------------------===//
// Instruction definitions.
@@ -60,49 +145,229 @@
class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
class UnOpFrag<dag res> : PatFrag<(ops node:$Src), res>;
-def ADDI32 : I_12<0x0, "addi32", add, oimm12>;
-def SUBI32 : I_12<0x1, "subi32", sub, oimm12>;
-def ANDI32 : I_12<0x2, "andi32", and, uimm12>;
-def ANDNI32 : I_12<0x3, "andni32", and, nimm12>;
-def XORI32 : I_12<0x4, "xori32", xor, uimm12>;
-def LSLI32 : I_5_XZ<0x12, 0x1, "lsli32",
- (outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5),
- [(set GPR:$rz, (shl GPR:$rx, uimm5:$imm5))]>;
-def LSRI32 : I_5_XZ<0x12, 0x2, "lsri32",
- (outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5),
- [(set GPR:$rz, (srl GPR:$rx, uimm5:$imm5))]>;
-def ASRI32 : I_5_XZ<0x12, 0x4, "asri32",
- (outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5),
- [(set GPR:$rz, (sra GPR:$rx, uimm5:$imm5))]>;
+//===----------------------------------------------------------------------===//
+// Basic ALU instructions.
+//===----------------------------------------------------------------------===//
-def ADDU32 : R_YXZ_SP_F1<0x0, 0x1,
- BinOpFrag<(add node:$LHS, node:$RHS)>, "addu32", 1>;
-def SUBU32 : R_YXZ_SP_F1<0x0, 0x4,
- BinOpFrag<(sub node:$LHS, node:$RHS)>, "subu32">;
-def AND32 : R_YXZ_SP_F1<0x8, 0x1,
- BinOpFrag<(and node:$LHS, node:$RHS)>, "and32", 1>;
-def ANDN32 : R_YXZ_SP_F1<0x8, 0x2,
- BinOpFrag<(and node:$LHS, (not node:$RHS))>, "andn32">;
-def OR32: R_YXZ_SP_F1<0x9, 0x1,
- BinOpFrag<(or node:$LHS, node:$RHS)>, "or32", 1>;
-def XOR32 : R_YXZ_SP_F1<0x9, 0x2,
- BinOpFrag<(xor node:$LHS, node:$RHS)>, "xor32", 1>;
-def NOR32 : R_YXZ_SP_F1<0x9, 0x4,
- BinOpFrag<(not (or node:$LHS, node:$RHS))>, "nor32", 1>;
-def LSL32 : R_YXZ_SP_F1<0x10, 0x1,
- BinOpFrag<(shl node:$LHS, node:$RHS)>, "lsl32">;
-def LSR32 : R_YXZ_SP_F1<0x10, 0x2,
- BinOpFrag<(srl node:$LHS, node:$RHS)>, "lsr32">;
-def ASR32 : R_YXZ_SP_F1<0x10, 0x4,
- BinOpFrag<(sra node:$LHS, node:$RHS)>, "asr32">;
-def MULT32 : R_YXZ_SP_F1<0x21, 0x1,
- BinOpFrag<(mul node:$LHS, node:$RHS)>, "mult32", 1>;
-def DIVS32 : R_YXZ_SP_F1<0x20, 0x2,
- BinOpFrag<(sdiv node:$LHS, node:$RHS)>, "divs32">;
-def DIVU32 : R_YXZ_SP_F1<0x20, 0x1,
- BinOpFrag<(udiv node:$LHS, node:$RHS)>, "divu32">;
+ def ADDI32 : I_12<0x0, "addi32", add, oimm12>;
+ def SUBI32 : I_12<0x1, "subi32", sub, oimm12>;
+ def ORI32 : I_16_ZX<"ori32", uimm16,
+ [(set GPR:$rz, (or GPR:$rx, uimm16:$imm16))]>;
+ def XORI32 : I_12<0x4, "xori32", xor, uimm12>;
+ def ANDI32 : I_12<0x2, "andi32", and, uimm12>;
+ def ANDNI32 : I_12<0x3, "andni32", and, nimm12>;
+ def LSLI32 : I_5_XZ<0x12, 0x1, "lsli32",
+ (outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5),
+ [(set GPR:$rz, (shl GPR:$rx, uimm5:$imm5))]>;
+ def LSRI32 : I_5_XZ<0x12, 0x2, "lsri32",
+ (outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5),
+ [(set GPR:$rz, (srl GPR:$rx, uimm5:$imm5))]>;
+ def ASRI32 : I_5_XZ<0x12, 0x4, "asri32",
+ (outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5),
+ [(set GPR:$rz, (sra GPR:$rx, uimm5:$imm5))]>;
+ def ROTLI32 : I_5_XZ<0x12, 0x8, "rotli32",
+ (outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5),
+ [(set GPR:$rz, (rotl GPR:$rx, uimm5:$imm5))]>;
-def NOT32 : R_XXZ<0b001001, 0b00100, (outs GPR:$rz), (ins GPR:$rx),
- "not", [(set GPR:$rz, (not GPR:$rx))]>;
+
+ def ADDU32 : R_YXZ_SP_F1<0x0, 0x1,
+ BinOpFrag<(add node:$LHS, node:$RHS)>, "addu32", 1>;
+ def SUBU32 : R_YXZ_SP_F1<0x0, 0x4,
+ BinOpFrag<(sub node:$LHS, node:$RHS)>, "subu32">;
+ def MULT32 : R_YXZ_SP_F1<0x21, 0x1,
+ BinOpFrag<(mul node:$LHS, node:$RHS)>, "mult32", 1>;
+ def AND32 : R_YXZ_SP_F1<0x8, 0x1,
+ BinOpFrag<(and node:$LHS, node:$RHS)>, "and32", 1>;
+ def ANDN32 : R_YXZ_SP_F1<0x8, 0x2,
+ BinOpFrag<(and node:$LHS, (not node:$RHS))>, "andn32">;
+ def OR32: R_YXZ_SP_F1<0x9, 0x1,
+ BinOpFrag<(or node:$LHS, node:$RHS)>, "or32", 1>;
+ def XOR32 : R_YXZ_SP_F1<0x9, 0x2,
+ BinOpFrag<(xor node:$LHS, node:$RHS)>, "xor32", 1>;
+ def NOR32 : R_YXZ_SP_F1<0x9, 0x4,
+ BinOpFrag<(not (or node:$LHS, node:$RHS))>, "nor32", 1>;
+ def NOT32 : R_XXZ<0b001001, 0b00100, (outs GPR:$rz), (ins GPR:$rx),
+ "not32", [(set GPR:$rz, (not GPR:$rx))]>;
+ def LSL32 : R_YXZ_SP_F1<0x10, 0x1,
+ BinOpFrag<(shl node:$LHS, node:$RHS)>, "lsl32">;
+ def LSR32 : R_YXZ_SP_F1<0x10, 0x2,
+ BinOpFrag<(srl node:$LHS, node:$RHS)>, "lsr32">;
+ def ASR32 : R_YXZ_SP_F1<0x10, 0x4,
+ BinOpFrag<(sra node:$LHS, node:$RHS)>, "asr32">;
+ def ROTL32 : R_YXZ_SP_F1<0x10, 0x8,
+ BinOpFrag<(rotl node:$LHS, (and node:$RHS, 0x1f))>, "rotl32">;
+
+ // TODO: Shift series instr. with carry.
+
+ def IXH32 : R_YXZ_SP_F1<0x2, 0x1,
+ BinOpFrag<(add node:$LHS, (shl node:$RHS, (i32 1)))>, "ixh32">;
+ def IXW32 : R_YXZ_SP_F1<0x2, 0x2,
+ BinOpFrag<(add node:$LHS, (shl node:$RHS, (i32 2)))>, "ixw32">;
+
+ def IXD32 : R_YXZ_SP_F1<0x2, 0x4,
+ BinOpFrag<(add node:$LHS, (shl node:$RHS, (i32 3)))>, "ixd32">;
+
+ let isCommutable = 1 in
+ def ADDC32 : R_YXZ<0x31, 0x0, 0x2, (outs GPR:$rz, CARRY:$cout),
+ (ins GPR:$rx, GPR:$ry, CARRY:$cin), "addc32", []>;
+ def SUBC32 : R_YXZ<0x31, 0x0, 0x8, (outs GPR:$rz, CARRY:$cout),
+ (ins GPR:$rx, GPR:$ry, CARRY:$cin), "subc32", []>;
+
+ // TODO: incf32.
+ def DIVS32 : R_YXZ_SP_F1<0x20, 0x2,
+ BinOpFrag<(sdiv node:$LHS, node:$RHS)>, "divs32">;
+ def DIVU32 : R_YXZ_SP_F1<0x20, 0x1,
+ BinOpFrag<(udiv node:$LHS, node:$RHS)>, "divu32">;
+
+ def DECGT32 : I_5_XZ<0x4, 0x1, "decgt32",
+ (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>;
+ def DECLT32 : I_5_XZ<0x4, 0x2, "declt32",
+ (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>;
+ def DECNE32 : I_5_XZ<0x4, 0x4, "decne32",
+ (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>;
+
+ // TODO: s/zext.
+ def ZEXT32 : I_5_XZ_U<0x15, (outs GPR:$rz),
+ (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), "zext32",[]>;
+ def SEXT32 : I_5_XZ_U<0x16, (outs GPR:$rz),
+ (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), "sext32", []>;
+
+//===----------------------------------------------------------------------===//
+// Load & Store instructions.
+//===----------------------------------------------------------------------===//
+
+def LD32B : I_LD<AddrMode32B, 0x0, "ld32.b", uimm12>;
+def LD32H : I_LD<AddrMode32H, 0x1, "ld32.h", uimm12_1>;
+def LD32W : I_LD<AddrMode32WD, 0x2, "ld32.w", uimm12_2>;
+
+
+ def LD32BS : I_LD<AddrMode32B, 0x4, "ld32.bs", uimm12>;
+ def LD32HS : I_LD<AddrMode32H, 0x5, "ld32.hs", uimm12_1>;
+
+ // TODO: LDM and STM.
+
+
+def ST32B : I_ST<AddrMode32B, 0x0, "st32.b", uimm12>;
+def ST32H : I_ST<AddrMode32H, 0x1, "st32.h", uimm12_1>;
+def ST32W : I_ST<AddrMode32WD, 0x2, "st32.w", uimm12_2>;
+
+
+ def LDR32B : I_LDR<0x0, "ldr32.b">;
+ def LDR32BS : I_LDR<0x4, "ldr32.bs">;
+ def LDR32H : I_LDR<0x1, "ldr32.h">;
+ def LDR32HS : I_LDR<0x5, "ldr32.hs">;
+ def LDR32W : I_LDR<0x2, "ldr32.w">;
+ def STR32B : I_STR<0x0, "str32.b">;
+ def STR32H : I_STR<0x1, "str32.h">;
+ def STR32W : I_STR<0x2, "str32.w">;
+
+ // TODO: SPILL_CARRY and RESTORE_CARRY.
+
+//===----------------------------------------------------------------------===//
+// Compare instructions.
+//===----------------------------------------------------------------------===//
+
+ def CMPNEI32 : I_16_X<0x1A, "cmpnei32", uimm16>;
+ def CMPHSI32 : I_16_X<0x18, "cmphsi32", oimm16>;
+ def CMPLTI32 : I_16_X<0x19, "cmplti32", oimm16>;
+
+
+ def CMPNE32 : R_YX<0x1, 0x4, "cmpne32">;
+ def CMPHS32 : R_YX<0x1, 0x1, "cmphs32">;
+ def CMPLT32 : R_YX<0x1, 0x2, "cmplt32">;
+
+ // TODO: setc and clrc.
+ // TODO: test32 and tstnbz.
+
+//===----------------------------------------------------------------------===//
+// Data move instructions.
+//===----------------------------------------------------------------------===//
+
+ def MOVT32 : R_ZX<0x3, 0x2, "movt32", []>;
+ def MOVF32 : R_ZX<0x3, 0x1, "movf32", []>;
+ def MOVI32 : I_16_MOV<0x10, "movi32", uimm16>;
+ def MOVIH32 : I_16_MOV<0x11, "movih32", uimm16_16_xform>;
+ def MVC32 : R_Z_1<0x1, 0x8, "mvc32">;
+ def MOV32 : R_XZ<0x12, 0x1, "mov32">;
+
+ // TODO: ISEL Pseudo.
+
+ def MVCV32 : R_Z_1<0x1, 0x10, "mvcv32">;
+ // TODO: clrf and clrt.
+ def CLRF32 : R_Z_2<0xB, 0x1, "clrf32", []>;
+ def CLRT32 : R_Z_2<0xB, 0x2, "clrt32", []>;
+
+//===----------------------------------------------------------------------===//
+// Branch and call instructions.
+//===----------------------------------------------------------------------===//
+
+let isBranch = 1, isTerminator = 1 in {
+ let isBarrier = 1, isPredicable = 1 in
+ def BR32 : I_16_L<0x0, (outs), (ins br_symbol:$imm16), "br32\t$imm16",
+ [(br bb:$imm16)]>;
+
+ def BT32 : I_16_L<0x3, (outs), (ins CARRY:$ca, br_symbol:$imm16),
+ "bt32\t$imm16", [(brcond CARRY:$ca, bb:$imm16)]>;
+ def BF32 : I_16_L<0x2, (outs), (ins CARRY:$ca, br_symbol:$imm16),
+ "bf32\t$imm16", []>;
+}
+
+
+ def BEZ32 : I_16_X_L<0x8, "bez32", br_symbol>;
+ def BNEZ32 : I_16_X_L<0x9, "bnez32", br_symbol>;
+ def BHZ32 : I_16_X_L<0xA, "bhz32", br_symbol>;
+ def BLSZ32 : I_16_X_L<0xB, "blsz32", br_symbol>;
+ def BLZ32 : I_16_X_L<0xC, "blz32", br_symbol>;
+ def BHSZ32 : I_16_X_L<0xD, "bhsz32", br_symbol>;
+
+ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def JMP32 : I_16_JX<0x6, "jmp32", [(brind GPR:$rx)]>; // jmp to register
+ def JMPI32 : I_16_L<0x16, (outs), (ins constpool_symbol:$imm16),
+ "jmpi32\t$imm16", []>;
+ }
+
+ let isCall = 1, Defs = [ R15 ] in
+ def JSR32 : I_16_JX<0x7, "jsr32", []>;
+
+ let isCall = 1, Defs = [ R15 ] , mayLoad = 1 in
+ def JSRI32: I_16_L<0x17, (outs),
+ (ins constpool_symbol:$imm16), "jsri32\t$imm16", []>;
+
+
+def BSR32 : J<0x38, (outs), (ins call_symbol:$offset), "bsr32", []>;
+
+def BSR32_BR : J<0x38, (outs), (ins call_symbol:$offset), "bsr32", []>{
+ let isCodeGenOnly = 1;
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isPredicable = 1;
+ let Defs = [ R15 ];
+}
+
+
+ def RTS32 : I_16_RET<0x6, 0xF, "rts32", [(CSKY_RET)]>;
+
+
+def RTE32 : I_16_RET_I<0, 0, "rte32", []>;
+
+//===----------------------------------------------------------------------===//
+// Symbol address instructions.
+//===----------------------------------------------------------------------===//
+
+def GRS32 : I_18_Z_L<0x3, "grs32\t$rz, $offset",
+ (outs GPR:$rz), (ins bare_symbol:$offset), []>;
+
+let mayLoad = 1, mayStore = 0 in {
+def LRW32 : I_16_Z_L<0x14, "lrw32", (ins constpool_symbol:$imm16), []>;
+let isCodeGenOnly = 1 in
+def LRW32_Gen : I_16_Z_L<0x14, "lrw32",
+ (ins bare_symbol:$src1, constpool_symbol:$imm16), []>;
+}
+
+// TODO: Atomic and fence instructions.
+// TODO: Other operations.
+// TODO: Special instructions.
+// TODO: Pseudo for assembly.
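movih32 uses the new uimm16_16_xform operand: the ImmLeaf accepts only constants whose low sixteen bits are zero, and the uimm32_hi16 transform encodes just the high half. A self-contained check of that behaviour; the helper reimplements isShiftedUInt<16, 16> instead of pulling in MathExtras.h:

  #include <cassert>
  #include <cstdint>

  static bool isShiftedUInt16_16(uint64_t Imm) { // stand-in for isShiftedUInt<16, 16>
    return (Imm & 0xFFFFull) == 0 && (Imm >> 16) <= 0xFFFFull;
  }

  int main() {
    assert(isShiftedUInt16_16(0xABCD0000));             // accepted by the ImmLeaf
    assert(((0xABCD0000ull >> 16) & 0xFFFF) == 0xABCD); // uimm32_hi16 encoding
    assert(!isShiftedUInt16_16(0xABCD0001));            // low bits must be zero
    return 0;
  }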
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CMakeLists.txt
index f5d7df0..df59a99 100644
--- a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CMakeLists.txt
@@ -1,13 +1,15 @@
add_llvm_component_library(LLVMCSKYDesc
CSKYAsmBackend.cpp
CSKYELFObjectWriter.cpp
+ CSKYInstPrinter.cpp
CSKYMCAsmInfo.cpp
+ CSKYMCExpr.cpp
CSKYMCTargetDesc.cpp
CSKYMCCodeEmitter.cpp
LINK_COMPONENTS
- MC
CSKYInfo
+ MC
Support
ADD_TO_COMPONENT
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
index e30123d..7fb5f35 100644
--- a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
+++ b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
@@ -8,6 +8,7 @@
#include "CSKYAsmBackend.h"
#include "MCTargetDesc/CSKYMCTargetDesc.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -24,14 +25,113 @@
return createCSKYELFObjectWriter();
}
-unsigned int CSKYAsmBackend::getNumFixupKinds() const { return 1; }
+const MCFixupKindInfo &
+CSKYAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+
+ static llvm::DenseMap<unsigned, MCFixupKindInfo> Infos = {
+ {CSKY::Fixups::fixup_csky_addr32, {"fixup_csky_addr32", 0, 32, 0}},
+ {CSKY::Fixups::fixup_csky_pcrel_imm16_scale2,
+ {"fixup_csky_pcrel_imm16_scale2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}},
+ {CSKY::Fixups::fixup_csky_pcrel_uimm16_scale4,
+ {"fixup_csky_pcrel_uimm16_scale4", 0, 32, MCFixupKindInfo::FKF_IsPCRel}},
+ {CSKY::Fixups::fixup_csky_pcrel_imm26_scale2,
+ {"fixup_csky_pcrel_imm26_scale2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}},
+ {CSKY::Fixups::fixup_csky_pcrel_imm18_scale2,
+ {"fixup_csky_pcrel_imm18_scale2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}}};
+ assert(Infos.size() == CSKY::NumTargetFixupKinds &&
+ "Not all fixup kinds added to Infos array");
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ if (FirstTargetFixupKind <= Kind && Kind < FirstLiteralRelocationKind)
+ return Infos[Kind];
+ else if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+ else
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+}
+
+static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+ MCContext &Ctx) {
+ switch (Fixup.getTargetKind()) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_Data_8:
+ return Value;
+ case CSKY::fixup_csky_addr32:
+ return Value & 0xffffffff;
+ case CSKY::fixup_csky_pcrel_imm16_scale2:
+ if (!isIntN(17, Value))
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value.");
+ if (Value & 0x1)
+ Ctx.reportError(Fixup.getLoc(), "fixup value must be 2-byte aligned.");
+
+ return (Value >> 1) & 0xffff;
+ case CSKY::fixup_csky_pcrel_uimm16_scale4:
+ if (!isUIntN(18, Value))
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value.");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup value must be 4-byte aligned.");
+
+ return (Value >> 2) & 0xffff;
+ case CSKY::fixup_csky_pcrel_imm26_scale2:
+ if (!isIntN(27, Value))
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value.");
+ if (Value & 0x1)
+ Ctx.reportError(Fixup.getLoc(), "fixup value must be 2-byte aligned.");
+
+ return (Value >> 1) & 0x3ffffff;
+ case CSKY::fixup_csky_pcrel_imm18_scale2:
+ if (!isIntN(19, Value))
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value.");
+ if (Value & 0x1)
+ Ctx.reportError(Fixup.getLoc(), "fixup value must be 2-byte aligned.");
+
+ return (Value >> 1) & 0x3ffff;
+ }
+}
void CSKYAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
bool IsResolved,
const MCSubtargetInfo *STI) const {
- return;
+ MCFixupKind Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return;
+ MCContext &Ctx = Asm.getContext();
+ MCFixupKindInfo Info = getFixupKindInfo(Kind);
+ if (!Value)
+ return; // Doesn't change encoding.
+ // Apply any target-specific value adjustments.
+ Value = adjustFixupValue(Fixup, Value, Ctx);
+
+ // Shift the value into position.
+ Value <<= Info.TargetOffset;
+
+ unsigned Offset = Fixup.getOffset();
+ unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
+
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+
+ // For each byte of the fragment that the fixup touches, mask in the
+ // bits from the fixup value.
+ bool IsLittleEndian = (Endian == support::little);
+
+ if (IsLittleEndian && (NumBytes == 4)) {
+ Data[Offset + 0] |= uint8_t((Value >> 16) & 0xff);
+ Data[Offset + 1] |= uint8_t((Value >> 24) & 0xff);
+ Data[Offset + 2] |= uint8_t(Value & 0xff);
+ Data[Offset + 3] |= uint8_t((Value >> 8) & 0xff);
+ } else {
+ for (unsigned I = 0; I != NumBytes; I++) {
+ unsigned Idx = IsLittleEndian ? I : (NumBytes - 1 - I);
+ Data[Offset + Idx] |= uint8_t((Value >> (I * 8)) & 0xff);
+ }
+ }
}
bool CSKYAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
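adjustFixupValue range-checks each PC-relative value and returns it pre-scaled, and applyFixup then ORs the bits into the instruction word, which on little-endian CSKY is stored as two little-endian halfwords with the high halfword first. A standalone sketch of the fixup_csky_pcrel_imm16_scale2 path, not part of the patch:

  #include <cassert>
  #include <cstdint>

  static uint64_t adjustImm16Scale2(int64_t Value) {
    assert(Value >= -(1 << 16) && Value < (1 << 16) && "out of range"); // isIntN(17)
    assert((Value & 0x1) == 0 && "must be 2-byte aligned");
    return (static_cast<uint64_t>(Value) >> 1) & 0xffff;
  }

  int main() {
    uint8_t Data[4] = {0, 0, 0, 0};
    uint64_t V = adjustImm16Scale2(0x1234);     // encodes as 0x091a
    Data[0] |= uint8_t((V >> 16) & 0xff);
    Data[1] |= uint8_t((V >> 24) & 0xff);
    Data[2] |= uint8_t(V & 0xff);
    Data[3] |= uint8_t((V >> 8) & 0xff);
    assert(Data[2] == 0x1a && Data[3] == 0x09); // imm lands in the low halfword
    return 0;
  }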
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
index b4cba42..cdf688e 100644
--- a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
+++ b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYASMBACKEND_H
#define LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYASMBACKEND_H
+#include "MCTargetDesc/CSKYFixupKinds.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCTargetOptions.h"
@@ -20,17 +21,26 @@
CSKYAsmBackend(const MCSubtargetInfo &STI, const MCTargetOptions &OP)
: MCAsmBackend(support::little) {}
- unsigned int getNumFixupKinds() const override;
+ unsigned int getNumFixupKinds() const override {
+ return CSKY::NumTargetFixupKinds;
+ }
+
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
uint64_t Value, bool IsResolved,
const MCSubtargetInfo *STI) const override;
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
+
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override;
+
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override;
};
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYFixupKinds.h b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYFixupKinds.h
new file mode 100644
index 0000000..917f940
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYFixupKinds.h
@@ -0,0 +1,34 @@
+//===-- CSKYFixupKinds.h - CSKY Specific Fixup Entries ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYFIXUPKINDS_H
+#define LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace CSKY {
+enum Fixups {
+ fixup_csky_addr32 = FirstTargetFixupKind,
+
+ fixup_csky_pcrel_imm16_scale2,
+
+ fixup_csky_pcrel_uimm16_scale4,
+
+ fixup_csky_pcrel_imm26_scale2,
+
+ fixup_csky_pcrel_imm18_scale2,
+
+ // Marker
+ fixup_csky_invalid,
+ NumTargetFixupKinds = fixup_csky_invalid - FirstTargetFixupKind
+};
+} // end namespace CSKY
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYFIXUPKINDS_H
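The trailing marker keeps the fixup count in sync with the table in CSKYAsmBackend::getFixupKindInfo. A self-contained mirror of that arithmetic, with FirstTargetFixupKind folded out to zero for brevity:

  #include <cassert>

  enum Fixups { // mirrors the enum added in CSKYFixupKinds.h
    fixup_csky_addr32,
    fixup_csky_pcrel_imm16_scale2,
    fixup_csky_pcrel_uimm16_scale4,
    fixup_csky_pcrel_imm26_scale2,
    fixup_csky_pcrel_imm18_scale2,
    fixup_csky_invalid,
    NumTargetFixupKinds = fixup_csky_invalid
  };

  int main() {
    // Matches the five DenseMap entries in CSKYAsmBackend::getFixupKindInfo.
    assert(NumTargetFixupKinds == 5);
    return 0;
  }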
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp
new file mode 100644
index 0000000..c8920fb
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp
@@ -0,0 +1,101 @@
+//===-- CSKYInstPrinter.cpp - Convert CSKY MCInst to asm syntax ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a CSKY MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYInstPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "csky-asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "CSKYGenAsmWriter.inc"
+
+static cl::opt<bool>
+ NoAliases("csky-no-aliases",
+ cl::desc("Disable the emission of assembler pseudo instructions"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+ ArchRegNames("csky-arch-reg-names",
+ cl::desc("Print architectural register names rather than the "
+ "ABI names (such as r14 instead of sp)"),
+ cl::init(false), cl::Hidden);
+
+// The command-line flags above are used by llvm-mc and llc. They can be used by
+// `llvm-objdump`, but we override their values here to handle options passed to
+// `llvm-objdump` with `-M` (which matches GNU objdump). There did not seem to
+// be an easier way to allow these options in all these tools, without doing it
+// this way.
+bool CSKYInstPrinter::applyTargetSpecificCLOption(StringRef Opt) {
+ if (Opt == "no-aliases") {
+ NoAliases = true;
+ return true;
+ }
+ if (Opt == "numeric") {
+ ArchRegNames = true;
+ return true;
+ }
+
+ return false;
+}
+
+void CSKYInstPrinter::printInst(const MCInst *MI, uint64_t Address,
+ StringRef Annot, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCInst *NewMI = MI;
+
+ if (NoAliases || !printAliasInstr(NewMI, Address, STI, O))
+ printInstruction(NewMI, Address, STI, O);
+ printAnnotation(O, Annot);
+}
+
+void CSKYInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
+ O << getRegisterName(RegNo);
+}
+
+void CSKYInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O,
+ const char *Modifier) {
+ assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+ const MCOperand &MO = MI->getOperand(OpNo);
+
+ if (MO.isReg()) {
+ if (MO.getReg() == CSKY::C)
+ O << "";
+ else
+ printRegName(O, MO.getReg());
+ return;
+ }
+
+ if (MO.isImm()) {
+ O << formatImm(MO.getImm());
+ return;
+ }
+
+ assert(MO.isExpr() && "Unknown operand kind in printOperand");
+ MO.getExpr()->print(O, &MAI);
+}
+
+const char *CSKYInstPrinter::getRegisterName(unsigned RegNo) {
+ return getRegisterName(RegNo, ArchRegNames ? CSKY::NoRegAltName
+ : CSKY::ABIRegAltName);
+}
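getRegisterName picks between the architectural and ABI alternate-name tables depending on ArchRegNames (or the objdump-style "numeric" option). A toy version of that choice; the r14/sp pairing is taken from the option description above, while "lr" for r15 is only an assumed alias:

  #include <cstdio>

  // Stand-in for the two generated alt-name tables selected by getRegisterName.
  static const char *regName(unsigned RegNo, bool ArchRegNames) {
    static const char *Arch[] = {"r13", "r14", "r15"};
    static const char *Abi[] = {"r13", "sp", "lr"};
    return (ArchRegNames ? Arch : Abi)[RegNo - 13];
  }

  int main() {
    printf("%s %s\n", regName(14, true), regName(14, false)); // "r14 sp"
    return 0;
  }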
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h
new file mode 100644
index 0000000..a28791a
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h
@@ -0,0 +1,52 @@
+//===-- CSKYInstPrinter.h - Convert CSKY MCInst to asm syntax ---*- C++ -*----//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a CSKY MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYINSTPRINTER_H
+#define LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYINSTPRINTER_H
+
+#include "MCTargetDesc/CSKYMCTargetDesc.h"
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class CSKYInstPrinter : public MCInstPrinter {
+public:
+ CSKYInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ bool applyTargetSpecificCLOption(StringRef Opt) override;
+
+ void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
+ const MCSubtargetInfo &STI, raw_ostream &O) override;
+ void printRegName(raw_ostream &O, unsigned RegNo) const override;
+
+ void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O, const char *Modifier = nullptr);
+
+ // Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
+ void printInstruction(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ static const char *getRegisterName(unsigned RegNo);
+ static const char *getRegisterName(unsigned RegNo, unsigned AltIdx);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYINSTPRINTER_H
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp
index ed2b0e7..1a5b022 100644
--- a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp
+++ b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp
@@ -62,6 +62,17 @@
return 0;
}
+MCFixupKind CSKYMCCodeEmitter::getTargetFixup(const MCExpr *Expr) const {
+ const CSKYMCExpr *CSKYExpr = cast<CSKYMCExpr>(Expr);
+
+ switch (CSKYExpr->getKind()) {
+ default:
+ llvm_unreachable("Unhandled fixup kind!");
+ case CSKYMCExpr::VK_CSKY_ADDR:
+ return MCFixupKind(CSKY::fixup_csky_addr32);
+ }
+}
+
MCCodeEmitter *llvm::createCSKYMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx) {
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h
index c850a4b..a4c50d9 100644
--- a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h
+++ b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h
@@ -13,6 +13,8 @@
#ifndef LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCCODEEMITTER_H
#define LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCCODEEMITTER_H
+#include "CSKYMCExpr.h"
+#include "MCTargetDesc/CSKYFixupKinds.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -54,6 +56,78 @@
unsigned getOImmOpValue(const MCInst &MI, unsigned Idx,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+
+ unsigned getImmShiftOpValue(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(Idx);
+ assert(MO.isImm() && "Unexpected MO type.");
+ return 1 << MO.getImm();
+ }
+
+ MCFixupKind getTargetFixup(const MCExpr *Expr) const;
+
+ template <llvm::CSKY::Fixups FIXUP>
+ unsigned getBranchSymbolOpValue(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(Idx);
+
+ if (MO.isImm())
+ return MO.getImm() >> 1;
+
+ assert(MO.isExpr() && "Unexpected MO type.");
+
+ MCFixupKind Kind = MCFixupKind(FIXUP);
+ if (MO.getExpr()->getKind() == MCExpr::Target)
+ Kind = getTargetFixup(MO.getExpr());
+
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+ return 0;
+ }
+
+ template <llvm::CSKY::Fixups FIXUP>
+ unsigned getConstpoolSymbolOpValue(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(Idx);
+ assert(MO.isExpr() && "Unexpected MO type.");
+
+ MCFixupKind Kind = MCFixupKind(FIXUP);
+ if (MO.getExpr()->getKind() == MCExpr::Target)
+ Kind = getTargetFixup(MO.getExpr());
+
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+ return 0;
+ }
+
+ unsigned getCallSymbolOpValue(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(Idx);
+ assert(MO.isExpr() && "Unexpected MO type.");
+
+ MCFixupKind Kind = MCFixupKind(CSKY::fixup_csky_pcrel_imm26_scale2);
+ if (MO.getExpr()->getKind() == MCExpr::Target)
+ Kind = getTargetFixup(MO.getExpr());
+
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+ return 0;
+ }
+
+ unsigned getBareSymbolOpValue(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(Idx);
+ assert(MO.isExpr() && "Unexpected MO type.");
+
+ MCFixupKind Kind = MCFixupKind(CSKY::fixup_csky_pcrel_imm18_scale2);
+ if (MO.getExpr()->getKind() == MCExpr::Target)
+ Kind = getTargetFixup(MO.getExpr());
+
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+ return 0;
+ }
};
} // namespace llvm
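The four symbol-operand encoders above share one shape: a fully resolved immediate is emitted already scaled, while anything symbolic records a fixup and contributes zero bits until applyFixup resolves it. A minimal sketch of that pattern; the Fixup struct below is a placeholder, not llvm::MCFixup:

  #include <cassert>
  #include <cstdint>
  #include <vector>

  struct Fixup { unsigned Kind; };

  static uint64_t encodeBranchOperand(bool IsImm, int64_t Imm, unsigned Kind,
                                      std::vector<Fixup> &Fixups) {
    if (IsImm)
      return static_cast<uint64_t>(Imm) >> 1; // scale2, as in getBranchSymbolOpValue
    Fixups.push_back({Kind});                 // resolved later by applyFixup
    return 0;
  }

  int main() {
    std::vector<Fixup> Fixups;
    assert(encodeBranchOperand(true, 0x40, 0, Fixups) == 0x20);
    assert(encodeBranchOperand(false, 0, /*pcrel_imm16_scale2*/ 1, Fixups) == 0);
    assert(Fixups.size() == 1);
    return 0;
  }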
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp
new file mode 100644
index 0000000..59e630f
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp
@@ -0,0 +1,122 @@
+//===-- CSKYMCExpr.cpp - CSKY specific MC expression classes -*- C++ -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYMCExpr.h"
+#include "CSKYFixupKinds.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "csky-mc-expr"
+
+const CSKYMCExpr *CSKYMCExpr::create(const MCExpr *Expr, VariantKind Kind,
+ MCContext &Ctx) {
+ return new (Ctx) CSKYMCExpr(Kind, Expr);
+}
+
+StringRef CSKYMCExpr::getVariantKindName(VariantKind Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("Invalid ELF symbol kind");
+ case VK_CSKY_ADDR:
+ return "";
+ case VK_CSKY_PCREL:
+ return "";
+ case VK_CSKY_GOT:
+ return "@GOT";
+ case VK_CSKY_GOTPC:
+ return "@GOTPC";
+ case VK_CSKY_GOTOFF:
+ return "@GOTOFF";
+ case VK_CSKY_PLT:
+ return "@PLT";
+ case VK_CSKY_TPOFF:
+ return "@TPOFF";
+ case VK_CSKY_TLSGD:
+ return "@TLSGD";
+ }
+}
+
+void CSKYMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*getSubExpr());
+}
+
+void CSKYMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+ Expr->print(OS, MAI);
+ OS << getVariantKindName(getKind());
+}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+ switch (Expr->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Can't handle nested target expression");
+ break;
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+ fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+ fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+ break;
+ }
+
+ case MCExpr::SymbolRef: {
+ // We're known to be under a TLS fixup, so any symbol should be
+ // modified. There should be only one.
+ const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+ cast<MCSymbolELF>(SymRef.getSymbol()).setType(ELF::STT_TLS);
+ break;
+ }
+
+ case MCExpr::Unary:
+ fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void CSKYMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+ switch (getKind()) {
+ default:
+ return;
+ case VK_CSKY_TPOFF:
+ case VK_CSKY_TLSGD:
+ break;
+ }
+
+ fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
+
+bool CSKYMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
+ return false;
+
+ // Some custom fixup types are not valid with symbol difference expressions
+ if (Res.getSymA() && Res.getSymB()) {
+ switch (getKind()) {
+ default:
+ return true;
+
+ case VK_CSKY_ADDR:
+ case VK_CSKY_PCREL:
+ case VK_CSKY_GOT:
+ case VK_CSKY_GOTPC:
+ case VK_CSKY_GOTOFF:
+ case VK_CSKY_TPOFF:
+ case VK_CSKY_TLSGD:
+ return false;
+ }
+ }
+
+ return true;
+}
\ No newline at end of file
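Between parseCSKYSymbol in the assembler parser and printImpl here, the relocation suffix makes a round trip: the parser strips it from the identifier and remembers the variant kind, and the printer appends getVariantKindName again. A standalone sketch of the stripping side, using plain std::string in place of StringRef::consume_back:

  #include <cassert>
  #include <string>

  enum VariantKind { VK_None, VK_GOT, VK_GOTOFF, VK_PLT, VK_GOTPC };

  static VariantKind stripSuffix(std::string &Id) {
    auto consumeBack = [&Id](const char *Suf) {
      std::string S(Suf);
      if (Id.size() >= S.size() &&
          Id.compare(Id.size() - S.size(), S.size(), S) == 0) {
        Id.resize(Id.size() - S.size());
        return true;
      }
      return false;
    };
    // Same suffixes, in the same order, as parseCSKYSymbol.
    if (consumeBack("@GOT"))    return VK_GOT;
    if (consumeBack("@GOTOFF")) return VK_GOTOFF;
    if (consumeBack("@PLT"))    return VK_PLT;
    if (consumeBack("@GOTPC"))  return VK_GOTPC;
    return VK_None;
  }

  int main() {
    std::string Id = "foo@GOTOFF";
    assert(stripSuffix(Id) == VK_GOTOFF && Id == "foo");
    return 0;
  }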
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.h b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.h
new file mode 100644
index 0000000..06fccad
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.h
@@ -0,0 +1,69 @@
+//===-- CSKYMCExpr.h - CSKY specific MC expression classes -*- C++ -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCEXPR_H
+#define LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+namespace llvm {
+
+class CSKYMCExpr : public MCTargetExpr {
+public:
+ enum VariantKind {
+ VK_CSKY_None,
+ VK_CSKY_ADDR,
+ VK_CSKY_PCREL,
+ VK_CSKY_GOT,
+ VK_CSKY_GOTPC,
+ VK_CSKY_GOTOFF,
+ VK_CSKY_PLT,
+ VK_CSKY_TPOFF,
+ VK_CSKY_TLSGD,
+ VK_CSKY_Invalid
+ };
+
+private:
+ const VariantKind Kind;
+ const MCExpr *Expr;
+
+ explicit CSKYMCExpr(VariantKind Kind, const MCExpr *Expr)
+ : Kind(Kind), Expr(Expr) {}
+
+public:
+ static const CSKYMCExpr *create(const MCExpr *Expr, VariantKind Kind,
+ MCContext &Ctx);
+
+ // Returns the kind of this expression.
+ VariantKind getKind() const { return Kind; }
+
+ // Returns the child of this expression.
+ const MCExpr *getSubExpr() const { return Expr; }
+
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
+
+ MCFragment *findAssociatedFragment() const override {
+ return getSubExpr()->findAssociatedFragment();
+ }
+
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+
+ static StringRef getVariantKindName(VariantKind Kind);
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCEXPR_H
diff --git a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
index 876000a..169e1e1 100644
--- a/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
+++ b/src/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
@@ -12,6 +12,7 @@
#include "CSKYMCTargetDesc.h"
#include "CSKYAsmBackend.h"
+#include "CSKYInstPrinter.h"
#include "CSKYMCAsmInfo.h"
#include "CSKYMCCodeEmitter.h"
#include "TargetInfo/CSKYTargetInfo.h"
@@ -26,6 +27,9 @@
#define GET_REGINFO_MC_DESC
#include "CSKYGenRegisterInfo.inc"
+#define GET_SUBTARGETINFO_MC_DESC
+#include "CSKYGenSubtargetInfo.inc"
+
using namespace llvm;
static MCAsmInfo *createCSKYMCAsmInfo(const MCRegisterInfo &MRI,
@@ -46,12 +50,28 @@
return Info;
}
+static MCInstPrinter *createCSKYMCInstPrinter(const Triple &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ return new CSKYInstPrinter(MAI, MII, MRI);
+}
+
static MCRegisterInfo *createCSKYMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *Info = new MCRegisterInfo();
InitCSKYMCRegisterInfo(Info, CSKY::R15);
return Info;
}
+static MCSubtargetInfo *createCSKYMCSubtargetInfo(const Triple &TT,
+ StringRef CPU, StringRef FS) {
+ std::string CPUName = std::string(CPU);
+ if (CPUName.empty())
+ CPUName = "generic";
+ return createCSKYMCSubtargetInfoImpl(TT, CPUName, /*TuneCPU=*/CPUName, FS);
+}
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYTargetMC() {
auto &CSKYTarget = getTheCSKYTarget();
TargetRegistry::RegisterMCAsmBackend(CSKYTarget, createCSKYAsmBackend);
@@ -59,4 +79,7 @@
TargetRegistry::RegisterMCInstrInfo(CSKYTarget, createCSKYMCInstrInfo);
TargetRegistry::RegisterMCRegInfo(CSKYTarget, createCSKYMCRegisterInfo);
TargetRegistry::RegisterMCCodeEmitter(CSKYTarget, createCSKYMCCodeEmitter);
+ TargetRegistry::RegisterMCInstPrinter(CSKYTarget, createCSKYMCInstPrinter);
+ TargetRegistry::RegisterMCSubtargetInfo(CSKYTarget,
+ createCSKYMCSubtargetInfo);
}
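createCSKYMCSubtargetInfo defaults an empty CPU string to the "generic" processor that CSKY.td now defines. A trivial standalone sketch of that fallback; the "ck810" name is illustrative only:

  #include <cassert>
  #include <string>

  static std::string pickCPU(const std::string &CPU) {
    return CPU.empty() ? "generic" : CPU; // mirrors createCSKYMCSubtargetInfo
  }

  int main() {
    assert(pickCPU("") == "generic");
    assert(pickCPU("ck810") == "ck810");
    return 0;
  }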
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index b6763fd..7edc2a0 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -6,8 +6,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "mcasmparser"
-
#include "HexagonTargetStreamer.h"
#include "MCTargetDesc/HexagonMCChecker.h"
#include "MCTargetDesc/HexagonMCELFStreamer.h"
@@ -58,6 +56,8 @@
#include <string>
#include <utility>
+#define DEBUG_TYPE "mcasmparser"
+
using namespace llvm;
static cl::opt<bool> WarnMissingParenthesis(
@@ -510,19 +510,19 @@
"supported with this architecture";
StringRef Option = Parser.getTok().getString();
auto IDLoc = Parser.getTok().getLoc();
- if (Option.compare_lower("endloop01") == 0) {
+ if (Option.compare_insensitive("endloop01") == 0) {
HexagonMCInstrInfo::setInnerLoop(MCB);
HexagonMCInstrInfo::setOuterLoop(MCB);
- } else if (Option.compare_lower("endloop0") == 0) {
+ } else if (Option.compare_insensitive("endloop0") == 0) {
HexagonMCInstrInfo::setInnerLoop(MCB);
- } else if (Option.compare_lower("endloop1") == 0) {
+ } else if (Option.compare_insensitive("endloop1") == 0) {
HexagonMCInstrInfo::setOuterLoop(MCB);
- } else if (Option.compare_lower("mem_noshuf") == 0) {
+ } else if (Option.compare_insensitive("mem_noshuf") == 0) {
if (getSTI().getFeatureBits()[Hexagon::FeatureMemNoShuf])
HexagonMCInstrInfo::setMemReorderDisabled(MCB);
else
return getParser().Error(IDLoc, MemNoShuffMsg);
- } else if (Option.compare_lower("mem_no_order") == 0) {
+ } else if (Option.compare_insensitive("mem_no_order") == 0) {
// Nothing.
} else
return getParser().Error(IDLoc, llvm::Twine("'") + Option +
@@ -838,7 +838,8 @@
MCParsedAsmOperand &Operand = *Operands[Operands.size() - Index - 1];
if (!Operand.isToken())
return false;
- return static_cast<HexagonOperand &>(Operand).getToken().equals_lower(String);
+ return static_cast<HexagonOperand &>(Operand).getToken().equals_insensitive(
+ String);
}
static bool previousIsLoop(OperandVector &Operands, size_t Index) {
@@ -892,7 +893,7 @@
HexagonOperand::CreateReg(getContext(), Register, Begin, End));
const AsmToken &MaybeDotNew = Lexer.getTok();
if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) &&
- MaybeDotNew.getString().equals_lower(".new"))
+ MaybeDotNew.getString().equals_insensitive(".new"))
splitIdentifier(Operands);
Operands.push_back(
HexagonOperand::CreateToken(getContext(), RParen, Begin));
@@ -910,7 +911,7 @@
HexagonOperand::CreateReg(getContext(), Register, Begin, End));
const AsmToken &MaybeDotNew = Lexer.getTok();
if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) &&
- MaybeDotNew.getString().equals_lower(".new"))
+ MaybeDotNew.getString().equals_insensitive(".new"))
splitIdentifier(Operands);
Operands.push_back(
HexagonOperand::CreateToken(getContext(), RParen, Begin));
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/Hexagon/CMakeLists.txt
index 5b20dd5..c6b8db2 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -69,7 +69,7 @@
RDFDeadCode.cpp
LINK_COMPONENTS
- Analysis
+ Analysis
AsmPrinter
CodeGen
Core
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index aeaeac6..80a987c 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -6,8 +6,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hexagon-disassembler"
-
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCChecker.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
@@ -32,6 +30,8 @@
#include <cstdint>
#include <memory>
+#define DEBUG_TYPE "hexagon-disassembler"
+
using namespace llvm;
using namespace Hexagon;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/Hexagon.td b/src/llvm-project/llvm/lib/Target/Hexagon/Hexagon.td
index 2fadb0b..7518fd7 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/Hexagon.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/Hexagon.td
@@ -48,7 +48,10 @@
def ExtensionHVXV67: SubtargetFeature<"hvxv67", "HexagonHVXVersion",
"Hexagon::ArchEnum::V67", "Hexagon HVX instructions",
[ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, ExtensionHVXV66]>;
-
+def ExtensionHVXV68: SubtargetFeature<"hvxv68", "HexagonHVXVersion",
+ "Hexagon::ArchEnum::V68", "Hexagon HVX instructions",
+ [ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, ExtensionHVXV66,
+ ExtensionHVXV67]>;
def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps",
"true", "Hexagon HVX 64B instructions", [ExtensionHVX]>;
@@ -107,6 +110,8 @@
AssemblerPredicate<(all_of ExtensionHVXV66)>;
def UseHVXV67 : Predicate<"HST->useHVXV67Ops()">,
AssemblerPredicate<(all_of ExtensionHVXV67)>;
+def UseHVXV68 : Predicate<"HST->useHVXV68Ops()">,
+ AssemblerPredicate<(all_of ExtensionHVXV68)>;
def UseAudio : Predicate<"HST->useAudioOps()">,
AssemblerPredicate<(all_of ExtensionAudio)>;
def UseZReg : Predicate<"HST->useZRegOps()">,
@@ -394,6 +399,11 @@
[ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, ArchV67,
FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops,
FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
+def : Proc<"hexagonv68", HexagonModelV68,
+ [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, ArchV67,
+ ArchV68,
+ FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops,
+ FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
// Need to update the correct features for tiny core.
// Disable NewValueJumps since the packetizer is unable to handle a packet with
// a new value jump and another SLOT0 instruction.
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
index 6891455..faa4821 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
@@ -6,8 +6,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hexagon-brelax"
-
#include "Hexagon.h"
#include "HexagonInstrInfo.h"
#include "HexagonSubtarget.h"
@@ -29,6 +27,8 @@
#include <cstdlib>
#include <iterator>
+#define DEBUG_TYPE "hexagon-brelax"
+
using namespace llvm;
// Since we have no exact knowledge of code layout, allow some safety buffer
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 11e7d5a..9f18d0b 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/PostDominators.h"
@@ -179,8 +180,20 @@
Root = 0x01,
Internal = 0x02,
Used = 0x04,
- InBounds = 0x08
+ InBounds = 0x08,
+ Pointer = 0x10, // See note below.
};
+ // Note: GEP indices generally traverse nested types, and so a GepNode
+ // (representing a single index) can be associated with some composite
+ // type. The exception is the GEP input, which is a pointer, and not
+ // a composite type (at least not in the sense of having sub-types).
+ // Also, the corresponding index plays a different role as well: it is
+ // simply added to the input pointer. Since pointer types are becoming
+ // opaque (i.e. are no longer going to include the pointee type), the
+ // two pieces of information (1) the fact that it's a pointer, and
+ // (2) the pointee type, need to be stored separately. The pointee type
+ // will be stored in the PTy member, while the fact that the node
+ // operates on a pointer will be reflected by the flag "Pointer".
uint32_t Flags = 0;
union {
@@ -188,7 +201,9 @@
Value *BaseVal;
};
Value *Idx = nullptr;
- Type *PTy = nullptr; // Type of the pointer operand.
+ Type *PTy = nullptr; // Type indexed by this node. For pointer nodes
+ // this is the "pointee" type, and indexing a
+ // pointer does not change the type.
GepNode() : Parent(nullptr) {}
GepNode(const GepNode *N) : Flags(N->Flags), Idx(N->Idx), PTy(N->PTy) {
@@ -201,12 +216,6 @@
friend raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN);
};
- Type *next_type(Type *Ty, Value *Idx) {
- if (auto *PTy = dyn_cast<PointerType>(Ty))
- return PTy->getElementType();
- return GetElementPtrInst::getTypeAtIndex(Ty, Idx);
- }
-
raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN) {
OS << "{ {";
bool Comma = false;
@@ -230,6 +239,11 @@
OS << ',';
OS << "inbounds";
}
+ if (GN.Flags & GepNode::Pointer) {
+ if (Comma)
+ OS << ',';
+ OS << "pointer";
+ }
OS << "} ";
if (GN.Flags & GepNode::Root)
OS << "BaseVal:" << GN.BaseVal->getName() << '(' << GN.BaseVal << ')';
@@ -347,7 +361,8 @@
// chain. Link to it here.
N->Parent = F->second;
}
- N->PTy = PtrOp->getType();
+ N->PTy = GepI->getSourceElementType();
+ N->Flags |= GepNode::Pointer;
N->Idx = *GepI->idx_begin();
// Collect the list of users of this GEP instruction. Will add it to the
@@ -367,10 +382,10 @@
Nodes.push_back(N);
NodeOrder.insert(N);
- // Skip the first index operand, since we only handle 0. This dereferences
- // the pointer operand.
+ // Skip the first index operand, since it was already handled above. This
+ // dereferences the pointer operand.
GepNode *PN = N;
- Type *PtrTy = cast<PointerType>(PtrOp->getType())->getElementType();
+ Type *PtrTy = GepI->getSourceElementType();
for (User::op_iterator OI = GepI->idx_begin()+1, OE = GepI->idx_end();
OI != OE; ++OI) {
Value *Op = *OI;
@@ -383,7 +398,7 @@
NodeOrder.insert(Nx);
PN = Nx;
- PtrTy = next_type(PtrTy, Op);
+ PtrTy = GetElementPtrInst::getTypeAtIndex(PtrTy, Op);
}
// After last node has been created, update the use information.
@@ -503,16 +518,18 @@
return false;
// Not previously compared.
bool Root1 = N1->Flags & GepNode::Root;
- bool Root2 = N2->Flags & GepNode::Root;
+ uint32_t CmpFlags = GepNode::Root | GepNode::Pointer;
+ bool Different = (N1->Flags & CmpFlags) != (N2->Flags & CmpFlags);
NodePair P = node_pair(N1, N2);
- // If the Root flag has different values, the nodes are different.
+ // If the root/pointer flags have different values, the nodes are
+ // different.
// If both nodes are root nodes, but their base pointers differ,
// they are different.
- if (Root1 != Root2 || (Root1 && N1->BaseVal != N2->BaseVal)) {
+ if (Different || (Root1 && N1->BaseVal != N2->BaseVal)) {
Ne.insert(P);
return false;
}
- // Here the root flags are identical, and for root nodes the
+ // Here the root/pointer flags are identical, and for root nodes the
// base pointers are equal, so the root nodes are equal.
// For non-root nodes, compare their parent nodes.
if (Root1 || node_eq(N1->Parent, N2->Parent, Eq, Ne)) {
@@ -927,8 +944,10 @@
for (NodeToValueMap::const_iterator I = Loc.Map.begin(), E = Loc.Map.end();
I != E; ++I) {
OS << I->first << " -> ";
- BasicBlock *B = cast<BasicBlock>(I->second);
- OS << B->getName() << '(' << B << ')';
+ if (BasicBlock *B = cast_or_null<BasicBlock>(I->second))
+ OS << B->getName() << '(' << B << ')';
+ else
+ OS << "<null-block>";
OS << '\n';
}
return OS;
@@ -1088,41 +1107,39 @@
GetElementPtrInst *NewInst = nullptr;
Value *Input = RN->BaseVal;
- Value **IdxList = new Value*[Num+1];
- unsigned nax = 0;
+ Type *InpTy = RN->PTy;
+
+ unsigned Idx = 0;
do {
- unsigned IdxC = 0;
+ SmallVector<Value*, 4> IdxList;
// If the type of the input of the first node is not a pointer,
// we need to add an artificial i32 0 to the indices (because the
// actual input in the IR will be a pointer).
- if (!NA[nax]->PTy->isPointerTy()) {
+ if (!(NA[Idx]->Flags & GepNode::Pointer)) {
Type *Int32Ty = Type::getInt32Ty(*Ctx);
- IdxList[IdxC++] = ConstantInt::get(Int32Ty, 0);
+ IdxList.push_back(ConstantInt::get(Int32Ty, 0));
}
// Keep adding indices from NA until we have to stop and generate
// an "intermediate" GEP.
- while (++nax <= Num) {
- GepNode *N = NA[nax-1];
- IdxList[IdxC++] = N->Idx;
- if (nax < Num) {
- // We have to stop, if the expected type of the output of this node
- // is not the same as the input type of the next node.
- Type *NextTy = next_type(N->PTy, N->Idx);
- if (NextTy != NA[nax]->PTy)
+ while (++Idx <= Num) {
+ GepNode *N = NA[Idx-1];
+ IdxList.push_back(N->Idx);
+ if (Idx < Num) {
+ // We have to stop if we reach a pointer.
+ if (NA[Idx]->Flags & GepNode::Pointer)
break;
}
}
- ArrayRef<Value*> A(IdxList, IdxC);
- Type *InpTy = Input->getType();
- Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType();
- NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", &*At);
+ NewInst = GetElementPtrInst::Create(InpTy, Input, IdxList, "cgep", &*At);
NewInst->setIsInBounds(RN->Flags & GepNode::InBounds);
LLVM_DEBUG(dbgs() << "new GEP: " << *NewInst << '\n');
- Input = NewInst;
- } while (nax <= Num);
+ if (Idx < Num) {
+ Input = NewInst;
+ InpTy = NA[Idx]->PTy;
+ }
+ } while (Idx <= Num);
- delete[] IdxList;
return NewInst;
}
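
The GepNode note above explains the opaque-pointer split: the pass now records separately that a node indexes through the pointer operand (the Pointer flag) and which type it indexes (PTy, taken from getSourceElementType()), instead of reading a pointee type off the pointer. A minimal sketch of that type walk under those assumptions; collectGepIndexTypes is a hypothetical helper, not part of the pass:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Type.h"

    using namespace llvm;

    // Hypothetical helper: walk a GEP the way the rewritten pass does.
    // Index #0 steps through the pointer and uses the GEP's source element
    // type; each later index steps into that type with getTypeAtIndex, so
    // no pointee type is ever taken from a (possibly opaque) pointer type.
    static void collectGepIndexTypes(GetElementPtrInst *GepI,
                                     SmallVectorImpl<Type *> &Tys) {
      Type *CurTy = GepI->getSourceElementType();
      Tys.push_back(CurTy);
      for (auto OI = GepI->idx_begin() + 1, OE = GepI->idx_end(); OI != OE; ++OI) {
        CurTy = GetElementPtrInst::getTypeAtIndex(CurTy, *OI);
        Tys.push_back(CurTy);
      }
    }

This also shows why the local next_type() helper could be dropped: the only pointer-typed step left is the GEP input itself, which the node marks with GepNode::Pointer.
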
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
index 4a2b060..954e615 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -6,8 +6,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hcp"
-
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
@@ -45,6 +43,8 @@
#include <utility>
#include <vector>
+#define DEBUG_TYPE "hcp"
+
using namespace llvm;
namespace {
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepArch.h b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepArch.h
index 45b4cf0..7a43a44 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepArch.h
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepArch.h
@@ -14,35 +14,76 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/ELF.h"
+
#include <map>
+#include <string>
namespace llvm {
namespace Hexagon {
-enum class ArchEnum { NoArch, Generic, V5, V55, V60, V62, V65, V66, V67 };
+enum class ArchEnum { NoArch, Generic, V5, V55, V60, V62, V65, V66, V67, V68 };
-static constexpr unsigned ArchValsNumArray[] = {5, 55, 60, 62, 65, 66, 67};
+static constexpr unsigned ArchValsNumArray[] = {5, 55, 60, 62, 65, 66, 67, 68};
static constexpr ArrayRef<unsigned> ArchValsNum(ArchValsNumArray);
-static constexpr StringLiteral ArchValsTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67" };
+static constexpr StringLiteral ArchValsTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v68" };
static constexpr ArrayRef<StringLiteral> ArchValsText(ArchValsTextArray);
-static constexpr StringLiteral CpuValsTextArray[] = { "hexagonv5", "hexagonv55", "hexagonv60", "hexagonv62", "hexagonv65", "hexagonv66", "hexagonv67", "hexagonv67t" };
+static constexpr StringLiteral CpuValsTextArray[] = { "hexagonv5", "hexagonv55", "hexagonv60", "hexagonv62", "hexagonv65", "hexagonv66", "hexagonv67", "hexagonv67t", "hexagonv68" };
static constexpr ArrayRef<StringLiteral> CpuValsText(CpuValsTextArray);
-static constexpr StringLiteral CpuNickTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v67t" };
+static constexpr StringLiteral CpuNickTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v67t", "v68" };
static constexpr ArrayRef<StringLiteral> CpuNickText(CpuNickTextArray);
static const std::map<std::string, ArchEnum> CpuTable{
- {"generic", Hexagon::ArchEnum::V60},
- {"hexagonv5", Hexagon::ArchEnum::V5},
- {"hexagonv55", Hexagon::ArchEnum::V55},
- {"hexagonv60", Hexagon::ArchEnum::V60},
- {"hexagonv62", Hexagon::ArchEnum::V62},
- {"hexagonv65", Hexagon::ArchEnum::V65},
- {"hexagonv66", Hexagon::ArchEnum::V66},
- {"hexagonv67", Hexagon::ArchEnum::V67},
- {"hexagonv67t", Hexagon::ArchEnum::V67},
+ {"generic", Hexagon::ArchEnum::V5},
+ {"hexagonv5", Hexagon::ArchEnum::V5},
+ {"hexagonv55", Hexagon::ArchEnum::V55},
+ {"hexagonv60", Hexagon::ArchEnum::V60},
+ {"hexagonv62", Hexagon::ArchEnum::V62},
+ {"hexagonv65", Hexagon::ArchEnum::V65},
+ {"hexagonv66", Hexagon::ArchEnum::V66},
+ {"hexagonv67", Hexagon::ArchEnum::V67},
+ {"hexagonv67t", Hexagon::ArchEnum::V67},
+ {"hexagonv68", Hexagon::ArchEnum::V68},
};
+
+static const std::map<std::string, unsigned> ElfFlagsByCpuStr = {
+ {"generic", llvm::ELF::EF_HEXAGON_MACH_V5},
+ {"hexagonv5", llvm::ELF::EF_HEXAGON_MACH_V5},
+ {"hexagonv55", llvm::ELF::EF_HEXAGON_MACH_V55},
+ {"hexagonv60", llvm::ELF::EF_HEXAGON_MACH_V60},
+ {"hexagonv62", llvm::ELF::EF_HEXAGON_MACH_V62},
+ {"hexagonv65", llvm::ELF::EF_HEXAGON_MACH_V65},
+ {"hexagonv66", llvm::ELF::EF_HEXAGON_MACH_V66},
+ {"hexagonv67", llvm::ELF::EF_HEXAGON_MACH_V67},
+ {"hexagonv67t", llvm::ELF::EF_HEXAGON_MACH_V67T},
+ {"hexagonv68", llvm::ELF::EF_HEXAGON_MACH_V68},
+};
+static const std::map<unsigned, std::string> ElfArchByMachFlags = {
+ {llvm::ELF::EF_HEXAGON_MACH_V5, "V5"},
+ {llvm::ELF::EF_HEXAGON_MACH_V55, "V55"},
+ {llvm::ELF::EF_HEXAGON_MACH_V60, "V60"},
+ {llvm::ELF::EF_HEXAGON_MACH_V62, "V62"},
+ {llvm::ELF::EF_HEXAGON_MACH_V65, "V65"},
+ {llvm::ELF::EF_HEXAGON_MACH_V66, "V66"},
+ {llvm::ELF::EF_HEXAGON_MACH_V67, "V67"},
+ {llvm::ELF::EF_HEXAGON_MACH_V67T, "V67T"},
+ {llvm::ELF::EF_HEXAGON_MACH_V68, "V68"},
+};
+static const std::map<unsigned, std::string> ElfCpuByMachFlags = {
+ {llvm::ELF::EF_HEXAGON_MACH_V5, "hexagonv5"},
+ {llvm::ELF::EF_HEXAGON_MACH_V55, "hexagonv55"},
+ {llvm::ELF::EF_HEXAGON_MACH_V60, "hexagonv60"},
+ {llvm::ELF::EF_HEXAGON_MACH_V62, "hexagonv62"},
+ {llvm::ELF::EF_HEXAGON_MACH_V65, "hexagonv65"},
+ {llvm::ELF::EF_HEXAGON_MACH_V66, "hexagonv66"},
+ {llvm::ELF::EF_HEXAGON_MACH_V67, "hexagonv67"},
+ {llvm::ELF::EF_HEXAGON_MACH_V67T, "hexagonv67t"},
+ {llvm::ELF::EF_HEXAGON_MACH_V68, "hexagonv68"},
+};
+
} // namespace Hexagon
} // namespace llvm;
-#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPARCH_H
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPARCH_H
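
The tables added to HexagonDepArch.h pair each supported CPU string with its EF_HEXAGON_MACH_* e_flags value and provide the reverse mappings. A minimal lookup sketch assuming only these maps; elfFlagsForCpu is a hypothetical helper, not part of the header:

    #include "HexagonDepArch.h"
    #include "llvm/ADT/Optional.h"
    #include <string>

    // Hypothetical helper: e.g. "hexagonv68" -> llvm::ELF::EF_HEXAGON_MACH_V68.
    static llvm::Optional<unsigned> elfFlagsForCpu(const std::string &Cpu) {
      auto It = llvm::Hexagon::ElfFlagsByCpuStr.find(Cpu);
      if (It == llvm::Hexagon::ElfFlagsByCpuStr.end())
        return llvm::None;
      return It->second;
    }
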
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepArch.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepArch.td
index 9374055..e743a29 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepArch.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepArch.td
@@ -22,3 +22,5 @@
def HasV66 : Predicate<"HST->hasV66Ops()">, AssemblerPredicate<(all_of ArchV66)>;
def ArchV67: SubtargetFeature<"v67", "HexagonArchVersion", "Hexagon::ArchEnum::V67", "Enable Hexagon V67 architecture">;
def HasV67 : Predicate<"HST->hasV67Ops()">, AssemblerPredicate<(all_of ArchV67)>;
+def ArchV68: SubtargetFeature<"v68", "HexagonArchVersion", "Hexagon::ArchEnum::V68", "Enable Hexagon V68 architecture">;
+def HasV68 : Predicate<"HST->hasV68Ops()">, AssemblerPredicate<(all_of ArchV68)>;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc
index ce7aa02..40f6e14 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc
@@ -8,21 +8,14 @@
// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
-// clang-format off
-
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-function"
#endif
-static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp,
+static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
- signedDecoder<8>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t, const void *Decoder) {
- signedDecoder<4>(MI, tmp, Decoder);
+ signedDecoder<6>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp,
@@ -30,44 +23,49 @@
signedDecoder<12>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
-static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t, const void *Decoder) {
- signedDecoder<5>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
signedDecoder<13>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
-static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t, const void *Decoder) {
- signedDecoder<6>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
signedDecoder<14>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
+static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<3>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<4>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<5>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<6>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
signedDecoder<7>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
-static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t, const void *Decoder) {
- signedDecoder<6>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
signedDecoder<9>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
-static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp,
+static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
- signedDecoder<3>(MI, tmp, Decoder);
+ signedDecoder<8>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
@@ -75,4 +73,3 @@
#pragma clang diagnostic pop
#endif
-// clang-format on
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
index 1547e8f..a1db3ae 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
@@ -9,8 +9,6 @@
//===----------------------------------------------------------------------===//
def tc_04da405a : InstrItinClass;
-def tc_05058f6f : InstrItinClass;
-def tc_05ac6f98 : InstrItinClass;
def tc_05ca8cfd : InstrItinClass;
def tc_08a4f1b6 : InstrItinClass;
def tc_0b04c6c7 : InstrItinClass;
@@ -25,6 +23,7 @@
def tc_20a4bbec : InstrItinClass;
def tc_257f6f7c : InstrItinClass;
def tc_26a377fe : InstrItinClass;
+def tc_2b4c548e : InstrItinClass;
def tc_2c745bb8 : InstrItinClass;
def tc_2d4051cd : InstrItinClass;
def tc_2e8f5f6e : InstrItinClass;
@@ -53,12 +52,14 @@
def tc_663c80a7 : InstrItinClass;
def tc_6942b6e0 : InstrItinClass;
def tc_6e7fa133 : InstrItinClass;
+def tc_7095ecba : InstrItinClass;
def tc_71646d06 : InstrItinClass;
def tc_7177e272 : InstrItinClass;
def tc_718b5c53 : InstrItinClass;
def tc_7273323b : InstrItinClass;
def tc_7417e785 : InstrItinClass;
def tc_767c4e9d : InstrItinClass;
+def tc_7d68d5c2 : InstrItinClass;
def tc_7e6a3e89 : InstrItinClass;
def tc_8772086c : InstrItinClass;
def tc_87adc037 : InstrItinClass;
@@ -70,6 +71,8 @@
def tc_9f363d21 : InstrItinClass;
def tc_a02a10a8 : InstrItinClass;
def tc_a0dbea28 : InstrItinClass;
+def tc_a28f32b5 : InstrItinClass;
+def tc_a69eeee1 : InstrItinClass;
def tc_a7e6707d : InstrItinClass;
def tc_ab23f776 : InstrItinClass;
def tc_abe8c3b2 : InstrItinClass;
@@ -79,6 +82,7 @@
def tc_b28e51aa : InstrItinClass;
def tc_b4416217 : InstrItinClass;
def tc_b9db8205 : InstrItinClass;
+def tc_bb599486 : InstrItinClass;
def tc_c0749f3c : InstrItinClass;
def tc_c127de3a : InstrItinClass;
def tc_c4edf264 : InstrItinClass;
@@ -94,11 +98,9 @@
def tc_e3f68a46 : InstrItinClass;
def tc_e675c45a : InstrItinClass;
def tc_e699ae41 : InstrItinClass;
-def tc_e8797b98 : InstrItinClass;
def tc_e99d4c2e : InstrItinClass;
def tc_f1de44ef : InstrItinClass;
def tc_f21e8abb : InstrItinClass;
-def tc_fd7610da : InstrItinClass;
class DepHVXItinV55 {
list<InstrItinData> DepHVXItinV55_list = [
@@ -107,18 +109,6 @@
InstrStage<1, [CVI_XLSHF]>], [9, 5],
[HVX_FWD, HVX_FWD]>,
- InstrItinData <tc_05058f6f, /*SLOT1,LOAD,VA_DV*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
- InstrItinData <tc_05ac6f98, /*SLOT1,LOAD,VA*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
- [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
-
InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
@@ -194,6 +184,11 @@
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
@@ -341,6 +336,12 @@
InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
[HVX_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_7095ecba, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
@@ -372,6 +373,12 @@
InstrStage<1, [CVI_ALL]>], [3, 2],
[HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_7d68d5c2, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
@@ -430,6 +437,18 @@
InstrStage<1, [CVI_ZW]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a69eeee1, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [SLOT1], 0>,
@@ -478,6 +497,11 @@
InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
[HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
[InstrStage<1, [SLOT0, SLOT1], 0>,
InstrStage<1, [CVI_LD], 0>,
@@ -560,12 +584,6 @@
InstrStage<1, [CVI_ZW]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e8797b98, /*SLOT1,LOAD,VA*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
@@ -581,13 +599,7 @@
InstrStage<1, [SLOT1], 0>,
InstrStage<1, [CVI_ST], 0>,
InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
- InstrItinData <tc_fd7610da, /*SLOT1,LOAD,VA_DV*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
- [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
+ [Hex_FWD, Hex_FWD, HVX_FWD]>
];
}
@@ -598,18 +610,6 @@
InstrStage<1, [CVI_XLSHF]>], [9, 5],
[HVX_FWD, HVX_FWD]>,
- InstrItinData <tc_05058f6f, /*SLOT1,LOAD,VA_DV*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
- InstrItinData <tc_05ac6f98, /*SLOT1,LOAD,VA*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
- [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
-
InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
@@ -685,6 +685,11 @@
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
@@ -832,6 +837,12 @@
InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
[HVX_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_7095ecba, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
@@ -863,6 +874,12 @@
InstrStage<1, [CVI_ALL]>], [3, 2],
[HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_7d68d5c2, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
@@ -921,6 +938,18 @@
InstrStage<1, [CVI_ZW]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a69eeee1, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [SLOT1], 0>,
@@ -969,6 +998,11 @@
InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
[HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
[InstrStage<1, [SLOT0, SLOT1], 0>,
InstrStage<1, [CVI_LD], 0>,
@@ -1051,12 +1085,6 @@
InstrStage<1, [CVI_ZW]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e8797b98, /*SLOT1,LOAD,VA*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
@@ -1072,13 +1100,7 @@
InstrStage<1, [SLOT1], 0>,
InstrStage<1, [CVI_ST], 0>,
InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
- InstrItinData <tc_fd7610da, /*SLOT1,LOAD,VA_DV*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
- [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
+ [Hex_FWD, Hex_FWD, HVX_FWD]>
];
}
@@ -1089,18 +1111,6 @@
InstrStage<1, [CVI_XLSHF]>], [9, 5],
[HVX_FWD, HVX_FWD]>,
- InstrItinData <tc_05058f6f, /*SLOT1,LOAD,VA_DV*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
- InstrItinData <tc_05ac6f98, /*SLOT1,LOAD,VA*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
- [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
-
InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
@@ -1176,6 +1186,11 @@
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
@@ -1323,6 +1338,12 @@
InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
[HVX_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_7095ecba, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
@@ -1354,6 +1375,12 @@
InstrStage<1, [CVI_ALL]>], [3, 2],
[HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_7d68d5c2, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
@@ -1412,6 +1439,18 @@
InstrStage<1, [CVI_ZW]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a69eeee1, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [SLOT1], 0>,
@@ -1460,6 +1499,11 @@
InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
[HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
[InstrStage<1, [SLOT0, SLOT1], 0>,
InstrStage<1, [CVI_LD], 0>,
@@ -1542,12 +1586,6 @@
InstrStage<1, [CVI_ZW]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e8797b98, /*SLOT1,LOAD,VA*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
@@ -1563,13 +1601,7 @@
InstrStage<1, [SLOT1], 0>,
InstrStage<1, [CVI_ST], 0>,
InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
- InstrItinData <tc_fd7610da, /*SLOT1,LOAD,VA_DV*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
- [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
+ [Hex_FWD, Hex_FWD, HVX_FWD]>
];
}
@@ -1580,18 +1612,6 @@
InstrStage<1, [CVI_XLSHF]>], [9, 5],
[HVX_FWD, HVX_FWD]>,
- InstrItinData <tc_05058f6f, /*SLOT1,LOAD,VA_DV*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
- InstrItinData <tc_05ac6f98, /*SLOT1,LOAD,VA*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
- [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
-
InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
@@ -1667,6 +1687,11 @@
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
@@ -1814,6 +1839,12 @@
InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
[HVX_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_7095ecba, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
@@ -1845,6 +1876,12 @@
InstrStage<1, [CVI_ALL]>], [3, 2],
[HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_7d68d5c2, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
@@ -1903,6 +1940,18 @@
InstrStage<1, [CVI_ZW]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a69eeee1, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [SLOT1], 0>,
@@ -1951,6 +2000,11 @@
InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
[HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
[InstrStage<1, [SLOT0, SLOT1], 0>,
InstrStage<1, [CVI_LD], 0>,
@@ -2033,12 +2087,6 @@
InstrStage<1, [CVI_ZW]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e8797b98, /*SLOT1,LOAD,VA*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
@@ -2054,13 +2102,7 @@
InstrStage<1, [SLOT1], 0>,
InstrStage<1, [CVI_ST], 0>,
InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
- InstrItinData <tc_fd7610da, /*SLOT1,LOAD,VA_DV*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
- [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
+ [Hex_FWD, Hex_FWD, HVX_FWD]>
];
}
@@ -2071,18 +2113,6 @@
InstrStage<1, [CVI_XLSHF]>], [9, 5],
[HVX_FWD, HVX_FWD]>,
- InstrItinData <tc_05058f6f, /*SLOT1,LOAD,VA_DV*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
- InstrItinData <tc_05ac6f98, /*SLOT1,LOAD,VA*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
- [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
-
InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
@@ -2158,6 +2188,11 @@
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
@@ -2305,6 +2340,12 @@
InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
[HVX_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_7095ecba, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
@@ -2336,6 +2377,12 @@
InstrStage<1, [CVI_ALL]>], [3, 2],
[HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_7d68d5c2, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
@@ -2394,6 +2441,18 @@
InstrStage<1, [CVI_ZW]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a69eeee1, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [SLOT1], 0>,
@@ -2442,6 +2501,11 @@
InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
[HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
[InstrStage<1, [SLOT0, SLOT1], 0>,
InstrStage<1, [CVI_LD], 0>,
@@ -2524,12 +2588,6 @@
InstrStage<1, [CVI_ZW]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e8797b98, /*SLOT1,LOAD,VA*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
@@ -2545,13 +2603,7 @@
InstrStage<1, [SLOT1], 0>,
InstrStage<1, [CVI_ST], 0>,
InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
- InstrItinData <tc_fd7610da, /*SLOT1,LOAD,VA_DV*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
- [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
+ [Hex_FWD, Hex_FWD, HVX_FWD]>
];
}
@@ -2562,18 +2614,6 @@
InstrStage<1, [CVI_XLSHF]>], [9, 5],
[HVX_FWD, HVX_FWD]>,
- InstrItinData <tc_05058f6f, /*SLOT1,LOAD,VA_DV*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
- InstrItinData <tc_05ac6f98, /*SLOT1,LOAD,VA*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
- [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
-
InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
@@ -2649,6 +2689,11 @@
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
@@ -2796,6 +2841,12 @@
InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
[HVX_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_7095ecba, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
@@ -2827,6 +2878,12 @@
InstrStage<1, [CVI_ALL]>], [3, 2],
[HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_7d68d5c2, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
@@ -2885,6 +2942,18 @@
InstrStage<1, [CVI_ZW]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a69eeee1, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [SLOT1], 0>,
@@ -2933,6 +3002,11 @@
InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
[HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
[InstrStage<1, [SLOT0, SLOT1], 0>,
InstrStage<1, [CVI_LD], 0>,
@@ -3015,12 +3089,507 @@
InstrStage<1, [CVI_ZW]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e8797b98, /*SLOT1,LOAD,VA*/
- [InstrStage<1, [SLOT1], 0>,
+ InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>
+ ];
+}
+
+class DepHVXItinV68 {
+ list<InstrItinData> DepHVXItinV68_list = [
+ InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [],
+ []>,
+
+ InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
+ [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
+ [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2],
+ [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_309dbb4f, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3904b926, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3ce09744, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_540c3da3, /*SLOT0,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
+ [Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_56e64202, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_649072c2, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7095ecba, /*SLOT01,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7177e272, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
+ [HVX_FWD]>,
+
+ InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7417e785, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7d68d5c2, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_8772086c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_946013d8, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a28f32b5, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
InstrStage<1, [CVI_LD], 0>,
InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
[Hex_FWD, Hex_FWD, HVX_FWD]>,
+ InstrItinData <tc_a69eeee1, /*SLOT01,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ab23f776, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac4046bc, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b091f1c6, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c127de3a, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_c4edf264, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3],
+ [HVX_FWD]>,
+
+ InstrItinData <tc_e675c45a, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e699ae41, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
@@ -3036,12 +3605,6 @@
InstrStage<1, [SLOT1], 0>,
InstrStage<1, [CVI_ST], 0>,
InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
- [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
- InstrItinData <tc_fd7610da, /*SLOT1,LOAD,VA_DV*/
- [InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
- [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
+ [Hex_FWD, Hex_FWD, HVX_FWD]>
];
}
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
index fecccb2..a376665 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
@@ -19,8 +19,6 @@
def tc_0ba0d5da : InstrItinClass;
def tc_0dfac0a7 : InstrItinClass;
def tc_0fac1eb8 : InstrItinClass;
-def tc_1044324a : InstrItinClass;
-def tc_10b884b7 : InstrItinClass;
def tc_112d30d6 : InstrItinClass;
def tc_1242dc2a : InstrItinClass;
def tc_1248597c : InstrItinClass;
@@ -29,26 +27,22 @@
def tc_158aa3f7 : InstrItinClass;
def tc_197dce51 : InstrItinClass;
def tc_1981450d : InstrItinClass;
-def tc_1b8138fc : InstrItinClass;
def tc_1c2c7a4a : InstrItinClass;
def tc_1c7522a8 : InstrItinClass;
def tc_1d41f8b7 : InstrItinClass;
-def tc_1e7875f0 : InstrItinClass;
def tc_1fcb8495 : InstrItinClass;
def tc_1fe4ab69 : InstrItinClass;
def tc_20131976 : InstrItinClass;
def tc_2237d952 : InstrItinClass;
-def tc_234f8560 : InstrItinClass;
def tc_23708a21 : InstrItinClass;
+def tc_2471c1c8 : InstrItinClass;
def tc_24e109c7 : InstrItinClass;
def tc_24f426ab : InstrItinClass;
-def tc_27106296 : InstrItinClass;
def tc_280f7fe1 : InstrItinClass;
def tc_28e55c6f : InstrItinClass;
def tc_2c13e7f5 : InstrItinClass;
def tc_2c3e17fc : InstrItinClass;
def tc_2f573607 : InstrItinClass;
-def tc_2f669c77 : InstrItinClass;
def tc_362b0be2 : InstrItinClass;
def tc_38382228 : InstrItinClass;
def tc_388f9897 : InstrItinClass;
@@ -70,7 +64,7 @@
def tc_4a55d03c : InstrItinClass;
def tc_4abdbdc6 : InstrItinClass;
def tc_4ac61d92 : InstrItinClass;
-def tc_4c1520ae : InstrItinClass;
+def tc_4bf903b0 : InstrItinClass;
def tc_503ce0f3 : InstrItinClass;
def tc_53c851ab : InstrItinClass;
def tc_5502c366 : InstrItinClass;
@@ -85,7 +79,6 @@
def tc_5a4b5e58 : InstrItinClass;
def tc_5b347363 : InstrItinClass;
def tc_5ceb2f9e : InstrItinClass;
-def tc_5d636bc7 : InstrItinClass;
def tc_5da50c4b : InstrItinClass;
def tc_5deb5e47 : InstrItinClass;
def tc_5e4cf0e8 : InstrItinClass;
@@ -101,7 +94,6 @@
def tc_6d861a95 : InstrItinClass;
def tc_6e20402a : InstrItinClass;
def tc_6f42bc60 : InstrItinClass;
-def tc_6fb32599 : InstrItinClass;
def tc_6fc5dbea : InstrItinClass;
def tc_711c805f : InstrItinClass;
def tc_713b66bf : InstrItinClass;
@@ -111,11 +103,10 @@
def tc_76bb5435 : InstrItinClass;
def tc_77f94a5e : InstrItinClass;
def tc_788b1d09 : InstrItinClass;
+def tc_7af3a37e : InstrItinClass;
def tc_7b9187d3 : InstrItinClass;
def tc_7c31e19a : InstrItinClass;
def tc_7c6d32e4 : InstrItinClass;
-def tc_7dc63b5c : InstrItinClass;
-def tc_7dcd9d89 : InstrItinClass;
def tc_7f7f45f5 : InstrItinClass;
def tc_7f8ae742 : InstrItinClass;
def tc_8035e91f : InstrItinClass;
@@ -130,7 +121,6 @@
def tc_8b5bd4f5 : InstrItinClass;
def tc_8e82e8ca : InstrItinClass;
def tc_9124c04f : InstrItinClass;
-def tc_9165014d : InstrItinClass;
def tc_92240447 : InstrItinClass;
def tc_934753bb : InstrItinClass;
def tc_937dd41c : InstrItinClass;
@@ -139,7 +129,6 @@
def tc_96ef76ef : InstrItinClass;
def tc_975a4e54 : InstrItinClass;
def tc_9783714b : InstrItinClass;
-def tc_988416e3 : InstrItinClass;
def tc_9b34f5e0 : InstrItinClass;
def tc_9b3c0462 : InstrItinClass;
def tc_9bcfb2ee : InstrItinClass;
@@ -167,9 +156,7 @@
def tc_ae5babd7 : InstrItinClass;
def tc_aee6250c : InstrItinClass;
def tc_b1ae5f67 : InstrItinClass;
-def tc_b34eb232 : InstrItinClass;
def tc_b4dc7630 : InstrItinClass;
-def tc_b570493d : InstrItinClass;
def tc_b7c4062a : InstrItinClass;
def tc_b837298f : InstrItinClass;
def tc_ba9255a6 : InstrItinClass;
@@ -186,7 +173,6 @@
def tc_d33e5eee : InstrItinClass;
def tc_d3632d88 : InstrItinClass;
def tc_d45ba9cd : InstrItinClass;
-def tc_d47648a2 : InstrItinClass;
def tc_d57d649c : InstrItinClass;
def tc_d61dfdc3 : InstrItinClass;
def tc_d68dca5c : InstrItinClass;
@@ -195,7 +181,6 @@
def tc_db96aa6b : InstrItinClass;
def tc_dc51281d : InstrItinClass;
def tc_decdde8a : InstrItinClass;
-def tc_df4536ae : InstrItinClass;
def tc_df5d53f9 : InstrItinClass;
def tc_e3d699e3 : InstrItinClass;
def tc_e9170fb7 : InstrItinClass;
@@ -228,8 +213,6 @@
InstrItinData <tc_0ba0d5da, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_0dfac0a7, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_0fac1eb8, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_1044324a, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_10b884b7, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_112d30d6, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_1242dc2a, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_1248597c, [InstrStage<1, [SLOT3]>]>,
@@ -238,26 +221,22 @@
InstrItinData <tc_158aa3f7, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_197dce51, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_1981450d, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_1b8138fc, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_1c2c7a4a, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_1c7522a8, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_1d41f8b7, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_1e7875f0, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_1fcb8495, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_1fe4ab69, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_20131976, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_2237d952, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_234f8560, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_23708a21, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_2471c1c8, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_24e109c7, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_24f426ab, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_27106296, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_280f7fe1, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_28e55c6f, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_2c13e7f5, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_2c3e17fc, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_2f573607, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_2f669c77, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_362b0be2, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_38382228, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_388f9897, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
@@ -279,7 +258,7 @@
InstrItinData <tc_4a55d03c, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_4abdbdc6, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_4ac61d92, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_4c1520ae, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_4bf903b0, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_503ce0f3, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_53c851ab, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_5502c366, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -294,7 +273,6 @@
InstrItinData <tc_5a4b5e58, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_5b347363, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_5ceb2f9e, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_5d636bc7, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_5da50c4b, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_5deb5e47, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_5e4cf0e8, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -310,7 +288,6 @@
InstrItinData <tc_6d861a95, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_6e20402a, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_6f42bc60, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_6fb32599, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_6fc5dbea, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_711c805f, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_713b66bf, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
@@ -320,11 +297,10 @@
InstrItinData <tc_76bb5435, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_77f94a5e, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_788b1d09, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_7af3a37e, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_7b9187d3, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_7c31e19a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_7c6d32e4, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_7dc63b5c, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_7dcd9d89, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_7f7f45f5, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_7f8ae742, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_8035e91f, [InstrStage<1, [SLOT0, SLOT1]>]>,
@@ -339,7 +315,6 @@
InstrItinData <tc_8b5bd4f5, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_8e82e8ca, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_9124c04f, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_9165014d, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_92240447, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_934753bb, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_937dd41c, [InstrStage<1, [SLOT0, SLOT1]>]>,
@@ -348,7 +323,6 @@
InstrItinData <tc_96ef76ef, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_975a4e54, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_9783714b, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_988416e3, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_9b34f5e0, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_9b3c0462, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_9bcfb2ee, [InstrStage<1, [SLOT0]>]>,
@@ -376,9 +350,7 @@
InstrItinData <tc_ae5babd7, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_aee6250c, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_b1ae5f67, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_b34eb232, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_b4dc7630, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_b570493d, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_b7c4062a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_b837298f, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_ba9255a6, [InstrStage<1, [SLOT0, SLOT1]>]>,
@@ -395,7 +367,6 @@
InstrItinData <tc_d33e5eee, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_d3632d88, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_d45ba9cd, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_d47648a2, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_d57d649c, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_d61dfdc3, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_d68dca5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -404,7 +375,6 @@
InstrItinData <tc_db96aa6b, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_dc51281d, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_decdde8a, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_df4536ae, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_df5d53f9, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e3d699e3, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e9170fb7, [InstrStage<1, [SLOT0, SLOT1]>]>,
@@ -471,14 +441,6 @@
[InstrStage<1, [SLOT0]>], [3, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1044324a, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_10b884b7, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_112d30d6, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
@@ -511,10 +473,6 @@
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_1b8138fc, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1c2c7a4a, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -527,10 +485,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1e7875f0, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1fcb8495, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -547,14 +501,14 @@
[InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_234f8560, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_23708a21, /*tc_2early*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_2471c1c8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_24e109c7, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -563,10 +517,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_27106296, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_280f7fe1, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -587,10 +537,6 @@
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2f669c77, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_362b0be2, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -675,9 +621,9 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c1520ae, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4bf903b0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
InstrItinData <tc_503ce0f3, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
@@ -735,10 +681,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d636bc7, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_5da50c4b, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -799,10 +741,6 @@
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6fb32599, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_6fc5dbea, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -839,6 +777,10 @@
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_7af3a37e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_7b9187d3, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
@@ -851,14 +793,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7dc63b5c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7dcd9d89, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_7f7f45f5, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -915,10 +849,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9165014d, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_92240447, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -951,10 +881,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_988416e3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_9b34f5e0, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [],
[]>,
@@ -1063,18 +989,10 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_b34eb232, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_b4dc7630, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b570493d, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b7c4062a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1139,10 +1057,6 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_d47648a2, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_d57d649c, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -1175,10 +1089,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_df4536ae, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_df5d53f9, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [4, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1299,14 +1209,6 @@
[InstrStage<1, [SLOT0]>], [3, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1044324a, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_10b884b7, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_112d30d6, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
@@ -1339,10 +1241,6 @@
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_1b8138fc, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1c2c7a4a, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1355,10 +1253,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1e7875f0, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1fcb8495, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1375,14 +1269,14 @@
[InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_234f8560, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_23708a21, /*tc_2early*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_2471c1c8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_24e109c7, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1391,10 +1285,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_27106296, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_280f7fe1, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1415,10 +1305,6 @@
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2f669c77, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_362b0be2, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -1503,9 +1389,9 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c1520ae, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4bf903b0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
InstrItinData <tc_503ce0f3, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
@@ -1563,10 +1449,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d636bc7, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_5da50c4b, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1627,10 +1509,6 @@
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6fb32599, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_6fc5dbea, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1667,6 +1545,10 @@
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_7af3a37e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_7b9187d3, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1679,14 +1561,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7dc63b5c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7dcd9d89, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_7f7f45f5, /*tc_4x*/
[InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1743,10 +1617,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9165014d, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_92240447, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1779,10 +1649,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_988416e3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_9b34f5e0, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [],
[]>,
@@ -1891,18 +1757,10 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_b34eb232, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_b4dc7630, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b570493d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b7c4062a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1967,10 +1825,6 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_d47648a2, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_d57d649c, /*tc_3stall*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -2003,10 +1857,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_df4536ae, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_df5d53f9, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2127,15 +1977,6 @@
[InstrStage<1, [SLOT0]>], [3, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1044324a, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_10b884b7, /*tc_3stall*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [],
- []>,
-
InstrItinData <tc_112d30d6, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
@@ -2170,10 +2011,6 @@
InstrStage<1, [CVI_ST]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_1b8138fc, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1c2c7a4a, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2186,10 +2023,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1e7875f0, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1fcb8495, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2206,15 +2039,15 @@
[InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_234f8560, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_23708a21, /*tc_2early*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_ST]>], [],
[]>,
+ InstrItinData <tc_2471c1c8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_24e109c7, /*tc_newvjump*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [CVI_ST]>], [3, 2, 2],
@@ -2225,10 +2058,6 @@
InstrStage<1, [CVI_ST]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_27106296, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_280f7fe1, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2250,10 +2079,6 @@
InstrStage<1, [CVI_ST]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2f669c77, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_362b0be2, /*tc_2early*/
[InstrStage<1, [SLOT2], 0>,
InstrStage<1, [CVI_ST]>], [1],
@@ -2343,9 +2168,9 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c1520ae, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4bf903b0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
InstrItinData <tc_503ce0f3, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
@@ -2407,10 +2232,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d636bc7, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_5da50c4b, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2473,10 +2294,6 @@
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6fb32599, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_6fc5dbea, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2514,6 +2331,10 @@
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_7af3a37e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_7b9187d3, /*tc_newvjump*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [CVI_ST]>], [3, 2],
@@ -2527,14 +2348,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7dc63b5c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7dcd9d89, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_7f7f45f5, /*tc_4x*/
[InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2591,10 +2404,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9165014d, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_92240447, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2628,10 +2437,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_988416e3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_9b34f5e0, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [],
[]>,
@@ -2742,18 +2547,10 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_b34eb232, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_b4dc7630, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b570493d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b7c4062a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2819,10 +2616,6 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_d47648a2, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_d57d649c, /*tc_3stall*/
[InstrStage<1, [SLOT2], 0>,
InstrStage<1, [CVI_ST]>], [2],
@@ -2858,10 +2651,6 @@
InstrStage<1, [CVI_ST]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_df4536ae, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_df5d53f9, /*tc_newvjump*/
[InstrStage<1, [SLOT0], 0>,
InstrStage<1, [CVI_ST]>], [3, 2, 2],
@@ -2988,14 +2777,6 @@
[InstrStage<1, [SLOT0]>], [3, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1044324a, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_10b884b7, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_112d30d6, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
@@ -3028,10 +2809,6 @@
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_1b8138fc, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1c2c7a4a, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3044,10 +2821,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1e7875f0, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1fcb8495, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3064,14 +2837,14 @@
[InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_234f8560, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_23708a21, /*tc_2early*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_2471c1c8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_24e109c7, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3080,10 +2853,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_27106296, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_280f7fe1, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3104,10 +2873,6 @@
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2f669c77, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_362b0be2, /*tc_3*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -3192,9 +2957,9 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c1520ae, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4bf903b0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
InstrItinData <tc_503ce0f3, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
@@ -3252,10 +3017,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d636bc7, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_5da50c4b, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3316,10 +3077,6 @@
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6fb32599, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_6fc5dbea, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3356,6 +3113,10 @@
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_7af3a37e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_7b9187d3, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3368,14 +3129,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7dc63b5c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7dcd9d89, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_7f7f45f5, /*tc_4x*/
[InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3432,10 +3185,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9165014d, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_92240447, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3468,10 +3217,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_988416e3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_9b34f5e0, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [],
[]>,
@@ -3580,18 +3325,10 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_b34eb232, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_b4dc7630, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b570493d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b7c4062a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3656,10 +3393,6 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_d47648a2, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_d57d649c, /*tc_3stall*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -3692,10 +3425,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_df4536ae, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_df5d53f9, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3816,14 +3545,6 @@
[InstrStage<1, [SLOT0]>], [3, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1044324a, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_10b884b7, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_112d30d6, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
@@ -3856,10 +3577,6 @@
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_1b8138fc, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1c2c7a4a, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3872,10 +3589,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1e7875f0, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1fcb8495, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3892,14 +3605,14 @@
[InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_234f8560, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_23708a21, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_2471c1c8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_24e109c7, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3908,10 +3621,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_27106296, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_280f7fe1, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3932,10 +3641,6 @@
[InstrStage<1, [SLOT2]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2f669c77, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_362b0be2, /*tc_3*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -4020,9 +3725,9 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c1520ae, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4bf903b0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
InstrItinData <tc_503ce0f3, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
@@ -4080,10 +3785,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d636bc7, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_5da50c4b, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4144,10 +3845,6 @@
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6fb32599, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_6fc5dbea, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4184,6 +3881,10 @@
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_7af3a37e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_7b9187d3, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
@@ -4196,14 +3897,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7dc63b5c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7dcd9d89, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_7f7f45f5, /*tc_4x*/
[InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4260,10 +3953,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9165014d, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_92240447, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4296,10 +3985,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_988416e3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_9b34f5e0, /*tc_3stall*/
[InstrStage<1, [SLOT2]>], [],
[]>,
@@ -4408,18 +4093,10 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_b34eb232, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_b4dc7630, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b570493d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b7c4062a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4484,10 +4161,6 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_d47648a2, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_d57d649c, /*tc_3stall*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -4520,10 +4193,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_df4536ae, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_df5d53f9, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4644,14 +4313,6 @@
[InstrStage<1, [SLOT0]>], [3, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1044324a, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_10b884b7, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_112d30d6, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
@@ -4684,10 +4345,6 @@
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_1b8138fc, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1c2c7a4a, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4700,10 +4357,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1e7875f0, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1fcb8495, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4720,14 +4373,14 @@
[InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_234f8560, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_23708a21, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_2471c1c8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_24e109c7, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4736,10 +4389,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_27106296, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_280f7fe1, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4760,10 +4409,6 @@
[InstrStage<1, [SLOT2]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2f669c77, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_362b0be2, /*tc_3*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -4848,9 +4493,9 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c1520ae, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4bf903b0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
InstrItinData <tc_503ce0f3, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
@@ -4908,10 +4553,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d636bc7, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_5da50c4b, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4972,10 +4613,6 @@
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6fb32599, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_6fc5dbea, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5012,6 +4649,10 @@
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_7af3a37e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_7b9187d3, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
@@ -5024,14 +4665,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7dc63b5c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7dcd9d89, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 3],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_7f7f45f5, /*tc_4x*/
[InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5088,10 +4721,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9165014d, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_92240447, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5124,10 +4753,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_988416e3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3],
- [Hex_FWD]>,
-
InstrItinData <tc_9b34f5e0, /*tc_3stall*/
[InstrStage<1, [SLOT2]>], [],
[]>,
@@ -5236,18 +4861,10 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_b34eb232, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_b4dc7630, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b570493d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b7c4062a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5312,10 +4929,6 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_d47648a2, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_d57d649c, /*tc_3stall*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -5348,10 +4961,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_df4536ae, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_df5d53f9, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5472,14 +5081,6 @@
[InstrStage<1, [SLOT0]>], [3, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1044324a, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_10b884b7, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_112d30d6, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
@@ -5512,10 +5113,6 @@
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_1b8138fc, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1c2c7a4a, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5528,10 +5125,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1e7875f0, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1fcb8495, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5548,14 +5141,14 @@
[InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_234f8560, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_23708a21, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_2471c1c8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_24e109c7, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5564,10 +5157,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_27106296, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_280f7fe1, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5588,10 +5177,6 @@
[InstrStage<1, [SLOT2]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2f669c77, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_362b0be2, /*tc_3*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -5676,9 +5261,9 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c1520ae, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4bf903b0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
InstrItinData <tc_503ce0f3, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
@@ -5736,10 +5321,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d636bc7, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_5da50c4b, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5800,10 +5381,6 @@
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6fb32599, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_6fc5dbea, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5840,6 +5417,10 @@
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_7af3a37e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_7b9187d3, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
@@ -5852,14 +5433,6 @@
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7dc63b5c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7dcd9d89, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 3],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_7f7f45f5, /*tc_4x*/
[InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5916,10 +5489,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9165014d, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_92240447, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -5952,10 +5521,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_988416e3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3],
- [Hex_FWD]>,
-
InstrItinData <tc_9b34f5e0, /*tc_3stall*/
[InstrStage<1, [SLOT2]>], [],
[]>,
@@ -6064,18 +5629,10 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_b34eb232, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_b4dc7630, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b570493d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b7c4062a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -6140,10 +5697,6 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_d47648a2, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_d57d649c, /*tc_3stall*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -6176,10 +5729,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_df4536ae, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_df5d53f9, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -6300,14 +5849,6 @@
[InstrStage<1, [SLOT0]>], [3, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1044324a, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_10b884b7, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_112d30d6, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
@@ -6340,10 +5881,6 @@
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_1b8138fc, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1c2c7a4a, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -6356,10 +5893,6 @@
[InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1e7875f0, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1fcb8495, /*tc_2*/
[InstrStage<1, [SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -6376,14 +5909,14 @@
[InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_234f8560, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_23708a21, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_2471c1c8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_24e109c7, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -6392,10 +5925,6 @@
[InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_27106296, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_280f7fe1, /*tc_st*/
[InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -6416,10 +5945,6 @@
[InstrStage<1, [SLOT2]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2f669c77, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_362b0be2, /*tc_3*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -6504,9 +6029,9 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c1520ae, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4bf903b0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
InstrItinData <tc_503ce0f3, /*tc_3x*/
[InstrStage<1, [SLOT3]>], [4, 2, 2, 1],
@@ -6564,10 +6089,6 @@
[InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d636bc7, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_5da50c4b, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -6628,10 +6149,6 @@
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6fb32599, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_6fc5dbea, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -6668,6 +6185,10 @@
[InstrStage<1, [SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_7af3a37e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_7b9187d3, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
@@ -6680,14 +6201,6 @@
[InstrStage<1, [SLOT0]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7dc63b5c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7dcd9d89, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 3],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_7f7f45f5, /*tc_4x*/
[InstrStage<1, [SLOT3]>], [5, 5, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -6744,10 +6257,6 @@
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9165014d, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_92240447, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -6780,10 +6289,6 @@
[InstrStage<1, [SLOT3]>], [5, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_988416e3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3],
- [Hex_FWD]>,
-
InstrItinData <tc_9b34f5e0, /*tc_3stall*/
[InstrStage<1, [SLOT2]>], [],
[]>,
@@ -6892,18 +6397,10 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_b34eb232, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_b4dc7630, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b570493d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b7c4062a, /*tc_ld*/
[InstrStage<1, [SLOT0]>], [4, 3, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -6968,10 +6465,6 @@
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_d47648a2, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_d57d649c, /*tc_3stall*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -7004,10 +6497,6 @@
[InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_df4536ae, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_df5d53f9, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -7081,3 +6570,771 @@
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
];
}
+
+class DepScalarItinV68 {
+ list<InstrItinData> DepScalarItinV68_list = [
+ InstrItinData <tc_011e0e9d, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01d44cb2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01e1be3b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_02fe1c65, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0655b949, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_075c8dd8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0a195f2c, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0a6c20ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ba0d5da, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0dfac0a7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0fac1eb8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_112d30d6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1242dc2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1248597c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_14ab4f41, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_151bf368, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_158aa3f7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_197dce51, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1981450d, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c7522a8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1d41f8b7, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fcb8495, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fe4ab69, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_20131976, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2237d952, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_23708a21, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_2471c1c8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_24e109c7, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_24f426ab, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_280f7fe1, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_28e55c6f, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c13e7f5, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c3e17fc, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_2f573607, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_362b0be2, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_38382228, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_388f9897, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_38e0bae9, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d14a17b, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3edca78f, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3fbf1042, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_407e96f9, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_40d64c94, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4222e6bf, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_42ff66ba, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_442395f3, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_449acf79, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44d5a428, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44fffc58, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_45791fb8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_45f9d1be, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_49fdfd4b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4a55d03c, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4abdbdc6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4ac61d92, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4bf903b0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_503ce0f3, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_53c851ab, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5502c366, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55255f2b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_556f6577, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55a9a350, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55b33fda, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_56a124a7, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_57a55b54, /*tc_1*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5944960d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_59a7822c, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5a4b5e58, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5b347363, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5ceb2f9e, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5da50c4b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5deb5e47, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5e4cf0e8, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5f2afaf7, /*tc_latepredldaia*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_60e324ff, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_63567288, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_64b00d8a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_651cbe02, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65279839, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65cbd974, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_69bfb303, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6ae3426b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6d861a95, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6e20402a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6f42bc60, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fc5dbea, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_711c805f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_713b66bf, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7401744f, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7476d766, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_74a42bda, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_76bb5435, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_77f94a5e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_788b1d09, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7af3a37e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7b9187d3, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c31e19a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c6d32e4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f7f45f5, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f8ae742, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8035e91f, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_822c3c68, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_829d8a86, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_838c4d7a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_84a7500d, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_86173609, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_887d1bb7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8a6d0d94, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8a825db2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8b5bd4f5, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8e82e8ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9124c04f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_92240447, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_934753bb, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_937dd41c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
+ []>,
+
+ InstrItinData <tc_9406230a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_95a33176, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_96ef76ef, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_975a4e54, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9783714b, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9b34f5e0, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_9b3c0462, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9bcfb2ee, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c52f549, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e27f2f9, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e72dc89, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edb7c77, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edefe01, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f6cd987, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a08b630b, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a1297125, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a154b476, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a2b365d2, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a3070909, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a32e03e7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a38c45dc, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4e22bbd, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4ee89db, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_a7a13fac, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7bdb22c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a9edeffa, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_abfd9a6d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac65613f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_addc37a8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae5babd7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_aee6250c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b1ae5f67, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b4dc7630, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b7c4062a, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b837298f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_ba9255a6, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb07f2c5, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb831a7c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bf2ffc0f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c20701f0, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c21d7447, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c57d9f39, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c818ff7f, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_ce59038e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cfa0e29b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d03278fd, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d33e5eee, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d3632d88, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d45ba9cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d57d649c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d61dfdc3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d68dca5c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d7718fbe, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_db596beb, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_db96aa6b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_dc51281d, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_decdde8a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_df5d53f9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e3d699e3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e9170fb7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ed03645c, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eed07714, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eeda4109, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef921005, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f098b237, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0cdeccf, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0e8e832, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f34c1c21, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f38f92e1, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f529831b, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f6e2aff9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f7569068, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f999c66e, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fae9dfa5, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fedb7e19, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
+ ];
+}
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepITypes.h b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepITypes.h
index b261b46..54e046d 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepITypes.h
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepITypes.h
@@ -8,6 +8,10 @@
// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPITYPES_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPITYPES_H
+
namespace llvm {
namespace HexagonII {
enum Type {
@@ -16,48 +20,50 @@
TypeALU32_ADDI = 2,
TypeALU64 = 3,
TypeCJ = 4,
- TypeCR = 7,
- TypeCVI_4SLOT_MPY = 8,
- TypeCVI_GATHER = 9,
- TypeCVI_GATHER_DV = 10,
- TypeCVI_GATHER_RST = 11,
- TypeCVI_HIST = 12,
- TypeCVI_SCATTER = 13,
- TypeCVI_SCATTER_DV = 14,
- TypeCVI_SCATTER_NEW_RST = 15,
- TypeCVI_SCATTER_NEW_ST = 16,
- TypeCVI_SCATTER_RST = 17,
- TypeCVI_VA = 18,
- TypeCVI_VA_DV = 19,
- TypeCVI_VM_LD = 20,
- TypeCVI_VM_NEW_ST = 21,
- TypeCVI_VM_ST = 22,
- TypeCVI_VM_STU = 23,
- TypeCVI_VM_TMP_LD = 24,
- TypeCVI_VM_VP_LDU = 25,
- TypeCVI_VP = 26,
- TypeCVI_VP_VS = 27,
- TypeCVI_VS = 28,
- TypeCVI_VS_VX = 29,
- TypeCVI_VX = 30,
- TypeCVI_VX_DV = 31,
- TypeCVI_VX_LATE = 32,
- TypeCVI_ZW = 33,
- TypeDUPLEX = 34,
- TypeENDLOOP = 35,
- TypeEXTENDER = 36,
- TypeJ = 37,
- TypeLD = 38,
- TypeM = 39,
- TypeMAPPING = 40,
- TypeNCJ = 41,
- TypePSEUDO = 42,
- TypeST = 43,
- TypeSUBINSN = 44,
- TypeS_2op = 45,
- TypeS_3op = 46,
- TypeV2LDST = 49,
- TypeV4LDST = 50,
+ TypeCR = 5,
+ TypeCVI_4SLOT_MPY = 6,
+ TypeCVI_GATHER = 7,
+ TypeCVI_GATHER_DV = 8,
+ TypeCVI_GATHER_RST = 9,
+ TypeCVI_HIST = 10,
+ TypeCVI_SCATTER = 11,
+ TypeCVI_SCATTER_DV = 12,
+ TypeCVI_SCATTER_NEW_RST = 13,
+ TypeCVI_SCATTER_NEW_ST = 14,
+ TypeCVI_SCATTER_RST = 15,
+ TypeCVI_VA = 16,
+ TypeCVI_VA_DV = 17,
+ TypeCVI_VM_LD = 18,
+ TypeCVI_VM_NEW_ST = 19,
+ TypeCVI_VM_ST = 20,
+ TypeCVI_VM_STU = 21,
+ TypeCVI_VM_TMP_LD = 22,
+ TypeCVI_VM_VP_LDU = 23,
+ TypeCVI_VP = 24,
+ TypeCVI_VP_VS = 25,
+ TypeCVI_VS = 26,
+ TypeCVI_VS_VX = 27,
+ TypeCVI_VX = 28,
+ TypeCVI_VX_DV = 29,
+ TypeCVI_VX_LATE = 30,
+ TypeCVI_ZW = 31,
+ TypeDUPLEX = 32,
+ TypeENDLOOP = 33,
+ TypeEXTENDER = 34,
+ TypeJ = 35,
+ TypeLD = 36,
+ TypeM = 37,
+ TypeMAPPING = 38,
+ TypeNCJ = 39,
+ TypePSEUDO = 40,
+ TypeST = 41,
+ TypeSUBINSN = 42,
+ TypeS_2op = 43,
+ TypeS_3op = 44,
+ TypeV2LDST = 47,
+ TypeV4LDST = 48,
};
}
}
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPITYPES_H
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepITypes.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepITypes.td
index f251a29..8d2b46d 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepITypes.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepITypes.td
@@ -14,45 +14,45 @@
def TypeALU32_ADDI : IType<2>;
def TypeALU64 : IType<3>;
def TypeCJ : IType<4>;
-def TypeCR : IType<7>;
-def TypeCVI_4SLOT_MPY : IType<8>;
-def TypeCVI_GATHER : IType<9>;
-def TypeCVI_GATHER_DV : IType<10>;
-def TypeCVI_GATHER_RST : IType<11>;
-def TypeCVI_HIST : IType<12>;
-def TypeCVI_SCATTER : IType<13>;
-def TypeCVI_SCATTER_DV : IType<14>;
-def TypeCVI_SCATTER_NEW_RST : IType<15>;
-def TypeCVI_SCATTER_NEW_ST : IType<16>;
-def TypeCVI_SCATTER_RST : IType<17>;
-def TypeCVI_VA : IType<18>;
-def TypeCVI_VA_DV : IType<19>;
-def TypeCVI_VM_LD : IType<20>;
-def TypeCVI_VM_NEW_ST : IType<21>;
-def TypeCVI_VM_ST : IType<22>;
-def TypeCVI_VM_STU : IType<23>;
-def TypeCVI_VM_TMP_LD : IType<24>;
-def TypeCVI_VM_VP_LDU : IType<25>;
-def TypeCVI_VP : IType<26>;
-def TypeCVI_VP_VS : IType<27>;
-def TypeCVI_VS : IType<28>;
-def TypeCVI_VS_VX : IType<29>;
-def TypeCVI_VX : IType<30>;
-def TypeCVI_VX_DV : IType<31>;
-def TypeCVI_VX_LATE : IType<32>;
-def TypeCVI_ZW : IType<33>;
-def TypeDUPLEX : IType<34>;
-def TypeENDLOOP : IType<35>;
-def TypeEXTENDER : IType<36>;
-def TypeJ : IType<37>;
-def TypeLD : IType<38>;
-def TypeM : IType<39>;
-def TypeMAPPING : IType<40>;
-def TypeNCJ : IType<41>;
-def TypePSEUDO : IType<42>;
-def TypeST : IType<43>;
-def TypeSUBINSN : IType<44>;
-def TypeS_2op : IType<45>;
-def TypeS_3op : IType<46>;
-def TypeV2LDST : IType<49>;
-def TypeV4LDST : IType<50>;
+def TypeCR : IType<5>;
+def TypeCVI_4SLOT_MPY : IType<6>;
+def TypeCVI_GATHER : IType<7>;
+def TypeCVI_GATHER_DV : IType<8>;
+def TypeCVI_GATHER_RST : IType<9>;
+def TypeCVI_HIST : IType<10>;
+def TypeCVI_SCATTER : IType<11>;
+def TypeCVI_SCATTER_DV : IType<12>;
+def TypeCVI_SCATTER_NEW_RST : IType<13>;
+def TypeCVI_SCATTER_NEW_ST : IType<14>;
+def TypeCVI_SCATTER_RST : IType<15>;
+def TypeCVI_VA : IType<16>;
+def TypeCVI_VA_DV : IType<17>;
+def TypeCVI_VM_LD : IType<18>;
+def TypeCVI_VM_NEW_ST : IType<19>;
+def TypeCVI_VM_ST : IType<20>;
+def TypeCVI_VM_STU : IType<21>;
+def TypeCVI_VM_TMP_LD : IType<22>;
+def TypeCVI_VM_VP_LDU : IType<23>;
+def TypeCVI_VP : IType<24>;
+def TypeCVI_VP_VS : IType<25>;
+def TypeCVI_VS : IType<26>;
+def TypeCVI_VS_VX : IType<27>;
+def TypeCVI_VX : IType<28>;
+def TypeCVI_VX_DV : IType<29>;
+def TypeCVI_VX_LATE : IType<30>;
+def TypeCVI_ZW : IType<31>;
+def TypeDUPLEX : IType<32>;
+def TypeENDLOOP : IType<33>;
+def TypeEXTENDER : IType<34>;
+def TypeJ : IType<35>;
+def TypeLD : IType<36>;
+def TypeM : IType<37>;
+def TypeMAPPING : IType<38>;
+def TypeNCJ : IType<39>;
+def TypePSEUDO : IType<40>;
+def TypeST : IType<41>;
+def TypeSUBINSN : IType<42>;
+def TypeS_2op : IType<43>;
+def TypeS_3op : IType<44>;
+def TypeV2LDST : IType<47>;
+def TypeV4LDST : IType<48>;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
index 305115d..4dd0110 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
@@ -8,50 +8,197 @@
// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
-class Enc_5e2823 : OpcodeHexagon {
+class Enc_01d3d0 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_02553a : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{11-5} = Ii{6-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_03833b : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_041d7b : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{24-23} = n1{3-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_04c959 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_0527db : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rx16;
+ let Inst{3-0} = Rx16{3-0};
+}
+class Enc_052c7d : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_08d755 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_0aa344 : OpcodeHexagon {
+ bits <5> Gss32;
+ let Inst{20-16} = Gss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_0b2e5b : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_0b51ce : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_0cb018 : OpcodeHexagon {
+ bits <5> Cs32;
+ let Inst{20-16} = Cs32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_b9c5fb : OpcodeHexagon {
+class Enc_0d8870 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_0d8adb : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_0e41fa : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_0ed752 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Cdd32;
+ let Inst{4-0} = Cdd32{4-0};
+}
+class Enc_0f8bab : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_0fa531 : OpcodeHexagon {
+ bits <15> Ii;
+ let Inst{21-21} = Ii{14-14};
+ let Inst{13-13} = Ii{13-13};
+ let Inst{11-1} = Ii{12-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_10bc21 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1178da : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_11a146 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_12b6e9 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
-class Enc_5ab2be : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_bd6011 : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_cb9321 : OpcodeHexagon {
- bits <16> Ii;
- let Inst{27-21} = Ii{15-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_a56825 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+class Enc_134437 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{9-8} = Qs4{1-0};
+ bits <2> Qt4;
+ let Inst{23-22} = Qt4{1-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
}
class Enc_140c83 : OpcodeHexagon {
bits <10> Ii;
@@ -62,6 +209,103 @@
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
+class Enc_143445 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_143a3c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <6> II;
+ let Inst{23-21} = II{5-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_14640c : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{24-22} = n1{3-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_14d27a : OpcodeHexagon {
+ bits <5> II;
+ let Inst{12-8} = II{4-0};
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+}
+class Enc_152467 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_158beb : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vv32;
+ let Inst{4-0} = Vv32{4-0};
+}
+class Enc_163a3c : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_16c48b : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vw32;
+ let Inst{4-0} = Vw32{4-0};
+}
+class Enc_178717 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{25-23} = n1{4-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_179b35 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
class Enc_18c338 : OpcodeHexagon {
bits <8> Ii;
let Inst{12-5} = Ii{7-0};
@@ -71,109 +315,271 @@
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
-class Enc_be32a5 : OpcodeHexagon {
+class Enc_1a9974 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_ea23e4 : OpcodeHexagon {
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+ let Inst{4-0} = Rtt32{4-0};
+}
+class Enc_1aa186 : OpcodeHexagon {
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_e3b0c4 : OpcodeHexagon {
-
-}
-class Enc_ea4c54 : OpcodeHexagon {
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
}
-class Enc_e38e1f : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <2> Pu4;
- let Inst{22-21} = Pu4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_9b0bc1 : OpcodeHexagon {
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_90cd8b : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_3a3d62 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_0cb018 : OpcodeHexagon {
- bits <5> Cs32;
- let Inst{20-16} = Cs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_51436c : OpcodeHexagon {
- bits <16> Ii;
- let Inst{23-22} = Ii{15-14};
- let Inst{13-0} = Ii{13-0};
+class Enc_1aaec1 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_bd811a : OpcodeHexagon {
+class Enc_1b64fb : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{26-25} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_1bd127 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vdddd32;
+ let Inst{4-0} = Vdddd32{4-0};
+}
+class Enc_1cf4ca : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{17-16} = Ii{5-4};
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_1de724 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{24-22} = n1{2-0};
+}
+class Enc_1ef990 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1f19b5 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{9-5} = Ii{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_1f5ba6 : OpcodeHexagon {
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_1f5d8f : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_211aaa : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-5} = Ii{8-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Cd32;
- let Inst{4-0} = Cd32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_5e87ce : OpcodeHexagon {
+class Enc_217147 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+}
+class Enc_222336 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_223005 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_226535 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_22c845 : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{10-0} = Ii{13-3};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2301d6 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{20-16} = Ii{5-1};
+ let Inst{8-8} = Ii{0-0};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_245865 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_24a7dc : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_25bef0 : OpcodeHexagon {
bits <16> Ii;
- let Inst{23-22} = Ii{15-14};
+ let Inst{26-25} = Ii{15-14};
let Inst{20-16} = Ii{13-9};
let Inst{13-5} = Ii{8-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_fcf7a7 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
+class Enc_263841 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_277737 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{22-21} = Ii{7-6};
+ let Inst{13-13} = Ii{5-5};
+ let Inst{7-5} = Ii{4-2};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_27b757 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+}
+class Enc_27fd0e : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_284ebb : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
bits <2> Pd4;
let Inst{1-0} = Pd4{1-0};
}
-class Enc_88c16c : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
+class Enc_28a2dc : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_28dcbb : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vvv32;
+ let Inst{4-0} = Vvv32{4-0};
+}
+class Enc_2a3787 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-5} = Ii{10-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2a7b91 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{20-16} = Ii{5-1};
+ let Inst{8-8} = Ii{0-0};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_2ae154 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
}
class Enc_2b3f60 : OpcodeHexagon {
bits <5> Rss32;
@@ -185,6 +591,98 @@
bits <2> Px4;
let Inst{6-5} = Px4{1-0};
}
+class Enc_2b518f : OpcodeHexagon {
+ bits <32> Ii;
+ let Inst{27-16} = Ii{31-20};
+ let Inst{13-0} = Ii{19-6};
+}
+class Enc_2bae10 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{10-8} = Ii{3-1};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_2d7491 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-5} = Ii{10-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_2d829e : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{10-0} = Ii{13-3};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_2df31d : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{9-4} = Ii{7-2};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_2e1979 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2ea740 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+}
+class Enc_2ebe3b : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2f2f04 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_2fbf3c : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_310ba1 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
class Enc_311abd : OpcodeHexagon {
bits <5> Ii;
let Inst{12-8} = Ii{4-0};
@@ -193,103 +691,64 @@
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
-class Enc_c2b48e : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_08d755 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_02553a : OpcodeHexagon {
- bits <7> Ii;
- let Inst{11-5} = Ii{6-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_f0cca7 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <6> II;
- let Inst{20-16} = II{5-1};
- let Inst{13-13} = II{0-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_9cdba7 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_a05677 : OpcodeHexagon {
+class Enc_31aa6a : OpcodeHexagon {
bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
+ let Inst{6-3} = Ii{4-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_31db33 : OpcodeHexagon {
+ bits <2> Qt4;
+ let Inst{6-5} = Qt4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_322e1b : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <6> II;
+ let Inst{23-23} = II{5-5};
+ let Inst{4-0} = II{4-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_323f2d : OpcodeHexagon {
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
}
-class Enc_2b518f : OpcodeHexagon {
- bits <32> Ii;
- let Inst{27-16} = Ii{31-20};
- let Inst{13-0} = Ii{19-6};
-}
-class Enc_fb6577 : OpcodeHexagon {
+class Enc_329361 : OpcodeHexagon {
bits <2> Pu4;
- let Inst{9-8} = Pu4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_b8c967 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_667b39 : OpcodeHexagon {
- bits <5> Css32;
- let Inst{20-16} = Css32{4-0};
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
-class Enc_0ed752 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Cdd32;
- let Inst{4-0} = Cdd32{4-0};
-}
-class Enc_03833b : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_0d8adb : OpcodeHexagon {
+class Enc_33f8ba : OpcodeHexagon {
bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
+ let Inst{12-8} = Ii{7-3};
+ let Inst{4-2} = Ii{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
class Enc_3680c2 : OpcodeHexagon {
bits <7> Ii;
@@ -299,6 +758,156 @@
bits <2> Pd4;
let Inst{1-0} = Pd4{1-0};
}
+class Enc_3694bd : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> n1;
+ let Inst{29-29} = n1{4-4};
+ let Inst{26-25} = n1{3-2};
+ let Inst{23-22} = n1{1-0};
+}
+class Enc_372c9d : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_395cc4 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{6-3} = Ii{6-3};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_397f23 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{13-13} = Ii{7-7};
+ let Inst{7-3} = Ii{6-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_399e12 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <3> Rdd8;
+ let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_3a2484 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{24-23} = n1{2-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_3a3d62 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_3b7631 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vdddd32;
+ let Inst{4-0} = Vdddd32{4-0};
+ bits <3> Rx8;
+ let Inst{18-16} = Rx8{2-0};
+}
+class Enc_3d5b28 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_3d6d37 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <5> Vw32;
+ let Inst{4-0} = Vw32{4-0};
+}
+class Enc_3d920a : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_3dac0b : OpcodeHexagon {
+ bits <2> Qt4;
+ let Inst{6-5} = Qt4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_3e3989 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{25-22} = n1{4-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_3f97c8 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_3fc427 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_405228 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <3> n1;
+ let Inst{28-28} = n1{2-2};
+ let Inst{24-23} = n1{1-0};
+}
class Enc_412ff0 : OpcodeHexagon {
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
@@ -307,6 +916,886 @@
bits <5> Rxx32;
let Inst{12-8} = Rxx32{4-0};
}
+class Enc_420cf3 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_437f33 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_44215c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{17-16} = Ii{5-4};
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_44271f : OpcodeHexagon {
+ bits <5> Gs32;
+ let Inst{20-16} = Gs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_44661f : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_448f7f : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_45364e : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_454a26 : OpcodeHexagon {
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_46c951 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <5> II;
+ let Inst{4-0} = II{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_47ee5e : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <3> Nt8;
+ let Inst{2-0} = Nt8{2-0};
+}
+class Enc_47ef61 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_48b75f : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_4aca3a : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <3> n1;
+ let Inst{29-29} = n1{2-2};
+ let Inst{26-25} = n1{1-0};
+}
+class Enc_4b39e4 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_4dc228 : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{12-8} = Ii{8-4};
+ let Inst{4-3} = Ii{3-2};
+ bits <10> II;
+ let Inst{20-16} = II{9-5};
+ let Inst{7-5} = II{4-2};
+ let Inst{1-0} = II{1-0};
+}
+class Enc_4df4e9 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_4dff07 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4e4a80 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vvv32;
+ let Inst{4-0} = Vvv32{4-0};
+}
+class Enc_4f4ed7 : OpcodeHexagon {
+ bits <18> Ii;
+ let Inst{26-25} = Ii{17-16};
+ let Inst{20-16} = Ii{15-11};
+ let Inst{13-5} = Ii{10-2};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_4f677b : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_500cb0 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_509701 : OpcodeHexagon {
+ bits <19> Ii;
+ let Inst{26-25} = Ii{18-17};
+ let Inst{20-16} = Ii{16-12};
+ let Inst{13-5} = Ii{11-3};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_50b5ac : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{17-16} = Ii{5-4};
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_50e578 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_5138b3 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_51436c : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{23-22} = Ii{15-14};
+ let Inst{13-0} = Ii{13-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_51635c : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-4} = Ii{6-2};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_527412 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_52a5dd : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_53dca9 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{11-8} = Ii{5-2};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_541f26 : OpcodeHexagon {
+ bits <18> Ii;
+ let Inst{26-25} = Ii{17-16};
+ let Inst{20-16} = Ii{15-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_55355c : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rtt32;
+ let Inst{4-0} = Rtt32{4-0};
+}
+class Enc_569cfe : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_57a33e : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-3} = Ii{7-3};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_585242 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-13} = Ii{5-5};
+ let Inst{7-3} = Ii{4-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_58a8bf : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_5a18b3 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> n1;
+ let Inst{29-29} = n1{4-4};
+ let Inst{26-25} = n1{3-2};
+ let Inst{22-22} = n1{1-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_5ab2be : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_5bdd42 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-5} = Ii{6-3};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_5c124a : OpcodeHexagon {
+ bits <19> Ii;
+ let Inst{26-25} = Ii{18-17};
+ let Inst{20-16} = Ii{16-12};
+ let Inst{13-13} = Ii{11-11};
+ let Inst{7-0} = Ii{10-3};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_5ccba9 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <6> II;
+ let Inst{13-13} = II{5-5};
+ let Inst{4-0} = II{4-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_5cd7e9 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-5} = Ii{9-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_5d6c34 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_5de85f : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+}
+class Enc_5e2823 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_5e8512 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_5e87ce : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{23-22} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_5eac98 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_607661 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_6185fe : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_61f0b0 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_621fba : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Gd32;
+ let Inst{4-0} = Gd32{4-0};
+}
+class Enc_625deb : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{10-8} = Ii{3-1};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_6339d5 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_63eaeb : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{1-0} = Ii{1-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+}
+class Enc_6413b6 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> n1;
+ let Inst{29-29} = n1{4-4};
+ let Inst{26-25} = n1{3-2};
+ let Inst{23-23} = n1{1-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_645d54 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_65d691 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_65f095 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_667b39 : OpcodeHexagon {
+ bits <5> Css32;
+ let Inst{20-16} = Css32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_668704 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{25-22} = n1{3-0};
+}
+class Enc_66bce1 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{11-8} = Rd16{3-0};
+}
+class Enc_690862 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_691712 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_69d63b : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+}
+class Enc_6a5972 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{11-8} = Rt16{3-0};
+}
+class Enc_6b197f : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6baed4 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6c9440 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_6c9ee0 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6f70ca : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{8-4} = Ii{7-3};
+}
+class Enc_6f83e7 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_70b24b : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_70fb07 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_71bb9b : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_71f1b4 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7222b7 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_724154 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_729ff7 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_733b27 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_736575 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{25-23} = n1{2-0};
+}
+class Enc_74aef2 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_74d4e5 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_770858 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{6-5} = Ps4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_784502 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_78cbf0 : OpcodeHexagon {
+ bits <18> Ii;
+ let Inst{26-25} = Ii{17-16};
+ let Inst{20-16} = Ii{15-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_78e566 : OpcodeHexagon {
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_79b8c8 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7a0ea6 : OpcodeHexagon {
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+ bits <1> n1;
+ let Inst{9-9} = n1{0-0};
+}
+class Enc_7b523d : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_7b7ba8 : OpcodeHexagon {
+ bits <2> Qu4;
+ let Inst{9-8} = Qu4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_7e5a82 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_7eaeb6 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7eb485 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{6-6} = Ii{0-0};
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Ru32;
+ let Inst{20-16} = Ru32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_7eee72 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7f1a05 : OpcodeHexagon {
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ry32;
+ let Inst{12-8} = Ry32{4-0};
+}
+class Enc_7fa7f6 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_800e04 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{25-22} = n1{4-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_802dc0 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{8-8} = Ii{0-0};
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+}
+class Enc_81ac1d : OpcodeHexagon {
+ bits <24> Ii;
+ let Inst{24-16} = Ii{23-15};
+ let Inst{13-1} = Ii{14-2};
+}
+class Enc_8203bb : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <8> II;
+ let Inst{13-13} = II{7-7};
+ let Inst{6-0} = II{6-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_830e5d : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <8> II;
+ let Inst{22-16} = II{7-1};
+ let Inst{13-13} = II{0-0};
+ bits <2> Pu4;
+ let Inst{24-23} = Pu4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
class Enc_831a7d : OpcodeHexagon {
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
@@ -317,6 +1806,1138 @@
bits <2> Pe4;
let Inst{6-5} = Pe4{1-0};
}
+class Enc_83ee64 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_84b2cd : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <5> II;
+ let Inst{4-0} = II{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_84bff1 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_84d359 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{3-0} = Ii{3-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+}
+class Enc_85bf58 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{6-3} = Ii{6-3};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_864a5a : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{12-8} = Ii{8-4};
+ let Inst{4-3} = Ii{3-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_865390 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_86a14b : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{7-3} = Ii{7-3};
+ bits <3> Rdd8;
+ let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_87c142 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-4} = Ii{6-2};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_88c16c : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_88d4d9 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{9-8} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_890909 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
+}
+class Enc_895bd9 : OpcodeHexagon {
+ bits <2> Qu4;
+ let Inst{9-8} = Qu4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_8b8927 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vv32;
+ let Inst{4-0} = Vv32{4-0};
+}
+class Enc_8b8d61 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_8bcba4 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_8c2412 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{6-5} = Ps4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_8c6530 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_8d8a30 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_8dbdfe : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{13-13} = Ii{7-7};
+ let Inst{7-3} = Ii{6-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_8dbe85 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8dec2e : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8df4be : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{26-25} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-5} = Ii{9-1};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8e583a : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{25-23} = n1{3-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_90cd8b : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_91b9fe : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_927852 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_928ca1 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_935d9b : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_93af4c : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{10-4} = Ii{6-0};
+ bits <4> Rx16;
+ let Inst{3-0} = Rx16{3-0};
+}
+class Enc_95441f : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_96ce4f : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_97d666 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_989021 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vy32;
+ let Inst{12-8} = Vy32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_98c0b8 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_9a33d5 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{6-3} = Ii{6-3};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9ac432 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <2> Pu4;
+ let Inst{7-6} = Pu4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_9b0bc1 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_9be1de : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vw32;
+ let Inst{4-0} = Vw32{4-0};
+}
+class Enc_9cdba7 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_9d1247 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-5} = Ii{6-3};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9e2e1c : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9e4c3f : OpcodeHexagon {
+ bits <6> II;
+ let Inst{13-8} = II{5-0};
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rd16;
+ let Inst{19-16} = Rd16{3-0};
+}
+class Enc_9ea4cf : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{6-6} = Ii{0-0};
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Ru32;
+ let Inst{20-16} = Ru32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_9fae8a : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_a05677 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_a1640c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_a198f6 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{10-5} = Ii{6-1};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_a1e29d : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> II;
+ let Inst{22-21} = II{4-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_a21d47 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{10-5} = Ii{5-0};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_a255dc : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_a27588 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_a30110 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_a42857 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{24-22} = n1{3-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_a4ef14 : OpcodeHexagon {
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_a51a9a : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-8} = Ii{7-3};
+ let Inst{4-2} = Ii{2-0};
+}
+class Enc_a56825 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_a568d4 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_a5ed8a : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_a641d0 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <5> Vw32;
+ let Inst{4-0} = Vw32{4-0};
+}
+class Enc_a6853f : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <6> n1;
+ let Inst{29-29} = n1{5-5};
+ let Inst{26-25} = n1{4-3};
+ let Inst{23-22} = n1{2-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_a6ce9c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{3-0} = Ii{5-2};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+}
+class Enc_a7341a : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_a75aa6 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+}
+class Enc_a7b8e8 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_a803e0 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <8> II;
+ let Inst{13-13} = II{7-7};
+ let Inst{6-0} = II{6-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_a90628 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_a94f3b : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
+}
+class Enc_aad80c : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_acd6ed : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{10-5} = Ii{8-3};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_ad1831 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{26-25} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_ad1c74 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+}
+class Enc_ad9bef : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_adf111 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <2> Qx4;
+ let Inst{1-0} = Qx4{1-0};
+}
+class Enc_b00112 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_b05839 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-5} = Ii{6-3};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_b087ac : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_b0e9d8 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_b15941 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_b1e1fb : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{25-23} = n1{3-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_b388cf : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> II;
+ let Inst{22-21} = II{4-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_b38ffc : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_b43b67 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <2> Qx4;
+ let Inst{6-5} = Qx4{1-0};
+}
+class Enc_b4e6cf : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_b62ef7 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_b72622 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_b78edd : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{24-23} = n1{2-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_b7fad3 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{9-8} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_b8309d : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{8-3} = Ii{8-3};
+ bits <3> Rtt8;
+ let Inst{2-0} = Rtt8{2-0};
+}
+class Enc_b84c4c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <6> II;
+ let Inst{23-21} = II{5-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_b886fd : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_b8c967 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_b909d2 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <7> n1;
+ let Inst{28-28} = n1{6-6};
+ let Inst{25-22} = n1{5-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_b91167 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{6-5} = Ii{1-0};
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{20-16} = Vvv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_b97f71 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_b9c5fb : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_bc03e5 : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{26-25} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_bd0b33 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_bd1cbc : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_bd6011 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_bd811a : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Cd32;
+ let Inst{4-0} = Cd32{4-0};
+}
+class Enc_bddee3 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vyyyy32;
+ let Inst{4-0} = Vyyyy32{4-0};
+ bits <3> Rx8;
+ let Inst{18-16} = Rx8{2-0};
+}
+class Enc_be32a5 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_bfbf03 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{9-8} = Qs4{1-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_c0cdde : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_c175d0 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_c1d806 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <2> Qe4;
+ let Inst{6-5} = Qe4{1-0};
+}
+class Enc_c2b48e : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_c31910 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{23-21} = Ii{7-5};
+ let Inst{13-13} = Ii{4-4};
+ let Inst{7-5} = Ii{3-1};
+ let Inst{3-3} = Ii{0-0};
+ bits <5> II;
+ let Inst{12-8} = II{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_c4dc92 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_c6220b : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <3> Nt8;
+ let Inst{2-0} = Nt8{2-0};
+}
+class Enc_c7a204 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_c7cd90 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_c85e2a : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> II;
+ let Inst{22-21} = II{4-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_c90aca : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_c9a18e : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_c9e3bc : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+}
+class Enc_ca3887 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_cb4b4e : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_cb785b : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_cb9321 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{27-21} = Ii{15-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_cc449f : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_cc857d : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_cd4705 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_cd82bc : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{21-21} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <6> II;
+ let Inst{13-8} = II{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_cda00a : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{19-16} = Ii{11-8};
+ let Inst{12-5} = Ii{7-0};
+ bits <2> Pu4;
+ let Inst{22-21} = Pu4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_ce6828 : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{26-25} = Ii{13-12};
+ let Inst{13-13} = Ii{11-11};
+ let Inst{7-0} = Ii{10-3};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_cf1927 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_d15d19 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
class Enc_d2216a : OpcodeHexagon {
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
@@ -335,977 +2956,6 @@
bits <2> Pe4;
let Inst{6-5} = Pe4{1-0};
}
-class Enc_5eac98 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_927852 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_7e5a82 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_65d691 : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{17-16} = Ps4{1-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_454a26 : OpcodeHexagon {
- bits <2> Pt4;
- let Inst{9-8} = Pt4{1-0};
- bits <2> Ps4;
- let Inst{17-16} = Ps4{1-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_5d6c34 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_cb4b4e : OpcodeHexagon {
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_cda00a : OpcodeHexagon {
- bits <12> Ii;
- let Inst{19-16} = Ii{11-8};
- let Inst{12-5} = Ii{7-0};
- bits <2> Pu4;
- let Inst{22-21} = Pu4{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_bd0b33 : OpcodeHexagon {
- bits <10> Ii;
- let Inst{21-21} = Ii{9-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_c0cdde : OpcodeHexagon {
- bits <9> Ii;
- let Inst{13-5} = Ii{8-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_78e566 : OpcodeHexagon {
- bits <2> Pt4;
- let Inst{9-8} = Pt4{1-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_830e5d : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <8> II;
- let Inst{22-16} = II{7-1};
- let Inst{13-13} = II{0-0};
- bits <2> Pu4;
- let Inst{24-23} = Pu4{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_f5e933 : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{17-16} = Ps4{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_48b75f : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_527412 : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{17-16} = Ps4{1-0};
- bits <2> Pt4;
- let Inst{9-8} = Pt4{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_329361 : OpcodeHexagon {
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_284ebb : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{17-16} = Ps4{1-0};
- bits <2> Pt4;
- let Inst{9-8} = Pt4{1-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_607661 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{12-7} = Ii{5-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_9ac432 : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{17-16} = Ps4{1-0};
- bits <2> Pt4;
- let Inst{9-8} = Pt4{1-0};
- bits <2> Pu4;
- let Inst{7-6} = Pu4{1-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_1f19b5 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{9-5} = Ii{4-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_e6c957 : OpcodeHexagon {
- bits <10> Ii;
- let Inst{21-21} = Ii{9-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_83ee64 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_2ae154 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
-class Enc_437f33 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
-class Enc_6c9440 : OpcodeHexagon {
- bits <10> Ii;
- let Inst{21-21} = Ii{9-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_890909 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <2> Pe4;
- let Inst{6-5} = Pe4{1-0};
-}
-class Enc_a94f3b : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <2> Pe4;
- let Inst{6-5} = Pe4{1-0};
-}
-class Enc_0aa344 : OpcodeHexagon {
- bits <5> Gss32;
- let Inst{20-16} = Gss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_44271f : OpcodeHexagon {
- bits <5> Gs32;
- let Inst{20-16} = Gs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_ed5027 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Gdd32;
- let Inst{4-0} = Gdd32{4-0};
-}
-class Enc_621fba : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Gd32;
- let Inst{4-0} = Gd32{4-0};
-}
-class Enc_81ac1d : OpcodeHexagon {
- bits <24> Ii;
- let Inst{24-16} = Ii{23-15};
- let Inst{13-1} = Ii{14-2};
-}
-class Enc_daea09 : OpcodeHexagon {
- bits <17> Ii;
- let Inst{23-22} = Ii{16-15};
- let Inst{20-16} = Ii{14-10};
- let Inst{13-13} = Ii{9-9};
- let Inst{7-1} = Ii{8-2};
- bits <2> Pu4;
- let Inst{9-8} = Pu4{1-0};
-}
-class Enc_ecbcc8 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
-}
-class Enc_88d4d9 : OpcodeHexagon {
- bits <2> Pu4;
- let Inst{9-8} = Pu4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
-}
-class Enc_0fa531 : OpcodeHexagon {
- bits <15> Ii;
- let Inst{21-21} = Ii{14-14};
- let Inst{13-13} = Ii{13-13};
- let Inst{11-1} = Ii{12-2};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
-}
-class Enc_4dc228 : OpcodeHexagon {
- bits <9> Ii;
- let Inst{12-8} = Ii{8-4};
- let Inst{4-3} = Ii{3-2};
- bits <10> II;
- let Inst{20-16} = II{9-5};
- let Inst{7-5} = II{4-2};
- let Inst{1-0} = II{1-0};
-}
-class Enc_864a5a : OpcodeHexagon {
- bits <9> Ii;
- let Inst{12-8} = Ii{8-4};
- let Inst{4-3} = Ii{3-2};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
-}
-class Enc_a51a9a : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-8} = Ii{7-3};
- let Inst{4-2} = Ii{2-0};
-}
-class Enc_33f8ba : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-8} = Ii{7-3};
- let Inst{4-2} = Ii{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_c9a18e : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_6a5972 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <4> Rt16;
- let Inst{11-8} = Rt16{3-0};
-}
-class Enc_eafd18 : OpcodeHexagon {
- bits <5> II;
- let Inst{12-8} = II{4-0};
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
-}
-class Enc_14d27a : OpcodeHexagon {
- bits <5> II;
- let Inst{12-8} = II{4-0};
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
-}
-class Enc_e90a15 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <4> n1;
- let Inst{29-29} = n1{3-3};
- let Inst{26-25} = n1{2-1};
- let Inst{22-22} = n1{0-0};
-}
-class Enc_5a18b3 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <5> n1;
- let Inst{29-29} = n1{4-4};
- let Inst{26-25} = n1{3-2};
- let Inst{22-22} = n1{1-1};
- let Inst{13-13} = n1{0-0};
-}
-class Enc_1de724 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <4> n1;
- let Inst{28-28} = n1{3-3};
- let Inst{24-22} = n1{2-0};
-}
-class Enc_14640c : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{24-22} = n1{3-1};
- let Inst{13-13} = n1{0-0};
-}
-class Enc_668704 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{25-22} = n1{3-0};
-}
-class Enc_800e04 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <6> n1;
- let Inst{28-28} = n1{5-5};
- let Inst{25-22} = n1{4-1};
- let Inst{13-13} = n1{0-0};
-}
-class Enc_4aca3a : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <3> n1;
- let Inst{29-29} = n1{2-2};
- let Inst{26-25} = n1{1-0};
-}
-class Enc_f7ea77 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <4> n1;
- let Inst{29-29} = n1{3-3};
- let Inst{26-25} = n1{2-1};
- let Inst{13-13} = n1{0-0};
-}
-class Enc_405228 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <3> n1;
- let Inst{28-28} = n1{2-2};
- let Inst{24-23} = n1{1-0};
-}
-class Enc_3a2484 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <4> n1;
- let Inst{28-28} = n1{3-3};
- let Inst{24-23} = n1{2-1};
- let Inst{13-13} = n1{0-0};
-}
-class Enc_736575 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <4> n1;
- let Inst{28-28} = n1{3-3};
- let Inst{25-23} = n1{2-0};
-}
-class Enc_8e583a : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{25-23} = n1{3-1};
- let Inst{13-13} = n1{0-0};
-}
-class Enc_3694bd : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <5> n1;
- let Inst{29-29} = n1{4-4};
- let Inst{26-25} = n1{3-2};
- let Inst{23-22} = n1{1-0};
-}
-class Enc_a6853f : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <6> n1;
- let Inst{29-29} = n1{5-5};
- let Inst{26-25} = n1{4-3};
- let Inst{23-22} = n1{2-1};
- let Inst{13-13} = n1{0-0};
-}
-class Enc_a42857 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{24-22} = n1{3-1};
- let Inst{8-8} = n1{0-0};
-}
-class Enc_f6fe0b : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <6> n1;
- let Inst{28-28} = n1{5-5};
- let Inst{24-22} = n1{4-2};
- let Inst{13-13} = n1{1-1};
- let Inst{8-8} = n1{0-0};
-}
-class Enc_3e3989 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <6> n1;
- let Inst{28-28} = n1{5-5};
- let Inst{25-22} = n1{4-1};
- let Inst{8-8} = n1{0-0};
-}
-class Enc_b909d2 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <7> n1;
- let Inst{28-28} = n1{6-6};
- let Inst{25-22} = n1{5-2};
- let Inst{13-13} = n1{1-1};
- let Inst{8-8} = n1{0-0};
-}
-class Enc_f82302 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <4> n1;
- let Inst{29-29} = n1{3-3};
- let Inst{26-25} = n1{2-1};
- let Inst{23-23} = n1{0-0};
-}
-class Enc_6413b6 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <5> n1;
- let Inst{29-29} = n1{4-4};
- let Inst{26-25} = n1{3-2};
- let Inst{23-23} = n1{1-1};
- let Inst{13-13} = n1{0-0};
-}
-class Enc_b78edd : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <4> n1;
- let Inst{28-28} = n1{3-3};
- let Inst{24-23} = n1{2-1};
- let Inst{8-8} = n1{0-0};
-}
-class Enc_041d7b : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{24-23} = n1{3-2};
- let Inst{13-13} = n1{1-1};
- let Inst{8-8} = n1{0-0};
-}
-class Enc_b1e1fb : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{25-23} = n1{3-1};
- let Inst{8-8} = n1{0-0};
-}
-class Enc_178717 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <6> n1;
- let Inst{28-28} = n1{5-5};
- let Inst{25-23} = n1{4-2};
- let Inst{13-13} = n1{1-1};
- let Inst{8-8} = n1{0-0};
-}
-class Enc_5de85f : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
-}
-class Enc_9e4c3f : OpcodeHexagon {
- bits <6> II;
- let Inst{13-8} = II{5-0};
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rd16;
- let Inst{19-16} = Rd16{3-0};
-}
-class Enc_66bce1 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{11-8} = Rd16{3-0};
-}
-class Enc_69d63b : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
-}
-class Enc_ad1c74 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
-}
-class Enc_a27588 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{26-25} = Ii{10-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
-}
-class Enc_1f5d8f : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_74aef2 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{8-5} = Ii{3-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_6b197f : OpcodeHexagon {
- bits <4> Ii;
- let Inst{8-5} = Ii{3-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_5cd7e9 : OpcodeHexagon {
- bits <12> Ii;
- let Inst{26-25} = Ii{11-10};
- let Inst{13-5} = Ii{9-1};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
-}
-class Enc_9e2e1c : OpcodeHexagon {
- bits <5> Ii;
- let Inst{8-5} = Ii{4-1};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_bd1cbc : OpcodeHexagon {
- bits <5> Ii;
- let Inst{8-5} = Ii{4-1};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_de0214 : OpcodeHexagon {
- bits <12> Ii;
- let Inst{26-25} = Ii{11-10};
- let Inst{13-5} = Ii{9-1};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_74d4e5 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_e83554 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{8-5} = Ii{4-1};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_152467 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{8-5} = Ii{4-1};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_2d7491 : OpcodeHexagon {
- bits <13> Ii;
- let Inst{26-25} = Ii{12-11};
- let Inst{13-5} = Ii{10-2};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_7eee72 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_70b24b : OpcodeHexagon {
- bits <6> Ii;
- let Inst{8-5} = Ii{5-2};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_71f1b4 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{8-5} = Ii{5-2};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_211aaa : OpcodeHexagon {
- bits <11> Ii;
- let Inst{26-25} = Ii{10-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_e0a47a : OpcodeHexagon {
- bits <4> Ii;
- let Inst{8-5} = Ii{3-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_222336 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{8-5} = Ii{3-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_25bef0 : OpcodeHexagon {
- bits <16> Ii;
- let Inst{26-25} = Ii{15-14};
- let Inst{20-16} = Ii{13-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_fa3ba4 : OpcodeHexagon {
- bits <14> Ii;
- let Inst{26-25} = Ii{13-12};
- let Inst{13-5} = Ii{11-3};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_b05839 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{8-5} = Ii{6-3};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_5bdd42 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{8-5} = Ii{6-3};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_509701 : OpcodeHexagon {
- bits <19> Ii;
- let Inst{26-25} = Ii{18-17};
- let Inst{20-16} = Ii{16-12};
- let Inst{13-5} = Ii{11-3};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_8df4be : OpcodeHexagon {
- bits <17> Ii;
- let Inst{26-25} = Ii{16-15};
- let Inst{20-16} = Ii{14-10};
- let Inst{13-5} = Ii{9-1};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_2a3787 : OpcodeHexagon {
- bits <13> Ii;
- let Inst{26-25} = Ii{12-11};
- let Inst{13-5} = Ii{10-2};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_27fd0e : OpcodeHexagon {
- bits <6> Ii;
- let Inst{8-5} = Ii{5-2};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_3d920a : OpcodeHexagon {
- bits <6> Ii;
- let Inst{8-5} = Ii{5-2};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_4f4ed7 : OpcodeHexagon {
- bits <18> Ii;
- let Inst{26-25} = Ii{17-16};
- let Inst{20-16} = Ii{15-11};
- let Inst{13-5} = Ii{10-2};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_a21d47 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{10-5} = Ii{5-0};
- bits <2> Pt4;
- let Inst{12-11} = Pt4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_f4413a : OpcodeHexagon {
- bits <4> Ii;
- let Inst{8-5} = Ii{3-0};
- bits <2> Pt4;
- let Inst{10-9} = Pt4{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_acd6ed : OpcodeHexagon {
- bits <9> Ii;
- let Inst{10-5} = Ii{8-3};
- bits <2> Pt4;
- let Inst{12-11} = Pt4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_9d1247 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{8-5} = Ii{6-3};
- bits <2> Pt4;
- let Inst{10-9} = Pt4{1-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_a198f6 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{10-5} = Ii{6-1};
- bits <2> Pt4;
- let Inst{12-11} = Pt4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_733b27 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{8-5} = Ii{4-1};
- bits <2> Pt4;
- let Inst{10-9} = Pt4{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_f82eaf : OpcodeHexagon {
- bits <8> Ii;
- let Inst{10-5} = Ii{7-2};
- bits <2> Pt4;
- let Inst{12-11} = Pt4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_b97f71 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{8-5} = Ii{5-2};
- bits <2> Pt4;
- let Inst{10-9} = Pt4{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
class Enc_d44e31 : OpcodeHexagon {
bits <6> Ii;
let Inst{12-7} = Ii{5-0};
@@ -1314,108 +2964,68 @@
bits <5> Rt32;
let Inst{4-0} = Rt32{4-0};
}
-class Enc_163a3c : OpcodeHexagon {
- bits <7> Ii;
- let Inst{12-7} = Ii{6-1};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_d483b9 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
bits <5> Rt32;
- let Inst{4-0} = Rt32{4-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
}
-class Enc_226535 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-7} = Ii{7-2};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_d50cd3 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_d5c73f : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
bits <5> Rt32;
- let Inst{4-0} = Rt32{4-0};
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_46c951 : OpcodeHexagon {
+class Enc_d6990d : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_d7a65e : OpcodeHexagon {
bits <6> Ii;
let Inst{12-7} = Ii{5-0};
- bits <5> II;
+ bits <6> II;
+ let Inst{13-13} = II{5-5};
let Inst{4-0} = II{4-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
}
-class Enc_e66a97 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{12-7} = Ii{6-1};
- bits <5> II;
- let Inst{4-0} = II{4-0};
+class Enc_d7bc34 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vyyyy32;
+ let Inst{4-0} = Vyyyy32{4-0};
+}
+class Enc_d7dc10 : OpcodeHexagon {
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
-}
-class Enc_84b2cd : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-7} = Ii{7-2};
- bits <5> II;
- let Inst{4-0} = II{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
-}
-class Enc_f394d3 : OpcodeHexagon {
- bits <6> II;
- let Inst{11-8} = II{5-2};
- let Inst{6-5} = II{1-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
- bits <5> Re32;
- let Inst{20-16} = Re32{4-0};
-}
-class Enc_04c959 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <6> II;
- let Inst{11-8} = II{5-2};
- let Inst{6-5} = II{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
-}
-class Enc_323f2d : OpcodeHexagon {
- bits <6> II;
- let Inst{11-8} = II{5-2};
- let Inst{6-5} = II{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Re32;
- let Inst{20-16} = Re32{4-0};
-}
-class Enc_4f677b : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <6> II;
- let Inst{11-8} = II{5-2};
- let Inst{6-5} = II{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_7fa7f6 : OpcodeHexagon {
- bits <6> II;
- let Inst{11-8} = II{5-2};
- let Inst{6-5} = II{1-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
- bits <5> Re32;
- let Inst{20-16} = Re32{4-0};
-}
-class Enc_6185fe : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <6> II;
- let Inst{11-8} = II{5-2};
- let Inst{6-5} = II{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
}
class Enc_da664b : OpcodeHexagon {
bits <2> Ii;
@@ -1428,354 +3038,6 @@
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_84bff1 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_2301d6 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{20-16} = Ii{5-1};
- let Inst{8-8} = Ii{0-0};
- bits <2> Pt4;
- let Inst{10-9} = Pt4{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_2e1979 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_2a7b91 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{20-16} = Ii{5-1};
- let Inst{8-8} = Ii{0-0};
- bits <2> Pt4;
- let Inst{10-9} = Pt4{1-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_98c0b8 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_b7fad3 : OpcodeHexagon {
- bits <2> Pv4;
- let Inst{9-8} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_a75aa6 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
-}
-class Enc_c90aca : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
-class Enc_61f0b0 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
-}
-class Enc_a568d4 : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
-class Enc_3d5b28 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_322e1b : OpcodeHexagon {
- bits <6> Ii;
- let Inst{22-21} = Ii{5-4};
- let Inst{13-13} = Ii{3-3};
- let Inst{7-5} = Ii{2-0};
- bits <6> II;
- let Inst{23-23} = II{5-5};
- let Inst{4-0} = II{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{12-8} = Rd32{4-0};
-}
-class Enc_420cf3 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{22-21} = Ii{5-4};
- let Inst{13-13} = Ii{3-3};
- let Inst{7-5} = Ii{2-0};
- bits <5> Ru32;
- let Inst{4-0} = Ru32{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{12-8} = Rd32{4-0};
-}
-class Enc_277737 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{22-21} = Ii{7-6};
- let Inst{13-13} = Ii{5-5};
- let Inst{7-5} = Ii{4-2};
- bits <5> Ru32;
- let Inst{4-0} = Ru32{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{12-8} = Rd32{4-0};
-}
-class Enc_a7b8e8 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{22-21} = Ii{5-4};
- let Inst{13-13} = Ii{3-3};
- let Inst{7-5} = Ii{2-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_7f1a05 : OpcodeHexagon {
- bits <5> Ru32;
- let Inst{4-0} = Ru32{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Ry32;
- let Inst{12-8} = Ry32{4-0};
-}
-class Enc_1b64fb : OpcodeHexagon {
- bits <16> Ii;
- let Inst{26-25} = Ii{15-14};
- let Inst{20-16} = Ii{13-9};
- let Inst{13-13} = Ii{8-8};
- let Inst{7-0} = Ii{7-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_ad1831 : OpcodeHexagon {
- bits <16> Ii;
- let Inst{26-25} = Ii{15-14};
- let Inst{20-16} = Ii{13-9};
- let Inst{13-13} = Ii{8-8};
- let Inst{7-0} = Ii{7-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_5c124a : OpcodeHexagon {
- bits <19> Ii;
- let Inst{26-25} = Ii{18-17};
- let Inst{20-16} = Ii{16-12};
- let Inst{13-13} = Ii{11-11};
- let Inst{7-0} = Ii{10-3};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
-}
-class Enc_fda92c : OpcodeHexagon {
- bits <17> Ii;
- let Inst{26-25} = Ii{16-15};
- let Inst{20-16} = Ii{14-10};
- let Inst{13-13} = Ii{9-9};
- let Inst{7-0} = Ii{8-1};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_bc03e5 : OpcodeHexagon {
- bits <17> Ii;
- let Inst{26-25} = Ii{16-15};
- let Inst{20-16} = Ii{14-10};
- let Inst{13-13} = Ii{9-9};
- let Inst{7-0} = Ii{8-1};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_541f26 : OpcodeHexagon {
- bits <18> Ii;
- let Inst{26-25} = Ii{17-16};
- let Inst{20-16} = Ii{15-11};
- let Inst{13-13} = Ii{10-10};
- let Inst{7-0} = Ii{9-2};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_78cbf0 : OpcodeHexagon {
- bits <18> Ii;
- let Inst{26-25} = Ii{17-16};
- let Inst{20-16} = Ii{15-11};
- let Inst{13-13} = Ii{10-10};
- let Inst{7-0} = Ii{9-2};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_47ef61 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_22c845 : OpcodeHexagon {
- bits <14> Ii;
- let Inst{10-0} = Ii{13-3};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_70fb07 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
-}
-class Enc_28a2dc : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
-class Enc_12b6e9 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{11-8} = Ii{3-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_1aa186 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
-}
-class Enc_8dec2e : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_b388cf : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
- bits <5> II;
- let Inst{22-21} = II{4-3};
- let Inst{7-5} = II{2-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_e07374 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_b84c4c : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
- bits <6> II;
- let Inst{23-21} = II{5-3};
- let Inst{7-5} = II{2-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_a1e29d : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
- bits <5> II;
- let Inst{22-21} = II{4-3};
- let Inst{7-5} = II{2-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
-class Enc_179b35 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
-class Enc_143a3c : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
- bits <6> II;
- let Inst{23-21} = II{5-3};
- let Inst{7-5} = II{2-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
-}
-class Enc_c85e2a : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
- bits <5> II;
- let Inst{22-21} = II{4-3};
- let Inst{7-5} = II{2-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
class Enc_da8d43 : OpcodeHexagon {
bits <6> Ii;
let Inst{13-13} = Ii{5-5};
@@ -1787,55 +3049,137 @@
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
}
-class Enc_cc449f : OpcodeHexagon {
- bits <4> Ii;
- let Inst{6-3} = Ii{3-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
+class Enc_daea09 : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{23-22} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <2> Pu4;
+ let Inst{9-8} = Pu4{1-0};
+}
+class Enc_db40cd : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_585242 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-13} = Ii{5-5};
- let Inst{7-3} = Ii{4-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
+class Enc_dbd70c : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_dd766a : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_de0214 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-5} = Ii{9-1};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_52a5dd : OpcodeHexagon {
+class Enc_e07374 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_e0820b : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_e0a47a : OpcodeHexagon {
bits <4> Ii;
- let Inst{6-3} = Ii{3-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
+ let Inst{8-5} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_e26546 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
bits <3> Nt8;
let Inst{10-8} = Nt8{2-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_57a33e : OpcodeHexagon {
- bits <9> Ii;
- let Inst{13-13} = Ii{8-8};
- let Inst{7-3} = Ii{7-3};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
+class Enc_e38e1f : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <2> Pu4;
+ let Inst{22-21} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_e39bb2 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{9-4} = Ii{5-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_e3b0c4 : OpcodeHexagon {
+
+}
+class Enc_e66a97 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <5> II;
+ let Inst{4-0} = II{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_e6abcf : OpcodeHexagon {
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Rtt32;
let Inst{12-8} = Rtt32{4-0};
}
-class Enc_9a33d5 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{6-3} = Ii{6-3};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+class Enc_e6c957 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_e7581c : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_e83554 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
@@ -1850,186 +3194,16 @@
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
}
-class Enc_b886fd : OpcodeHexagon {
- bits <5> Ii;
- let Inst{6-3} = Ii{4-1};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_f44229 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{13-13} = Ii{6-6};
- let Inst{7-3} = Ii{5-1};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_31aa6a : OpcodeHexagon {
- bits <5> Ii;
- let Inst{6-3} = Ii{4-1};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_397f23 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{13-13} = Ii{7-7};
- let Inst{7-3} = Ii{6-2};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_7eaeb6 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{6-3} = Ii{5-2};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_8dbdfe : OpcodeHexagon {
- bits <8> Ii;
- let Inst{13-13} = Ii{7-7};
- let Inst{7-3} = Ii{6-2};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_65f095 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{6-3} = Ii{5-2};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_448f7f : OpcodeHexagon {
+class Enc_e90a15 : OpcodeHexagon {
bits <11> Ii;
- let Inst{26-25} = Ii{10-9};
- let Inst{13-13} = Ii{8-8};
- let Inst{7-0} = Ii{7-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_d5c73f : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_b15941 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{6-3} = Ii{3-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_10bc21 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{6-3} = Ii{3-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_4df4e9 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{26-25} = Ii{10-9};
- let Inst{13-13} = Ii{8-8};
- let Inst{7-0} = Ii{7-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_8dbe85 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_96ce4f : OpcodeHexagon {
- bits <4> Ii;
- let Inst{6-3} = Ii{3-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_c7cd90 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{6-3} = Ii{3-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_ce6828 : OpcodeHexagon {
- bits <14> Ii;
- let Inst{26-25} = Ii{13-12};
- let Inst{13-13} = Ii{11-11};
- let Inst{7-0} = Ii{10-3};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
-}
-class Enc_928ca1 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_395cc4 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{6-3} = Ii{6-3};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_85bf58 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{6-3} = Ii{6-3};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <4> n1;
+ let Inst{29-29} = n1{3-3};
+ let Inst{26-25} = n1{2-1};
+ let Inst{22-22} = n1{0-0};
}
class Enc_e957fb : OpcodeHexagon {
bits <12> Ii;
@@ -2041,122 +3215,7 @@
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
}
-class Enc_935d9b : OpcodeHexagon {
- bits <5> Ii;
- let Inst{6-3} = Ii{4-1};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_052c7d : OpcodeHexagon {
- bits <5> Ii;
- let Inst{6-3} = Ii{4-1};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_0d8870 : OpcodeHexagon {
- bits <12> Ii;
- let Inst{26-25} = Ii{11-10};
- let Inst{13-13} = Ii{9-9};
- let Inst{7-0} = Ii{8-1};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_91b9fe : OpcodeHexagon {
- bits <5> Ii;
- let Inst{6-3} = Ii{4-1};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_e26546 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{6-3} = Ii{4-1};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_143445 : OpcodeHexagon {
- bits <13> Ii;
- let Inst{26-25} = Ii{12-11};
- let Inst{13-13} = Ii{10-10};
- let Inst{7-0} = Ii{9-2};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_79b8c8 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{6-3} = Ii{5-2};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_db40cd : OpcodeHexagon {
- bits <6> Ii;
- let Inst{6-3} = Ii{5-2};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_690862 : OpcodeHexagon {
- bits <13> Ii;
- let Inst{26-25} = Ii{12-11};
- let Inst{13-13} = Ii{10-10};
- let Inst{7-0} = Ii{9-2};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_3f97c8 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{6-3} = Ii{5-2};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_223005 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{6-3} = Ii{5-2};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_cd82bc : OpcodeHexagon {
- bits <4> Ii;
- let Inst{21-21} = Ii{3-3};
- let Inst{7-5} = Ii{2-0};
- bits <6> II;
- let Inst{13-8} = II{5-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
-class Enc_729ff7 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
+class Enc_ea23e4 : OpcodeHexagon {
bits <5> Rtt32;
let Inst{12-8} = Rtt32{4-0};
bits <5> Rss32;
@@ -2164,117 +3223,37 @@
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
-class Enc_8c6530 : OpcodeHexagon {
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
+class Enc_ea4c54 : OpcodeHexagon {
bits <2> Pu4;
let Inst{6-5} = Pu4{1-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_d50cd3 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_dbd70c : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_8b8d61 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{22-21} = Ii{5-4};
- let Inst{13-13} = Ii{3-3};
- let Inst{7-5} = Ii{2-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Ru32;
- let Inst{4-0} = Ru32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
bits <5> Rd32;
- let Inst{12-8} = Rd32{4-0};
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_c31910 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{23-21} = Ii{7-5};
- let Inst{13-13} = Ii{4-4};
- let Inst{7-5} = Ii{3-1};
- let Inst{3-3} = Ii{0-0};
+class Enc_eaa9f8 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <2> Qx4;
+ let Inst{1-0} = Qx4{1-0};
+}
+class Enc_eafd18 : OpcodeHexagon {
bits <5> II;
let Inst{12-8} = II{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
}
-class Enc_9fae8a : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_a1640c : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_fef969 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{20-16} = Ii{5-1};
- let Inst{5-5} = Ii{0-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_b0e9d8 : OpcodeHexagon {
- bits <10> Ii;
- let Inst{21-21} = Ii{9-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
-class Enc_b4e6cf : OpcodeHexagon {
- bits <10> Ii;
- let Inst{21-21} = Ii{9-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Ru32;
- let Inst{4-0} = Ru32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_1cf4ca : OpcodeHexagon {
- bits <6> Ii;
- let Inst{17-16} = Ii{5-4};
- let Inst{6-3} = Ii{3-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_6339d5 : OpcodeHexagon {
+class Enc_eca7c8 : OpcodeHexagon {
bits <2> Ii;
let Inst{13-13} = Ii{1-1};
let Inst{7-7} = Ii{0-0};
- bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Ru32;
@@ -2282,86 +3261,51 @@
bits <5> Rt32;
let Inst{4-0} = Rt32{4-0};
}
-class Enc_44215c : OpcodeHexagon {
- bits <6> Ii;
- let Inst{17-16} = Ii{5-4};
- let Inst{6-3} = Ii{3-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
+class Enc_ecbcc8 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
}
-class Enc_47ee5e : OpcodeHexagon {
+class Enc_ed48be : OpcodeHexagon {
bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
+ let Inst{6-5} = Ii{1-0};
+ bits <3> Rdd8;
+ let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_ed5027 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Gdd32;
+ let Inst{4-0} = Gdd32{4-0};
+}
+class Enc_ee5ed0 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+ bits <2> n1;
+ let Inst{9-8} = n1{1-0};
+}
+class Enc_ef601b : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Ru32;
- let Inst{12-8} = Ru32{4-0};
- bits <3> Nt8;
- let Inst{2-0} = Nt8{2-0};
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
}
-class Enc_50b5ac : OpcodeHexagon {
- bits <6> Ii;
- let Inst{17-16} = Ii{5-4};
- let Inst{6-3} = Ii{3-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+class Enc_efaed8 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{8-8} = Ii{0-0};
}
-class Enc_1a9974 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Ru32;
- let Inst{12-8} = Ru32{4-0};
- bits <5> Rtt32;
- let Inst{4-0} = Rtt32{4-0};
-}
-class Enc_d7dc10 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_8203bb : OpcodeHexagon {
- bits <6> Ii;
- let Inst{12-7} = Ii{5-0};
- bits <8> II;
- let Inst{13-13} = II{7-7};
- let Inst{6-0} = II{6-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
-}
-class Enc_d7a65e : OpcodeHexagon {
- bits <6> Ii;
- let Inst{12-7} = Ii{5-0};
+class Enc_f0cca7 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
bits <6> II;
- let Inst{13-13} = II{5-5};
- let Inst{4-0} = II{4-0};
- bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
-}
-class Enc_a803e0 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{12-7} = Ii{6-1};
- bits <8> II;
- let Inst{13-13} = II{7-7};
- let Inst{6-0} = II{6-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+ let Inst{20-16} = II{5-1};
+ let Inst{13-13} = II{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
class Enc_f20719 : OpcodeHexagon {
bits <7> Ii;
@@ -2383,965 +3327,15 @@
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
}
-class Enc_5ccba9 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-7} = Ii{7-2};
+class Enc_f394d3 : OpcodeHexagon {
bits <6> II;
- let Inst{13-13} = II{5-5};
- let Inst{4-0} = II{4-0};
- bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
-}
-class Enc_8bcba4 : OpcodeHexagon {
- bits <6> II;
- let Inst{5-0} = II{5-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
bits <5> Re32;
let Inst{20-16} = Re32{4-0};
}
-class Enc_eca7c8 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Ru32;
- let Inst{12-8} = Ru32{4-0};
- bits <5> Rt32;
- let Inst{4-0} = Rt32{4-0};
-}
-class Enc_9ea4cf : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{6-6} = Ii{0-0};
- bits <6> II;
- let Inst{5-0} = II{5-0};
- bits <5> Ru32;
- let Inst{20-16} = Ru32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_724154 : OpcodeHexagon {
- bits <6> II;
- let Inst{5-0} = II{5-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Re32;
- let Inst{20-16} = Re32{4-0};
-}
-class Enc_c6220b : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Ru32;
- let Inst{12-8} = Ru32{4-0};
- bits <3> Nt8;
- let Inst{2-0} = Nt8{2-0};
-}
-class Enc_7eb485 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{6-6} = Ii{0-0};
- bits <6> II;
- let Inst{5-0} = II{5-0};
- bits <5> Ru32;
- let Inst{20-16} = Ru32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_c7a204 : OpcodeHexagon {
- bits <6> II;
- let Inst{5-0} = II{5-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Re32;
- let Inst{20-16} = Re32{4-0};
-}
-class Enc_55355c : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Ru32;
- let Inst{12-8} = Ru32{4-0};
- bits <5> Rtt32;
- let Inst{4-0} = Rtt32{4-0};
-}
-class Enc_f79415 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{6-6} = Ii{0-0};
- bits <6> II;
- let Inst{5-0} = II{5-0};
- bits <5> Ru32;
- let Inst{20-16} = Ru32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
-}
-class Enc_645d54 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{5-5} = Ii{0-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_b72622 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{5-5} = Ii{0-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
-}
-class Enc_11a146 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{11-8} = Ii{3-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_93af4c : OpcodeHexagon {
- bits <7> Ii;
- let Inst{10-4} = Ii{6-0};
- bits <4> Rx16;
- let Inst{3-0} = Rx16{3-0};
-}
-class Enc_0527db : OpcodeHexagon {
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rx16;
- let Inst{3-0} = Rx16{3-0};
-}
-class Enc_2df31d : OpcodeHexagon {
- bits <8> Ii;
- let Inst{9-4} = Ii{7-2};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
-}
-class Enc_97d666 : OpcodeHexagon {
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
-}
-class Enc_1f5ba6 : OpcodeHexagon {
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
-}
-class Enc_63eaeb : OpcodeHexagon {
- bits <2> Ii;
- let Inst{1-0} = Ii{1-0};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
-}
-class Enc_ed48be : OpcodeHexagon {
- bits <2> Ii;
- let Inst{6-5} = Ii{1-0};
- bits <3> Rdd8;
- let Inst{2-0} = Rdd8{2-0};
-}
-class Enc_399e12 : OpcodeHexagon {
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <3> Rdd8;
- let Inst{2-0} = Rdd8{2-0};
-}
-class Enc_ee5ed0 : OpcodeHexagon {
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
- bits <2> n1;
- let Inst{9-8} = n1{1-0};
-}
-class Enc_e39bb2 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{9-4} = Ii{5-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
-}
-class Enc_7a0ea6 : OpcodeHexagon {
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
- bits <1> n1;
- let Inst{9-9} = n1{0-0};
-}
-class Enc_53dca9 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{11-8} = Ii{5-2};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
-}
-class Enc_c175d0 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{11-8} = Ii{3-0};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
-}
-class Enc_2fbf3c : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
-}
-class Enc_86a14b : OpcodeHexagon {
- bits <8> Ii;
- let Inst{7-3} = Ii{7-3};
- bits <3> Rdd8;
- let Inst{2-0} = Rdd8{2-0};
-}
-class Enc_2bae10 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{10-8} = Ii{3-1};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
-}
-class Enc_51635c : OpcodeHexagon {
- bits <7> Ii;
- let Inst{8-4} = Ii{6-2};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
-}
-class Enc_b38ffc : OpcodeHexagon {
- bits <4> Ii;
- let Inst{11-8} = Ii{3-0};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rt16;
- let Inst{3-0} = Rt16{3-0};
-}
-class Enc_f55a0c : OpcodeHexagon {
- bits <6> Ii;
- let Inst{11-8} = Ii{5-2};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rt16;
- let Inst{3-0} = Rt16{3-0};
-}
-class Enc_6f70ca : OpcodeHexagon {
- bits <8> Ii;
- let Inst{8-4} = Ii{7-3};
-}
-class Enc_84d359 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{3-0} = Ii{3-0};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
-}
-class Enc_b8309d : OpcodeHexagon {
- bits <9> Ii;
- let Inst{8-3} = Ii{8-3};
- bits <3> Rtt8;
- let Inst{2-0} = Rtt8{2-0};
-}
-class Enc_625deb : OpcodeHexagon {
- bits <4> Ii;
- let Inst{10-8} = Ii{3-1};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rt16;
- let Inst{3-0} = Rt16{3-0};
-}
-class Enc_87c142 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{8-4} = Ii{6-2};
- bits <4> Rt16;
- let Inst{3-0} = Rt16{3-0};
-}
-class Enc_a6ce9c : OpcodeHexagon {
- bits <6> Ii;
- let Inst{3-0} = Ii{5-2};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
-}
-class Enc_2146c1 : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <5> Vvv32;
- let Inst{12-8} = Vvv32{4-0};
- bits <3> Qss8;
- let Inst{2-0} = Qss8{2-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_843e80 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
- bits <3> Qxx8;
- let Inst{2-0} = Qxx8{2-0};
-}
-class Enc_1f3376 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <5> Vxx32;
- let Inst{7-3} = Vxx32{4-0};
-}
-class Enc_8e9fbd : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <3> Rt8;
- let Inst{2-0} = Rt8{2-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
- bits <5> Vy32;
- let Inst{12-8} = Vy32{4-0};
-}
-class Enc_57e245 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <3> Rt8;
- let Inst{2-0} = Rt8{2-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
- bits <5> Vy32;
- let Inst{12-8} = Vy32{4-0};
-}
-class Enc_274a4c : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <3> Rt8;
- let Inst{2-0} = Rt8{2-0};
- bits <5> Vx32;
- let Inst{7-3} = Vx32{4-0};
- bits <5> Vy32;
- let Inst{12-8} = Vy32{4-0};
-}
-class Enc_fbacc2 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <3> Rt8;
- let Inst{2-0} = Rt8{2-0};
- bits <5> Vxx32;
- let Inst{7-3} = Vxx32{4-0};
- bits <5> Vy32;
- let Inst{12-8} = Vy32{4-0};
-}
-class Enc_2a736a : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_b8513b : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <5> Vvv32;
- let Inst{12-8} = Vvv32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_b5e54d : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_50e578 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_b5b643 : OpcodeHexagon {
- bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vx32;
- let Inst{7-3} = Vx32{4-0};
-}
-class Enc_2516bf : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_8d04c3 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_2ad23d : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <5> Vx32;
- let Inst{7-3} = Vx32{4-0};
-}
-class Enc_85daf5 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vx32;
- let Inst{7-3} = Vx32{4-0};
-}
-class Enc_e570b0 : OpcodeHexagon {
- bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_41dcc3 : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_3126d7 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_1cd70f : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <3> Rt8;
- let Inst{2-0} = Rt8{2-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_12dd8f : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <3> Rt8;
- let Inst{2-0} = Rt8{2-0};
- bits <5> Vx32;
- let Inst{7-3} = Vx32{4-0};
-}
-class Enc_8d5d98 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <3> Rt8;
- let Inst{2-0} = Rt8{2-0};
- bits <5> Vxx32;
- let Inst{7-3} = Vxx32{4-0};
-}
-class Enc_fc563d : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_c84567 : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_334c2b : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_3c46e8 : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_129701 : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <5> Vvv32;
- let Inst{12-8} = Vvv32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_790d6e : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_880793 : OpcodeHexagon {
- bits <3> Qt8;
- let Inst{2-0} = Qt8{2-0};
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_a265b7 : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_6b1bc4 : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <3> Qt8;
- let Inst{10-8} = Qt8{2-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_b2ffce : OpcodeHexagon {
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_fde0e3 : OpcodeHexagon {
- bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_b3bac4 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_e7c9de : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
-}
-class Enc_5c3a80 : OpcodeHexagon {
- bits <3> Qt8;
- let Inst{10-8} = Qt8{2-0};
- bits <3> Qd8;
- let Inst{5-3} = Qd8{2-0};
-}
-class Enc_8f7cc3 : OpcodeHexagon {
- bits <3> Qtt8;
- let Inst{10-8} = Qtt8{2-0};
- bits <3> Qdd8;
- let Inst{5-3} = Qdd8{2-0};
-}
-class Enc_f106e0 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{8-4} = Vv32{4-0};
- bits <5> Vt32;
- let Inst{13-9} = Vt32{4-0};
- bits <4> Vdd16;
- let Inst{3-0} = Vdd16{3-0};
-}
-class Enc_7db2f8 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{13-9} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{8-4} = Vv32{4-0};
- bits <4> Vdd16;
- let Inst{3-0} = Vdd16{3-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_37c406 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <3> Rt8;
- let Inst{2-0} = Rt8{2-0};
- bits <4> Vdd16;
- let Inst{7-4} = Vdd16{3-0};
-}
-class Enc_72a92d : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vxx32;
- let Inst{7-3} = Vxx32{4-0};
-}
-class Enc_d7e8ba : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_ce4c54 : OpcodeHexagon {
- bits <16> Ii;
- let Inst{21-21} = Ii{15-15};
- let Inst{13-8} = Ii{14-9};
- let Inst{2-0} = Ii{8-6};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_3a81ac : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_6c4697 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_b0e553 : OpcodeHexagon {
- bits <16> Ii;
- let Inst{21-21} = Ii{15-15};
- let Inst{13-8} = Ii{14-9};
- let Inst{2-0} = Ii{8-6};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_5883d0 : OpcodeHexagon {
- bits <16> Ii;
- let Inst{21-21} = Ii{15-15};
- let Inst{13-8} = Ii{14-9};
- let Inst{2-0} = Ii{8-6};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_9a895f : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_f3adb6 : OpcodeHexagon {
- bits <16> Ii;
- let Inst{21-21} = Ii{15-15};
- let Inst{13-8} = Ii{14-9};
- let Inst{2-0} = Ii{8-6};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_b5d5a7 : OpcodeHexagon {
- bits <16> Ii;
- let Inst{21-21} = Ii{15-15};
- let Inst{13-8} = Ii{14-9};
- let Inst{2-0} = Ii{8-6};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vs32;
- let Inst{7-3} = Vs32{4-0};
-}
-class Enc_5b76ab : OpcodeHexagon {
- bits <10> Ii;
- let Inst{21-21} = Ii{9-9};
- let Inst{13-8} = Ii{8-3};
- let Inst{2-0} = Ii{2-0};
- bits <5> Vs32;
- let Inst{7-3} = Vs32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_17a474 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vs32;
- let Inst{7-3} = Vs32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_9a9d62 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Vs32;
- let Inst{7-3} = Vs32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_3a527f : OpcodeHexagon {
- bits <16> Ii;
- let Inst{21-21} = Ii{15-15};
- let Inst{13-8} = Ii{14-9};
- let Inst{2-0} = Ii{8-6};
- bits <5> Vs32;
- let Inst{7-3} = Vs32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_c39a8b : OpcodeHexagon {
- bits <16> Ii;
- let Inst{21-21} = Ii{15-15};
- let Inst{13-8} = Ii{14-9};
- let Inst{2-0} = Ii{8-6};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vss32;
- let Inst{7-3} = Vss32{4-0};
-}
-class Enc_908985 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vss32;
- let Inst{7-3} = Vss32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_e8ddd5 : OpcodeHexagon {
- bits <16> Ii;
- let Inst{21-21} = Ii{15-15};
- let Inst{13-8} = Ii{14-9};
- let Inst{2-0} = Ii{8-6};
- bits <5> Vss32;
- let Inst{7-3} = Vss32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_6a4549 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_932b58 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
-}
-class Enc_124cac : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <5> Vxx32;
- let Inst{7-3} = Vxx32{4-0};
-}
-class Enc_aceeef : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_2c3281 : OpcodeHexagon {
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_a4ae28 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <3> Qd8;
- let Inst{5-3} = Qd8{2-0};
-}
-class Enc_c1652e : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <3> Qd8;
- let Inst{5-3} = Qd8{2-0};
-}
-class Enc_9aae4a : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vx32;
- let Inst{7-3} = Vx32{4-0};
- bits <3> Qd8;
- let Inst{2-0} = Qd8{2-0};
-}
-class Enc_dcfcbb : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vvv32;
- let Inst{12-8} = Vvv32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_a7ca29 : OpcodeHexagon {
- bits <3> Qt8;
- let Inst{2-0} = Qt8{2-0};
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{7-3} = Vd32{4-0};
-}
-class Enc_dd5f9f : OpcodeHexagon {
- bits <3> Qtt8;
- let Inst{2-0} = Qtt8{2-0};
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <5> Vvv32;
- let Inst{12-8} = Vvv32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_7dc746 : OpcodeHexagon {
- bits <3> Quu8;
- let Inst{10-8} = Quu8{2-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <3> Qdd8;
- let Inst{5-3} = Qdd8{2-0};
-}
-class Enc_fa5efc : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <3> Rt8;
- let Inst{2-0} = Rt8{2-0};
- bits <5> Vx32;
- let Inst{7-3} = Vx32{4-0};
-}
-class Enc_aac08c : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vx32;
- let Inst{7-3} = Vx32{4-0};
-}
-class Enc_9a8c1f : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_a9eee0 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{20-16} = Vu32{4-0};
- bits <5> Vxx32;
- let Inst{7-3} = Vxx32{4-0};
-}
-class Enc_9ce456 : OpcodeHexagon {
- bits <10> Ii;
- let Inst{21-21} = Ii{9-9};
- let Inst{13-8} = Ii{8-3};
- let Inst{2-0} = Ii{2-0};
- bits <5> Vss32;
- let Inst{7-3} = Vss32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_96f0fd : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vx32;
- let Inst{7-3} = Vx32{4-0};
- bits <3> Qdd8;
- let Inst{2-0} = Qdd8{2-0};
-}
-class Enc_a662ae : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <5> Vvv32;
- let Inst{12-8} = Vvv32{4-0};
- bits <3> Rt8;
- let Inst{2-0} = Rt8{2-0};
- bits <5> Vdd32;
- let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_ec09c9 : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{20-16} = Vuu32{4-0};
- bits <5> Vvv32;
- let Inst{12-8} = Vvv32{4-0};
- bits <3> Qdd8;
- let Inst{5-3} = Qdd8{2-0};
-}
-class Enc_400b42 : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <3> Qdd8;
- let Inst{5-3} = Qdd8{2-0};
-}
-class Enc_a5ed8a : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_134437 : OpcodeHexagon {
- bits <2> Qs4;
- let Inst{9-8} = Qs4{1-0};
- bits <2> Qt4;
- let Inst{23-22} = Qt4{1-0};
- bits <2> Qd4;
- let Inst{1-0} = Qd4{1-0};
-}
-class Enc_bfbf03 : OpcodeHexagon {
- bits <2> Qs4;
- let Inst{9-8} = Qs4{1-0};
- bits <2> Qd4;
- let Inst{1-0} = Qd4{1-0};
-}
-class Enc_7222b7 : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <2> Qd4;
- let Inst{1-0} = Qd4{1-0};
-}
class Enc_f3f408 : OpcodeHexagon {
bits <4> Ii;
let Inst{13-13} = Ii{3-3};
@@ -3351,117 +3345,62 @@
bits <5> Vd32;
let Inst{4-0} = Vd32{4-0};
}
-class Enc_a255dc : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_2ebe3b : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_8d8a30 : OpcodeHexagon {
+class Enc_f4413a : OpcodeHexagon {
bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_58a8bf : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+ let Inst{8-5} = Ii{3-0};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_f8c1c4 : OpcodeHexagon {
+class Enc_f44229 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{13-13} = Ii{6-6};
+ let Inst{7-3} = Ii{5-1};
bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
}
-class Enc_c9e3bc : OpcodeHexagon {
- bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
+class Enc_f4f57b : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{6-5} = Ii{1-0};
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{20-16} = Vvv32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
}
-class Enc_27b757 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
+class Enc_f55a0c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{11-8} = Ii{5-2};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
}
-class Enc_865390 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_f5e933 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_1ef990 : OpcodeHexagon {
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_b62ef7 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_d15d19 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_f77fbc : OpcodeHexagon {
- bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <3> Os8;
- let Inst{2-0} = Os8{2-0};
+class Enc_f6fe0b : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{24-22} = n1{4-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
}
class Enc_f7430e : OpcodeHexagon {
bits <4> Ii;
@@ -3474,106 +3413,68 @@
bits <3> Os8;
let Inst{2-0} = Os8{2-0};
}
-class Enc_784502 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <3> Os8;
- let Inst{2-0} = Os8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_372c9d : OpcodeHexagon {
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <3> Os8;
- let Inst{2-0} = Os8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_1aaec1 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <3> Os8;
- let Inst{2-0} = Os8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_cf1927 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <3> Os8;
- let Inst{2-0} = Os8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_2ea740 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
- bits <2> Qv4;
- let Inst{12-11} = Qv4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
-}
-class Enc_0b51ce : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <2> Qv4;
- let Inst{12-11} = Qv4{1-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_4dff07 : OpcodeHexagon {
- bits <2> Qv4;
- let Inst{12-11} = Qv4{1-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_ff3442 : OpcodeHexagon {
+class Enc_f77fbc : OpcodeHexagon {
bits <4> Ii;
let Inst{13-13} = Ii{3-3};
let Inst{10-8} = Ii{2-0};
bits <5> Rt32;
let Inst{20-16} = Rt32{4-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
}
-class Enc_6c9ee0 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_f79415 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{6-6} = Ii{0-0};
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Ru32;
+ let Inst{20-16} = Ru32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
}
-class Enc_44661f : OpcodeHexagon {
+class Enc_f7ea77 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <4> n1;
+ let Inst{29-29} = n1{3-3};
+ let Inst{26-25} = n1{2-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_f82302 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <4> n1;
+ let Inst{29-29} = n1{3-3};
+ let Inst{26-25} = n1{2-1};
+ let Inst{23-23} = n1{0-0};
+}
+class Enc_f82eaf : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{10-5} = Ii{7-2};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_f8c1c4 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_e7581c : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_45364e : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
class Enc_f8ecf9 : OpcodeHexagon {
bits <5> Vuu32;
let Inst{12-8} = Vuu32{4-0};
@@ -3582,611 +3483,53 @@
bits <5> Vdd32;
let Inst{4-0} = Vdd32{4-0};
}
-class Enc_a90628 : OpcodeHexagon {
- bits <2> Qv4;
- let Inst{23-22} = Qv4{1-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
-}
-class Enc_b43b67 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
- bits <2> Qx4;
- let Inst{6-5} = Qx4{1-0};
-}
-class Enc_c1d806 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
- bits <2> Qe4;
- let Inst{6-5} = Qe4{1-0};
-}
-class Enc_e0820b : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <2> Qs4;
- let Inst{6-5} = Qs4{1-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_71bb9b : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_3fc427 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
-}
-class Enc_a30110 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{23-19} = Vv32{4-0};
- bits <3> Rt8;
- let Inst{18-16} = Rt8{2-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_0b2e5b : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_7b7ba8 : OpcodeHexagon {
- bits <2> Qu4;
- let Inst{9-8} = Qu4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_895bd9 : OpcodeHexagon {
- bits <2> Qu4;
- let Inst{9-8} = Qu4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
-}
-class Enc_c4dc92 : OpcodeHexagon {
- bits <2> Qv4;
- let Inst{23-22} = Qv4{1-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_0f8bab : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <2> Qd4;
- let Inst{1-0} = Qd4{1-0};
-}
-class Enc_adf111 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <2> Qx4;
- let Inst{1-0} = Qx4{1-0};
-}
-class Enc_b087ac : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_5138b3 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
-}
-class Enc_8c2412 : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{6-5} = Ps4{1-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_770858 : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{6-5} = Ps4{1-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_989021 : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vy32;
- let Inst{12-8} = Vy32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
-}
-class Enc_24a7dc : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{23-19} = Vv32{4-0};
- bits <3> Rt8;
- let Inst{18-16} = Rt8{2-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_aad80c : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_d6990d : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
-}
-class Enc_0e41fa : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_cc857d : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
-}
-class Enc_a7341a : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
-}
-class Enc_95441f : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <2> Qd4;
- let Inst{1-0} = Qd4{1-0};
-}
-class Enc_eaa9f8 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <2> Qx4;
- let Inst{1-0} = Qx4{1-0};
-}
-class Enc_8b8927 : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vv32;
- let Inst{4-0} = Vv32{4-0};
-}
-class Enc_158beb : OpcodeHexagon {
- bits <2> Qs4;
- let Inst{6-5} = Qs4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vv32;
- let Inst{4-0} = Vv32{4-0};
-}
-class Enc_28dcbb : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vvv32;
- let Inst{4-0} = Vvv32{4-0};
-}
-class Enc_4e4a80 : OpcodeHexagon {
- bits <2> Qs4;
- let Inst{6-5} = Qs4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vvv32;
- let Inst{4-0} = Vvv32{4-0};
-}
-class Enc_217147 : OpcodeHexagon {
- bits <2> Qv4;
- let Inst{23-22} = Qv4{1-0};
-}
-class Enc_569cfe : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
-}
-class Enc_263841 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_245865 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{23-19} = Vv32{4-0};
- bits <3> Rt8;
- let Inst{18-16} = Rt8{2-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
-}
-class Enc_cd4705 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
-}
-class Enc_7b523d : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{23-19} = Vv32{4-0};
- bits <3> Rt8;
- let Inst{18-16} = Rt8{2-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
-}
-class Enc_1178da : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
-}
-class Enc_4b39e4 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_310ba1 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
-}
-class Enc_01d3d0 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_5e8512 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
-}
-class Enc_31db33 : OpcodeHexagon {
- bits <2> Qt4;
- let Inst{6-5} = Qt4{1-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_6f83e7 : OpcodeHexagon {
- bits <2> Qv4;
- let Inst{23-22} = Qv4{1-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_cb785b : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_ad9bef : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
-}
-class Enc_2f2f04 : OpcodeHexagon {
- bits <1> Ii;
- let Inst{5-5} = Ii{0-0};
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_d483b9 : OpcodeHexagon {
- bits <1> Ii;
- let Inst{5-5} = Ii{0-0};
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
-}
-class Enc_1bd127 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <3> Rt8;
- let Inst{18-16} = Rt8{2-0};
- bits <5> Vdddd32;
- let Inst{4-0} = Vdddd32{4-0};
-}
-class Enc_d7bc34 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <3> Rt8;
- let Inst{18-16} = Rt8{2-0};
- bits <5> Vyyyy32;
- let Inst{4-0} = Vyyyy32{4-0};
-}
-class Enc_3b7631 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vdddd32;
- let Inst{4-0} = Vdddd32{4-0};
- bits <3> Rx8;
- let Inst{18-16} = Rx8{2-0};
-}
-class Enc_bddee3 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vyyyy32;
- let Inst{4-0} = Vyyyy32{4-0};
- bits <3> Rx8;
- let Inst{18-16} = Rx8{2-0};
-}
-class Enc_dd766a : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_16c48b : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <5> Vw32;
- let Inst{4-0} = Vw32{4-0};
-}
-class Enc_9be1de : OpcodeHexagon {
- bits <2> Qs4;
- let Inst{6-5} = Qs4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <5> Vw32;
- let Inst{4-0} = Vw32{4-0};
-}
-class Enc_a641d0 : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vvv32;
- let Inst{12-8} = Vvv32{4-0};
- bits <5> Vw32;
- let Inst{4-0} = Vw32{4-0};
-}
-class Enc_3d6d37 : OpcodeHexagon {
- bits <2> Qs4;
- let Inst{6-5} = Qs4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vvv32;
- let Inst{12-8} = Vvv32{4-0};
- bits <5> Vw32;
- let Inst{4-0} = Vw32{4-0};
-}
-class Enc_3dac0b : OpcodeHexagon {
- bits <2> Qt4;
- let Inst{6-5} = Qt4{1-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_500cb0 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
-}
-class Enc_efaed8 : OpcodeHexagon {
- bits <1> Ii;
- let Inst{8-8} = Ii{0-0};
-}
-class Enc_802dc0 : OpcodeHexagon {
- bits <1> Ii;
- let Inst{8-8} = Ii{0-0};
- bits <2> Qv4;
- let Inst{23-22} = Qv4{1-0};
-}
-class Enc_ef601b : OpcodeHexagon {
- bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
-}
-class Enc_6baed4 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_691712 : OpcodeHexagon {
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_403871 : OpcodeHexagon {
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_2d829e : OpcodeHexagon {
+class Enc_fa3ba4 : OpcodeHexagon {
bits <14> Ii;
- let Inst{10-0} = Ii{13-3};
+ let Inst{26-25} = Ii{13-12};
+ let Inst{13-5} = Ii{11-3};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
-}
-class Enc_ca3887 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_9e9047 : OpcodeHexagon {
- bits <2> Pt4;
- let Inst{9-8} = Pt4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
-}
-class Enc_7d1542 : OpcodeHexagon {
- bits <7> Ss128;
- let Inst{22-16} = Ss128{6-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_8f7633 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <7> Sd128;
- let Inst{6-0} = Sd128{6-0};
-}
-class Enc_46f33d : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_d0fe02 : OpcodeHexagon {
- bits <5> Rxx32;
- let Inst{20-16} = Rxx32{4-0};
- bits <0> sgp10;
-}
-class Enc_e32517 : OpcodeHexagon {
- bits <7> Sss128;
- let Inst{22-16} = Sss128{6-0};
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
-class Enc_a705fc : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <7> Sdd128;
- let Inst{6-0} = Sdd128{6-0};
-}
-class Enc_e6abcf : OpcodeHexagon {
+class Enc_fb6577 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{9-8} = Pu4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_b00112 : OpcodeHexagon {
+class Enc_fcf7a7 : OpcodeHexagon {
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
bits <5> Rtt32;
let Inst{12-8} = Rtt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
}
-class Enc_598f6c : OpcodeHexagon {
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+class Enc_fda92c : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{26-25} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_fef969 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{20-16} = Ii{5-1};
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_ff3442 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
}
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
index ccc3f98..bba3635 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -217,8 +217,8 @@
let BaseOpcode = "A2_addi";
let CextOpcode = "A2_add";
let InputType = "imm";
-let isPredicable = 1;
let isAdd = 1;
+let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 1;
@@ -233,8 +233,8 @@
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011000;
-let isCommutable = 1;
let isAdd = 1;
+let isCommutable = 1;
}
def A2_addpsat : HInst<
(outs DoubleRegs:$Rdd32),
@@ -410,9 +410,9 @@
"$Rdd32 = combine(#$Ii,#$II)",
tc_713b66bf, TypeALU32_2op>, Enc_18c338 {
let Inst{31-23} = 0b011111000;
-let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
let isMoveImm = 1;
+let isReMaterializable = 1;
let isExtendable = 1;
let opExtendable = 1;
let isExtentSigned = 1;
@@ -1533,9 +1533,9 @@
(ins s8_0Imm:$Ii),
"$Rdd32 = #$Ii",
tc_713b66bf, TypeALU64> {
-let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
let isMoveImm = 1;
+let isReMaterializable = 1;
let isPseudo = 1;
}
def A2_tfrpt : HInst<
@@ -1579,10 +1579,10 @@
let BaseOpcode = "A2_tfrsi";
let CextOpcode = "A2_tfr";
let InputType = "imm";
-let isPredicable = 1;
-let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
let isMoveImm = 1;
+let isPredicable = 1;
+let isReMaterializable = 1;
let isExtendable = 1;
let opExtendable = 1;
let isExtentSigned = 1;
@@ -4892,8 +4892,8 @@
let Uses = [R29];
let Defs = [PC, R31];
let BaseOpcode = "J2_call";
-let isPredicable = 1;
let hasSideEffects = 1;
+let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 0;
let isExtentSigned = 1;
@@ -10131,6 +10131,18 @@
let opExtentBits = 17;
let opExtentAlign = 1;
}
+def L2_loadw_aq : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memw_aq($Rs32)",
+tc_2471c1c8, TypeLD>, Enc_5e2823, Requires<[HasV68]> {
+let Inst{13-5} = 0b001000000;
+let Inst{31-21} = 0b10010010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = WordAccess;
+let mayLoad = 1;
+}
def L2_loadw_locked : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
@@ -12022,6 +12034,16 @@
let opExtentBits = 6;
let opExtentAlign = 0;
}
+def L4_loadd_aq : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = memd_aq($Rs32)",
+tc_2471c1c8, TypeLD>, Enc_3a3d62, Requires<[HasV68]> {
+let Inst{13-5} = 0b011000000;
+let Inst{31-21} = 0b10010010000;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+}
def L4_loadd_locked : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
@@ -18015,8 +18037,8 @@
let mayStore = 1;
let BaseOpcode = "S2_storerbabs";
let CextOpcode = "S2_storerb";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
let opExtendable = 0;
@@ -18105,8 +18127,8 @@
let mayStore = 1;
let BaseOpcode = "S2_storerhabs";
let CextOpcode = "S2_storerh";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
let opExtendable = 0;
@@ -18153,8 +18175,8 @@
let mayStore = 1;
let BaseOpcode = "S2_storeriabs";
let CextOpcode = "S2_storeri";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
let opExtendable = 0;
@@ -18197,7 +18219,28 @@
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0101010010000000;
+}
+def R6_release_at_vi : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"release($Rs32):at",
+tc_db96aa6b, TypeST>, Enc_ecbcc8, Requires<[HasV68]> {
+let Inst{7-2} = 0b000011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100000111;
let isSolo = 1;
+let mayStore = 1;
+}
+def R6_release_st_vi : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"release($Rs32):st",
+tc_db96aa6b, TypeST>, Enc_ecbcc8, Requires<[HasV68]> {
+let Inst{7-2} = 0b001011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100000111;
+let isSolo = 1;
+let mayStore = 1;
}
def S2_addasl_rrri : HInst<
(outs IntRegs:$Rd32),
@@ -20856,8 +20899,8 @@
let BaseOpcode = "S2_storerb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 1;
let isExtentSigned = 1;
@@ -20923,8 +20966,8 @@
let mayStore = 1;
let BaseOpcode = "S2_storerb_pi";
let CextOpcode = "S2_storerb";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
def S2_storerb_pr : HInst<
@@ -20960,8 +21003,8 @@
let mayStore = 1;
let Uses = [GP];
let BaseOpcode = "S2_storerbabs";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let opExtendable = 0;
let isExtentSigned = 0;
let opExtentBits = 16;
@@ -21065,8 +21108,8 @@
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
let BaseOpcode = "S2_storerb_pi";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let opNewValue = 3;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -21358,8 +21401,8 @@
let BaseOpcode = "S2_storerh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 1;
let isExtentSigned = 1;
@@ -21425,8 +21468,8 @@
let mayStore = 1;
let BaseOpcode = "S2_storerh_pi";
let CextOpcode = "S2_storerh";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
def S2_storerh_pr : HInst<
@@ -21462,8 +21505,8 @@
let mayStore = 1;
let Uses = [GP];
let BaseOpcode = "S2_storerhabs";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let opExtendable = 0;
let isExtentSigned = 0;
let opExtentBits = 17;
@@ -21634,8 +21677,8 @@
let BaseOpcode = "S2_storeri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 1;
let isExtentSigned = 1;
@@ -21701,8 +21744,8 @@
let mayStore = 1;
let BaseOpcode = "S2_storeri_pi";
let CextOpcode = "S2_storeri";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
def S2_storeri_pr : HInst<
@@ -21738,8 +21781,8 @@
let mayStore = 1;
let Uses = [GP];
let BaseOpcode = "S2_storeriabs";
-let isPredicable = 1;
let isNVStorable = 1;
+let isPredicable = 1;
let opExtendable = 0;
let isExtentSigned = 0;
let opExtentBits = 18;
@@ -21909,6 +21952,30 @@
let isSoloAX = 1;
let mayStore = 1;
}
+def S2_storew_rl_at_vi : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw_rl($Rs32):at = $Rt32",
+tc_7af3a37e, TypeST>, Enc_ca3887, Requires<[HasV68]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100000101;
+let accessSize = WordAccess;
+let isSolo = 1;
+let mayStore = 1;
+}
+def S2_storew_rl_st_vi : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw_rl($Rs32):st = $Rt32",
+tc_7af3a37e, TypeST>, Enc_ca3887, Requires<[HasV68]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100000101;
+let accessSize = WordAccess;
+let isSolo = 1;
+let mayStore = 1;
+}
def S2_svsathb : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
@@ -22218,8 +22285,8 @@
let Inst{31-21} = 0b10001100010;
let hasNewValue = 1;
let opNewValue = 0;
-let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
+let isReMaterializable = 1;
}
def S2_vsplatrh : HInst<
(outs DoubleRegs:$Rdd32),
@@ -22228,8 +22295,8 @@
tc_9f6cd987, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10000100010;
-let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
+let isReMaterializable = 1;
}
def S2_vspliceib : HInst<
(outs DoubleRegs:$Rdd32),
@@ -22255,8 +22322,8 @@
tc_9f6cd987, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10000100000;
-let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
+let isReMaterializable = 1;
}
def S2_vsxthw : HInst<
(outs DoubleRegs:$Rdd32),
@@ -22265,8 +22332,8 @@
tc_9f6cd987, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10000100000;
-let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
+let isReMaterializable = 1;
}
def S2_vtrunehb : HInst<
(outs IntRegs:$Rd32),
@@ -22313,8 +22380,8 @@
tc_9f6cd987, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10000100000;
-let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
+let isReMaterializable = 1;
}
def S2_vzxthw : HInst<
(outs DoubleRegs:$Rdd32),
@@ -22323,8 +22390,8 @@
tc_9f6cd987, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10000100000;
-let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
+let isReMaterializable = 1;
}
def S4_addaddi : HInst<
(outs IntRegs:$Rd32),
@@ -24521,6 +24588,30 @@
let isSoloAX = 1;
let mayStore = 1;
}
+def S4_stored_rl_at_vi : HInst<
+(outs),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"memd_rl($Rs32):at = $Rtt32",
+tc_7af3a37e, TypeST>, Enc_e6abcf, Requires<[HasV68]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100000111;
+let accessSize = DoubleWordAccess;
+let isSolo = 1;
+let mayStore = 1;
+}
+def S4_stored_rl_st_vi : HInst<
+(outs),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"memd_rl($Rs32):st = $Rtt32",
+tc_7af3a37e, TypeST>, Enc_e6abcf, Requires<[HasV68]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100000111;
+let accessSize = DoubleWordAccess;
+let isSolo = 1;
+let mayStore = 1;
+}
def S4_storeirb_io : HInst<
(outs),
(ins IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
@@ -25800,7 +25891,7 @@
}
def SA1_addi : HInst<
(outs GeneralSubRegs:$Rx16),
-(ins IntRegs:$Rx16in, s32_0Imm:$Ii),
+(ins GeneralSubRegs:$Rx16in, s32_0Imm:$Ii),
"$Rx16 = add($Rx16in,#$Ii)",
tc_5b347363, TypeSUBINSN>, Enc_93af4c {
let Inst{12-11} = 0b00;
@@ -25817,7 +25908,7 @@
}
def SA1_addrx : HInst<
(outs GeneralSubRegs:$Rx16),
-(ins IntRegs:$Rx16in, GeneralSubRegs:$Rs16),
+(ins GeneralSubRegs:$Rx16in, GeneralSubRegs:$Rs16),
"$Rx16 = add($Rx16in,$Rs16)",
tc_5b347363, TypeSUBINSN>, Enc_0527db {
let Inst{12-8} = 0b11000;
@@ -27221,6 +27312,86 @@
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
}
+def V6_v6mpyhubs10 : HInst<
+(outs HvxWR:$Vdd32),
+(ins HvxWR:$Vuu32, HvxWR:$Vvv32, u2_0Imm:$Ii),
+"$Vdd32.w = v6mpy($Vuu32.ub,$Vvv32.b,#$Ii):h",
+tc_2b4c548e, TypeCVI_VX_DV>, Enc_b91167, Requires<[UseHVXV68]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_v6mpyhubs10_alt : HInst<
+(outs HvxWR:$Vdd32),
+(ins HvxWR:$Vuu32, HvxWR:$Vvv32, u2_0Imm:$Ii),
+"$Vdd32.w = v6mpy($Vuu32.ub,$Vvv32.b10,#$Ii):h",
+PSEUDO, TypeMAPPING>, Requires<[UseHVXV68]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_v6mpyhubs10_vxx : HInst<
+(outs HvxWR:$Vxx32),
+(ins HvxWR:$Vxx32in, HvxWR:$Vuu32, HvxWR:$Vvv32, u2_0Imm:$Ii),
+"$Vxx32.w += v6mpy($Vuu32.ub,$Vvv32.b,#$Ii):h",
+tc_bb599486, TypeCVI_VX_DV>, Enc_f4f57b, Requires<[UseHVXV68]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_v6mpyvubs10 : HInst<
+(outs HvxWR:$Vdd32),
+(ins HvxWR:$Vuu32, HvxWR:$Vvv32, u2_0Imm:$Ii),
+"$Vdd32.w = v6mpy($Vuu32.ub,$Vvv32.b,#$Ii):v",
+tc_2b4c548e, TypeCVI_VX_DV>, Enc_b91167, Requires<[UseHVXV68]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_v6mpyvubs10_alt : HInst<
+(outs HvxWR:$Vdd32),
+(ins HvxWR:$Vuu32, HvxWR:$Vvv32, u2_0Imm:$Ii),
+"$Vdd32.w = v6mpy($Vuu32.ub,$Vvv32.b10,#$Ii):v",
+PSEUDO, TypeMAPPING>, Requires<[UseHVXV68]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_v6mpyvubs10_vxx : HInst<
+(outs HvxWR:$Vxx32),
+(ins HvxWR:$Vxx32in, HvxWR:$Vuu32, HvxWR:$Vvv32, u2_0Imm:$Ii),
+"$Vxx32.w += v6mpy($Vuu32.ub,$Vvv32.b,#$Ii):v",
+tc_bb599486, TypeCVI_VX_DV>, Enc_f4f57b, Requires<[UseHVXV68]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
def V6_vL32Ub_ai : HInst<
(outs HvxVR:$Vd32),
(ins IntRegs:$Rt32, s4_0Imm:$Ii),
@@ -27966,6 +28137,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27988,6 +28160,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -28010,6 +28183,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -28032,6 +28206,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -28053,6 +28228,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -28074,6 +28250,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -28096,6 +28273,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -28117,6 +28295,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -28138,6 +28317,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -28262,6 +28442,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_ai";
@@ -28283,6 +28464,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_ai";
@@ -28304,6 +28486,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_pi";
@@ -28325,6 +28508,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_ppu";
@@ -28345,6 +28529,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_pi";
@@ -28365,6 +28550,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_ppu";
@@ -28386,6 +28572,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_ai";
@@ -28406,6 +28593,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_pi";
@@ -28426,6 +28614,7 @@
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
let isCVI = 1;
+let hasTmpDst = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_ppu";
@@ -32646,7 +32835,7 @@
(outs),
(ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32),
"vtmp.h = vgather($Rt32,$Mu2,$Vv32.h).h",
-tc_e8797b98, TypeCVI_GATHER>, Enc_8b8927, Requires<[UseHVXV65]> {
+tc_a28f32b5, TypeCVI_GATHER>, Enc_8b8927, Requires<[UseHVXV65]> {
let Inst{12-5} = 0b00001000;
let Inst{31-21} = 0b00101111000;
let hasNewValue = 1;
@@ -32663,7 +32852,7 @@
(outs),
(ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32),
"if ($Qs4) vtmp.h = vgather($Rt32,$Mu2,$Vv32.h).h",
-tc_05ac6f98, TypeCVI_GATHER>, Enc_158beb, Requires<[UseHVXV65]> {
+tc_7d68d5c2, TypeCVI_GATHER>, Enc_158beb, Requires<[UseHVXV65]> {
let Inst{12-7} = 0b001010;
let Inst{31-21} = 0b00101111000;
let hasNewValue = 1;
@@ -32680,7 +32869,7 @@
(outs),
(ins IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32),
"vtmp.h = vgather($Rt32,$Mu2,$Vvv32.w).h",
-tc_05058f6f, TypeCVI_GATHER_DV>, Enc_28dcbb, Requires<[UseHVXV65]> {
+tc_7095ecba, TypeCVI_GATHER_DV>, Enc_28dcbb, Requires<[UseHVXV65]> {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b00101111000;
let hasNewValue = 1;
@@ -32697,7 +32886,7 @@
(outs),
(ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32),
"if ($Qs4) vtmp.h = vgather($Rt32,$Mu2,$Vvv32.w).h",
-tc_fd7610da, TypeCVI_GATHER_DV>, Enc_4e4a80, Requires<[UseHVXV65]> {
+tc_a69eeee1, TypeCVI_GATHER_DV>, Enc_4e4a80, Requires<[UseHVXV65]> {
let Inst{12-7} = 0b001100;
let Inst{31-21} = 0b00101111000;
let hasNewValue = 1;
@@ -32714,7 +32903,7 @@
(outs),
(ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32),
"vtmp.w = vgather($Rt32,$Mu2,$Vv32.w).w",
-tc_e8797b98, TypeCVI_GATHER>, Enc_8b8927, Requires<[UseHVXV65]> {
+tc_a28f32b5, TypeCVI_GATHER>, Enc_8b8927, Requires<[UseHVXV65]> {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b00101111000;
let hasNewValue = 1;
@@ -32731,7 +32920,7 @@
(outs),
(ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32),
"if ($Qs4) vtmp.w = vgather($Rt32,$Mu2,$Vv32.w).w",
-tc_05ac6f98, TypeCVI_GATHER>, Enc_158beb, Requires<[UseHVXV65]> {
+tc_7d68d5c2, TypeCVI_GATHER>, Enc_158beb, Requires<[UseHVXV65]> {
let Inst{12-7} = 0b001000;
let Inst{31-21} = 0b00101111000;
let hasNewValue = 1;
@@ -36155,7 +36344,7 @@
}
def V6_vrmpyzbb_rx : HInst<
(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8),
-(ins HvxVR:$Vu32, IntRegs:$Rx8in),
+(ins HvxVR:$Vu32, IntRegsLow8:$Rx8in),
"$Vdddd32.w = vrmpyz($Vu32.b,$Rx8.b++)",
tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> {
let Inst{7-5} = 0b000;
@@ -36169,7 +36358,7 @@
}
def V6_vrmpyzbb_rx_acc : HInst<
(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8),
-(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rx8in),
"$Vyyyy32.w += vrmpyz($Vu32.b,$Rx8.b++)",
tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> {
let Inst{7-5} = 0b010;
@@ -36212,7 +36401,7 @@
}
def V6_vrmpyzbub_rx : HInst<
(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8),
-(ins HvxVR:$Vu32, IntRegs:$Rx8in),
+(ins HvxVR:$Vu32, IntRegsLow8:$Rx8in),
"$Vdddd32.w = vrmpyz($Vu32.b,$Rx8.ub++)",
tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> {
let Inst{7-5} = 0b010;
@@ -36226,7 +36415,7 @@
}
def V6_vrmpyzbub_rx_acc : HInst<
(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8),
-(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rx8in),
"$Vyyyy32.w += vrmpyz($Vu32.b,$Rx8.ub++)",
tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> {
let Inst{7-5} = 0b001;
@@ -36269,7 +36458,7 @@
}
def V6_vrmpyzcb_rx : HInst<
(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8),
-(ins HvxVR:$Vu32, IntRegs:$Rx8in),
+(ins HvxVR:$Vu32, IntRegsLow8:$Rx8in),
"$Vdddd32.w = vr16mpyz($Vu32.c,$Rx8.b++)",
tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> {
let Inst{7-5} = 0b001;
@@ -36283,7 +36472,7 @@
}
def V6_vrmpyzcb_rx_acc : HInst<
(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8),
-(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rx8in),
"$Vyyyy32.w += vr16mpyz($Vu32.c,$Rx8.b++)",
tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> {
let Inst{7-5} = 0b011;
@@ -36326,7 +36515,7 @@
}
def V6_vrmpyzcbs_rx : HInst<
(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8),
-(ins HvxVR:$Vu32, IntRegs:$Rx8in),
+(ins HvxVR:$Vu32, IntRegsLow8:$Rx8in),
"$Vdddd32.w = vr16mpyzs($Vu32.c,$Rx8.b++)",
tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> {
let Inst{7-5} = 0b010;
@@ -36340,7 +36529,7 @@
}
def V6_vrmpyzcbs_rx_acc : HInst<
(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8),
-(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rx8in),
"$Vyyyy32.w += vr16mpyzs($Vu32.c,$Rx8.b++)",
tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> {
let Inst{7-5} = 0b001;
@@ -36383,7 +36572,7 @@
}
def V6_vrmpyznb_rx : HInst<
(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8),
-(ins HvxVR:$Vu32, IntRegs:$Rx8in),
+(ins HvxVR:$Vu32, IntRegsLow8:$Rx8in),
"$Vdddd32.w = vr8mpyz($Vu32.n,$Rx8.b++)",
tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> {
let Inst{7-5} = 0b000;
@@ -36397,7 +36586,7 @@
}
def V6_vrmpyznb_rx_acc : HInst<
(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8),
-(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rx8in),
"$Vyyyy32.w += vr8mpyz($Vu32.n,$Rx8.b++)",
tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> {
let Inst{7-5} = 0b010;
@@ -38685,7 +38874,7 @@
(outs),
(ins IntRegs:$Rs32),
"wait($Rs32)",
-tc_d7718fbe, TypeCR>, Enc_ecbcc8 {
+tc_2c3e17fc, TypeCR>, Enc_ecbcc8, Requires<[HasV65]> {
let Inst{13-0} = 0b00000000000000;
let Inst{31-21} = 0b01100100010;
let isSolo = 1;
@@ -38699,8 +38888,8 @@
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10100110000;
let isSoloAX = 1;
-let mayStore = 1;
let hasSideEffects = 1;
+let mayStore = 1;
}
def Y4_trace : HInst<
(outs),
@@ -38720,8 +38909,8 @@
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10100110100;
let isSoloAX = 1;
-let mayStore = 1;
let hasSideEffects = 1;
+let mayStore = 1;
}
def Y6_diag : HInst<
(outs),
@@ -38749,6 +38938,74 @@
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01100010010;
}
+def Y6_dmlink : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"dmlink($Rs32,$Rt32)",
+tc_7af3a37e, TypeST>, Enc_ca3887, Requires<[HasV68]> {
+let Inst{7-0} = 0b01000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100110000;
+let hasSideEffects = 1;
+let isSolo = 1;
+let mayStore = 1;
+}
+def Y6_dmpause : HInst<
+(outs IntRegs:$Rd32),
+(ins),
+"$Rd32 = dmpause",
+tc_4bf903b0, TypeST>, Enc_a4ef14, Requires<[HasV68]> {
+let Inst{13-5} = 0b000000011;
+let Inst{31-16} = 0b1010100000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasSideEffects = 1;
+let isSolo = 1;
+}
+def Y6_dmpoll : HInst<
+(outs IntRegs:$Rd32),
+(ins),
+"$Rd32 = dmpoll",
+tc_4bf903b0, TypeST>, Enc_a4ef14, Requires<[HasV68]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-16} = 0b1010100000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasSideEffects = 1;
+let isSolo = 1;
+}
+def Y6_dmresume : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dmresume($Rs32)",
+tc_db96aa6b, TypeST>, Enc_ecbcc8, Requires<[HasV68]> {
+let Inst{13-0} = 0b00000010000000;
+let Inst{31-21} = 0b10100110000;
+let hasSideEffects = 1;
+let isSolo = 1;
+}
+def Y6_dmstart : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dmstart($Rs32)",
+tc_db96aa6b, TypeST>, Enc_ecbcc8, Requires<[HasV68]> {
+let Inst{13-0} = 0b00000000100000;
+let Inst{31-21} = 0b10100110000;
+let hasSideEffects = 1;
+let isSolo = 1;
+}
+def Y6_dmwait : HInst<
+(outs IntRegs:$Rd32),
+(ins),
+"$Rd32 = dmwait",
+tc_4bf903b0, TypeST>, Enc_a4ef14, Requires<[HasV68]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-16} = 0b1010100000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasSideEffects = 1;
+let isSolo = 1;
+}
def dep_A2_addsat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
@@ -38788,7 +39045,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = add($Rs32,$Rt32)",
-tc_388f9897, TypeALU32_3op>, Requires<[HasV67]> {
+tc_388f9897, TypeALU32_3op>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let AsmVariantName = "NonParsable";
@@ -38798,7 +39055,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rd32 = add($Rs32,#$Ii)",
-tc_388f9897, TypeALU32_ADDI>, Requires<[HasV67]> {
+tc_388f9897, TypeALU32_ADDI>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let AsmVariantName = "NonParsable";
@@ -38813,7 +39070,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rd32 = and($Rs32,#$Ii)",
-tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let AsmVariantName = "NonParsable";
@@ -38828,7 +39085,7 @@
(outs DoubleRegs:$Rdd32),
(ins s32_0Imm:$Ii, s8_0Imm:$II),
"$Rdd32 = combine(#$Ii,#$II)",
-tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> {
let AsmVariantName = "NonParsable";
let isPseudo = 1;
let isExtendable = 1;
@@ -38841,7 +39098,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = sxtb($Rs32)",
-tc_9124c04f, TypeALU32_2op>, Requires<[HasV67]> {
+tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let AsmVariantName = "NonParsable";
@@ -38851,7 +39108,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = sxth($Rs32)",
-tc_9124c04f, TypeALU32_2op>, Requires<[HasV67]> {
+tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let AsmVariantName = "NonParsable";
@@ -38861,7 +39118,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = $Rs32",
-tc_9124c04f, TypeALU32_2op>, Requires<[HasV67]> {
+tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let AsmVariantName = "NonParsable";
@@ -38871,7 +39128,7 @@
(outs IntRegs:$Rd32),
(ins s32_0Imm:$Ii),
"$Rd32 = #$Ii",
-tc_9124c04f, TypeALU32_2op>, Requires<[HasV67]> {
+tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let AsmVariantName = "NonParsable";
@@ -38886,7 +39143,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = zxtb($Rs32)",
-PSEUDO, TypeMAPPING>, Requires<[HasV67]> {
+PSEUDO, TypeMAPPING>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let AsmVariantName = "NonParsable";
@@ -38896,7 +39153,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = zxth($Rs32)",
-tc_9124c04f, TypeALU32_2op>, Requires<[HasV67]> {
+tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let AsmVariantName = "NonParsable";
@@ -38906,7 +39163,7 @@
(outs DoubleRegs:$Rdd32),
(ins s8_0Imm:$Ii, u32_0Imm:$II),
"$Rdd32 = combine(#$Ii,#$II)",
-tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> {
let AsmVariantName = "NonParsable";
let isPseudo = 1;
let isExtendable = 1;
@@ -38919,7 +39176,7 @@
(outs DoubleRegs:$Rdd32),
(ins s32_0Imm:$Ii, IntRegs:$Rs32),
"$Rdd32 = combine(#$Ii,$Rs32)",
-tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> {
let AsmVariantName = "NonParsable";
let isPseudo = 1;
let isExtendable = 1;
@@ -38932,7 +39189,7 @@
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rdd32 = combine($Rs32,#$Ii)",
-tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> {
let AsmVariantName = "NonParsable";
let isPseudo = 1;
let isExtendable = 1;
@@ -38945,7 +39202,7 @@
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, s32_0Imm:$Ii),
"if (!$Pu4) $Rd32 = #$Ii",
-tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> {
let isPredicated = 1;
let isPredicatedFalse = 1;
let hasNewValue = 1;
@@ -38962,7 +39219,7 @@
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, s32_0Imm:$Ii),
"if ($Pu4) $Rd32 = #$Ii",
-tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> {
let isPredicated = 1;
let hasNewValue = 1;
let opNewValue = 0;
@@ -38978,7 +39235,7 @@
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, s32_0Imm:$Ii),
"if (!$Pu4.new) $Rd32 = #$Ii",
-tc_4ac61d92, TypeALU32_2op>, Requires<[HasV67]> {
+tc_4ac61d92, TypeALU32_2op>, Requires<[HasV68]> {
let isPredicated = 1;
let isPredicatedFalse = 1;
let hasNewValue = 1;
@@ -38996,7 +39253,7 @@
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, s32_0Imm:$Ii),
"if ($Pu4.new) $Rd32 = #$Ii",
-tc_4ac61d92, TypeALU32_2op>, Requires<[HasV67]> {
+tc_4ac61d92, TypeALU32_2op>, Requires<[HasV68]> {
let isPredicated = 1;
let hasNewValue = 1;
let opNewValue = 0;
@@ -39013,7 +39270,7 @@
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Pd4 = cmp.eq($Rs32,#$Ii)",
-tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> {
let AsmVariantName = "NonParsable";
let isPseudo = 1;
let isExtendable = 1;
@@ -39026,7 +39283,7 @@
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = deallocframe($Rs32):raw",
-tc_aee6250c, TypeLD>, Requires<[HasV67]> {
+tc_aee6250c, TypeLD>, Requires<[HasV68]> {
let accessSize = DoubleWordAccess;
let AsmVariantName = "NonParsable";
let mayLoad = 1;
@@ -39038,7 +39295,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rd32 = memb($Rs32+#$Ii)",
-tc_eed07714, TypeLD>, Requires<[HasV67]> {
+tc_eed07714, TypeLD>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let addrMode = BaseImmOffset;
@@ -39056,7 +39313,7 @@
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, s29_3Imm:$Ii),
"$Rdd32 = memd($Rs32+#$Ii)",
-tc_eed07714, TypeLD>, Requires<[HasV67]> {
+tc_eed07714, TypeLD>, Requires<[HasV68]> {
let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let AsmVariantName = "NonParsable";
@@ -39072,7 +39329,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s31_1Imm:$Ii),
"$Rd32 = memh($Rs32+#$Ii)",
-tc_eed07714, TypeLD>, Requires<[HasV67]> {
+tc_eed07714, TypeLD>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let addrMode = BaseImmOffset;
@@ -39090,7 +39347,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s30_2Imm:$Ii),
"$Rd32 = memw($Rs32+#$Ii)",
-tc_eed07714, TypeLD>, Requires<[HasV67]> {
+tc_eed07714, TypeLD>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let addrMode = BaseImmOffset;
@@ -39108,7 +39365,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rd32 = memub($Rs32+#$Ii)",
-tc_eed07714, TypeLD>, Requires<[HasV67]> {
+tc_eed07714, TypeLD>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let addrMode = BaseImmOffset;
@@ -39126,7 +39383,7 @@
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s31_1Imm:$Ii),
"$Rd32 = memuh($Rs32+#$Ii)",
-tc_eed07714, TypeLD>, Requires<[HasV67]> {
+tc_eed07714, TypeLD>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let addrMode = BaseImmOffset;
@@ -39144,7 +39401,7 @@
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, u11_3Imm:$Ii),
"allocframe($Rx32,#$Ii):raw",
-tc_74a42bda, TypeST>, Requires<[HasV67]> {
+tc_74a42bda, TypeST>, Requires<[HasV68]> {
let hasNewValue = 1;
let opNewValue = 0;
let addrMode = BaseImmOffset;
@@ -39160,7 +39417,7 @@
(outs),
(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Rt32),
"memb($Rs32+#$Ii) = $Rt32",
-tc_a9edeffa, TypeST>, Requires<[HasV67]> {
+tc_a9edeffa, TypeST>, Requires<[HasV68]> {
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let AsmVariantName = "NonParsable";
@@ -39176,7 +39433,7 @@
(outs),
(ins IntRegs:$Rs32, s29_3Imm:$Ii, DoubleRegs:$Rtt32),
"memd($Rs32+#$Ii) = $Rtt32",
-tc_a9edeffa, TypeST>, Requires<[HasV67]> {
+tc_a9edeffa, TypeST>, Requires<[HasV68]> {
let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let AsmVariantName = "NonParsable";
@@ -39192,7 +39449,7 @@
(outs),
(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32),
"memh($Rs32+#$Ii) = $Rt32",
-tc_a9edeffa, TypeST>, Requires<[HasV67]> {
+tc_a9edeffa, TypeST>, Requires<[HasV68]> {
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let AsmVariantName = "NonParsable";
@@ -39208,7 +39465,7 @@
(outs),
(ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Rt32),
"memw($Rs32+#$Ii) = $Rt32",
-tc_a9edeffa, TypeST>, Requires<[HasV67]> {
+tc_a9edeffa, TypeST>, Requires<[HasV68]> {
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let AsmVariantName = "NonParsable";
@@ -39224,7 +39481,7 @@
(outs),
(ins IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
"memb($Rs32+#$Ii) = #$II",
-tc_838c4d7a, TypeV4LDST>, Requires<[HasV67]> {
+tc_838c4d7a, TypeV4LDST>, Requires<[HasV68]> {
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let AsmVariantName = "NonParsable";
@@ -39240,7 +39497,7 @@
(outs),
(ins IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
"memw($Rs32+#$Ii) = #$II",
-tc_838c4d7a, TypeV4LDST>, Requires<[HasV67]> {
+tc_838c4d7a, TypeV4LDST>, Requires<[HasV68]> {
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let AsmVariantName = "NonParsable";
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
index 0143d6f..e5c78d1 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
@@ -11,92 +11,452 @@
// V5 Scalar Instructions.
-def: Pat<(int_hexagon_C2_cmpeq IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (C2_cmpeq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpgt IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (C2_cmpgt IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpgtu IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (C2_cmpgtu IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_rcmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A4_rcmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_abs IntRegs:$src1),
+ (A2_abs IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_absp DoubleRegs:$src1),
+ (A2_absp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_abssat IntRegs:$src1),
+ (A2_abssat IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_add IntRegs:$src1, IntRegs:$src2),
+ (A2_add IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_hh IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_lh IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_sat_hh IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_sat_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_sat_lh IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_sat_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_sat_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_l16_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_l16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_l16_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_l16_sat_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_l16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_addp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addsat IntRegs:$src1, IntRegs:$src2),
+ (A2_addsat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addsp IntRegs:$src1, DoubleRegs:$src2),
+ (A2_addsp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_and IntRegs:$src1, IntRegs:$src2),
+ (A2_and IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_andp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_andp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_aslh IntRegs:$src1),
+ (A2_aslh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_asrh IntRegs:$src1),
+ (A2_asrh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combine_hh IntRegs:$src1, IntRegs:$src2),
+ (A2_combine_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combine_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_combine_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combine_lh IntRegs:$src1, IntRegs:$src2),
+ (A2_combine_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combine_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_combine_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2),
+ (A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combinew IntRegs:$src1, IntRegs:$src2),
+ (A2_combinew IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_max IntRegs:$src1, IntRegs:$src2),
+ (A2_max IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_maxp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_maxp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_maxu IntRegs:$src1, IntRegs:$src2),
+ (A2_maxu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_maxup DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_maxup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_min IntRegs:$src1, IntRegs:$src2),
+ (A2_min IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_minp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_minp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_minu IntRegs:$src1, IntRegs:$src2),
+ (A2_minu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_minup DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_minup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_negp DoubleRegs:$src1),
+ (A2_negp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_negsat IntRegs:$src1),
+ (A2_negsat IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_notp DoubleRegs:$src1),
+ (A2_notp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_or IntRegs:$src1, IntRegs:$src2),
+ (A2_or IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_orp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_orp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_roundsat DoubleRegs:$src1),
+ (A2_roundsat DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sat DoubleRegs:$src1),
+ (A2_sat DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_satb IntRegs:$src1),
+ (A2_satb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sath IntRegs:$src1),
+ (A2_sath IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_satub IntRegs:$src1),
+ (A2_satub IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_satuh IntRegs:$src1),
+ (A2_satuh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sub IntRegs:$src1, IntRegs:$src2),
+ (A2_sub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_hh IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_lh IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_sat_lh IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_sat_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_sat_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_l16_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_l16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_l16_sat_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_l16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_subp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2),
+ (A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subsat IntRegs:$src1, IntRegs:$src2),
+ (A2_subsat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svaddh IntRegs:$src1, IntRegs:$src2),
+ (A2_svaddh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svaddhs IntRegs:$src1, IntRegs:$src2),
+ (A2_svaddhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svadduhs IntRegs:$src1, IntRegs:$src2),
+ (A2_svadduhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svavgh IntRegs:$src1, IntRegs:$src2),
+ (A2_svavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svavghs IntRegs:$src1, IntRegs:$src2),
+ (A2_svavghs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svnavgh IntRegs:$src1, IntRegs:$src2),
+ (A2_svnavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svsubh IntRegs:$src1, IntRegs:$src2),
+ (A2_svsubh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svsubhs IntRegs:$src1, IntRegs:$src2),
+ (A2_svsubhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svsubuhs IntRegs:$src1, IntRegs:$src2),
+ (A2_svsubuhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_swiz IntRegs:$src1),
+ (A2_swiz IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sxtb IntRegs:$src1),
+ (A2_sxtb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sxth IntRegs:$src1),
+ (A2_sxth IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sxtw IntRegs:$src1),
+ (A2_sxtw IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfr IntRegs:$src1),
+ (A2_tfr IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2),
+ (A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2),
+ (A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrp DoubleRegs:$src1),
+ (A2_tfrp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrsi s32_0ImmPred_timm:$src1),
+ (A2_tfrsi s32_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vabsh DoubleRegs:$src1),
+ (A2_vabsh DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vabshsat DoubleRegs:$src1),
+ (A2_vabshsat DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vabsw DoubleRegs:$src1),
+ (A2_vabsw DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vabswsat DoubleRegs:$src1),
+ (A2_vabswsat DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddb_map DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddhs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vadduhs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vadduhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavghcr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavgub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgubr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavgubr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavguhr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavguhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgwr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavgwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpbeq DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmpbeq DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpbgtu DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmpbgtu DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpheq DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmpheq DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmphgt DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmphgt DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmphgtu DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmphgtu DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpwgt DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmpwgt DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpwgtu DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmpwgtu DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vconj DoubleRegs:$src1),
+ (A2_vconj DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxb DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vmaxb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxuw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vmaxuw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminb DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vminb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vminh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vminub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminuw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vminuw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vminw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavgh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vnavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavgwcr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vnavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavgwr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vnavgwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vraddub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vraddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vraddub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (A2_vraddub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vrsadub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vrsadub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubb_map DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubhs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsububs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsububs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubuhs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubuhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubws DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_xor IntRegs:$src1, IntRegs:$src2),
+ (A2_xor IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_xorp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_xorp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_zxtb IntRegs:$src1),
+ (A2_zxtb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_zxth IntRegs:$src1),
+ (A2_zxth IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_andn IntRegs:$src1, IntRegs:$src2),
+ (A4_andn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_andnp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A4_andnp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_bitsplit IntRegs:$src1, IntRegs:$src2),
+ (A4_bitsplit IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_boundscheck IntRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A4_boundscheck IntRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbeq IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_cmpbeq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgt IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_cmpbgt IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgtu IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_cmpbgtu IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpheq IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_cmpheq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgt IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_cmphgt IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgtu IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_cmphgtu IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2),
+ (A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cround_rr IntRegs:$src1, IntRegs:$src2),
+ (A4_cround_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_modwrapu IntRegs:$src1, IntRegs:$src2),
+ (A4_modwrapu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_orn IntRegs:$src1, IntRegs:$src2),
+ (A4_orn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_ornp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A4_ornp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_rcmpeq IntRegs:$src1, IntRegs:$src2),
(A4_rcmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_rcmpneq IntRegs:$src1, IntRegs:$src2),
(A4_rcmpneq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_bitsset IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (C2_bitsset IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_bitsclr IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (C2_bitsclr IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_nbitsset IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (C4_nbitsset IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_nbitsclr IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (C4_nbitsclr IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (C2_tfrpr (C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (C2_tfrpr (C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
- (C2_tfrpr (C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2),
- (C2_tfrpr (C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2),
- (C2_tfrpr (C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (C2_tfrpr (C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (C2_tfrpr (C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2),
- (C2_tfrpr (C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmpneq IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (C4_cmpneq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmplte IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (C4_cmplte IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmplteu IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (C4_cmplteu IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_and PredRegs:$src1, PredRegs:$src2),
- (C2_tfrpr (C2_and (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_or PredRegs:$src1, PredRegs:$src2),
- (C2_tfrpr (C2_or (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_xor PredRegs:$src1, PredRegs:$src2),
- (C2_tfrpr (C2_xor (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_andn PredRegs:$src1, PredRegs:$src2),
- (C2_tfrpr (C2_andn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_not PredRegs:$src1),
- (C2_tfrpr (C2_not (C2_tfrrp PredRegs:$src1)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_orn PredRegs:$src1, PredRegs:$src2),
- (C2_tfrpr (C2_orn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_and_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C2_tfrpr (C4_and_and (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_and_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C2_tfrpr (C4_and_or (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_or_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C2_tfrpr (C4_or_and (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_or_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C2_tfrpr (C4_or_or (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_and_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C2_tfrpr (C4_and_andn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C2_tfrpr (C4_and_orn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C2_tfrpr (C4_or_andn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_or_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C2_tfrpr (C4_or_orn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_pxfer_map PredRegs:$src1),
- (C2_tfrpr (C2_pxfer_map (C2_tfrrp PredRegs:$src1)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_any8 PredRegs:$src1),
- (C2_tfrpr (C2_any8 (C2_tfrrp PredRegs:$src1)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_rcmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A4_rcmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_rr IntRegs:$src1, IntRegs:$src2),
+ (A4_round_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_rr_sat IntRegs:$src1, IntRegs:$src2),
+ (A4_round_rr_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_tlbmatch DoubleRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_tlbmatch DoubleRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgt DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A4_vcmpbgt DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrmaxh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrmaxh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrminuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrminuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_all8 PredRegs:$src1),
(C2_tfrpr (C2_all8 (C2_tfrrp PredRegs:$src1)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_vitpack PredRegs:$src1, PredRegs:$src2),
- (C2_vitpack (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_and PredRegs:$src1, PredRegs:$src2),
+ (C2_tfrpr (C2_and (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_andn PredRegs:$src1, PredRegs:$src2),
+ (C2_tfrpr (C2_andn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_any8 PredRegs:$src1),
+ (C2_tfrpr (C2_any8 (C2_tfrrp PredRegs:$src1)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_bitsclr IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C2_bitsclr IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2),
+ (C2_tfrpr (C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_bitsset IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C2_bitsset IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpeq IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C2_cmpeq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_tfrpr (C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgt IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C2_cmpgt IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_tfrpr (C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgtu IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C2_cmpgtu IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (C2_tfrpr (C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_mask PredRegs:$src1),
+ (C2_mask (C2_tfrrp PredRegs:$src1))>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_mux PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(C2_mux (C2_tfrrp PredRegs:$src1), IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_muxii PredRegs:$src1, s32_0ImmPred_timm:$src2, s8_0ImmPred_timm:$src3),
@@ -105,508 +465,176 @@
(C2_muxir (C2_tfrrp PredRegs:$src1), IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_muxri PredRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3),
(C2_muxri (C2_tfrrp PredRegs:$src1), s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (C2_vmux (C2_tfrrp PredRegs:$src1), DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_mask PredRegs:$src1),
- (C2_mask (C2_tfrrp PredRegs:$src1))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmpbeq DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (A2_vcmpbeq DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmpbgtu DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (A2_vcmpbgtu DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbgt DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (A4_vcmpbgt DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbeq IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (A4_cmpbeq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbgtu IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (A4_cmpbgtu IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbgt IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (A4_cmpbgt IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmpheq DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (A2_vcmpheq DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmphgt DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (A2_vcmphgt DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmphgtu DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (A2_vcmphgtu DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpheq IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (A4_cmpheq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmphgt IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (A4_cmphgt IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmphgtu IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (A4_cmphgtu IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmpwgt DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (A2_vcmpwgt DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmpwgtu DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (A2_vcmpwgtu DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
- (C2_tfrpr (A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_boundscheck IntRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (A4_boundscheck IntRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_tlbmatch DoubleRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (A4_tlbmatch DoubleRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_not PredRegs:$src1),
+ (C2_tfrpr (C2_not (C2_tfrrp PredRegs:$src1)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_or PredRegs:$src1, PredRegs:$src2),
+ (C2_tfrpr (C2_or (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_orn PredRegs:$src1, PredRegs:$src2),
+ (C2_tfrpr (C2_orn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_pxfer_map PredRegs:$src1),
+ (C2_tfrpr (C2_pxfer_map (C2_tfrrp PredRegs:$src1)))>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_tfrpr PredRegs:$src1),
(C2_tfrpr (C2_tfrrp PredRegs:$src1))>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_tfrrp IntRegs:$src1),
(C2_tfrpr (C2_tfrrp IntRegs:$src1))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_vitpack PredRegs:$src1, PredRegs:$src2),
+ (C2_vitpack (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (C2_vmux (C2_tfrrp PredRegs:$src1), DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_xor PredRegs:$src1, PredRegs:$src2),
+ (C2_tfrpr (C2_xor (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_and_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_and_and (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_and_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_and_andn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_and_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_and_or (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_and_orn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmplte IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C4_cmplte IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_tfrpr (C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmplteu IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C4_cmplteu IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (C2_tfrpr (C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpneq IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C4_cmpneq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_tfrpr (C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C4_fastcorner9 PredRegs:$src1, PredRegs:$src2),
(C2_tfrpr (C4_fastcorner9 (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2),
(C2_tfrpr (C4_fastcorner9_not (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2),
- (M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
- (M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
- (M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_up IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_up_s1_sat IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_up_s1_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_up IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpysu_up IntRegs:$src1, IntRegs:$src2),
- (M2_mpysu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_nac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_nac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyi IntRegs:$src1, IntRegs:$src2),
- (M2_mpyi IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyui IntRegs:$src1, IntRegs:$src2),
- (M2_mpyui IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_nbitsclr IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C4_nbitsclr IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2),
+ (C2_tfrpr (C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_nbitsset IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C4_nbitsset IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_or_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_or_and (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_or_andn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_or_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_or_or (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_or_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_or_orn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_d2df DoubleRegs:$src1),
+ (F2_conv_d2df DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_d2sf DoubleRegs:$src1),
+ (F2_conv_d2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2d DoubleRegs:$src1),
+ (F2_conv_df2d DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2d_chop DoubleRegs:$src1),
+ (F2_conv_df2d_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2sf DoubleRegs:$src1),
+ (F2_conv_df2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2ud DoubleRegs:$src1),
+ (F2_conv_df2ud DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2ud_chop DoubleRegs:$src1),
+ (F2_conv_df2ud_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2uw DoubleRegs:$src1),
+ (F2_conv_df2uw DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2uw_chop DoubleRegs:$src1),
+ (F2_conv_df2uw_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2w DoubleRegs:$src1),
+ (F2_conv_df2w DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2w_chop DoubleRegs:$src1),
+ (F2_conv_df2w_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2d IntRegs:$src1),
+ (F2_conv_sf2d IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2d_chop IntRegs:$src1),
+ (F2_conv_sf2d_chop IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2df IntRegs:$src1),
+ (F2_conv_sf2df IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2ud IntRegs:$src1),
+ (F2_conv_sf2ud IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2ud_chop IntRegs:$src1),
+ (F2_conv_sf2ud_chop IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2uw IntRegs:$src1),
+ (F2_conv_sf2uw IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2uw_chop IntRegs:$src1),
+ (F2_conv_sf2uw_chop IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2w IntRegs:$src1),
+ (F2_conv_sf2w IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2w_chop IntRegs:$src1),
+ (F2_conv_sf2w_chop IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_ud2df DoubleRegs:$src1),
+ (F2_conv_ud2df DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_ud2sf DoubleRegs:$src1),
+ (F2_conv_ud2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_uw2df IntRegs:$src1),
+ (F2_conv_uw2df IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_uw2sf IntRegs:$src1),
+ (F2_conv_uw2sf IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_w2df IntRegs:$src1),
+ (F2_conv_w2df IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_w2sf IntRegs:$src1),
+ (F2_conv_w2sf IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (C2_tfrpr (F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfcmpeq DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (F2_dfcmpeq DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfcmpgt DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (F2_dfcmpgt DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfcmpuo DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (F2_dfcmpuo DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfimm_n u10_0ImmPred_timm:$src1),
+ (F2_dfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfimm_p u10_0ImmPred_timm:$src1),
+ (F2_dfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfadd IntRegs:$src1, IntRegs:$src2),
+ (F2_sfadd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (C2_tfrpr (F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfcmpeq IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (F2_sfcmpeq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfcmpge IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (F2_sfcmpge IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfcmpgt IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (F2_sfcmpgt IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfcmpuo IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (F2_sfcmpuo IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffixupd IntRegs:$src1, IntRegs:$src2),
+ (F2_sffixupd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffixupn IntRegs:$src1, IntRegs:$src2),
+ (F2_sffixupn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffixupr IntRegs:$src1),
+ (F2_sffixupr IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffma_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (F2_sffma_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4),
+ (F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, (C2_tfrrp PredRegs:$src4))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfimm_n u10_0ImmPred_timm:$src1),
+ (F2_sfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfimm_p u10_0ImmPred_timm:$src1),
+ (F2_sfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfmax IntRegs:$src1, IntRegs:$src2),
+ (F2_sfmax IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfmin IntRegs:$src1, IntRegs:$src2),
+ (F2_sfmin IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfmpy IntRegs:$src1, IntRegs:$src2),
+ (F2_sfmpy IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfsub IntRegs:$src1, IntRegs:$src2),
+ (F2_sfsub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
(M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
- (M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3),
- (M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
- (M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3),
- (M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2s_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_vmpy2s_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2s_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_vmpy2s_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2s_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_vmac2s_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2s_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_vmac2s_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2su_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_vmac2su_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2),
- (M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2),
- (M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2es_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vmpy2es_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2es_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vmpy2es_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2es DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vmac2es DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrmac_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vrmac_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2),
- (M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
- (M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vrmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M5_vrmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vmpybuu IntRegs:$src1, IntRegs:$src2),
- (M5_vmpybuu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vmpybsu IntRegs:$src1, IntRegs:$src2),
- (M5_vmpybsu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vmacbsu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M5_vmacbsu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
- (M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vdmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M5_vdmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vdmacs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vdmacs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vdmacs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vdmacs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vdmpys_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vdmpys_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vdmpys_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vdmpys_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cmacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_cmacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cmacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -615,6 +643,18 @@
(M2_cmacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cmacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_cmacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cmpys_s0 IntRegs:$src1, IntRegs:$src2),
(M2_cmpys_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cmpys_s1 IntRegs:$src1, IntRegs:$src2),
@@ -631,132 +671,416 @@
(M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2),
- (M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpys_s1rp DoubleRegs:$src1, IntRegs:$src2),
- (M2_vrcmpys_s1rp DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacls_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacls_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_hmmpyh_rs1 IntRegs:$src1, IntRegs:$src2),
+ (M2_hmmpyh_rs1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_hmmpyh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_hmmpyh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_hmmpyl_rs1 IntRegs:$src1, IntRegs:$src2),
+ (M2_hmmpyl_rs1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_hmmpyl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_hmmpyl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
+ (M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
+ (M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmachs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(M2_mmachs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmachs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(M2_mmachs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyl_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyl_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyh_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyh_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyeh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M4_vrmpyeh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyeh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M4_vrmpyeh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyoh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M4_vrmpyoh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_hmmpyl_rs1 IntRegs:$src1, IntRegs:$src2),
- (M2_hmmpyl_rs1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_hmmpyh_rs1 IntRegs:$src1, IntRegs:$src2),
- (M2_hmmpyh_rs1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_hmmpyl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_hmmpyl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_hmmpyh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_hmmpyh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmaculs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmaculs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacuhs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacuhs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacuhs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacuhs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyul_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyul_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyul_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyul_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyuh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyuh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyuh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyuh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmaculs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmaculs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmaculs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmaculs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacls_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacls_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmacuhs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(M2_mmacuhs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyul_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyul_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacuhs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacuhs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacuhs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacuhs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmaculs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmaculs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmaculs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmaculs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmaculs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmaculs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyh_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyh_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyl_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyl_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmaci_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vrcmaci_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmacr_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vrcmacr_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmaci_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vrcmaci_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpyr_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vrcmpyr_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpyr_s0c DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vrcmpyr_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2),
- (M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2),
- (M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_cmpyi_whc DoubleRegs:$src1, IntRegs:$src2),
- (M4_cmpyi_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2),
- (M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyuh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyuh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyuh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyuh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyul_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyul_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyul_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyul_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyul_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyul_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_up IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_up_s1_sat IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_up_s1_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyi IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyi IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2),
+ (M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpysu_up IntRegs:$src1, IntRegs:$src2),
+ (M2_mpysu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_up IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyui IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyui IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vabsdiffh DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vabsdiffh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vabsdiffw DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vabsdiffw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmac_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vcmac_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmac_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vcmac_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vcmpy_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2),
(M2_vcmpy_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2),
@@ -765,182 +1089,82 @@
(M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2),
(M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vcmac_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vcmac_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vcmac_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vcmac_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vcrotate DoubleRegs:$src1, IntRegs:$src2),
- (S2_vcrotate DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4),
- (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3),
- (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vcnegh DoubleRegs:$src1, IntRegs:$src2),
- (S2_vcnegh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_pmpyw IntRegs:$src1, IntRegs:$src2),
- (M4_pmpyw IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vpmpyh IntRegs:$src1, IntRegs:$src2),
- (M4_vpmpyh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_pmpyw_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_pmpyw_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_add IntRegs:$src1, IntRegs:$src2),
- (A2_add IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_sub IntRegs:$src1, IntRegs:$src2),
- (A2_sub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addsat IntRegs:$src1, IntRegs:$src2),
- (A2_addsat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subsat IntRegs:$src1, IntRegs:$src2),
- (A2_subsat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_l16_ll IntRegs:$src1, IntRegs:$src2),
- (A2_addh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_l16_hl IntRegs:$src1, IntRegs:$src2),
- (A2_addh_l16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_l16_sat_ll IntRegs:$src1, IntRegs:$src2),
- (A2_addh_l16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
- (A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2),
- (A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_l16_hl IntRegs:$src1, IntRegs:$src2),
- (A2_subh_l16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_l16_sat_ll IntRegs:$src1, IntRegs:$src2),
- (A2_subh_l16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
- (A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_ll IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_lh IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_hl IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_hh IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_sat_ll IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_sat_lh IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_sat_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_sat_hl IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_sat_hh IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_ll IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_lh IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_hl IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_hh IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_sat_ll IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_sat_lh IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_sat_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_aslh IntRegs:$src1),
- (A2_aslh IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_asrh IntRegs:$src1),
- (A2_asrh IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_addp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addsp IntRegs:$src1, DoubleRegs:$src2),
- (A2_addsp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_subp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_negsat IntRegs:$src1),
- (A2_negsat IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_abs IntRegs:$src1),
- (A2_abs IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_abssat IntRegs:$src1),
- (A2_abssat IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vconj DoubleRegs:$src1),
- (A2_vconj DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_negp DoubleRegs:$src1),
- (A2_negp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_absp DoubleRegs:$src1),
- (A2_absp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_max IntRegs:$src1, IntRegs:$src2),
- (A2_max IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_maxu IntRegs:$src1, IntRegs:$src2),
- (A2_maxu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_min IntRegs:$src1, IntRegs:$src2),
- (A2_min IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_minu IntRegs:$src1, IntRegs:$src2),
- (A2_minu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_maxp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_maxp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_maxup DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_maxup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_minp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_minp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_minup DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_minup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfr IntRegs:$src1),
- (A2_tfr IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfrsi s32_0ImmPred_timm:$src1),
- (A2_tfrsi s32_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfrp DoubleRegs:$src1),
- (A2_tfrp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_zxtb IntRegs:$src1),
- (A2_zxtb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_sxtb IntRegs:$src1),
- (A2_sxtb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_zxth IntRegs:$src1),
- (A2_zxth IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_sxth IntRegs:$src1),
- (A2_sxth IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combinew IntRegs:$src1, IntRegs:$src2),
- (A2_combinew IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2),
- (A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2),
- (A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combine_hh IntRegs:$src1, IntRegs:$src2),
- (A2_combine_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combine_hl IntRegs:$src1, IntRegs:$src2),
- (A2_combine_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combine_lh IntRegs:$src1, IntRegs:$src2),
- (A2_combine_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combine_ll IntRegs:$src1, IntRegs:$src2),
- (A2_combine_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2),
- (A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2),
- (A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_and IntRegs:$src1, IntRegs:$src2),
- (A2_and IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_or IntRegs:$src1, IntRegs:$src2),
- (A2_or IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_xor IntRegs:$src1, IntRegs:$src2),
- (A2_xor IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmacs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vdmacs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmacs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vdmacs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmpys_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vdmpys_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmpys_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vdmpys_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2es DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vmac2es DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2s_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_vmac2s_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2s_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_vmac2s_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2su_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_vmac2su_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2es_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vmpy2es_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2es_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vmpy2es_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2s_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_vmpy2s_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2),
+ (M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2s_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_vmpy2s_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2),
+ (M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vradduh DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vradduh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmaci_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vrcmaci_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmaci_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vrcmaci_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmacr_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vrcmacr_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpyr_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vrcmpyr_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpyr_s0c DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vrcmpyr_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2),
+ (M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpys_s1rp DoubleRegs:$src1, IntRegs:$src2),
+ (M2_vrcmpys_s1rp DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrmac_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vrmac_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_xor_xacc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M4_xor_xacc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_andn IntRegs:$src1, IntRegs:$src2),
- (A4_andn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_orn IntRegs:$src1, IntRegs:$src2),
- (A4_orn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_andnp DoubleRegs:$src1, DoubleRegs:$src2),
- (A4_andnp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_ornp DoubleRegs:$src1, DoubleRegs:$src2),
- (A4_ornp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
- (S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3),
- (S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[UseCompound, HasV5]>;
def: Pat<(int_hexagon_M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_and_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -949,6 +1173,28 @@
(M4_and_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2),
+ (M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_cmpyi_whc DoubleRegs:$src1, IntRegs:$src2),
+ (M4_cmpyi_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2),
+ (M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2),
+ (M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3),
+ (M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
+ (M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3),
+ (M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_M4_nac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_nac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -957,714 +1203,470 @@
(M4_or_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_or_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_or_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
- (S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
- (S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
- (S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_pmpyw IntRegs:$src1, IntRegs:$src2),
+ (M4_pmpyw IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_pmpyw_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_pmpyw_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vpmpyh IntRegs:$src1, IntRegs:$src2),
+ (M4_vpmpyh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyeh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M4_vrmpyeh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyeh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M4_vrmpyeh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyoh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M4_vrmpyoh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_xor_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_xor_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2),
- (A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_andp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_andp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_orp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_orp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_xorp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_xorp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_notp DoubleRegs:$src1),
- (A2_notp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_sxtw IntRegs:$src1),
- (A2_sxtw IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_sat DoubleRegs:$src1),
- (A2_sat DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_roundsat DoubleRegs:$src1),
- (A2_roundsat DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_sath IntRegs:$src1),
- (A2_sath IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_satuh IntRegs:$src1),
- (A2_satuh IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_satub IntRegs:$src1),
- (A2_satub IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_satb IntRegs:$src1),
- (A2_satb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddb_map DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddhs DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vadduhs DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vadduhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2),
- (A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2),
- (S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2),
- (S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vxaddsubh DoubleRegs:$src1, DoubleRegs:$src2),
- (S4_vxaddsubh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2),
- (S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vxaddsubhr DoubleRegs:$src1, DoubleRegs:$src2),
- (S4_vxaddsubhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vxsubaddhr DoubleRegs:$src1, DoubleRegs:$src2),
- (S4_vxsubaddhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svavgh IntRegs:$src1, IntRegs:$src2),
- (A2_svavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svavghs IntRegs:$src1, IntRegs:$src2),
- (A2_svavghs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svnavgh IntRegs:$src1, IntRegs:$src2),
- (A2_svnavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svaddh IntRegs:$src1, IntRegs:$src2),
- (A2_svaddh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svaddhs IntRegs:$src1, IntRegs:$src2),
- (A2_svaddhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svadduhs IntRegs:$src1, IntRegs:$src2),
- (A2_svadduhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svsubh IntRegs:$src1, IntRegs:$src2),
- (A2_svsubh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svsubhs IntRegs:$src1, IntRegs:$src2),
- (A2_svsubhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svsubuhs IntRegs:$src1, IntRegs:$src2),
- (A2_svsubuhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vraddub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vraddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vraddub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (A2_vraddub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vradduh DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vradduh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubb_map DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsububs DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsububs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubhs DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubuhs DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubuhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubws DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vabsh DoubleRegs:$src1),
- (A2_vabsh DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vabshsat DoubleRegs:$src1),
- (A2_vabshsat DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vabsw DoubleRegs:$src1),
- (A2_vabsw DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vabswsat DoubleRegs:$src1),
- (A2_vabswsat DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vabsdiffw DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vabsdiffw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vabsdiffh DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vabsdiffh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vrsadub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vrsadub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavgub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavgub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vnavgh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vnavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavgwr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavgwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vnavgwr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vnavgwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vnavgwcr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vnavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavghcr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavgubr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavgubr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavguhr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavguhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_round_rr IntRegs:$src1, IntRegs:$src2),
- (A4_round_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_round_rr_sat IntRegs:$src1, IntRegs:$src2),
- (A4_round_rr_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cround_rr IntRegs:$src1, IntRegs:$src2),
- (A4_cround_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrmaxh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrmaxh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrminuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrminuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vminb DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vminb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vmaxb DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vmaxb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vminub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vminub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vminh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vminh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vminw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vminw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vminuw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vminuw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vmaxuw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vmaxuw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_modwrapu IntRegs:$src1, IntRegs:$src2),
- (A4_modwrapu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfadd IntRegs:$src1, IntRegs:$src2),
- (F2_sfadd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfsub IntRegs:$src1, IntRegs:$src2),
- (F2_sfsub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfmpy IntRegs:$src1, IntRegs:$src2),
- (F2_sfmpy IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4),
- (F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, (C2_tfrrp PredRegs:$src4))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffma_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (F2_sffma_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfcmpeq IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (F2_sfcmpeq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfcmpgt IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (F2_sfcmpgt IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfcmpge IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (F2_sfcmpge IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfcmpuo IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (F2_sfcmpuo IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfmax IntRegs:$src1, IntRegs:$src2),
- (F2_sfmax IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfmin IntRegs:$src1, IntRegs:$src2),
- (F2_sfmin IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (C2_tfrpr (F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfimm_p u10_0ImmPred_timm:$src1),
- (F2_sfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfimm_n u10_0ImmPred_timm:$src1),
- (F2_sfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffixupn IntRegs:$src1, IntRegs:$src2),
- (F2_sffixupn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffixupd IntRegs:$src1, IntRegs:$src2),
- (F2_sffixupd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffixupr IntRegs:$src1),
- (F2_sffixupr IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfcmpeq DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (F2_dfcmpeq DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfcmpgt DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (F2_dfcmpgt DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfcmpuo DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_tfrpr (F2_dfcmpuo DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
- (C2_tfrpr (F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfimm_p u10_0ImmPred_timm:$src1),
- (F2_dfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfimm_n u10_0ImmPred_timm:$src1),
- (F2_dfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2df IntRegs:$src1),
- (F2_conv_sf2df IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2sf DoubleRegs:$src1),
- (F2_conv_df2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_uw2sf IntRegs:$src1),
- (F2_conv_uw2sf IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_uw2df IntRegs:$src1),
- (F2_conv_uw2df IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_w2sf IntRegs:$src1),
- (F2_conv_w2sf IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_w2df IntRegs:$src1),
- (F2_conv_w2df IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_ud2sf DoubleRegs:$src1),
- (F2_conv_ud2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_ud2df DoubleRegs:$src1),
- (F2_conv_ud2df DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_d2sf DoubleRegs:$src1),
- (F2_conv_d2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_d2df DoubleRegs:$src1),
- (F2_conv_d2df DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2uw IntRegs:$src1),
- (F2_conv_sf2uw IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2w IntRegs:$src1),
- (F2_conv_sf2w IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2ud IntRegs:$src1),
- (F2_conv_sf2ud IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2d IntRegs:$src1),
- (F2_conv_sf2d IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2uw DoubleRegs:$src1),
- (F2_conv_df2uw DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2w DoubleRegs:$src1),
- (F2_conv_df2w DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2ud DoubleRegs:$src1),
- (F2_conv_df2ud DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2d DoubleRegs:$src1),
- (F2_conv_df2d DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2uw_chop IntRegs:$src1),
- (F2_conv_sf2uw_chop IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2w_chop IntRegs:$src1),
- (F2_conv_sf2w_chop IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2ud_chop IntRegs:$src1),
- (F2_conv_sf2ud_chop IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2d_chop IntRegs:$src1),
- (F2_conv_sf2d_chop IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2uw_chop DoubleRegs:$src1),
- (F2_conv_df2uw_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2w_chop DoubleRegs:$src1),
- (F2_conv_df2w_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2ud_chop DoubleRegs:$src1),
- (F2_conv_df2ud_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2d_chop DoubleRegs:$src1),
- (F2_conv_df2d_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_r IntRegs:$src1, IntRegs:$src2),
- (S2_asr_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_r IntRegs:$src1, IntRegs:$src2),
- (S2_asl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_r IntRegs:$src1, IntRegs:$src2),
- (S2_lsr_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_r IntRegs:$src1, IntRegs:$src2),
- (S2_lsl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2),
- (S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2),
- (S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2),
- (S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2),
- (S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_r_sat IntRegs:$src1, IntRegs:$src2),
- (S2_asr_r_r_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_r_sat IntRegs:$src1, IntRegs:$src2),
- (S2_asl_r_r_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
- (S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
- (S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
- (S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
- (S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_lsli s6_0ImmPred_timm:$src1, IntRegs:$src2),
- (S4_lsli s6_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_xor_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_xor_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_xor_xacc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M4_xor_xacc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vdmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M5_vdmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
+ (M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vmacbsu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M5_vmacbsu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vmpybsu IntRegs:$src1, IntRegs:$src2),
+ (M5_vmpybsu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vmpybuu IntRegs:$src1, IntRegs:$src2),
+ (M5_vmpybuu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vrmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M5_vrmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
+ (M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2),
+ (M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred_timm:$src3),
(S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
-def: Pat<(int_hexagon_S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3),
- (S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_valignrb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
- (S2_valignrb DoubleRegs:$src1, DoubleRegs:$src2, (C2_tfrrp PredRegs:$src3))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3),
- (S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsplicerb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
- (S2_vsplicerb DoubleRegs:$src1, DoubleRegs:$src2, (C2_tfrrp PredRegs:$src3))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsplatrh IntRegs:$src1),
- (S2_vsplatrh IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsplatrb IntRegs:$src1),
- (S2_vsplatrb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4),
- (S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_bitsplit IntRegs:$src1, IntRegs:$src2),
- (A4_bitsplit IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3),
- (S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3),
- (S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4),
- (S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3),
- (S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3),
- (S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3),
- (S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_extract_rp IntRegs:$src1, DoubleRegs:$src2),
- (S4_extract_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2),
- (S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_extractp_rp DoubleRegs:$src1, DoubleRegs:$src2),
- (S4_extractp_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (C2_tfrpr (S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (C2_tfrpr (S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_tstbit_r IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (S2_tstbit_r IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_ntstbit_r IntRegs:$src1, IntRegs:$src2),
- (C2_tfrpr (S4_ntstbit_r IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_setbit_r IntRegs:$src1, IntRegs:$src2),
- (S2_setbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_togglebit_r IntRegs:$src1, IntRegs:$src2),
- (S2_togglebit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_clrbit_r IntRegs:$src1, IntRegs:$src2),
- (S2_clrbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
- (S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
- (S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
(S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2),
- (S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
- (S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_vh DoubleRegs:$src1, IntRegs:$src2),
- (S2_asl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_vh DoubleRegs:$src1, IntRegs:$src2),
- (S2_lsr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2),
- (S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_svw_trun DoubleRegs:$src1, IntRegs:$src2),
- (S2_asr_r_svw_trun DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
(S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_vw DoubleRegs:$src1, IntRegs:$src2),
- (S2_asr_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r IntRegs:$src1, IntRegs:$src2),
+ (S2_asl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_sat IntRegs:$src1, IntRegs:$src2),
+ (S2_asl_r_r_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_vh DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asl_r_vw DoubleRegs:$src1, IntRegs:$src2),
(S2_asl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2),
- (S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2),
- (S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vrndpackwh DoubleRegs:$src1),
- (S2_vrndpackwh DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vrndpackwhs DoubleRegs:$src1),
- (S2_vrndpackwhs DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsxtbh IntRegs:$src1),
- (S2_vsxtbh IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vzxtbh IntRegs:$src1),
- (S2_vzxtbh IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsathub DoubleRegs:$src1),
- (S2_vsathub DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_svsathub IntRegs:$src1),
- (S2_svsathub IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_svsathb IntRegs:$src1),
- (S2_svsathb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsathb DoubleRegs:$src1),
- (S2_vsathb DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vtrunohb DoubleRegs:$src1),
- (S2_vtrunohb DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vtrunowh DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_vtrunowh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vtrunehb DoubleRegs:$src1),
- (S2_vtrunehb DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsxthw IntRegs:$src1),
- (S2_vsxthw IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vzxthw IntRegs:$src1),
- (S2_vzxthw IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsatwh DoubleRegs:$src1),
- (S2_vsatwh DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsatwuh DoubleRegs:$src1),
- (S2_vsatwuh DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_packhl IntRegs:$src1, IntRegs:$src2),
- (S2_packhl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_swiz IntRegs:$src1),
- (A2_swiz IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsathub_nopack DoubleRegs:$src1),
- (S2_vsathub_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsathb_nopack DoubleRegs:$src1),
- (S2_vsathb_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsatwh_nopack DoubleRegs:$src1),
- (S2_vsatwh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsatwuh_nopack DoubleRegs:$src1),
- (S2_vsatwuh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_shuffob DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_shuffob DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_shuffeb DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_shuffeb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_shuffoh DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_shuffoh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S5_popcountp DoubleRegs:$src1),
- (S5_popcountp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_parity IntRegs:$src1, IntRegs:$src2),
- (S4_parity IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_parityp DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_parityp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_clbnorm IntRegs:$src1),
- (S2_clbnorm IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2),
- (S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_clbpnorm DoubleRegs:$src1),
- (S4_clbpnorm DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2),
- (S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_clb IntRegs:$src1),
- (S2_clb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_cl0 IntRegs:$src1),
- (S2_cl0 IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_cl1 IntRegs:$src1),
- (S2_cl1 IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_clbp DoubleRegs:$src1),
- (S2_clbp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_cl0p DoubleRegs:$src1),
- (S2_cl0p DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_cl1p DoubleRegs:$src1),
- (S2_cl1p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+ (S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r IntRegs:$src1, IntRegs:$src2),
+ (S2_asr_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_sat IntRegs:$src1, IntRegs:$src2),
+ (S2_asr_r_r_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_svw_trun DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asr_r_svw_trun DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_vw DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asr_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_brev IntRegs:$src1),
(S2_brev IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_brevp DoubleRegs:$src1),
(S2_brevp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_cl0 IntRegs:$src1),
+ (S2_cl0 IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_cl0p DoubleRegs:$src1),
+ (S2_cl0p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_cl1 IntRegs:$src1),
+ (S2_cl1 IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_cl1p DoubleRegs:$src1),
+ (S2_cl1p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clb IntRegs:$src1),
+ (S2_clb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clbnorm IntRegs:$src1),
+ (S2_clbnorm IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clbp DoubleRegs:$src1),
+ (S2_clbp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clrbit_r IntRegs:$src1, IntRegs:$src2),
+ (S2_clrbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_ct0 IntRegs:$src1),
(S2_ct0 IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_ct1 IntRegs:$src1),
- (S2_ct1 IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_ct0p DoubleRegs:$src1),
(S2_ct0p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_ct1 IntRegs:$src1),
+ (S2_ct1 IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_ct1p DoubleRegs:$src1),
(S2_ct1p DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_interleave DoubleRegs:$src1),
- (S2_interleave DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_deinterleave DoubleRegs:$src1),
(S2_deinterleave DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_Y2_dczeroa IntRegs:$src1),
- (Y2_dczeroa IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3),
+ (S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2),
+ (S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3),
+ (S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4),
+ (S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3),
+ (S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4),
+ (S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_interleave DoubleRegs:$src1),
+ (S2_interleave DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2),
+ (S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r IntRegs:$src1, IntRegs:$src2),
+ (S2_lsl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2),
+ (S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2),
+ (S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+ (S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2),
+ (S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r IntRegs:$src1, IntRegs:$src2),
+ (S2_lsr_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_vh DoubleRegs:$src1, IntRegs:$src2),
+ (S2_lsr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2),
+ (S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_packhl IntRegs:$src1, IntRegs:$src2),
+ (S2_packhl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_parityp DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_parityp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_setbit_r IntRegs:$src1, IntRegs:$src2),
+ (S2_setbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_shuffeb DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_shuffeb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_shuffob DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_shuffob DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_shuffoh DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_shuffoh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_svsathb IntRegs:$src1),
+ (S2_svsathb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_svsathub IntRegs:$src1),
+ (S2_svsathub IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_togglebit_r IntRegs:$src1, IntRegs:$src2),
+ (S2_togglebit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (C2_tfrpr (S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_tstbit_r IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (S2_tstbit_r IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3),
+ (S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_valignrb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+ (S2_valignrb DoubleRegs:$src1, DoubleRegs:$src2, (C2_tfrrp PredRegs:$src3))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vcnegh DoubleRegs:$src1, IntRegs:$src2),
+ (S2_vcnegh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vcrotate DoubleRegs:$src1, IntRegs:$src2),
+ (S2_vcrotate DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vrndpackwh DoubleRegs:$src1),
+ (S2_vrndpackwh DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vrndpackwhs DoubleRegs:$src1),
+ (S2_vrndpackwhs DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsathb DoubleRegs:$src1),
+ (S2_vsathb DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsathb_nopack DoubleRegs:$src1),
+ (S2_vsathb_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsathub DoubleRegs:$src1),
+ (S2_vsathub DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsathub_nopack DoubleRegs:$src1),
+ (S2_vsathub_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsatwh DoubleRegs:$src1),
+ (S2_vsatwh DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsatwh_nopack DoubleRegs:$src1),
+ (S2_vsatwh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsatwuh DoubleRegs:$src1),
+ (S2_vsatwuh DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsatwuh_nopack DoubleRegs:$src1),
+ (S2_vsatwuh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsplatrb IntRegs:$src1),
+ (S2_vsplatrb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsplatrh IntRegs:$src1),
+ (S2_vsplatrh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3),
+ (S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsplicerb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+ (S2_vsplicerb DoubleRegs:$src1, DoubleRegs:$src2, (C2_tfrrp PredRegs:$src3))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsxtbh IntRegs:$src1),
+ (S2_vsxtbh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsxthw IntRegs:$src1),
+ (S2_vsxthw IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vtrunehb DoubleRegs:$src1),
+ (S2_vtrunehb DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vtrunohb DoubleRegs:$src1),
+ (S2_vtrunohb DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vtrunowh DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_vtrunowh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vzxtbh IntRegs:$src1),
+ (S2_vzxtbh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vzxthw IntRegs:$src1),
+ (S2_vzxthw IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2),
+ (S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2),
+ (S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_clbpnorm DoubleRegs:$src1),
+ (S4_clbpnorm DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3),
+ (S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extract_rp IntRegs:$src1, DoubleRegs:$src2),
+ (S4_extract_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3),
+ (S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extractp_rp DoubleRegs:$src1, DoubleRegs:$src2),
+ (S4_extractp_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_lsli s6_0ImmPred_timm:$src1, IntRegs:$src2),
+ (S4_lsli s6_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (C2_tfrpr (S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ntstbit_r IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (S4_ntstbit_r IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_parity IntRegs:$src1, IntRegs:$src2),
+ (S4_parity IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3),
+ (S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3),
+ (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4),
+ (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxaddsubh DoubleRegs:$src1, DoubleRegs:$src2),
+ (S4_vxaddsubh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxaddsubhr DoubleRegs:$src1, DoubleRegs:$src2),
+ (S4_vxaddsubhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2),
+ (S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2),
+ (S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxsubaddhr DoubleRegs:$src1, DoubleRegs:$src2),
+ (S4_vxsubaddhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2),
+ (S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+ (S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S5_popcountp DoubleRegs:$src1),
+ (S5_popcountp DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_Y2_dccleana IntRegs:$src1),
(Y2_dccleana IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_Y2_dccleaninva IntRegs:$src1),
(Y2_dccleaninva IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_Y2_dcfetch IntRegs:$src1),
+ (Y2_dcfetch IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_Y2_dcinva IntRegs:$src1),
(Y2_dcinva IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_Y2_dczeroa IntRegs:$src1),
+ (Y2_dczeroa IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_Y4_l2fetch IntRegs:$src1, IntRegs:$src2),
(Y4_l2fetch IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_Y5_l2fetch IntRegs:$src1, DoubleRegs:$src2),
@@ -1672,30 +1674,30 @@
// V60 Scalar Instructions.
-def: Pat<(int_hexagon_S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV60]>;
def: Pat<(int_hexagon_S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
(S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
def: Pat<(int_hexagon_S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
(S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
def: Pat<(int_hexagon_S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
(S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
def: Pat<(int_hexagon_S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
(S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
// V62 Scalar Instructions.
@@ -1717,59 +1719,23 @@
// V66 Scalar Instructions.
-def: Pat<(int_hexagon_M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV66]>;
def: Pat<(int_hexagon_F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2),
(F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>;
def: Pat<(int_hexagon_F2_dfsub DoubleRegs:$src1, DoubleRegs:$src2),
(F2_dfsub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>;
+def: Pat<(int_hexagon_M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV66]>;
def: Pat<(int_hexagon_S2_mask u5_0ImmPred_timm:$src1, u5_0ImmPred_timm:$src2),
(S2_mask u5_0ImmPred_timm:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV66]>;
// V67 Scalar Instructions.
-def: Pat<(int_hexagon_M7_dcmpyrw DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_dcmpyrw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_dcmpyrw_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M7_dcmpyrw_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_dcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_dcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_dcmpyrwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M7_dcmpyrwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_dcmpyiw DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_dcmpyiw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_dcmpyiw_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M7_dcmpyiw_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_dcmpyiwc DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_dcmpyiwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_dcmpyiwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M7_dcmpyiwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_vdmpy DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_dcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_vdmpy_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M7_dcmpyrwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_wcmpyrw DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_wcmpyrw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_wcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_wcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_wcmpyiw DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_wcmpyiw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_wcmpyiwc DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_wcmpyiwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_wcmpyrw_rnd DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_wcmpyrw_rnd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_wcmpyrwc_rnd DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_wcmpyrwc_rnd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_wcmpyiw_rnd DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_wcmpyiw_rnd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_M7_wcmpyiwc_rnd DoubleRegs:$src1, DoubleRegs:$src2),
- (M7_wcmpyiwc_rnd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_A7_clip IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A7_clip IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV67]>;
def: Pat<(int_hexagon_A7_croundd_ri DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
(A7_croundd_ri DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV67]>;
def: Pat<(int_hexagon_A7_croundd_rr DoubleRegs:$src1, IntRegs:$src2),
(A7_croundd_rr DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_A7_clip IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (A7_clip IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV67]>;
def: Pat<(int_hexagon_A7_vclip DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
(A7_vclip DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV67]>;
def: Pat<(int_hexagon_F2_dfmax DoubleRegs:$src1, DoubleRegs:$src2),
@@ -1778,123 +1744,422 @@
(F2_dfmin DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
def: Pat<(int_hexagon_F2_dfmpyfix DoubleRegs:$src1, DoubleRegs:$src2),
(F2_dfmpyfix DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_F2_dfmpyll DoubleRegs:$src1, DoubleRegs:$src2),
- (F2_dfmpyll DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
-def: Pat<(int_hexagon_F2_dfmpylh DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (F2_dfmpylh DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
def: Pat<(int_hexagon_F2_dfmpyhh DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(F2_dfmpyhh DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_F2_dfmpylh DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (F2_dfmpylh DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_F2_dfmpyll DoubleRegs:$src1, DoubleRegs:$src2),
+ (F2_dfmpyll DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyiw DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_dcmpyiw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyiw_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M7_dcmpyiw_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyiwc DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_dcmpyiwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyiwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M7_dcmpyiwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyrw DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_dcmpyrw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyrw_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M7_dcmpyrw_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_dcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyrwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M7_dcmpyrwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_vdmpy DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_dcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_vdmpy_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M7_dcmpyrwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyiw DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyiw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyiw_rnd DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyiw_rnd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyiwc DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyiwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyiwc_rnd DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyiwc_rnd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyrw DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyrw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyrw_rnd DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyrw_rnd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyrwc_rnd DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyrwc_rnd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+
+// V68 Scalar Instructions.
+
+def: Pat<(int_hexagon_Y6_dmlink IntRegs:$src1, IntRegs:$src2),
+ (Y6_dmlink IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV68]>;
+def: Pat<(int_hexagon_Y6_dmpause ),
+ (Y6_dmpause )>, Requires<[HasV68]>;
+def: Pat<(int_hexagon_Y6_dmpoll ),
+ (Y6_dmpoll )>, Requires<[HasV68]>;
+def: Pat<(int_hexagon_Y6_dmresume IntRegs:$src1),
+ (Y6_dmresume IntRegs:$src1)>, Requires<[HasV68]>;
+def: Pat<(int_hexagon_Y6_dmstart IntRegs:$src1),
+ (Y6_dmstart IntRegs:$src1)>, Requires<[HasV68]>;
+def: Pat<(int_hexagon_Y6_dmwait ),
+ (Y6_dmwait )>, Requires<[HasV68]>;
// V60 HVX Instructions.
-def: Pat<(int_hexagon_V6_vS32b_qpred_ai HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
- (V6_vS32b_qpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vS32b_qpred_ai_128B HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
- (V6_vS32b_qpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_extractw HvxVR:$src1, IntRegs:$src2),
+ (V6_extractw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_extractw_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_extractw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_hi HvxWR:$src1),
+ (V6_hi HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_hi_128B HvxWR:$src1),
+ (V6_hi HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_lo HvxWR:$src1),
+ (V6_lo HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_lo_128B HvxWR:$src1),
+ (V6_lo HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_lvsplatw IntRegs:$src1),
+ (V6_lvsplatw IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_lvsplatw_128B IntRegs:$src1),
+ (V6_lvsplatw IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_and HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_and HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_and_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_and HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_and_n HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_and_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_and_n_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_and_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_not HvxQR:$src1),
+ (V6_pred_not HvxQR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_not_128B HvxQR:$src1),
+ (V6_pred_not HvxQR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_or HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_or HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_or_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_or HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_or_n HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_or_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_or_n_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_or_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_scalar2 IntRegs:$src1),
+ (V6_pred_scalar2 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_scalar2_128B IntRegs:$src1),
+ (V6_pred_scalar2 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_xor HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_xor HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_xor_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_xor HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vS32b_nqpred_ai HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
(V6_vS32b_nqpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vS32b_nqpred_ai_128B HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
(V6_vS32b_nqpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vS32b_nt_qpred_ai HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
- (V6_vS32b_nt_qpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vS32b_nt_qpred_ai_128B HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
- (V6_vS32b_nt_qpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vS32b_nt_nqpred_ai HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
(V6_vS32b_nt_nqpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vS32b_nt_nqpred_ai_128B HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
(V6_vS32b_nt_nqpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vS32b_nt_qpred_ai HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
+ (V6_vS32b_nt_qpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vS32b_nt_qpred_ai_128B HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
+ (V6_vS32b_nt_qpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vS32b_qpred_ai HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
+ (V6_vS32b_qpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vS32b_qpred_ai_128B HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
+ (V6_vS32b_qpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsdiffh HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsdiffh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsdiffub HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsdiffub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsdiffuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsdiffw HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsdiffw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsh HvxVR:$src1),
+ (V6_vabsh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsh_128B HvxVR:$src1),
+ (V6_vabsh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsh_sat HvxVR:$src1),
+ (V6_vabsh_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsh_sat_128B HvxVR:$src1),
+ (V6_vabsh_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsw HvxVR:$src1),
+ (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsw_128B HvxVR:$src1),
+ (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsw_sat HvxVR:$src1),
+ (V6_vabsw_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsw_sat_128B HvxVR:$src1),
+ (V6_vabsw_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddb HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddb_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddb_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddbq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddh HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddh_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddh_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhw HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddubh HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddubh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddubsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddubsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddubsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddubsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddubsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduhsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vadduhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduhsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vadduhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduhw HvxVR:$src1, HvxVR:$src2),
+ (V6_vadduhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduhw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vadduhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddw HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddw_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddw_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddwnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddwq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddwsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddwsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
(V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_valignb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
(V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlalignb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
(V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_valignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
(V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlalignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vror HvxVR:$src1, IntRegs:$src2),
- (V6_vror HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vror_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vror HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vunpackub HvxVR:$src1),
- (V6_vunpackub HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vunpackub_128B HvxVR:$src1),
- (V6_vunpackub HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vunpackb HvxVR:$src1),
- (V6_vunpackb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vunpackb_128B HvxVR:$src1),
- (V6_vunpackb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vunpackuh HvxVR:$src1),
- (V6_vunpackuh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vunpackuh_128B HvxVR:$src1),
- (V6_vunpackuh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vunpackh HvxVR:$src1),
- (V6_vunpackh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vunpackh_128B HvxVR:$src1),
- (V6_vunpackh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vunpackob HvxWR:$src1, HvxVR:$src2),
- (V6_vunpackob HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vunpackob_128B HvxWR:$src1, HvxVR:$src2),
- (V6_vunpackob HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vunpackoh HvxWR:$src1, HvxVR:$src2),
- (V6_vunpackoh HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vunpackoh_128B HvxWR:$src1, HvxVR:$src2),
- (V6_vunpackoh HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackeb HvxVR:$src1, HvxVR:$src2),
- (V6_vpackeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackeb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackeh HvxVR:$src1, HvxVR:$src2),
- (V6_vpackeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackeh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackob HvxVR:$src1, HvxVR:$src2),
- (V6_vpackob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackob_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackoh HvxVR:$src1, HvxVR:$src2),
- (V6_vpackoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackoh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2),
- (V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackhub_sat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2),
- (V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackhb_sat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2),
- (V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackwuh_sat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2),
- (V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackwh_sat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vzb HvxVR:$src1),
- (V6_vzb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vzb_128B HvxVR:$src1),
- (V6_vzb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsb HvxVR:$src1),
- (V6_vsb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsb_128B HvxVR:$src1),
- (V6_vsb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vzh HvxVR:$src1),
- (V6_vzh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vzh_128B HvxVR:$src1),
- (V6_vzh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsh HvxVR:$src1),
- (V6_vsh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsh_128B HvxVR:$src1),
- (V6_vsh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vand HvxVR:$src1, HvxVR:$src2),
+ (V6_vand HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vand_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vand HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandqrt HvxQR:$src1, IntRegs:$src2),
+ (V6_vandqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandqrt_128B HvxQR:$src1, IntRegs:$src2),
+ (V6_vandqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
+ (V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandqrt_acc_128B HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
+ (V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandvrt HvxVR:$src1, IntRegs:$src2),
+ (V6_vandvrt HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandvrt_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vandvrt HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandvrt_acc_128B HvxQR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslh HvxVR:$src1, IntRegs:$src2),
+ (V6_vaslh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslh_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vaslh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslhv HvxVR:$src1, HvxVR:$src2),
+ (V6_vaslhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslhv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaslhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslw HvxVR:$src1, IntRegs:$src2),
+ (V6_vaslw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslw_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vaslw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslw_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslwv HvxVR:$src1, HvxVR:$src2),
+ (V6_vaslwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslwv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaslwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrh HvxVR:$src1, IntRegs:$src2),
+ (V6_vasrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrh_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vasrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhbrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhv HvxVR:$src1, HvxVR:$src2),
+ (V6_vasrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vasrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrw HvxVR:$src1, IntRegs:$src2),
+ (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrw_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrw_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwh_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwv HvxVR:$src1, HvxVR:$src2),
+ (V6_vasrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vasrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vassign HvxVR:$src1),
+ (V6_vassign HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vassign_128B HvxVR:$src1),
+ (V6_vassign HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vassignp HvxWR:$src1),
+ (V6_vassignp HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vassignp_128B HvxWR:$src1),
+ (V6_vassignp HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgh HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavghrnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vavghrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavghrnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavghrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgub HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgubrnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgubrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgubrnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgubrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavguh HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavguh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavguhrnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguhrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavguhrnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguhrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgw HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgwrnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgwrnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vcl0h HvxVR:$src1),
+ (V6_vcl0h HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vcl0h_128B HvxVR:$src1),
+ (V6_vcl0h HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vcl0w HvxVR:$src1),
+ (V6_vcl0w HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vcl0w_128B HvxVR:$src1),
+ (V6_vcl0w HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vcombine HvxVR:$src1, HvxVR:$src2),
+ (V6_vcombine HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vcombine_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vcombine HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vd0 ),
+ (V6_vd0 )>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vd0_128B ),
+ (V6_vd0 )>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdealb HvxVR:$src1),
+ (V6_vdealb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdealb_128B HvxVR:$src1),
+ (V6_vdealb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdealb4w HvxVR:$src1, HvxVR:$src2),
+ (V6_vdealb4w HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdealb4w_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vdealb4w HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdealh HvxVR:$src1),
+ (V6_vdealh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdealh_128B HvxVR:$src1),
+ (V6_vdealh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdealvdd_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdelta HvxVR:$src1, HvxVR:$src2),
+ (V6_vdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdelta_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vdmpybus HvxVR:$src1, IntRegs:$src2),
(V6_vdmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vdmpybus_128B HvxVR:$src1, IntRegs:$src2),
@@ -1927,22 +2192,6 @@
(V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
(V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2),
- (V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhvsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhvsat_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2),
- (V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsat_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsat_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2),
(V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vdmpyhisat_128B HvxWR:$src1, IntRegs:$src2),
@@ -1951,14 +2200,14 @@
(V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vdmpyhisat_acc_128B HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
(V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2),
- (V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsusat_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsusat_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2),
+ (V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsat_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsat_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2),
(V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vdmpyhsuisat_128B HvxWR:$src1, IntRegs:$src2),
@@ -1967,62 +2216,510 @@
(V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vdmpyhsuisat_acc_128B HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
(V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vtmpyb HvxWR:$src1, IntRegs:$src2),
- (V6_vtmpyb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vtmpyb_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vtmpyb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vtmpyb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vtmpybus HvxWR:$src1, IntRegs:$src2),
- (V6_vtmpybus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vtmpybus_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vtmpybus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vtmpybus_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vtmpyhb HvxWR:$src1, IntRegs:$src2),
- (V6_vtmpyhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vtmpyhb_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vtmpyhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vtmpyhb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyub HvxVR:$src1, IntRegs:$src2),
- (V6_vrmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyub_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vrmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyub_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyubv HvxVR:$src1, HvxVR:$src2),
- (V6_vrmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyubv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vrmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyubv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybv HvxVR:$src1, HvxVR:$src2),
- (V6_vrmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vrmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
- (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
- (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
- (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
- (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2),
+ (V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsusat_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsusat_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhvsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhvsat_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdsaduh HvxWR:$src1, IntRegs:$src2),
+ (V6_vdsaduh HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdsaduh_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vdsaduh HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdsaduh_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqb HvxVR:$src1, HvxVR:$src2),
+ (V6_veqb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_veqb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqb_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqb_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqb_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqh HvxVR:$src1, HvxVR:$src2),
+ (V6_veqh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_veqh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqh_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqh_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqw HvxVR:$src1, HvxVR:$src2),
+ (V6_veqw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_veqw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtb HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtb_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtb_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtb_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgth HvxVR:$src1, HvxVR:$src2),
+ (V6_vgth HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgth_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vgth HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgth_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgth_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgth_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtub HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtub_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtub_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtub_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuh_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuh_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuw HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtuw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtuw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtw HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vinsertwr HvxVR:$src1, IntRegs:$src2),
+ (V6_vinsertwr HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vinsertwr_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vinsertwr HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlalignb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlalignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrh HvxVR:$src1, IntRegs:$src2),
+ (V6_vlsrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrh_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vlsrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrhv HvxVR:$src1, HvxVR:$src2),
+ (V6_vlsrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrhv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vlsrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrw HvxVR:$src1, IntRegs:$src2),
+ (V6_vlsrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrw_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vlsrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrwv HvxVR:$src1, HvxVR:$src2),
+ (V6_vlsrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrwv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vlsrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
+ (V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
+ (V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
+ (V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_oracc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
+ (V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxh HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxub HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxw HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminh HvxVR:$src1, HvxVR:$src2),
+ (V6_vminh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vminh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminub HvxVR:$src1, HvxVR:$src2),
+ (V6_vminub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vminub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vminuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vminuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminw HvxVR:$src1, HvxVR:$src2),
+ (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabus HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpabus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabus_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpabus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabus_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabusv HvxWR:$src1, HvxWR:$src2),
+ (V6_vmpabusv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabusv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vmpabusv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabuuv HvxWR:$src1, HvxWR:$src2),
+ (V6_vmpabuuv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabuuv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vmpabuuv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpahb HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpahb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpahb_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpahb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpahb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybus HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybus_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybus_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybusv HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybusv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybusv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybv HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyewuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyewuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyh HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyh_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhsat_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhsrs_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhss HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyhss HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhss_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyhss HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhus HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyhus HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhus_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyhus HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhus_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhv HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhvsrs_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyieoh HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyieoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyieoh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyieoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiewh_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiewuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiewuh_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyih HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyih HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyih_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyih HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyih_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyihb HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyihb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyihb_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyihb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyihb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiowh HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyiowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiowh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyiowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwb HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyiwb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwb_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyiwb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwh HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyiwh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwh_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyiwh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_rnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_rnd_sacc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_sacc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyub HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyub_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyub_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyubv HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyubv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyubv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuh HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyuh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuh_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyuh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuhv HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyuhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuhv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyuhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuhv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmux_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnavgh HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnavgh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnavgub HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnavgub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnavgw HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnavgw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnormamth HvxVR:$src1),
+ (V6_vnormamth HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnormamth_128B HvxVR:$src1),
+ (V6_vnormamth HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnormamtw HvxVR:$src1),
+ (V6_vnormamtw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnormamtw_128B HvxVR:$src1),
+ (V6_vnormamtw HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnot HvxVR:$src1),
+ (V6_vnot HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnot_128B HvxVR:$src1),
+ (V6_vnot HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vor HvxVR:$src1, HvxVR:$src2),
+ (V6_vor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vor_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackeb HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackeb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackeh HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackeh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackhb_sat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackhub_sat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackob HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackob_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackoh HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackoh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackwh_sat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackwuh_sat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpopcounth HvxVR:$src1),
+ (V6_vpopcounth HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpopcounth_128B HvxVR:$src1),
+ (V6_vpopcounth HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrdelta HvxVR:$src1, HvxVR:$src2),
+ (V6_vrdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrdelta_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vrdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vrmpybus HvxVR:$src1, IntRegs:$src2),
(V6_vrmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vrmpybus_128B HvxVR:$src1, IntRegs:$src2),
@@ -2047,106 +2744,42 @@
(V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vrmpybusv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdsaduh HvxWR:$src1, IntRegs:$src2),
- (V6_vdsaduh HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdsaduh_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vdsaduh HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdsaduh_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
- (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrsadubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
- (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
- (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrsadubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
- (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrw HvxVR:$src1, IntRegs:$src2),
- (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrw_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaslw HvxVR:$src1, IntRegs:$src2),
- (V6_vaslw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaslw_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vaslw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlsrw HvxVR:$src1, IntRegs:$src2),
- (V6_vlsrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlsrw_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vlsrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrwv HvxVR:$src1, HvxVR:$src2),
- (V6_vasrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrwv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vasrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaslwv HvxVR:$src1, HvxVR:$src2),
- (V6_vaslwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaslwv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaslwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlsrwv HvxVR:$src1, HvxVR:$src2),
- (V6_vlsrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlsrwv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vlsrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrh HvxVR:$src1, IntRegs:$src2),
- (V6_vasrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrh_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vasrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaslh HvxVR:$src1, IntRegs:$src2),
- (V6_vaslh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaslh_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vaslh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlsrh HvxVR:$src1, IntRegs:$src2),
- (V6_vlsrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlsrh_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vlsrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrhv HvxVR:$src1, HvxVR:$src2),
- (V6_vasrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrhv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vasrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaslhv HvxVR:$src1, HvxVR:$src2),
- (V6_vaslhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaslhv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaslhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlsrhv HvxVR:$src1, HvxVR:$src2),
- (V6_vlsrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlsrhv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vlsrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrwh_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrwhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrwhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vroundwh HvxVR:$src1, HvxVR:$src2),
- (V6_vroundwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vroundwh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vroundwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vroundwuh HvxVR:$src1, HvxVR:$src2),
- (V6_vroundwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vroundwuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vroundwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrhbrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybv HvxVR:$src1, HvxVR:$src2),
+ (V6_vrmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vrmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyub HvxVR:$src1, IntRegs:$src2),
+ (V6_vrmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vrmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubv HvxVR:$src1, HvxVR:$src2),
+ (V6_vrmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vrmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vror HvxVR:$src1, IntRegs:$src2),
+ (V6_vror HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vror_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vror HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vroundhb HvxVR:$src1, HvxVR:$src2),
(V6_vroundhb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vroundhb_128B HvxVR:$src1, HvxVR:$src2),
@@ -2155,710 +2788,22 @@
(V6_vroundhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vroundhub_128B HvxVR:$src1, HvxVR:$src2),
(V6_vroundhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaslw_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrw_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddb HvxVR:$src1, HvxVR:$src2),
- (V6_vaddb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubb HvxVR:$src1, HvxVR:$src2),
- (V6_vsubb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddb_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vaddb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddb_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vaddb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubb_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubb_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddh HvxVR:$src1, HvxVR:$src2),
- (V6_vaddh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubh HvxVR:$src1, HvxVR:$src2),
- (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddh_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddh_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubh_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubh_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddw HvxVR:$src1, HvxVR:$src2),
- (V6_vaddw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubw HvxVR:$src1, HvxVR:$src2),
- (V6_vsubw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddw_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vaddw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddw_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vaddw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubw_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubw_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddubsat HvxVR:$src1, HvxVR:$src2),
- (V6_vaddubsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddubsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddubsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddubsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsububsat HvxVR:$src1, HvxVR:$src2),
- (V6_vsububsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsububsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsububsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsububsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vadduhsat HvxVR:$src1, HvxVR:$src2),
- (V6_vadduhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vadduhsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vadduhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vadduhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubuhsat HvxVR:$src1, HvxVR:$src2),
- (V6_vsubuhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubuhsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubuhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubuhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddhsat HvxVR:$src1, HvxVR:$src2),
- (V6_vaddhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddhsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubhsat HvxVR:$src1, HvxVR:$src2),
- (V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubhsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddwsat HvxVR:$src1, HvxVR:$src2),
- (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddwsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubwsat HvxVR:$src1, HvxVR:$src2),
- (V6_vsubwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubwsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgub HvxVR:$src1, HvxVR:$src2),
- (V6_vavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgubrnd HvxVR:$src1, HvxVR:$src2),
- (V6_vavgubrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgubrnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgubrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavguh HvxVR:$src1, HvxVR:$src2),
- (V6_vavguh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavguh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavguh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavguhrnd HvxVR:$src1, HvxVR:$src2),
- (V6_vavguhrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavguhrnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavguhrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgh HvxVR:$src1, HvxVR:$src2),
- (V6_vavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavghrnd HvxVR:$src1, HvxVR:$src2),
- (V6_vavghrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavghrnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavghrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnavgh HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnavgh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgw HvxVR:$src1, HvxVR:$src2),
- (V6_vavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgwrnd HvxVR:$src1, HvxVR:$src2),
- (V6_vavgwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgwrnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnavgw HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnavgw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsdiffub HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsdiffub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsdiffuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsdiffh HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsdiffh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsdiffw HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsdiffw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnavgub HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnavgub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddubh HvxVR:$src1, HvxVR:$src2),
- (V6_vaddubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddubh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsububh HvxVR:$src1, HvxVR:$src2),
- (V6_vsububh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsububh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsububh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddhw HvxVR:$src1, HvxVR:$src2),
- (V6_vaddhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddhw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubhw HvxVR:$src1, HvxVR:$src2),
- (V6_vsubhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubhw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vadduhw HvxVR:$src1, HvxVR:$src2),
- (V6_vadduhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vadduhw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vadduhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubuhw HvxVR:$src1, HvxVR:$src2),
- (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubuhw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vd0 ),
- (V6_vd0 )>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vd0_128B ),
- (V6_vd0 )>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddbq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubbq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddhq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubhq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddhnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubhnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddwq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubwq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddwnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubwnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsh HvxVR:$src1),
- (V6_vabsh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsh_128B HvxVR:$src1),
- (V6_vabsh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsh_sat HvxVR:$src1),
- (V6_vabsh_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsh_sat_128B HvxVR:$src1),
- (V6_vabsh_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsw HvxVR:$src1),
- (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsw_128B HvxVR:$src1),
- (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsw_sat HvxVR:$src1),
- (V6_vabsw_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsw_sat_128B HvxVR:$src1),
- (V6_vabsw_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpybv HvxVR:$src1, HvxVR:$src2),
- (V6_vmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpybv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpybv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyubv HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyubv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyubv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpybusv HvxVR:$src1, HvxVR:$src2),
- (V6_vmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpybusv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpybusv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpabusv HvxWR:$src1, HvxWR:$src2),
- (V6_vmpabusv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpabusv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vmpabusv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpabuuv HvxWR:$src1, HvxWR:$src2),
- (V6_vmpabuuv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpabuuv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vmpabuuv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhv HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyuhv HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyuhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyuhv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyuhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyuhv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhvsrs_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhus HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyhus HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhus_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyhus HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhus_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyih HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyih HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyih_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyih HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyih_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyewuh HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyewuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyowh HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_rnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_sacc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_rnd_sacc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyieoh HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyieoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyieoh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyieoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiewuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiowh HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyiowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiowh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyiowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiewh_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiewuh_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyub HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyub_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyub_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpybus HvxVR:$src1, IntRegs:$src2),
- (V6_vmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpybus_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpybus_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpabus HvxWR:$src1, IntRegs:$src2),
- (V6_vmpabus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpabus_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vmpabus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpabus_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpahb HvxWR:$src1, IntRegs:$src2),
- (V6_vmpahb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpahb_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vmpahb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpahb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyh HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyh_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhsat_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhss HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyhss HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhss_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyhss HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhsrs_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyuh HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyuh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyuh_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyuh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyuh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyihb HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyihb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyihb_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyihb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyihb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiwb HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyiwb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiwb_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyiwb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiwb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiwh HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyiwh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiwh_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyiwh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiwh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vand HvxVR:$src1, HvxVR:$src2),
- (V6_vand HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vand_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vand HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vor HvxVR:$src1, HvxVR:$src2),
- (V6_vor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vor_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vxor HvxVR:$src1, HvxVR:$src2),
- (V6_vxor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vxor_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vxor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnot HvxVR:$src1),
- (V6_vnot HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnot_128B HvxVR:$src1),
- (V6_vnot HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vandqrt HvxQR:$src1, IntRegs:$src2),
- (V6_vandqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vandqrt_128B HvxQR:$src1, IntRegs:$src2),
- (V6_vandqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
- (V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vandqrt_acc_128B HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
- (V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vandvrt HvxVR:$src1, IntRegs:$src2),
- (V6_vandvrt HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vandvrt_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vandvrt HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vandvrt_acc_128B HvxQR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtw HvxVR:$src1, HvxVR:$src2),
- (V6_vgtw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vgtw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqw HvxVR:$src1, HvxVR:$src2),
- (V6_veqw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_veqw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgth HvxVR:$src1, HvxVR:$src2),
- (V6_vgth HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgth_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vgth HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgth_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgth_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgth_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqh HvxVR:$src1, HvxVR:$src2),
- (V6_veqh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_veqh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqh_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqh_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtb HvxVR:$src1, HvxVR:$src2),
- (V6_vgtb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vgtb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtb_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtb_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtb_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqb HvxVR:$src1, HvxVR:$src2),
- (V6_veqb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_veqb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqb_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqb_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqb_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuw HvxVR:$src1, HvxVR:$src2),
- (V6_vgtuw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vgtuw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuh HvxVR:$src1, HvxVR:$src2),
- (V6_vgtuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vgtuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuh_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuh_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtub HvxVR:$src1, HvxVR:$src2),
- (V6_vgtub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vgtub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtub_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtub_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtub_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_or HvxQR:$src1, HvxQR:$src2),
- (V6_pred_or HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_or_128B HvxQR:$src1, HvxQR:$src2),
- (V6_pred_or HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_and HvxQR:$src1, HvxQR:$src2),
- (V6_pred_and HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_and_128B HvxQR:$src1, HvxQR:$src2),
- (V6_pred_and HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_not HvxQR:$src1),
- (V6_pred_not HvxQR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_not_128B HvxQR:$src1),
- (V6_pred_not HvxQR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_xor HvxQR:$src1, HvxQR:$src2),
- (V6_pred_xor HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_xor_128B HvxQR:$src1, HvxQR:$src2),
- (V6_pred_xor HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_and_n HvxQR:$src1, HvxQR:$src2),
- (V6_pred_and_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_and_n_128B HvxQR:$src1, HvxQR:$src2),
- (V6_pred_and_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_or_n HvxQR:$src1, HvxQR:$src2),
- (V6_pred_or_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_or_n_128B HvxQR:$src1, HvxQR:$src2),
- (V6_pred_or_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_scalar2 IntRegs:$src1),
- (V6_pred_scalar2 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_scalar2_128B IntRegs:$src1),
- (V6_pred_scalar2 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmux_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vswap_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmaxub HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmaxub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vminub HvxVR:$src1, HvxVR:$src2),
- (V6_vminub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vminub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vminub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmaxuh HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmaxuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vminuh HvxVR:$src1, HvxVR:$src2),
- (V6_vminuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vminuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vminuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmaxh HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmaxh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vminh HvxVR:$src1, HvxVR:$src2),
- (V6_vminh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vminh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vminh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmaxw HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmaxw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vminw HvxVR:$src1, HvxVR:$src2),
- (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vminw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vroundwh HvxVR:$src1, HvxVR:$src2),
+ (V6_vroundwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vroundwh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vroundwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vroundwuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vroundwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vroundwuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vroundwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrsadubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrsadubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsathub HvxVR:$src1, HvxVR:$src2),
(V6_vsathub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsathub_128B HvxVR:$src1, HvxVR:$src2),
@@ -2867,177 +2812,229 @@
(V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsatwh_128B HvxVR:$src1, HvxVR:$src2),
(V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshuffeb HvxVR:$src1, HvxVR:$src2),
- (V6_vshuffeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshuffeb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vshuffeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshuffob HvxVR:$src1, HvxVR:$src2),
- (V6_vshuffob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshuffob_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vshuffob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsb HvxVR:$src1),
+ (V6_vsb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsb_128B HvxVR:$src1),
+ (V6_vsb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsh HvxVR:$src1),
+ (V6_vsh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsh_128B HvxVR:$src1),
+ (V6_vsh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vshufeh HvxVR:$src1, HvxVR:$src2),
(V6_vshufeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vshufeh_128B HvxVR:$src1, HvxVR:$src2),
(V6_vshufeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshufoh HvxVR:$src1, HvxVR:$src2),
- (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshufoh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshuffvdd_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdealvdd_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshufoeh HvxVR:$src1, HvxVR:$src2),
- (V6_vshufoeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshufoeh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vshufoeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshufoeb HvxVR:$src1, HvxVR:$src2),
- (V6_vshufoeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshufoeb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vshufoeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdealh HvxVR:$src1),
- (V6_vdealh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdealh_128B HvxVR:$src1),
- (V6_vdealh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdealb HvxVR:$src1),
- (V6_vdealb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdealb_128B HvxVR:$src1),
- (V6_vdealb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdealb4w HvxVR:$src1, HvxVR:$src2),
- (V6_vdealb4w HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdealb4w_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vdealb4w HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshuffh HvxVR:$src1),
- (V6_vshuffh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshuffh_128B HvxVR:$src1),
- (V6_vshuffh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vshuffb HvxVR:$src1),
(V6_vshuffb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vshuffb_128B HvxVR:$src1),
(V6_vshuffb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_extractw HvxVR:$src1, IntRegs:$src2),
- (V6_extractw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_extractw_128B HvxVR:$src1, IntRegs:$src2),
- (V6_extractw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vinsertwr HvxVR:$src1, IntRegs:$src2),
- (V6_vinsertwr HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vinsertwr_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vinsertwr HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_lvsplatw IntRegs:$src1),
- (V6_lvsplatw IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_lvsplatw_128B IntRegs:$src1),
- (V6_lvsplatw IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vassignp HvxWR:$src1),
- (V6_vassignp HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vassignp_128B HvxWR:$src1),
- (V6_vassignp HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vassign HvxVR:$src1),
- (V6_vassign HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vassign_128B HvxVR:$src1),
- (V6_vassign HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vcombine HvxVR:$src1, HvxVR:$src2),
- (V6_vcombine HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vcombine_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vcombine HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdelta HvxVR:$src1, HvxVR:$src2),
- (V6_vdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdelta_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrdelta HvxVR:$src1, HvxVR:$src2),
- (V6_vrdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrdelta_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vrdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vcl0w HvxVR:$src1),
- (V6_vcl0w HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vcl0w_128B HvxVR:$src1),
- (V6_vcl0w HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vcl0h HvxVR:$src1),
- (V6_vcl0h HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vcl0h_128B HvxVR:$src1),
- (V6_vcl0h HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnormamtw HvxVR:$src1),
- (V6_vnormamtw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnormamtw_128B HvxVR:$src1),
- (V6_vnormamtw HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnormamth HvxVR:$src1),
- (V6_vnormamth HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnormamth_128B HvxVR:$src1),
- (V6_vnormamth HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpopcounth HvxVR:$src1),
- (V6_vpopcounth HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpopcounth_128B HvxVR:$src1),
- (V6_vpopcounth HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
- (V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_oracc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
- (V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvwh_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
- (V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvwh_oracc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
- (V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_hi HvxWR:$src1),
- (V6_hi HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_hi_128B HvxWR:$src1),
- (V6_hi HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_lo HvxWR:$src1),
- (V6_lo HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_lo_128B HvxWR:$src1),
- (V6_lo HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffeb HvxVR:$src1, HvxVR:$src2),
+ (V6_vshuffeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffeb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vshuffeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffh HvxVR:$src1),
+ (V6_vshuffh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffh_128B HvxVR:$src1),
+ (V6_vshuffh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffob HvxVR:$src1, HvxVR:$src2),
+ (V6_vshuffob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffob_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vshuffob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffvdd_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshufoeb HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufoeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshufoeb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufoeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshufoeh HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufoeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshufoeh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufoeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshufoh HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshufoh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubb HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubb_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubb_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubbq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubh HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubh_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubh_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhw HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsububh HvxVR:$src1, HvxVR:$src2),
+ (V6_vsububh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsububh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsububh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsububsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsububsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsububsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsububsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsububsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuhsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubuhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuhsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubuhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuhw HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuhw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubw HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubw_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubw_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubwnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubwq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubwsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubwsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vswap_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpyb HvxWR:$src1, IntRegs:$src2),
+ (V6_vtmpyb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpyb_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vtmpyb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpyb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpybus HvxWR:$src1, IntRegs:$src2),
+ (V6_vtmpybus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpybus_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vtmpybus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpybus_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpyhb HvxWR:$src1, IntRegs:$src2),
+ (V6_vtmpyhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpyhb_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vtmpyhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpyhb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackb HvxVR:$src1),
+ (V6_vunpackb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackb_128B HvxVR:$src1),
+ (V6_vunpackb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackh HvxVR:$src1),
+ (V6_vunpackh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackh_128B HvxVR:$src1),
+ (V6_vunpackh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackob HvxWR:$src1, HvxVR:$src2),
+ (V6_vunpackob HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackob_128B HvxWR:$src1, HvxVR:$src2),
+ (V6_vunpackob HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackoh HvxWR:$src1, HvxVR:$src2),
+ (V6_vunpackoh HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackoh_128B HvxWR:$src1, HvxVR:$src2),
+ (V6_vunpackoh HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackub HvxVR:$src1),
+ (V6_vunpackub HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackub_128B HvxVR:$src1),
+ (V6_vunpackub HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackuh HvxVR:$src1),
+ (V6_vunpackuh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackuh_128B HvxVR:$src1),
+ (V6_vunpackuh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vxor HvxVR:$src1, HvxVR:$src2),
+ (V6_vxor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vxor_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vxor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vzb HvxVR:$src1),
+ (V6_vzb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vzb_128B HvxVR:$src1),
+ (V6_vzb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vzh HvxVR:$src1),
+ (V6_vzh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vzh_128B HvxVR:$src1),
+ (V6_vzh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
// V62 HVX Instructions.
-def: Pat<(int_hexagon_V6_vlsrb HvxVR:$src1, IntRegs:$src2),
- (V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlsrb_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasruwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrhbsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrounduwuh HvxVR:$src1, HvxVR:$src2),
- (V6_vrounduwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrounduwuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vrounduwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrounduhub HvxVR:$src1, HvxVR:$src2),
- (V6_vrounduhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrounduhub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vrounduhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vadduwsat HvxVR:$src1, HvxVR:$src2),
- (V6_vadduwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vadduwsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vadduwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vadduwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubuwsat HvxVR:$src1, HvxVR:$src2),
- (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubuwsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubuwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_lvsplatb IntRegs:$src1),
+ (V6_lvsplatb IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_lvsplatb_128B IntRegs:$src1),
+ (V6_lvsplatb IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_lvsplath IntRegs:$src1),
+ (V6_lvsplath IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_lvsplath_128B IntRegs:$src1),
+ (V6_lvsplath IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_scalar2v2 IntRegs:$src1),
+ (V6_pred_scalar2v2 IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_scalar2v2_128B IntRegs:$src1),
+ (V6_pred_scalar2v2 IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_shuffeqh HvxQR:$src1, HvxQR:$src2),
+ (V6_shuffeqh HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_shuffeqh_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_shuffeqh HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_shuffeqw HvxQR:$src1, HvxQR:$src2),
+ (V6_shuffeqw HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_shuffeqw_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_shuffeqw HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vaddbsat HvxVR:$src1, HvxVR:$src2),
(V6_vaddbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddbsat_128B HvxVR:$src1, HvxVR:$src2),
@@ -3046,66 +3043,42 @@
(V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddbsat_dv_128B HvxWR:$src1, HvxWR:$src2),
(V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubbsat HvxVR:$src1, HvxVR:$src2),
- (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubbsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubbsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
(V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddcarry_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
(V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
- (V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubcarry_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
- (V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2),
- (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddububb_sat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2),
- (V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubububb_sat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddclbh HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddclbh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddclbh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddclbh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddclbw HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddclbw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddclbw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddclbw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddhw_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vadduhw_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddubh_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyewuh_64_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_64_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpauhb HvxWR:$src1, IntRegs:$src2),
- (V6_vmpauhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpauhb_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vmpauhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpauhb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiwub HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyiwub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiwub_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyiwub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiwub_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddububb_sat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduhw_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduwsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vadduwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduwsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vadduwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vandnqrt HvxQR:$src1, IntRegs:$src2),
(V6_vandnqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vandnqrt_128B HvxQR:$src1, IntRegs:$src2),
@@ -3114,26 +3087,54 @@
(V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vandnqrt_acc_128B HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
(V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vandvqv HvxQR:$src1, HvxVR:$src2),
- (V6_vandvqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vandvqv_128B HvxQR:$src1, HvxVR:$src2),
- (V6_vandvqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vandvnqv HvxQR:$src1, HvxVR:$src2),
(V6_vandvnqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vandvnqv_128B HvxQR:$src1, HvxVR:$src2),
(V6_vandvnqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_scalar2v2 IntRegs:$src1),
- (V6_pred_scalar2v2 IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_scalar2v2_128B IntRegs:$src1),
- (V6_pred_scalar2v2 IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_shuffeqw HvxQR:$src1, HvxQR:$src2),
- (V6_shuffeqw HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_shuffeqw_128B HvxQR:$src1, HvxQR:$src2),
- (V6_shuffeqw HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_shuffeqh HvxQR:$src1, HvxQR:$src2),
- (V6_shuffeqh HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_shuffeqh_128B HvxQR:$src1, HvxQR:$src2),
- (V6_shuffeqh HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandvqv HvxQR:$src1, HvxVR:$src2),
+ (V6_vandvqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandvqv_128B HvxQR:$src1, HvxVR:$src2),
+ (V6_vandvqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhbsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasruwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrb HvxVR:$src1, IntRegs:$src2),
+ (V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrb_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_nm_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
+ (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracci_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
+ (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_nm_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
+ (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_oracci_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
+ (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwhi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmaxb HvxVR:$src1, HvxVR:$src2),
(V6_vmaxb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmaxb_128B HvxVR:$src1, HvxVR:$src2),
@@ -3142,97 +3143,69 @@
(V6_vminb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vminb_128B HvxVR:$src1, HvxVR:$src2),
(V6_vminb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpauhb HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpauhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpauhb_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpauhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpauhb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyewuh_64_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwub HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyiwub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwub_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyiwub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwub_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_64_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrounduhub HvxVR:$src1, HvxVR:$src2),
+ (V6_vrounduhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrounduhub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vrounduhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrounduwuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vrounduwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrounduwuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vrounduwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsatuwuh HvxVR:$src1, HvxVR:$src2),
(V6_vsatuwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsatuwuh_128B HvxVR:$src1, HvxVR:$src2),
(V6_vsatuwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_lvsplath IntRegs:$src1),
- (V6_lvsplath IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_lvsplath_128B IntRegs:$src1),
- (V6_lvsplath IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_lvsplatb IntRegs:$src1),
- (V6_lvsplatb IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_lvsplatb_128B IntRegs:$src1),
- (V6_lvsplatb IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddclbw HvxVR:$src1, HvxVR:$src2),
- (V6_vaddclbw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddclbw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddclbw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddclbh HvxVR:$src1, HvxVR:$src2),
- (V6_vaddclbh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddclbh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddclbh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvvbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
- (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_oracci_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
- (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvwhi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
- (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvwh_oracci_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
- (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_nm_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvwh_nm_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubbsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubbsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubbsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+ (V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubcarry_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+ (V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubububb_sat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuwsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuwsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
// V65 HVX Instructions.
-def: Pat<(int_hexagon_V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasruwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasruhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasruhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaslh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavguw HvxVR:$src1, HvxVR:$src2),
- (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavguw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavguwrnd HvxVR:$src1, HvxVR:$src2),
- (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavguwrnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgb HvxVR:$src1, HvxVR:$src2),
- (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgbrnd HvxVR:$src1, HvxVR:$src2),
- (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgbrnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnavgb HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnavgb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdd0 ),
- (V6_vdd0 )>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdd0_128B ),
- (V6_vdd0 )>, Requires<[HasV65, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vabsb HvxVR:$src1),
(V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vabsb_128B HvxVR:$src1),
@@ -3241,6 +3214,50 @@
(V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vabsb_sat_128B HvxVR:$src1),
(V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasruhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasruhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasruwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgb HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgbrnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgbrnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavguw HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavguw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavguwrnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavguwrnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdd0 ),
+ (V6_vdd0 )>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdd0_128B ),
+ (V6_vdd0 )>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlut4 HvxVR:$src1, DoubleRegs:$src2),
+ (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlut4_128B HvxVR:$src1, DoubleRegs:$src2),
+ (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpabuu HvxWR:$src1, IntRegs:$src2),
(V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpabuu_128B HvxWR:$src1, IntRegs:$src2),
@@ -3249,10 +3266,6 @@
(V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpabuu_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
(V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
(V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpahhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
@@ -3265,10 +3278,10 @@
(V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpsuhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
(V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlut4 HvxVR:$src1, DoubleRegs:$src2),
- (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlut4_128B HvxVR:$src1, DoubleRegs:$src2),
- (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpyuhe HvxVR:$src1, IntRegs:$src2),
(V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpyuhe_128B HvxVR:$src1, IntRegs:$src2),
@@ -3277,42 +3290,10 @@
(V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpyuhe_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
(V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vscattermw_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vscattermh_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vscattermw_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vscattermh_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
- (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vscattermwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
- (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
- (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vscattermhq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
- (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
- (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vscattermhw_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
- (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5),
- (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vscattermhwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5),
- (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
- (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vscattermhw_add_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
- (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnavgb HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnavgb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vprefixqb HvxQR:$src1),
(V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vprefixqb_128B HvxQR:$src1),
@@ -3325,22 +3306,77 @@
(V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vprefixqw_128B HvxQR:$src1),
(V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermh_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermh_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
+ (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermhq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
+ (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
+ (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermhw_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
+ (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
+ (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermhw_add_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
+ (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5),
+ (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermhwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5),
+ (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermw_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermw_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
+ (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
+ (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX128B]>;
// V66 HVX Instructions.
-def: Pat<(int_hexagon_V6_vrotr HvxVR:$src1, HvxVR:$src2),
- (V6_vrotr HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrotr_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vrotr HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV66, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasr_into_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV66, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
(V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV66, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddcarrysat_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
(V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV66, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV66, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasr_into_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV66, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrotr HvxVR:$src1, HvxVR:$src2),
+ (V6_vrotr HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrotr_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vrotr HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsatdw HvxVR:$src1, HvxVR:$src2),
(V6_vsatdw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsatdw_128B HvxVR:$src1, HvxVR:$src2),
(V6_vsatdw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX128B]>;
+
+// V68 HVX Instructions.
+
+def: Pat<(int_hexagon_V6_v6mpyhubs10 HvxWR:$src1, HvxWR:$src2, u2_0ImmPred_timm:$src3),
+ (V6_v6mpyhubs10 HvxWR:$src1, HvxWR:$src2, u2_0ImmPred_timm:$src3)>, Requires<[HasV68, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_v6mpyhubs10_128B HvxWR:$src1, HvxWR:$src2, u2_0ImmPred_timm:$src3),
+ (V6_v6mpyhubs10 HvxWR:$src1, HvxWR:$src2, u2_0ImmPred_timm:$src3)>, Requires<[HasV68, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_v6mpyhubs10_vxx HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4),
+ (V6_v6mpyhubs10_vxx HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV68, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_v6mpyhubs10_vxx_128B HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4),
+ (V6_v6mpyhubs10_vxx HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV68, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_v6mpyvubs10 HvxWR:$src1, HvxWR:$src2, u2_0ImmPred_timm:$src3),
+ (V6_v6mpyvubs10 HvxWR:$src1, HvxWR:$src2, u2_0ImmPred_timm:$src3)>, Requires<[HasV68, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_v6mpyvubs10_128B HvxWR:$src1, HvxWR:$src2, u2_0ImmPred_timm:$src3),
+ (V6_v6mpyvubs10 HvxWR:$src1, HvxWR:$src2, u2_0ImmPred_timm:$src3)>, Requires<[HasV68, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_v6mpyvubs10_vxx HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4),
+ (V6_v6mpyvubs10_vxx HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV68, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_v6mpyvubs10_vxx_128B HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4),
+ (V6_v6mpyvubs10_vxx HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV68, UseHVX128B]>;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepMappings.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepMappings.td
index 3fca1ae..919cb99 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepMappings.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepMappings.td
@@ -197,6 +197,8 @@
def V6_stu0Alias : InstAlias<"vmemu($Rt32) = $Vs32", (V6_vS32Ub_ai IntRegs:$Rt32, 0, HvxVR:$Vs32)>, Requires<[UseHVX]>;
def V6_stunp0Alias : InstAlias<"if (!$Pv4) vmemu($Rt32) = $Vs32", (V6_vS32Ub_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, HvxVR:$Vs32)>, Requires<[UseHVX]>;
def V6_stup0Alias : InstAlias<"if ($Pv4) vmemu($Rt32) = $Vs32", (V6_vS32Ub_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, HvxVR:$Vs32)>, Requires<[UseHVX]>;
+def V6_v6mpyhubs10_altAlias : InstAlias<"$Vdd32.w = v6mpy($Vuu32.ub,$Vvv32.b10,#$Ii):h", (V6_v6mpyhubs10 HvxWR:$Vdd32, HvxWR:$Vuu32, HvxWR:$Vvv32, u2_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_v6mpyvubs10_altAlias : InstAlias<"$Vdd32.w = v6mpy($Vuu32.ub,$Vvv32.b10,#$Ii):v", (V6_v6mpyvubs10 HvxWR:$Vdd32, HvxWR:$Vuu32, HvxWR:$Vvv32, u2_0Imm:$Ii)>, Requires<[UseHVX]>;
def V6_vabsb_altAlias : InstAlias<"$Vd32 = vabsb($Vu32)", (V6_vabsb HvxVR:$Vd32, HvxVR:$Vu32)>, Requires<[UseHVX]>;
def V6_vabsb_sat_altAlias : InstAlias<"$Vd32 = vabsb($Vu32):sat", (V6_vabsb_sat HvxVR:$Vd32, HvxVR:$Vu32)>, Requires<[UseHVX]>;
def V6_vabsdiffh_altAlias : InstAlias<"$Vd32 = vabsdiffh($Vu32,$Vv32)", (V6_vabsdiffh HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepMask.h b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepMask.h
index 742fe2d..45e1a1e 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepMask.h
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepMask.h
@@ -8,10 +8,10 @@
// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
+
#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPMASK_H
#define LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPMASK_H
-// clang-format off
HexagonInstruction InstructionEncodings[] = {
{ /*Tag:A2_addi*/
/*Rd32=add(Rs32,#s16)*/
@@ -2816,6 +2816,5 @@
0x00002404,
0 }
};
-// clang-format off
#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPMASK_H
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepOperands.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepOperands.td
index 6ef668d..4bdda2c 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepOperands.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepOperands.td
@@ -13,120 +13,120 @@
def _timm : PatLeaf<(vt timm), pred>;
}
+def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand; let DecoderMethod = "s6_0ImmDecoder"; }
+defm s6_0ImmPred : ImmOpPred<[{ return isShiftedInt<6, 0>(N->getSExtValue());}]>;
def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; let RenderMethod = "addSignedImmOperands"; }
def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand; let DecoderMethod = "s32_0ImmDecoder"; }
defm s32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
-def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand; let DecoderMethod = "s8_0ImmDecoder"; }
-defm s8_0ImmPred : ImmOpPred<[{ return isShiftedInt<8, 0>(N->getSExtValue());}]>;
-def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; let RenderMethod = "addImmOperands"; }
-def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u16_0ImmPred : ImmOpPred<[{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>;
-def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; let RenderMethod = "addImmOperands"; }
-def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u5_0ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>;
-def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; let RenderMethod = "addImmOperands"; }
-def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u8_0ImmPred : ImmOpPred<[{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>;
-def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; let RenderMethod = "addImmOperands"; }
-def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u32_0ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 0>(N->getSExtValue());}]>;
-def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; let RenderMethod = "addImmOperands"; }
-def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u26_6ImmPred : ImmOpPred<[{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>;
-def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; let RenderMethod = "addImmOperands"; }
-def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u7_0ImmPred : ImmOpPred<[{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>;
-def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; let RenderMethod = "addImmOperands"; }
-def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u6_0ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>;
def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; let RenderMethod = "addImmOperands"; }
def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
defm u10_0ImmPred : ImmOpPred<[{ return isShiftedUInt<10, 0>(N->getSExtValue());}]>;
+def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; let RenderMethod = "addImmOperands"; }
+def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u32_0ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 0>(N->getSExtValue());}]>;
+def m32_0ImmOperand : AsmOperandClass { let Name = "m32_0Imm"; let RenderMethod = "addImmOperands"; }
+def m32_0Imm : Operand<i32> { let ParserMatchClass = m32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm m32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
+def b13_2ImmOperand : AsmOperandClass { let Name = "b13_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b13_2Imm : Operand<OtherVT> { let ParserMatchClass = b13_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+defm b13_2ImmPred : ImmOpPred<[{ return isShiftedInt<13, 2>(N->getSExtValue());}]>;
+def b15_2ImmOperand : AsmOperandClass { let Name = "b15_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b15_2Imm : Operand<OtherVT> { let ParserMatchClass = b15_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+defm b15_2ImmPred : ImmOpPred<[{ return isShiftedInt<15, 2>(N->getSExtValue());}]>;
def a30_2ImmOperand : AsmOperandClass { let Name = "a30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
def a30_2Imm : Operand<i32> { let ParserMatchClass = a30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
defm a30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
def b30_2ImmOperand : AsmOperandClass { let Name = "b30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
def b30_2Imm : Operand<OtherVT> { let ParserMatchClass = b30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
defm b30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
-def b15_2ImmOperand : AsmOperandClass { let Name = "b15_2Imm"; let RenderMethod = "addSignedImmOperands"; }
-def b15_2Imm : Operand<OtherVT> { let ParserMatchClass = b15_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
-defm b15_2ImmPred : ImmOpPred<[{ return isShiftedInt<15, 2>(N->getSExtValue());}]>;
-def b13_2ImmOperand : AsmOperandClass { let Name = "b13_2Imm"; let RenderMethod = "addSignedImmOperands"; }
-def b13_2Imm : Operand<OtherVT> { let ParserMatchClass = b13_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
-defm b13_2ImmPred : ImmOpPred<[{ return isShiftedInt<13, 2>(N->getSExtValue());}]>;
-def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; let DecoderMethod = "s4_0ImmDecoder"; }
-defm s4_0ImmPred : ImmOpPred<[{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;
def s31_1ImmOperand : AsmOperandClass { let Name = "s31_1Imm"; let RenderMethod = "addSignedImmOperands"; }
def s31_1Imm : Operand<i32> { let ParserMatchClass = s31_1ImmOperand; let DecoderMethod = "s31_1ImmDecoder"; }
defm s31_1ImmPred : ImmOpPred<[{ return isShiftedInt<32, 1>(N->getSExtValue());}]>;
-def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand; let DecoderMethod = "s4_1ImmDecoder"; }
-defm s4_1ImmPred : ImmOpPred<[{ return isShiftedInt<4, 1>(N->getSExtValue());}]>;
def s30_2ImmOperand : AsmOperandClass { let Name = "s30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
def s30_2Imm : Operand<i32> { let ParserMatchClass = s30_2ImmOperand; let DecoderMethod = "s30_2ImmDecoder"; }
defm s30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
-def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand; let DecoderMethod = "s4_2ImmDecoder"; }
-defm s4_2ImmPred : ImmOpPred<[{ return isShiftedInt<4, 2>(N->getSExtValue());}]>;
def s29_3ImmOperand : AsmOperandClass { let Name = "s29_3Imm"; let RenderMethod = "addSignedImmOperands"; }
def s29_3Imm : Operand<i32> { let ParserMatchClass = s29_3ImmOperand; let DecoderMethod = "s29_3ImmDecoder"; }
defm s29_3ImmPred : ImmOpPred<[{ return isShiftedInt<32, 3>(N->getSExtValue());}]>;
-def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand; let DecoderMethod = "s4_3ImmDecoder"; }
-defm s4_3ImmPred : ImmOpPred<[{ return isShiftedInt<4, 3>(N->getSExtValue());}]>;
-def u29_3ImmOperand : AsmOperandClass { let Name = "u29_3Imm"; let RenderMethod = "addImmOperands"; }
-def u29_3Imm : Operand<i32> { let ParserMatchClass = u29_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u29_3ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>;
-def u31_1ImmOperand : AsmOperandClass { let Name = "u31_1Imm"; let RenderMethod = "addImmOperands"; }
-def u31_1Imm : Operand<i32> { let ParserMatchClass = u31_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u31_1ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>;
-def u30_2ImmOperand : AsmOperandClass { let Name = "u30_2Imm"; let RenderMethod = "addImmOperands"; }
-def u30_2Imm : Operand<i32> { let ParserMatchClass = u30_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u30_2ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>;
-def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; let RenderMethod = "addImmOperands"; }
-def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u2_0ImmPred : ImmOpPred<[{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>;
-def m32_0ImmOperand : AsmOperandClass { let Name = "m32_0Imm"; let RenderMethod = "addImmOperands"; }
-def m32_0Imm : Operand<i32> { let ParserMatchClass = m32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm m32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
-def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; let RenderMethod = "addImmOperands"; }
-def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u6_2ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>;
-def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; let RenderMethod = "addImmOperands"; }
-def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u3_0ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>;
-def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; let RenderMethod = "addImmOperands"; }
-def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u11_3ImmPred : ImmOpPred<[{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>;
-def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; let RenderMethod = "addImmOperands"; }
-def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u4_0ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 0>(N->getSExtValue());}]>;
-def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand; let DecoderMethod = "s6_0ImmDecoder"; }
-defm s6_0ImmPred : ImmOpPred<[{ return isShiftedInt<6, 0>(N->getSExtValue());}]>;
-def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; let RenderMethod = "addImmOperands"; }
-def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u6_1ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 1>(N->getSExtValue());}]>;
-def u4_2ImmOperand : AsmOperandClass { let Name = "u4_2Imm"; let RenderMethod = "addImmOperands"; }
-def u4_2Imm : Operand<i32> { let ParserMatchClass = u4_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u4_2ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>;
-def u5_3ImmOperand : AsmOperandClass { let Name = "u5_3Imm"; let RenderMethod = "addImmOperands"; }
-def u5_3Imm : Operand<i32> { let ParserMatchClass = u5_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u5_3ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>;
-def u3_1ImmOperand : AsmOperandClass { let Name = "u3_1Imm"; let RenderMethod = "addImmOperands"; }
-def u3_1Imm : Operand<i32> { let ParserMatchClass = u3_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u3_1ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>;
-def u5_2ImmOperand : AsmOperandClass { let Name = "u5_2Imm"; let RenderMethod = "addImmOperands"; }
-def u5_2Imm : Operand<i32> { let ParserMatchClass = u5_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u5_2ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 2>(N->getSExtValue());}]>;
-def s6_3ImmOperand : AsmOperandClass { let Name = "s6_3Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s6_3Imm : Operand<i32> { let ParserMatchClass = s6_3ImmOperand; let DecoderMethod = "s6_3ImmDecoder"; }
-defm s6_3ImmPred : ImmOpPred<[{ return isShiftedInt<6, 3>(N->getSExtValue());}]>;
def s3_0ImmOperand : AsmOperandClass { let Name = "s3_0Imm"; let RenderMethod = "addSignedImmOperands"; }
def s3_0Imm : Operand<i32> { let ParserMatchClass = s3_0ImmOperand; let DecoderMethod = "s3_0ImmDecoder"; }
defm s3_0ImmPred : ImmOpPred<[{ return isShiftedInt<3, 0>(N->getSExtValue());}]>;
+def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; let DecoderMethod = "s4_0ImmDecoder"; }
+defm s4_0ImmPred : ImmOpPred<[{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;
+def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand; let DecoderMethod = "s4_1ImmDecoder"; }
+defm s4_1ImmPred : ImmOpPred<[{ return isShiftedInt<4, 1>(N->getSExtValue());}]>;
+def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand; let DecoderMethod = "s4_2ImmDecoder"; }
+defm s4_2ImmPred : ImmOpPred<[{ return isShiftedInt<4, 2>(N->getSExtValue());}]>;
+def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand; let DecoderMethod = "s4_3ImmDecoder"; }
+defm s4_3ImmPred : ImmOpPred<[{ return isShiftedInt<4, 3>(N->getSExtValue());}]>;
+def s6_3ImmOperand : AsmOperandClass { let Name = "s6_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s6_3Imm : Operand<i32> { let ParserMatchClass = s6_3ImmOperand; let DecoderMethod = "s6_3ImmDecoder"; }
+defm s6_3ImmPred : ImmOpPred<[{ return isShiftedInt<6, 3>(N->getSExtValue());}]>;
+def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand; let DecoderMethod = "s8_0ImmDecoder"; }
+defm s8_0ImmPred : ImmOpPred<[{ return isShiftedInt<8, 0>(N->getSExtValue());}]>;
def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; let RenderMethod = "addImmOperands"; }
def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
defm u1_0ImmPred : ImmOpPred<[{ return isShiftedUInt<1, 0>(N->getSExtValue());}]>;
+def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; let RenderMethod = "addImmOperands"; }
+def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u11_3ImmPred : ImmOpPred<[{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>;
+def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; let RenderMethod = "addImmOperands"; }
+def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u16_0ImmPred : ImmOpPred<[{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>;
+def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; let RenderMethod = "addImmOperands"; }
+def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u2_0ImmPred : ImmOpPred<[{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>;
+def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; let RenderMethod = "addImmOperands"; }
+def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u26_6ImmPred : ImmOpPred<[{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>;
+def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; let RenderMethod = "addImmOperands"; }
+def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u3_0ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>;
+def u3_1ImmOperand : AsmOperandClass { let Name = "u3_1Imm"; let RenderMethod = "addImmOperands"; }
+def u3_1Imm : Operand<i32> { let ParserMatchClass = u3_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u3_1ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>;
+def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; let RenderMethod = "addImmOperands"; }
+def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u4_0ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 0>(N->getSExtValue());}]>;
+def u4_2ImmOperand : AsmOperandClass { let Name = "u4_2Imm"; let RenderMethod = "addImmOperands"; }
+def u4_2Imm : Operand<i32> { let ParserMatchClass = u4_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u4_2ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>;
+def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; let RenderMethod = "addImmOperands"; }
+def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u5_0ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>;
+def u5_2ImmOperand : AsmOperandClass { let Name = "u5_2Imm"; let RenderMethod = "addImmOperands"; }
+def u5_2Imm : Operand<i32> { let ParserMatchClass = u5_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u5_2ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 2>(N->getSExtValue());}]>;
+def u5_3ImmOperand : AsmOperandClass { let Name = "u5_3Imm"; let RenderMethod = "addImmOperands"; }
+def u5_3Imm : Operand<i32> { let ParserMatchClass = u5_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u5_3ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>;
+def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; let RenderMethod = "addImmOperands"; }
+def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u6_0ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>;
+def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; let RenderMethod = "addImmOperands"; }
+def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u6_1ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 1>(N->getSExtValue());}]>;
+def u31_1ImmOperand : AsmOperandClass { let Name = "u31_1Imm"; let RenderMethod = "addImmOperands"; }
+def u31_1Imm : Operand<i32> { let ParserMatchClass = u31_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u31_1ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>;
+def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; let RenderMethod = "addImmOperands"; }
+def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u6_2ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>;
+def u30_2ImmOperand : AsmOperandClass { let Name = "u30_2Imm"; let RenderMethod = "addImmOperands"; }
+def u30_2Imm : Operand<i32> { let ParserMatchClass = u30_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u30_2ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>;
+def u29_3ImmOperand : AsmOperandClass { let Name = "u29_3Imm"; let RenderMethod = "addImmOperands"; }
+def u29_3Imm : Operand<i32> { let ParserMatchClass = u29_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u29_3ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>;
+def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; let RenderMethod = "addImmOperands"; }
+def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u7_0ImmPred : ImmOpPred<[{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>;
+def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; let RenderMethod = "addImmOperands"; }
+def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u8_0ImmPred : ImmOpPred<[{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h
index dba3923..1afe0f0 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h
@@ -95,6 +95,16 @@
}
}
+inline bool is_TC2early(unsigned SchedClass) {
+ switch (SchedClass) {
+ case Hexagon::Sched::tc_45f9d1be:
+ case Hexagon::Sched::tc_a4ee89db:
+ return true;
+ default:
+ return false;
+ }
+}
+
inline bool is_TC3x(unsigned SchedClass) {
switch (SchedClass) {
case Hexagon::Sched::tc_01e1be3b:
@@ -126,16 +136,6 @@
}
}
-inline bool is_TC2early(unsigned SchedClass) {
- switch (SchedClass) {
- case Hexagon::Sched::tc_45f9d1be:
- case Hexagon::Sched::tc_a4ee89db:
- return true;
- default:
- return false;
- }
-}
-
inline bool is_TC4x(unsigned SchedClass) {
switch (SchedClass) {
case Hexagon::Sched::tc_02fe1c65:
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index a62610a..5b78254 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -576,7 +576,7 @@
const auto &MFI = MF.getFrameInfo();
const auto &HST = MF.getSubtarget<HexagonSubtarget>();
assert(!MFI.hasVarSizedObjects() &&
- !HST.getRegisterInfo()->needsStackRealignment(MF));
+ !HST.getRegisterInfo()->hasStackRealignment(MF));
return F.hasFnAttribute(Attribute::NoReturn) &&
F.hasFnAttribute(Attribute::NoUnwind) &&
!F.hasFnAttribute(Attribute::UWTable) && HST.noreturnStackElim() &&
@@ -1145,7 +1145,7 @@
auto &MFI = MF.getFrameInfo();
auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
- bool HasExtraAlign = HRI.needsStackRealignment(MF);
+ bool HasExtraAlign = HRI.hasStackRealignment(MF);
bool HasAlloca = MFI.hasVarSizedObjects();
// Insert ALLOCFRAME if we need to or at -O0 for the debugger. Think
@@ -1265,7 +1265,7 @@
int Offset = MFI.getObjectOffset(FI);
bool HasAlloca = MFI.hasVarSizedObjects();
- bool HasExtraAlign = HRI.needsStackRealignment(MF);
+ bool HasExtraAlign = HRI.hasStackRealignment(MF);
bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None;
auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
@@ -1552,7 +1552,7 @@
auto *NewMMO = MF.getMachineMemOperand(
MMO->getPointerInfo(), MMO->getFlags(), MMO->getSize(),
MFI.getObjectAlign(FI), MMO->getAAInfo(), MMO->getRanges(),
- MMO->getSyncScopeID(), MMO->getOrdering(),
+ MMO->getSyncScopeID(), MMO->getSuccessOrdering(),
MMO->getFailureOrdering());
new_memops.push_back(NewMMO);
KeepOld = false;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
index 9585b14..07f85e6 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -19,8 +19,6 @@
// the predicate register, they cannot use the .new form. In such cases it
// is better to collapse them back to a single MUX instruction.
-#define DEBUG_TYPE "hexmux"
-
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
@@ -47,6 +45,8 @@
#include <limits>
#include <utility>
+#define DEBUG_TYPE "hexmux"
+
using namespace llvm;
namespace llvm {
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 2f23e86..58f3cd5 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -1098,12 +1098,11 @@
for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
E = MRI->use_end(); I != E; I = nextI) {
nextI = std::next(I); // I is invalidated by the setReg
- MachineOperand &Use = *I;
MachineInstr *UseMI = I->getParent();
if (UseMI == MI)
continue;
- if (Use.isDebug())
- UseMI->getOperand(0).setReg(0U);
+ if (I->isDebug())
+ I->setReg(0U);
}
}
@@ -1370,10 +1369,8 @@
LLVM_DEBUG(dbgs() << "\nhw_loop head, "
<< printMBBReference(**L->block_begin()));
// Ignore all BBs that form Loop.
- for (MachineBasicBlock *MBB : L->getBlocks()) {
- if (A == MBB)
- return false;
- }
+ if (llvm::is_contained(L->getBlocks(), A))
+ return false;
MachineInstr *Def = MRI->getVRegDef(MO->getReg());
LoopFeederPhi.insert(std::make_pair(MO->getReg(), Def));
return true;
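
The hunk above replaces a hand-rolled search over L->getBlocks() with llvm::is_contained from llvm/ADT/STLExtras.h. For readers who don't know that helper, here is a self-contained std::find-based equivalent; it is only an illustration of the predicate, not the LLVM implementation, and the names are chosen for the example.

#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

// Minimal analogue of llvm::is_contained: true iff Value occurs in R.
template <typename Range, typename T>
bool isContained(const Range &R, const T &Value) {
  return std::find(std::begin(R), std::end(R), Value) != std::end(R);
}

int main() {
  std::vector<int> Blocks = {4, 8, 15}; // stand-in for L->getBlocks()
  assert(isContained(Blocks, 8));
  assert(!isContained(Blocks, 16));
  return 0;
}
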
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index bdd5c7d..fd404a1 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -54,10 +54,9 @@
#define GET_DAGISEL_BODY HexagonDAGToDAGISel
#include "HexagonGenDAGISel.inc"
+namespace llvm {
/// createHexagonISelDag - This pass converts a legalized DAG into a
/// Hexagon-specific DAG, ready for instruction scheduling.
-///
-namespace llvm {
FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
CodeGenOpt::Level OptLevel) {
return new HexagonDAGToDAGISel(TM, OptLevel);
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 29e76b5..b50a0e2 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -820,15 +820,22 @@
HST(getHexagonSubtarget(G)), HwLen(HST.getVectorLength()) {}
MVT getSingleVT(MVT ElemTy) const {
+ assert(ElemTy != MVT::i1 && "Use getBoolVT for predicates");
unsigned NumElems = HwLen / (ElemTy.getSizeInBits()/8);
return MVT::getVectorVT(ElemTy, NumElems);
}
MVT getPairVT(MVT ElemTy) const {
+ assert(ElemTy != MVT::i1); // Suspicious: there are no predicate pairs.
unsigned NumElems = (2*HwLen) / (ElemTy.getSizeInBits()/8);
return MVT::getVectorVT(ElemTy, NumElems);
}
+ MVT getBoolVT() const {
+ // Return HwLen x i1.
+ return MVT::getVectorVT(MVT::i1, HwLen);
+ }
+
void selectShuffle(SDNode *N);
void selectRor(SDNode *N);
void selectVAlign(SDNode *N);
@@ -837,15 +844,16 @@
void select(SDNode *ISelN);
void materialize(const ResultStack &Results);
+ SDValue getConst32(int Val, const SDLoc &dl);
SDValue getVectorConstant(ArrayRef<uint8_t> Data, const SDLoc &dl);
enum : unsigned {
None,
PackMux,
};
- OpRef concat(OpRef Va, OpRef Vb, ResultStack &Results);
+ OpRef concats(OpRef Va, OpRef Vb, ResultStack &Results);
OpRef packs(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
- MutableArrayRef<int> NewMask, unsigned Options = None);
+ MutableArrayRef<int> NewMask, unsigned Options = None);
OpRef packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
MutableArrayRef<int> NewMask);
OpRef vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
@@ -917,6 +925,85 @@
return true;
}
+static SmallVector<unsigned, 4> getInputSegmentList(ShuffleMask SM,
+ unsigned SegLen) {
+ assert(isPowerOf2_32(SegLen));
+ SmallVector<unsigned, 4> SegList;
+ if (SM.MaxSrc == -1)
+ return SegList;
+
+ unsigned Shift = Log2_32(SegLen);
+ BitVector Segs(alignTo(SM.MaxSrc + 1, SegLen) >> Shift);
+
+ for (int I = 0, E = SM.Mask.size(); I != E; ++I) {
+ int M = SM.Mask[I];
+ if (M >= 0)
+ Segs.set(M >> Shift);
+ }
+
+ for (unsigned B : Segs.set_bits())
+ SegList.push_back(B);
+ return SegList;
+}
+
+static SmallVector<unsigned, 4> getOutputSegmentMap(ShuffleMask SM,
+ unsigned SegLen) {
+ // Calculate the layout of the output segments in terms of the input
+ // segments.
+ // For example [1,3,1,0] means that the output consists of 4 output
+ // segments, where the first output segment has only elements of the
+ // input segment at index 1. The next output segment only has elements
+ // of the input segment 3, etc.
+ // If an output segment only has undef elements, the value will be ~0u.
+ // If an output segment has elements from more than one input segment,
+ // the corresponding value will be ~1u.
+ unsigned MaskLen = SM.Mask.size();
+ assert(MaskLen % SegLen == 0);
+ SmallVector<unsigned, 4> Map(MaskLen / SegLen);
+
+ for (int S = 0, E = Map.size(); S != E; ++S) {
+ unsigned Idx = ~0u;
+ for (int I = 0; I != static_cast<int>(SegLen); ++I) {
+ int M = SM.Mask[S*SegLen + I];
+ if (M < 0)
+ continue;
+ unsigned G = M / SegLen; // Input segment of this element.
+ if (Idx == ~0u) {
+ Idx = G;
+ } else if (Idx != G) {
+ Idx = ~1u;
+ break;
+ }
+ }
+ Map[S] = Idx;
+ }
+
+ return Map;
+}
+
+static void packSegmentMask(ArrayRef<int> Mask, ArrayRef<unsigned> OutSegMap,
+ unsigned SegLen, MutableArrayRef<int> PackedMask) {
+ SmallVector<unsigned, 4> InvMap;
+ for (int I = OutSegMap.size() - 1; I >= 0; --I) {
+ unsigned S = OutSegMap[I];
+ assert(S != ~0u && "Unexpected undef");
+ assert(S != ~1u && "Unexpected multi");
+ if (InvMap.size() <= S)
+ InvMap.resize(S+1);
+ InvMap[S] = I;
+ }
+
+ unsigned Shift = Log2_32(SegLen);
+ for (int I = 0, E = Mask.size(); I != E; ++I) {
+ int M = Mask[I];
+ if (M >= 0) {
+ int OutIdx = InvMap[M >> Shift];
+ M = (M & (SegLen-1)) + SegLen*OutIdx;
+ }
+ PackedMask[I] = M;
+ }
+}
+
static bool isPermutation(ArrayRef<int> Mask) {
// Check by adding all numbers only works if there is no overflow.
assert(Mask.size() < 0x00007FFF && "Sanity failure");
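
The three helpers added above -- getInputSegmentList, getOutputSegmentMap and packSegmentMask -- are easiest to follow on a concrete mask. The standalone sketch below re-expresses their logic with plain std::vector; the function names, the toy SegLen of 4 and the 8-element mask are chosen for this example and are not taken from the LLVM sources.

#include <cassert>
#include <cstdio>
#include <vector>

using Mask = std::vector<int>;

// Which input segments (of SegLen elements each) does the mask read from?
// Returned ascending and deduplicated, like the BitVector-based original.
static std::vector<unsigned> inputSegmentList(const Mask &M, unsigned SegLen) {
  std::vector<bool> Used;
  for (int E : M) {
    if (E < 0)
      continue;                       // -1 marks an undef element
    unsigned S = unsigned(E) / SegLen;
    if (Used.size() <= S)
      Used.resize(S + 1);
    Used[S] = true;
  }
  std::vector<unsigned> List;
  for (unsigned S = 0; S != Used.size(); ++S)
    if (Used[S])
      List.push_back(S);
  return List;
}

// For each output segment: ~0u if it is all-undef, the unique input segment
// it reads from, or ~1u if it mixes elements of several input segments.
static std::vector<unsigned> outputSegmentMap(const Mask &M, unsigned SegLen) {
  assert(M.size() % SegLen == 0);
  std::vector<unsigned> Map(M.size() / SegLen, ~0u);
  for (unsigned S = 0; S != Map.size(); ++S) {
    for (unsigned I = 0; I != SegLen; ++I) {
      int E = M[S * SegLen + I];
      if (E < 0)
        continue;
      unsigned G = unsigned(E) / SegLen; // input segment of this element
      if (Map[S] == ~0u) {
        Map[S] = G;
      } else if (Map[S] != G) {
        Map[S] = ~1u;
        break;
      }
    }
  }
  return Map;
}

// Renumber the mask so that input segment Segs[K] becomes segment K of the
// packed operand.  Segs must list (as distinct entries) every segment the
// mask uses.
static Mask packSegmentMask(const Mask &M, const std::vector<unsigned> &Segs,
                            unsigned SegLen) {
  std::vector<unsigned> Inv;
  for (unsigned K = 0; K != Segs.size(); ++K) {
    if (Inv.size() <= Segs[K])
      Inv.resize(Segs[K] + 1);
    Inv[Segs[K]] = K;
  }
  Mask Packed(M.size());
  for (size_t I = 0; I != M.size(); ++I) {
    int E = M[I];
    if (E >= 0)
      E = (E % SegLen) + SegLen * Inv[unsigned(E) / SegLen];
    Packed[I] = E;
  }
  return Packed;
}

int main() {
  // Toy setup: SegLen = 4, so Va covers elements 0..7 (segments 0 and 1) and
  // Vb covers 8..15 (segments 2 and 3).
  Mask M = {12, 13, 14, 15, 4, 5, 6, 7};
  for (unsigned S : inputSegmentList(M, 4))   // prints: 1 3
    std::printf("%u ", S);
  std::printf("\n");
  for (unsigned S : outputSegmentMap(M, 4))   // prints: 3 1
    std::printf("%u ", S);
  std::printf("\n");
  for (int E : packSegmentMask(M, {3, 1}, 4)) // prints: 0 1 2 3 4 5 6 7
    std::printf("%d ", E);
  std::printf("\n");
  return 0;
}

For the mask {12,13,14,15,4,5,6,7} the input segment list is {1,3} and the output segment map is {3,1}; once those two segments are packed into a single operand, the mask renumbers to {0,...,7}, i.e. an identity shuffle of the packed vector.
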
@@ -1017,18 +1104,20 @@
DAG.RemoveDeadNodes();
}
-OpRef HvxSelector::concat(OpRef Lo, OpRef Hi, ResultStack &Results) {
+OpRef HvxSelector::concats(OpRef Lo, OpRef Hi, ResultStack &Results) {
DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
const SDLoc &dl(Results.InpNode);
Results.push(TargetOpcode::REG_SEQUENCE, getPairVT(MVT::i8), {
- DAG.getTargetConstant(Hexagon::HvxWRRegClassID, dl, MVT::i32),
- Lo, DAG.getTargetConstant(Hexagon::vsub_lo, dl, MVT::i32),
- Hi, DAG.getTargetConstant(Hexagon::vsub_hi, dl, MVT::i32),
+ getConst32(Hexagon::HvxWRRegClassID, dl),
+ Lo, getConst32(Hexagon::vsub_lo, dl),
+ Hi, getConst32(Hexagon::vsub_hi, dl),
});
return OpRef::res(Results.top());
}
-// Va, Vb are single vectors, SM can be arbitrarily long.
+// Va, Vb are single vectors. If SM only uses two vector halves from Va/Vb,
+// pack these halves into a single vector, and remap SM into NewMask to use
+// the new vector instead.
OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb,
ResultStack &Results, MutableArrayRef<int> NewMask,
unsigned Options) {
@@ -1036,80 +1125,205 @@
if (!Va.isValid() || !Vb.isValid())
return OpRef::fail();
- int VecLen = SM.Mask.size();
MVT Ty = getSingleVT(MVT::i8);
+ MVT PairTy = getPairVT(MVT::i8);
+ OpRef Inp[2] = {Va, Vb};
+ unsigned VecLen = SM.Mask.size();
- auto IsExtSubvector = [] (ShuffleMask M) {
- assert(M.MinSrc >= 0 && M.MaxSrc >= 0);
- for (int I = 0, E = M.Mask.size(); I != E; ++I) {
- if (M.Mask[I] >= 0 && M.Mask[I]-I != M.MinSrc)
- return false;
+ auto valign = [this](OpRef Lo, OpRef Hi, unsigned Amt, MVT Ty,
+ ResultStack &Results) {
+ if (Amt == 0)
+ return Lo;
+ const SDLoc &dl(Results.InpNode);
+ if (isUInt<3>(Amt) || isUInt<3>(HwLen - Amt)) {
+ bool IsRight = isUInt<3>(Amt); // Right align.
+ SDValue S = getConst32(IsRight ? Amt : HwLen - Amt, dl);
+ unsigned Opc = IsRight ? Hexagon::V6_valignbi : Hexagon::V6_vlalignbi;
+ Results.push(Opc, Ty, {Hi, Lo, S});
+ return OpRef::res(Results.top());
}
- return true;
+ Results.push(Hexagon::A2_tfrsi, MVT::i32, {getConst32(Amt, dl)});
+ OpRef A = OpRef::res(Results.top());
+ Results.push(Hexagon::V6_valignb, Ty, {Hi, Lo, A});
+ return OpRef::res(Results.top());
};
- if (SM.MaxSrc - SM.MinSrc < int(HwLen)) {
- if (SM.MinSrc == 0 || SM.MinSrc == int(HwLen) || !IsExtSubvector(SM)) {
- // If the mask picks elements from only one of the operands, return
- // that operand, and update the mask to use index 0 to refer to the
- // first element of that operand.
- // If the mask extracts a subvector, it will be handled below, so
- // skip it here.
- if (SM.MaxSrc < int(HwLen)) {
- memcpy(NewMask.data(), SM.Mask.data(), sizeof(int)*VecLen);
- return Va;
- }
- if (SM.MinSrc >= int(HwLen)) {
- for (int I = 0; I != VecLen; ++I) {
- int M = SM.Mask[I];
- if (M != -1)
- M -= HwLen;
- NewMask[I] = M;
- }
- return Vb;
- }
- }
- int MinSrc = SM.MinSrc;
- if (SM.MaxSrc < int(HwLen)) {
- Vb = Va;
- } else if (SM.MinSrc > int(HwLen)) {
- Va = Vb;
- MinSrc = SM.MinSrc - HwLen;
- }
- const SDLoc &dl(Results.InpNode);
- if (isUInt<3>(MinSrc) || isUInt<3>(HwLen-MinSrc)) {
- bool IsRight = isUInt<3>(MinSrc); // Right align.
- SDValue S = DAG.getTargetConstant(IsRight ? MinSrc : HwLen-MinSrc,
- dl, MVT::i32);
- unsigned Opc = IsRight ? Hexagon::V6_valignbi
- : Hexagon::V6_vlalignbi;
- Results.push(Opc, Ty, {Vb, Va, S});
- } else {
- SDValue S = DAG.getTargetConstant(MinSrc, dl, MVT::i32);
- Results.push(Hexagon::A2_tfrsi, MVT::i32, {S});
- unsigned Top = Results.top();
- Results.push(Hexagon::V6_valignb, Ty, {Vb, Va, OpRef::res(Top)});
- }
- for (int I = 0; I != VecLen; ++I) {
+ // Segment is a vector half.
+ unsigned SegLen = HwLen / 2;
+
+ // Check if we can shuffle vector halves around to get the used elements
+ // into a single vector.
+ SmallVector<int,128> MaskH(SM.Mask.begin(), SM.Mask.end());
+ SmallVector<unsigned, 4> SegList = getInputSegmentList(SM.Mask, SegLen);
+ unsigned SegCount = SegList.size();
+ SmallVector<unsigned, 4> SegMap = getOutputSegmentMap(SM.Mask, SegLen);
+
+ if (SegList.empty())
+ return OpRef::undef(Ty);
+
+ // NOTE:
+ // In the following part of the function, where the segments are rearranged,
+ // the shuffle mask SM can be of any length that is a multiple of a vector
+ // (i.e. a multiple of 2*SegLen), and non-zero.
+ // The output segment map is computed, and it may have any even number of
+ // entries, but the rearrangement of input segments will be done based only
+ // on the first two (non-undef) entries in the segment map.
+ // For example, if the output map is 3, 1, 1, 3 (it can have at most two
+ // distinct entries!), the segments 1 and 3 of Va/Vb will be packaged into
+ // a single vector V = 3:1. The output mask will then be updated to use
+ // seg(0,V), seg(1,V), seg(1,V), seg(0,V).
+ //
+ // Picking the segments based on the output map is an optimization. For
+ // correctness it is only necessary that Seg0 and Seg1 are the two input
+ // segments that are used in the output.
+
+ unsigned Seg0 = ~0u, Seg1 = ~0u;
+ for (int I = 0, E = SegMap.size(); I != E; ++I) {
+ unsigned X = SegMap[I];
+ if (X == ~0u)
+ continue;
+ if (Seg0 == ~0u)
+ Seg0 = X;
+ else if (Seg1 != ~0u)
+ break;
+ if (X == ~1u || X != Seg0)
+ Seg1 = X;
+ }
+
+ if (SegCount == 1) {
+ unsigned SrcOp = SegList[0] / 2;
+ for (int I = 0; I != static_cast<int>(VecLen); ++I) {
int M = SM.Mask[I];
- if (M != -1)
- M -= SM.MinSrc;
+ if (M >= 0) {
+ M -= SrcOp * HwLen;
+ assert(M >= 0);
+ }
NewMask[I] = M;
}
- return OpRef::res(Results.top());
+ return Inp[SrcOp];
}
+ if (SegCount == 2) {
+ // Seg0 should not be undef here: this would imply a SegList
+ // with <= 1 elements, which was checked earlier.
+ assert(Seg0 != ~0u);
+
+ // If Seg0 or Seg1 are "multi-defined", pick them from the input
+ // segment list instead.
+ if (Seg0 == ~1u || Seg1 == ~1u) {
+ if (Seg0 == Seg1) {
+ Seg0 = SegList[0];
+ Seg1 = SegList[1];
+ } else if (Seg0 == ~1u) {
+ Seg0 = SegList[0] != Seg1 ? SegList[0] : SegList[1];
+ } else {
+ assert(Seg1 == ~1u); // Sanity
+ Seg1 = SegList[0] != Seg0 ? SegList[0] : SegList[1];
+ }
+ }
+ assert(Seg0 != ~1u && Seg1 != ~1u);
+
+ assert(Seg0 != Seg1 && "Expecting different segments");
+ const SDLoc &dl(Results.InpNode);
+ Results.push(Hexagon::A2_tfrsi, MVT::i32, {getConst32(SegLen, dl)});
+ OpRef HL = OpRef::res(Results.top());
+
+ // Va = AB, Vb = CD
+
+ if (Seg0 / 2 == Seg1 / 2) {
+ // Same input vector.
+ Va = Inp[Seg0 / 2];
+ if (Seg0 > Seg1) {
+ // Swap halves.
+ Results.push(Hexagon::V6_vror, Ty, {Inp[Seg0 / 2], HL});
+ Va = OpRef::res(Results.top());
+ }
+ packSegmentMask(SM.Mask, {Seg0, Seg1}, SegLen, MaskH);
+ } else if (Seg0 % 2 == Seg1 % 2) {
+ // Picking AC, BD, CA, or DB.
+ // vshuff(CD,AB,HL) -> BD:AC
+ // vshuff(AB,CD,HL) -> DB:CA
+ auto Vs = (Seg0 == 0 || Seg0 == 1) ? std::make_pair(Vb, Va) // AC or BD
+ : std::make_pair(Va, Vb); // CA or DB
+ Results.push(Hexagon::V6_vshuffvdd, PairTy, {Vs.first, Vs.second, HL});
+ OpRef P = OpRef::res(Results.top());
+ Va = (Seg0 == 0 || Seg0 == 2) ? OpRef::lo(P) : OpRef::hi(P);
+ packSegmentMask(SM.Mask, {Seg0, Seg1}, SegLen, MaskH);
+ } else {
+ // Picking AD, BC, CB, or DA.
+ if ((Seg0 == 0 && Seg1 == 3) || (Seg0 == 2 && Seg1 == 1)) {
+ // AD or BC: this can be done using vmux.
+ // Q = V6_pred_scalar2 SegLen
+ // V = V6_vmux Q, (Va, Vb) or (Vb, Va)
+ Results.push(Hexagon::V6_pred_scalar2, getBoolVT(), {HL});
+ OpRef Qt = OpRef::res(Results.top());
+ auto Vs = (Seg0 == 0) ? std::make_pair(Va, Vb) // AD
+ : std::make_pair(Vb, Va); // CB
+ Results.push(Hexagon::V6_vmux, Ty, {Qt, Vs.first, Vs.second});
+ Va = OpRef::res(Results.top());
+ packSegmentMask(SM.Mask, {Seg0, Seg1}, SegLen, MaskH);
+ } else {
+ // BC or DA: this could be done via valign by SegLen.
+ // Do nothing here, because valign (if possible) will be generated
+ // later on (make sure the Seg0 values are as expected, for sanity).
+ assert(Seg0 == 1 || Seg0 == 3);
+ }
+ }
+ }
+
+ // Check if the arguments can be packed by valign(Va,Vb) or valign(Vb,Va).
+
+ ShuffleMask SMH(MaskH);
+ assert(SMH.Mask.size() == VecLen);
+ SmallVector<int,128> MaskA(SMH.Mask.begin(), SMH.Mask.end());
+
+ if (SMH.MaxSrc - SMH.MinSrc >= static_cast<int>(HwLen)) {
+ // valign(Lo=Va,Hi=Vb) won't work. Try swapping Va/Vb.
+ SmallVector<int,128> Swapped(SMH.Mask.begin(), SMH.Mask.end());
+ ShuffleVectorSDNode::commuteMask(Swapped);
+ ShuffleMask SW(Swapped);
+ if (SW.MaxSrc - SW.MinSrc < static_cast<int>(HwLen)) {
+ MaskA.assign(SW.Mask.begin(), SW.Mask.end());
+ std::swap(Va, Vb);
+ }
+ }
+ ShuffleMask SMA(MaskA);
+ assert(SMA.Mask.size() == VecLen);
+
+ if (SMA.MaxSrc - SMA.MinSrc < static_cast<int>(HwLen)) {
+ int ShiftR = SMA.MinSrc;
+ if (ShiftR >= static_cast<int>(HwLen)) {
+ Va = Vb;
+ Vb = OpRef::undef(Ty);
+ ShiftR -= HwLen;
+ }
+ OpRef RetVal = valign(Va, Vb, ShiftR, Ty, Results);
+
+ for (int I = 0; I != static_cast<int>(VecLen); ++I) {
+ int M = SMA.Mask[I];
+ if (M != -1)
+ M -= SMA.MinSrc;
+ NewMask[I] = M;
+ }
+ return RetVal;
+ }
+
+ // By here, packing by segment (half-vector) shuffling, and vector alignment
+ // failed. Try vmux.
+ // Note: since this is using the original mask, Va and Vb must not have been
+ // modified.
+
if (Options & PackMux) {
// If elements picked from Va and Vb have all different (source) indexes
// (relative to the start of the argument), do a mux, and update the mask.
BitVector Picked(HwLen);
SmallVector<uint8_t,128> MuxBytes(HwLen);
bool CanMux = true;
- for (int I = 0; I != VecLen; ++I) {
+ for (int I = 0; I != static_cast<int>(VecLen); ++I) {
int M = SM.Mask[I];
if (M == -1)
continue;
- if (M >= int(HwLen))
+ if (M >= static_cast<int>(HwLen))
M -= HwLen;
else
MuxBytes[M] = 0xFF;
@@ -1122,27 +1336,23 @@
if (CanMux)
return vmuxs(MuxBytes, Va, Vb, Results);
}
-
return OpRef::fail();
}
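
The NOTE inside packs() above states that the rearrangement is driven only by the first two distinct non-undef entries of the output segment map. Below is a minimal sketch of that selection loop, run on the 3, 1, 1, 3 example from the comment; kUndef and kMulti stand in for the ~0u/~1u markers, and the code is illustrative rather than the production logic.

#include <cstdio>
#include <vector>

int main() {
  const unsigned kUndef = ~0u; // output segment is entirely undef
  const unsigned kMulti = ~1u; // output segment mixes several input segments

  // Example from the comment in packs(): output segment map 3, 1, 1, 3.
  std::vector<unsigned> SegMap = {3, 1, 1, 3};

  unsigned Seg0 = kUndef, Seg1 = kUndef;
  for (unsigned X : SegMap) {
    if (X == kUndef)
      continue;
    if (Seg0 == kUndef)
      Seg0 = X;                 // first non-undef entry
    else if (Seg1 != kUndef)
      break;                    // both segments already chosen
    if (X == kMulti || X != Seg0)
      Seg1 = X;                 // second distinct (or multi-defined) entry
  }
  // The real code then replaces a kMulti pick with an entry from the input
  // segment list before rearranging the vector halves.
  std::printf("Seg0 = %u, Seg1 = %u\n", Seg0, Seg1); // Seg0 = 3, Seg1 = 1
  return 0;
}
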
+// Va, Vb are vector pairs. If SM only uses two single vectors from Va/Vb,
+// pack these vectors into a pair, and remap SM into NewMask to use the
+// new pair instead.
OpRef HvxSelector::packp(ShuffleMask SM, OpRef Va, OpRef Vb,
ResultStack &Results, MutableArrayRef<int> NewMask) {
DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
- unsigned HalfMask = 0;
- unsigned LogHw = Log2_32(HwLen);
- for (int M : SM.Mask) {
- if (M == -1)
- continue;
- HalfMask |= (1u << (M >> LogHw));
- }
-
- if (HalfMask == 0)
+ SmallVector<unsigned, 4> SegList = getInputSegmentList(SM.Mask, HwLen);
+ if (SegList.empty())
return OpRef::undef(getPairVT(MVT::i8));
// If more than two halves are used, bail.
// TODO: be more aggressive here?
- if (countPopulation(HalfMask) > 2)
+ unsigned SegCount = SegList.size();
+ if (SegCount > 2)
return OpRef::fail();
MVT HalfTy = getSingleVT(MVT::i8);
@@ -1150,29 +1360,23 @@
OpRef Inp[2] = { Va, Vb };
OpRef Out[2] = { OpRef::undef(HalfTy), OpRef::undef(HalfTy) };
- uint8_t HalfIdx[4] = { 0xFF, 0xFF, 0xFF, 0xFF };
- unsigned Idx = 0;
- for (unsigned I = 0; I != 4; ++I) {
- if ((HalfMask & (1u << I)) == 0)
- continue;
- assert(Idx < 2);
- OpRef Op = Inp[I/2];
- Out[Idx] = (I & 1) ? OpRef::hi(Op) : OpRef::lo(Op);
- HalfIdx[I] = Idx++;
+ // Really make sure we have at most 2 vectors used in the mask.
+ assert(SegCount <= 2);
+
+ for (int I = 0, E = SegList.size(); I != E; ++I) {
+ unsigned S = SegList[I];
+ OpRef Op = Inp[S / 2];
+ Out[I] = (S & 1) ? OpRef::hi(Op) : OpRef::lo(Op);
}
- int VecLen = SM.Mask.size();
- for (int I = 0; I != VecLen; ++I) {
- int M = SM.Mask[I];
- if (M >= 0) {
- uint8_t Idx = HalfIdx[M >> LogHw];
- assert(Idx == 0 || Idx == 1);
- M = (M & (HwLen-1)) + HwLen*Idx;
- }
- NewMask[I] = M;
- }
-
- return concat(Out[0], Out[1], Results);
+ // NOTE: Using SegList as the packing map here (not SegMap). This works
+ // because we're not concerned here about the order of the segments (i.e.
+ // single vectors) in the output pair. Changing the order of vectors is
+ // free (as opposed to changing the order of vector halves as in packs),
+ // and so there is no extra cost added in case the order needs to be
+ // changed later.
+ packSegmentMask(SM.Mask, SegList, HwLen, NewMask);
+ return concats(Out[0], Out[1], Results);
}
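// A minimal sketch of the segment-list query used by packp above
// (illustrative only; the real getInputSegmentList lives elsewhere in this
// file and may differ in details). Each segment is one HwLen-sized single
// vector of the Va/Vb pairs, and packp only proceeds when the mask touches
// at most two of them.
#include <algorithm>
#include <vector>

static std::vector<unsigned> inputSegments(const std::vector<int> &Mask,
                                           unsigned SegLen) {
  std::vector<unsigned> Segs;
  for (int M : Mask) {
    if (M < 0)
      continue;
    unsigned S = static_cast<unsigned>(M) / SegLen;
    if (std::find(Segs.begin(), Segs.end(), S) == Segs.end())
      Segs.push_back(S);             // record segments in order of first use
  }
  return Segs;
}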
OpRef HvxSelector::vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
@@ -1194,7 +1398,7 @@
size_t S = Bytes.size() / 2;
OpRef L = vmuxs(Bytes.take_front(S), OpRef::lo(Va), OpRef::lo(Vb), Results);
OpRef H = vmuxs(Bytes.drop_front(S), OpRef::hi(Va), OpRef::hi(Vb), Results);
- return concat(L, H, Results);
+ return concats(L, H, Results);
}
OpRef HvxSelector::shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results) {
@@ -1209,6 +1413,25 @@
if (isUndef(SM.Mask))
return OpRef::undef(getSingleVT(MVT::i8));
+ unsigned HalfLen = HwLen / 2;
+ assert(isPowerOf2_32(HalfLen)); // Sanity.
+
+ // Handle the special case where the output is the same half of the input
+ // repeated twice, i.e. if Va = AB, produce AA or BB.
+ std::pair<int, unsigned> Strip1 = findStrip(SM.Mask, 1, HalfLen);
+ if ((Strip1.first & ~HalfLen) == 0 && Strip1.second == HalfLen) {
+ std::pair<int, unsigned> Strip2 =
+ findStrip(SM.Mask.drop_front(HalfLen), 1, HalfLen);
+ if (Strip1 == Strip2) {
+ const SDLoc &dl(Results.InpNode);
+ Results.push(Hexagon::A2_tfrsi, MVT::i32, {getConst32(HalfLen, dl)});
+ Results.push(Hexagon::V6_vshuffvdd, getPairVT(MVT::i8),
+ {Va, Va, OpRef::res(Results.top())});
+ OpRef S = OpRef::res(Results.top());
+ return (Strip1.first == 0) ? OpRef::lo(S) : OpRef::hi(S);
+ }
+ }
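// A standalone sketch of the repeated-half check above (illustrative only;
// isRepeatedHalf is a hypothetical helper). The mask describes AA or BB
// exactly when both of its halves are the same increasing run of HalfLen
// indices starting at 0 (the A half) or at HalfLen (the B half).
#include <vector>

static bool isRepeatedHalf(const std::vector<int> &Mask, unsigned HalfLen,
                           bool &IsHighHalf) {
  if (Mask.size() != 2 * HalfLen)
    return false;
  int First = Mask[0];
  if (First != 0 && First != static_cast<int>(HalfLen))
    return false;
  for (unsigned I = 0; I != 2 * HalfLen; ++I)
    if (Mask[I] != First + static_cast<int>(I % HalfLen))
      return false;                  // not the same contiguous half twice
  IsHighHalf = (First != 0);
  return true;
}

// Example: with HalfLen = 4, {0,1,2,3,0,1,2,3} is AA and {4,5,6,7,4,5,6,7}
// is BB; {0,1,2,3,4,5,6,7} is neither and falls through to perfect().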
+
OpRef P = perfect(SM, Va, Results);
if (P.isValid())
return P;
@@ -1226,10 +1449,14 @@
return C;
int VecLen = SM.Mask.size();
- SmallVector<int,128> NewMask(VecLen);
- OpRef P = packs(SM, Va, Vb, Results, NewMask);
+ SmallVector<int,128> PackedMask(VecLen);
+ OpRef P = packs(SM, Va, Vb, Results, PackedMask);
if (P.isValid())
- return shuffs1(ShuffleMask(NewMask), P, Results);
+ return shuffs1(ShuffleMask(PackedMask), P, Results);
+
+ // TODO: Before we split the mask, try perfect shuffle on concatenated
+ // operands. This won't work now, because the perfect code does not
+ // tolerate undefs in the mask.
SmallVector<int,128> MaskL(VecLen), MaskR(VecLen);
splitMask(SM.Mask, MaskL, MaskR);
@@ -1267,7 +1494,7 @@
OpRef L = shuffs1(PM.lo(), P, Results);
OpRef H = shuffs1(PM.hi(), P, Results);
if (L.isValid() && H.isValid())
- return concat(L, H, Results);
+ return concats(L, H, Results);
}
OpRef R = perfect(SM, Va, Results);
@@ -1278,7 +1505,7 @@
OpRef L = shuffs2(SM.lo(), OpRef::lo(Va), OpRef::hi(Va), Results);
OpRef H = shuffs2(SM.hi(), OpRef::lo(Va), OpRef::hi(Va), Results);
if (L.isValid() && H.isValid())
- return concat(L, H, Results);
+ return concats(L, H, Results);
return OpRef::fail();
}
@@ -1650,7 +1877,7 @@
int L = Strip.second;
// First, check the non-ignored strips.
- for (int I = 2*L; I < 2*N; I += 2*L) {
+ for (int I = 2*L; I < N; I += 2*L) {
auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
if (S.second != unsigned(L))
return OpRef::fail();
@@ -1658,7 +1885,7 @@
return OpRef::fail();
}
// Check the -1s.
- for (int I = L; I < 2*N; I += 2*L) {
+ for (int I = L; I < N; I += 2*L) {
auto S = findStrip(SM.Mask.drop_front(I), 0, N-I);
if (S.first != -1 || S.second != unsigned(L))
return OpRef::fail();
@@ -1945,9 +2172,9 @@
const SDLoc &dl(Results.InpNode);
OpRef Arg = HavePairs ? Va
- : concat(Va, OpRef::undef(SingleTy), Results);
+ : concats(Va, OpRef::undef(SingleTy), Results);
if (InvertedPair)
- Arg = concat(OpRef::hi(Arg), OpRef::lo(Arg), Results);
+ Arg = concats(OpRef::hi(Arg), OpRef::lo(Arg), Results);
for (unsigned I = 0, E = SwapElems.size(); I != E; ) {
bool IsInc = I == E-1 || SwapElems[I] < SwapElems[I+1];
@@ -1962,8 +2189,7 @@
++I;
NodeTemplate Res;
- Results.push(Hexagon::A2_tfrsi, MVT::i32,
- { DAG.getTargetConstant(S, dl, MVT::i32) });
+ Results.push(Hexagon::A2_tfrsi, MVT::i32, {getConst32(S, dl)});
Res.Opc = IsInc ? Hexagon::V6_vshuffvdd : Hexagon::V6_vdealvdd;
Res.Ty = PairTy;
Res.Ops = { OpRef::hi(Arg), OpRef::lo(Arg), OpRef::res(-1) };
@@ -2026,6 +2252,10 @@
return OpRef::fail();
}
+SDValue HvxSelector::getConst32(int Val, const SDLoc &dl) {
+ return DAG.getTargetConstant(Val, dl, MVT::i32);
+}
+
SDValue HvxSelector::getVectorConstant(ArrayRef<uint8_t> Data,
const SDLoc &dl) {
SmallVector<SDValue, 128> Elems;
@@ -2126,9 +2356,8 @@
if (S == 0) {
NewN = VecV.getNode();
} else if (isUInt<3>(S)) {
- SDValue C = DAG.getTargetConstant(S, dl, MVT::i32);
NewN = DAG.getMachineNode(Hexagon::V6_valignbi, dl, Ty,
- {VecV, VecV, C});
+ {VecV, VecV, getConst32(S, dl)});
}
}
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index c8994a3..6ded323 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -35,6 +35,8 @@
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InlineAsm.h"
@@ -42,6 +44,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsHexagon.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -219,8 +222,29 @@
// Copy the result values into the output registers.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
+ SDValue Val = OutVals[i];
- Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag);
+ switch (VA.getLocInfo()) {
+ default:
+ // Loc info must be one of Full, BCvt, SExt, ZExt, or AExt.
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getBitcast(VA.getLocVT(), Val);
+ break;
+ case CCValAssign::SExt:
+ Val = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Val);
+ break;
+ case CCValAssign::ZExt:
+ Val = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Val);
+ break;
+ case CCValAssign::AExt:
+ Val = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Val);
+ break;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Val, Flag);
// Guarantee that all emitted copies are stuck together with flags.
Flag = Chain.getValue(1);
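// Illustrative only (not part of this patch; the function is just an example
// of source code that exercises the new cases): returning a type narrower
// than the 32-bit location register is what makes the SExt/ZExt/AExt cases
// above fire. Which of the three applies follows the signext/zeroext
// attributes the frontend attached to the return value, as recorded in
// CCValAssign by the calling-convention code.
short returnNarrow(int X) {
  return static_cast<short>(X);      // i16 result, widened to fill the return register
}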
@@ -308,6 +332,8 @@
.Case("m1", Hexagon::M1)
.Case("usr", Hexagon::USR)
.Case("ugp", Hexagon::UGP)
+ .Case("cs0", Hexagon::CS0)
+ .Case("cs1", Hexagon::CS1)
.Default(Register());
if (Reg)
return Reg;
@@ -498,7 +524,7 @@
if (NeedsArgAlign && Subtarget.hasV60Ops()) {
LLVM_DEBUG(dbgs() << "Function needs byte stack align due to call args\n");
- Align VecAlign(HRI.getSpillAlignment(Hexagon::HvxVRRegClass));
+ Align VecAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass);
LargestAlignSeen = std::max(LargestAlignSeen, VecAlign);
MFI.ensureMaxAlignment(LargestAlignSeen);
}
@@ -701,7 +727,7 @@
SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDLoc dl(Op);
- SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
return DAG.getNode(HexagonISD::READCYCLE, dl, VTs, Chain);
}
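// Illustrative only: the i64 result type above matches what the cycle-counter
// builtin returns. __builtin_readcyclecounter() is a standard clang builtin
// yielding a 64-bit value; it is assumed here to be the usual way a program
// reaches this READCYCLE lowering.
unsigned long long readCycles() {
  return __builtin_readcyclecounter();  // selected through READCYCLECOUNTER
}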
@@ -1703,6 +1729,12 @@
setOperationAction(ISD::STORE, VT, Custom);
}
+ // Custom-lower loads/stores of boolean vectors.
+ for (MVT VT : {MVT::v2i1, MVT::v4i1, MVT::v8i1}) {
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+ }
+
for (MVT VT : {MVT::v2i16, MVT::v4i8, MVT::v8i8, MVT::v2i32, MVT::v4i16,
MVT::v2i32}) {
setCondCodeAction(ISD::SETNE, VT, Expand);
@@ -1882,24 +1914,57 @@
return nullptr;
}
-void
-HexagonTargetLowering::validateConstPtrAlignment(SDValue Ptr, const SDLoc &dl,
- unsigned NeedAlign) const {
+bool
+HexagonTargetLowering::validateConstPtrAlignment(SDValue Ptr, Align NeedAlign,
+ const SDLoc &dl, SelectionDAG &DAG) const {
auto *CA = dyn_cast<ConstantSDNode>(Ptr);
if (!CA)
- return;
+ return true;
unsigned Addr = CA->getZExtValue();
- unsigned HaveAlign = Addr != 0 ? 1u << countTrailingZeros(Addr) : NeedAlign;
- if (HaveAlign < NeedAlign) {
- std::string ErrMsg;
- raw_string_ostream O(ErrMsg);
- O << "Misaligned constant address: " << format_hex(Addr, 10)
- << " has alignment " << HaveAlign
- << ", but the memory access requires " << NeedAlign;
- if (DebugLoc DL = dl.getDebugLoc())
- DL.print(O << ", at ");
- report_fatal_error(O.str());
- }
+ Align HaveAlign =
+ Addr != 0 ? Align(1ull << countTrailingZeros(Addr)) : NeedAlign;
+ if (HaveAlign >= NeedAlign)
+ return true;
+
+ static int DK_MisalignedTrap = llvm::getNextAvailablePluginDiagnosticKind();
+
+ struct DiagnosticInfoMisalignedTrap : public DiagnosticInfo {
+ DiagnosticInfoMisalignedTrap(StringRef M)
+ : DiagnosticInfo(DK_MisalignedTrap, DS_Remark), Msg(M) {}
+ void print(DiagnosticPrinter &DP) const override {
+ DP << Msg;
+ }
+ static bool classof(const DiagnosticInfo *DI) {
+ return DI->getKind() == DK_MisalignedTrap;
+ }
+ StringRef Msg;
+ };
+
+ std::string ErrMsg;
+ raw_string_ostream O(ErrMsg);
+ O << "Misaligned constant address: " << format_hex(Addr, 10)
+ << " has alignment " << HaveAlign.value()
+ << ", but the memory access requires " << NeedAlign.value();
+ if (DebugLoc DL = dl.getDebugLoc())
+ DL.print(O << ", at ");
+ O << ". The instruction has been replaced with a trap.";
+
+ DAG.getContext()->diagnose(DiagnosticInfoMisalignedTrap(O.str()));
+ return false;
+}
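// A small standalone illustration of the alignment computation above
// (illustrative only; plain integers stand in for llvm::Align and
// countTrailingZeros). The provable alignment of a nonzero constant address
// is the largest power of two dividing it, i.e. 1 << (trailing zero bits).
#include <cstdint>

static uint64_t alignOfConstAddr(uint64_t Addr, uint64_t NeedAlign) {
  if (Addr == 0)
    return NeedAlign;                // address 0 is treated as aligned enough
  uint64_t KnownAlign = 1;
  while ((Addr & 1) == 0) {          // count trailing zero bits
    KnownAlign <<= 1;
    Addr >>= 1;
  }
  return KnownAlign;
}

// Example: alignOfConstAddr(0x1004, 8) == 4, so an 8-byte access at 0x1004 is
// flagged; alignOfConstAddr(0x1000, 8) == 4096, which is sufficient.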
+
+SDValue
+HexagonTargetLowering::replaceMemWithUndef(SDValue Op, SelectionDAG &DAG)
+ const {
+ const SDLoc &dl(Op);
+ auto *LS = cast<LSBaseSDNode>(Op.getNode());
+ assert(!LS->isIndexed() && "Not expecting indexed ops on constant address");
+
+ SDValue Chain = LS->getChain();
+ SDValue Trap = DAG.getNode(ISD::TRAP, dl, MVT::Other, Chain);
+ if (LS->getOpcode() == ISD::LOAD)
+ return DAG.getMergeValues({DAG.getUNDEF(ty(Op)), Trap}, dl);
+ return Trap;
}
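// A hedged sketch of how a driver could surface the new remark (illustrative
// only; nothing in this patch installs such a handler, and real tools may
// wire this differently). The message goes through LLVMContext::diagnose(),
// so an ordinary diagnostic handler on the context will see it.
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"

static void printAllRemarks(llvm::LLVMContext &Ctx) {
  Ctx.setDiagnosticHandlerCallBack(
      [](const llvm::DiagnosticInfo &DI, void *) {
        llvm::DiagnosticPrinterRawOStream DP(llvm::errs());
        DI.print(DP);                // prints the misaligned-trap message too
        llvm::errs() << "\n";
      });
}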
// Bit-reverse Load Intrinsic: Check if the instruction is a bit reverse load
@@ -2072,7 +2137,7 @@
TargetLoweringBase::LegalizeTypeAction
HexagonTargetLowering::getPreferredVectorAction(MVT VT) const {
- unsigned VecLen = VT.getVectorNumElements();
+ unsigned VecLen = VT.getVectorMinNumElements();
MVT ElemTy = VT.getVectorElementType();
if (VecLen == 1 || VT.isScalableVector())
@@ -2855,27 +2920,64 @@
SDValue
HexagonTargetLowering::LowerLoad(SDValue Op, SelectionDAG &DAG) const {
+ MVT Ty = ty(Op);
+ const SDLoc &dl(Op);
+ // Lower loads of scalar predicate vectors (v2i1, v4i1, v8i1) to loads of i1
+ // followed by a TYPECAST.
LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
- unsigned ClaimAlign = LN->getAlignment();
- validateConstPtrAlignment(LN->getBasePtr(), SDLoc(Op), ClaimAlign);
+ bool DoCast = (Ty == MVT::v2i1 || Ty == MVT::v4i1 || Ty == MVT::v8i1);
+ if (DoCast) {
+ SDValue NL = DAG.getLoad(
+ LN->getAddressingMode(), LN->getExtensionType(), MVT::i1, dl,
+ LN->getChain(), LN->getBasePtr(), LN->getOffset(), LN->getPointerInfo(),
+ /*MemoryVT*/ MVT::i1, LN->getAlign(), LN->getMemOperand()->getFlags(),
+ LN->getAAInfo(), LN->getRanges());
+ LN = cast<LoadSDNode>(NL.getNode());
+ }
+
+ Align ClaimAlign = LN->getAlign();
+ if (!validateConstPtrAlignment(LN->getBasePtr(), ClaimAlign, dl, DAG))
+ return replaceMemWithUndef(Op, DAG);
+
// Call LowerUnalignedLoad for all loads; it recognizes loads that
// don't need extra aligning.
- return LowerUnalignedLoad(Op, DAG);
+ SDValue LU = LowerUnalignedLoad(SDValue(LN, 0), DAG);
+ if (DoCast) {
+ SDValue TC = DAG.getNode(HexagonISD::TYPECAST, dl, Ty, LU);
+ SDValue Ch = cast<LoadSDNode>(LU.getNode())->getChain();
+ return DAG.getMergeValues({TC, Ch}, dl);
+ }
+ return LU;
}
SDValue
HexagonTargetLowering::LowerStore(SDValue Op, SelectionDAG &DAG) const {
- StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
- unsigned ClaimAlign = SN->getAlignment();
- SDValue Ptr = SN->getBasePtr();
const SDLoc &dl(Op);
- validateConstPtrAlignment(Ptr, dl, ClaimAlign);
+ StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
+ SDValue Val = SN->getValue();
+ MVT Ty = ty(Val);
+
+ bool DoCast = (Ty == MVT::v2i1 || Ty == MVT::v4i1 || Ty == MVT::v8i1);
+ if (DoCast) {
+ SDValue TC = DAG.getNode(HexagonISD::TYPECAST, dl, MVT::i1, Val);
+ SDValue NS = DAG.getStore(SN->getChain(), dl, TC, SN->getBasePtr(),
+ SN->getMemOperand());
+ if (SN->isIndexed()) {
+ NS = DAG.getIndexedStore(NS, dl, SN->getBasePtr(), SN->getOffset(),
+ SN->getAddressingMode());
+ }
+ SN = cast<StoreSDNode>(NS.getNode());
+ }
+
+ Align ClaimAlign = SN->getAlign();
+ if (!validateConstPtrAlignment(SN->getBasePtr(), ClaimAlign, dl, DAG))
+ return replaceMemWithUndef(Op, DAG);
MVT StoreTy = SN->getMemoryVT().getSimpleVT();
- unsigned NeedAlign = Subtarget.getTypeAlignment(StoreTy);
+ Align NeedAlign = Subtarget.getTypeAlignment(StoreTy);
if (ClaimAlign < NeedAlign)
return expandUnalignedStore(SN, DAG);
- return Op;
+ return SDValue(SN, 0);
}
SDValue
@@ -2883,8 +2985,8 @@
const {
LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
MVT LoadTy = ty(Op);
- unsigned NeedAlign = Subtarget.getTypeAlignment(LoadTy);
- unsigned HaveAlign = LN->getAlignment();
+ unsigned NeedAlign = Subtarget.getTypeAlignment(LoadTy).value();
+ unsigned HaveAlign = LN->getAlign().value();
if (HaveAlign >= NeedAlign)
return Op;
@@ -2952,7 +3054,7 @@
WideMMO = MF.getMachineMemOperand(
MMO->getPointerInfo(), MMO->getFlags(), 2 * LoadLen, Align(LoadLen),
MMO->getAAInfo(), MMO->getRanges(), MMO->getSyncScopeID(),
- MMO->getOrdering(), MMO->getFailureOrdering());
+ MMO->getSuccessOrdering(), MMO->getFailureOrdering());
}
SDValue Load0 = DAG.getLoad(LoadTy, dl, Chain, Base0, WideMMO);
@@ -3442,8 +3544,8 @@
}
bool HexagonTargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace, unsigned Alignment,
- MachineMemOperand::Flags Flags, bool *Fast) const {
+ EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
+ bool *Fast) const {
MVT SVT = VT.getSimpleVT();
if (Subtarget.isHVXVectorType(SVT, true))
return allowsHvxMisalignedMemoryAccesses(SVT, Flags, Fast);
@@ -3489,31 +3591,32 @@
return true;
}
-Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
- AtomicOrdering Ord) const {
+Value *HexagonTargetLowering::emitLoadLinked(IRBuilderBase &Builder,
+ Type *ValueTy, Value *Addr,
+ AtomicOrdering Ord) const {
BasicBlock *BB = Builder.GetInsertBlock();
Module *M = BB->getParent()->getParent();
- auto PT = cast<PointerType>(Addr->getType());
- Type *Ty = PT->getElementType();
- unsigned SZ = Ty->getPrimitiveSizeInBits();
+ unsigned SZ = ValueTy->getPrimitiveSizeInBits();
assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic loads supported");
Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_L2_loadw_locked
: Intrinsic::hexagon_L4_loadd_locked;
Function *Fn = Intrinsic::getDeclaration(M, IntID);
- PointerType *NewPtrTy
- = Builder.getIntNTy(SZ)->getPointerTo(PT->getAddressSpace());
+ auto PtrTy = cast<PointerType>(Addr->getType());
+ PointerType *NewPtrTy =
+ Builder.getIntNTy(SZ)->getPointerTo(PtrTy->getAddressSpace());
Addr = Builder.CreateBitCast(Addr, NewPtrTy);
Value *Call = Builder.CreateCall(Fn, Addr, "larx");
- return Builder.CreateBitCast(Call, Ty);
+ return Builder.CreateBitCast(Call, ValueTy);
}
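// A hedged sketch of how the two hooks fit together after the IRBuilderBase /
// explicit-value-type change (illustrative only; the real caller is the
// target-independent atomic expansion, which additionally builds a retry loop
// and branches on the returned status; the function name is hypothetical).
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/AtomicOrdering.h"

static llvm::Value *emitOneLlScAttempt(const llvm::TargetLowering &TLI,
                                       llvm::IRBuilderBase &Builder,
                                       llvm::Type *ValueTy, llvm::Value *Addr,
                                       llvm::Value *NewVal,
                                       llvm::AtomicOrdering Ord) {
  // Load-link the old value, then try to store the new one conditionally.
  llvm::Value *Old = TLI.emitLoadLinked(Builder, ValueTy, Addr, Ord);
  (void)Old;
  llvm::Value *Status = TLI.emitStoreConditional(Builder, NewVal, Addr, Ord);
  return Status;                     // zero means the conditional store succeeded
}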
/// Perform a store-conditional operation to Addr. Return the status of the
/// store. This should be 0 if the store succeeded, non-zero otherwise.
-Value *HexagonTargetLowering::emitStoreConditional(IRBuilder<> &Builder,
- Value *Val, Value *Addr, AtomicOrdering Ord) const {
+Value *HexagonTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
+ Value *Val, Value *Addr,
+ AtomicOrdering Ord) const {
BasicBlock *BB = Builder.GetInsertBlock();
Module *M = BB->getParent()->getParent();
Type *Ty = Val->getType();
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index cfccb14a..d518c03 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -278,13 +278,6 @@
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
- unsigned
- getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
- if (ConstraintCode == "o")
- return InlineAsm::Constraint_o;
- return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
- }
-
// Intrinsics
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
@@ -318,8 +311,9 @@
bool *Fast) const override;
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
- unsigned Alignment, MachineMemOperand::Flags Flags, bool *Fast)
- const override;
+ Align Alignment,
+ MachineMemOperand::Flags Flags,
+ bool *Fast) const override;
/// Returns relocation base for the given PIC jumptable.
SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG)
@@ -329,10 +323,10 @@
EVT NewVT) const override;
// Handling of atomic RMW instructions.
- Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
- AtomicOrdering Ord) const override;
- Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
- Value *Addr, AtomicOrdering Ord) const override;
+ Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr,
+ AtomicOrdering Ord) const override;
+ Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr,
+ AtomicOrdering Ord) const override;
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
AtomicExpansionKind
@@ -347,8 +341,9 @@
void initializeHVXLowering();
unsigned getPreferredHvxVectorAction(MVT VecTy) const;
- void validateConstPtrAlignment(SDValue Ptr, const SDLoc &dl,
- unsigned NeedAlign) const;
+ bool validateConstPtrAlignment(SDValue Ptr, Align NeedAlign, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+ SDValue replaceMemWithUndef(SDValue Op, SelectionDAG &DAG) const;
std::pair<SDValue,int> getBaseAndOffset(SDValue Addr) const;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 29b7581..e7d3c7c 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -1954,6 +1954,8 @@
unsigned WideOpLen = (8 * HwLen) / ElemTy.getSizeInBits();
assert(WideOpLen * ElemTy.getSizeInBits() == 8 * HwLen);
MVT WideOpTy = MVT::getVectorVT(ElemTy, WideOpLen);
+ if (!Subtarget.isHVXVectorType(WideOpTy, true))
+ return SDValue();
SDValue WideOp0 = appendUndef(Op0, WideOpTy, DAG);
SDValue WideOp1 = appendUndef(Op1, WideOpTy, DAG);
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 26fc093..f14eaac 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1022,7 +1022,7 @@
return true;
};
- auto UseAligned = [&] (const MachineInstr &MI, unsigned NeedAlign) {
+ auto UseAligned = [&](const MachineInstr &MI, Align NeedAlign) {
if (MI.memoperands().empty())
return false;
return all_of(MI.memoperands(), [NeedAlign](const MachineMemOperand *MMO) {
@@ -1086,7 +1086,7 @@
const MachineOperand &BaseOp = MI.getOperand(1);
assert(BaseOp.getSubReg() == 0);
int Offset = MI.getOperand(2).getImm();
- unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
+ Align NeedAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass);
unsigned NewOpc = UseAligned(MI, NeedAlign) ? Hexagon::V6_vL32b_ai
: Hexagon::V6_vL32Ub_ai;
BuildMI(MBB, MI, DL, get(NewOpc), DstReg)
@@ -1102,7 +1102,7 @@
assert(BaseOp.getSubReg() == 0);
int Offset = MI.getOperand(2).getImm();
unsigned VecOffset = HRI.getSpillSize(Hexagon::HvxVRRegClass);
- unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
+ Align NeedAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass);
unsigned NewOpc = UseAligned(MI, NeedAlign) ? Hexagon::V6_vL32b_ai
: Hexagon::V6_vL32Ub_ai;
BuildMI(MBB, MI, DL, get(NewOpc),
@@ -1124,7 +1124,7 @@
const MachineOperand &BaseOp = MI.getOperand(0);
assert(BaseOp.getSubReg() == 0);
int Offset = MI.getOperand(1).getImm();
- unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
+ Align NeedAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass);
unsigned NewOpc = UseAligned(MI, NeedAlign) ? Hexagon::V6_vS32b_ai
: Hexagon::V6_vS32Ub_ai;
BuildMI(MBB, MI, DL, get(NewOpc))
@@ -1141,7 +1141,7 @@
assert(BaseOp.getSubReg() == 0);
int Offset = MI.getOperand(1).getImm();
unsigned VecOffset = HRI.getSpillSize(Hexagon::HvxVRRegClass);
- unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
+ Align NeedAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass);
unsigned NewOpc = UseAligned(MI, NeedAlign) ? Hexagon::V6_vS32b_ai
: Hexagon::V6_vS32Ub_ai;
BuildMI(MBB, MI, DL, get(NewOpc))
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
index 10d0261..370ea5f 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -365,41 +365,101 @@
def: Pat<(int_hexagon_V6_vdd0_128B),
(V6_vdd0)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
- (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
- (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
- (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
- (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5),
- (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
- (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
- (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
- (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
- (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw_add_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
- (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5),
- (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+
+multiclass T_VP_pat<InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID HvxVR:$Vu, DoubleRegs:$Rt),
+ (MI HvxVR:$Vu, DoubleRegs:$Rt)>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B")
+ HvxVR:$Vu, DoubleRegs:$Rt),
+ (MI HvxVR:$Vu, DoubleRegs:$Rt)>;
+}
+
+multiclass T_WVP_pat<InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID HvxWR:$Vx, HvxVR:$Vu, DoubleRegs:$Rt),
+ (MI HvxWR:$Vx, HvxVR:$Vu, DoubleRegs:$Rt)>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B")
+ HvxWR:$Vx, HvxVR:$Vu, DoubleRegs:$Rt),
+ (MI HvxWR:$Vx, HvxVR:$Vu, DoubleRegs:$Rt)>;
+}
+
+// These are actually only in V65.
+let Predicates = [HasV65, UseHVX] in {
+ defm: T_VP_pat<V6_vrmpyub_rtt, int_hexagon_V6_vrmpyub_rtt>;
+ defm: T_VP_pat<V6_vrmpybub_rtt, int_hexagon_V6_vrmpybub_rtt>;
+
+ defm: T_WVP_pat<V6_vrmpyub_rtt_acc, int_hexagon_V6_vrmpyub_rtt_acc>;
+ defm: T_WVP_pat<V6_vrmpybub_rtt_acc, int_hexagon_V6_vrmpybub_rtt_acc>;
+}
+
+
+multiclass T_pRI_pat<InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID PredRegs:$P, IntRegs:$R, timm:$s),
+ (MI PredRegs:$P, IntRegs:$R, imm:$s)>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B")
+ PredRegs:$P, IntRegs:$R, timm:$s),
+ (MI PredRegs:$P, IntRegs:$R, imm:$s)>;
+}
+
+multiclass T_pRM_pat<InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID PredRegs:$P, IntRegs:$R, ModRegs:$M),
+ (MI PredRegs:$P, IntRegs:$R, ModRegs:$M)>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B")
+ PredRegs:$P, IntRegs:$R, ModRegs:$M),
+ (MI PredRegs:$P, IntRegs:$R, ModRegs:$M)>;
+}
+
+let Predicates = [HasV62, UseHVX] in {
+ defm: T_pRI_pat<V6_vL32b_pred_ai, int_hexagon_V6_vL32b_pred_ai>;
+ defm: T_pRI_pat<V6_vL32b_npred_ai, int_hexagon_V6_vL32b_npred_ai>;
+ defm: T_pRI_pat<V6_vL32b_pred_pi, int_hexagon_V6_vL32b_pred_pi>;
+ defm: T_pRI_pat<V6_vL32b_npred_pi, int_hexagon_V6_vL32b_npred_pi>;
+ defm: T_pRI_pat<V6_vL32b_nt_pred_ai, int_hexagon_V6_vL32b_nt_pred_ai>;
+ defm: T_pRI_pat<V6_vL32b_nt_npred_ai, int_hexagon_V6_vL32b_nt_npred_ai>;
+ defm: T_pRI_pat<V6_vL32b_nt_pred_pi, int_hexagon_V6_vL32b_nt_pred_pi>;
+ defm: T_pRI_pat<V6_vL32b_nt_npred_pi, int_hexagon_V6_vL32b_nt_npred_pi>;
+
+ defm: T_pRM_pat<V6_vL32b_pred_ppu, int_hexagon_V6_vL32b_pred_ppu>;
+ defm: T_pRM_pat<V6_vL32b_npred_ppu, int_hexagon_V6_vL32b_npred_ppu>;
+ defm: T_pRM_pat<V6_vL32b_nt_pred_ppu, int_hexagon_V6_vL32b_nt_pred_ppu>;
+ defm: T_pRM_pat<V6_vL32b_nt_npred_ppu, int_hexagon_V6_vL32b_nt_npred_ppu>;
+}
+
+multiclass T_pRIV_pat<InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID PredRegs:$P, IntRegs:$R, timm:$s, HvxVR:$V),
+ (MI PredRegs:$P, IntRegs:$R, imm:$s, HvxVR:$V)>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B")
+ PredRegs:$P, IntRegs:$R, timm:$s, HvxVR:$V),
+ (MI PredRegs:$P, IntRegs:$R, imm:$s, HvxVR:$V)>;
+}
+
+multiclass T_pRMV_pat<InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID PredRegs:$P, IntRegs:$R, ModRegs:$M, HvxVR:$V),
+ (MI PredRegs:$P, IntRegs:$R, ModRegs:$M, HvxVR:$V)>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B")
+ PredRegs:$P, IntRegs:$R, ModRegs:$M, HvxVR:$V),
+ (MI PredRegs:$P, IntRegs:$R, ModRegs:$M, HvxVR:$V)>;
+}
+
+let Predicates = [HasV60, UseHVX] in {
+ defm: T_pRIV_pat<V6_vS32b_pred_ai, int_hexagon_V6_vS32b_pred_ai>;
+ defm: T_pRIV_pat<V6_vS32b_npred_ai, int_hexagon_V6_vS32b_npred_ai>;
+ defm: T_pRIV_pat<V6_vS32b_pred_pi, int_hexagon_V6_vS32b_pred_pi>;
+ defm: T_pRIV_pat<V6_vS32b_npred_pi, int_hexagon_V6_vS32b_npred_pi>;
+ defm: T_pRIV_pat<V6_vS32Ub_pred_ai, int_hexagon_V6_vS32Ub_pred_ai>;
+ defm: T_pRIV_pat<V6_vS32Ub_npred_ai, int_hexagon_V6_vS32Ub_npred_ai>;
+ defm: T_pRIV_pat<V6_vS32Ub_pred_pi, int_hexagon_V6_vS32Ub_pred_pi>;
+ defm: T_pRIV_pat<V6_vS32Ub_npred_pi, int_hexagon_V6_vS32Ub_npred_pi>;
+ defm: T_pRIV_pat<V6_vS32b_nt_pred_ai, int_hexagon_V6_vS32b_nt_pred_ai>;
+ defm: T_pRIV_pat<V6_vS32b_nt_npred_ai, int_hexagon_V6_vS32b_nt_npred_ai>;
+ defm: T_pRIV_pat<V6_vS32b_nt_pred_pi, int_hexagon_V6_vS32b_nt_pred_pi>;
+ defm: T_pRIV_pat<V6_vS32b_nt_npred_pi, int_hexagon_V6_vS32b_nt_npred_pi>;
+
+ defm: T_pRMV_pat<V6_vS32b_pred_ppu, int_hexagon_V6_vS32b_pred_ppu>;
+ defm: T_pRMV_pat<V6_vS32b_npred_ppu, int_hexagon_V6_vS32b_npred_ppu>;
+ defm: T_pRMV_pat<V6_vS32Ub_pred_ppu, int_hexagon_V6_vS32Ub_pred_ppu>;
+ defm: T_pRMV_pat<V6_vS32Ub_npred_ppu, int_hexagon_V6_vS32Ub_npred_ppu>;
+ defm: T_pRMV_pat<V6_vS32b_nt_pred_ppu, int_hexagon_V6_vS32b_nt_pred_ppu>;
+ defm: T_pRMV_pat<V6_vS32b_nt_npred_ppu, int_hexagon_V6_vS32b_nt_npred_ppu>;
+}
include "HexagonDepMapAsm2Intrin.td"
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td
deleted file mode 100644
index 7293075..0000000
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td
+++ /dev/null
@@ -1,85 +0,0 @@
-//===--- HexagonMapAsm2IntrinV65.gen.td -----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-def: Pat<(int_hexagon_A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2), (A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV65]>;
-def: Pat<(int_hexagon_V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vabsb HvxVR:$src1), (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vabsb_128B HvxVR:$src1), (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vabsb_sat HvxVR:$src1), (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vabsb_sat_128B HvxVR:$src1), (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vaslh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasrh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavguw HvxVR:$src1, HvxVR:$src2), (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavguw_128B HvxVR:$src1, HvxVR:$src2), (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavguwrnd HvxVR:$src1, HvxVR:$src2), (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavguwrnd_128B HvxVR:$src1, HvxVR:$src2), (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavgb HvxVR:$src1, HvxVR:$src2), (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavgb_128B HvxVR:$src1, HvxVR:$src2), (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavgbrnd HvxVR:$src1, HvxVR:$src2), (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavgbrnd_128B HvxVR:$src1, HvxVR:$src2), (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vlut4 HvxVR:$src1, DoubleRegs:$src2), (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vlut4_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vnavgb HvxVR:$src1, HvxVR:$src2), (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vnavgb_128B HvxVR:$src1, HvxVR:$src2), (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpabuu HvxWR:$src1, IntRegs:$src2), (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpabuu_128B HvxWR:$src1, IntRegs:$src2), (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpabuu_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpahhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpauhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpsuhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyuhe HvxVR:$src1, IntRegs:$src2), (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyuhe_128B HvxVR:$src1, IntRegs:$src2), (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyuhe_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqb HvxQR:$src1), (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqb_128B HvxQR:$src1), (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqh HvxQR:$src1), (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqh_128B HvxQR:$src1), (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqw HvxQR:$src1), (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqw_128B HvxQR:$src1), (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5), (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw_add_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5), (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vdd0), (V6_vdd0)>, Requires<[HasV65, UseHVX]>;
-def: Pat<(int_hexagon_V6_vdd0_128B), (V6_vdd0)>, Requires<[HasV65, UseHVX]>;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonPatterns.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonPatterns.td
index d216c51..cad5ca8 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -293,6 +293,8 @@
class Not2<PatFrag P>
: PatFrag<(ops node:$A, node:$B), (P node:$A, (not node:$B))>;
+class VNot2<PatFrag P, PatFrag Not>
+ : PatFrag<(ops node:$A, node:$B), (P node:$A, (Not node:$B))>;
// If there is a constant operand that feeds the and/or instruction,
// do not generate the compound instructions.
@@ -349,7 +351,7 @@
}
multiclass MinMax_pats<InstHexagon PickT, InstHexagon PickS,
- PatFrag Sel, PatFrag CmpOp,
+ SDPatternOperator Sel, SDPatternOperator CmpOp,
ValueType CmpType, PatFrag CmpPred> {
def: Pat<(Sel (CmpType (CmpOp CmpPred:$Vs, CmpPred:$Vt)),
CmpPred:$Vt, CmpPred:$Vs),
@@ -564,37 +566,50 @@
def: Pat<(pnot V8I1:$Ps), (C2_not V8I1:$Ps)>;
def: Pat<(add I1:$Ps, -1), (C2_not I1:$Ps)>;
-multiclass BoolOpR_RR_pat<InstHexagon MI, PatFrag Op> {
- def: OpR_RR_pat<MI, Op, i1, I1>;
- def: OpR_RR_pat<MI, Op, v2i1, V2I1>;
- def: OpR_RR_pat<MI, Op, v4i1, V4I1>;
- def: OpR_RR_pat<MI, Op, v8i1, V8I1>;
+def: OpR_RR_pat<C2_and, And, i1, I1>;
+def: OpR_RR_pat<C2_or, Or, i1, I1>;
+def: OpR_RR_pat<C2_xor, Xor, i1, I1>;
+def: OpR_RR_pat<C2_andn, Not2<And>, i1, I1>;
+def: OpR_RR_pat<C2_orn, Not2<Or>, i1, I1>;
+
+def: AccRRR_pat<C4_and_and, And, Su<And>, I1, I1, I1>;
+def: AccRRR_pat<C4_and_or, And, Su< Or>, I1, I1, I1>;
+def: AccRRR_pat<C4_or_and, Or, Su<And>, I1, I1, I1>;
+def: AccRRR_pat<C4_or_or, Or, Su< Or>, I1, I1, I1>;
+def: AccRRR_pat<C4_and_andn, And, Su<Not2<And>>, I1, I1, I1>;
+def: AccRRR_pat<C4_and_orn, And, Su<Not2< Or>>, I1, I1, I1>;
+def: AccRRR_pat<C4_or_andn, Or, Su<Not2<And>>, I1, I1, I1>;
+def: AccRRR_pat<C4_or_orn, Or, Su<Not2< Or>>, I1, I1, I1>;
+
+multiclass BoolvOpR_RR_pat<InstHexagon MI, PatFrag VOp> {
+ def: OpR_RR_pat<MI, VOp, v2i1, V2I1>;
+ def: OpR_RR_pat<MI, VOp, v4i1, V4I1>;
+ def: OpR_RR_pat<MI, VOp, v8i1, V8I1>;
}
-multiclass BoolAccRRR_pat<InstHexagon MI, PatFrag AccOp, PatFrag Op> {
- def: AccRRR_pat<MI, AccOp, Op, I1, I1, I1>;
- def: AccRRR_pat<MI, AccOp, Op, V2I1, V2I1, V2I1>;
- def: AccRRR_pat<MI, AccOp, Op, V4I1, V4I1, V4I1>;
- def: AccRRR_pat<MI, AccOp, Op, V8I1, V8I1, V8I1>;
+multiclass BoolvAccRRR_pat<InstHexagon MI, PatFrag AccOp, PatFrag VOp> {
+ def: AccRRR_pat<MI, AccOp, VOp, V2I1, V2I1, V2I1>;
+ def: AccRRR_pat<MI, AccOp, VOp, V4I1, V4I1, V4I1>;
+ def: AccRRR_pat<MI, AccOp, VOp, V8I1, V8I1, V8I1>;
}
-defm: BoolOpR_RR_pat<C2_and, And>;
-defm: BoolOpR_RR_pat<C2_or, Or>;
-defm: BoolOpR_RR_pat<C2_xor, Xor>;
-defm: BoolOpR_RR_pat<C2_andn, Not2<And>>;
-defm: BoolOpR_RR_pat<C2_orn, Not2<Or>>;
+defm: BoolvOpR_RR_pat<C2_and, And>;
+defm: BoolvOpR_RR_pat<C2_or, Or>;
+defm: BoolvOpR_RR_pat<C2_xor, Xor>;
+defm: BoolvOpR_RR_pat<C2_andn, VNot2<And, pnot>>;
+defm: BoolvOpR_RR_pat<C2_orn, VNot2< Or, pnot>>;
// op(Ps, op(Pt, Pu))
-defm: BoolAccRRR_pat<C4_and_and, And, Su<And>>;
-defm: BoolAccRRR_pat<C4_and_or, And, Su<Or>>;
-defm: BoolAccRRR_pat<C4_or_and, Or, Su<And>>;
-defm: BoolAccRRR_pat<C4_or_or, Or, Su<Or>>;
+defm: BoolvAccRRR_pat<C4_and_and, And, Su<And>>;
+defm: BoolvAccRRR_pat<C4_and_or, And, Su<Or>>;
+defm: BoolvAccRRR_pat<C4_or_and, Or, Su<And>>;
+defm: BoolvAccRRR_pat<C4_or_or, Or, Su<Or>>;
-// op(Ps, op(Pt, ~Pu))
-defm: BoolAccRRR_pat<C4_and_andn, And, Su<Not2<And>>>;
-defm: BoolAccRRR_pat<C4_and_orn, And, Su<Not2<Or>>>;
-defm: BoolAccRRR_pat<C4_or_andn, Or, Su<Not2<And>>>;
-defm: BoolAccRRR_pat<C4_or_orn, Or, Su<Not2<Or>>>;
+// op(Ps, op(Pt, !Pu))
+defm: BoolvAccRRR_pat<C4_and_andn, And, Su<VNot2<And, pnot>>>;
+defm: BoolvAccRRR_pat<C4_and_orn, And, Su<VNot2< Or, pnot>>>;
+defm: BoolvAccRRR_pat<C4_or_andn, Or, Su<VNot2<And, pnot>>>;
+defm: BoolvAccRRR_pat<C4_or_orn, Or, Su<VNot2< Or, pnot>>>;
// --(5) Compare ---------------------------------------------------------
@@ -1933,6 +1948,9 @@
// --(12) Load -----------------------------------------------------------
//
+def L1toI32: OutPatFrag<(ops node:$Rs), (A2_subri 0, (i32 $Rs))>;
+def L1toI64: OutPatFrag<(ops node:$Rs), (ToSext64 (L1toI32 $Rs))>;
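// A tiny C++ illustration of the arithmetic behind L1toI32 above (the same
// trick as the A2_subri-based pattern removed further down; the function name
// is hypothetical). A loaded i1 byte is 0 or 1, and sign-extending that
// single bit must give 0 or -1, which is exactly 0 minus the byte.
#include <cstdint>

static int32_t signExtendI1(uint8_t LoadedByte) {
  return 0 - static_cast<int32_t>(LoadedByte);  // 0 -> 0, 1 -> -1 (0xFFFFFFFF)
}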
+
def extloadv2i8: PatFrag<(ops node:$ptr), (extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v2i8;
}]>;
@@ -2089,11 +2107,17 @@
}
let AddedComplexity = 30 in {
+ // Loads of i1 load a byte, and that byte should be either 0 or 1.
+ // It doesn't matter whether the byte itself is loaded sign- or zero-extended,
+ // so use the zero-extending loadrub everywhere; sign-extending the i1 value
+ // itself (to 0 or -1) is done separately via L1toI32/L1toI64.
+ defm: Loadxim_pat<sextloadi1, i32, L1toI32, anyimm0, L2_loadrub_io>;
defm: Loadxim_pat<extloadi1, i64, ToAext64, anyimm0, L2_loadrub_io>;
+ defm: Loadxim_pat<sextloadi1, i64, L1toI64, anyimm0, L2_loadrub_io>;
+ defm: Loadxim_pat<zextloadi1, i64, ToZext64, anyimm0, L2_loadrub_io>;
+
defm: Loadxim_pat<extloadi8, i64, ToAext64, anyimm0, L2_loadrub_io>;
defm: Loadxim_pat<extloadi16, i64, ToAext64, anyimm1, L2_loadruh_io>;
defm: Loadxim_pat<extloadi32, i64, ToAext64, anyimm2, L2_loadri_io>;
- defm: Loadxim_pat<zextloadi1, i64, ToZext64, anyimm0, L2_loadrub_io>;
defm: Loadxim_pat<zextloadi8, i64, ToZext64, anyimm0, L2_loadrub_io>;
defm: Loadxim_pat<zextloadi16, i64, ToZext64, anyimm1, L2_loadruh_io>;
defm: Loadxim_pat<zextloadi32, i64, ToZext64, anyimm2, L2_loadri_io>;
@@ -2103,6 +2127,7 @@
}
let AddedComplexity = 60 in {
+ def: Loadxu_pat<extloadi1, i32, anyimm0, L4_loadrub_ur>;
def: Loadxu_pat<extloadi8, i32, anyimm0, L4_loadrub_ur>;
def: Loadxu_pat<extloadi16, i32, anyimm1, L4_loadruh_ur>;
def: Loadxu_pat<extloadv2i8, v2i16, anyimm1, L4_loadbzw2_ur>;
@@ -2111,6 +2136,7 @@
def: Loadxu_pat<sextloadi16, i32, anyimm1, L4_loadrh_ur>;
def: Loadxu_pat<sextloadv2i8, v2i16, anyimm1, L4_loadbsw2_ur>;
def: Loadxu_pat<sextloadv4i8, v4i16, anyimm2, L4_loadbzw4_ur>;
+ def: Loadxu_pat<zextloadi1, i32, anyimm0, L4_loadrub_ur>;
def: Loadxu_pat<zextloadi8, i32, anyimm0, L4_loadrub_ur>;
def: Loadxu_pat<zextloadi16, i32, anyimm1, L4_loadruh_ur>;
def: Loadxu_pat<zextloadv2i8, v2i16, anyimm1, L4_loadbzw2_ur>;
@@ -2125,6 +2151,11 @@
def: Loadxu_pat<load, f32, anyimm2, L4_loadri_ur>;
def: Loadxu_pat<load, f64, anyimm3, L4_loadrd_ur>;
+ def: Loadxum_pat<sextloadi1, i32, anyimm0, L1toI32, L4_loadrub_ur>;
+ def: Loadxum_pat<extloadi1, i64, anyimm0, ToAext64, L4_loadrub_ur>;
+ def: Loadxum_pat<sextloadi1, i64, anyimm0, L1toI64, L4_loadrub_ur>;
+ def: Loadxum_pat<zextloadi1, i64, anyimm0, ToZext64, L4_loadrub_ur>;
+
def: Loadxum_pat<sextloadi8, i64, anyimm0, ToSext64, L4_loadrb_ur>;
def: Loadxum_pat<zextloadi8, i64, anyimm0, ToZext64, L4_loadrub_ur>;
def: Loadxum_pat<extloadi8, i64, anyimm0, ToAext64, L4_loadrub_ur>;
@@ -2137,7 +2168,9 @@
}
let AddedComplexity = 40 in {
+ def: Loadxr_shl_pat<extloadi1, i32, L4_loadrub_rr>;
def: Loadxr_shl_pat<extloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_shl_pat<zextloadi1, i32, L4_loadrub_rr>;
def: Loadxr_shl_pat<zextloadi8, i32, L4_loadrub_rr>;
def: Loadxr_shl_pat<sextloadi8, i32, L4_loadrb_rr>;
def: Loadxr_shl_pat<extloadi16, i32, L4_loadruh_rr>;
@@ -2155,8 +2188,10 @@
}
let AddedComplexity = 20 in {
+ def: Loadxr_add_pat<extloadi1, i32, L4_loadrub_rr>;
def: Loadxr_add_pat<extloadi8, i32, L4_loadrub_rr>;
def: Loadxr_add_pat<zextloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_add_pat<zextloadi1, i32, L4_loadrub_rr>;
def: Loadxr_add_pat<sextloadi8, i32, L4_loadrb_rr>;
def: Loadxr_add_pat<extloadi16, i32, L4_loadruh_rr>;
def: Loadxr_add_pat<zextloadi16, i32, L4_loadruh_rr>;
@@ -2173,6 +2208,11 @@
}
let AddedComplexity = 40 in {
+ def: Loadxrm_shl_pat<sextloadi1, i32, L1toI32, L4_loadrub_rr>;
+ def: Loadxrm_shl_pat<extloadi1, i64, ToAext64, L4_loadrub_rr>;
+ def: Loadxrm_shl_pat<sextloadi1, i64, L1toI64, L4_loadrub_rr>;
+ def: Loadxrm_shl_pat<zextloadi1, i64, ToZext64, L4_loadrub_rr>;
+
def: Loadxrm_shl_pat<extloadi8, i64, ToAext64, L4_loadrub_rr>;
def: Loadxrm_shl_pat<zextloadi8, i64, ToZext64, L4_loadrub_rr>;
def: Loadxrm_shl_pat<sextloadi8, i64, ToSext64, L4_loadrb_rr>;
@@ -2184,7 +2224,12 @@
def: Loadxrm_shl_pat<sextloadi32, i64, ToSext64, L4_loadri_rr>;
}
-let AddedComplexity = 20 in {
+let AddedComplexity = 30 in {
+ def: Loadxrm_add_pat<sextloadi1, i32, L1toI32, L4_loadrub_rr>;
+ def: Loadxrm_add_pat<extloadi1, i64, ToAext64, L4_loadrub_rr>;
+ def: Loadxrm_add_pat<sextloadi1, i64, L1toI64, L4_loadrub_rr>;
+ def: Loadxrm_add_pat<zextloadi1, i64, ToZext64, L4_loadrub_rr>;
+
def: Loadxrm_add_pat<extloadi8, i64, ToAext64, L4_loadrub_rr>;
def: Loadxrm_add_pat<zextloadi8, i64, ToZext64, L4_loadrub_rr>;
def: Loadxrm_add_pat<sextloadi8, i64, ToSext64, L4_loadrb_rr>;
@@ -2199,12 +2244,13 @@
// Absolute address
let AddedComplexity = 60 in {
+ def: Loada_pat<extloadi1, i32, anyimm0, PS_loadrubabs>;
def: Loada_pat<zextloadi1, i32, anyimm0, PS_loadrubabs>;
- def: Loada_pat<sextloadi8, i32, anyimm0, PS_loadrbabs>;
def: Loada_pat<extloadi8, i32, anyimm0, PS_loadrubabs>;
+ def: Loada_pat<sextloadi8, i32, anyimm0, PS_loadrbabs>;
def: Loada_pat<zextloadi8, i32, anyimm0, PS_loadrubabs>;
- def: Loada_pat<sextloadi16, i32, anyimm1, PS_loadrhabs>;
def: Loada_pat<extloadi16, i32, anyimm1, PS_loadruhabs>;
+ def: Loada_pat<sextloadi16, i32, anyimm1, PS_loadrhabs>;
def: Loada_pat<zextloadi16, i32, anyimm1, PS_loadruhabs>;
def: Loada_pat<load, i32, anyimm2, PS_loadriabs>;
def: Loada_pat<load, v2i16, anyimm2, PS_loadriabs>;
@@ -2223,6 +2269,12 @@
}
let AddedComplexity = 30 in {
+ def: Loadam_pat<load, i1, anyimm0, I32toI1, PS_loadrubabs>;
+ def: Loadam_pat<sextloadi1, i32, anyimm0, L1toI32, PS_loadrubabs>;
+ def: Loadam_pat<extloadi1, i64, anyimm0, ToZext64, PS_loadrubabs>;
+ def: Loadam_pat<sextloadi1, i64, anyimm0, L1toI64, PS_loadrubabs>;
+ def: Loadam_pat<zextloadi1, i64, anyimm0, ToZext64, PS_loadrubabs>;
+
def: Loadam_pat<extloadi8, i64, anyimm0, ToAext64, PS_loadrubabs>;
def: Loadam_pat<sextloadi8, i64, anyimm0, ToSext64, PS_loadrbabs>;
def: Loadam_pat<zextloadi8, i64, anyimm0, ToZext64, PS_loadrubabs>;
@@ -2232,9 +2284,6 @@
def: Loadam_pat<extloadi32, i64, anyimm2, ToAext64, PS_loadriabs>;
def: Loadam_pat<sextloadi32, i64, anyimm2, ToSext64, PS_loadriabs>;
def: Loadam_pat<zextloadi32, i64, anyimm2, ToZext64, PS_loadriabs>;
-
- def: Loadam_pat<load, i1, anyimm0, I32toI1, PS_loadrubabs>;
- def: Loadam_pat<zextloadi1, i64, anyimm0, ToZext64, PS_loadrubabs>;
}
// GP-relative address
@@ -2265,6 +2314,11 @@
}
let AddedComplexity = 70 in {
+ def: Loadam_pat<sextloadi1, i32, addrgp, L1toI32, L2_loadrubgp>;
+ def: Loadam_pat<extloadi1, i64, addrgp, ToAext64, L2_loadrubgp>;
+ def: Loadam_pat<sextloadi1, i64, addrgp, L1toI64, L2_loadrubgp>;
+ def: Loadam_pat<zextloadi1, i64, addrgp, ToZext64, L2_loadrubgp>;
+
def: Loadam_pat<extloadi8, i64, addrgp, ToAext64, L2_loadrubgp>;
def: Loadam_pat<sextloadi8, i64, addrgp, ToSext64, L2_loadrbgp>;
def: Loadam_pat<zextloadi8, i64, addrgp, ToZext64, L2_loadrubgp>;
@@ -2276,17 +2330,8 @@
def: Loadam_pat<zextloadi32, i64, addrgp, ToZext64, L2_loadrigp>;
def: Loadam_pat<load, i1, addrgp, I32toI1, L2_loadrubgp>;
- def: Loadam_pat<zextloadi1, i64, addrgp, ToZext64, L2_loadrubgp>;
}
-
-// Sign-extending loads of i1 need to replicate the lowest bit throughout
-// the 32-bit value. Since the loaded value can only be 0 or 1, 0-v should
-// do the trick.
-let AddedComplexity = 20 in
-def: Pat<(i32 (sextloadi1 I32:$Rs)),
- (A2_subri 0, (L2_loadrub_io IntRegs:$Rs, 0))>;
-
// Patterns for loads of i1:
def: Pat<(i1 (load AddrFI:$fi)),
(C2_tfrrp (L2_loadrub_io AddrFI:$fi, 0))>;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
index cd894c5..a22a3f8 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
@@ -95,21 +95,41 @@
// HVX loads
-multiclass HvxLd_pat<InstHexagon MI, PatFrag Load, ValueType ResType,
+multiclass HvxLdfi_pat<InstHexagon MI, PatFrag Load, ValueType ResType,
+ PatFrag ImmPred> {
+ def: Pat<(ResType (Load (add (i32 AddrFI:$fi), ImmPred:$Off))),
+ (MI AddrFI:$fi, imm:$Off)>;
+ def: Pat<(ResType (Load (IsOrAdd (i32 AddrFI:$fi), ImmPred:$Off))),
+ (MI AddrFI:$fi, imm:$Off)>;
+ def: Pat<(ResType (Load AddrFI:$fi)), (ResType (MI AddrFI:$fi, 0))>;
+}
+
+multiclass HvxLdgi_pat<InstHexagon MI, PatFrag Load, ValueType ResType,
PatFrag ImmPred> {
+ def: Pat<(ResType (Load (add I32:$Rt, ImmPred:$Off))),
+ (MI I32:$Rt, imm:$Off)>;
def: Pat<(ResType (Load I32:$Rt)),
(MI I32:$Rt, 0)>;
- def: Pat<(ResType (Load (add I32:$Rt, ImmPred:$s))),
- (MI I32:$Rt, imm:$s)>;
+}
+
+multiclass HvxLdc_pat<InstHexagon MI, PatFrag Load, ValueType ResType> {
// The HVX selection code for shuffles can generate vector constants.
// Calling "Select" on the resulting loads from CP fails without these
// patterns.
- def: Pat<(ResType (Load (HexagonCP tconstpool:$A))),
- (MI (A2_tfrsi imm:$A), 0)>;
- def: Pat<(ResType (Load (HexagonAtPcrel tconstpool:$A))),
- (MI (C4_addipc imm:$A), 0)>;
+ def: Pat<(ResType (Load (HexagonCP tconstpool:$Addr))),
+ (MI (A2_tfrsi imm:$Addr), 0)>;
+ def: Pat<(ResType (Load (HexagonAtPcrel tconstpool:$Addr))),
+ (MI (C4_addipc imm:$Addr), 0)>;
}
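+// Plain loads: frame-index, register+offset, and constant-pool patterns.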
+multiclass HvxLd_pat<InstHexagon MI, PatFrag Load, ValueType ResType,
+ PatFrag ImmPred> {
+ defm: HvxLdfi_pat<MI, Load, ResType, ImmPred>;
+ defm: HvxLdgi_pat<MI, Load, ResType, ImmPred>;
+ defm: HvxLdc_pat <MI, Load, ResType>;
+}
+
+// Aligned loads: everything, plus loads with valignaddr node.
multiclass HvxLda_pat<InstHexagon MI, PatFrag Load, ValueType ResType,
PatFrag ImmPred> {
let AddedComplexity = 50 in {
@@ -122,41 +142,61 @@
}
let Predicates = [UseHVX] in {
+ // alignedload will match a non-temporal load as well, so try non-temporal
+ // first.
defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecI8, IsVecOff>;
defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecI16, IsVecOff>;
defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecI32, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI8, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI16, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI32, IsVecOff>;
- defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI8, IsVecOff>;
- defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI16, IsVecOff>;
- defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI32, IsVecOff>;
-
- defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI8, IsVecOff>;
- defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI16, IsVecOff>;
- defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI32, IsVecOff>;
+ defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI8, IsVecOff>;
+ defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI16, IsVecOff>;
+ defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI32, IsVecOff>;
}
+
// HVX stores
-multiclass HvxSt_pat<InstHexagon MI, PatFrag Store, PatFrag ImmPred,
- PatFrag Value> {
+multiclass HvxStfi_pat<InstHexagon MI, PatFrag Store, PatFrag Value,
+ PatFrag ImmPred> {
+ def: Pat<(Store Value:$Vs, (add (i32 AddrFI:$fi), ImmPred:$Off)),
+ (MI AddrFI:$fi, imm:$Off, Value:$Vs)>;
+ def: Pat<(Store Value:$Vs, (IsOrAdd (i32 AddrFI:$fi), ImmPred:$Off)),
+ (MI AddrFI:$fi, imm:$Off, Value:$Vs)>;
+ def: Pat<(Store Value:$Vs, AddrFI:$fi),
+ (MI AddrFI:$fi, 0, Value:$Vs)>;
+}
+
+multiclass HvxStgi_pat<InstHexagon MI, PatFrag Store, PatFrag Value,
+ PatFrag ImmPred> {
+ def: Pat<(Store Value:$Vs, (add I32:$Rt, ImmPred:$Off)),
+ (MI I32:$Rt, imm:$Off, Value:$Vs)>;
+ def: Pat<(Store Value:$Vs, (IsOrAdd I32:$Rt, ImmPred:$Off)),
+ (MI I32:$Rt, imm:$Off, Value:$Vs)>;
def: Pat<(Store Value:$Vs, I32:$Rt),
(MI I32:$Rt, 0, Value:$Vs)>;
- def: Pat<(Store Value:$Vs, (add I32:$Rt, ImmPred:$s)),
- (MI I32:$Rt, imm:$s, Value:$Vs)>;
+}
+
+multiclass HvxSt_pat<InstHexagon MI, PatFrag Store, PatFrag Value,
+ PatFrag ImmPred> {
+ defm: HvxStfi_pat<MI, Store, Value, ImmPred>;
+ defm: HvxStgi_pat<MI, Store, Value, ImmPred>;
}
let Predicates = [UseHVX] in {
- defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, IsVecOff, HVI8>;
- defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, IsVecOff, HVI16>;
- defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, IsVecOff, HVI32>;
-
- defm: HvxSt_pat<V6_vS32b_ai, alignedstore, IsVecOff, HVI8>;
- defm: HvxSt_pat<V6_vS32b_ai, alignedstore, IsVecOff, HVI16>;
- defm: HvxSt_pat<V6_vS32b_ai, alignedstore, IsVecOff, HVI32>;
-
- defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, IsVecOff, HVI8>;
- defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, IsVecOff, HVI16>;
- defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, IsVecOff, HVI32>;
+ // alignedstore will match a non-temporal store as well, so try non-temporal
+ // first.
+ defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, HVI8, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, HVI16, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, HVI32, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32b_ai, alignedstore, HVI8, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32b_ai, alignedstore, HVI16, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32b_ai, alignedstore, HVI32, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, HVI8, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, HVI16, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, HVI32, IsVecOff>;
}
// Bitcasts between same-size vector types are no-ops, except for the
@@ -526,32 +566,32 @@
def: Pat<(qnot HQ16:$Qs), (V6_pred_not HvxQR:$Qs)>;
def: Pat<(qnot HQ32:$Qs), (V6_pred_not HvxQR:$Qs)>;
- def: OpR_RR_pat<V6_pred_and, And, VecQ8, HQ8>;
- def: OpR_RR_pat<V6_pred_and, And, VecQ16, HQ16>;
- def: OpR_RR_pat<V6_pred_and, And, VecQ32, HQ32>;
- def: OpR_RR_pat<V6_pred_or, Or, VecQ8, HQ8>;
- def: OpR_RR_pat<V6_pred_or, Or, VecQ16, HQ16>;
- def: OpR_RR_pat<V6_pred_or, Or, VecQ32, HQ32>;
- def: OpR_RR_pat<V6_pred_xor, Xor, VecQ8, HQ8>;
- def: OpR_RR_pat<V6_pred_xor, Xor, VecQ16, HQ16>;
- def: OpR_RR_pat<V6_pred_xor, Xor, VecQ32, HQ32>;
+ def: OpR_RR_pat<V6_pred_and, And, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_and, And, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_and, And, VecQ32, HQ32>;
+ def: OpR_RR_pat<V6_pred_or, Or, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_or, Or, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_or, Or, VecQ32, HQ32>;
+ def: OpR_RR_pat<V6_pred_xor, Xor, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_xor, Xor, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_xor, Xor, VecQ32, HQ32>;
- def: OpR_RR_pat<V6_pred_and_n, Not2<And>, VecQ8, HQ8>;
- def: OpR_RR_pat<V6_pred_and_n, Not2<And>, VecQ16, HQ16>;
- def: OpR_RR_pat<V6_pred_and_n, Not2<And>, VecQ32, HQ32>;
- def: OpR_RR_pat<V6_pred_or_n, Not2<Or>, VecQ8, HQ8>;
- def: OpR_RR_pat<V6_pred_or_n, Not2<Or>, VecQ16, HQ16>;
- def: OpR_RR_pat<V6_pred_or_n, Not2<Or>, VecQ32, HQ32>;
+ def: OpR_RR_pat<V6_pred_and_n, VNot2<And, qnot>, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_and_n, VNot2<And, qnot>, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_and_n, VNot2<And, qnot>, VecQ32, HQ32>;
+ def: OpR_RR_pat<V6_pred_or_n, VNot2<Or, qnot>, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_or_n, VNot2<Or, qnot>, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_or_n, VNot2<Or, qnot>, VecQ32, HQ32>;
- def: OpR_RR_pat<V6_veqb, seteq, VecQ8, HVI8>;
- def: OpR_RR_pat<V6_veqh, seteq, VecQ16, HVI16>;
- def: OpR_RR_pat<V6_veqw, seteq, VecQ32, HVI32>;
- def: OpR_RR_pat<V6_vgtb, setgt, VecQ8, HVI8>;
- def: OpR_RR_pat<V6_vgth, setgt, VecQ16, HVI16>;
- def: OpR_RR_pat<V6_vgtw, setgt, VecQ32, HVI32>;
- def: OpR_RR_pat<V6_vgtub, setugt, VecQ8, HVI8>;
- def: OpR_RR_pat<V6_vgtuh, setugt, VecQ16, HVI16>;
- def: OpR_RR_pat<V6_vgtuw, setugt, VecQ32, HVI32>;
+ def: OpR_RR_pat<V6_veqb, seteq, VecQ8, HVI8>;
+ def: OpR_RR_pat<V6_veqh, seteq, VecQ16, HVI16>;
+ def: OpR_RR_pat<V6_veqw, seteq, VecQ32, HVI32>;
+ def: OpR_RR_pat<V6_vgtb, setgt, VecQ8, HVI8>;
+ def: OpR_RR_pat<V6_vgth, setgt, VecQ16, HVI16>;
+ def: OpR_RR_pat<V6_vgtw, setgt, VecQ32, HVI32>;
+ def: OpR_RR_pat<V6_vgtub, setugt, VecQ8, HVI8>;
+ def: OpR_RR_pat<V6_vgtuh, setugt, VecQ16, HVI16>;
+ def: OpR_RR_pat<V6_vgtuw, setugt, VecQ32, HVI32>;
def: AccRRR_pat<V6_veqb_and, And, seteq, HQ8, HVI8, HVI8>;
def: AccRRR_pat<V6_veqb_or, Or, seteq, HQ8, HVI8, HVI8>;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 5ece577..6e55bc6 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -17,8 +17,10 @@
#include "HexagonSubtarget.h"
#include "HexagonTargetMachine.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -30,6 +32,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -41,6 +44,15 @@
using namespace llvm;
+static cl::opt<unsigned> FrameIndexSearchRange(
+ "hexagon-frame-index-search-range", cl::init(32), cl::Hidden,
+ cl::desc("Limit on instruction search range in frame index elimination"));
+
+static cl::opt<unsigned> FrameIndexReuseLimit(
+ "hexagon-frame-index-reuse-limit", cl::init(~0), cl::Hidden,
+ cl::desc("Limit on the number of reused registers in frame index "
+ "elimination"));
+
HexagonRegisterInfo::HexagonRegisterInfo(unsigned HwMode)
: HexagonGenRegisterInfo(Hexagon::R31, 0/*DwarfFlavor*/, 0/*EHFlavor*/,
0/*PC*/, HwMode) {}
@@ -133,7 +145,7 @@
BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
- const {
+ const {
BitVector Reserved(getNumRegs());
Reserved.set(Hexagon::R29);
Reserved.set(Hexagon::R30);
@@ -188,10 +200,10 @@
return Reserved;
}
-
void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOp,
RegScavenger *RS) const {
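+ // ReuseCount below is a function-local static, so the
+ // -hexagon-frame-index-reuse-limit option caps the total number of
+ // base-register reuses across the whole compilation, not per function.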
+ static unsigned ReuseCount = 0;
//
// Hexagon_TODO: Do we need to enforce this for Hexagon?
assert(SPAdj == 0 && "Unexpected");
@@ -210,7 +222,6 @@
int Offset = HFI.getFrameIndexReference(MF, FI, BP).getFixed();
// Add the offset from the instruction.
int RealOffset = Offset + MI.getOperand(FIOp+1).getImm();
- bool IsKill = false;
unsigned Opc = MI.getOpcode();
switch (Opc) {
@@ -228,18 +239,107 @@
if (!HII.isValidOffset(Opc, RealOffset, this)) {
// If the offset is not valid, calculate the address in a temporary
// register and use it with offset 0.
+ int InstOffset = 0;
+ // The actual base register (BP) is typically shared between many
+ // instructions where frame indices are being replaced. In scalar
+ // instructions the offset range is large, and the need for an extra
+ // add instruction is infrequent. Vector loads/stores, however, have
+ // a much smaller offset range: [-8, 7], or #s4. In those cases it
+ // makes sense to "standardize" the immediate in the "addi" instruction
+ // so that multiple loads/stores could be based on it.
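+ // For example, with a 128-byte vector length, byte offsets 2048 and 2176
+ // both map to "addi BP, 2048", with instruction offsets 0 and 128, so the
+ // two accesses can share a single base register.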
+ bool IsPair = false;
+ switch (MI.getOpcode()) {
+ // All of these instructions have the same format: base+#s4.
+ case Hexagon::PS_vloadrw_ai:
+ case Hexagon::PS_vloadrw_nt_ai:
+ case Hexagon::PS_vstorerw_ai:
+ case Hexagon::PS_vstorerw_nt_ai:
+ IsPair = true;
+ LLVM_FALLTHROUGH;
+ case Hexagon::PS_vloadrv_ai:
+ case Hexagon::PS_vloadrv_nt_ai:
+ case Hexagon::PS_vstorerv_ai:
+ case Hexagon::PS_vstorerv_nt_ai:
+ case Hexagon::V6_vL32b_ai:
+ case Hexagon::V6_vS32b_ai: {
+ unsigned HwLen = HST.getVectorLength();
+ if (RealOffset % HwLen == 0) {
+ int VecOffset = RealOffset / HwLen;
+ // Rewrite the offset as "base + [-8, 7]".
+ VecOffset += 8;
+ // Pairs are expanded into two instructions: make sure that both
+ // can use the same base (i.e. VecOffset+1 is not a different
+ // multiple of 16 than VecOffset).
+ if (!IsPair || (VecOffset + 1) % 16 != 0) {
+ RealOffset = (VecOffset & -16) * HwLen;
+ InstOffset = (VecOffset % 16 - 8) * HwLen;
+ }
+ }
+ }
+ }
+
+ // Search backwards in the block for "Reg = A2_addi BP, RealOffset".
+ // This will give us a chance to avoid creating a new register.
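+ // If such an instruction is found, its destination register can serve as
+ // the base for this access instead of a freshly created A2_addi result.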
+ Register ReuseBP;
+
+ if (ReuseCount < FrameIndexReuseLimit) {
+ unsigned SearchCount = 0, SearchRange = FrameIndexSearchRange;
+ SmallSet<Register,2> SeenVRegs;
+ bool PassedCall = false;
+ LiveRegUnits Defs(*this), Uses(*this);
+
+ for (auto I = std::next(II.getReverse()), E = MB.rend(); I != E; ++I) {
+ if (SearchCount == SearchRange)
+ break;
+ ++SearchCount;
+ const MachineInstr &BI = *I;
+ LiveRegUnits::accumulateUsedDefed(BI, Defs, Uses, this);
+ PassedCall |= BI.isCall();
+ for (const MachineOperand &Op : BI.operands()) {
+ if (SeenVRegs.size() > 1)
+ break;
+ if (Op.isReg() && Op.getReg().isVirtual())
+ SeenVRegs.insert(Op.getReg());
+ }
+ if (BI.getOpcode() != Hexagon::A2_addi)
+ continue;
+ if (BI.getOperand(1).getReg() != BP)
+ continue;
+ const auto &Op2 = BI.getOperand(2);
+ if (!Op2.isImm() || Op2.getImm() != RealOffset)
+ continue;
+
+ Register R = BI.getOperand(0).getReg();
+ if (R.isPhysical()) {
+ if (Defs.available(R))
+ ReuseBP = R;
+ } else if (R.isVirtual()) {
+ // Extending a range of a virtual register can be dangerous,
+ // since the scavenger will need to find a physical register
+ // for it. Avoid extending the range past a function call,
+ // and avoid overlapping it with another virtual register.
+ if (!PassedCall && SeenVRegs.size() <= 1)
+ ReuseBP = R;
+ }
+ break;
+ }
+ if (ReuseBP)
+ ++ReuseCount;
+ }
+
auto &MRI = MF.getRegInfo();
- Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
- const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(MB, II, DL, HII.get(Hexagon::A2_addi), TmpR)
- .addReg(BP)
- .addImm(RealOffset);
- BP = TmpR;
- RealOffset = 0;
- IsKill = true;
+ if (!ReuseBP) {
+ ReuseBP = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(MB, II, DL, HII.get(Hexagon::A2_addi), ReuseBP)
+ .addReg(BP)
+ .addImm(RealOffset);
+ }
+ BP = ReuseBP;
+ RealOffset = InstOffset;
}
- MI.getOperand(FIOp).ChangeToRegister(BP, false, false, IsKill);
+ MI.getOperand(FIOp).ChangeToRegister(BP, false, false, false);
MI.getOperand(FIOp+1).ChangeToImmediate(RealOffset);
}
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonSchedule.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonSchedule.td
index 5efd02a..88d775f 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonSchedule.td
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonSchedule.td
@@ -68,3 +68,4 @@
include "HexagonScheduleV66.td"
include "HexagonScheduleV67.td"
include "HexagonScheduleV67T.td"
+include "HexagonScheduleV68.td"
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonScheduleV68.td b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonScheduleV68.td
new file mode 100644
index 0000000..fefc130
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonScheduleV68.td
@@ -0,0 +1,38 @@
+//=-HexagonScheduleV68.td - HexagonV68 Scheduling Definitions *- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//
+// ScalarItin and HVXItin contain some old itineraries
+// still used by a handful of instructions. Hopefully, we will be able
+// to get rid of them soon.
+def HexagonV68ItinList : DepScalarItinV68, ScalarItin,
+ DepHVXItinV68, HVXItin, PseudoItin {
+ list<InstrItinData> ItinList =
+ !listconcat(DepScalarItinV68_list, ScalarItin_list,
+ DepHVXItinV68_list, HVXItin_list, PseudoItin_list);
+}
+
+def HexagonItinerariesV68 :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
+ CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
+ CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL,
+ CVI_ALL_NOMEM, CVI_ZW],
+ [Hex_FWD, HVX_FWD],
+ HexagonV68ItinList.ItinList>;
+
+def HexagonModelV68 : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItinerariesV68;
+ let LoadLatency = 1;
+ let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V68 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
index c8c66eb..52452e9 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -6,8 +6,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hsdr"
-
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
@@ -42,6 +40,8 @@
#include <utility>
#include <vector>
+#define DEBUG_TYPE "hsdr"
+
using namespace llvm;
namespace llvm {
@@ -296,8 +296,7 @@
Visited.insert(T);
// Add all registers associated with T.
USet &Asc = AssocMap[T];
- for (USet::iterator J = Asc.begin(), F = Asc.end(); J != F; ++J)
- WorkQ.push_back(*J);
+ append_range(WorkQ, Asc);
}
}
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
index 2c40071..93ba277 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
@@ -180,7 +180,7 @@
return true;
MemoryLocation SL(SMO.getValue(), SMO.getSize(), SMO.getAAInfo());
- if (AA->alias(L, SL))
+ if (!AA->isNoAlias(L, SL))
return true;
}
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index 7b7fb8d..a4f2e15 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Alignment.h"
#include <memory>
#include <string>
#include <vector>
@@ -179,6 +180,12 @@
bool hasV67OpsOnly() const {
return getHexagonArchVersion() == Hexagon::ArchEnum::V67;
}
+ bool hasV68Ops() const {
+ return getHexagonArchVersion() >= Hexagon::ArchEnum::V68;
+ }
+ bool hasV68OpsOnly() const {
+ return getHexagonArchVersion() == Hexagon::ArchEnum::V68;
+ }
bool useAudioOps() const { return UseAudioOps; }
bool useCompound() const { return UseCompound; }
@@ -212,6 +219,9 @@
bool useHVXV67Ops() const {
return HexagonHVXVersion >= Hexagon::ArchEnum::V67;
}
+ bool useHVXV68Ops() const {
+ return HexagonHVXVersion >= Hexagon::ArchEnum::V68;
+ }
bool useHVX128BOps() const { return useHVXOps() && UseHVX128BOps; }
bool useHVX64BOps() const { return useHVXOps() && UseHVX64BOps; }
@@ -279,10 +289,10 @@
bool isHVXVectorType(MVT VecTy, bool IncludeBool = false) const;
bool isTypeForHVX(Type *VecTy, bool IncludeBool = false) const;
- unsigned getTypeAlignment(MVT Ty) const {
+ Align getTypeAlignment(MVT Ty) const {
if (isHVXVectorType(Ty, true))
- return getVectorLength();
- return Ty.getSizeInBits() / 8;
+ return Align(getVectorLength());
+ return Align(std::max<unsigned>(1, Ty.getSizeInBits() / 8));
}
unsigned getL1CacheLineSize() const;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 9195bb3..3cbb4b5 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -251,8 +251,7 @@
// Creating a separate target feature is not strictly necessary, it only
// exists to make "unsafe-fp-math" force creating a new subtarget.
- if (FnAttrs.hasFnAttribute("unsafe-fp-math") &&
- F.getFnAttribute("unsafe-fp-math").getValueAsString() == "true")
+ if (F.getFnAttribute("unsafe-fp-math").getValueAsBool())
FS = FS.empty() ? "+unsafe-fp" : "+unsafe-fp," + FS;
auto &I = SubtargetMap[CPU + FS];
@@ -279,8 +278,7 @@
});
}
-void HexagonTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
- bool DebugPassManager) {
+void HexagonTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerLateLoopOptimizationsEPCallback(
[=](LoopPassManager &LPM, PassBuilder::OptimizationLevel Level) {
LPM.addPass(HexagonLoopIdiomRecognitionPass());
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
index fa17412..66679df 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -37,8 +37,7 @@
static unsigned getModuleMatchQuality(const Module &M);
void adjustPassManager(PassManagerBuilder &PMB) override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool DebugPassManager) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
TargetTransformInfo getTargetTransformInfo(const Function &F) override;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index 595cf94..2546678 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -10,8 +10,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hexagon-sdata"
-
#include "HexagonTargetObjectFile.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
@@ -32,6 +30,8 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#define DEBUG_TYPE "hexagon-sdata"
+
using namespace llvm;
static cl::opt<unsigned> SmallDataThreshold("hexagon-small-data-threshold",
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 1cefa6a..108027d 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -80,8 +80,10 @@
}
}
-bool HexagonTTIImpl::shouldFavorPostInc() const {
- return true;
+TTI::AddressingModeKind
+HexagonTTIImpl::getPreferredAddressingMode(const Loop *L,
+ ScalarEvolution *SE) const {
+ return TTI::AMK_PostIndexed;
}
/// --- Vector TTI begin ---
@@ -96,54 +98,69 @@
return useHVX() ? 2 : 1;
}
-unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const {
- return Vector ? getMinVectorRegisterBitWidth() : 32;
+TypeSize
+HexagonTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(32);
+ case TargetTransformInfo::RGK_FixedWidthVector:
+ return TypeSize::getFixed(getMinVectorRegisterBitWidth());
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(0);
+ }
+
+ llvm_unreachable("Unsupported register kind");
}
unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const {
return useHVX() ? ST.getVectorLength()*8 : 32;
}
-unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {
- return (8 * ST.getVectorLength()) / ElemWidth;
+ElementCount HexagonTTIImpl::getMinimumVF(unsigned ElemWidth,
+ bool IsScalable) const {
+ assert(!IsScalable && "Scalable VFs are not supported for Hexagon");
+ return ElementCount::getFixed((8 * ST.getVectorLength()) / ElemWidth);
}
-unsigned HexagonTTIImpl::getScalarizationOverhead(VectorType *Ty,
- const APInt &DemandedElts,
- bool Insert, bool Extract) {
+InstructionCost HexagonTTIImpl::getScalarizationOverhead(
+ VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) {
return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
-unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
- ArrayRef<const Value*> Args, unsigned VF) {
- return BaseT::getOperandsScalarizationOverhead(Args, VF);
+InstructionCost
+HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+ ArrayRef<Type *> Tys) {
+ return BaseT::getOperandsScalarizationOverhead(Args, Tys);
}
-unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
- ArrayRef<Type*> Tys, TTI::TargetCostKind CostKind) {
+InstructionCost HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
+ ArrayRef<Type *> Tys,
+ TTI::TargetCostKind CostKind) {
return BaseT::getCallInstrCost(F, RetTy, Tys, CostKind);
}
-unsigned
+InstructionCost
HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
if (ICA.getID() == Intrinsic::bswap) {
- std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ICA.getReturnType());
+ std::pair<InstructionCost, MVT> LT =
+ TLI.getTypeLegalizationCost(DL, ICA.getReturnType());
return LT.first + 2;
}
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
-unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp,
- ScalarEvolution *SE, const SCEV *S) {
+InstructionCost HexagonTTIImpl::getAddressComputationCost(Type *Tp,
+ ScalarEvolution *SE,
+ const SCEV *S) {
return 0;
}
-unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
- MaybeAlign Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
@@ -157,7 +174,9 @@
VectorType *VecTy = cast<VectorType>(Src);
unsigned VecWidth = VecTy->getPrimitiveSizeInBits().getFixedSize();
if (useHVX() && ST.isTypeForHVX(VecTy)) {
- unsigned RegWidth = getRegisterBitWidth(true);
+ unsigned RegWidth =
+ getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ .getFixedSize();
assert(RegWidth && "Non-zero vector register width expected");
// Cost of HVX loads.
if (VecWidth % RegWidth == 0)
@@ -193,27 +212,28 @@
CostKind, I);
}
-unsigned HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
- Align Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
CostKind);
}
-unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
- int Index, Type *SubTp) {
+InstructionCost HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
+ ArrayRef<int> Mask, int Index,
+ Type *SubTp) {
return 1;
}
-unsigned HexagonTTIImpl::getGatherScatterOpCost(
+InstructionCost HexagonTTIImpl::getGatherScatterOpCost(
unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
Alignment, CostKind, I);
}
-unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(
+InstructionCost HexagonTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) {
@@ -226,23 +246,23 @@
CostKind);
}
-unsigned HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy,
- CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) {
- std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ValTy);
+ std::pair<InstructionCost, MVT> LT = TLI.getTypeLegalizationCost(DL, ValTy);
if (Opcode == Instruction::FCmp)
return LT.first + FloatFactor * getTypeNumElements(ValTy);
}
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
-unsigned HexagonTTIImpl::getArithmeticInstrCost(
+InstructionCost HexagonTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
- TTI::OperandValueKind Opd1Info,
- TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
// TODO: Handle more cost kinds.
@@ -252,7 +272,7 @@
Opd2PropInfo, Args, CxtI);
if (Ty->isVectorTy()) {
- std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI.getTypeLegalizationCost(DL, Ty);
if (LT.second.isFloatingPoint())
return LT.first + FloatFactor * getTypeNumElements(Ty);
}
@@ -260,17 +280,21 @@
Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}
-unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
- Type *SrcTy, TTI::CastContextHint CCH,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
+ Type *SrcTy,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) {
unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0;
unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0;
- std::pair<int, MVT> SrcLT = TLI.getTypeLegalizationCost(DL, SrcTy);
- std::pair<int, MVT> DstLT = TLI.getTypeLegalizationCost(DL, DstTy);
- unsigned Cost = std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN);
+ std::pair<InstructionCost, MVT> SrcLT =
+ TLI.getTypeLegalizationCost(DL, SrcTy);
+ std::pair<InstructionCost, MVT> DstLT =
+ TLI.getTypeLegalizationCost(DL, DstTy);
+ InstructionCost Cost =
+ std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN);
// TODO: Allow non-throughput costs that aren't binary.
if (CostKind != TTI::TCK_RecipThroughput)
return Cost == 0 ? 0 : 1;
@@ -279,8 +303,8 @@
return 1;
}
-unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) {
+InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) {
Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
: Val;
if (Opcode == Instruction::InsertElement) {
@@ -316,10 +340,9 @@
return ST.getL1CacheLineSize();
}
-int
-HexagonTTIImpl::getUserCost(const User *U,
- ArrayRef<const Value *> Operands,
- TTI::TargetCostKind CostKind) {
+InstructionCost HexagonTTIImpl::getUserCost(const User *U,
+ ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind) {
auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool {
if (!CI->isIntegerCast())
return false;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 835358d..2144fb2 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -67,7 +67,8 @@
TTI::PeelingPreferences &PP);
/// Bias LSR towards creating post-increment opportunities.
- bool shouldFavorPostInc() const;
+ TTI::AddressingModeKind
+ getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const;
// L1 cache prefetch.
unsigned getPrefetchDistance() const override;
@@ -80,11 +81,11 @@
unsigned getNumberOfRegisters(bool vector) const;
unsigned getMaxInterleaveFactor(unsigned VF);
- unsigned getRegisterBitWidth(bool Vector) const;
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
unsigned getMinVectorRegisterBitWidth() const;
- unsigned getMinimumVF(unsigned ElemWidth) const;
+ ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const;
- bool shouldMaximizeVectorBandwidth(bool OptSize) const {
+ bool shouldMaximizeVectorBandwidth() const {
return true;
}
bool supportsEfficientVectorElementLoadStore() {
@@ -103,41 +104,43 @@
return true;
}
- unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
- bool Insert, bool Extract);
- unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
- unsigned VF);
- unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys,
- TTI::TargetCostKind CostKind);
- unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind);
- unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
- const SCEV *S);
- unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
- unsigned
+ InstructionCost getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract);
+ InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+ ArrayRef<Type *> Tys);
+ InstructionCost getCallInstrCost(Function *F, Type *RetTy,
+ ArrayRef<Type *> Tys,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
+ const SCEV *S);
+ InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost
getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
- unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp);
- unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
- const Value *Ptr, bool VariableMask,
- Align Alignment, TTI::TargetCostKind CostKind,
- const Instruction *I);
- unsigned getInterleavedMemoryOpCost(
+ InstructionCost getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
+ ArrayRef<int> Mask, int Index, Type *SubTp);
+ InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I);
+ InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
- unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
-
- CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
- unsigned getArithmeticInstrCost(
+ InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
@@ -146,13 +149,15 @@
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
- unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index);
- unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr) {
return 1;
}
@@ -161,8 +166,8 @@
/// @}
- int getUserCost(const User *U, ArrayRef<const Value *> Operands,
- TTI::TargetCostKind CostKind);
+ InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind);
// Hexagon specific decision to generate a lookup table.
bool shouldBuildLookupTables() const;
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index a605fdf..f949a93 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -22,12 +22,14 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsHexagon.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
@@ -179,12 +181,13 @@
struct ByteSpan {
struct Segment {
+ // Segment of a Value: 'Len' bytes starting at byte 'Begin'.
Segment(Value *Val, int Begin, int Len)
: Val(Val), Start(Begin), Size(Len) {}
Segment(const Segment &Seg) = default;
- Value *Val;
- int Start;
- int Size;
+ Value *Val; // Value representable as a sequence of bytes.
+ int Start; // First byte of the value that belongs to the segment.
+ int Size; // Number of bytes in the segment.
};
struct Block {
@@ -192,13 +195,14 @@
Block(Value *Val, int Off, int Len, int Pos)
: Seg(Val, Off, Len), Pos(Pos) {}
Block(const Block &Blk) = default;
- Segment Seg;
- int Pos;
+ Segment Seg; // Value segment.
+ int Pos; // Position (offset) of the segment in the Block.
};
int extent() const;
ByteSpan section(int Start, int Length) const;
ByteSpan &shift(int Offset);
+ SmallVector<Value *, 8> values() const;
int size() const { return Blocks.size(); }
Block &operator[](int i) { return Blocks[i]; }
@@ -295,9 +299,10 @@
return getIfUnordered(dyn_cast<StoreInst>(In));
}
-#if !defined(_MSC_VER) || _MSC_VER >= 1924
-// VS2017 has trouble compiling this:
+#if !defined(_MSC_VER) || _MSC_VER >= 1926
+// VS2017 and some versions of VS2019 have trouble compiling this:
// error C2976: 'std::map': too few template arguments
+// VS 2019 16.x is known to work, except for 16.4/16.5 (MSC_VER 1924/1925)
template <typename Pred, typename... Ts>
void erase_if(std::map<Ts...> &map, Pred p)
#else
@@ -354,6 +359,13 @@
return *this;
}
+auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {
+ SmallVector<Value *, 8> Values(Blocks.size());
+ for (int i = 0, e = Blocks.size(); i != e; ++i)
+ Values[i] = Blocks[i].Seg.Val;
+ return Values;
+}
+
auto AlignVectors::getAlignFromValue(const Value *V) const -> Align {
const auto *C = dyn_cast<ConstantInt>(V);
assert(C && "Alignment must be a compile-time constant integer");
@@ -428,16 +440,21 @@
-> Value * {
// The adjustment is in bytes, but if it's a multiple of the type size,
// we don't need to do pointer casts.
- Type *ElemTy = cast<PointerType>(Ptr->getType())->getElementType();
- int ElemSize = HVC.getSizeOf(ElemTy);
- if (Adjust % ElemSize == 0) {
- Value *Tmp0 = Builder.CreateGEP(Ptr, HVC.getConstInt(Adjust / ElemSize));
- return Builder.CreatePointerCast(Tmp0, ValTy->getPointerTo());
+ auto *PtrTy = cast<PointerType>(Ptr->getType());
+ if (!PtrTy->isOpaque()) {
+ Type *ElemTy = PtrTy->getElementType();
+ int ElemSize = HVC.getSizeOf(ElemTy);
+ if (Adjust % ElemSize == 0) {
+ Value *Tmp0 =
+ Builder.CreateGEP(ElemTy, Ptr, HVC.getConstInt(Adjust / ElemSize));
+ return Builder.CreatePointerCast(Tmp0, ValTy->getPointerTo());
+ }
}
PointerType *CharPtrTy = Type::getInt8PtrTy(HVC.F.getContext());
Value *Tmp0 = Builder.CreatePointerCast(Ptr, CharPtrTy);
- Value *Tmp1 = Builder.CreateGEP(Tmp0, HVC.getConstInt(Adjust));
+ Value *Tmp1 = Builder.CreateGEP(Type::getInt8Ty(HVC.F.getContext()), Tmp0,
+ HVC.getConstInt(Adjust));
return Builder.CreatePointerCast(Tmp1, ValTy->getPointerTo());
}
@@ -458,7 +475,7 @@
return PassThru;
if (Mask == ConstantInt::getTrue(Mask->getType()))
return Builder.CreateAlignedLoad(ValTy, Ptr, Align(Alignment));
- return Builder.CreateMaskedLoad(Ptr, Align(Alignment), Mask, PassThru);
+ return Builder.CreateMaskedLoad(ValTy, Ptr, Align(Alignment), Mask, PassThru);
}
auto AlignVectors::createAlignedStore(IRBuilder<> &Builder, Value *Val,
@@ -521,11 +538,6 @@
return !llvm::any_of(
G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); });
});
- // Remove groups where everything is properly aligned.
- erase_if(AddrGroups, [&](auto &G) {
- return llvm::all_of(G.second,
- [&](auto &I) { return I.HaveAlign >= I.NeedAlign; });
- });
return !AddrGroups.empty();
}
@@ -768,28 +780,37 @@
Type *SecTy = HVC.getByteTy(ScLen);
int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
+ bool DoAlign = !HVC.isZero(AlignVal);
if (Move.IsLoad) {
ByteSpan ASpan;
auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));
auto *Undef = UndefValue::get(SecTy);
- for (int i = 0; i != NumSectors + 1; ++i) {
+ for (int i = 0; i != NumSectors + DoAlign; ++i) {
Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
// FIXME: generate a predicated load?
Value *Load = createAlignedLoad(Builder, SecTy, Ptr, ScLen, True, Undef);
+ // If vector shifting is potentially needed, accumulate metadata
+ // from source sections of twice the load width.
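+ // With DoAlign this covers bytes [(i-1)*ScLen, (i+1)*ScLen) of VSpan,
+ // otherwise just the current sector [i*ScLen, (i+1)*ScLen).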
+ int Start = (i - DoAlign) * ScLen;
+ int Width = (1 + DoAlign) * ScLen;
+ propagateMetadata(cast<Instruction>(Load),
+ VSpan.section(Start, Width).values());
ASpan.Blocks.emplace_back(Load, ScLen, i * ScLen);
}
- for (int j = 0; j != NumSectors; ++j) {
- ASpan[j].Seg.Val = HVC.vralignb(Builder, ASpan[j].Seg.Val,
- ASpan[j + 1].Seg.Val, AlignVal);
+ if (DoAlign) {
+ for (int j = 0; j != NumSectors; ++j) {
+ ASpan[j].Seg.Val = HVC.vralignb(Builder, ASpan[j].Seg.Val,
+ ASpan[j + 1].Seg.Val, AlignVal);
+ }
}
for (ByteSpan::Block &B : VSpan) {
- ByteSpan Section = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
+ ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
- for (ByteSpan::Block &S : Section) {
+ for (ByteSpan::Block &S : ASection) {
Value *Pay = HVC.vbytes(Builder, getPayload(S.Seg.Val));
Accum =
HVC.insertb(Builder, Accum, Pay, S.Seg.Start, S.Seg.Size, S.Pos);
@@ -822,13 +843,13 @@
// Create an extra "undef" sector at the beginning and at the end.
// They will be used as the left/right filler in the vlalign step.
- for (int i = -1; i != NumSectors + 1; ++i) {
+ for (int i = (DoAlign ? -1 : 0); i != NumSectors + DoAlign; ++i) {
// For stores, the size of each section is an aligned vector length.
// Adjust the store offsets relative to the section start offset.
- ByteSpan Section = VSpan.section(i * ScLen, ScLen).shift(-i * ScLen);
+ ByteSpan VSection = VSpan.section(i * ScLen, ScLen).shift(-i * ScLen);
Value *AccumV = UndefValue::get(SecTy);
Value *AccumM = HVC.getNullValue(SecTy);
- for (ByteSpan::Block &S : Section) {
+ for (ByteSpan::Block &S : VSection) {
Value *Pay = getPayload(S.Seg.Val);
Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
Pay->getType(), HVC.getByteTy());
@@ -842,19 +863,29 @@
}
// vlalign
- for (int j = 1; j != NumSectors + 2; ++j) {
- ASpanV[j - 1].Seg.Val = HVC.vlalignb(Builder, ASpanV[j - 1].Seg.Val,
- ASpanV[j].Seg.Val, AlignVal);
- ASpanM[j - 1].Seg.Val = HVC.vlalignb(Builder, ASpanM[j - 1].Seg.Val,
- ASpanM[j].Seg.Val, AlignVal);
+ if (DoAlign) {
+ for (int j = 1; j != NumSectors + 2; ++j) {
+ ASpanV[j - 1].Seg.Val = HVC.vlalignb(Builder, ASpanV[j - 1].Seg.Val,
+ ASpanV[j].Seg.Val, AlignVal);
+ ASpanM[j - 1].Seg.Val = HVC.vlalignb(Builder, ASpanM[j - 1].Seg.Val,
+ ASpanM[j].Seg.Val, AlignVal);
+ }
}
- for (int i = 0; i != NumSectors + 1; ++i) {
+ for (int i = 0; i != NumSectors + DoAlign; ++i) {
Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
Value *Val = ASpanV[i].Seg.Val;
Value *Mask = ASpanM[i].Seg.Val; // bytes
- if (!HVC.isUndef(Val) && !HVC.isZero(Mask))
- createAlignedStore(Builder, Val, Ptr, ScLen, HVC.vlsb(Builder, Mask));
+ if (!HVC.isUndef(Val) && !HVC.isZero(Mask)) {
+ Value *Store = createAlignedStore(Builder, Val, Ptr, ScLen,
+ HVC.vlsb(Builder, Mask));
+ // If vector shifting is potentially needed, accumulate metadata
+ // from source sections of twice the store width.
+ int Start = (i - DoAlign) * ScLen;
+ int Width = (1 + DoAlign) * ScLen;
+ propagateMetadata(cast<Instruction>(Store),
+ VSpan.section(Start, Width).values());
+ }
}
}
@@ -1295,8 +1326,7 @@
return None;
Builder B(Gep0->getParent());
- Value *BasePtr = Gep0->getPointerOperand();
- int Scale = DL.getTypeStoreSize(BasePtr->getType()->getPointerElementType());
+ int Scale = DL.getTypeStoreSize(Gep0->getSourceElementType());
// FIXME: for now only check GEPs with a single index.
if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
@@ -1382,6 +1412,11 @@
const Instruction &I = *It;
if (llvm::is_contained(Ignore, &I))
continue;
+ // assume intrinsic can be ignored
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ continue;
+ }
// Parts based on isSafeToMoveBefore from CodeMoverUtils.cpp.
if (I.mayThrow())
return false;
@@ -1457,6 +1492,8 @@
}
bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
AssumptionCache &AC =
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index 5154a0a..e1c95f1 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -637,9 +637,9 @@
return false;
}
- if (STI.getCPU().equals_lower("hexagonv5") ||
- STI.getCPU().equals_lower("hexagonv55") ||
- STI.getCPU().equals_lower("hexagonv60")) {
+ if (STI.getCPU().equals_insensitive("hexagonv5") ||
+ STI.getCPU().equals_insensitive("hexagonv55") ||
+ STI.getCPU().equals_insensitive("hexagonv60")) {
// If a store appears, it must be in slot 0 (MIa) 1st, and then slot 1 (MIb);
// therefore, not duplexable if slot 1 is a store, and slot 0 is not.
if ((MIbG == HexagonII::HSIG_S1) || (MIbG == HexagonII::HSIG_S2)) {
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index 53e76a8..0624214 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -10,7 +10,6 @@
// instructions on to the real streamer.
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hexagonmcelfstreamer"
#include "MCTargetDesc/HexagonMCELFStreamer.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
@@ -37,6 +36,8 @@
#include <cassert>
#include <cstdint>
+#define DEBUG_TYPE "hexagonmcelfstreamer"
+
using namespace llvm;
static cl::opt<unsigned> GPSize
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
index 7d45b4f..d38b77b 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
@@ -11,8 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hexagon-shuffle"
-
#include "MCTargetDesc/HexagonMCShuffler.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonShuffler.h"
@@ -24,6 +22,8 @@
#include "llvm/Support/raw_ostream.h"
#include <cassert>
+#define DEBUG_TYPE "hexagon-shuffle"
+
using namespace llvm;
static cl::opt<bool>
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 5e4138a..32b0c61 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -78,6 +78,8 @@
cl::init(false));
cl::opt<bool> MV67T("mv67t", cl::Hidden, cl::desc("Build for Hexagon V67T"),
cl::init(false));
+cl::opt<bool> MV68("mv68", cl::Hidden, cl::desc("Build for Hexagon V68"),
+ cl::init(false));
cl::opt<Hexagon::ArchEnum>
EnableHVX("mhvx",
@@ -88,6 +90,7 @@
clEnumValN(Hexagon::ArchEnum::V65, "v65", "Build for HVX v65"),
clEnumValN(Hexagon::ArchEnum::V66, "v66", "Build for HVX v66"),
clEnumValN(Hexagon::ArchEnum::V67, "v67", "Build for HVX v67"),
+ clEnumValN(Hexagon::ArchEnum::V68, "v68", "Build for HVX v68"),
// Sentinel for no value specified.
clEnumValN(Hexagon::ArchEnum::Generic, "", "")),
// Sentinel for flag not present.
@@ -118,6 +121,8 @@
return "hexagonv67";
if (MV67T)
return "hexagonv67t";
+ if (MV68)
+ return "hexagonv68";
return "";
}
@@ -363,6 +368,9 @@
case Hexagon::ArchEnum::V67:
Result.push_back("+hvxv67");
break;
+ case Hexagon::ArchEnum::V68:
+ Result.push_back("+hvxv68");
+ break;
case Hexagon::ArchEnum::Generic:{
Result.push_back(StringSwitch<StringRef>(CPU)
.Case("hexagonv60", "+hvxv60")
@@ -370,7 +378,8 @@
.Case("hexagonv65", "+hvxv65")
.Case("hexagonv66", "+hvxv66")
.Case("hexagonv67", "+hvxv67")
- .Case("hexagonv67t", "+hvxv67"));
+ .Case("hexagonv67t", "+hvxv67")
+ .Case("hexagonv68", "+hvxv68"));
break;
}
case Hexagon::ArchEnum::NoArch:
@@ -413,8 +422,8 @@
// turns on hvxvNN, corresponding to the existing ArchVNN.
FeatureBitset FB = S;
unsigned CpuArch = ArchV5;
- for (unsigned F : {ArchV67, ArchV66, ArchV65, ArchV62, ArchV60, ArchV55,
- ArchV5}) {
+ for (unsigned F : {ArchV68, ArchV67, ArchV66, ArchV65, ArchV62, ArchV60,
+ ArchV55, ArchV5}) {
if (!FB.test(F))
continue;
CpuArch = F;
@@ -429,7 +438,7 @@
}
bool HasHvxVer = false;
for (unsigned F : {ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65,
- ExtensionHVXV66, ExtensionHVXV67}) {
+ ExtensionHVXV66, ExtensionHVXV67, ExtensionHVXV68}) {
if (!FB.test(F))
continue;
HasHvxVer = true;
@@ -442,6 +451,9 @@
// HasHvxVer is false, and UseHvx is true.
switch (CpuArch) {
+ case ArchV68:
+ FB.set(ExtensionHVXV68);
+ LLVM_FALLTHROUGH;
case ArchV67:
FB.set(ExtensionHVXV67);
LLVM_FALLTHROUGH;
@@ -525,6 +537,7 @@
{"hexagonv66", ELF::EF_HEXAGON_MACH_V66},
{"hexagonv67", ELF::EF_HEXAGON_MACH_V67},
{"hexagonv67t", ELF::EF_HEXAGON_MACH_V67T},
+ {"hexagonv68", ELF::EF_HEXAGON_MACH_V68},
};
auto F = ElfFlags.find(STI.getCPU());
diff --git a/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
index 8a44ba3..1fce90b 100644
--- a/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
+++ b/src/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -11,8 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hexagon-shuffle"
-
#include "MCTargetDesc/HexagonShuffler.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
@@ -33,6 +31,8 @@
#include <utility>
#include <vector>
+#define DEBUG_TYPE "hexagon-shuffle"
+
using namespace llvm;
namespace {
diff --git a/src/llvm-project/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/src/llvm-project/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index 639ab24..e2642dd 100644
--- a/src/llvm-project/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/src/llvm-project/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -754,9 +754,9 @@
return nullptr;
// Check if identifier has a modifier
- if (Identifier.equals_lower("hi"))
+ if (Identifier.equals_insensitive("hi"))
Kind = LanaiMCExpr::VK_Lanai_ABS_HI;
- else if (Identifier.equals_lower("lo"))
+ else if (Identifier.equals_insensitive("lo"))
Kind = LanaiMCExpr::VK_Lanai_ABS_LO;
// If the identifier corresponds to a variant then extract the real
diff --git a/src/llvm-project/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h b/src/llvm-project/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h
index 6ca9b1d..8aa2841 100644
--- a/src/llvm-project/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h
+++ b/src/llvm-project/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h
@@ -13,10 +13,10 @@
#ifndef LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H
#define LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H
-#define DEBUG_TYPE "lanai-disassembler"
-
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#define DEBUG_TYPE "lanai-disassembler"
+
namespace llvm {
class LanaiDisassembler : public MCDisassembler {
diff --git a/src/llvm-project/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
index 3c84ed0..3a2d503 100644
--- a/src/llvm-project/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
@@ -33,7 +33,7 @@
// Get the alignment.
Align StackAlign =
- LRI->needsStackRealignment(MF) ? MFI.getMaxAlign() : getStackAlign();
+ LRI->hasStackRealignment(MF) ? MFI.getMaxAlign() : getStackAlign();
// Get the maximum call frame size of all the calls.
unsigned MaxCallFrameSize = MFI.getMaxCallFrameSize();
diff --git a/src/llvm-project/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
index 64f87ae..abe20c8 100644
--- a/src/llvm-project/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
@@ -147,14 +147,14 @@
// Addressable stack objects are addressed using neg. offsets from fp
// or pos. offsets from sp/basepointer
- if (!HasFP || (needsStackRealignment(MF) && FrameIndex >= 0))
+ if (!HasFP || (hasStackRealignment(MF) && FrameIndex >= 0))
Offset += MF.getFrameInfo().getStackSize();
Register FrameReg = getFrameRegister(MF);
if (FrameIndex >= 0) {
if (hasBasePointer(MF))
FrameReg = getBaseRegister();
- else if (needsStackRealignment(MF))
+ else if (hasStackRealignment(MF))
FrameReg = Lanai::SP;
}
@@ -245,7 +245,7 @@
const MachineFrameInfo &MFI = MF.getFrameInfo();
// When we need stack realignment and there are dynamic allocas, we can't
// reference off of the stack pointer, so we reserve a base pointer.
- if (needsStackRealignment(MF) && MFI.hasVarSizedObjects())
+ if (hasStackRealignment(MF) && MFI.hasVarSizedObjects())
return true;
return false;
diff --git a/src/llvm-project/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
index 263f838..f1fcbe4 100644
--- a/src/llvm-project/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -49,7 +49,8 @@
return TTI::PSK_Software;
}
- int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) {
+ InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
if (Imm == 0)
return TTI::TCC_Free;
@@ -66,18 +67,20 @@
return 4 * TTI::TCC_Basic;
}
- int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind,
- Instruction *Inst = nullptr) {
+ InstructionCost getIntImmCostInst(unsigned Opc, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr) {
return getIntImmCost(Imm, Ty, CostKind);
}
- int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind) {
+ InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
return getIntImmCost(Imm, Ty, CostKind);
}
- unsigned getArithmeticInstrCost(
+ InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
diff --git a/src/llvm-project/llvm/lib/Target/M68k/AsmParser/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/M68k/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..f4bbd39
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/AsmParser/CMakeLists.txt
@@ -0,0 +1,13 @@
+add_llvm_component_library(LLVMM68kAsmParser
+ M68kAsmParser.cpp
+
+ LINK_COMPONENTS
+ MC
+ MCParser
+ Support
+ M68kCodeGen
+ M68kInfo
+
+ ADD_TO_COMPONENT
+ M68k
+)
diff --git a/src/llvm-project/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp b/src/llvm-project/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
new file mode 100644
index 0000000..94126e1
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
@@ -0,0 +1,861 @@
+//===---- M68kAsmParser.cpp - Parse M68k assembly to MCInst instructions --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "M68kInstrInfo.h"
+#include "M68kRegisterInfo.h"
+#include "TargetInfo/M68kTargetInfo.h"
+
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include <sstream>
+
+#define DEBUG_TYPE "m68k-asm-parser"
+
+using namespace llvm;
+
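+// When this option is set, register operands may be written without the '%'
+// prefix, e.g. "d0" as well as "%d0" (illustrative; see parseRegister()
+// below for the actual handling).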
+static cl::opt<bool> RegisterPrefixOptional(
+ "m68k-register-prefix-optional", cl::Hidden,
+ cl::desc("Enable specifying registers without the % prefix"),
+ cl::init(false));
+
+namespace {
+/// Parses M68k assembly from a stream.
+class M68kAsmParser : public MCTargetAsmParser {
+ const MCSubtargetInfo &STI;
+ MCAsmParser &Parser;
+ const MCRegisterInfo *MRI;
+
+#define GET_ASSEMBLER_HEADER
+#include "M68kGenAsmMatcher.inc"
+
+ // Helpers for Match&Emit.
+ bool invalidOperand(const SMLoc &Loc, const OperandVector &Operands,
+ const uint64_t &ErrorInfo);
+ bool missingFeature(const SMLoc &Loc, const uint64_t &ErrorInfo);
+ bool emit(MCInst &Inst, SMLoc const &Loc, MCStreamer &Out) const;
+ bool parseRegisterName(unsigned int &RegNo, SMLoc Loc,
+ StringRef RegisterName);
+ OperandMatchResultTy parseRegister(unsigned int &RegNo);
+
+ // Parser functions.
+ void eatComma();
+
+ bool isExpr();
+ OperandMatchResultTy parseImm(OperandVector &Operands);
+ OperandMatchResultTy parseMemOp(OperandVector &Operands);
+
+public:
+ M68kAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, STI, MII), STI(STI), Parser(Parser) {
+ MCAsmParserExtension::Initialize(Parser);
+ MRI = getContext().getRegisterInfo();
+
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ }
+
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) override;
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+ bool ParseDirective(AsmToken DirectiveID) override;
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+};
+
+struct M68kMemOp {
+ enum class Kind {
+ Addr,
+ Reg,
+ RegIndirect,
+ RegPostIncrement,
+ RegPreDecrement,
+ RegIndirectDisplacement,
+ RegIndirectDisplacementIndex,
+ };
+
+ // These variables are used for the following forms:
+ // Addr: (OuterDisp)
+ // Reg: %OuterReg
+ // RegIndirect: (%OuterReg)
+ // RegPostIncrement: (%OuterReg)+
+ // RegPreDecrement: -(%OuterReg)
+ // RegIndirectDisplacement: OuterDisp(%OuterReg)
+ // RegIndirectDisplacementIndex:
+ // OuterDisp(%OuterReg, %InnerReg.Size * Scale, InnerDisp)
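+  //
+  // Illustrative examples of the corresponding syntax (concrete registers
+  // chosen arbitrarily):
+  //   (%a0)        RegIndirect
+  //   (%a0)+       RegPostIncrement
+  //   -(%a0)       RegPreDecrement
+  //   8(%a0)       RegIndirectDisplacement
+  //   8(%a0, %d1)  RegIndirectDisplacementIndex (size/scale/inner disp omitted)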
+
+ Kind Op;
+ unsigned OuterReg;
+ unsigned InnerReg;
+ const MCExpr *OuterDisp;
+ const MCExpr *InnerDisp;
+ uint8_t Size : 4;
+ uint8_t Scale : 4;
+ const MCExpr *Expr;
+
+ M68kMemOp() {}
+ M68kMemOp(Kind Op) : Op(Op) {}
+
+ void print(raw_ostream &OS) const;
+};
+
+/// A parsed M68k assembly operand.
+class M68kOperand : public MCParsedAsmOperand {
+ typedef MCParsedAsmOperand Base;
+
+ enum class KindTy {
+ Invalid,
+ Token,
+ Imm,
+ MemOp,
+ };
+
+ KindTy Kind;
+ SMLoc Start, End;
+ union {
+ StringRef Token;
+ int64_t Imm;
+ const MCExpr *Expr;
+ M68kMemOp MemOp;
+ };
+
+public:
+ M68kOperand(KindTy Kind, SMLoc Start, SMLoc End)
+ : Base(), Kind(Kind), Start(Start), End(End) {}
+
+ SMLoc getStartLoc() const override { return Start; }
+ SMLoc getEndLoc() const override { return End; }
+
+ void print(raw_ostream &OS) const override;
+
+ bool isMem() const override { return false; }
+ bool isMemOp() const { return Kind == KindTy::MemOp; }
+
+ static void addExpr(MCInst &Inst, const MCExpr *Expr);
+
+ // Reg
+ bool isReg() const override;
+ unsigned getReg() const override;
+ void addRegOperands(MCInst &Inst, unsigned N) const;
+
+ static std::unique_ptr<M68kOperand> createMemOp(M68kMemOp MemOp, SMLoc Start,
+ SMLoc End);
+
+ // Token
+ bool isToken() const override;
+ StringRef getToken() const;
+ static std::unique_ptr<M68kOperand> createToken(StringRef Token, SMLoc Start,
+ SMLoc End);
+
+ // Imm
+ bool isImm() const override;
+ void addImmOperands(MCInst &Inst, unsigned N) const;
+
+ static std::unique_ptr<M68kOperand> createImm(const MCExpr *Expr, SMLoc Start,
+ SMLoc End);
+
+ // Addr
+ bool isAddr() const;
+ void addAddrOperands(MCInst &Inst, unsigned N) const;
+
+ // ARI
+ bool isARI() const;
+ void addARIOperands(MCInst &Inst, unsigned N) const;
+
+ // ARID
+ bool isARID() const;
+ void addARIDOperands(MCInst &Inst, unsigned N) const;
+
+ // ARII
+ bool isARII() const;
+ void addARIIOperands(MCInst &Inst, unsigned N) const;
+
+ // ARIPD
+ bool isARIPD() const;
+ void addARIPDOperands(MCInst &Inst, unsigned N) const;
+
+ // ARIPI
+ bool isARIPI() const;
+ void addARIPIOperands(MCInst &Inst, unsigned N) const;
+
+ // PCD
+ bool isPCD() const;
+ void addPCDOperands(MCInst &Inst, unsigned N) const;
+
+ // PCI
+ bool isPCI() const;
+ void addPCIOperands(MCInst &Inst, unsigned N) const;
+};
+
+} // end anonymous namespace.
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kAsmParser() {
+ RegisterMCAsmParser<M68kAsmParser> X(getTheM68kTarget());
+}
+
+#define GET_MATCHER_IMPLEMENTATION
+#include "M68kGenAsmMatcher.inc"
+
+void M68kMemOp::print(raw_ostream &OS) const {
+ switch (Op) {
+ case Kind::Addr:
+ OS << OuterDisp;
+ break;
+ case Kind::Reg:
+ OS << '%' << OuterReg;
+ break;
+ case Kind::RegIndirect:
+ OS << "(%" << OuterReg << ')';
+ break;
+ case Kind::RegPostIncrement:
+ OS << "(%" << OuterReg << ")+";
+ break;
+ case Kind::RegPreDecrement:
+ OS << "-(%" << OuterReg << ")";
+ break;
+ case Kind::RegIndirectDisplacement:
+ OS << OuterDisp << "(%" << OuterReg << ")";
+ break;
+ case Kind::RegIndirectDisplacementIndex:
+ OS << OuterDisp << "(%" << OuterReg << ", " << InnerReg << "." << Size
+ << ", " << InnerDisp << ")";
+ break;
+ }
+}
+
+void M68kOperand::addExpr(MCInst &Inst, const MCExpr *Expr) {
+ if (auto Const = dyn_cast<MCConstantExpr>(Expr)) {
+ Inst.addOperand(MCOperand::createImm(Const->getValue()));
+ return;
+ }
+
+ Inst.addOperand(MCOperand::createExpr(Expr));
+}
+
+// Reg
+bool M68kOperand::isReg() const {
+ return Kind == KindTy::MemOp && MemOp.Op == M68kMemOp::Kind::Reg;
+}
+
+unsigned M68kOperand::getReg() const {
+ assert(isReg());
+ return MemOp.OuterReg;
+}
+
+void M68kOperand::addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(isReg() && "wrong operand kind");
+ assert((N == 1) && "can only handle one register operand");
+
+ Inst.addOperand(MCOperand::createReg(getReg()));
+}
+
+std::unique_ptr<M68kOperand> M68kOperand::createMemOp(M68kMemOp MemOp,
+ SMLoc Start, SMLoc End) {
+ auto Op = std::make_unique<M68kOperand>(KindTy::MemOp, Start, End);
+ Op->MemOp = MemOp;
+ return Op;
+}
+
+// Token
+bool M68kOperand::isToken() const { return Kind == KindTy::Token; }
+StringRef M68kOperand::getToken() const {
+ assert(isToken());
+ return Token;
+}
+
+std::unique_ptr<M68kOperand> M68kOperand::createToken(StringRef Token,
+ SMLoc Start, SMLoc End) {
+ auto Op = std::make_unique<M68kOperand>(KindTy::Token, Start, End);
+ Op->Token = Token;
+ return Op;
+}
+
+// Imm
+bool M68kOperand::isImm() const { return Kind == KindTy::Imm; }
+void M68kOperand::addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(isImm() && "wrong oeprand kind");
+ assert((N == 1) && "can only handle one register operand");
+
+ M68kOperand::addExpr(Inst, Expr);
+}
+
+std::unique_ptr<M68kOperand> M68kOperand::createImm(const MCExpr *Expr,
+ SMLoc Start, SMLoc End) {
+ auto Op = std::make_unique<M68kOperand>(KindTy::Imm, Start, End);
+ Op->Expr = Expr;
+ return Op;
+}
+
+// Addr
+bool M68kOperand::isAddr() const {
+ return isMemOp() && MemOp.Op == M68kMemOp::Kind::Addr;
+}
+void M68kOperand::addAddrOperands(MCInst &Inst, unsigned N) const {
+ M68kOperand::addExpr(Inst, MemOp.OuterDisp);
+}
+
+// ARI
+bool M68kOperand::isARI() const {
+ return isMemOp() && MemOp.Op == M68kMemOp::Kind::RegIndirect &&
+ M68k::AR32RegClass.contains(MemOp.OuterReg);
+}
+void M68kOperand::addARIOperands(MCInst &Inst, unsigned N) const {
+ Inst.addOperand(MCOperand::createReg(MemOp.OuterReg));
+}
+
+// ARID
+bool M68kOperand::isARID() const {
+ return isMemOp() && MemOp.Op == M68kMemOp::Kind::RegIndirectDisplacement &&
+ M68k::AR32RegClass.contains(MemOp.OuterReg);
+}
+void M68kOperand::addARIDOperands(MCInst &Inst, unsigned N) const {
+ M68kOperand::addExpr(Inst, MemOp.OuterDisp);
+ Inst.addOperand(MCOperand::createReg(MemOp.OuterReg));
+}
+
+// ARII
+bool M68kOperand::isARII() const {
+ return isMemOp() &&
+ MemOp.Op == M68kMemOp::Kind::RegIndirectDisplacementIndex &&
+ M68k::AR32RegClass.contains(MemOp.OuterReg);
+}
+void M68kOperand::addARIIOperands(MCInst &Inst, unsigned N) const {
+ M68kOperand::addExpr(Inst, MemOp.OuterDisp);
+ Inst.addOperand(MCOperand::createReg(MemOp.OuterReg));
+ Inst.addOperand(MCOperand::createReg(MemOp.InnerReg));
+}
+
+// ARIPD
+bool M68kOperand::isARIPD() const {
+ return isMemOp() && MemOp.Op == M68kMemOp::Kind::RegPreDecrement &&
+ M68k::AR32RegClass.contains(MemOp.OuterReg);
+}
+void M68kOperand::addARIPDOperands(MCInst &Inst, unsigned N) const {
+ Inst.addOperand(MCOperand::createReg(MemOp.OuterReg));
+}
+
+// ARIPI
+bool M68kOperand::isARIPI() const {
+ return isMemOp() && MemOp.Op == M68kMemOp::Kind::RegPostIncrement &&
+ M68k::AR32RegClass.contains(MemOp.OuterReg);
+}
+void M68kOperand::addARIPIOperands(MCInst &Inst, unsigned N) const {
+ Inst.addOperand(MCOperand::createReg(MemOp.OuterReg));
+}
+
+// PCD
+bool M68kOperand::isPCD() const {
+ return isMemOp() && MemOp.Op == M68kMemOp::Kind::RegIndirectDisplacement &&
+ MemOp.OuterReg == M68k::PC;
+}
+void M68kOperand::addPCDOperands(MCInst &Inst, unsigned N) const {
+ M68kOperand::addExpr(Inst, MemOp.OuterDisp);
+}
+
+// PCI
+bool M68kOperand::isPCI() const {
+ return isMemOp() &&
+ MemOp.Op == M68kMemOp::Kind::RegIndirectDisplacementIndex &&
+ MemOp.OuterReg == M68k::PC;
+}
+void M68kOperand::addPCIOperands(MCInst &Inst, unsigned N) const {
+ M68kOperand::addExpr(Inst, MemOp.OuterDisp);
+ Inst.addOperand(MCOperand::createReg(MemOp.InnerReg));
+}
+
+static inline bool checkRegisterClass(unsigned RegNo, bool Data, bool Address,
+ bool SP) {
+ switch (RegNo) {
+ case M68k::A0:
+ case M68k::A1:
+ case M68k::A2:
+ case M68k::A3:
+ case M68k::A4:
+ case M68k::A5:
+ case M68k::A6:
+ return Address;
+
+ case M68k::SP:
+ return SP;
+
+ case M68k::D0:
+ case M68k::D1:
+ case M68k::D2:
+ case M68k::D3:
+ case M68k::D4:
+ case M68k::D5:
+ case M68k::D6:
+ case M68k::D7:
+ return Data;
+
+ case M68k::SR:
+ case M68k::CCR:
+ return false;
+
+ default:
+ llvm_unreachable("unexpected register type");
+ return false;
+ }
+}
+
+unsigned M68kAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) {
+ M68kOperand &Operand = (M68kOperand &)Op;
+
+ switch (Kind) {
+ case MCK_XR16:
+ case MCK_SPILL:
+ if (Operand.isReg() &&
+ checkRegisterClass(Operand.getReg(), true, true, true)) {
+ return Match_Success;
+ }
+ break;
+
+ case MCK_AR16:
+ case MCK_AR32:
+ if (Operand.isReg() &&
+ checkRegisterClass(Operand.getReg(), false, true, true)) {
+ return Match_Success;
+ }
+ break;
+
+ case MCK_AR32_NOSP:
+ if (Operand.isReg() &&
+ checkRegisterClass(Operand.getReg(), false, true, false)) {
+ return Match_Success;
+ }
+ break;
+
+ case MCK_DR8:
+ case MCK_DR16:
+ case MCK_DR32:
+ if (Operand.isReg() &&
+ checkRegisterClass(Operand.getReg(), true, false, false)) {
+ return Match_Success;
+ }
+ break;
+
+ case MCK_AR16_TC:
+ if (Operand.isReg() &&
+ ((Operand.getReg() == M68k::A0) || (Operand.getReg() == M68k::A1))) {
+ return Match_Success;
+ }
+ break;
+
+ case MCK_DR16_TC:
+ if (Operand.isReg() &&
+ ((Operand.getReg() == M68k::D0) || (Operand.getReg() == M68k::D1))) {
+ return Match_Success;
+ }
+ break;
+
+ case MCK_XR16_TC:
+ if (Operand.isReg() &&
+ ((Operand.getReg() == M68k::D0) || (Operand.getReg() == M68k::D1) ||
+ (Operand.getReg() == M68k::A0) || (Operand.getReg() == M68k::A1))) {
+ return Match_Success;
+ }
+ break;
+ }
+
+ return Match_InvalidOperand;
+}
+
+bool M68kAsmParser::parseRegisterName(unsigned &RegNo, SMLoc Loc,
+ StringRef RegisterName) {
+ auto RegisterNameLower = RegisterName.lower();
+
+ // CCR register
+ if (RegisterNameLower == "ccr") {
+ RegNo = M68k::CCR;
+ return true;
+ }
+
+ // Parse simple general-purpose registers.
+ if (RegisterNameLower.size() == 2) {
+ static unsigned RegistersByIndex[] = {
+ M68k::D0, M68k::D1, M68k::D2, M68k::D3, M68k::D4, M68k::D5,
+ M68k::D6, M68k::D7, M68k::A0, M68k::A1, M68k::A2, M68k::A3,
+ M68k::A4, M68k::A5, M68k::A6, M68k::SP,
+ };
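+    // Index 0-7 selects %d0-%d7 and 8-15 selects %a0-%a6/%sp; IndexOffset
+    // below picks the right half of the table.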
+
+ switch (RegisterNameLower[0]) {
+ case 'd':
+ case 'a': {
+ if (isdigit(RegisterNameLower[1])) {
+ unsigned IndexOffset = (RegisterNameLower[0] == 'a') ? 8 : 0;
+ unsigned RegIndex = (unsigned)(RegisterNameLower[1] - '0');
+ if (RegIndex < 8) {
+ RegNo = RegistersByIndex[IndexOffset + RegIndex];
+ return true;
+ }
+ }
+ break;
+ }
+
+ case 's':
+ if (RegisterNameLower[1] == 'p') {
+ RegNo = M68k::SP;
+ return true;
+ } else if (RegisterNameLower[1] == 'r') {
+ RegNo = M68k::SR;
+ return true;
+ }
+ break;
+
+ case 'p':
+ if (RegisterNameLower[1] == 'c') {
+ RegNo = M68k::PC;
+ return true;
+ }
+ break;
+ }
+ }
+
+ return false;
+}
+
+OperandMatchResultTy M68kAsmParser::parseRegister(unsigned &RegNo) {
+ bool HasPercent = false;
+ AsmToken PercentToken;
+
+ LLVM_DEBUG(dbgs() << "parseRegister "; getTok().dump(dbgs()); dbgs() << "\n");
+
+ if (getTok().is(AsmToken::Percent)) {
+ HasPercent = true;
+ PercentToken = Lex();
+ } else if (!RegisterPrefixOptional.getValue()) {
+ return MatchOperand_NoMatch;
+ }
+
+ if (!Parser.getTok().is(AsmToken::Identifier)) {
+ if (HasPercent) {
+ getLexer().UnLex(PercentToken);
+ }
+ return MatchOperand_NoMatch;
+ }
+
+ auto RegisterName = Parser.getTok().getString();
+ if (!parseRegisterName(RegNo, Parser.getLexer().getLoc(), RegisterName)) {
+ if (HasPercent) {
+ getLexer().UnLex(PercentToken);
+ }
+ return MatchOperand_NoMatch;
+ }
+
+ Parser.Lex();
+ return MatchOperand_Success;
+}
+
+bool M68kAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ auto Result = tryParseRegister(RegNo, StartLoc, EndLoc);
+ if (Result != MatchOperand_Success) {
+ return Error(StartLoc, "expected register");
+ }
+
+ return false;
+}
+
+OperandMatchResultTy M68kAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ StartLoc = getLexer().getLoc();
+ auto Result = parseRegister(RegNo);
+ EndLoc = getLexer().getLoc();
+ return Result;
+}
+
+bool M68kAsmParser::isExpr() {
+ switch (Parser.getTok().getKind()) {
+ case AsmToken::Identifier:
+ case AsmToken::Integer:
+ return true;
+ case AsmToken::Minus:
+ return getLexer().peekTok().getKind() == AsmToken::Integer;
+
+ default:
+ return false;
+ }
+}
+
+OperandMatchResultTy M68kAsmParser::parseImm(OperandVector &Operands) {
+ if (getLexer().isNot(AsmToken::Hash)) {
+ return MatchOperand_NoMatch;
+ }
+ SMLoc Start = getLexer().getLoc();
+ Parser.Lex();
+
+ SMLoc End;
+ const MCExpr *Expr;
+
+ if (getParser().parseExpression(Expr, End)) {
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(M68kOperand::createImm(Expr, Start, End));
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy M68kAsmParser::parseMemOp(OperandVector &Operands) {
+ SMLoc Start = getLexer().getLoc();
+ bool IsPD = false;
+ M68kMemOp MemOp;
+
+ // Check for a plain register.
+ auto Result = parseRegister(MemOp.OuterReg);
+ if (Result == MatchOperand_Success) {
+ MemOp.Op = M68kMemOp::Kind::Reg;
+ Operands.push_back(
+ M68kOperand::createMemOp(MemOp, Start, getLexer().getLoc()));
+ return MatchOperand_Success;
+ }
+
+ if (Result == MatchOperand_ParseFail) {
+ return Result;
+ }
+
+ // Check for pre-decrement & outer displacement.
+ bool HasDisplacement = false;
+ if (getLexer().is(AsmToken::Minus)) {
+ IsPD = true;
+ Parser.Lex();
+ } else if (isExpr()) {
+ if (Parser.parseExpression(MemOp.OuterDisp)) {
+ return MatchOperand_ParseFail;
+ }
+ HasDisplacement = true;
+ }
+
+ if (getLexer().isNot(AsmToken::LParen)) {
+ if (HasDisplacement) {
+ MemOp.Op = M68kMemOp::Kind::Addr;
+ Operands.push_back(
+ M68kOperand::createMemOp(MemOp, Start, getLexer().getLoc()));
+ return MatchOperand_Success;
+ } else if (IsPD) {
+ Error(getLexer().getLoc(), "expected (");
+ return MatchOperand_ParseFail;
+ }
+
+ return MatchOperand_NoMatch;
+ }
+ Parser.Lex();
+
+ // Check for constant dereference & MIT-style displacement
+ if (!HasDisplacement && isExpr()) {
+ if (Parser.parseExpression(MemOp.OuterDisp)) {
+ return MatchOperand_ParseFail;
+ }
+ HasDisplacement = true;
+
+ // If we're not followed by a comma, we're a constant dereference.
+ if (getLexer().isNot(AsmToken::Comma)) {
+ MemOp.Op = M68kMemOp::Kind::Addr;
+ Operands.push_back(
+ M68kOperand::createMemOp(MemOp, Start, getLexer().getLoc()));
+ return MatchOperand_Success;
+ }
+
+ Parser.Lex();
+ }
+
+ Result = parseRegister(MemOp.OuterReg);
+ if (Result == MatchOperand_ParseFail) {
+ return MatchOperand_ParseFail;
+ }
+
+ if (Result != MatchOperand_Success) {
+ Error(getLexer().getLoc(), "expected register");
+ return MatchOperand_ParseFail;
+ }
+
+ // Check for Index.
+ bool HasIndex = false;
+ if (Parser.getTok().is(AsmToken::Comma)) {
+ Parser.Lex();
+
+ Result = parseRegister(MemOp.InnerReg);
+ if (Result == MatchOperand_ParseFail) {
+ return Result;
+ }
+
+ if (Result == MatchOperand_NoMatch) {
+ Error(getLexer().getLoc(), "expected register");
+ return MatchOperand_ParseFail;
+ }
+
+ // TODO: parse size, scale and inner displacement.
+ MemOp.Size = 4;
+ MemOp.Scale = 1;
+ MemOp.InnerDisp = MCConstantExpr::create(0, Parser.getContext(), true, 4);
+ HasIndex = true;
+ }
+
+ if (Parser.getTok().isNot(AsmToken::RParen)) {
+ Error(getLexer().getLoc(), "expected )");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+
+ bool IsPI = false;
+ if (!IsPD && Parser.getTok().is(AsmToken::Plus)) {
+ Parser.Lex();
+ IsPI = true;
+ }
+
+ SMLoc End = getLexer().getLoc();
+
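+  // Only one of these addressing features may accompany the base register;
+  // for instance "8(%a0)+" (a displacement combined with post-increment) is
+  // rejected here.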
+ unsigned OpCount = IsPD + IsPI + (HasIndex || HasDisplacement);
+ if (OpCount > 1) {
+ Error(Start, "only one of post-increment, pre-decrement or displacement "
+ "can be used");
+ return MatchOperand_ParseFail;
+ }
+
+ if (IsPD) {
+ MemOp.Op = M68kMemOp::Kind::RegPreDecrement;
+ } else if (IsPI) {
+ MemOp.Op = M68kMemOp::Kind::RegPostIncrement;
+ } else if (HasIndex) {
+ MemOp.Op = M68kMemOp::Kind::RegIndirectDisplacementIndex;
+ } else if (HasDisplacement) {
+ MemOp.Op = M68kMemOp::Kind::RegIndirectDisplacement;
+ } else {
+ MemOp.Op = M68kMemOp::Kind::RegIndirect;
+ }
+
+ Operands.push_back(M68kOperand::createMemOp(MemOp, Start, End));
+ return MatchOperand_Success;
+}
+
+void M68kAsmParser::eatComma() {
+ if (Parser.getTok().is(AsmToken::Comma)) {
+ Parser.Lex();
+ }
+}
+
+bool M68kAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
+ SMLoc Start = getLexer().getLoc();
+ Operands.push_back(M68kOperand::createToken(Name, Start, Start));
+
+ bool First = true;
+ while (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+ if (!First) {
+ eatComma();
+ } else {
+ First = false;
+ }
+
+ auto MatchResult = MatchOperandParserImpl(Operands, Name);
+ if (MatchResult == MatchOperand_Success) {
+ continue;
+ }
+
+ // Add custom operand formats here...
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "unexpected token parsing operands");
+ }
+
+ // Eat EndOfStatement.
+ Parser.Lex();
+ return false;
+}
+
+bool M68kAsmParser::ParseDirective(AsmToken DirectiveID) { return true; }
+
+bool M68kAsmParser::invalidOperand(SMLoc const &Loc,
+ OperandVector const &Operands,
+ uint64_t const &ErrorInfo) {
+ SMLoc ErrorLoc = Loc;
+  const char *Diag = nullptr;
+
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= Operands.size()) {
+ Diag = "too few operands for instruction.";
+ } else {
+ auto const &Op = (M68kOperand const &)*Operands[ErrorInfo];
+ if (Op.getStartLoc() != SMLoc()) {
+ ErrorLoc = Op.getStartLoc();
+ }
+ }
+ }
+
+ if (!Diag) {
+ Diag = "invalid operand for instruction";
+ }
+
+ return Error(ErrorLoc, Diag);
+}
+
+bool M68kAsmParser::missingFeature(llvm::SMLoc const &Loc,
+ uint64_t const &ErrorInfo) {
+ return Error(Loc, "instruction requires a CPU feature not currently enabled");
+}
+
+bool M68kAsmParser::emit(MCInst &Inst, SMLoc const &Loc,
+ MCStreamer &Out) const {
+ Inst.setLoc(Loc);
+ Out.emitInstruction(Inst, STI);
+
+ return false;
+}
+
+bool M68kAsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ MCInst Inst;
+ unsigned MatchResult =
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+
+ switch (MatchResult) {
+ case Match_Success:
+ return emit(Inst, Loc, Out);
+ case Match_MissingFeature:
+ return missingFeature(Loc, ErrorInfo);
+ case Match_InvalidOperand:
+ return invalidOperand(Loc, Operands, ErrorInfo);
+ case Match_MnemonicFail:
+ return Error(Loc, "invalid instruction");
+ default:
+ return true;
+ }
+}
+
+void M68kOperand::print(raw_ostream &OS) const {
+ switch (Kind) {
+ case KindTy::Invalid:
+ OS << "invalid";
+ break;
+
+ case KindTy::Token:
+ OS << "token '" << Token << "'";
+ break;
+
+ case KindTy::Imm:
+ OS << "immediate " << Imm;
+ break;
+
+ case KindTy::MemOp:
+ MemOp.print(OS);
+ break;
+ }
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/M68k/CMakeLists.txt
new file mode 100644
index 0000000..0e7bcff
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/CMakeLists.txt
@@ -0,0 +1,58 @@
+add_llvm_component_group(M68k)
+
+set(LLVM_TARGET_DEFINITIONS M68k.td)
+
+tablegen(LLVM M68kGenGlobalISel.inc -gen-global-isel)
+tablegen(LLVM M68kGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM M68kGenRegisterBank.inc -gen-register-bank)
+tablegen(LLVM M68kGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM M68kGenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM M68kGenMCCodeBeads.inc -gen-code-beads)
+tablegen(LLVM M68kGenMCPseudoLowering.inc -gen-pseudo-lowering)
+tablegen(LLVM M68kGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM M68kGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM M68kGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM M68kGenAsmMatcher.inc -gen-asm-matcher)
+
+add_public_tablegen_target(M68kCommonTableGen)
+
+add_llvm_target(M68kCodeGen
+ GlSel/M68kCallLowering.cpp
+ GlSel/M68kInstructionSelector.cpp
+ GlSel/M68kLegalizerInfo.cpp
+ GlSel/M68kRegisterBankInfo.cpp
+ M68kAsmPrinter.cpp
+ M68kCollapseMOVEMPass.cpp
+ M68kExpandPseudo.cpp
+ M68kFrameLowering.cpp
+ M68kInstrInfo.cpp
+ M68kISelLowering.cpp
+ M68kISelDAGToDAG.cpp
+ M68kMachineFunction.cpp
+ M68kMCInstLower.cpp
+ M68kRegisterInfo.cpp
+ M68kSubtarget.cpp
+ M68kTargetMachine.cpp
+ M68kTargetObjectFile.cpp
+
+ LINK_COMPONENTS
+ Analysis
+ AsmPrinter
+ CodeGen
+ Core
+ GlobalISel
+ MC
+ SelectionDAG
+ Support
+ Target
+ M68kDesc
+ M68kInfo
+
+ ADD_TO_COMPONENT
+ M68k
+)
+
+add_subdirectory(TargetInfo)
+add_subdirectory(MCTargetDesc)
+add_subdirectory(AsmParser)
+add_subdirectory(Disassembler)
diff --git a/src/llvm-project/llvm/lib/Target/M68k/Disassembler/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/M68k/Disassembler/CMakeLists.txt
new file mode 100644
index 0000000..2b8e676
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/Disassembler/CMakeLists.txt
@@ -0,0 +1,13 @@
+add_llvm_component_library(LLVMM68kDisassembler
+ M68kDisassembler.cpp
+
+ LINK_COMPONENTS
+ M68kDesc
+ M68kInfo
+ MCDisassembler
+ Support
+
+ ADD_TO_COMPONENT
+ M68k
+)
+
diff --git a/src/llvm-project/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp b/src/llvm-project/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
new file mode 100644
index 0000000..a8453c8
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
@@ -0,0 +1,612 @@
+//===- M68kDisassembler.cpp - Disassembler for M68k -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the M68k Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "M68k.h"
+#include "M68kRegisterInfo.h"
+#include "M68kSubtarget.h"
+#include "MCTargetDesc/M68kMCCodeEmitter.h"
+#include "MCTargetDesc/M68kMCTargetDesc.h"
+#include "TargetInfo/M68kTargetInfo.h"
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "m68k-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+constexpr unsigned MaxInstructionWords = 11;
+
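+/// Holds the 16-bit words of a single instruction, in the order they were
+/// assembled from the (big-endian) byte stream by fill() below.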
+class M68kInstructionBuffer {
+ typedef SmallVector<uint16_t, MaxInstructionWords> BufferType;
+ BufferType Buffer;
+
+public:
+ M68kInstructionBuffer() {}
+
+ template <typename TIt>
+ M68kInstructionBuffer(TIt Start, TIt End) : Buffer(Start, End) {}
+
+ unsigned size() const { return Buffer.size(); }
+
+ BufferType::const_iterator begin() const { return Buffer.begin(); }
+ BufferType::const_iterator end() const { return Buffer.end(); }
+
+ uint16_t operator[](unsigned Index) const {
+ assert((Index < Buffer.size()) && "tried to read out of bounds word");
+ return Buffer[Index];
+ }
+
+ void truncate(unsigned NewLength) {
+ assert((NewLength <= Buffer.size()) &&
+ "instruction buffer too short to truncate");
+ Buffer.resize(NewLength);
+ }
+
+ void dump() const;
+
+ static M68kInstructionBuffer fill(ArrayRef<uint8_t> Bytes);
+};
+
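+/// Hands out the contents of an M68kInstructionBuffer bit by bit via
+/// readBits(), tracking how many bits have been consumed so far.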
+class M68kInstructionReader {
+ M68kInstructionBuffer Buffer;
+ unsigned NumRead;
+
+public:
+ M68kInstructionReader(M68kInstructionBuffer Buf) : Buffer(Buf), NumRead(0) {}
+
+ unsigned size() const { return (Buffer.size() * 16) - NumRead; }
+
+ uint64_t readBits(unsigned NumBits);
+};
+
+struct M68kInstructionLookup {
+ unsigned OpCode;
+ M68kInstructionBuffer Mask;
+ M68kInstructionBuffer Value;
+
+ unsigned size() const { return Mask.size(); }
+
+ // Check whether this instruction could possibly match the given bytes.
+ bool matches(const M68kInstructionBuffer &Test) const;
+ void dump() const;
+};
+
+class M68kInstructionLookupBuilder {
+ std::array<uint16_t, MaxInstructionWords> Mask;
+ std::array<uint16_t, MaxInstructionWords> Value;
+ unsigned NumWritten;
+
+public:
+ M68kInstructionLookupBuilder() : NumWritten(0) {
+ Mask.fill(0);
+ Value.fill(0);
+ }
+
+ unsigned numWords() const {
+ assert(!(NumWritten & 0xf) && "instructions must be whole words");
+ return NumWritten >> 4;
+ }
+
+ bool isValid() const;
+ M68kInstructionLookup build(unsigned OpCode);
+ void addBits(unsigned N, uint64_t Bits);
+ void skipBits(unsigned N);
+};
+
+/// A disassembler class for M68k.
+class M68kDisassembler : public MCDisassembler {
+ MCInstrInfo *MCII;
+ std::vector<M68kInstructionLookup> Lookups;
+
+public:
+ M68kDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+ MCInstrInfo *MCII)
+ : MCDisassembler(STI, Ctx), MCII(MCII) {
+ buildBeadTable();
+ }
+ virtual ~M68kDisassembler() {}
+
+ void buildBeadTable();
+ DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &CStream) const override;
+ void decodeReg(MCInst &Instr, unsigned int Bead,
+ M68kInstructionReader &Reader, unsigned &Scratch) const;
+ void decodeImm(MCInst &Instr, unsigned int Bead,
+ M68kInstructionReader &Reader, unsigned &Scratch) const;
+ unsigned int getRegOperandIndex(MCInst &Instr, unsigned int Bead) const;
+ unsigned int getImmOperandIndex(MCInst &Instr, unsigned int Bead) const;
+};
+} // namespace
+
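+// Indices 0-7 decode to address registers (with 7 being SP) and 8-15 to data
+// registers, matching how decodeReg() folds the D/A bit into the register
+// index.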
+static unsigned RegisterDecode[] = {
+ M68k::A0, M68k::A1, M68k::A2, M68k::A3, M68k::A4, M68k::A5,
+ M68k::A6, M68k::SP, M68k::D0, M68k::D1, M68k::D2, M68k::D3,
+ M68k::D4, M68k::D5, M68k::D6, M68k::D7,
+};
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void M68kInstructionBuffer::dump() const {
+ for (auto Word : Buffer) {
+ for (unsigned B = 0; B < 16; ++B) {
+ uint16_t Bit = (1 << (16 - B - 1));
+ unsigned IsClear = !(Word & Bit);
+
+ if (B == 8)
+ dbgs() << " ";
+
+ char Ch = IsClear ? '0' : '1';
+ dbgs() << Ch;
+ }
+
+ dbgs() << " ";
+ }
+
+ dbgs() << "\n";
+}
+#endif
+
+M68kInstructionBuffer M68kInstructionBuffer::fill(ArrayRef<uint8_t> Bytes) {
+ SmallVector<uint16_t, MaxInstructionWords> Buffer;
+ Buffer.resize(std::min(Bytes.size() / 2, Buffer.max_size()));
+
+ for (unsigned I = 0, E = Buffer.size(); I < E; ++I) {
+ unsigned Offset = I * 2;
+ uint64_t Hi = Bytes[Offset];
+ uint64_t Lo = Bytes[Offset + 1];
+ uint64_t Word = (Hi << 8) | Lo;
+ Buffer[I] = Word;
+
+ LLVM_DEBUG(
+ errs() << format("Read word %x (%d)\n", (unsigned)Word, Buffer.size()));
+ }
+
+ return M68kInstructionBuffer(Buffer.begin(), Buffer.end());
+}
+
+uint64_t M68kInstructionReader::readBits(unsigned NumBits) {
+ assert((size() >= NumBits) && "not enough bits to read");
+
+  // The bits are read in 16-bit chunks because the stream was fetched as
+  // big-endian 16-bit words; a read that crosses a word boundary has to be
+  // stitched together from two of those words.
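+  //
+  // For example, with 12 bits of the current word already consumed, a
+  // readBits(8) takes the remaining 4 bits of that word and the low 4 bits
+  // of the next one.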
+
+ uint64_t Value = 0;
+ unsigned BitsRead = 0;
+
+ while (BitsRead < NumBits) {
+ unsigned AvailableThisWord = 16 - (NumRead & 0xf);
+ unsigned ToRead = std::min(NumBits, AvailableThisWord);
+
+ unsigned WordIndex = NumRead >> 4;
+ uint64_t ThisWord = Buffer[WordIndex] >> (NumRead & 0xf);
+ uint64_t Mask = (1 << ToRead) - 1;
+ Value |= (ThisWord & Mask) << BitsRead;
+ NumRead += ToRead;
+ BitsRead += ToRead;
+ }
+ return Value;
+}
+
+bool M68kInstructionLookup::matches(const M68kInstructionBuffer &Test) const {
+ if (Test.size() < Value.size())
+ return false;
+
+ for (unsigned I = 0, E = Value.size(); I < E; ++I) {
+ uint16_t Have = Test[I];
+ uint16_t Need = Value[I];
+ uint16_t WordMask = Mask[I];
+
+ if ((Have & WordMask) != Need)
+ return false;
+ }
+
+ return true;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void M68kInstructionLookup::dump() const {
+ dbgs() << "M68kInstructionLookup " << OpCode << " ";
+
+ for (unsigned I = 0, E = Mask.size(); I < E; ++I) {
+ uint16_t WordMask = Mask[I];
+ uint16_t WordValue = Value[I];
+
+ for (unsigned B = 0; B < 16; ++B) {
+ uint16_t Bit = (1 << (15 - B));
+ unsigned IsMasked = !(WordMask & Bit);
+ unsigned IsClear = !(WordValue & Bit);
+
+ if (B == 8)
+ dbgs() << " ";
+
+ char Ch = IsMasked ? '?' : (IsClear ? '0' : '1');
+ dbgs() << Ch;
+ }
+
+ dbgs() << " ";
+ }
+
+ dbgs() << "\n";
+}
+#endif
+
+bool M68kInstructionLookupBuilder::isValid() const {
+ for (unsigned I = 0, E = numWords(); I < E; ++I)
+ if (Mask[I])
+ return true;
+
+ return false;
+}
+
+M68kInstructionLookup M68kInstructionLookupBuilder::build(unsigned OpCode) {
+ unsigned NumWords = numWords();
+ M68kInstructionBuffer MaskBuffer(Mask.begin(), Mask.begin() + NumWords);
+ M68kInstructionBuffer ValueBuffer(Value.begin(), Value.begin() + NumWords);
+ M68kInstructionLookup Ret;
+ Ret.OpCode = OpCode;
+ Ret.Mask = MaskBuffer;
+ Ret.Value = ValueBuffer;
+ return Ret;
+}
+
+void M68kInstructionLookupBuilder::addBits(unsigned N, uint64_t Bits) {
+ while (N > 0) {
+ unsigned WordIndex = NumWritten >> 4;
+ unsigned WordOffset = NumWritten & 0xf;
+ unsigned AvailableThisWord = 16 - WordOffset;
+ unsigned ToWrite = std::min(AvailableThisWord, N);
+
+ uint16_t WordMask = (1 << ToWrite) - 1;
+ uint16_t BitsToWrite = Bits & WordMask;
+
+ Value[WordIndex] |= (BitsToWrite << WordOffset);
+ Mask[WordIndex] |= (WordMask << WordOffset);
+
+ Bits >>= ToWrite;
+ N -= ToWrite;
+ NumWritten += ToWrite;
+ }
+}
+
+void M68kInstructionLookupBuilder::skipBits(unsigned N) { NumWritten += N; }
+
+// This is a bit of a hack: we can't generate this table at table-gen time
+// because some of the definitions are in our platform.
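+// Each lookup entry pairs an opcode with a mask/value bit pattern built from
+// that instruction's beads; getInstruction() later matches the fetched words
+// against these patterns to choose the opcode.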
+void M68kDisassembler::buildBeadTable() {
+ const unsigned NumInstr = M68k::INSTRUCTION_LIST_END;
+ Lookups.reserve(NumInstr);
+
+ for (unsigned I = 0; I < NumInstr; ++I) {
+ M68kInstructionLookupBuilder Builder;
+
+ for (const uint8_t *PartPtr = M68k::getMCInstrBeads(I); *PartPtr;
+ ++PartPtr) {
+ uint8_t Bead = *PartPtr;
+ unsigned Ext = Bead >> 4;
+ unsigned Op = Bead & 0xf;
+
+ switch (Op) {
+ case M68kBeads::Ctrl:
+ // Term will have already been skipped by the loop.
+ assert((Ext == M68kBeads::Ignore) && "unexpected command bead");
+ break;
+
+ case M68kBeads::Bits1:
+ Builder.addBits(1, Ext);
+ break;
+
+ case M68kBeads::Bits2:
+ Builder.addBits(2, Ext);
+ break;
+
+ case M68kBeads::Bits3:
+ Builder.addBits(3, Ext);
+ break;
+
+ case M68kBeads::Bits4:
+ Builder.addBits(4, Ext);
+ break;
+
+ case M68kBeads::DAReg:
+ case M68kBeads::DA:
+ case M68kBeads::DReg:
+ case M68kBeads::Reg:
+ if (Op != M68kBeads::DA)
+ Builder.skipBits(3);
+
+ if (Op != M68kBeads::Reg && Op != M68kBeads::DReg)
+ Builder.skipBits(1);
+
+ break;
+
+ case M68kBeads::Disp8:
+ Builder.skipBits(8);
+ break;
+
+ case M68kBeads::Imm8:
+ case M68kBeads::Imm16:
+ Builder.skipBits(16);
+ break;
+
+ case M68kBeads::Imm32:
+ Builder.skipBits(32);
+ break;
+
+ case M68kBeads::Imm3:
+ Builder.skipBits(3);
+ break;
+
+ default:
+ llvm_unreachable("unhandled bead type");
+ }
+ }
+
+ // Ignore instructions which are unmatchable (usually pseudo instructions).
+ if (!Builder.isValid())
+ continue;
+
+ Lookups.push_back(Builder.build(I));
+ }
+}
+
+unsigned M68kDisassembler::getRegOperandIndex(MCInst &Instr,
+ unsigned Bead) const {
+ unsigned Ext = Bead >> 4;
+
+ const MCInstrDesc &Desc = MCII->get(Instr.getOpcode());
+ auto MIOpIdx = M68k::getLogicalOperandIdx(Instr.getOpcode(), Ext & 7);
+
+ if (M68kII::hasMultiMIOperands(Instr.getOpcode(), Ext & 7)) {
+ bool IsPCRel = Desc.OpInfo[MIOpIdx].OperandType == MCOI::OPERAND_PCREL;
+ if (IsPCRel)
+ MIOpIdx += M68k::PCRelIndex;
+ else if (Ext & 8)
+ MIOpIdx += M68k::MemIndex;
+ else
+ MIOpIdx += M68k::MemBase;
+ }
+
+ return MIOpIdx;
+}
+
+unsigned M68kDisassembler::getImmOperandIndex(MCInst &Instr,
+ unsigned Bead) const {
+ unsigned Ext = Bead >> 4;
+
+ const MCInstrDesc &Desc = MCII->get(Instr.getOpcode());
+ auto MIOpIdx = M68k::getLogicalOperandIdx(Instr.getOpcode(), Ext & 7);
+
+ if (M68kII::hasMultiMIOperands(Instr.getOpcode(), Ext & 7)) {
+ bool IsPCRel = Desc.OpInfo[MIOpIdx].OperandType == MCOI::OPERAND_PCREL;
+ if (IsPCRel)
+ MIOpIdx += M68k::PCRelDisp;
+ else if (Ext & 8)
+ MIOpIdx += M68k::MemOuter;
+ else
+ MIOpIdx += M68k::MemDisp;
+ }
+
+ return MIOpIdx;
+}
+
+void M68kDisassembler::decodeReg(MCInst &Instr, unsigned Bead,
+ M68kInstructionReader &Reader,
+ unsigned &Scratch) const {
+ unsigned Op = Bead & 0xf;
+ LLVM_DEBUG(errs() << format("decodeReg %x\n", Bead));
+
+ if (Op != M68kBeads::DA)
+ Scratch = (Scratch & ~7) | Reader.readBits(3);
+
+ if (Op != M68kBeads::Reg) {
+ bool DA = (Op != M68kBeads::DReg) && Reader.readBits(1);
+ if (!DA)
+ Scratch |= 8;
+ else
+ Scratch &= ~8;
+ }
+}
+
+void M68kDisassembler::decodeImm(MCInst &Instr, unsigned Bead,
+ M68kInstructionReader &Reader,
+ unsigned &Scratch) const {
+ unsigned Op = Bead & 0xf;
+ LLVM_DEBUG(errs() << format("decodeImm %x\n", Bead));
+
+ unsigned NumToRead;
+ switch (Op) {
+ case M68kBeads::Disp8:
+ NumToRead = 8;
+ break;
+ case M68kBeads::Imm8:
+ case M68kBeads::Imm16:
+ NumToRead = 16;
+ break;
+ case M68kBeads::Imm32:
+ NumToRead = 32;
+ break;
+ case M68kBeads::Imm3:
+ NumToRead = 3;
+ break;
+ default:
+ llvm_unreachable("invalid imm");
+ }
+
+ Scratch = (Scratch << NumToRead) | Reader.readBits(NumToRead);
+}
+
+DecodeStatus M68kDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CStream) const {
+ // Read and shift the input (fetch as much as we can for now).
+ auto Buffer = M68kInstructionBuffer::fill(Bytes);
+ if (Buffer.size() == 0)
+ return Fail;
+
+ // Check through our lookup table.
+ bool Found = false;
+ for (unsigned I = 0, E = Lookups.size(); I < E; ++I) {
+ const M68kInstructionLookup &Lookup = Lookups[I];
+ if (!Lookup.matches(Buffer))
+ continue;
+
+ Found = true;
+ Size = Lookup.size() * 2;
+ Buffer.truncate(Lookup.size());
+ Instr.setOpcode(Lookup.OpCode);
+ LLVM_DEBUG(errs() << "decoding instruction " << MCII->getName(Lookup.OpCode)
+ << "\n");
+ break;
+ }
+
+ if (!Found)
+ return Fail;
+
+ M68kInstructionReader Reader(Buffer);
+ const MCInstrDesc &Desc = MCII->get(Instr.getOpcode());
+ unsigned NumOperands = Desc.NumOperands;
+
+ // Now use the beads to decode the operands.
+ enum class OperandType {
+ Invalid,
+ Reg,
+ Imm,
+ };
+
+ SmallVector<OperandType, 6> OpType(NumOperands, OperandType::Invalid);
+ SmallVector<unsigned, 6> Scratch(NumOperands, 0);
+ for (const uint8_t *PartPtr = M68k::getMCInstrBeads(Instr.getOpcode());
+ *PartPtr; ++PartPtr) {
+ uint8_t Bead = *PartPtr;
+ unsigned Ext = Bead >> 4;
+ unsigned Op = Bead & 0xf;
+ unsigned MIOpIdx;
+
+ switch (Op) {
+ case M68kBeads::Ctrl:
+ // Term will have already been skipped by the loop.
+ assert((Ext == M68kBeads::Ignore) && "unexpected command bead");
+ break;
+
+ // These bits are constant - if we're here we've already matched them.
+ case M68kBeads::Bits1:
+ Reader.readBits(1);
+ break;
+ case M68kBeads::Bits2:
+ Reader.readBits(2);
+ break;
+ case M68kBeads::Bits3:
+ Reader.readBits(3);
+ break;
+ case M68kBeads::Bits4:
+ Reader.readBits(4);
+ break;
+
+ case M68kBeads::DAReg:
+ case M68kBeads::DA:
+ case M68kBeads::DReg:
+ case M68kBeads::Reg:
+ MIOpIdx = getRegOperandIndex(Instr, Bead);
+ assert(((OpType[MIOpIdx] == OperandType::Invalid) ||
+ (OpType[MIOpIdx] == OperandType::Reg)) &&
+ "operands cannot change type");
+ OpType[MIOpIdx] = OperandType::Reg;
+ decodeReg(Instr, Bead, Reader, Scratch[MIOpIdx]);
+ break;
+
+ case M68kBeads::Disp8:
+ case M68kBeads::Imm8:
+ case M68kBeads::Imm16:
+ case M68kBeads::Imm32:
+ case M68kBeads::Imm3:
+ MIOpIdx = getImmOperandIndex(Instr, Bead);
+ assert(((OpType[MIOpIdx] == OperandType::Invalid) ||
+ (OpType[MIOpIdx] == OperandType::Imm)) &&
+ "operands cannot change type");
+ OpType[MIOpIdx] = OperandType::Imm;
+ decodeImm(Instr, Bead, Reader, Scratch[MIOpIdx]);
+ break;
+
+ default:
+ llvm_unreachable("unhandled bead type");
+ }
+ }
+
+ // Copy constrained operands.
+ for (unsigned DstMIOpIdx = 0; DstMIOpIdx < NumOperands; ++DstMIOpIdx) {
+ int TiedTo = Desc.getOperandConstraint(DstMIOpIdx, MCOI::TIED_TO);
+ if (TiedTo < 0)
+ continue;
+
+ unsigned SrcMIOpIdx = TiedTo;
+
+ unsigned OpCount = 0;
+ for (unsigned I = 0;; ++I) {
+ unsigned Offset = M68k::getLogicalOperandIdx(Instr.getOpcode(), I);
+ assert(Offset <= SrcMIOpIdx && "missing logical operand");
+ if (Offset == SrcMIOpIdx) {
+ OpCount = M68k::getLogicalOperandSize(Instr.getOpcode(), I);
+ break;
+ }
+ }
+ assert(OpCount != 0 && "operand count not found");
+
+ for (unsigned I = 0; I < OpCount; ++I) {
+ assert(OpType[DstMIOpIdx + I] == OperandType::Invalid &&
+ "tried to stomp over operand whilst applying constraints");
+ OpType[DstMIOpIdx + I] = OpType[SrcMIOpIdx + I];
+ Scratch[DstMIOpIdx + I] = Scratch[SrcMIOpIdx + I];
+ }
+ }
+
+ // Create the operands from our scratch space.
+ for (unsigned O = 0; O < NumOperands; ++O) {
+ switch (OpType[O]) {
+ case OperandType::Invalid:
+ assert(false && "operand not parsed");
+
+ case OperandType::Imm:
+ Instr.addOperand(MCOperand::createImm(Scratch[O]));
+ break;
+
+ case OperandType::Reg:
+ Instr.addOperand(MCOperand::createReg(RegisterDecode[Scratch[O]]));
+ break;
+ }
+ }
+
+ assert((Reader.size() == 0) && "wrong number of bits consumed");
+ return Success;
+}
+
+static MCDisassembler *createM68kDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new M68kDisassembler(STI, Ctx, T.createMCInstrInfo());
+}
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kDisassembler() {
+ // Register the disassembler.
+ TargetRegistry::RegisterMCDisassembler(getTheM68kTarget(),
+ createM68kDisassembler);
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kCallLowering.cpp b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kCallLowering.cpp
new file mode 100644
index 0000000..c5931cb
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kCallLowering.cpp
@@ -0,0 +1,152 @@
+//===-- M68kCallLowering.cpp - Call lowering -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "M68kCallLowering.h"
+#include "M68kISelLowering.h"
+#include "M68kInstrInfo.h"
+#include "M68kSubtarget.h"
+#include "M68kTargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/TargetCallingConv.h"
+
+using namespace llvm;
+
+M68kCallLowering::M68kCallLowering(const M68kTargetLowering &TLI)
+ : CallLowering(&TLI) {}
+
+struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
+ OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder MIB)
+ : OutgoingValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+
+ void assignValueToReg(Register ValVReg, Register PhysReg,
+ CCValAssign &VA) override {
+ MIB.addUse(PhysReg, RegState::Implicit);
+ Register ExtReg = extendRegister(ValVReg, VA);
+ MIRBuilder.buildCopy(PhysReg, ExtReg);
+ }
+
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+ llvm_unreachable("unimplemented");
+ }
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
+ llvm_unreachable("unimplemented");
+ }
+
+ MachineInstrBuilder MIB;
+};
+bool M68kCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI,
+ Register SwiftErrorVReg) const {
+
+ auto MIB = MIRBuilder.buildInstrNoInsert(M68k::RTS);
+ bool Success = true;
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const M68kTargetLowering &TLI = *getTLI<M68kTargetLowering>();
+ CCAssignFn *AssignFn =
+ TLI.getCCAssignFn(F.getCallingConv(), true, F.isVarArg());
+ auto &DL = F.getParent()->getDataLayout();
+ if (!VRegs.empty()) {
+ SmallVector<ArgInfo, 8> SplitArgs;
+ ArgInfo OrigArg{VRegs, Val->getType(), 0};
+ setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
+ splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv());
+ OutgoingValueAssigner ArgAssigner(AssignFn);
+ OutgoingArgHandler ArgHandler(MIRBuilder, MRI, MIB);
+ Success = determineAndHandleAssignments(ArgHandler, ArgAssigner, SplitArgs,
+ MIRBuilder, F.getCallingConv(),
+ F.isVarArg());
+ }
+ MIRBuilder.insertInstr(MIB);
+ return Success;
+}
+
+bool M68kCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const auto &DL = F.getParent()->getDataLayout();
+ auto &TLI = *getTLI<M68kTargetLowering>();
+
+ SmallVector<ArgInfo, 8> SplitArgs;
+ unsigned I = 0;
+ for (const auto &Arg : F.args()) {
+ ArgInfo OrigArg{VRegs[I], Arg.getType(), I};
+ setArgFlags(OrigArg, I + AttributeList::FirstArgIndex, DL, F);
+ splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv());
+ ++I;
+ }
+
+ CCAssignFn *AssignFn =
+ TLI.getCCAssignFn(F.getCallingConv(), false, F.isVarArg());
+ IncomingValueAssigner ArgAssigner(AssignFn);
+ FormalArgHandler ArgHandler(MIRBuilder, MRI);
+ return determineAndHandleAssignments(ArgHandler, ArgAssigner, SplitArgs,
+ MIRBuilder, F.getCallingConv(),
+ F.isVarArg());
+}
+
+void M68kIncomingValueHandler::assignValueToReg(Register ValVReg,
+ Register PhysReg,
+ CCValAssign &VA) {
+ MIRBuilder.getMRI()->addLiveIn(PhysReg);
+ MIRBuilder.getMBB().addLiveIn(PhysReg);
+ IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
+}
+
+void M68kIncomingValueHandler::assignValueToAddress(Register ValVReg,
+ Register Addr,
+ LLT MemTy,
+ MachinePointerInfo &MPO,
+ CCValAssign &VA) {
+ MachineFunction &MF = MIRBuilder.getMF();
+ auto *MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, MemTy,
+ inferAlignFromPtrInfo(MF, MPO));
+ MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+}
+
+Register M68kIncomingValueHandler::getStackAddress(uint64_t Size,
+ int64_t Offset,
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) {
+ auto &MFI = MIRBuilder.getMF().getFrameInfo();
+ const bool IsImmutable = !Flags.isByVal();
+ int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
+ MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+
+ // Build Frame Index
+ llvm::LLT FramePtr = LLT::pointer(
+ 0, MIRBuilder.getMF().getDataLayout().getPointerSizeInBits());
+ MachineInstrBuilder AddrReg = MIRBuilder.buildFrameIndex(FramePtr, FI);
+ StackUsed = std::max(StackUsed, Size + Offset);
+ return AddrReg.getReg(0);
+}
+
+bool M68kCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const {
+ return false;
+}
+
+bool M68kCallLowering::enableBigEndian() const { return true; }
diff --git a/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kCallLowering.h b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kCallLowering.h
new file mode 100644
index 0000000..9e0d462
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kCallLowering.h
@@ -0,0 +1,72 @@
+//===-- M68kCallLowering.h - Call lowering -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_GLSEL_M68KCALLLOWERING_H
+#define LLVM_LIB_TARGET_M68K_GLSEL_M68KCALLLOWERING_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+namespace llvm {
+
+class M68kTargetLowering;
+
+class M68kCallLowering : public CallLowering {
+  // TODO: Only return instructions with no value are supported at this point.
+
+public:
+ M68kCallLowering(const M68kTargetLowering &TLI);
+
+ bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+ ArrayRef<Register> VRegs, FunctionLoweringInfo &FLI,
+ Register SwiftErrorVReg) const override;
+
+ bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const override;
+
+ bool lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const override;
+
+ bool enableBigEndian() const override;
+};
+struct M68kIncomingValueHandler : public CallLowering::IncomingValueHandler {
+ M68kIncomingValueHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI)
+ : CallLowering::IncomingValueHandler(MIRBuilder, MRI) {}
+
+  uint64_t StackUsed = 0;
+
+private:
+ void assignValueToReg(Register ValVReg, Register PhysReg,
+ CCValAssign &VA) override;
+
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
+ MachinePointerInfo &MPO, CCValAssign &VA) override;
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override;
+};
+
+struct FormalArgHandler : public M68kIncomingValueHandler {
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+ : M68kIncomingValueHandler(MIRBuilder, MRI) {}
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_M68K_GLSEL_M68KCALLLOWERING_H
diff --git a/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kInstructionSelector.cpp b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kInstructionSelector.cpp
new file mode 100644
index 0000000..9ac4ab9
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kInstructionSelector.cpp
@@ -0,0 +1,90 @@
+//===- M68kInstructionSelector.cpp ------------------------------*- C++ -*-===//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// M68k.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "M68kRegisterBankInfo.h"
+#include "M68kSubtarget.h"
+#include "M68kTargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "m68k-isel"
+
+using namespace llvm;
+
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#include "M68kGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+
+namespace {
+
+class M68kInstructionSelector : public InstructionSelector {
+public:
+ M68kInstructionSelector(const M68kTargetMachine &TM, const M68kSubtarget &STI,
+ const M68kRegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I) override;
+ static const char *getName() { return DEBUG_TYPE; }
+
+private:
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+
+ const M68kTargetMachine &TM;
+ const M68kInstrInfo &TII;
+ const M68kRegisterInfo &TRI;
+ const M68kRegisterBankInfo &RBI;
+
+#define GET_GLOBALISEL_PREDICATES_DECL
+#include "M68kGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "M68kGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
+#include "M68kGenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+
+M68kInstructionSelector::M68kInstructionSelector(
+ const M68kTargetMachine &TM, const M68kSubtarget &STI,
+ const M68kRegisterBankInfo &RBI)
+ : InstructionSelector(), TM(TM), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI),
+
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "M68kGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "M68kGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
+
+bool M68kInstructionSelector::select(MachineInstr &I) {
+ // Certain non-generic instructions also need some special handling.
+ if (!isPreISelGenericOpcode(I.getOpcode()))
+ return true;
+
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+
+ return false;
+}
+
+namespace llvm {
+InstructionSelector *
+createM68kInstructionSelector(const M68kTargetMachine &TM,
+ const M68kSubtarget &Subtarget,
+ const M68kRegisterBankInfo &RBI) {
+ return new M68kInstructionSelector(TM, Subtarget, RBI);
+}
+} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kLegalizerInfo.cpp b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kLegalizerInfo.cpp
new file mode 100644
index 0000000..bcbe628
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kLegalizerInfo.cpp
@@ -0,0 +1,33 @@
+//===-- M68kLegalizerInfo.cpp ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the MachineLegalizer class for M68k.
+//===----------------------------------------------------------------------===//
+
+#include "M68kLegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
+
+using namespace llvm;
+
+M68kLegalizerInfo::M68kLegalizerInfo(const M68kSubtarget &ST) {
+ using namespace TargetOpcode;
+ const LLT S32 = LLT::scalar(32);
+ const LLT P0 = LLT::pointer(0, 32);
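+  // For now only 32-bit scalars and 32-bit pointers are marked legal here;
+  // everything else is left to the legacy legalizer's defaults (see
+  // computeTables() below).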
+ getActionDefinitionsBuilder(G_LOAD).legalFor({S32});
+ getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({P0});
+ getActionDefinitionsBuilder(G_ADD).legalFor({S32});
+ getActionDefinitionsBuilder(G_SUB).legalFor({S32});
+ getActionDefinitionsBuilder(G_MUL).legalFor({S32});
+ getActionDefinitionsBuilder(G_UDIV).legalFor({S32});
+ getLegacyLegalizerInfo().computeTables();
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kLegalizerInfo.h b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kLegalizerInfo.h
new file mode 100644
index 0000000..205aa81
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kLegalizerInfo.h
@@ -0,0 +1,29 @@
+//===- M68kLegalizerInfo --------------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the MachineLegalizer class for
+/// M68k.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_GLSEL_M68KLEGALIZERINFO_H
+#define LLVM_LIB_TARGET_M68K_GLSEL_M68KLEGALIZERINFO_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class M68kSubtarget;
+
+/// This struct provides the legalization rules for the M68k target.
+struct M68kLegalizerInfo : public LegalizerInfo {
+public:
+ M68kLegalizerInfo(const M68kSubtarget &ST);
+};
+} // end namespace llvm
+#endif // LLVM_LIB_TARGET_M68K_GLSEL_M68KLEGALIZERINFO_H
diff --git a/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.cpp b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.cpp
new file mode 100644
index 0000000..d124786
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.cpp
@@ -0,0 +1,27 @@
+//===-- M68kRegisterBankInfo.cpp -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for M68k.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "M68kRegisterBankInfo.h"
+#include "MCTargetDesc/M68kMCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+#define GET_TARGET_REGBANK_IMPL
+#include "M68kGenRegisterBank.inc"
+#undef GET_TARGET_REGBANK_IMPL
+
+using namespace llvm;
+
+M68kRegisterBankInfo::M68kRegisterBankInfo(const TargetRegisterInfo &TRI)
+ : M68kGenRegisterBankInfo() {}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.h b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.h
new file mode 100644
index 0000000..9b97cc4
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.h
@@ -0,0 +1,39 @@
+//===-- M68kRegisterBankInfo.h ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for M68k.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_GLSEL_M68KREGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_M68K_GLSEL_M68KREGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+#define GET_REGBANK_DECLARATIONS
+#include "M68kGenRegisterBank.inc"
+#undef GET_REGBANK_DECLARATIONS
+
+namespace llvm {
+
+class TargetRegisterInfo;
+
+class M68kGenRegisterBankInfo : public RegisterBankInfo {
+protected:
+#define GET_TARGET_REGBANK_CLASS
+#include "M68kGenRegisterBank.inc"
+#undef GET_TARGET_REGBANK_CLASS
+};
+
+/// This class provides the information for the target register banks.
+class M68kRegisterBankInfo final : public M68kGenRegisterBankInfo {
+public:
+ M68kRegisterBankInfo(const TargetRegisterInfo &TRI);
+};
+} // end namespace llvm
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kRegisterBanks.td b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kRegisterBanks.td
new file mode 100644
index 0000000..2d1e74f
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/GlSel/M68kRegisterBanks.td
@@ -0,0 +1,15 @@
+//===-- M68kRegisterBanks.td - Describe the M68k Banks -------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Define the M68k register banks used for GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+/// General-purpose registers. Here we define a register bank named AnyGPR.
+def GPRRegBank : RegisterBank<"AnyGPR", [DR8]>;
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68k.h b/src/llvm-project/llvm/lib/Target/M68k/M68k.h
new file mode 100644
index 0000000..cef40be
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68k.h
@@ -0,0 +1,57 @@
+//===- M68k.h - Top-level interface for M68k representation -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the entry points for global functions defined in the
+/// M68k target library, as used by the LLVM JIT.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_M68K_H
+#define LLVM_LIB_TARGET_M68K_M68K_H
+
+namespace llvm {
+
+class FunctionPass;
+class InstructionSelector;
+class M68kRegisterBankInfo;
+class M68kSubtarget;
+class M68kTargetMachine;
+
+/// This pass converts a legalized DAG into an M68k-specific DAG, ready for
+/// instruction scheduling.
+FunctionPass *createM68kISelDag(M68kTargetMachine &TM);
+
+/// Return a Machine IR pass that expands M68k-specific pseudo
+/// instructions into a sequence of actual instructions. This pass
+/// must run after prologue/epilogue insertion and before lowering
+/// the MachineInstr to MC.
+FunctionPass *createM68kExpandPseudoPass();
+
+/// This pass initializes a global base register for PIC on M68k.
+FunctionPass *createM68kGlobalBaseRegPass();
+
+/// Finds sequential MOVEM instructions and collapses them into a single one.
+/// This pass has to run after all pseudo expansions and prologue/epilogue
+/// emission so that all possible MOVEMs are already in place.
+FunctionPass *createM68kCollapseMOVEMPass();
+
+/// Finds MOVE instructions before any conditional branch instruction and
+/// replaces them with a MOVEM instruction. Motorola's MOVE clobbers the V and
+/// C flags in the CCR, which can prevent the branch from taking the correct
+/// route. This pass has to run after all pseudo expansions and
+/// prologue/epilogue emission so that all possible MOVEs are present.
+FunctionPass *createM68kConvertMOVToMOVMPass();
+
+InstructionSelector *
+createM68kInstructionSelector(const M68kTargetMachine &, const M68kSubtarget &,
+ const M68kRegisterBankInfo &);
+
+} // namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68k.td b/src/llvm-project/llvm/lib/Target/M68k/M68k.td
new file mode 100644
index 0000000..669eb32
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68k.td
@@ -0,0 +1,127 @@
+//===-- M68k.td - Motorola 680x0 target definitions ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This is a target description file for the Motorola 680x0 family, referred
+/// to here as the "M68k" architecture.
+///
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// M68k Subtarget features
+//===----------------------------------------------------------------------===//
+
+def FeatureISA00
+ : SubtargetFeature<"isa-68000", "SubtargetKind", "M00",
+ "Is M68000 ISA supported">;
+
+def FeatureISA10
+ : SubtargetFeature<"isa-68010", "SubtargetKind", "M10",
+ "Is M68010 ISA supported",
+ [ FeatureISA00 ]>;
+
+def FeatureISA20
+ : SubtargetFeature<"isa-68020", "SubtargetKind", "M20",
+ "Is M68020 ISA supported",
+ [ FeatureISA10 ]>;
+
+def FeatureISA30
+ : SubtargetFeature<"isa-68030", "SubtargetKind", "M30",
+ "Is M68030 ISA supported",
+ [ FeatureISA20 ]>;
+
+def FeatureISA40
+ : SubtargetFeature<"isa-68040", "SubtargetKind", "M40",
+ "Is M68040 ISA supported",
+ [ FeatureISA30 ]>;
+
+def FeatureISA60
+ : SubtargetFeature<"isa-68060", "SubtargetKind", "M60",
+ "Is M68060 ISA supported",
+ [ FeatureISA40 ]>;
+
+foreach i = {0-6} in
+ def FeatureReserveA#i :
+ SubtargetFeature<"reserve-a"#i, "UserReservedRegister[M68k::A"#i#"]",
+ "true", "Reserve A"#i#" register">;
+foreach i = {0-7} in
+ def FeatureReserveD#i :
+ SubtargetFeature<"reserve-d"#i, "UserReservedRegister[M68k::D"#i#"]",
+ "true", "Reserve D"#i#" register">;
+
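+// Illustrative note (not part of the original description): these features are
+// selected through the usual LLVM subtarget-feature syntax. A rough sketch of
+// a driver invocation, with the triple and CPU names assumed here for
+// illustration only, could look like:
+//
+//   llc -mtriple=m68k -mcpu=M68020 -mattr=+reserve-a5 foo.ll
+//
+// which enables the 68020 ISA (and everything it implies) and marks A5 as a
+// user-reserved register.
+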
+//===----------------------------------------------------------------------===//
+// M68k processors supported.
+//===----------------------------------------------------------------------===//
+
+include "M68kSchedule.td"
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : ProcessorModel<Name, GenericM68kModel, Features>;
+
+def : Proc<"generic", [ FeatureISA00 ]>;
+def : Proc<"M68000", [ FeatureISA00 ]>;
+def : Proc<"M68010", [ FeatureISA10 ]>;
+def : Proc<"M68020", [ FeatureISA20 ]>;
+def : Proc<"M68030", [ FeatureISA30 ]>;
+def : Proc<"M68040", [ FeatureISA40 ]>;
+def : Proc<"M68060", [ FeatureISA60 ]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "M68kRegisterInfo.td"
+include "GlSel/M68kRegisterBanks.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "M68kInstrInfo.td"
+
+def M68kInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "M68kCallingConv.td"
+
+//===---------------------------------------------------------------------===//
+// Assembly Printers
+//===---------------------------------------------------------------------===//
+
+def M68kAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ bit isMCAsmWriter = 1;
+}
+
+//===---------------------------------------------------------------------===//
+// Assembly Parsers
+//===---------------------------------------------------------------------===//
+
+def M68kAsmParser : AsmParser {
+ let ShouldEmitMatchRegisterName = 0;
+ let ShouldEmitMatchRegisterAltName = 0;
+}
+
+def M68kAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+def M68k : Target {
+ let InstructionSet = M68kInstrInfo;
+ let AssemblyParsers = [M68kAsmParser];
+ let AssemblyWriters = [M68kAsmWriter];
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kAsmPrinter.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
new file mode 100644
index 0000000..a6fc58b
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
@@ -0,0 +1,113 @@
+//===----- M68kAsmPrinter.cpp - M68k LLVM Assembly Printer -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains a printer that converts from our internal representation
+/// of machine-dependent LLVM code to GAS-format M68k assembly language.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO Conform to Motorola ASM syntax
+
+#include "M68kAsmPrinter.h"
+
+#include "M68k.h"
+#include "M68kMachineFunction.h"
+#include "MCTargetDesc/M68kInstPrinter.h"
+#include "TargetInfo/M68kTargetInfo.h"
+
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "m68k-asm-printer"
+
+bool M68kAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ MMFI = MF.getInfo<M68kMachineFunctionInfo>();
+ MCInstLowering = std::make_unique<M68kMCInstLower>(MF, *this);
+ AsmPrinter::runOnMachineFunction(MF);
+ return true;
+}
+
+void M68kAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+ raw_ostream &OS) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ OS << "%" << M68kInstPrinter::getRegisterName(MO.getReg());
+ break;
+ case MachineOperand::MO_Immediate:
+ OS << '#' << MO.getImm();
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MO.getMBB()->getSymbol()->print(OS, MAI);
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ PrintSymbolOperand(MO, OS);
+ break;
+ case MachineOperand::MO_BlockAddress:
+ GetBlockAddressSymbol(MO.getBlockAddress())->print(OS, MAI);
+ break;
+ case MachineOperand::MO_ConstantPoolIndex: {
+ const DataLayout &DL = getDataLayout();
+ OS << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+ break;
+ }
+ default:
+ llvm_unreachable("not implemented");
+ }
+}
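+
+// Illustrative note (not from the original source): with the code above a
+// register operand is printed as e.g. "%d0", an immediate as "#42", and a
+// constant-pool index as "<prefix>CPI<function number>_<index>". The exact
+// spellings depend on the register names and MCAsmInfo in use and are shown
+// here only as a sketch.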
+
+bool M68kAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *ExtraCode, raw_ostream &OS) {
+ // Print the operand if there is no operand modifier.
+ if (!ExtraCode || !ExtraCode[0]) {
+ printOperand(MI, OpNo, OS);
+ return false;
+ }
+
+ // Fallback to the default implementation.
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS);
+}
+
+void M68kAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default: {
+ if (MI->isPseudo()) {
+ LLVM_DEBUG(dbgs() << "Pseudo opcode(" << MI->getOpcode()
+ << ") found in EmitInstruction()\n");
+ llvm_unreachable("Cannot proceed");
+ }
+ break;
+ }
+ case M68k::TAILJMPj:
+ case M68k::TAILJMPq:
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("TAILCALL");
+ break;
+ }
+
+ MCInst TmpInst0;
+ MCInstLowering->Lower(MI, TmpInst0);
+ OutStreamer->emitInstruction(TmpInst0, getSubtargetInfo());
+}
+
+void M68kAsmPrinter::emitFunctionBodyStart() {}
+
+void M68kAsmPrinter::emitFunctionBodyEnd() {}
+
+void M68kAsmPrinter::emitStartOfAsmFile(Module &M) {
+ OutStreamer->emitSyntaxDirective();
+}
+
+void M68kAsmPrinter::emitEndOfAsmFile(Module &M) {}
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kAsmPrinter() {
+ RegisterAsmPrinter<M68kAsmPrinter> X(getTheM68kTarget());
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kAsmPrinter.h b/src/llvm-project/llvm/lib/Target/M68k/M68kAsmPrinter.h
new file mode 100644
index 0000000..dff3bb8
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kAsmPrinter.h
@@ -0,0 +1,69 @@
+//===----- M68kAsmPrinter.h - M68k LLVM Assembly Printer -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains M68k assembler printer declarations.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_M68KASMPRINTER_H
+#define LLVM_LIB_TARGET_M68K_M68KASMPRINTER_H
+
+#include "M68kMCInstLower.h"
+#include "M68kTargetMachine.h"
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetMachine.h"
+#include <memory>
+#include <utility>
+
+namespace llvm {
+class MCStreamer;
+class MachineInstr;
+class MachineBasicBlock;
+class Module;
+class raw_ostream;
+
+class M68kSubtarget;
+class M68kMachineFunctionInfo;
+
+class LLVM_LIBRARY_VISIBILITY M68kAsmPrinter : public AsmPrinter {
+
+ void EmitInstrWithMacroNoAT(const MachineInstr *MI);
+
+ void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &OS);
+
+public:
+ const M68kSubtarget *Subtarget;
+ const M68kMachineFunctionInfo *MMFI;
+ std::unique_ptr<M68kMCInstLower> MCInstLowering;
+
+ explicit M68kAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {
+ Subtarget = static_cast<M68kTargetMachine &>(TM).getSubtargetImpl();
+ }
+
+ StringRef getPassName() const override { return "M68k Assembly Printer"; }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *ExtraCode, raw_ostream &OS) override;
+
+ void emitInstruction(const MachineInstr *MI) override;
+ void emitFunctionBodyStart() override;
+ void emitFunctionBodyEnd() override;
+ void emitStartOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
+};
+} // namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kCallingConv.h b/src/llvm-project/llvm/lib/Target/M68k/M68kCallingConv.h
new file mode 100644
index 0000000..18f72c9
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kCallingConv.h
@@ -0,0 +1,77 @@
+//===-- M68kCallingConv.h - M68k Custom CC Routines ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the custom routines for the M68k Calling Convention
+/// that aren't done by tablegen.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_M68KCALLINGCONV_H
+#define LLVM_LIB_TARGET_M68K_M68KCALLINGCONV_H
+
+#include "MCTargetDesc/M68kMCTargetDesc.h"
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Function.h"
+
+namespace llvm {
+
+/// Custom state to propagate LLVM type info to the register CC assigner
+class M68kCCState : public CCState {
+public:
+ const llvm::Function &F;
+
+ M68kCCState(const llvm::Function &F, CallingConv::ID CC, bool IsVarArg,
+ MachineFunction &MF, SmallVectorImpl<CCValAssign> &Locs,
+ LLVMContext &C)
+ : CCState(CC, IsVarArg, MF, Locs, C), F(F) {}
+};
+
+/// NOTE: this function is used to select registers for formal arguments and
+/// calls.
+/// FIXME: Handling of pointer arguments is not complete
+inline bool CC_M68k_Any_AssignToReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ M68kCCState CCInfo = static_cast<M68kCCState &>(State);
+
+ static const MCPhysReg DataRegList[] = {M68k::D0, M68k::D1, M68k::A0,
+ M68k::A1};
+
+  // Pointer arguments prefer the address (%a) registers
+ static const MCPhysReg AddrRegList[] = {
+ M68k::A0,
+ M68k::A1,
+ M68k::D0,
+ M68k::D1,
+ };
+
+ auto I = CCInfo.F.arg_begin();
+ int No = ValNo;
+ while (No > 0) {
+ No -= I->getType()->isIntegerTy(64) ? 2 : 1;
+ I++;
+ }
+
+ bool IsPtr = I != CCInfo.F.arg_end() && I->getType()->isPointerTy();
+
+ unsigned Reg =
+ IsPtr ? State.AllocateReg(AddrRegList) : State.AllocateReg(DataRegList);
+
+ if (Reg) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+
+ return false;
+}
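+
+// Worked example (illustration only, not part of the original code): for a
+// fastcc function such as
+//
+//   int f(int a, char *b, int c);
+//
+// the loop above maps each ValNo back to an IR argument, so 'a' takes the
+// first free entry of DataRegList (D0), 'b' is a pointer and takes the first
+// free entry of AddrRegList (A0), and 'c' gets the next free data register
+// (D1). An i64 argument occupies two value numbers, which is why the walk
+// subtracts 2 for it.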
+
+} // namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kCallingConv.td b/src/llvm-project/llvm/lib/Target/M68k/M68kCallingConv.td
new file mode 100644
index 0000000..360f2199
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kCallingConv.td
@@ -0,0 +1,119 @@
+//===-- M68kCallingConv.td - Calling Conventions for M68k --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This describes the calling conventions for the M68k architectures. These
+/// conventions assume an int to be 4 bytes wide and 4-byte aligned.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO Verify C convention follows SysV M68K ABI
+
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("static_cast<const M68kSubtarget &>"
+ "(State.getMachineFunction().getSubtarget()).", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+/// Return-value conventions common to all M68k CC's.
+def RetCC_M68kCommon : CallingConv<[
+]>;
+
+/// M68k C return convention.
+/// TODO: Return via address register
+def RetCC_M68k_C : CallingConv<[
+ CCIfType<[i1], CCPromoteToType<i8>>,
+ CCIfType<[i8], CCAssignToReg<[BD0, BD1]>>,
+ CCIfType<[i16], CCAssignToReg<[WD0, WD1]>>,
+ CCIfType<[i32], CCAssignToReg<[D0, D1]>>,
+ CCDelegateTo<RetCC_M68kCommon>
+]>;
+
+/// M68k fastcc return convention.
+/// This convention allows returning up to 16 bytes in registers, which can be
+/// split among 16 1-byte values or used for a single 16-byte value.
+/// TODO: Verify its functionality and write tests
+def RetCC_M68k_Fast : CallingConv<[
+ CCIfType<[i1], CCPromoteToType<i8>>,
+ CCIfType<[i8], CCAssignToReg<[BD0, BD1]>>,
+ CCIfType<[i16], CCAssignToReg<[WD0, WD1, WA0, WA1]>>,
+ CCIfType<[i32], CCAssignToReg<[D0, D1, A0, A1]>>,
+ CCDelegateTo<RetCC_M68kCommon>
+]>;
+
+/// This is the root return-value convention for the M68k backend.
+def RetCC_M68k : CallingConv<[
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_M68k_Fast>>,
+ CCDelegateTo<RetCC_M68k_C>
+]>;
+
+//===----------------------------------------------------------------------===//
+// M68k C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_M68k_Common - In all M68k calling conventions, extra integers and FP
+/// values are spilled on the stack.
+def CC_M68k_Common : CallingConv<[
+ /// Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ /// Integer values get stored in stack slots that are 4 bytes in
+ /// size and 4-byte aligned.
+ CCIfType<[i32], CCAssignToStack<4, 4>>
+]>;
+
+def CC_M68k_Fast : CallingConv<[
+ /// Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ /// The 'nest' parameter, if any, is passed in A1.
+ CCIfNest<CCAssignToReg<[A1]>>, // FIXME verify if this is correct
+
+  /// Since M68k uses %An for pointers and we want them to be passed in regs
+  /// too, we have to use a custom function.
+ CCIfType<[i32], CCCustom<"CC_M68k_Any_AssignToReg">>,
+
+ /// Otherwise, same as everything else.
+ CCDelegateTo<CC_M68k_Common>
+]>;
+
+def CC_M68k_C : CallingConv<[
+ /// Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ /// The 'nest' parameter, if any, is passed in A1.
+ CCIfNest<CCAssignToReg<[A1]>>, // FIXME verify if this is correct
+
+  /// Use registers only if 'inreg' is used and the call is not vararg
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[D0, D1]>>>>,
+
+ // TODO: Support for 'sret'
+
+ /// Otherwise, same as everything else.
+ CCDelegateTo<CC_M68k_Common>
+]>;
+
+/// This is the root argument convention for the M68k backend.
+def CC_M68k : CallingConv<[
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_M68k_Fast>>,
+ CCDelegateTo<CC_M68k_C>
+]>;
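+
+// Illustrative note (not from the original description): under the plain C
+// convention above, a non-vararg call such as `int f(int a, int b)` needs no
+// promotion (both arguments are already i32) and, unless they are marked
+// 'inreg', both end up in 4-byte stack slots via CC_M68k_Common; with 'inreg'
+// they would be assigned to D0 and D1 instead.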
+
+//===----------------------------------------------------------------------===//
+// Callee-saved Registers.
+//===----------------------------------------------------------------------===//
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+// A5 - BP
+// A6 - FP
+def CSR_STD : CalleeSavedRegs<(add D2, D3, D4, D5, D6, D7,
+ A2, A3, A4, A5, A6)>;
+
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp
new file mode 100644
index 0000000..4149ae9
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp
@@ -0,0 +1,307 @@
+//===----- M68kCollapseMOVEMPass.cpp - Collapse MOVEM pass ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// `MOVEM` is an instruction that moves multiple registers at a time according
+/// to the given mask. Thus sometimes it's pretty expensive.
+/// This file contains a pass that collapses sequential MOVEM instructions into
+/// a single one.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68k.h"
+#include "M68kFrameLowering.h"
+#include "M68kInstrInfo.h"
+#include "M68kMachineFunction.h"
+#include "M68kSubtarget.h"
+
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "M68k-collapse-movem"
+
+namespace {
+
+enum UpdateType { Ascending, Descending, Intermixed };
+
+/// An abstraction of the MOVEM chain currently being processed
+class MOVEMState {
+ MachineBasicBlock::iterator Begin;
+ MachineBasicBlock::iterator End;
+
+ unsigned Base;
+
+ int Start;
+ int Stop;
+
+ unsigned Mask;
+
+ enum class AccessTy { None, Load, Store };
+ AccessTy Access;
+
+public:
+ MOVEMState()
+ : Begin(nullptr), End(nullptr), Base(0), Start(INT_MIN), Stop(INT_MAX),
+ Mask(0), Access(AccessTy::None) {}
+
+ void setBegin(MachineBasicBlock::iterator &MI) {
+ assert(Begin == nullptr);
+ Begin = MI;
+ }
+
+ void setEnd(MachineBasicBlock::iterator &MI) {
+ assert(End == nullptr);
+ End = MI;
+ }
+
+ bool hasBase() const { return Base != 0; }
+
+ unsigned getBase() const {
+ assert(Base);
+ return Base;
+ }
+
+ MachineBasicBlock::iterator begin() {
+ assert(Begin != nullptr);
+ return Begin;
+ }
+
+ MachineBasicBlock::iterator end() {
+ assert(End != nullptr);
+ return End;
+ }
+
+ unsigned getMask() const { return Mask; }
+
+ void setBase(int Value) {
+ assert(!hasBase());
+ Base = Value;
+ }
+
+  // This must be called before the Mask is updated
+ UpdateType classifyUpdateByMask(unsigned NewMask) const {
+ assert(NewMask && "Mask needs to select at least one register");
+
+ if (NewMask > Mask) {
+ return Ascending;
+ } else if (NewMask < Mask) {
+ return Descending;
+ }
+
+ return Intermixed;
+ }
+
+ bool update(int O, int M) {
+ UpdateType Type = classifyUpdateByMask(M);
+ if (Type == Intermixed)
+ return false;
+ if (Start == INT_MIN) {
+ Start = Stop = O;
+ updateMask(M);
+ return true;
+ } else if (Type == Descending && O == Start - 4) {
+ Start -= 4;
+ updateMask(M);
+ return true;
+ } else if (Type == Ascending && O == Stop + 4) {
+ Stop += 4;
+ updateMask(M);
+ return true;
+ }
+
+ return false;
+ }
+
+ int getFinalOffset() const {
+ assert(
+ Start != INT_MIN &&
+ "MOVEM in control mode should increment the address in each iteration");
+ return Start;
+ }
+
+ bool updateMask(unsigned Value) {
+ assert(isUInt<16>(Value) && "Mask must fit 16 bit");
+ assert(!(Value & Mask) &&
+ "This is weird, there should be no intersections");
+ Mask |= Value;
+ return true;
+ }
+
+ void setLoad() { Access = AccessTy::Load; }
+ void setStore() { Access = AccessTy::Store; }
+
+ bool isLoad() const { return Access == AccessTy::Load; }
+ bool isStore() const { return Access == AccessTy::Store; }
+};
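+
+// Rough illustration (not from the original source) of the kind of chain this
+// state tracks: two single-register stores such as
+//
+//   movem.l %d2, (0,%sp)
+//   movem.l %d3, (4,%sp)
+//
+// share a base (%sp), have consecutive offsets and non-overlapping masks, so
+// the pass below can collapse them into a single instruction along the lines
+// of
+//
+//   movem.l %d2-%d3, (0,%sp)
+//
+// The assembly spelling is approximate; only the collapsing idea matters here.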
+
+/// This pass first walks through all the MOVEM instructions
+/// that are chained together and records each
+/// instruction's properties, like register mask and data
+/// access type, into a `MOVEMState` instance.
+/// Then we perform reduction / collapsing on this `MOVEMState`
+/// representation before creating a new `MOVEM` instruction
+/// based on the collapsed result, as well as removing
+/// redundant `MOVEM` instructions.
+class M68kCollapseMOVEM : public MachineFunctionPass {
+public:
+ static char ID;
+
+ const M68kSubtarget *STI;
+ const M68kInstrInfo *TII;
+ const M68kRegisterInfo *TRI;
+ const M68kMachineFunctionInfo *MFI;
+ const M68kFrameLowering *FL;
+
+ M68kCollapseMOVEM() : MachineFunctionPass(ID) {}
+
+ void Finish(MachineBasicBlock &MBB, MOVEMState &State) {
+ auto MI = State.begin();
+ auto End = State.end();
+ DebugLoc DL = MI->getDebugLoc();
+
+    // If there is only a single instruction, there is no need to replace it
+ if (std::next(MI) == End) {
+ State = MOVEMState();
+ return;
+ }
+
+    // Delete all the MOVEM instructions up to the end of the chain
+ while (MI != End) {
+ auto Next = std::next(MI);
+ MBB.erase(MI);
+ MI = Next;
+ }
+
+ // Add a unified one
+ if (State.isLoad()) {
+ BuildMI(MBB, End, DL, TII->get(M68k::MOVM32mp))
+ .addImm(State.getMask())
+ .addImm(State.getFinalOffset())
+ .addReg(State.getBase());
+ } else {
+ BuildMI(MBB, End, DL, TII->get(M68k::MOVM32pm))
+ .addImm(State.getFinalOffset())
+ .addReg(State.getBase())
+ .addImm(State.getMask());
+ }
+
+ State = MOVEMState();
+ }
+
+ bool ProcessMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MOVEMState &State, unsigned Mask, int Offset, unsigned Reg,
+ bool IsStore = false) {
+ if (State.hasBase()) {
+      // If the current Type, Reg, Offset and Mask are in the proper order then
+      // merge them into the state
+ MOVEMState Temp = State;
+ if (State.isStore() == IsStore && State.getBase() == Reg &&
+ State.update(Offset, Mask)) {
+ return true;
+        // Otherwise we Finish() processing of the current MOVEM sequence and
+        // start a new one
+ } else {
+ State = Temp;
+ State.setEnd(MI);
+ Finish(MBB, State);
+ return ProcessMI(MBB, MI, State, Mask, Offset, Reg, IsStore);
+ }
+      // If this is the first instruction in the sequence then initialize the State
+ } else if (Reg == TRI->getStackRegister() ||
+ Reg == TRI->getBaseRegister() ||
+ Reg == TRI->getFrameRegister(*MBB.getParent())) {
+ State.setBegin(MI);
+ State.setBase(Reg);
+ State.update(Offset, Mask);
+ IsStore ? State.setStore() : State.setLoad();
+ return true;
+ }
+ return false;
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ STI = &static_cast<const M68kSubtarget &>(MF.getSubtarget());
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
+ MFI = MF.getInfo<M68kMachineFunctionInfo>();
+ FL = STI->getFrameLowering();
+
+ bool Modified = false;
+
+ MOVEMState State;
+
+ unsigned Mask = 0;
+ unsigned Reg = 0;
+ int Offset = 0;
+
+ for (auto &MBB : MF) {
+ auto MI = MBB.begin(), E = MBB.end();
+ while (MI != E) {
+        // Processing might change the current instruction, so save the next one first
+ auto NMI = std::next(MI);
+ switch (MI->getOpcode()) {
+ default:
+ if (State.hasBase()) {
+ State.setEnd(MI);
+ Finish(MBB, State);
+ Modified = true;
+ }
+ break;
+ case M68k::MOVM32jm:
+ Mask = MI->getOperand(1).getImm();
+ Reg = MI->getOperand(0).getReg();
+ Offset = 0;
+ Modified |= ProcessMI(MBB, MI, State, Mask, Offset, Reg, true);
+ break;
+ case M68k::MOVM32pm:
+ Mask = MI->getOperand(2).getImm();
+ Reg = MI->getOperand(1).getReg();
+ Offset = MI->getOperand(0).getImm();
+ Modified |= ProcessMI(MBB, MI, State, Mask, Offset, Reg, true);
+ break;
+ case M68k::MOVM32mj:
+ Mask = MI->getOperand(0).getImm();
+ Reg = MI->getOperand(1).getReg();
+ Offset = 0;
+ Modified |= ProcessMI(MBB, MI, State, Mask, Offset, Reg, false);
+ break;
+ case M68k::MOVM32mp:
+ Mask = MI->getOperand(0).getImm();
+ Reg = MI->getOperand(2).getReg();
+ Offset = MI->getOperand(1).getImm();
+ Modified |= ProcessMI(MBB, MI, State, Mask, Offset, Reg, false);
+ break;
+ }
+ MI = NMI;
+ }
+
+ if (State.hasBase()) {
+ State.setEnd(MI);
+ Finish(MBB, State);
+ }
+ }
+
+ return Modified;
+ }
+
+ StringRef getPassName() const override { return "M68k MOVEM collapser pass"; }
+};
+
+char M68kCollapseMOVEM::ID = 0;
+} // anonymous namespace.
+
+/// Returns an instance of the MOVEM collapsing pass.
+FunctionPass *llvm::createM68kCollapseMOVEMPass() {
+ return new M68kCollapseMOVEM();
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kExpandPseudo.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
new file mode 100644
index 0000000..6a4aeaa
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
@@ -0,0 +1,320 @@
+//===-- M68kExpandPseudo.cpp - Expand pseudo instructions -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains a pass that expands pseudo instructions into target
+/// instructions to allow proper scheduling, if-conversion, other late
+/// optimizations, or simply the encoding of the instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68k.h"
+#include "M68kFrameLowering.h"
+#include "M68kInstrInfo.h"
+#include "M68kMachineFunction.h"
+#include "M68kSubtarget.h"
+
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved.
+#include "llvm/IR/GlobalValue.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "M68k-expand-pseudos"
+
+namespace {
+class M68kExpandPseudo : public MachineFunctionPass {
+public:
+ static char ID;
+ M68kExpandPseudo() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const M68kSubtarget *STI;
+ const M68kInstrInfo *TII;
+ const M68kRegisterInfo *TRI;
+ const M68kMachineFunctionInfo *MFI;
+ const M68kFrameLowering *FL;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "M68k pseudo instruction expansion pass";
+ }
+
+private:
+ bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+ bool ExpandMBB(MachineBasicBlock &MBB);
+};
+char M68kExpandPseudo::ID = 0;
+} // End anonymous namespace.
+
+/// If \p MBBI is a pseudo instruction, this method expands
+/// it to the corresponding (sequence of) actual instruction(s).
+/// \returns true if \p MBBI has been expanded.
+bool M68kExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ MachineInstr &MI = *MBBI;
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+ unsigned Opcode = MI.getOpcode();
+ DebugLoc DL = MBBI->getDebugLoc();
+  /// TODO: infer argument size to create fewer switch cases
+ switch (Opcode) {
+ default:
+ return false;
+
+ case M68k::MOVXd16d8:
+ return TII->ExpandMOVX_RR(MIB, MVT::i16, MVT::i8);
+ case M68k::MOVXd32d8:
+ return TII->ExpandMOVX_RR(MIB, MVT::i32, MVT::i8);
+ case M68k::MOVXd32d16:
+ return TII->ExpandMOVX_RR(MIB, MVT::i32, MVT::i16);
+
+ case M68k::MOVSXd16d8:
+ return TII->ExpandMOVSZX_RR(MIB, true, MVT::i16, MVT::i8);
+ case M68k::MOVSXd32d8:
+ return TII->ExpandMOVSZX_RR(MIB, true, MVT::i32, MVT::i8);
+ case M68k::MOVSXd32d16:
+ return TII->ExpandMOVSZX_RR(MIB, true, MVT::i32, MVT::i16);
+
+ case M68k::MOVZXd16d8:
+ return TII->ExpandMOVSZX_RR(MIB, false, MVT::i16, MVT::i8);
+ case M68k::MOVZXd32d8:
+ return TII->ExpandMOVSZX_RR(MIB, false, MVT::i32, MVT::i8);
+ case M68k::MOVZXd32d16:
+ return TII->ExpandMOVSZX_RR(MIB, false, MVT::i32, MVT::i16);
+
+ case M68k::MOVSXd16j8:
+ return TII->ExpandMOVSZX_RM(MIB, true, TII->get(M68k::MOV8dj), MVT::i16,
+ MVT::i8);
+ case M68k::MOVSXd32j8:
+ return TII->ExpandMOVSZX_RM(MIB, true, TII->get(M68k::MOV8dj), MVT::i32,
+ MVT::i8);
+ case M68k::MOVSXd32j16:
+ return TII->ExpandMOVSZX_RM(MIB, true, TII->get(M68k::MOV16rj), MVT::i32,
+ MVT::i16);
+
+ case M68k::MOVZXd16j8:
+ return TII->ExpandMOVSZX_RM(MIB, false, TII->get(M68k::MOV8dj), MVT::i16,
+ MVT::i8);
+ case M68k::MOVZXd32j8:
+ return TII->ExpandMOVSZX_RM(MIB, false, TII->get(M68k::MOV8dj), MVT::i32,
+ MVT::i8);
+ case M68k::MOVZXd32j16:
+ return TII->ExpandMOVSZX_RM(MIB, false, TII->get(M68k::MOV16rj), MVT::i32,
+ MVT::i16);
+
+ case M68k::MOVSXd16p8:
+ return TII->ExpandMOVSZX_RM(MIB, true, TII->get(M68k::MOV8dp), MVT::i16,
+ MVT::i8);
+ case M68k::MOVSXd32p8:
+ return TII->ExpandMOVSZX_RM(MIB, true, TII->get(M68k::MOV8dp), MVT::i32,
+ MVT::i8);
+ case M68k::MOVSXd32p16:
+ return TII->ExpandMOVSZX_RM(MIB, true, TII->get(M68k::MOV16rp), MVT::i32,
+ MVT::i16);
+
+ case M68k::MOVZXd16p8:
+ return TII->ExpandMOVSZX_RM(MIB, false, TII->get(M68k::MOV8dp), MVT::i16,
+ MVT::i8);
+ case M68k::MOVZXd32p8:
+ return TII->ExpandMOVSZX_RM(MIB, false, TII->get(M68k::MOV8dp), MVT::i32,
+ MVT::i8);
+ case M68k::MOVZXd32p16:
+ return TII->ExpandMOVSZX_RM(MIB, false, TII->get(M68k::MOV16rp), MVT::i32,
+ MVT::i16);
+
+ case M68k::MOVSXd16f8:
+ return TII->ExpandMOVSZX_RM(MIB, true, TII->get(M68k::MOV8df), MVT::i16,
+ MVT::i8);
+ case M68k::MOVSXd32f8:
+ return TII->ExpandMOVSZX_RM(MIB, true, TII->get(M68k::MOV8df), MVT::i32,
+ MVT::i8);
+ case M68k::MOVSXd32f16:
+ return TII->ExpandMOVSZX_RM(MIB, true, TII->get(M68k::MOV16rf), MVT::i32,
+ MVT::i16);
+
+ case M68k::MOVZXd16f8:
+ return TII->ExpandMOVSZX_RM(MIB, false, TII->get(M68k::MOV8df), MVT::i16,
+ MVT::i8);
+ case M68k::MOVZXd32f8:
+ return TII->ExpandMOVSZX_RM(MIB, false, TII->get(M68k::MOV8df), MVT::i32,
+ MVT::i8);
+ case M68k::MOVZXd32f16:
+ return TII->ExpandMOVSZX_RM(MIB, false, TII->get(M68k::MOV16rf), MVT::i32,
+ MVT::i16);
+
+ case M68k::MOV8cd:
+ return TII->ExpandCCR(MIB, /*IsToCCR=*/true);
+ case M68k::MOV8dc:
+ return TII->ExpandCCR(MIB, /*IsToCCR=*/false);
+
+ case M68k::MOVM8jm_P:
+ return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32jm), /*IsRM=*/false);
+ case M68k::MOVM16jm_P:
+ return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32jm), /*IsRM=*/false);
+ case M68k::MOVM32jm_P:
+ return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32jm), /*IsRM=*/false);
+
+ case M68k::MOVM8pm_P:
+ return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32pm), /*IsRM=*/false);
+ case M68k::MOVM16pm_P:
+ return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32pm), /*IsRM=*/false);
+ case M68k::MOVM32pm_P:
+ return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32pm), /*IsRM=*/false);
+
+ case M68k::MOVM8mj_P:
+ return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32mj), /*IsRM=*/true);
+ case M68k::MOVM16mj_P:
+ return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32mj), /*IsRM=*/true);
+ case M68k::MOVM32mj_P:
+ return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32mj), /*IsRM=*/true);
+
+ case M68k::MOVM8mp_P:
+ return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32mp), /*IsRM=*/true);
+ case M68k::MOVM16mp_P:
+ return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32mp), /*IsRM=*/true);
+ case M68k::MOVM32mp_P:
+ return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32mp), /*IsRM=*/true);
+
+ case M68k::TCRETURNq:
+ case M68k::TCRETURNj: {
+ MachineOperand &JumpTarget = MI.getOperand(0);
+ MachineOperand &StackAdjust = MI.getOperand(1);
+ assert(StackAdjust.isImm() && "Expecting immediate value.");
+
+ // Adjust stack pointer.
+ int StackAdj = StackAdjust.getImm();
+ int MaxTCDelta = MFI->getTCReturnAddrDelta();
+ int Offset = 0;
+ assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+
+    // Incorporate the retaddr area.
+ Offset = StackAdj - MaxTCDelta;
+ assert(Offset >= 0 && "Offset should never be negative");
+
+ if (Offset) {
+ // Check for possible merge with preceding ADD instruction.
+ Offset += FL->mergeSPUpdates(MBB, MBBI, true);
+ FL->emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
+ }
+
+ // Jump to label or value in register.
+ if (Opcode == M68k::TCRETURNq) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, DL, TII->get(M68k::TAILJMPq));
+ if (JumpTarget.isGlobal()) {
+ MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+ JumpTarget.getTargetFlags());
+ } else {
+ assert(JumpTarget.isSymbol());
+ MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+ JumpTarget.getTargetFlags());
+ }
+ } else {
+ BuildMI(MBB, MBBI, DL, TII->get(M68k::TAILJMPj))
+ .addReg(JumpTarget.getReg(), RegState::Kill);
+ }
+
+ MachineInstr &NewMI = *std::prev(MBBI);
+ NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
+
+ // Delete the pseudo instruction TCRETURN.
+ MBB.erase(MBBI);
+
+ return true;
+ }
+ case M68k::RET: {
+    // Adjust the stack by the amount encoded in the RET operand
+ int64_t StackAdj = MBBI->getOperand(0).getImm();
+ MachineInstrBuilder MIB;
+
+ if (StackAdj == 0) {
+ MIB = BuildMI(MBB, MBBI, DL, TII->get(M68k::RTS));
+ } else if (isUInt<16>(StackAdj)) {
+
+ if (STI->atLeastM68020()) {
+ llvm_unreachable("RTD is not implemented");
+ } else {
+        // Copy the return address from the stack to a free address register
+        // (A0 or A1)
+        // TODO: check that the pseudo expansion really uses a free address register
+ BuildMI(MBB, MBBI, DL, TII->get(M68k::MOV32aj), M68k::A1)
+ .addReg(M68k::SP);
+
+ // Adjust SP
+ FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true);
+
+ // Put the return address on stack
+ BuildMI(MBB, MBBI, DL, TII->get(M68k::MOV32ja))
+ .addReg(M68k::SP)
+ .addReg(M68k::A1);
+
+ // RTS
+ BuildMI(MBB, MBBI, DL, TII->get(M68k::RTS));
+ }
+ } else {
+ // TODO: RTD can only handle immediates as big as 2**16-1.
+ // If we need to pop off bytes before the return address, we
+ // must do it manually.
+ llvm_unreachable("Stack adjustment size not supported");
+ }
+
+ // FIXME: Can rest of the operands be ignored, if there is any?
+ MBB.erase(MBBI);
+ return true;
+ }
+ }
+ llvm_unreachable("Previous switch has a fallthrough?");
+}
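+
+// Illustrative note (not part of the original code): on pre-68020 parts, where
+// RTD is not available, the M68k::RET expansion above with a non-zero stack
+// adjustment emits roughly
+//
+//   move.l (%sp), %a1      ; park the return address in a free address reg
+//   <adjust %sp by StackAdj>
+//   move.l %a1, (%sp)      ; put the return address back on top
+//   rts
+//
+// so the callee-popped bytes are removed without losing the return address.
+// The mnemonics are approximate; the BuildMI calls above are authoritative.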
+
+/// Expand all pseudo instructions contained in \p MBB.
+/// \returns true if any expansion occurred for \p MBB.
+bool M68kExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ // MBBI may be invalidated by the expansion.
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ Modified |= ExpandMI(MBB, MBBI);
+ MBBI = NMBBI;
+ }
+
+ return Modified;
+}
+
+bool M68kExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+ STI = &static_cast<const M68kSubtarget &>(MF.getSubtarget());
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
+ MFI = MF.getInfo<M68kMachineFunctionInfo>();
+ FL = STI->getFrameLowering();
+
+ bool Modified = false;
+ for (MachineBasicBlock &MBB : MF)
+ Modified |= ExpandMBB(MBB);
+ return Modified;
+}
+
+/// Returns an instance of the pseudo instruction expansion pass.
+FunctionPass *llvm::createM68kExpandPseudoPass() {
+ return new M68kExpandPseudo();
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kFrameLowering.cpp
new file mode 100644
index 0000000..26262b9
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kFrameLowering.cpp
@@ -0,0 +1,896 @@
+//===-- M68kFrameLowering.cpp - M68k Frame Information ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the M68k implementation of TargetFrameLowering class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68kFrameLowering.h"
+
+#include "M68kInstrBuilder.h"
+#include "M68kInstrInfo.h"
+#include "M68kMachineFunction.h"
+#include "M68kSubtarget.h"
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+M68kFrameLowering::M68kFrameLowering(const M68kSubtarget &STI, Align Alignment)
+ : TargetFrameLowering(StackGrowsDown, Alignment, -4), STI(STI),
+ TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
+ SlotSize = STI.getSlotSize();
+ StackPtr = TRI->getStackRegister();
+}
+
+bool M68kFrameLowering::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+
+ return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+ TRI->hasStackRealignment(MF);
+}
+
+// FIXME Make sure no other factors prevent us from reserving call frame
+bool M68kFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ return !MF.getFrameInfo().hasVarSizedObjects() &&
+ !MF.getInfo<M68kMachineFunctionInfo>()->getHasPushSequences();
+}
+
+bool M68kFrameLowering::canSimplifyCallFramePseudos(
+ const MachineFunction &MF) const {
+ return hasReservedCallFrame(MF) ||
+ (hasFP(MF) && !TRI->hasStackRealignment(MF)) ||
+ TRI->hasBasePointer(MF);
+}
+
+bool M68kFrameLowering::needsFrameIndexResolution(
+ const MachineFunction &MF) const {
+ return MF.getFrameInfo().hasStackObjects() ||
+ MF.getInfo<M68kMachineFunctionInfo>()->getHasPushSequences();
+}
+
+// NOTE: this only has a subset of the full frame index logic. In
+// particular, the FI < 0 and AfterFPPop logic is handled in
+// M68kRegisterInfo::eliminateFrameIndex, but not here. Possibly
+// (probably?) it should be moved into here.
+StackOffset
+M68kFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // We can't calculate offset from frame pointer if the stack is realigned,
+ // so enforce usage of stack/base pointer. The base pointer is used when we
+ // have dynamic allocas in addition to dynamic realignment.
+ if (TRI->hasBasePointer(MF))
+ FrameReg = TRI->getBaseRegister();
+ else if (TRI->hasStackRealignment(MF))
+ FrameReg = TRI->getStackRegister();
+ else
+ FrameReg = TRI->getFrameRegister(MF);
+
+ // Offset will hold the offset from the stack pointer at function entry to the
+ // object.
+ // We need to factor in additional offsets applied during the prologue to the
+ // frame, base, and stack pointer depending on which is used.
+ int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
+ const M68kMachineFunctionInfo *MMFI = MF.getInfo<M68kMachineFunctionInfo>();
+ uint64_t StackSize = MFI.getStackSize();
+ bool HasFP = hasFP(MF);
+
+ // TODO: Support tail calls
+ if (TRI->hasBasePointer(MF)) {
+ assert(HasFP && "VLAs and dynamic stack realign, but no FP?!");
+ if (FI < 0) {
+ // Skip the saved FP.
+ return StackOffset::getFixed(Offset + SlotSize);
+ }
+
+ assert((-(Offset + StackSize)) % MFI.getObjectAlign(FI).value() == 0);
+ return StackOffset::getFixed(Offset + StackSize);
+ }
+ if (TRI->hasStackRealignment(MF)) {
+ if (FI < 0) {
+ // Skip the saved FP.
+ return StackOffset::getFixed(Offset + SlotSize);
+ }
+
+ assert((-(Offset + StackSize)) % MFI.getObjectAlign(FI).value() == 0);
+ return StackOffset::getFixed(Offset + StackSize);
+ }
+
+ if (!HasFP)
+ return StackOffset::getFixed(Offset + StackSize);
+
+ // Skip the saved FP.
+ Offset += SlotSize;
+
+ // Skip the RETADDR move area
+ int TailCallReturnAddrDelta = MMFI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0)
+ Offset -= TailCallReturnAddrDelta;
+
+ return StackOffset::getFixed(Offset);
+}
+
+/// Return a caller-saved register that isn't live
+/// when it reaches the "return" instruction. We can then pop a stack object
+/// to this register without worrying about clobbering it.
+static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const M68kRegisterInfo *TRI) {
+ const MachineFunction *MF = MBB.getParent();
+ if (MF->callsEHReturn())
+ return 0;
+
+ const TargetRegisterClass &AvailableRegs = *TRI->getRegsForTailCall(*MF);
+
+ if (MBBI == MBB.end())
+ return 0;
+
+ switch (MBBI->getOpcode()) {
+ default:
+ return 0;
+ case TargetOpcode::PATCHABLE_RET:
+ case M68k::RET: {
+ SmallSet<uint16_t, 8> Uses;
+
+ for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MBBI->getOperand(i);
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ Uses.insert(*AI);
+ }
+
+ for (auto CS : AvailableRegs)
+ if (!Uses.count(CS))
+ return CS;
+ }
+ }
+
+ return 0;
+}
+
+static bool isRegLiveIn(MachineBasicBlock &MBB, unsigned Reg) {
+ return llvm::any_of(MBB.liveins(),
+ [Reg](MachineBasicBlock::RegisterMaskPair RegMask) {
+ return RegMask.PhysReg == Reg;
+ });
+}
+
+uint64_t
+M68kFrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ uint64_t MaxAlign = MFI.getMaxAlign().value(); // Desired stack alignment.
+ unsigned StackAlign = getStackAlignment(); // ABI alignment
+ if (MF.getFunction().hasFnAttribute("stackrealign")) {
+ if (MFI.hasCalls())
+ MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
+ else if (MaxAlign < SlotSize)
+ MaxAlign = SlotSize;
+ }
+ return MaxAlign;
+}
+
+void M68kFrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned Reg,
+ uint64_t MaxAlign) const {
+ uint64_t Val = -MaxAlign;
+ unsigned AndOp = M68k::AND32di;
+ unsigned MovOp = M68k::MOV32rr;
+
+  // This function is normally used with SP, which is an address register, but
+  // AND (like the other logical instructions on M68k) does not operate on
+  // address registers, so we need a temporary data register to perform the op.
+ unsigned Tmp = M68k::D0;
+
+ BuildMI(MBB, MBBI, DL, TII.get(MovOp), Tmp)
+ .addReg(Reg)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Tmp)
+ .addReg(Tmp)
+ .addImm(Val)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // The CCR implicit def is dead.
+ MI->getOperand(3).setIsDead();
+
+ BuildMI(MBB, MBBI, DL, TII.get(MovOp), Reg)
+ .addReg(Tmp)
+ .setMIFlag(MachineInstr::FrameSetup);
+}
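+
+// Sketch of the emitted sequence (illustration only, mnemonics approximate):
+//
+//   move.l %sp, %d0
+//   and.l  #-MaxAlign, %d0
+//   move.l %d0, %sp
+//
+// i.e. the stack pointer is round-tripped through D0 because the logical AND
+// cannot target an address register directly.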
+
+MachineBasicBlock::iterator M68kFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ bool ReserveCallFrame = hasReservedCallFrame(MF);
+ unsigned Opcode = I->getOpcode();
+ bool IsDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+ DebugLoc DL = I->getDebugLoc();
+ uint64_t Amount = !ReserveCallFrame ? I->getOperand(0).getImm() : 0;
+ uint64_t InternalAmt = (IsDestroy && Amount) ? I->getOperand(1).getImm() : 0;
+ I = MBB.erase(I);
+
+ if (!ReserveCallFrame) {
+    // If the stack pointer can be changed after prologue, turn the
+    // adjcallstackdown instruction into a 'sub %SP, <amt>' and the
+    // adjcallstackup instruction into an 'add %SP, <amt>'
+
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned StackAlign = getStackAlignment();
+ Amount = alignTo(Amount, StackAlign);
+
+ MachineModuleInfo &MMI = MF.getMMI();
+ const auto &Fn = MF.getFunction();
+ bool DwarfCFI = MMI.hasDebugInfo() || Fn.needsUnwindTableEntry();
+
+ // If we have any exception handlers in this function, and we adjust
+ // the SP before calls, we may need to indicate this to the unwinder
+ // using GNU_ARGS_SIZE. Note that this may be necessary even when
+ // Amount == 0, because the preceding function may have set a non-0
+ // GNU_ARGS_SIZE.
+ // TODO: We don't need to reset this between subsequent functions,
+ // if it didn't change.
+ bool HasDwarfEHHandlers = !MF.getLandingPads().empty();
+
+ if (HasDwarfEHHandlers && !IsDestroy &&
+ MF.getInfo<M68kMachineFunctionInfo>()->getHasPushSequences()) {
+ BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
+ }
+
+ if (Amount == 0)
+ return I;
+
+ // Factor out the amount that gets handled inside the sequence
+ // (Pushes of argument for frame setup, callee pops for frame destroy)
+ Amount -= InternalAmt;
+
+ // TODO: This is needed only if we require precise CFA.
+ // If this is a callee-pop calling convention, emit a CFA adjust for
+ // the amount the callee popped.
+ if (IsDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
+ BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
+
+ // Add Amount to SP to destroy a frame, or subtract to setup.
+ int64_t StackAdjustment = IsDestroy ? Amount : -Amount;
+ int64_t CfaAdjustment = -StackAdjustment;
+
+ if (StackAdjustment) {
+      // Merge with any previous or following adjustment instruction. Note: the
+      // instructions merged here do not have CFI, so their stack
+      // adjustments do not feed into CfaAdjustment.
+ StackAdjustment += mergeSPUpdates(MBB, I, true);
+ StackAdjustment += mergeSPUpdates(MBB, I, false);
+
+ if (StackAdjustment) {
+ BuildStackAdjustment(MBB, I, DL, StackAdjustment, false);
+ }
+ }
+
+ if (DwarfCFI && !hasFP(MF)) {
+ // If we don't have FP, but need to generate unwind information,
+ // we need to set the correct CFA offset after the stack adjustment.
+ // How much we adjust the CFA offset depends on whether we're emitting
+ // CFI only for EH purposes or for debugging. EH only requires the CFA
+ // offset to be correct at each call site, while for debugging we want
+ // it to be more precise.
+
+ // TODO: When not using precise CFA, we also need to adjust for the
+ // InternalAmt here.
+ if (CfaAdjustment) {
+ BuildCFI(
+ MBB, I, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment));
+ }
+ }
+
+ return I;
+ }
+
+ if (IsDestroy && InternalAmt) {
+ // If we are performing frame pointer elimination and if the callee pops
+ // something off the stack pointer, add it back. We do this until we have
+ // more advanced stack pointer tracking ability.
+    // We are not tracking the stack pointer adjustment by the callee, so make
+    // sure we restore the stack pointer immediately after the call; there may
+    // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
+ MachineBasicBlock::iterator CI = I;
+ MachineBasicBlock::iterator B = MBB.begin();
+ while (CI != B && !std::prev(CI)->isCall())
+ --CI;
+ BuildStackAdjustment(MBB, CI, DL, -InternalAmt, /*InEpilogue=*/false);
+ }
+
+ return I;
+}
+
+/// Emit a series of instructions to increment / decrement the stack pointer by
+/// a constant value.
+void M68kFrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ int64_t NumBytes, bool InEpilogue) const {
+ bool IsSub = NumBytes < 0;
+ uint64_t Offset = IsSub ? -NumBytes : NumBytes;
+
+ uint64_t Chunk = (1LL << 31) - 1;
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+ while (Offset) {
+ if (Offset > Chunk) {
+ // Rather than emit a long series of instructions for large offsets,
+ // load the offset into a register and do one sub/add
+ Register Reg;
+
+ if (IsSub && !isRegLiveIn(MBB, M68k::D0))
+ Reg = M68k::D0;
+ else
+ Reg = findDeadCallerSavedReg(MBB, MBBI, TRI);
+
+ if (Reg) {
+ unsigned Opc = M68k::MOV32ri;
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg).addImm(Offset);
+ Opc = IsSub ? M68k::SUB32rr : M68k::ADD32rr;
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addReg(Reg);
+ // ??? still no CCR
+ MI->getOperand(3).setIsDead(); // The CCR implicit def is dead.
+ Offset = 0;
+ continue;
+ }
+ }
+
+ uint64_t ThisVal = std::min(Offset, Chunk);
+
+ MachineInstrBuilder MI = BuildStackAdjustment(
+ MBB, MBBI, DL, IsSub ? -ThisVal : ThisVal, InEpilogue);
+ if (IsSub)
+ MI.setMIFlag(MachineInstr::FrameSetup);
+ else
+ MI.setMIFlag(MachineInstr::FrameDestroy);
+
+ Offset -= ThisVal;
+ }
+}
+
+int M68kFrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ bool MergeWithPrevious) const {
+ if ((MergeWithPrevious && MBBI == MBB.begin()) ||
+ (!MergeWithPrevious && MBBI == MBB.end()))
+ return 0;
+
+ MachineBasicBlock::iterator PI = MergeWithPrevious ? std::prev(MBBI) : MBBI;
+ MachineBasicBlock::iterator NI =
+ MergeWithPrevious ? nullptr : std::next(MBBI);
+ unsigned Opc = PI->getOpcode();
+ int Offset = 0;
+
+ if (!MergeWithPrevious && NI != MBB.end() &&
+ NI->getOpcode() == TargetOpcode::CFI_INSTRUCTION) {
+ // Don't merge with the next instruction if it has CFI.
+ return Offset;
+ }
+
+ if (Opc == M68k::ADD32ri && PI->getOperand(0).getReg() == StackPtr) {
+ assert(PI->getOperand(1).getReg() == StackPtr);
+ Offset += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!MergeWithPrevious)
+ MBBI = NI;
+ } else if (Opc == M68k::SUB32ri && PI->getOperand(0).getReg() == StackPtr) {
+ assert(PI->getOperand(1).getReg() == StackPtr);
+ Offset -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!MergeWithPrevious)
+ MBBI = NI;
+ }
+
+ return Offset;
+}
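+
+// Illustrative example (not part of the original code): if the instruction
+// just before the insertion point is
+//
+//   ADD32ri %sp, %sp, 8
+//
+// then mergeSPUpdates(MBB, MBBI, /*MergeWithPrevious=*/true) erases it and
+// returns +8, which the caller folds into its own adjustment; a matching
+// SUB32ri would contribute -8 instead.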
+
+MachineInstrBuilder M68kFrameLowering::BuildStackAdjustment(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, int64_t Offset, bool InEpilogue) const {
+ assert(Offset != 0 && "zero offset stack adjustment requested");
+
+ // TODO can `lea` be used to adjust stack?
+
+ bool IsSub = Offset < 0;
+ uint64_t AbsOffset = IsSub ? -Offset : Offset;
+ unsigned Opc = IsSub ? M68k::SUB32ri : M68k::ADD32ri;
+
+ MachineInstrBuilder MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(AbsOffset);
+ // FIXME Update CCR as well. For now we just
+ // conservatively say CCR implicit def is dead
+ MI->getOperand(3).setIsDead();
+ return MI;
+}
+
+void M68kFrameLowering::BuildCFI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ const MCCFIInstruction &CFIInst) const {
+ MachineFunction &MF = *MBB.getParent();
+ unsigned CFIIndex = MF.addFrameInst(CFIInst);
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+}
+
+void M68kFrameLowering::emitPrologueCalleeSavedFrameMoves(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+
+ // Add callee saved registers to move list.
+ const auto &CSI = MFI.getCalleeSavedInfo();
+ if (CSI.empty())
+ return;
+
+ // Calculate offsets.
+ for (const auto &I : CSI) {
+ int64_t Offset = MFI.getObjectOffset(I.getFrameIdx());
+ unsigned Reg = I.getReg();
+
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+ }
+}
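+
+// Illustrative output (register names and offsets are hypothetical): for two
+// callee-saved registers spilled at frame offsets -8 and -12, the loop above
+// emits CFI equivalent to
+//   .cfi_offset <dwarf(%d2)>, -8
+//   .cfi_offset <dwarf(%d3)>, -12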
+
+void M68kFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ assert(&STI == &MF.getSubtarget<M68kSubtarget>() &&
+ "MF used frame lowering for wrong subtarget");
+
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const auto &Fn = MF.getFunction();
+ MachineModuleInfo &MMI = MF.getMMI();
+ M68kMachineFunctionInfo *MMFI = MF.getInfo<M68kMachineFunctionInfo>();
+ uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
+ uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
+ bool HasFP = hasFP(MF);
+ bool NeedsDwarfCFI = MMI.hasDebugInfo() || Fn.needsUnwindTableEntry();
+ unsigned FramePtr = TRI->getFrameRegister(MF);
+ const unsigned MachineFramePtr = FramePtr;
+ unsigned BasePtr = TRI->getBaseRegister();
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
+
+ // Add RETADDR move area to callee saved frame size.
+ int TailCallReturnAddrDelta = MMFI->getTCReturnAddrDelta();
+
+ if (TailCallReturnAddrDelta < 0) {
+ MMFI->setCalleeSavedFrameSize(MMFI->getCalleeSavedFrameSize() -
+ TailCallReturnAddrDelta);
+ }
+
+ // Insert stack pointer adjustment for later moving of return addr. Only
+ // applies to tail call optimized functions where the callee argument stack
+ // size is bigger than the callers.
+ if (TailCallReturnAddrDelta < 0) {
+ BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta,
+ /*InEpilogue=*/false)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Mapping for machine moves:
+ //
+ // DST: VirtualFP AND
+ // SRC: VirtualFP => DW_CFA_def_cfa_offset
+ // ELSE => DW_CFA_def_cfa
+ //
+ // SRC: VirtualFP AND
+ // DST: Register => DW_CFA_def_cfa_register
+ //
+ // ELSE
+ // OFFSET < 0 => DW_CFA_offset_extended_sf
+ // REG < 64 => DW_CFA_offset + Reg
+ // ELSE => DW_CFA_offset_extended
+
+ uint64_t NumBytes = 0;
+ int stackGrowth = -SlotSize;
+
+ if (HasFP) {
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ // If required, include space for extra hidden slot for stashing base
+ // pointer.
+ if (MMFI->getRestoreBasePointer())
+ FrameSize += SlotSize;
+
+ NumBytes = FrameSize - MMFI->getCalleeSavedFrameSize();
+
+ // Callee-saved registers are pushed on stack before the stack is realigned.
+ if (TRI->hasStackRealignment(MF))
+ NumBytes = alignTo(NumBytes, MaxAlign);
+
+ // Get the offset of the stack slot for the FP register, which is
+ // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+ // Update the frame offset adjustment.
+ MFI.setOffsetAdjustment(-NumBytes);
+
+ // Save FP into the appropriate stack slot.
+ BuildMI(MBB, MBBI, DL, TII.get(M68k::PUSH32r))
+ .addReg(MachineFramePtr, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (NeedsDwarfCFI) {
+ // Mark the place where FP was saved.
+ // Define the current CFA rule to use the provided offset.
+ assert(StackSize);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, 2 * stackGrowth));
+
+ // Change the rule for the FramePtr to be an "offset" rule.
+ int DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ assert(DwarfFramePtr > 0);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createOffset(nullptr, DwarfFramePtr,
+ 2 * stackGrowth));
+ }
+
+ // Update FP with the new base value.
+ BuildMI(MBB, MBBI, DL, TII.get(M68k::MOV32aa), FramePtr)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (NeedsDwarfCFI) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ // Define the current CFA to use the FP register.
+ unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
+ }
+
+ // Mark the FramePtr as live-in in every block. Don't do this again for
+ // funclet prologues.
+ for (MachineBasicBlock &EveryMBB : MF)
+ EveryMBB.addLiveIn(MachineFramePtr);
+ } else {
+ NumBytes = StackSize - MMFI->getCalleeSavedFrameSize();
+ }
+
+ // Skip the callee-saved push instructions.
+ bool PushedRegs = false;
+ int StackOffset = 2 * stackGrowth;
+
+ while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup) &&
+ MBBI->getOpcode() == M68k::PUSH32r) {
+ PushedRegs = true;
+ ++MBBI;
+
+ if (!HasFP && NeedsDwarfCFI) {
+ // Mark callee-saved push instruction.
+ // Define the current CFA rule to use the provided offset.
+ assert(StackSize);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, StackOffset));
+ StackOffset += stackGrowth;
+ }
+ }
+
+ // Realign stack after we pushed callee-saved registers (so that we'll be
+ // able to calculate their offsets from the frame pointer).
+ if (TRI->hasStackRealignment(MF)) {
+ assert(HasFP && "There should be a frame pointer if stack is realigned.");
+ BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
+ }
+
+  // If there is a SUB32ri of SP immediately before this instruction, merge
+  // the two. This can be the case when tail call elimination is enabled and
+  // the callee has more arguments than the caller.
+ NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+
+ // Adjust stack pointer: ESP -= numbytes.
+ emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false);
+
+ unsigned SPOrEstablisher = StackPtr;
+
+ // If we need a base pointer, set it up here. It's whatever the value
+ // of the stack pointer is at this point. Any variable size objects
+ // will be allocated after this, so we can still use the base pointer
+ // to reference locals.
+ if (TRI->hasBasePointer(MF)) {
+ // Update the base pointer with the current stack pointer.
+ BuildMI(MBB, MBBI, DL, TII.get(M68k::MOV32aa), BasePtr)
+ .addReg(SPOrEstablisher)
+ .setMIFlag(MachineInstr::FrameSetup);
+ if (MMFI->getRestoreBasePointer()) {
+ // Stash value of base pointer. Saving SP instead of FP shortens
+ // dependence chain. Used by SjLj EH.
+ unsigned Opm = M68k::MOV32ja;
+ M68k::addRegIndirectWithDisp(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
+ FramePtr, true,
+ MMFI->getRestoreBasePointerOffset())
+ .addReg(SPOrEstablisher)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
+ // Mark end of stack pointer adjustment.
+ if (!HasFP && NumBytes) {
+ // Define the current CFA rule to use the provided offset.
+ assert(StackSize);
+ BuildCFI(
+ MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackSize + stackGrowth));
+ }
+
+ // Emit DWARF info specifying the offsets of the callee-saved registers.
+ if (PushedRegs)
+ emitPrologueCalleeSavedFrameMoves(MBB, MBBI, DL);
+ }
+
+  // TODO Interrupt handlers
+  // An M68k interrupt handling function cannot assume anything about the
+  // direction flag (DF in the CCR register). Clear this flag by emitting a
+  // "cld" instruction in the prologue of each interrupt handler function. The
+  // "cld" instruction should only be emitted in these cases:
+  // 1. The interrupt handling function uses any of the "rep" instructions.
+  // 2. The interrupt handling function calls another function.
+}
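+
+// Rough shape of the emitted prologue for a function with a frame pointer
+// (sketch; operand order is approximate and NumBytes is a placeholder):
+//   PUSH32r  %fp                 ; save the old frame pointer
+//   MOV32aa  %fp, %sp            ; establish the new frame pointer
+//   MOVM32pm <mask>, <slot>      ; callee-saved spills (see
+//                                ; spillCalleeSavedRegisters below)
+//   SUB32ri  %sp, %sp, NumBytes  ; allocate locals via emitSPUpdate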
+
+static bool isTailCallOpcode(unsigned Opc) {
+ return Opc == M68k::TCRETURNj || Opc == M68k::TCRETURNq;
+}
+
+void M68kFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ M68kMachineFunctionInfo *MMFI = MF.getInfo<M68kMachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ Optional<unsigned> RetOpcode;
+ if (MBBI != MBB.end())
+ RetOpcode = MBBI->getOpcode();
+ DebugLoc DL;
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+ unsigned FramePtr = TRI->getFrameRegister(MF);
+ unsigned MachineFramePtr = FramePtr;
+
+ // Get the number of bytes to allocate from the FrameInfo.
+ uint64_t StackSize = MFI.getStackSize();
+ uint64_t MaxAlign = calculateMaxStackAlign(MF);
+ unsigned CSSize = MMFI->getCalleeSavedFrameSize();
+ uint64_t NumBytes = 0;
+
+ if (hasFP(MF)) {
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ NumBytes = FrameSize - CSSize;
+
+ // Callee-saved registers were pushed on stack before the stack was
+ // realigned.
+ if (TRI->hasStackRealignment(MF))
+ NumBytes = alignTo(FrameSize, MaxAlign);
+
+ // Pop FP.
+ BuildMI(MBB, MBBI, DL, TII.get(M68k::POP32r), MachineFramePtr)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ } else {
+ NumBytes = StackSize - CSSize;
+ }
+
+ // Skip the callee-saved pop instructions.
+ while (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PI = std::prev(MBBI);
+ unsigned Opc = PI->getOpcode();
+
+ if ((Opc != M68k::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
+ Opc != M68k::DBG_VALUE && !PI->isTerminator())
+ break;
+
+ --MBBI;
+ }
+ MachineBasicBlock::iterator FirstCSPop = MBBI;
+
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+
+ // If there is an ADD32ri or SUB32ri of SP immediately before this
+ // instruction, merge the two instructions.
+ if (NumBytes || MFI.hasVarSizedObjects())
+ NumBytes += mergeSPUpdates(MBB, MBBI, true);
+
+  // If dynamic alloca is used, then reset SP to point to the last callee-saved
+  // slot before popping them off. The same applies when the stack was
+  // realigned. Don't do this if this was a funclet epilogue, since the funclets
+  // will not do realignment or dynamic stack allocation.
+ if ((TRI->hasStackRealignment(MF) || MFI.hasVarSizedObjects())) {
+ if (TRI->hasStackRealignment(MF))
+ MBBI = FirstCSPop;
+ uint64_t LEAAmount = -CSSize;
+
+ // 'move %FramePtr, SP' will not be recognized as an epilogue sequence.
+ // However, we may use this sequence if we have a frame pointer because the
+ // effects of the prologue can safely be undone.
+ if (LEAAmount != 0) {
+ unsigned Opc = M68k::LEA32p;
+ M68k::addRegIndirectWithDisp(
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr, false,
+ LEAAmount);
+ --MBBI;
+ } else {
+ unsigned Opc = (M68k::MOV32rr);
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr).addReg(FramePtr);
+ --MBBI;
+ }
+ } else if (NumBytes) {
+ // Adjust stack pointer back: SP += numbytes.
+ emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true);
+ --MBBI;
+ }
+
+ if (!RetOpcode || !isTailCallOpcode(*RetOpcode)) {
+ // Add the return addr area delta back since we are not tail calling.
+ int Offset = -1 * MMFI->getTCReturnAddrDelta();
+ assert(Offset >= 0 && "TCDelta should never be positive");
+ if (Offset) {
+ MBBI = MBB.getFirstTerminator();
+
+ // Check for possible merge with preceding ADD instruction.
+ Offset += mergeSPUpdates(MBB, MBBI, true);
+ emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
+ }
+ }
+}
+
+void M68kFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ M68kMachineFunctionInfo *M68kFI = MF.getInfo<M68kMachineFunctionInfo>();
+ int64_t TailCallReturnAddrDelta = M68kFI->getTCReturnAddrDelta();
+
+ if (TailCallReturnAddrDelta < 0) {
+ // create RETURNADDR area
+ // arg
+ // arg
+ // RETADDR
+ // { ...
+ // RETADDR area
+ // ...
+ // }
+ // [FP]
+ MFI.CreateFixedObject(-TailCallReturnAddrDelta,
+ TailCallReturnAddrDelta - SlotSize, true);
+ }
+
+ // Spill the BasePtr if it's used.
+ if (TRI->hasBasePointer(MF)) {
+ SavedRegs.set(TRI->getBaseRegister());
+ }
+}
+
+bool M68kFrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ M68kMachineFunctionInfo *M68kFI = MF.getInfo<M68kMachineFunctionInfo>();
+
+ int SpillSlotOffset = getOffsetOfLocalArea() + M68kFI->getTCReturnAddrDelta();
+
+ if (hasFP(MF)) {
+    // emitPrologue always spills the frame register first.
+ SpillSlotOffset -= SlotSize;
+ MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+
+ // Since emitPrologue and emitEpilogue will handle spilling and restoring of
+ // the frame register, we can delete it from CSI list and not have to worry
+ // about avoiding it later.
+ unsigned FPReg = TRI->getFrameRegister(MF);
+ for (unsigned i = 0, e = CSI.size(); i < e; ++i) {
+ if (TRI->regsOverlap(CSI[i].getReg(), FPReg)) {
+ CSI.erase(CSI.begin() + i);
+ break;
+ }
+ }
+ }
+
+  // The rest is handled by the generic spill slot assignment.
+ return false;
+}
+
+bool M68kFrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+ auto &MRI = *static_cast<const M68kRegisterInfo *>(TRI);
+ auto DL = MBB.findDebugLoc(MI);
+
+ int FI = 0;
+ unsigned Mask = 0;
+ for (const auto &Info : CSI) {
+ FI = std::max(FI, Info.getFrameIdx());
+ unsigned Reg = Info.getReg();
+ unsigned Shift = MRI.getSpillRegisterOrder(Reg);
+ Mask |= 1 << Shift;
+ }
+
+ auto I =
+ M68k::addFrameReference(BuildMI(MBB, MI, DL, TII.get(M68k::MOVM32pm)), FI)
+ .addImm(Mask)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // Append implicit registers and mem locations
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineRegisterInfo &RI = MF.getRegInfo();
+ for (const auto &Info : CSI) {
+ unsigned Reg = Info.getReg();
+ bool IsLiveIn = RI.isLiveIn(Reg);
+ if (!IsLiveIn)
+ MBB.addLiveIn(Reg);
+ I.addReg(Reg, IsLiveIn ? RegState::Implicit : RegState::ImplicitKill);
+ M68k::addMemOperand(I, Info.getFrameIdx(), 0);
+ }
+
+ return true;
+}
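+
+// Example of the mask computation above (spill-order values are hypothetical):
+// if %d2 maps to spill order 2 and %a2 to spill order 10, the loop yields
+// Mask = (1 << 2) | (1 << 10) = 0x0404, which becomes the immediate operand of
+// the single MOVM32pm store that spills both registers at once.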
+
+bool M68kFrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+ auto &MRI = *static_cast<const M68kRegisterInfo *>(TRI);
+ auto DL = MBB.findDebugLoc(MI);
+
+ int FI = 0;
+ unsigned Mask = 0;
+ for (const auto &Info : CSI) {
+ FI = std::max(FI, Info.getFrameIdx());
+ unsigned Reg = Info.getReg();
+ unsigned Shift = MRI.getSpillRegisterOrder(Reg);
+ Mask |= 1 << Shift;
+ }
+
+ auto I = M68k::addFrameReference(
+ BuildMI(MBB, MI, DL, TII.get(M68k::MOVM32mp)).addImm(Mask), FI)
+ .setMIFlag(MachineInstr::FrameDestroy);
+
+ // Append implicit registers and mem locations
+ for (const auto &Info : CSI) {
+ I.addReg(Info.getReg(), RegState::ImplicitDefine);
+ M68k::addMemOperand(I, Info.getFrameIdx(), 0);
+ }
+
+ return true;
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kFrameLowering.h b/src/llvm-project/llvm/lib/Target/M68k/M68kFrameLowering.h
new file mode 100644
index 0000000..0eba9e0
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kFrameLowering.h
@@ -0,0 +1,172 @@
+//===- M68kFrameLowering.h - Define frame lowering for M68k -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the M68k declaration of TargetFrameLowering class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_M68KFRAMELOWERING_H
+#define LLVM_LIB_TARGET_M68K_M68KFRAMELOWERING_H
+
+#include "M68k.h"
+
+#include "llvm/CodeGen/TargetFrameLowering.h"
+
+namespace llvm {
+class MachineInstrBuilder;
+class MCCFIInstruction;
+class M68kSubtarget;
+class M68kRegisterInfo;
+struct Align;
+
+class M68kFrameLowering : public TargetFrameLowering {
+ // Cached subtarget predicates.
+ const M68kSubtarget &STI;
+ const TargetInstrInfo &TII;
+ const M68kRegisterInfo *TRI;
+
+ /// Stack slot size in bytes.
+ unsigned SlotSize;
+
+ unsigned StackPtr;
+
+  /// If we're forcing a stack realignment we can't rely on just the frame
+  /// info; we need to know the ABI stack alignment as well in case we have a
+  /// call out. Otherwise just make sure we have some alignment - we'll go
+  /// with the minimum SlotSize.
+ uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
+
+ /// Adjusts the stack pointer using LEA, SUB, or ADD.
+ MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, int64_t Offset,
+ bool InEpilogue) const;
+
+ /// Aligns the stack pointer by ANDing it with -MaxAlign.
+ void BuildStackAlignAND(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ unsigned Reg, uint64_t MaxAlign) const;
+
+ /// Wraps up getting a CFI index and building a MachineInstr for it.
+ void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const MCCFIInstruction &CFIInst) const;
+
+ void emitPrologueCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL) const;
+
+ unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const;
+
+public:
+ explicit M68kFrameLowering(const M68kSubtarget &sti, Align Alignment);
+
+ static const M68kFrameLowering *create(const M68kSubtarget &ST);
+
+ /// This method is called during prolog/epilog code insertion to eliminate
+ /// call frame setup and destroy pseudo instructions (but only if the Target
+ /// is using them). It is responsible for eliminating these instructions,
+ /// replacing them with concrete instructions. This method need only be
+ /// implemented if using call frame setup/destroy pseudo instructions.
+ /// Returns an iterator pointing to the instruction after the replaced one.
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
+ /// Insert prolog code into the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ /// Insert epilog code into the function.
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ /// This method determines which of the registers reported by
+ /// TargetRegisterInfo::getCalleeSavedRegs() should actually get saved.
+  /// The default implementation populates the \p SavedRegs bitset with
+  /// all registers which are modified in the function; targets may override
+ /// this function to save additional registers.
+ /// This method also sets up the register scavenger ensuring there is a free
+ /// register or a frameindex available.
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
+
+ /// Allows target to override spill slot assignment logic. If implemented,
+ /// assignCalleeSavedSpillSlots() should assign frame slots to all CSI
+ /// entries and return true. If this method returns false, spill slots will
+ /// be assigned using generic implementation. assignCalleeSavedSpillSlots()
+ /// may add, delete or rearrange elements of CSI.
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+
+ /// Issues instruction(s) to spill all callee saved registers and returns
+ /// true if it isn't possible / profitable to do so by issuing a series of
+ /// store instructions via storeRegToStackSlot(). Returns false otherwise.
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// Issues instruction(s) to restore all callee saved registers and returns
+ /// true if it isn't possible / profitable to do so by issuing a series of
+  /// load instructions via loadRegFromStackSlot(). Returns false otherwise.
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// Return true if the specified function should have a dedicated frame
+ /// pointer register. This is true if the function has variable sized
+ /// allocas, if it needs dynamic stack realignment, if frame pointer
+ /// elimination is disabled, or if the frame address is taken.
+ bool hasFP(const MachineFunction &MF) const override;
+
+ /// Under normal circumstances, when a frame pointer is not required, we
+ /// reserve argument space for call sites in the function immediately on
+ /// entry to the current function. This eliminates the need for add/sub sp
+ /// brackets around call sites. Returns true if the call frame is included as
+ /// part of the stack frame.
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+ /// If there is a reserved call frame, the call frame pseudos can be
+ /// simplified. Having a FP, as in the default implementation, is not
+ /// sufficient here since we can't always use it. Use a more nuanced
+ /// condition.
+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+
+  // Do we need to perform FI resolution for this function? Normally, this is
+  // required only when the function has any stack objects. However, FI
+  // resolution actually has another job, not apparent from its name: it
+  // resolves call frame setup/destroy pseudos that were not simplified earlier.
+ //
+ // So, this is required for M68k functions that have push sequences even
+ // when there are no stack objects.
+ bool needsFrameIndexResolution(const MachineFunction &MF) const override;
+
+ /// This method should return the base register and offset used to reference
+ /// a frame index location. The offset is returned directly, and the base
+ /// register is returned via FrameReg.
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
+
+  /// Check the instruction before/after the passed instruction. If
+  /// it is an ADD/SUB/LEA instruction it is deleted and the
+  /// stack adjustment is returned as a positive value for ADD/LEA and
+  /// a negative value for SUB.
+ int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ bool doMergeWithPrevious) const;
+
+ /// Emit a series of instructions to increment / decrement the stack
+ /// pointer by a constant value.
+ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ int64_t NumBytes, bool InEpilogue) const;
+};
+} // namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp
new file mode 100644
index 0000000..0076c26
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp
@@ -0,0 +1,899 @@
+//===- M68kISelDAGToDAG.cpp - M68k Dag to Dag Inst Selector -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines an instruction selector for the M68K target.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68k.h"
+
+#include "M68kMachineFunction.h"
+#include "M68kRegisterInfo.h"
+#include "M68kTargetMachine.h"
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "m68k-isel"
+
+namespace {
+
+// For reference, the full order of operands for memory references is:
+// (Operand), Displacement, Base, Index, Scale
+struct M68kISelAddressMode {
+ enum class AddrType {
+ ARI, // Address Register Indirect
+ ARIPI, // Address Register Indirect with Postincrement
+ ARIPD, // Address Register Indirect with Postdecrement
+ ARID, // Address Register Indirect with Displacement
+ ARII, // Address Register Indirect with Index
+ PCD, // Program Counter Indirect with Displacement
+ PCI, // Program Counter Indirect with Index
+ AL, // Absolute
+ };
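+  // Illustrative M68k assembler syntax for these modes (approximate):
+  //   ARI   (%a0)         ARIPI  (%a0)+          ARIPD  -(%a0)
+  //   ARID  (d16,%a0)     ARII   (d8,%a0,%d1)    PCD    (d16,%pc)
+  //   PCI   (d8,%pc,%d1)  AL     (xxx).l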
+ AddrType AM;
+
+ enum class Base { RegBase, FrameIndexBase };
+ Base BaseType;
+
+ int64_t Disp;
+
+ // This is really a union, discriminated by BaseType!
+ SDValue BaseReg;
+ int BaseFrameIndex;
+
+ SDValue IndexReg;
+ unsigned Scale;
+
+ const GlobalValue *GV;
+ const Constant *CP;
+ const BlockAddress *BlockAddr;
+ const char *ES;
+ MCSymbol *MCSym;
+ int JT;
+ Align Alignment; // CP alignment.
+
+ unsigned char SymbolFlags; // M68kII::MO_*
+
+ M68kISelAddressMode(AddrType AT)
+ : AM(AT), BaseType(Base::RegBase), Disp(0), BaseFrameIndex(0), IndexReg(),
+ Scale(1), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
+ MCSym(nullptr), JT(-1), Alignment(), SymbolFlags(M68kII::MO_NO_FLAG) {}
+
+ bool hasSymbolicDisplacement() const {
+ return GV != nullptr || CP != nullptr || ES != nullptr ||
+ MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
+ }
+
+ bool hasBase() const {
+ return BaseType == Base::FrameIndexBase || BaseReg.getNode() != nullptr;
+ }
+
+ bool hasFrameIndex() const { return BaseType == Base::FrameIndexBase; }
+
+ bool hasBaseReg() const {
+ return BaseType == Base::RegBase && BaseReg.getNode() != nullptr;
+ }
+
+ bool hasIndexReg() const {
+ return BaseType == Base::RegBase && IndexReg.getNode() != nullptr;
+ }
+
+ /// True if address mode type supports displacement
+ bool isDispAddrType() const {
+ return AM == AddrType::ARII || AM == AddrType::PCI ||
+ AM == AddrType::ARID || AM == AddrType::PCD || AM == AddrType::AL;
+ }
+
+ unsigned getDispSize() const {
+ switch (AM) {
+ default:
+ return 0;
+ case AddrType::ARII:
+ case AddrType::PCI:
+ return 8;
+    // These two can hold up to 32 bits on later chip generations
+ case AddrType::ARID:
+ case AddrType::PCD:
+ return 16;
+ case AddrType::AL:
+ return 32;
+ }
+ }
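+
+  // For example (sketch): an ARID access can encode a signed 16-bit
+  // displacement, i.e. values in [-32768, 32767], while ARII/PCI are limited
+  // to a signed 8-bit displacement in [-128, 127]; doesDispFit() below checks
+  // candidate offsets against these widths.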
+
+ bool hasDisp() const { return getDispSize() != 0; }
+ bool isDisp8() const { return getDispSize() == 8; }
+ bool isDisp16() const { return getDispSize() == 16; }
+ bool isDisp32() const { return getDispSize() == 32; }
+
+ /// Return true if this addressing mode is already PC-relative.
+ bool isPCRelative() const {
+ if (BaseType != Base::RegBase)
+ return false;
+ if (auto *RegNode = dyn_cast_or_null<RegisterSDNode>(BaseReg.getNode()))
+ return RegNode->getReg() == M68k::PC;
+ return false;
+ }
+
+ void setBaseReg(SDValue Reg) {
+ BaseType = Base::RegBase;
+ BaseReg = Reg;
+ }
+
+ void setIndexReg(SDValue Reg) { IndexReg = Reg; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump() {
+ dbgs() << "M68kISelAddressMode " << this;
+ dbgs() << "\nDisp: " << Disp;
+ dbgs() << ", BaseReg: ";
+ if (BaseReg.getNode())
+ BaseReg.getNode()->dump();
+ else
+ dbgs() << "null";
+ dbgs() << ", BaseFI: " << BaseFrameIndex;
+ dbgs() << ", IndexReg: ";
+    if (IndexReg.getNode())
+      IndexReg.getNode()->dump();
+    else
+      dbgs() << "null";
+    dbgs() << ", Scale: " << Scale;
+    dbgs() << '\n';
+ }
+#endif
+};
+} // end anonymous namespace
+
+namespace {
+
+class M68kDAGToDAGISel : public SelectionDAGISel {
+public:
+ explicit M68kDAGToDAGISel(M68kTargetMachine &TM)
+ : SelectionDAGISel(TM), Subtarget(nullptr) {}
+
+ StringRef getPassName() const override {
+ return "M68k DAG->DAG Pattern Instruction Selection";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ /// Keep a pointer to the M68kSubtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const M68kSubtarget *Subtarget;
+
+// Include the pieces autogenerated from the target description.
+#include "M68kGenDAGISel.inc"
+
+ /// getTargetMachine - Return a reference to the TargetMachine, casted
+ /// to the target-specific type.
+ const M68kTargetMachine &getTargetMachine() {
+ return static_cast<const M68kTargetMachine &>(TM);
+ }
+
+ void Select(SDNode *N) override;
+
+ // Insert instructions to initialize the global base register in the
+ // first MBB of the function.
+  // TODO: Check whether this is actually needed for M68k.
+ void initGlobalBaseReg(MachineFunction &MF);
+
+ bool foldOffsetIntoAddress(uint64_t Offset, M68kISelAddressMode &AM);
+
+ bool matchLoadInAddress(LoadSDNode *N, M68kISelAddressMode &AM);
+ bool matchAddress(SDValue N, M68kISelAddressMode &AM);
+ bool matchAddressBase(SDValue N, M68kISelAddressMode &AM);
+ bool matchAddressRecursively(SDValue N, M68kISelAddressMode &AM,
+ unsigned Depth);
+ bool matchADD(SDValue &N, M68kISelAddressMode &AM, unsigned Depth);
+ bool matchWrapper(SDValue N, M68kISelAddressMode &AM);
+
+ std::pair<bool, SDNode *> selectNode(SDNode *Node);
+
+ bool SelectARI(SDNode *Parent, SDValue N, SDValue &Base);
+ bool SelectARIPI(SDNode *Parent, SDValue N, SDValue &Base);
+ bool SelectARIPD(SDNode *Parent, SDValue N, SDValue &Base);
+ bool SelectARID(SDNode *Parent, SDValue N, SDValue &Imm, SDValue &Base);
+ bool SelectARII(SDNode *Parent, SDValue N, SDValue &Imm, SDValue &Base,
+ SDValue &Index);
+ bool SelectAL(SDNode *Parent, SDValue N, SDValue &Sym);
+ bool SelectPCD(SDNode *Parent, SDValue N, SDValue &Imm);
+ bool SelectPCI(SDNode *Parent, SDValue N, SDValue &Imm, SDValue &Index);
+
+  // If the address mode represents a frame index, store the displacement in
+  // Disp and the frame index in Base. These values are read symmetrically by
+  // the M68kRegisterInfo::eliminateFrameIndex method
+ inline bool getFrameIndexAddress(M68kISelAddressMode &AM, const SDLoc &DL,
+ SDValue &Disp, SDValue &Base) {
+ if (AM.BaseType == M68kISelAddressMode::Base::FrameIndexBase) {
+ Disp = getI32Imm(AM.Disp, DL);
+ Base = CurDAG->getTargetFrameIndex(
+ AM.BaseFrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
+ return true;
+ }
+
+ return false;
+ }
+
+ // Gets a symbol plus optional displacement
+ inline bool getSymbolicDisplacement(M68kISelAddressMode &AM, const SDLoc &DL,
+ SDValue &Sym) {
+ if (AM.GV) {
+ Sym = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), MVT::i32, AM.Disp,
+ AM.SymbolFlags);
+ return true;
+ }
+
+ if (AM.CP) {
+ Sym = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
+ AM.Disp, AM.SymbolFlags);
+ return true;
+ }
+
+ if (AM.ES) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
+ Sym = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
+ return true;
+ }
+
+ if (AM.MCSym) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
+      assert(AM.SymbolFlags == 0 && "Unexpected symbol flags with MCSym.");
+ Sym = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
+ return true;
+ }
+
+ if (AM.JT != -1) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
+ Sym = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
+ return true;
+ }
+
+ if (AM.BlockAddr) {
+ Sym = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
+ AM.SymbolFlags);
+ return true;
+ }
+
+ return false;
+ }
+
+ /// Return a target constant with the specified value of type i8.
+ inline SDValue getI8Imm(int64_t Imm, const SDLoc &DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
+ }
+
+  /// Return a target constant with the specified value of type i16.
+ inline SDValue getI16Imm(int64_t Imm, const SDLoc &DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i16);
+ }
+
+ /// Return a target constant with the specified value, of type i32.
+ inline SDValue getI32Imm(int64_t Imm, const SDLoc &DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
+ }
+
+ /// Return a reference to the TargetInstrInfo, casted to the target-specific
+ /// type.
+ const M68kInstrInfo *getInstrInfo() const {
+ return Subtarget->getInstrInfo();
+ }
+
+ /// Return an SDNode that returns the value of the global base register.
+ /// Output instructions required to initialize the global base register,
+ /// if necessary.
+ SDNode *getGlobalBaseReg();
+};
+} // namespace
+
+bool M68kDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &static_cast<const M68kSubtarget &>(MF.getSubtarget());
+ return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
+/// This pass converts a legalized DAG into a M68k-specific DAG,
+/// ready for instruction scheduling.
+FunctionPass *llvm::createM68kISelDag(M68kTargetMachine &TM) {
+ return new M68kDAGToDAGISel(TM);
+}
+
+static bool doesDispFitFI(M68kISelAddressMode &AM) {
+ if (!AM.isDispAddrType())
+ return false;
+ // -1 to make sure that resolved FI will fit into Disp field
+ return isIntN(AM.getDispSize() - 1, AM.Disp);
+}
+
+static bool doesDispFit(M68kISelAddressMode &AM, int64_t Val) {
+ if (!AM.isDispAddrType())
+ return false;
+ return isIntN(AM.getDispSize(), Val);
+}
+
+/// Return an SDNode that returns the value of the global base register.
+/// Output instructions required to initialize the global base register,
+/// if necessary.
+SDNode *M68kDAGToDAGISel::getGlobalBaseReg() {
+ unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+ auto &DL = MF->getDataLayout();
+ return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
+}
+
+bool M68kDAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
+ M68kISelAddressMode &AM) {
+ // Cannot combine ExternalSymbol displacements with integer offsets.
+ if (Offset != 0 && (AM.ES || AM.MCSym))
+ return false;
+
+ int64_t Val = AM.Disp + Offset;
+
+ if (doesDispFit(AM, Val)) {
+ AM.Disp = Val;
+ return true;
+ }
+
+ return false;
+}
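+
+// Example (sketch): with AM in an ARID mode (16-bit displacement field) and
+// AM.Disp == 100, foldOffsetIntoAddress(50, AM) updates Disp to 150 and
+// returns true, whereas an offset that would push the sum outside the signed
+// 16-bit range leaves AM unchanged and returns false.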
+
+//===----------------------------------------------------------------------===//
+// Matchers
+//===----------------------------------------------------------------------===//
+
+/// Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool M68kDAGToDAGISel::matchAddressBase(SDValue N, M68kISelAddressMode &AM) {
+ // Is the base register already occupied?
+ if (AM.hasBase()) {
+ // If so, check to see if the scale index register is set.
+ if (!AM.hasIndexReg()) {
+ AM.IndexReg = N;
+ AM.Scale = 1;
+ return true;
+ }
+
+ // Otherwise, we cannot select it.
+ return false;
+ }
+
+ // Default, generate it as a register.
+ AM.BaseType = M68kISelAddressMode::Base::RegBase;
+ AM.BaseReg = N;
+ return true;
+}
+
+/// TODO Add TLS support
+bool M68kDAGToDAGISel::matchLoadInAddress(LoadSDNode *N,
+ M68kISelAddressMode &AM) {
+ return false;
+}
+
+bool M68kDAGToDAGISel::matchAddressRecursively(SDValue N,
+ M68kISelAddressMode &AM,
+ unsigned Depth) {
+ SDLoc DL(N);
+
+ // Limit recursion.
+ if (Depth > 5)
+ return matchAddressBase(N, AM);
+
+ // If this is already a %PC relative address, we can only merge immediates
+ // into it. Instead of handling this in every case, we handle it here.
+ // PC relative addressing: %PC + 16-bit displacement!
+ if (AM.isPCRelative()) {
+    // FIXME JumpTable and ExternalSymbol addresses currently don't like
+    // displacements. It isn't very important, but should be fixed for
+    // consistency.
+
+ if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
+ if (foldOffsetIntoAddress(Cst->getSExtValue(), AM))
+ return true;
+ return false;
+ }
+
+ switch (N.getOpcode()) {
+ default:
+ break;
+
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (foldOffsetIntoAddress(Val, AM))
+ return true;
+ break;
+ }
+
+ case M68kISD::Wrapper:
+ case M68kISD::WrapperPC:
+ if (matchWrapper(N, AM))
+ return true;
+ break;
+
+ case ISD::LOAD:
+ if (matchLoadInAddress(cast<LoadSDNode>(N), AM))
+ return true;
+ break;
+
+ case ISD::OR:
+ // We want to look through a transform in InstCombine and DAGCombiner that
+ // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
+ // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
+    // An 'lea' can then be used to match the shift (multiply) and add
+    // (illustrated here in x86 syntax):
+    //   and $1, %esi
+    //   lea (%rsi, %rdi, 8), %rax
+ if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
+ matchADD(N, AM, Depth))
+ return true;
+ break;
+
+ case ISD::ADD:
+ if (matchADD(N, AM, Depth))
+ return true;
+ break;
+
+ case ISD::FrameIndex:
+ if (AM.isDispAddrType() &&
+ AM.BaseType == M68kISelAddressMode::Base::RegBase &&
+ AM.BaseReg.getNode() == nullptr && doesDispFitFI(AM)) {
+ AM.BaseType = M68kISelAddressMode::Base::FrameIndexBase;
+ AM.BaseFrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+ return true;
+ }
+ break;
+ }
+
+ return matchAddressBase(N, AM);
+}
+
+/// Add the specified node to the specified addressing mode, returning true if
+/// it cannot be done. This just pattern matches for the addressing mode.
+bool M68kDAGToDAGISel::matchAddress(SDValue N, M68kISelAddressMode &AM) {
+ // TODO: Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
+ // a smaller encoding and avoids a scaled-index.
+ // And make sure it is an indexed mode
+
+ // TODO: Post-processing: Convert foo to foo(%pc), even in non-PIC mode,
+ // because it has a smaller encoding.
+  // Make sure this is done only if PC* modes are currently being matched
+ return matchAddressRecursively(N, AM, 0);
+}
+
+bool M68kDAGToDAGISel::matchADD(SDValue &N, M68kISelAddressMode &AM,
+ unsigned Depth) {
+ // Add an artificial use to this node so that we can keep track of
+ // it if it gets CSE'd with a different node.
+ HandleSDNode Handle(N);
+
+ M68kISelAddressMode Backup = AM;
+ if (matchAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
+ matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth + 1)) {
+ return true;
+ }
+ AM = Backup;
+
+ // Try again after commuting the operands.
+ if (matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth + 1) &&
+ matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1)) {
+ return true;
+ }
+ AM = Backup;
+
+ // If we couldn't fold both operands into the address at the same time,
+ // see if we can just put each operand into a register and fold at least
+ // the add.
+ if (!AM.hasBase() && !AM.hasIndexReg()) {
+ N = Handle.getValue();
+ AM.BaseReg = N.getOperand(0);
+ AM.IndexReg = N.getOperand(1);
+ AM.Scale = 1;
+ return true;
+ }
+
+ N = Handle.getValue();
+ return false;
+}
+
+/// Try to match M68kISD::Wrapper and M68kISD::WrapperPC nodes into an
+/// addressing mode. These wrap things that will resolve down into a symbol
+/// reference. If no match is possible, this returns true, otherwise it returns
+/// false.
+bool M68kDAGToDAGISel::matchWrapper(SDValue N, M68kISelAddressMode &AM) {
+ // If the addressing mode already has a symbol as the displacement, we can
+ // never match another symbol.
+ if (AM.hasSymbolicDisplacement())
+ return false;
+
+ SDValue N0 = N.getOperand(0);
+
+ if (N.getOpcode() == M68kISD::WrapperPC) {
+
+    // If we cannot match here, just restore the old version
+ M68kISelAddressMode Backup = AM;
+
+ if (AM.hasBase()) {
+ return false;
+ }
+
+ if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ AM.GV = G->getGlobal();
+ AM.SymbolFlags = G->getTargetFlags();
+ if (!foldOffsetIntoAddress(G->getOffset(), AM)) {
+ AM = Backup;
+ return false;
+ }
+ } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ AM.CP = CP->getConstVal();
+ AM.Alignment = CP->getAlign();
+ AM.SymbolFlags = CP->getTargetFlags();
+ if (!foldOffsetIntoAddress(CP->getOffset(), AM)) {
+ AM = Backup;
+ return false;
+ }
+ } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
+ AM.MCSym = S->getMCSymbol();
+ } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.SymbolFlags = BA->getTargetFlags();
+ if (!foldOffsetIntoAddress(BA->getOffset(), AM)) {
+ AM = Backup;
+ return false;
+ }
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
+
+ AM.setBaseReg(CurDAG->getRegister(M68k::PC, MVT::i32));
+ return true;
+ }
+
+  // This wrapper requires a 32-bit disp/imm field for the medium code model
+ if (!AM.isDisp32()) {
+ return false;
+ }
+
+ if (N.getOpcode() == M68kISD::Wrapper) {
+ if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ AM.GV = G->getGlobal();
+ AM.Disp += G->getOffset();
+ AM.SymbolFlags = G->getTargetFlags();
+ } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ AM.CP = CP->getConstVal();
+ AM.Alignment = CP->getAlign();
+ AM.Disp += CP->getOffset();
+ AM.SymbolFlags = CP->getTargetFlags();
+ } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
+ AM.MCSym = S->getMCSymbol();
+ } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.Disp += BA->getOffset();
+ AM.SymbolFlags = BA->getTargetFlags();
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
+ return true;
+ }
+
+ return false;
+}
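+
+// Example (sketch): a (M68kISD::WrapperPC (TargetGlobalAddress @g + 4)) node
+// is folded into a PC-relative mode: AM.GV is set to @g, the offset 4 is
+// folded into AM.Disp, and the base register becomes %pc, so SelectPCD can
+// later emit a (d16,%pc) operand for it.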
+
+//===----------------------------------------------------------------------===//
+// Selectors
+//===----------------------------------------------------------------------===//
+
+void M68kDAGToDAGISel::Select(SDNode *Node) {
+ unsigned Opcode = Node->getOpcode();
+ SDLoc DL(Node);
+
+ LLVM_DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
+
+ if (Node->isMachineOpcode()) {
+ LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ Node->setNodeId(-1);
+ return; // Already selected.
+ }
+
+ switch (Opcode) {
+ default:
+ break;
+
+ case M68kISD::GLOBAL_BASE_REG:
+ ReplaceNode(Node, getGlobalBaseReg());
+ return;
+ }
+
+ SelectCode(Node);
+}
+
+bool M68kDAGToDAGISel::SelectARIPI(SDNode *Parent, SDValue N, SDValue &Base) {
+ LLVM_DEBUG(dbgs() << "Selecting AddrType::ARIPI: ");
+ LLVM_DEBUG(dbgs() << "NOT IMPLEMENTED\n");
+ return false;
+}
+
+bool M68kDAGToDAGISel::SelectARIPD(SDNode *Parent, SDValue N, SDValue &Base) {
+ LLVM_DEBUG(dbgs() << "Selecting AddrType::ARIPD: ");
+ LLVM_DEBUG(dbgs() << "NOT IMPLEMENTED\n");
+ return false;
+}
+
+bool M68kDAGToDAGISel::SelectARID(SDNode *Parent, SDValue N, SDValue &Disp,
+ SDValue &Base) {
+ LLVM_DEBUG(dbgs() << "Selecting AddrType::ARID: ");
+ M68kISelAddressMode AM(M68kISelAddressMode::AddrType::ARID);
+
+ if (!matchAddress(N, AM))
+ return false;
+
+ if (AM.isPCRelative()) {
+ LLVM_DEBUG(dbgs() << "REJECT: Cannot match PC relative address\n");
+ return false;
+ }
+
+ // If this is a frame index, grab it
+ if (getFrameIndexAddress(AM, SDLoc(N), Disp, Base)) {
+ LLVM_DEBUG(dbgs() << "SUCCESS matched FI\n");
+ return true;
+ }
+
+ if (AM.hasIndexReg()) {
+ LLVM_DEBUG(dbgs() << "REJECT: Cannot match Index\n");
+ return false;
+ }
+
+ if (!AM.hasBaseReg()) {
+ LLVM_DEBUG(dbgs() << "REJECT: No Base reg\n");
+ return false;
+ }
+
+ if (getSymbolicDisplacement(AM, SDLoc(N), Disp)) {
+ assert(!AM.Disp && "Should not be any displacement");
+ LLVM_DEBUG(dbgs() << "SUCCESS, matched Symbol\n");
+ return true;
+ }
+
+ // Give a chance to AddrType::ARI
+ if (AM.Disp == 0) {
+ LLVM_DEBUG(dbgs() << "REJECT: No displacement\n");
+ return false;
+ }
+
+ Base = AM.BaseReg;
+ Disp = getI16Imm(AM.Disp, SDLoc(N));
+
+ LLVM_DEBUG(dbgs() << "SUCCESS\n");
+ return true;
+}
+
+static bool isAddressBase(const SDValue &N) {
+ switch (N.getOpcode()) {
+ case ISD::ADD:
+ case ISD::ADDC:
+ return llvm::any_of(N.getNode()->ops(),
+ [](const SDUse &U) { return isAddressBase(U.get()); });
+ case M68kISD::Wrapper:
+ case M68kISD::WrapperPC:
+ case M68kISD::GLOBAL_BASE_REG:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool M68kDAGToDAGISel::SelectARII(SDNode *Parent, SDValue N, SDValue &Disp,
+ SDValue &Base, SDValue &Index) {
+ M68kISelAddressMode AM(M68kISelAddressMode::AddrType::ARII);
+ LLVM_DEBUG(dbgs() << "Selecting AddrType::ARII: ");
+
+ if (!matchAddress(N, AM))
+ return false;
+
+ if (AM.isPCRelative()) {
+ LLVM_DEBUG(dbgs() << "REJECT: PC relative\n");
+ return false;
+ }
+
+ if (!AM.hasIndexReg()) {
+ LLVM_DEBUG(dbgs() << "REJECT: No Index\n");
+ return false;
+ }
+
+ if (!AM.hasBaseReg()) {
+ LLVM_DEBUG(dbgs() << "REJECT: No Base\n");
+ return false;
+ }
+
+ if (!isAddressBase(AM.BaseReg) && isAddressBase(AM.IndexReg)) {
+ Base = AM.IndexReg;
+ Index = AM.BaseReg;
+ } else {
+ Base = AM.BaseReg;
+ Index = AM.IndexReg;
+ }
+
+ if (AM.hasSymbolicDisplacement()) {
+ LLVM_DEBUG(dbgs() << "REJECT, Cannot match symbolic displacement\n");
+ return false;
+ }
+
+  // The idea here is that we only want to use AddrType::ARII with a zero
+  // displacement when it is strictly necessary, i.e. for memory operations;
+  // otherwise the address should be lowered into a plain addition.
+ if (AM.Disp == 0 && (!Parent || (Parent->getOpcode() != ISD::LOAD &&
+ Parent->getOpcode() != ISD::STORE))) {
+ LLVM_DEBUG(dbgs() << "REJECT: Displacement is Zero\n");
+ return false;
+ }
+
+ Disp = getI8Imm(AM.Disp, SDLoc(N));
+
+ LLVM_DEBUG(dbgs() << "SUCCESS\n");
+ return true;
+}
+
+bool M68kDAGToDAGISel::SelectAL(SDNode *Parent, SDValue N, SDValue &Sym) {
+ LLVM_DEBUG(dbgs() << "Selecting AddrType::AL: ");
+ M68kISelAddressMode AM(M68kISelAddressMode::AddrType::AL);
+
+ if (!matchAddress(N, AM)) {
+ LLVM_DEBUG(dbgs() << "REJECT: Match failed\n");
+ return false;
+ }
+
+ if (AM.isPCRelative()) {
+ LLVM_DEBUG(dbgs() << "REJECT: Cannot match PC relative address\n");
+ return false;
+ }
+
+ if (AM.hasBase()) {
+ LLVM_DEBUG(dbgs() << "REJECT: Cannot match Base\n");
+ return false;
+ }
+
+ if (AM.hasIndexReg()) {
+ LLVM_DEBUG(dbgs() << "REJECT: Cannot match Index\n");
+ return false;
+ }
+
+ if (getSymbolicDisplacement(AM, SDLoc(N), Sym)) {
+ LLVM_DEBUG(dbgs() << "SUCCESS: Matched symbol\n");
+ return true;
+ }
+
+ if (AM.Disp) {
+ Sym = getI32Imm(AM.Disp, SDLoc(N));
+ LLVM_DEBUG(dbgs() << "SUCCESS\n");
+ return true;
+ }
+
+ LLVM_DEBUG(dbgs() << "REJECT: Not Symbol or Disp\n");
+ return false;
+}
+
+bool M68kDAGToDAGISel::SelectPCD(SDNode *Parent, SDValue N, SDValue &Disp) {
+ LLVM_DEBUG(dbgs() << "Selecting AddrType::PCD: ");
+ M68kISelAddressMode AM(M68kISelAddressMode::AddrType::PCD);
+
+ if (!matchAddress(N, AM))
+ return false;
+
+ if (!AM.isPCRelative()) {
+ LLVM_DEBUG(dbgs() << "REJECT: Not PC relative\n");
+ return false;
+ }
+
+ if (AM.hasIndexReg()) {
+ LLVM_DEBUG(dbgs() << "REJECT: Cannot match Index\n");
+ return false;
+ }
+
+ if (getSymbolicDisplacement(AM, SDLoc(N), Disp)) {
+ LLVM_DEBUG(dbgs() << "SUCCESS, matched Symbol\n");
+ return true;
+ }
+
+ Disp = getI16Imm(AM.Disp, SDLoc(N));
+
+ LLVM_DEBUG(dbgs() << "SUCCESS\n");
+ return true;
+}
+
+bool M68kDAGToDAGISel::SelectPCI(SDNode *Parent, SDValue N, SDValue &Disp,
+ SDValue &Index) {
+ LLVM_DEBUG(dbgs() << "Selecting AddrType::PCI: ");
+ M68kISelAddressMode AM(M68kISelAddressMode::AddrType::PCI);
+
+ if (!matchAddress(N, AM))
+ return false;
+
+ if (!AM.isPCRelative()) {
+ LLVM_DEBUG(dbgs() << "REJECT: Not PC relative\n");
+ return false;
+ }
+
+ if (!AM.hasIndexReg()) {
+ LLVM_DEBUG(dbgs() << "REJECT: No Index\n");
+ return false;
+ }
+
+ Index = AM.IndexReg;
+
+ if (getSymbolicDisplacement(AM, SDLoc(N), Disp)) {
+ assert(!AM.Disp && "Should not be any displacement");
+ LLVM_DEBUG(dbgs() << "SUCCESS, matched Symbol\n");
+ return true;
+ }
+
+ Disp = getI8Imm(AM.Disp, SDLoc(N));
+
+ LLVM_DEBUG(dbgs() << "SUCCESS\n");
+ return true;
+}
+
+bool M68kDAGToDAGISel::SelectARI(SDNode *Parent, SDValue N, SDValue &Base) {
+ LLVM_DEBUG(dbgs() << "Selecting AddrType::ARI: ");
+ M68kISelAddressMode AM(M68kISelAddressMode::AddrType::ARI);
+
+ if (!matchAddress(N, AM)) {
+ LLVM_DEBUG(dbgs() << "REJECT: Match failed\n");
+ return false;
+ }
+
+ if (AM.isPCRelative()) {
+ LLVM_DEBUG(dbgs() << "REJECT: Cannot match PC relative address\n");
+ return false;
+ }
+
+ // AddrType::ARI does not use these
+ if (AM.hasIndexReg() || AM.Disp != 0) {
+ LLVM_DEBUG(dbgs() << "REJECT: Cannot match Index or Disp\n");
+ return false;
+ }
+
+ // Must be matched by AddrType::AL
+ if (AM.hasSymbolicDisplacement()) {
+ LLVM_DEBUG(dbgs() << "REJECT: Cannot match Symbolic Disp\n");
+ return false;
+ }
+
+ if (AM.hasBaseReg()) {
+ Base = AM.BaseReg;
+ LLVM_DEBUG(dbgs() << "SUCCESS\n");
+ return true;
+ }
+
+ return false;
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kISelLowering.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kISelLowering.cpp
new file mode 100644
index 0000000..3e7cee9
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kISelLowering.cpp
@@ -0,0 +1,3422 @@
+//===-- M68kISelLowering.cpp - M68k DAG Lowering Impl ------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the interfaces that M68k uses to lower LLVM code into a
+/// selection DAG.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68kISelLowering.h"
+#include "M68kCallingConv.h"
+#include "M68kMachineFunction.h"
+#include "M68kSubtarget.h"
+#include "M68kTargetMachine.h"
+#include "M68kTargetObjectFile.h"
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "M68k-isel"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+M68kTargetLowering::M68kTargetLowering(const M68kTargetMachine &TM,
+ const M68kSubtarget &STI)
+ : TargetLowering(TM), Subtarget(STI), TM(TM) {
+
+ MVT PtrVT = MVT::i32;
+
+ setBooleanContents(ZeroOrOneBooleanContent);
+
+ auto *RegInfo = Subtarget.getRegisterInfo();
+ setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i8, &M68k::DR8RegClass);
+ addRegisterClass(MVT::i16, &M68k::XR16RegClass);
+ addRegisterClass(MVT::i32, &M68k::XR32RegClass);
+
+ for (auto VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ }
+
+ // We don't accept any truncstore of integer registers.
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i8, Expand);
+ setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+ setOperationAction(ISD::MUL, MVT::i8, Promote);
+ setOperationAction(ISD::MUL, MVT::i16, Legal);
+ if (Subtarget.atLeastM68020())
+ setOperationAction(ISD::MUL, MVT::i32, Legal);
+ else
+ setOperationAction(ISD::MUL, MVT::i32, LibCall);
+ setOperationAction(ISD::MUL, MVT::i64, LibCall);
+
+ for (auto OP :
+ {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::UDIVREM, ISD::SDIVREM,
+ ISD::MULHS, ISD::MULHU, ISD::UMUL_LOHI, ISD::SMUL_LOHI}) {
+ setOperationAction(OP, MVT::i8, Promote);
+ setOperationAction(OP, MVT::i16, Legal);
+ setOperationAction(OP, MVT::i32, LibCall);
+ }
+
+ for (auto OP : {ISD::UMUL_LOHI, ISD::SMUL_LOHI}) {
+ setOperationAction(OP, MVT::i8, Expand);
+ setOperationAction(OP, MVT::i16, Expand);
+ }
+
+ // FIXME It would be better to use a custom lowering
+ for (auto OP : {ISD::SMULO, ISD::UMULO}) {
+ setOperationAction(OP, MVT::i8, Expand);
+ setOperationAction(OP, MVT::i16, Expand);
+ setOperationAction(OP, MVT::i32, Expand);
+ }
+
+ // Add/Sub overflow ops with MVT::Glues are lowered to CCR dependences.
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32}) {
+ setOperationAction(ISD::ADDC, VT, Custom);
+ setOperationAction(ISD::ADDE, VT, Custom);
+ setOperationAction(ISD::SUBC, VT, Custom);
+ setOperationAction(ISD::SUBE, VT, Custom);
+ }
+
+  // SADDO and friends are expected to be legal with this setup.
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32}) {
+ setOperationAction(ISD::SADDO, VT, Custom);
+ setOperationAction(ISD::UADDO, VT, Custom);
+ setOperationAction(ISD::SSUBO, VT, Custom);
+ setOperationAction(ISD::USUBO, VT, Custom);
+ }
+
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32}) {
+ setOperationAction(ISD::BR_CC, VT, Expand);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SETCCCARRY, VT, Custom);
+ }
+
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32}) {
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ }
+
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+ setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
+
+ computeRegisterProperties(STI.getRegisterInfo());
+
+ // 2^2 bytes
+ // FIXME can it be just 2^1?
+ setMinFunctionAlignment(Align::Constant<2>());
+}
+
+EVT M68kTargetLowering::getSetCCResultType(const DataLayout &DL,
+ LLVMContext &Context, EVT VT) const {
+  // M68k SETcc produces either 0x00 or 0xFF
+ return MVT::i8;
+}
+
+MVT M68kTargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
+ EVT Ty) const {
+ if (Ty.isSimple()) {
+ return Ty.getSimpleVT();
+ }
+ return MVT::getIntegerVT(8 * DL.getPointerSize(0));
+}
+
+#include "M68kGenCallingConv.inc"
+
+enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn };
+
+static StructReturnType
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+ if (Outs.empty())
+ return NotStructReturn;
+
+ const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg())
+ return RegStructReturn;
+ return StackStructReturn;
+}
+
+/// Determines whether a function uses struct return semantics.
+static StructReturnType
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+ if (Ins.empty())
+ return NotStructReturn;
+
+ const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg())
+ return RegStructReturn;
+ return StackStructReturn;
+}
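+
+// Example (sketch): for a callee declared as
+//   define void @f(%struct.S* sret(%struct.S) %out)
+// the first incoming argument carries the SRet flag, so argsAreStructReturn
+// returns StackStructReturn (or RegStructReturn if it is also marked inreg).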
+
+/// Make a copy of an aggregate at address specified by "Src" to address
+/// "Dst" with size and alignment information specified by the specific
+/// parameter attribute. The copy will be passed as a byval function parameter.
+static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+ SDValue Chain, ISD::ArgFlagsTy Flags,
+ SelectionDAG &DAG, const SDLoc &DL) {
+ SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), DL, MVT::i32);
+
+ return DAG.getMemcpy(
+ Chain, DL, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
+ /*isVolatile=*/false, /*AlwaysInline=*/true,
+ /*isTailCall=*/false, MachinePointerInfo(), MachinePointerInfo());
+}
+
+/// Return true if the calling convention is one that we can guarantee TCO for.
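+/// For now no convention qualifies, so tail calls are only formed as sibcalls
+/// (or when forced by musttail).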
+static bool canGuaranteeTCO(CallingConv::ID CC) { return false; }
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+ switch (CC) {
+ // C calling conventions:
+ case CallingConv::C:
+ return true;
+ default:
+ return canGuaranteeTCO(CC);
+ }
+}
+
+/// Return true if the function is being made into a tailcall target by
+/// changing its ABI.
+static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
+ return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
+}
+
+/// Return true if the given stack call argument is already available in the
+/// same position (relatively) of the caller's incoming argument stack.
+static bool MatchingStackOffset(SDValue Arg, unsigned Offset,
+ ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI,
+ const MachineRegisterInfo *MRI,
+ const M68kInstrInfo *TII,
+ const CCValAssign &VA) {
+ unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+
+ for (;;) {
+ // Look through nodes that don't alter the bits of the incoming value.
+ unsigned Op = Arg.getOpcode();
+ if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
+ Arg = Arg.getOperand(0);
+ continue;
+ }
+ if (Op == ISD::TRUNCATE) {
+ const SDValue &TruncInput = Arg.getOperand(0);
+ if (TruncInput.getOpcode() == ISD::AssertZext &&
+ cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
+ Arg.getValueType()) {
+ Arg = TruncInput.getOperand(0);
+ continue;
+ }
+ }
+ break;
+ }
+
+ int FI = INT_MAX;
+ if (Arg.getOpcode() == ISD::CopyFromReg) {
+ unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+ if (!Register::isVirtualRegister(VR))
+ return false;
+ MachineInstr *Def = MRI->getVRegDef(VR);
+ if (!Def)
+ return false;
+ if (!Flags.isByVal()) {
+ if (!TII->isLoadFromStackSlot(*Def, FI))
+ return false;
+ } else {
+ unsigned Opcode = Def->getOpcode();
+ if ((Opcode == M68k::LEA32p || Opcode == M68k::LEA32f) &&
+ Def->getOperand(1).isFI()) {
+ FI = Def->getOperand(1).getIndex();
+ Bytes = Flags.getByValSize();
+ } else
+ return false;
+ }
+ } else if (auto *Ld = dyn_cast<LoadSDNode>(Arg)) {
+ if (Flags.isByVal())
+ // ByVal argument is passed in as a pointer but it's now being
+ // dereferenced. e.g.
+ // define @foo(%struct.X* %A) {
+ // tail call @bar(%struct.X* byval %A)
+ // }
+ return false;
+ SDValue Ptr = Ld->getBasePtr();
+ FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+ if (!FINode)
+ return false;
+ FI = FINode->getIndex();
+ } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
+ FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
+ FI = FINode->getIndex();
+ Bytes = Flags.getByValSize();
+ } else
+ return false;
+
+ assert(FI != INT_MAX);
+ if (!MFI.isFixedObjectIndex(FI))
+ return false;
+
+ if (Offset != MFI.getObjectOffset(FI))
+ return false;
+
+ if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) {
+ // If the argument location is wider than the argument type, check that any
+ // extension flags match.
+ if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
+ Flags.isSExt() != MFI.isObjectSExt(FI)) {
+ return false;
+ }
+ }
+
+ return Bytes == MFI.getObjectSize(FI);
+}
+
+SDValue
+M68kTargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ M68kMachineFunctionInfo *FuncInfo = MF.getInfo<M68kMachineFunctionInfo>();
+ int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+ if (ReturnAddrIndex == 0) {
+ // Set up a frame object for the return address.
+ unsigned SlotSize = Subtarget.getSlotSize();
+ ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(
+ SlotSize, -(int64_t)SlotSize, false);
+ FuncInfo->setRAIndex(ReturnAddrIndex);
+ }
+
+ return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
+}
+
+SDValue M68kTargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
+ SDValue &OutRetAddr,
+ SDValue Chain,
+ bool IsTailCall, int FPDiff,
+ const SDLoc &DL) const {
+ EVT VT = getPointerTy(DAG.getDataLayout());
+ OutRetAddr = getReturnAddressFrameIndex(DAG);
+
+ // Load the "old" Return address.
+ OutRetAddr = DAG.getLoad(VT, DL, Chain, OutRetAddr, MachinePointerInfo());
+ return SDValue(OutRetAddr.getNode(), 1);
+}
+
+SDValue M68kTargetLowering::EmitTailCallStoreRetAddr(
+ SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetFI,
+ EVT PtrVT, unsigned SlotSize, int FPDiff, const SDLoc &DL) const {
+ if (!FPDiff)
+ return Chain;
+
+ // Calculate the new stack slot for the return address.
+ int NewFO = MF.getFrameInfo().CreateFixedObject(
+ SlotSize, (int64_t)FPDiff - SlotSize, false);
+
+ SDValue NewFI = DAG.getFrameIndex(NewFO, PtrVT);
+ // Store the return address to the appropriate stack slot.
+ Chain = DAG.getStore(
+ Chain, DL, RetFI, NewFI,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewFO));
+ return Chain;
+}
+
+SDValue
+M68kTargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ MachineFrameInfo &MFI,
+ unsigned ArgIdx) const {
+ // Create the nodes corresponding to a load from this parameter slot.
+ ISD::ArgFlagsTy Flags = Ins[ArgIdx].Flags;
+ EVT ValVT;
+
+  // If the value is passed by pointer, we have the address passed instead of
+  // the value itself.
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ ValVT = VA.getLocVT();
+ else
+ ValVT = VA.getValVT();
+
+  // Because we are dealing with a big-endian architecture, we need to offset
+  // loads of partial types.
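+  // For example, assuming a 4-byte stack slot, an i8 argument's byte lives in
+  // the last byte of the slot on this big-endian target, so its load address
+  // is the slot base + 3 (and + 2 for an i16).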
+ int Offset = VA.getLocMemOffset();
+ if (VA.getValVT() == MVT::i8) {
+ Offset += 3;
+ } else if (VA.getValVT() == MVT::i16) {
+ Offset += 2;
+ }
+
+ // TODO Interrupt handlers
+ // Calculate SP offset of interrupt parameter, re-arrange the slot normally
+ // taken by a return address.
+
+  // FIXME For now, all byval parameter objects are marked mutable. This can
+  // be changed with more analysis. In the case of tail call optimization,
+  // mark all arguments mutable, since they could be overwritten by the
+  // lowering of arguments in a tail call.
+ bool AlwaysUseMutable = shouldGuaranteeTCO(
+ CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
+ bool IsImmutable = !AlwaysUseMutable && !Flags.isByVal();
+
+ if (Flags.isByVal()) {
+ unsigned Bytes = Flags.getByValSize();
+ if (Bytes == 0)
+ Bytes = 1; // Don't create zero-sized stack objects.
+ int FI = MFI.CreateFixedObject(Bytes, Offset, IsImmutable);
+ // TODO Interrupt handlers
+ // Adjust SP offset of interrupt parameter.
+ return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ } else {
+ int FI =
+ MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, Offset, IsImmutable);
+
+ // Set SExt or ZExt flag.
+ if (VA.getLocInfo() == CCValAssign::ZExt) {
+ MFI.setObjectZExt(FI, true);
+ } else if (VA.getLocInfo() == CCValAssign::SExt) {
+ MFI.setObjectSExt(FI, true);
+ }
+
+ // TODO Interrupt handlers
+ // Adjust SP offset of interrupt parameter.
+
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ SDValue Val = DAG.getLoad(
+ ValVT, DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ return VA.isExtInLoc() ? DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val)
+ : Val;
+ }
+}
+
+SDValue M68kTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
+ SDValue Arg, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const {
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, DL);
+ PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(DAG.getDataLayout()),
+ StackPtr, PtrOff);
+ if (Flags.isByVal())
+ return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, DL);
+
+ return DAG.getStore(
+ Chain, DL, Arg, PtrOff,
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
+}
+
+//===----------------------------------------------------------------------===//
+// Call
+//===----------------------------------------------------------------------===//
+
+SDValue M68kTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &DL = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool &IsTailCall = CLI.IsTailCall;
+ bool IsVarArg = CLI.IsVarArg;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ StructReturnType SR = callIsStructReturn(Outs);
+ bool IsSibcall = false;
+ M68kMachineFunctionInfo *MFI = MF.getInfo<M68kMachineFunctionInfo>();
+ // const M68kRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ if (CallConv == CallingConv::M68k_INTR)
+ report_fatal_error("M68k interrupts may not be called directly");
+
+ auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
+ if (Attr.getValueAsBool())
+ IsTailCall = false;
+
+  // FIXME Add tail call support
+
+ bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
+ if (IsMustTail) {
+ // Force this to be a tail call. The verifier rules are enough to ensure
+ // that we can lower this successfully without moving the return address
+ // around.
+ IsTailCall = true;
+ } else if (IsTailCall) {
+ // Check if it's really possible to do a tail call.
+ IsTailCall = IsEligibleForTailCallOptimization(
+ Callee, CallConv, IsVarArg, SR != NotStructReturn,
+ MF.getFunction().hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins,
+ DAG);
+
+ // Sibcalls are automatically detected tailcalls which do not require
+ // ABI changes.
+ if (!MF.getTarget().Options.GuaranteedTailCallOpt && IsTailCall)
+ IsSibcall = true;
+
+ if (IsTailCall)
+ ++NumTailCalls;
+ }
+
+ assert(!(IsVarArg && canGuaranteeTCO(CallConv)) &&
+ "Var args not supported with calling convention fastcc");
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+  // CalleeFunc is null for libcalls.
+ const Function *CalleeFunc = CLI.CB ? CLI.CB->getCalledFunction() : nullptr;
+ M68kCCState CCInfo(*CalleeFunc, CallConv, IsVarArg, MF, ArgLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallOperands(Outs, CC_M68k);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+ if (IsSibcall) {
+    // This is a sibcall. The memory operands are available in the caller's
+    // own caller's stack.
+ NumBytes = 0;
+ } else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+ canGuaranteeTCO(CallConv)) {
+ NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+ }
+
+ int FPDiff = 0;
+ if (IsTailCall && !IsSibcall && !IsMustTail) {
+ // Lower arguments at fp - stackoffset + fpdiff.
+ unsigned NumBytesCallerPushed = MFI->getBytesToPopOnReturn();
+
+ FPDiff = NumBytesCallerPushed - NumBytes;
+
+    // Set the delta of movement of the return address stack slot, but only
+    // set it if the delta is greater than the previous delta.
+ if (FPDiff < MFI->getTCReturnAddrDelta())
+ MFI->setTCReturnAddrDelta(FPDiff);
+ }
+
+ unsigned NumBytesToPush = NumBytes;
+ unsigned NumBytesToPop = NumBytes;
+
+  // If we have an inalloca argument, all stack space has already been
+  // allocated for us and will be right at the top of the stack. We don't
+  // support multiple arguments passed in memory when using inalloca.
+ if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
+ NumBytesToPush = 0;
+ if (!ArgLocs.back().isMemLoc())
+ report_fatal_error("cannot use inalloca attribute on a register "
+ "parameter");
+ if (ArgLocs.back().getLocMemOffset() != 0)
+ report_fatal_error("any parameter with the inalloca attribute must be "
+ "the only memory argument");
+ }
+
+ if (!IsSibcall)
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
+ NumBytes - NumBytesToPush, DL);
+
+ SDValue RetFI;
+ // Load return address for tail calls.
+ if (IsTailCall && FPDiff)
+ Chain = EmitTailCallLoadRetAddr(DAG, RetFI, Chain, IsTailCall, FPDiff, DL);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ SDValue StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads. In the case
+  // of tail call optimization, arguments are handled later.
+ const M68kRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+ // Skip inalloca arguments, they have already been written.
+ if (Flags.isInAlloca())
+ continue;
+
+ CCValAssign &VA = ArgLocs[i];
+ EVT RegVT = VA.getLocVT();
+ SDValue Arg = OutVals[i];
+ bool IsByVal = Flags.isByVal();
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, RegVT, Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, RegVT, Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, RegVT, Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getBitcast(RegVT, Arg);
+ break;
+ case CCValAssign::Indirect: {
+ // Store the argument.
+ SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+ int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ Chain = DAG.getStore(
+ Chain, DL, Arg, SpillSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ Arg = SpillSlot;
+ break;
+ }
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else if (!IsSibcall && (!IsTailCall || IsByVal)) {
+ assert(VA.isMemLoc());
+ if (!StackPtr.getNode()) {
+ StackPtr = DAG.getCopyFromReg(Chain, DL, RegInfo->getStackRegister(),
+ getPointerTy(DAG.getDataLayout()));
+ }
+ MemOpChains.push_back(
+ LowerMemOpCallTo(Chain, StackPtr, Arg, DL, DAG, VA, Flags));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ // FIXME Make sure PIC style GOT works as expected
+  // The only time the GOT is really needed is for Medium-PIC static data;
+  // otherwise we are happy with pc-rel or static references.
+
+ if (IsVarArg && IsMustTail) {
+ const auto &Forwards = MFI->getForwardedMustTailRegParms();
+ for (const auto &F : Forwards) {
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
+ RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+ }
+ }
+
+ // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
+ // don't need this because the eligibility check rejects calls that require
+ // shuffling arguments passed in memory.
+ if (!IsSibcall && IsTailCall) {
+ // Force all the incoming stack arguments to be loaded from the stack
+ // before any new outgoing arguments are stored to the stack, because the
+ // outgoing stack slots may alias the incoming argument stack slots, and
+ // the alias isn't otherwise explicit. This is slightly more conservative
+ // than necessary, because it means that each store effectively depends
+ // on every argument instead of just those arguments it would clobber.
+ SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
+ SmallVector<SDValue, 8> MemOpChains2;
+ SDValue FIN;
+ int FI = 0;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc())
+ continue;
+ assert(VA.isMemLoc());
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Skip inalloca arguments. They don't require any work.
+ if (Flags.isInAlloca())
+ continue;
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset() + FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits() + 7) / 8;
+ FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
+ FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+
+ if (Flags.isByVal()) {
+ // Copy relative to framepointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), DL);
+ if (!StackPtr.getNode()) {
+ StackPtr = DAG.getCopyFromReg(Chain, DL, RegInfo->getStackRegister(),
+ getPointerTy(DAG.getDataLayout()));
+ }
+ Source = DAG.getNode(ISD::ADD, DL, getPointerTy(DAG.getDataLayout()),
+ StackPtr, Source);
+
+ MemOpChains2.push_back(
+ CreateCopyOfByValArgument(Source, FIN, ArgChain, Flags, DAG, DL));
+ } else {
+ // Store relative to framepointer.
+ MemOpChains2.push_back(DAG.getStore(
+ ArgChain, DL, Arg, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
+ }
+ }
+
+ if (!MemOpChains2.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains2);
+
+ // Store the return address to the appropriate stack slot.
+ Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetFI,
+ getPointerTy(DAG.getDataLayout()),
+ Subtarget.getSlotSize(), FPDiff, DL);
+ }
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ if (Callee->getOpcode() == ISD::GlobalAddress) {
+ // If the callee is a GlobalAddress node (quite common, every direct call
+ // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
+ // it.
+ GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
+
+    // We should use an extra load for direct calls to dllimported functions
+    // in non-JIT mode.
+ const GlobalValue *GV = G->getGlobal();
+ if (!GV->hasDLLImportStorageClass()) {
+ unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
+
+ Callee = DAG.getTargetGlobalAddress(
+ GV, DL, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
+
+ if (OpFlags == M68kII::MO_GOTPCREL) {
+
+ // Add a wrapper.
+ Callee = DAG.getNode(M68kISD::WrapperPC, DL,
+ getPointerTy(DAG.getDataLayout()), Callee);
+
+ // Add extra indirection
+ Callee = DAG.getLoad(
+ getPointerTy(DAG.getDataLayout()), DL, DAG.getEntryNode(), Callee,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ }
+ }
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
+ unsigned char OpFlags =
+ Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
+
+ Callee = DAG.getTargetExternalSymbol(
+ S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+
+ if (!IsSibcall && IsTailCall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytesToPop, DL, true),
+ DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
+
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (IsTailCall)
+ Ops.push_back(DAG.getConstant(FPDiff, DL, MVT::i32));
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ if (IsTailCall) {
+ MF.getFrameInfo().setHasTailCall();
+ return DAG.getNode(M68kISD::TC_RETURN, DL, NodeTys, Ops);
+ }
+
+ Chain = DAG.getNode(M68kISD::CALL, DL, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ unsigned NumBytesForCalleeToPop;
+ if (M68k::isCalleePop(CallConv, IsVarArg,
+ DAG.getTarget().Options.GuaranteedTailCallOpt)) {
+ NumBytesForCalleeToPop = NumBytes; // Callee pops everything
+ } else if (!canGuaranteeTCO(CallConv) && SR == StackStructReturn) {
+ // If this is a call to a struct-return function, the callee
+ // pops the hidden struct pointer, so we have to push it back.
+ NumBytesForCalleeToPop = 4;
+ } else {
+ NumBytesForCalleeToPop = 0; // Callee pops nothing.
+ }
+
+ if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
+    // No need to reset the stack after the call if the call doesn't return.
+    // To keep the MachineInstr verifier happy, we'll pretend the callee does
+    // it for us.
+ NumBytesForCalleeToPop = NumBytes;
+ }
+
+ // Returns a flag for retval copy to use.
+ if (!IsSibcall) {
+ Chain = DAG.getCALLSEQ_END(
+ Chain, DAG.getIntPtrConstant(NumBytesToPop, DL, true),
+ DAG.getIntPtrConstant(NumBytesForCalleeToPop, DL, true), InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals);
+}
+
+SDValue M68kTargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_M68k);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ EVT CopyVT = VA.getLocVT();
+
+    // ??? Is this correct?
+ Chain = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), CopyVT, InFlag)
+ .getValue(1);
+ SDValue Val = Chain.getValue(0);
+
+ if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+
+ InFlag = Chain.getValue(2);
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+SDValue M68kTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CCID, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ M68kMachineFunctionInfo *MMFI = MF.getInfo<M68kMachineFunctionInfo>();
+ // const TargetFrameLowering &TFL = *Subtarget.getFrameLowering();
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ M68kCCState CCInfo(MF.getFunction(), CCID, IsVarArg, MF, ArgLocs,
+ *DAG.getContext());
+
+ CCInfo.AnalyzeFormalArguments(Ins, CC_M68k);
+
+ unsigned LastVal = ~0U;
+ SDValue ArgValue;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ assert(VA.getValNo() != LastVal && "Same value in different locations");
+
+ LastVal = VA.getValNo();
+
+ if (VA.isRegLoc()) {
+ EVT RegVT = VA.getLocVT();
+ const TargetRegisterClass *RC;
+ if (RegVT == MVT::i32)
+ RC = &M68k::XR32RegClass;
+ else
+ llvm_unreachable("Unknown argument type!");
+
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt) {
+ ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ } else if (VA.getLocInfo() == CCValAssign::ZExt) {
+ ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ } else if (VA.getLocInfo() == CCValAssign::BCvt) {
+ ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
+ }
+
+ if (VA.isExtInLoc()) {
+ ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
+ }
+ } else {
+ assert(VA.isMemLoc());
+ ArgValue = LowerMemArgument(Chain, CCID, Ins, DL, DAG, VA, MFI, i);
+ }
+
+    // If the value is passed via a pointer, do a load.
+    // TODO Make sure this handling of indirect arguments is correct
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ ArgValue =
+ DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
+
+ InVals.push_back(ArgValue);
+ }
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    // The Swift calling convention does not require that we copy the sret
+    // argument into %D0 for the return. We don't set SRetReturnReg for Swift.
+ if (CCID == CallingConv::Swift)
+ continue;
+
+    // The ABI requires that, for returning structs by value, we copy the sret
+    // argument into %D0 for the return. Save the argument into a virtual
+    // register so that we can access it from the return points.
+ if (Ins[i].Flags.isSRet()) {
+ unsigned Reg = MMFI->getSRetReturnReg();
+ if (!Reg) {
+ MVT PtrTy = getPointerTy(DAG.getDataLayout());
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+ MMFI->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[i]);
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
+ break;
+ }
+ }
+
+ unsigned StackSize = CCInfo.getNextStackOffset();
+ // Align stack specially for tail calls.
+ if (shouldGuaranteeTCO(CCID, MF.getTarget().Options.GuaranteedTailCallOpt))
+ StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
+
+  // If the function takes a variable number of arguments, make a frame index
+  // for the start of the first vararg value... for expansion of llvm.va_start.
+  // We can skip this if there are no va_start calls.
+ if (MFI.hasVAStart()) {
+ MMFI->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
+ }
+
+ if (IsVarArg && MFI.hasMustTailInVarArgFunc()) {
+ // We forward some GPRs and some vector types.
+ SmallVector<MVT, 2> RegParmTypes;
+ MVT IntVT = MVT::i32;
+ RegParmTypes.push_back(IntVT);
+
+ // Compute the set of forwarded registers. The rest are scratch.
+ // ??? what is this for?
+ SmallVectorImpl<ForwardedRegister> &Forwards =
+ MMFI->getForwardedMustTailRegParms();
+ CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_M68k);
+
+ // Copy all forwards from physical to virtual registers.
+ for (ForwardedRegister &F : Forwards) {
+ // FIXME Can we use a less constrained schedule?
+ SDValue RegVal = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
+ F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
+ Chain = DAG.getCopyToReg(Chain, DL, F.VReg, RegVal);
+ }
+ }
+
+ // Some CCs need callee pop.
+ if (M68k::isCalleePop(CCID, IsVarArg,
+ MF.getTarget().Options.GuaranteedTailCallOpt)) {
+ MMFI->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+ } else {
+ MMFI->setBytesToPopOnReturn(0); // Callee pops nothing.
+ // If this is an sret function, the return should pop the hidden pointer.
+ if (!canGuaranteeTCO(CCID) && argsAreStructReturn(Ins) == StackStructReturn)
+ MMFI->setBytesToPopOnReturn(4);
+ }
+
+ MMFI->setArgumentStackSize(StackSize);
+
+ return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+SDValue
+M68kTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CCID,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ M68kMachineFunctionInfo *MFI = MF.getInfo<M68kMachineFunctionInfo>();
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CCID, IsVarArg, MF, RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_M68k);
+
+ SDValue Flag;
+ SmallVector<SDValue, 6> RetOps;
+ // Operand #0 = Chain (updated below)
+ RetOps.push_back(Chain);
+ // Operand #1 = Bytes To Pop
+ RetOps.push_back(
+ DAG.getTargetConstant(MFI->getBytesToPopOnReturn(), DL, MVT::i32));
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ SDValue ValToCopy = OutVals[i];
+ EVT ValVT = ValToCopy.getValueType();
+
+ // Promote values to the appropriate types.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::AExt) {
+ if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
+ ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), ValToCopy);
+ else
+ ValToCopy = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), ValToCopy);
+ } else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), ValToCopy, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+  // The Swift calling convention does not require that we copy the sret
+  // argument into %D0 for the return, and SRetReturnReg is not set for Swift.
+
+  // The ABI requires that, for returning structs by value, we copy the sret
+  // argument into %D0 for the return. Save the argument into a virtual
+  // register so that we can access it from the return points.
+ //
+ // Checking Function.hasStructRetAttr() here is insufficient because the IR
+ // may not have an explicit sret argument. If MFI.CanLowerReturn is
+ // false, then an sret argument may be implicitly inserted in the SelDAG. In
+ // either case MFI->setSRetReturnReg() will have been called.
+ if (unsigned SRetReg = MFI->getSRetReturnReg()) {
+    // ??? Can I just move this to the top and escape this explanation?
+ // When we have both sret and another return value, we should use the
+ // original Chain stored in RetOps[0], instead of the current Chain updated
+ // in the above loop. If we only have sret, RetOps[0] equals to Chain.
+
+ // For the case of sret and another return value, we have
+ // Chain_0 at the function entry
+ // Chain_1 = getCopyToReg(Chain_0) in the above loop
+ // If we use Chain_1 in getCopyFromReg, we will have
+ // Val = getCopyFromReg(Chain_1)
+ // Chain_2 = getCopyToReg(Chain_1, Val) from below
+
+ // getCopyToReg(Chain_0) will be glued together with
+ // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
+ // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
+ // Data dependency from Unit B to Unit A due to usage of Val in
+ // getCopyToReg(Chain_1, Val)
+ // Chain dependency from Unit A to Unit B
+
+  // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
+ SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
+ getPointerTy(MF.getDataLayout()));
+
+ // ??? How will this work if CC does not use registers for args passing?
+ // ??? What if I return multiple structs?
+ unsigned RetValReg = M68k::D0;
+ Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
+ Flag = Chain.getValue(1);
+
+ RetOps.push_back(
+ DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(M68kISD::RET, DL, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like the std call convention, the callee cleans up the arguments, except
+// that ECX is reserved for storing the tail-called function address. Only two
+// registers are free for argument passing (inreg). Tail call optimization is
+// performed provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// On the M68k_64 architecture with GOT-style position independent code, only
+// local (within module) calls are supported at the moment. To keep the stack
+// aligned according to the platform ABI, the function
+// GetAlignedArgumentStackSize ensures that the argument delta is always a
+// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
+// for example.) If a tail-called callee has more arguments than the caller,
+// the caller needs to make sure there is room to move the RETADDR to. This is
+// achieved by reserving an area the size of the argument delta right after
+// the original RETADDR, but before the saved framepointer or the spilled
+// registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
+// with stack layout:
+// arg1
+// arg2
+// RETADDR
+// [ new RETADDR
+// move area ]
+// (possible EBP)
+// ESI
+// EDI
+// local1 ..
+
+/// Align the stack size, e.g. to 16n + 12, to satisfy a 16-byte alignment
+/// requirement.
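+/// For example, with a 16-byte stack alignment and 4-byte slots, a 10-byte
+/// argument area is padded to 12 bytes and a 14-byte one to 28 bytes, so that
+/// pushing the 4-byte return address keeps the stack 16-byte aligned.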
+unsigned
+M68kTargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG &DAG) const {
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ uint64_t AlignMask = StackAlignment - 1;
+ int64_t Offset = StackSize;
+ unsigned SlotSize = Subtarget.getSlotSize();
+ if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
+ // Number smaller than 12 so just add the difference.
+ Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+ } else {
+ // Mask out lower bits, add stackalignment once plus the 12 bytes.
+ Offset =
+ ((~AlignMask) & Offset) + StackAlignment + (StackAlignment - SlotSize);
+ }
+ return Offset;
+}
+
+/// Check whether the call is eligible for tail call optimization. Targets
+/// that want to do tail call optimization should implement this function.
+bool M68kTargetLowering::IsEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
+ bool IsCalleeStructRet, bool IsCallerStructRet, Type *RetTy,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+ if (!mayTailCallThisCC(CalleeCC))
+ return false;
+
+ // If -tailcallopt is specified, make fastcc functions tail-callable.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const auto &CallerF = MF.getFunction();
+
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+ bool CCMatch = CallerCC == CalleeCC;
+
+ if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+ if (canGuaranteeTCO(CalleeCC) && CCMatch)
+ return true;
+ return false;
+ }
+
+ // Look for obvious safe cases to perform tail call optimization that do not
+ // require ABI changes. This is what gcc calls sibcall.
+
+ // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
+ // emit a special epilogue.
+ const M68kRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ if (RegInfo->hasStackRealignment(MF))
+ return false;
+
+ // Also avoid sibcall optimization if either caller or callee uses struct
+ // return semantics.
+ if (IsCalleeStructRet || IsCallerStructRet)
+ return false;
+
+ // Do not sibcall optimize vararg calls unless all arguments are passed via
+ // registers.
+ LLVMContext &C = *DAG.getContext();
+ if (IsVarArg && !Outs.empty()) {
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_M68k);
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+ if (!ArgLocs[i].isRegLoc())
+ return false;
+ }
+
+ // Check that the call results are passed in the same way.
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, RetCC_M68k,
+ RetCC_M68k))
+ return false;
+
+ // The callee has to preserve all registers the caller needs to preserve.
+ const M68kRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ if (!CCMatch) {
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+ return false;
+ }
+
+ unsigned StackArgsSize = 0;
+
+ // If the callee takes no arguments then go on to check the results of the
+ // call.
+ if (!Outs.empty()) {
+ // Check if stack adjustment is needed. For now, do not do this if any
+ // argument is passed on the stack.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_M68k);
+ StackArgsSize = CCInfo.getNextStackOffset();
+
+ if (CCInfo.getNextStackOffset()) {
+ // Check if the arguments are already laid out in the right way as
+ // the caller's fixed stack objects.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const M68kInstrInfo *TII = Subtarget.getInstrInfo();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ return false;
+ if (!VA.isRegLoc()) {
+ if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI,
+ TII, VA))
+ return false;
+ }
+ }
+ }
+
+ bool PositionIndependent = isPositionIndependent();
+ // If the tailcall address may be in a register, then make sure it's
+ // possible to register allocate for it. The call address can
+ // only target %A0 or %A1 since the tail call must be scheduled after
+ // callee-saved registers are restored. These happen to be the same
+ // registers used to pass 'inreg' arguments so watch out for those.
+ if ((!isa<GlobalAddressSDNode>(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee)) ||
+ PositionIndependent) {
+ unsigned NumInRegs = 0;
+ // In PIC we need an extra register to formulate the address computation
+ // for the callee.
+ unsigned MaxInRegs = PositionIndependent ? 1 : 2;
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (!VA.isRegLoc())
+ continue;
+ unsigned Reg = VA.getLocReg();
+ switch (Reg) {
+ default:
+ break;
+ case M68k::A0:
+ case M68k::A1:
+ if (++NumInRegs == MaxInRegs)
+ return false;
+ break;
+ }
+ }
+ }
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+ return false;
+ }
+
+ bool CalleeWillPop = M68k::isCalleePop(
+ CalleeCC, IsVarArg, MF.getTarget().Options.GuaranteedTailCallOpt);
+
+ if (unsigned BytesToPop =
+ MF.getInfo<M68kMachineFunctionInfo>()->getBytesToPopOnReturn()) {
+ // If we have bytes to pop, the callee must pop them.
+ bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
+ if (!CalleePopMatches)
+ return false;
+ } else if (CalleeWillPop && StackArgsSize > 0) {
+ // If we don't have bytes to pop, make sure the callee doesn't pop any.
+ return false;
+ }
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom Lower
+//===----------------------------------------------------------------------===//
+
+SDValue M68kTargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Should not custom lower this!");
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO:
+ return LowerXALUO(Op, DAG);
+ case ISD::SETCC:
+ return LowerSETCC(Op, DAG);
+ case ISD::SETCCCARRY:
+ return LowerSETCCCARRY(Op, DAG);
+ case ISD::SELECT:
+ return LowerSELECT(Op, DAG);
+ case ISD::BRCOND:
+ return LowerBRCOND(Op, DAG);
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUBC:
+ case ISD::SUBE:
+ return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+ case ISD::ConstantPool:
+ return LowerConstantPool(Op, DAG);
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::ExternalSymbol:
+ return LowerExternalSymbol(Op, DAG);
+ case ISD::BlockAddress:
+ return LowerBlockAddress(Op, DAG);
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ }
+}
+
+bool M68kTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const {
+  // Shift and add instructions on the M68000 and M68010 support operands of
+  // up to 32 bits, but mul only has a 16-bit variant. So it's almost
+  // certainly beneficial to lower 8/16/32-bit mul to add/shift counterparts.
+  // But for 64-bit mul, it might be safer to just leave it to compiler
+  // runtime implementations.
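+  // For example, with this hook returning true the generic combiner can turn
+  // x * 5 into (x << 2) + x instead of emitting a multiply or a libcall.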
+ return VT.bitsLE(MVT::i32) || Subtarget.atLeastM68020();
+}
+
+SDValue M68kTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
+  // Lower the "add/sub/mul with overflow" instruction into a regular
+  // instruction plus a "setcc" instruction that checks the overflow flag. The
+  // "brcond" lowering looks for this combination and may remove the "setcc"
+  // instruction if the "setcc" has only one use.
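+  // For example, (i32 (saddo a, b)) becomes an M68kISD::ADD that also
+  // produces CCR, followed by an M68kISD::SETCC on COND_VS that extracts the
+  // overflow bit.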
+ SDNode *N = Op.getNode();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ unsigned BaseOp = 0;
+ unsigned Cond = 0;
+ SDLoc DL(Op);
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown ovf instruction!");
+ case ISD::SADDO:
+ BaseOp = M68kISD::ADD;
+ Cond = M68k::COND_VS;
+ break;
+ case ISD::UADDO:
+ BaseOp = M68kISD::ADD;
+ Cond = M68k::COND_CS;
+ break;
+ case ISD::SSUBO:
+ BaseOp = M68kISD::SUB;
+ Cond = M68k::COND_VS;
+ break;
+ case ISD::USUBO:
+ BaseOp = M68kISD::SUB;
+ Cond = M68k::COND_CS;
+ break;
+ }
+
+ // Also sets CCR.
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i8);
+ SDValue Arith = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
+ SDValue SetCC = DAG.getNode(M68kISD::SETCC, DL, N->getValueType(1),
+ DAG.getConstant(Cond, DL, MVT::i8),
+ SDValue(Arith.getNode(), 1));
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Arith, SetCC);
+}
+
+/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
+/// according to equal/not-equal condition code \p CC.
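+/// For example, LowerSETCC below uses this to turn (X & (1 << N)) == 0 into
+/// an M68kISD::BT of X and N followed by an M68kISD::SETCC that reads CCR.Z.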
+static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
+ const SDLoc &DL, SelectionDAG &DAG) {
+  // If Src is i8 or i16, promote it to i32 with any_extend; the bit test is
+  // done on the i32 value. Since the shift amount is in-range-or-undefined,
+  // we know that doing a bit test on the i32 value is ok.
+ if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
+ Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
+
+ // If the operand types disagree, extend the shift amount to match. Since
+ // BT ignores high bits (like shifts) we can use anyextend.
+ if (Src.getValueType() != BitNo.getValueType())
+ BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
+
+ SDValue BT = DAG.getNode(M68kISD::BT, DL, MVT::i32, Src, BitNo);
+
+ // NOTE BTST sets CCR.Z flag
+ M68k::CondCode Cond = CC == ISD::SETEQ ? M68k::COND_NE : M68k::COND_EQ;
+ return DAG.getNode(M68kISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(Cond, DL, MVT::i8), BT);
+}
+
+/// Result of 'and' is compared against zero. Change to a BT node if possible.
+static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &DL,
+ SelectionDAG &DAG) {
+ SDValue Op0 = And.getOperand(0);
+ SDValue Op1 = And.getOperand(1);
+ if (Op0.getOpcode() == ISD::TRUNCATE)
+ Op0 = Op0.getOperand(0);
+ if (Op1.getOpcode() == ISD::TRUNCATE)
+ Op1 = Op1.getOperand(0);
+
+ SDValue LHS, RHS;
+ if (Op1.getOpcode() == ISD::SHL)
+ std::swap(Op0, Op1);
+ if (Op0.getOpcode() == ISD::SHL) {
+ if (isOneConstant(Op0.getOperand(0))) {
+ // If we looked past a truncate, check that it's only truncating away
+ // known zeros.
+ unsigned BitWidth = Op0.getValueSizeInBits();
+ unsigned AndBitWidth = And.getValueSizeInBits();
+ if (BitWidth > AndBitWidth) {
+ auto Known = DAG.computeKnownBits(Op0);
+ if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
+ return SDValue();
+ }
+ LHS = Op1;
+ RHS = Op0.getOperand(1);
+ }
+ } else if (auto *AndRHS = dyn_cast<ConstantSDNode>(Op1)) {
+ uint64_t AndRHSVal = AndRHS->getZExtValue();
+ SDValue AndLHS = Op0;
+
+ if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
+ LHS = AndLHS.getOperand(0);
+ RHS = AndLHS.getOperand(1);
+ }
+
+ // Use BT if the immediate can't be encoded in a TEST instruction.
+ if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
+ LHS = AndLHS;
+ RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), DL, LHS.getValueType());
+ }
+ }
+
+ if (LHS.getNode())
+ return getBitTestCondition(LHS, RHS, CC, DL, DAG);
+
+ return SDValue();
+}
+
+static M68k::CondCode TranslateIntegerM68kCC(ISD::CondCode SetCCOpcode) {
+ switch (SetCCOpcode) {
+ default:
+ llvm_unreachable("Invalid integer condition!");
+ case ISD::SETEQ:
+ return M68k::COND_EQ;
+ case ISD::SETGT:
+ return M68k::COND_GT;
+ case ISD::SETGE:
+ return M68k::COND_GE;
+ case ISD::SETLT:
+ return M68k::COND_LT;
+ case ISD::SETLE:
+ return M68k::COND_LE;
+ case ISD::SETNE:
+ return M68k::COND_NE;
+ case ISD::SETULT:
+ return M68k::COND_CS;
+ case ISD::SETUGE:
+ return M68k::COND_CC;
+ case ISD::SETUGT:
+ return M68k::COND_HI;
+ case ISD::SETULE:
+ return M68k::COND_LS;
+ }
+}
+
+/// Do a one-to-one translation of a ISD::CondCode to the M68k-specific
+/// condition code, returning the condition code and the LHS/RHS of the
+/// comparison to make.
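+/// For example, an integer SETULT maps to COND_CS (carry set) and SETUGE to
+/// COND_CC (carry clear), matching the borrow produced by CMP/SUB.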
+static unsigned TranslateM68kCC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
+ bool IsFP, SDValue &LHS, SDValue &RHS,
+ SelectionDAG &DAG) {
+ if (!IsFP) {
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+ // X > -1 -> X == 0, jump !sign.
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
+ return M68k::COND_PL;
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ // X < 0 -> X == 0, jump on sign.
+ return M68k::COND_MI;
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+ // X < 1 -> X <= 0
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
+ return M68k::COND_LE;
+ }
+ }
+
+ return TranslateIntegerM68kCC(SetCCOpcode);
+ }
+
+ // First determine if it is required or is profitable to flip the operands.
+
+ // If LHS is a foldable load, but RHS is not, flip the condition.
+ if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) {
+ SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+ std::swap(LHS, RHS);
+ }
+
+ switch (SetCCOpcode) {
+ default:
+ break;
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ std::swap(LHS, RHS);
+ break;
+ }
+
+ // On a floating point condition, the flags are set as follows:
+ // ZF PF CF op
+ // 0 | 0 | 0 | X > Y
+ // 0 | 0 | 1 | X < Y
+ // 1 | 0 | 0 | X == Y
+ // 1 | 1 | 1 | unordered
+ switch (SetCCOpcode) {
+ default:
+ llvm_unreachable("Condcode should be pre-legalized away");
+ case ISD::SETUEQ:
+ case ISD::SETEQ:
+ return M68k::COND_EQ;
+ case ISD::SETOLT: // flipped
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ return M68k::COND_HI;
+ case ISD::SETOLE: // flipped
+ case ISD::SETOGE:
+ case ISD::SETGE:
+ return M68k::COND_CC;
+ case ISD::SETUGT: // flipped
+ case ISD::SETULT:
+ case ISD::SETLT:
+ return M68k::COND_CS;
+ case ISD::SETUGE: // flipped
+ case ISD::SETULE:
+ case ISD::SETLE:
+ return M68k::COND_LS;
+ case ISD::SETONE:
+ case ISD::SETNE:
+ return M68k::COND_NE;
+ case ISD::SETOEQ:
+ case ISD::SETUNE:
+ return M68k::COND_INVALID;
+ }
+}
+
+// Convert (truncate (srl X, N) to i1) to (bt X, N)
+static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC, const SDLoc &DL,
+ SelectionDAG &DAG) {
+
+ assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
+ "Expected TRUNCATE to i1 node");
+
+ if (Op.getOperand(0).getOpcode() != ISD::SRL)
+ return SDValue();
+
+ SDValue ShiftRight = Op.getOperand(0);
+ return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
+ CC, DL, DAG);
+}
+
+/// \brief Return true if \c Op has a use that doesn't just read flags.
+static bool hasNonFlagsUse(SDValue Op) {
+ for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
+ ++UI) {
+ SDNode *User = *UI;
+ unsigned UOpNo = UI.getOperandNo();
+ if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+      // Look past the truncate.
+ UOpNo = User->use_begin().getOperandNo();
+ User = *User->use_begin();
+ }
+
+ if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
+ !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
+ return true;
+ }
+ return false;
+}
+
+SDValue M68kTargetLowering::EmitTest(SDValue Op, unsigned M68kCC,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+
+ // CF and OF aren't always set the way we want. Determine which
+ // of these we need.
+ bool NeedCF = false;
+ bool NeedOF = false;
+ switch (M68kCC) {
+ default:
+ break;
+ case M68k::COND_HI:
+ case M68k::COND_CC:
+ case M68k::COND_CS:
+ case M68k::COND_LS:
+ NeedCF = true;
+ break;
+ case M68k::COND_GT:
+ case M68k::COND_GE:
+ case M68k::COND_LT:
+ case M68k::COND_LE:
+ case M68k::COND_VS:
+ case M68k::COND_VC: {
+    // Check if we really need to set the overflow flag. If NoSignedWrap is
+    // present, that is not actually needed.
+ switch (Op->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SHL: {
+ if (Op.getNode()->getFlags().hasNoSignedWrap())
+ break;
+ LLVM_FALLTHROUGH;
+ }
+ default:
+ NeedOF = true;
+ break;
+ }
+ break;
+ }
+ }
+ // See if we can use the CCR value from the operand instead of
+ // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+ // we prove that the arithmetic won't overflow, we can't use OF or CF.
+ if (Op.getResNo() != 0 || NeedOF || NeedCF) {
+ // Emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(M68kISD::CMP, DL, MVT::i8,
+ DAG.getConstant(0, DL, Op.getValueType()), Op);
+ }
+ unsigned Opcode = 0;
+ unsigned NumOperands = 0;
+
+ // Truncate operations may prevent the merge of the SETCC instruction
+ // and the arithmetic instruction before it. Attempt to truncate the operands
+ // of the arithmetic instruction and use a reduced bit-width instruction.
+ bool NeedTruncation = false;
+ SDValue ArithOp = Op;
+ if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+ SDValue Arith = Op->getOperand(0);
+ // Both the trunc and the arithmetic op need to have one user each.
+ if (Arith->hasOneUse())
+ switch (Arith.getOpcode()) {
+ default:
+ break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ NeedTruncation = true;
+ ArithOp = Arith;
+ }
+ }
+ }
+
+  // NOTE: In the code below we use ArithOp to hold the arithmetic operation,
+  // which may be the result of a cast. We use the variable 'Op', which is the
+  // non-casted value, when we check for possible users.
+ switch (ArithOp.getOpcode()) {
+ case ISD::ADD:
+ Opcode = M68kISD::ADD;
+ NumOperands = 2;
+ break;
+ case ISD::SHL:
+ case ISD::SRL:
+    // If we have a constant logical shift that's only used in a comparison
+    // against zero, turn it into an equivalent AND. This allows turning it
+    // into a TEST instruction later.
+ if ((M68kCC == M68k::COND_EQ || M68kCC == M68k::COND_NE) &&
+ Op->hasOneUse() && isa<ConstantSDNode>(Op->getOperand(1)) &&
+ !hasNonFlagsUse(Op)) {
+ EVT VT = Op.getValueType();
+ unsigned BitWidth = VT.getSizeInBits();
+ unsigned ShAmt = Op->getConstantOperandVal(1);
+ if (ShAmt >= BitWidth) // Avoid undefined shifts.
+ break;
+ APInt Mask = ArithOp.getOpcode() == ISD::SRL
+ ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
+ : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+ if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+ break;
+ Op = DAG.getNode(ISD::AND, DL, VT, Op->getOperand(0),
+ DAG.getConstant(Mask, DL, VT));
+ }
+ break;
+
+ case ISD::AND:
+ // If the primary 'and' result isn't used, don't bother using
+ // M68kISD::AND, because a TEST instruction will be better.
+ if (!hasNonFlagsUse(Op)) {
+ SDValue Op0 = ArithOp->getOperand(0);
+ SDValue Op1 = ArithOp->getOperand(1);
+ EVT VT = ArithOp.getValueType();
+ bool IsAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
+ bool IsLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
+
+ // But if we can combine this into an ANDN operation, then create an AND
+ // now and allow it to be pattern matched into an ANDN.
+ if (/*!Subtarget.hasBMI() ||*/ !IsAndn || !IsLegalAndnType)
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case ISD::SUB:
+ case ISD::OR:
+ case ISD::XOR:
+ // Due to the ISEL shortcoming noted above, be conservative if this op is
+ // likely to be selected as part of a load-modify-store instruction.
+ for (const auto *U : Op.getNode()->uses())
+ if (U->getOpcode() == ISD::STORE)
+ goto default_case;
+
+ // Otherwise use a regular CCR-setting instruction.
+ switch (ArithOp.getOpcode()) {
+ default:
+ llvm_unreachable("unexpected operator!");
+ case ISD::SUB:
+ Opcode = M68kISD::SUB;
+ break;
+ case ISD::XOR:
+ Opcode = M68kISD::XOR;
+ break;
+ case ISD::AND:
+ Opcode = M68kISD::AND;
+ break;
+ case ISD::OR:
+ Opcode = M68kISD::OR;
+ break;
+ }
+
+ NumOperands = 2;
+ break;
+ case M68kISD::ADD:
+ case M68kISD::SUB:
+ case M68kISD::OR:
+ case M68kISD::XOR:
+ case M68kISD::AND:
+ return SDValue(Op.getNode(), 1);
+ default:
+ default_case:
+ break;
+ }
+
+ // If we found that truncation is beneficial, perform the truncation and
+ // update 'Op'.
+ if (NeedTruncation) {
+ EVT VT = Op.getValueType();
+ SDValue WideVal = Op->getOperand(0);
+ EVT WideVT = WideVal.getValueType();
+ unsigned ConvertedOp = 0;
+ // Use a target machine opcode to prevent further DAGCombine
+ // optimizations that may separate the arithmetic operations
+ // from the setcc node.
+ switch (WideVal.getOpcode()) {
+ default:
+ break;
+ case ISD::ADD:
+ ConvertedOp = M68kISD::ADD;
+ break;
+ case ISD::SUB:
+ ConvertedOp = M68kISD::SUB;
+ break;
+ case ISD::AND:
+ ConvertedOp = M68kISD::AND;
+ break;
+ case ISD::OR:
+ ConvertedOp = M68kISD::OR;
+ break;
+ case ISD::XOR:
+ ConvertedOp = M68kISD::XOR;
+ break;
+ }
+
+ if (ConvertedOp) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
+ SDValue V0 = DAG.getNode(ISD::TRUNCATE, DL, VT, WideVal.getOperand(0));
+ SDValue V1 = DAG.getNode(ISD::TRUNCATE, DL, VT, WideVal.getOperand(1));
+ Op = DAG.getNode(ConvertedOp, DL, VT, V0, V1);
+ }
+ }
+ }
+
+ if (Opcode == 0) {
+ // Emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(M68kISD::CMP, DL, MVT::i8,
+ DAG.getConstant(0, DL, Op.getValueType()), Op);
+ }
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i8);
+ SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
+
+ SDValue New = DAG.getNode(Opcode, DL, VTs, Ops);
+ DAG.ReplaceAllUsesWith(Op, New);
+ return SDValue(New.getNode(), 1);
+}
+
+/// \brief Return true if the condition is an unsigned comparison operation.
+static bool isM68kCCUnsigned(unsigned M68kCC) {
+ switch (M68kCC) {
+ default:
+ llvm_unreachable("Invalid integer condition!");
+ case M68k::COND_EQ:
+ case M68k::COND_NE:
+ case M68k::COND_CS:
+ case M68k::COND_HI:
+ case M68k::COND_LS:
+ case M68k::COND_CC:
+ return true;
+ case M68k::COND_GT:
+ case M68k::COND_GE:
+ case M68k::COND_LT:
+ case M68k::COND_LE:
+ return false;
+ }
+}
+
+SDValue M68kTargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned M68kCC,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ if (isNullConstant(Op1))
+ return EmitTest(Op0, M68kCC, DL, DAG);
+
+ assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
+ "Unexpected comparison operation for MVT::i1 operands");
+
+ if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
+ Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+    // Only promote the compare up to i32 if it is a 16-bit operation with an
+    // immediate; 16-bit immediates are to be avoided.
+ if ((Op0.getValueType() == MVT::i16 &&
+ (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
+ !DAG.getMachineFunction().getFunction().hasMinSize()) {
+ unsigned ExtendOp =
+ isM68kCCUnsigned(M68kCC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ Op0 = DAG.getNode(ExtendOp, DL, MVT::i32, Op0);
+ Op1 = DAG.getNode(ExtendOp, DL, MVT::i32, Op1);
+ }
+ // Use SUB instead of CMP to enable CSE between SUB and CMP.
+ SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i8);
+ SDValue Sub = DAG.getNode(M68kISD::SUB, DL, VTs, Op0, Op1);
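+    // Only result 1 (the CCR value) is consumed by the caller; the data
+    // result exists solely so the node can CSE with a real subtraction.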
+ return SDValue(Sub.getNode(), 1);
+ }
+ return DAG.getNode(M68kISD::CMP, DL, MVT::i8, Op0, Op1);
+}
+
+/// Result of 'and' or 'trunc to i1' is compared against zero.
+/// Change to a BT node if possible.
+SDValue M68kTargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
+ const SDLoc &DL,
+ SelectionDAG &DAG) const {
+ if (Op.getOpcode() == ISD::AND)
+ return LowerAndToBT(Op, CC, DL, DAG);
+ if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
+ return LowerTruncateToBT(Op, CC, DL, DAG);
+ return SDValue();
+}
+
+SDValue M68kTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDLoc DL(Op);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+ // Optimize to BT if possible.
+ // Lower (X & (1 << N)) == 0 to BT(X, N).
+ // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+ // Lower ((X >>s N) & 1) != 0 to BT(X, N).
+ // Lower (trunc (X >> N) to i1) to BT(X, N).
+ if (Op0.hasOneUse() && isNullConstant(Op1) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (SDValue NewSetCC = LowerToBT(Op0, CC, DL, DAG)) {
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, NewSetCC);
+ return NewSetCC;
+ }
+ }
+
+ // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
+ // these.
+ if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+ // If the input is a setcc, then reuse the input setcc or use a new one with
+ // the inverted condition.
+ if (Op0.getOpcode() == M68kISD::SETCC) {
+ M68k::CondCode CCode = (M68k::CondCode)Op0.getConstantOperandVal(0);
+ bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
+ if (!Invert)
+ return Op0;
+
+ CCode = M68k::GetOppositeBranchCondition(CCode);
+ SDValue SetCC =
+ DAG.getNode(M68kISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CCode, DL, MVT::i8), Op0.getOperand(1));
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+ return SetCC;
+ }
+ }
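+  // For boolean (i1) operands, canonicalize the comparison into a test
+  // against zero: a compare against 1 becomes the inverted test against 0,
+  // and a non-constant RHS is folded in with an XOR first.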
+ if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (isOneConstant(Op1)) {
+ ISD::CondCode NewCC = ISD::GlobalISel::getSetCCInverse(CC, true);
+ return DAG.getSetCC(DL, VT, Op0, DAG.getConstant(0, DL, MVT::i1), NewCC);
+ }
+ if (!isNullConstant(Op1)) {
+ SDValue Xor = DAG.getNode(ISD::XOR, DL, MVT::i1, Op0, Op1);
+ return DAG.getSetCC(DL, VT, Xor, DAG.getConstant(0, DL, MVT::i1), CC);
+ }
+ }
+
+ bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
+ unsigned M68kCC = TranslateM68kCC(CC, DL, IsFP, Op0, Op1, DAG);
+ if (M68kCC == M68k::COND_INVALID)
+ return SDValue();
+
+ SDValue CCR = EmitCmp(Op0, Op1, M68kCC, DL, DAG);
+ return DAG.getNode(M68kISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(M68kCC, DL, MVT::i8), CCR);
+}
+
+SDValue M68kTargetLowering::LowerSETCCCARRY(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue Carry = Op.getOperand(2);
+ SDValue Cond = Op.getOperand(3);
+ SDLoc DL(Op);
+
+ assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
+ M68k::CondCode CC = TranslateIntegerM68kCC(cast<CondCodeSDNode>(Cond)->get());
+
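+  // Reconstitute the carry flag in CCR: adding all-ones to the boolean carry
+  // value overflows (sets the carry bit) exactly when that value is non-zero.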
+ EVT CarryVT = Carry.getValueType();
+ APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
+ Carry = DAG.getNode(M68kISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry,
+ DAG.getConstant(NegOne, DL, CarryVT));
+
+ SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+ SDValue Cmp =
+ DAG.getNode(M68kISD::SUBX, DL, VTs, LHS, RHS, Carry.getValue(1));
+
+ return DAG.getNode(M68kISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
+}
+
+/// Return true if opcode is a M68k logical comparison.
+static bool isM68kLogicalCmp(SDValue Op) {
+ unsigned Opc = Op.getNode()->getOpcode();
+ if (Opc == M68kISD::CMP)
+ return true;
+ if (Op.getResNo() == 1 &&
+ (Opc == M68kISD::ADD || Opc == M68kISD::SUB || Opc == M68kISD::ADDX ||
+ Opc == M68kISD::SUBX || Opc == M68kISD::SMUL || Opc == M68kISD::UMUL ||
+ Opc == M68kISD::OR || Opc == M68kISD::XOR || Opc == M68kISD::AND))
+ return true;
+
+ if (Op.getResNo() == 2 && Opc == M68kISD::UMUL)
+ return true;
+
+ return false;
+}
+
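+/// Return true if \p V is a truncate whose wider input has all bits above the
+/// truncated width known to be zero, so the truncate can safely be looked
+/// through when the value is only being tested against zero.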
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+ if (V.getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ SDValue VOp0 = V.getOperand(0);
+ unsigned InBits = VOp0.getValueSizeInBits();
+ unsigned Bits = V.getValueSizeInBits();
+ return DAG.MaskedValueIsZero(VOp0,
+ APInt::getHighBitsSet(InBits, InBits - Bits));
+}
+
+SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+ bool addTest = true;
+ SDValue Cond = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+ SDLoc DL(Op);
+ SDValue CC;
+
+ if (Cond.getOpcode() == ISD::SETCC) {
+ if (SDValue NewCond = LowerSETCC(Cond, DAG))
+ Cond = NewCond;
+ }
+
+ // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
+ // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
+ // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
+ // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
+ if (Cond.getOpcode() == M68kISD::SETCC &&
+ Cond.getOperand(1).getOpcode() == M68kISD::CMP &&
+ isNullConstant(Cond.getOperand(1).getOperand(0))) {
+ SDValue Cmp = Cond.getOperand(1);
+
+ unsigned CondCode =
+ cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+
+ if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+ (CondCode == M68k::COND_EQ || CondCode == M68k::COND_NE)) {
+ SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
+
+ SDValue CmpOp0 = Cmp.getOperand(1);
+ // Apply further optimizations for special cases
+ // (select (x != 0), -1, 0) -> neg & sbb
+ // (select (x == 0), 0, -1) -> neg & sbb
+ if (isNullConstant(Y) &&
+ (isAllOnesConstant(Op1) == (CondCode == M68k::COND_NE))) {
+
+ SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+
+ SDValue Neg =
+ DAG.getNode(M68kISD::SUB, DL, VTs,
+ DAG.getConstant(0, DL, CmpOp0.getValueType()), CmpOp0);
+
+ SDValue Res = DAG.getNode(M68kISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(M68k::COND_CS, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ return Res;
+ }
+
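+      // Comparing the operand against 1 sets the carry bit exactly when the
+      // operand is zero (borrow on x - 1); SETCC_CARRY then smears that bit
+      // into all-ones or all-zeros.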
+ Cmp = DAG.getNode(M68kISD::CMP, DL, MVT::i8,
+ DAG.getConstant(1, DL, CmpOp0.getValueType()), CmpOp0);
+
+ SDValue Res = // Res = 0 or -1.
+ DAG.getNode(M68kISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(M68k::COND_CS, DL, MVT::i8), Cmp);
+
+ if (isAllOnesConstant(Op1) != (CondCode == M68k::COND_EQ))
+ Res = DAG.getNOT(DL, Res, Res.getValueType());
+
+ if (!isNullConstant(Op2))
+ Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
+ return Res;
+ }
+ }
+
+ // Look past (and (setcc_carry (cmp ...)), 1).
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == M68kISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
+
+  // If the condition flag is set by an M68kISD::CMP, then use it as the
+  // condition-setting operand in place of the M68kISD::SETCC.
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == M68kISD::SETCC || CondOpcode == M68kISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+
+ bool IllegalFPCMov = false;
+
+ if ((isM68kLogicalCmp(Cmp) && !IllegalFPCMov) || Opc == M68kISD::BT) {
+ Cond = Cmp;
+ addTest = false;
+ }
+ } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ unsigned MxOpcode;
+ unsigned MxCond;
+ SDVTList VTs;
+ switch (CondOpcode) {
+ case ISD::UADDO:
+ MxOpcode = M68kISD::ADD;
+ MxCond = M68k::COND_CS;
+ break;
+ case ISD::SADDO:
+ MxOpcode = M68kISD::ADD;
+ MxCond = M68k::COND_VS;
+ break;
+ case ISD::USUBO:
+ MxOpcode = M68kISD::SUB;
+ MxCond = M68k::COND_CS;
+ break;
+ case ISD::SSUBO:
+ MxOpcode = M68kISD::SUB;
+ MxCond = M68k::COND_VS;
+ break;
+ case ISD::UMULO:
+ MxOpcode = M68kISD::UMUL;
+ MxCond = M68k::COND_VS;
+ break;
+ case ISD::SMULO:
+ MxOpcode = M68kISD::SMUL;
+ MxCond = M68k::COND_VS;
+ break;
+ default:
+ llvm_unreachable("unexpected overflowing operator");
+ }
+ if (CondOpcode == ISD::UMULO)
+ VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i32);
+ else
+ VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+ SDValue MxOp = DAG.getNode(MxOpcode, DL, VTs, LHS, RHS);
+
+ if (CondOpcode == ISD::UMULO)
+ Cond = MxOp.getValue(2);
+ else
+ Cond = MxOp.getValue(1);
+
+ CC = DAG.getConstant(MxCond, DL, MVT::i8);
+ addTest = false;
+ }
+
+ if (addTest) {
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // We know the result of AND is compared against zero. Try to match
+ // it to BT.
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
+ CC = NewSetCC.getOperand(0);
+ Cond = NewSetCC.getOperand(1);
+ addTest = false;
+ }
+ }
+ }
+
+ if (addTest) {
+ CC = DAG.getConstant(M68k::COND_NE, DL, MVT::i8);
+ Cond = EmitTest(Cond, M68k::COND_NE, DL, DAG);
+ }
+
+ // a < b ? -1 : 0 -> RES = ~setcc_carry
+ // a < b ? 0 : -1 -> RES = setcc_carry
+ // a >= b ? -1 : 0 -> RES = setcc_carry
+ // a >= b ? 0 : -1 -> RES = ~setcc_carry
+ if (Cond.getOpcode() == M68kISD::SUB) {
+ unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
+
+ if ((CondCode == M68k::COND_CC || CondCode == M68k::COND_CS) &&
+ (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+ (isNullConstant(Op1) || isNullConstant(Op2))) {
+ SDValue Res =
+ DAG.getNode(M68kISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(M68k::COND_CS, DL, MVT::i8), Cond);
+ if (isAllOnesConstant(Op1) != (CondCode == M68k::COND_CS))
+ return DAG.getNOT(DL, Res, Res.getValueType());
+ return Res;
+ }
+ }
+
+  // M68k doesn't have an i8 cmov. If both operands are the result of a
+  // truncate, widen the cmov and push the truncate through. This avoids
+  // introducing a new branch during isel and doesn't add any extensions.
+ if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE &&
+ Op2.getOpcode() == ISD::TRUNCATE) {
+ SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
+ if (T1.getValueType() == T2.getValueType() &&
+ // Blacklist CopyFromReg to avoid partial register stalls.
+ T1.getOpcode() != ISD::CopyFromReg &&
+ T2.getOpcode() != ISD::CopyFromReg) {
+ SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
+ SDValue Cmov = DAG.getNode(M68kISD::CMOV, DL, VTs, T2, T1, CC, Cond);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+ }
+ }
+
+ // M68kISD::CMOV means set the result (which is operand 1) to the RHS if
+ // condition is true.
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+ SDValue Ops[] = {Op2, Op1, CC, Cond};
+ return DAG.getNode(M68kISD::CMOV, DL, VTs, Ops);
+}
+
+/// Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes
+/// each of which has no other use apart from the AND / OR.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+ Opc = Op.getOpcode();
+ if (Opc != ISD::OR && Opc != ISD::AND)
+ return false;
+ return (M68k::IsSETCC(Op.getOperand(0).getOpcode()) &&
+ Op.getOperand(0).hasOneUse() &&
+ M68k::IsSETCC(Op.getOperand(1).getOpcode()) &&
+ Op.getOperand(1).hasOneUse());
+}
+
+/// Return true if node is an ISD::XOR of an M68kISD::SETCC and 1, where the
+/// SETCC node has a single use.
+static bool isXor1OfSetCC(SDValue Op) {
+ if (Op.getOpcode() != ISD::XOR)
+ return false;
+ if (isOneConstant(Op.getOperand(1)))
+ return Op.getOperand(0).getOpcode() == M68kISD::SETCC &&
+ Op.getOperand(0).hasOneUse();
+ return false;
+}
+
+SDValue M68kTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+ bool AddTest = true;
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Dest = Op.getOperand(2);
+ SDLoc DL(Op);
+ SDValue CC;
+ bool Inverted = false;
+
+ if (Cond.getOpcode() == ISD::SETCC) {
+ // Check for setcc([su]{add,sub}o == 0).
+ if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+ isNullConstant(Cond.getOperand(1)) &&
+ Cond.getOperand(0).getResNo() == 1 &&
+ (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
+ Cond.getOperand(0).getOpcode() == ISD::UADDO ||
+ Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
+ Cond.getOperand(0).getOpcode() == ISD::USUBO)) {
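+      // The branch is taken when the overflow/carry flag is clear, so record
+      // that the condition must be inverted once the node is lowered below.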
+ Inverted = true;
+ Cond = Cond.getOperand(0);
+ } else {
+ if (SDValue NewCond = LowerSETCC(Cond, DAG))
+ Cond = NewCond;
+ }
+ }
+
+  // Look past (and (setcc_carry (cmp ...)), 1).
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == M68kISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
+
+  // If the condition flag is set by an M68kISD::CMP, then use it as the
+  // condition-setting operand in place of the M68kISD::SETCC.
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == M68kISD::SETCC || CondOpcode == M68kISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+
+ if (isM68kLogicalCmp(Cmp) || Opc == M68kISD::BT) {
+ Cond = Cmp;
+ AddTest = false;
+ } else {
+ switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+ default:
+ break;
+ case M68k::COND_VS:
+ case M68k::COND_CS:
+ // These can only come from an arithmetic instruction with overflow,
+ // e.g. SADDO, UADDO.
+ Cond = Cond.getNode()->getOperand(1);
+ AddTest = false;
+ break;
+ }
+ }
+ }
+ CondOpcode = Cond.getOpcode();
+ if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ unsigned MxOpcode;
+ unsigned MxCond;
+ SDVTList VTs;
+    // Keep this in sync with LowerXALUO, otherwise we might create redundant
+    // instructions that can't be removed afterwards (e.g. a second
+    // M68kISD::ADD for the same operation).
+ switch (CondOpcode) {
+ case ISD::UADDO:
+ MxOpcode = M68kISD::ADD;
+ MxCond = M68k::COND_CS;
+ break;
+ case ISD::SADDO:
+ MxOpcode = M68kISD::ADD;
+ MxCond = M68k::COND_VS;
+ break;
+ case ISD::USUBO:
+ MxOpcode = M68kISD::SUB;
+ MxCond = M68k::COND_CS;
+ break;
+ case ISD::SSUBO:
+ MxOpcode = M68kISD::SUB;
+ MxCond = M68k::COND_VS;
+ break;
+ case ISD::UMULO:
+ MxOpcode = M68kISD::UMUL;
+ MxCond = M68k::COND_VS;
+ break;
+ case ISD::SMULO:
+ MxOpcode = M68kISD::SMUL;
+ MxCond = M68k::COND_VS;
+ break;
+ default:
+ llvm_unreachable("unexpected overflowing operator");
+ }
+
+ if (Inverted)
+ MxCond = M68k::GetOppositeBranchCondition((M68k::CondCode)MxCond);
+
+ if (CondOpcode == ISD::UMULO)
+ VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i8);
+ else
+ VTs = DAG.getVTList(LHS.getValueType(), MVT::i8);
+
+ SDValue MxOp = DAG.getNode(MxOpcode, DL, VTs, LHS, RHS);
+
+ if (CondOpcode == ISD::UMULO)
+ Cond = MxOp.getValue(2);
+ else
+ Cond = MxOp.getValue(1);
+
+ CC = DAG.getConstant(MxCond, DL, MVT::i8);
+ AddTest = false;
+ } else {
+ unsigned CondOpc;
+ if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
+ SDValue Cmp = Cond.getOperand(0).getOperand(1);
+ if (CondOpc == ISD::OR) {
+ // Also, recognize the pattern generated by an FCMP_UNE. We can emit
+ // two branches instead of an explicit OR instruction with a
+ // separate test.
+ if (Cmp == Cond.getOperand(1).getOperand(1) && isM68kLogicalCmp(Cmp)) {
+ CC = Cond.getOperand(0).getOperand(0);
+ Chain = DAG.getNode(M68kISD::BRCOND, DL, Op.getValueType(), Chain,
+ Dest, CC, Cmp);
+ CC = Cond.getOperand(1).getOperand(0);
+ Cond = Cmp;
+ AddTest = false;
+ }
+ } else { // ISD::AND
+ // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Cmp == Cond.getOperand(1).getOperand(1) && isM68kLogicalCmp(Cmp) &&
+ Op.getNode()->hasOneUse()) {
+ M68k::CondCode CCode =
+ (M68k::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = M68k::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, DL, MVT::i8);
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_OEQ.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+ Dest = FalseBB;
+
+ Chain = DAG.getNode(M68kISD::BRCOND, DL, Op.getValueType(), Chain,
+ Dest, CC, Cmp);
+ M68k::CondCode CCode =
+ (M68k::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+ CCode = M68k::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, DL, MVT::i8);
+ Cond = Cmp;
+ AddTest = false;
+ }
+ }
+ }
+ } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
+      // Recognize the xorb (setcc), 1 pattern; the xor inverts the condition.
+      // It should be transformed by the DAG combiner, except when the
+      // condition is set by an arithmetic-with-overflow node.
+ M68k::CondCode CCode =
+ (M68k::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = M68k::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, DL, MVT::i8);
+ Cond = Cond.getOperand(0).getOperand(1);
+ AddTest = false;
+ }
+ }
+
+ if (AddTest) {
+    // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // We know the result is compared against zero. Try to match it to BT.
+ if (Cond.hasOneUse()) {
+ if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
+ CC = NewSetCC.getOperand(0);
+ Cond = NewSetCC.getOperand(1);
+ AddTest = false;
+ }
+ }
+ }
+
+ if (AddTest) {
+ M68k::CondCode MxCond = Inverted ? M68k::COND_EQ : M68k::COND_NE;
+ CC = DAG.getConstant(MxCond, DL, MVT::i8);
+ Cond = EmitTest(Cond, MxCond, DL, DAG);
+ }
+ return DAG.getNode(M68kISD::BRCOND, DL, Op.getValueType(), Chain, Dest, CC,
+ Cond);
+}
+
+SDValue M68kTargetLowering::LowerADDC_ADDE_SUBC_SUBE(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT VT = Op.getNode()->getSimpleValueType(0);
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDVTList VTs = DAG.getVTList(VT, MVT::i8);
+
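+  // All four variants produce the value plus a CCR result; the extended
+  // (ADDE/SUBE) forms additionally consume the incoming carry as a third
+  // operand.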
+ unsigned Opc;
+ bool ExtraOp = false;
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Invalid code");
+ case ISD::ADDC:
+ Opc = M68kISD::ADD;
+ break;
+ case ISD::ADDE:
+ Opc = M68kISD::ADDX;
+ ExtraOp = true;
+ break;
+ case ISD::SUBC:
+ Opc = M68kISD::SUB;
+ break;
+ case ISD::SUBE:
+ Opc = M68kISD::SUBX;
+ ExtraOp = true;
+ break;
+ }
+
+ if (!ExtraOp)
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
+ Op.getOperand(2));
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the M68kISD::Wrapper node. Suppose N is
+// one of the above-mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form an addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
+SDValue M68kTargetLowering::LowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) const {
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+
+ // In PIC mode (unless we're in PCRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
+
+ unsigned WrapperKind = M68kISD::Wrapper;
+ if (M68kII::isPCRelGlobalReference(OpFlag)) {
+ WrapperKind = M68kISD::WrapperPC;
+ }
+
+ MVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetConstantPool(
+ CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
+
+ SDLoc DL(CP);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (M68kII::isGlobalRelativeToPICBase(OpFlag)) {
+ Result = DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(M68kISD::GLOBAL_BASE_REG, SDLoc(), PtrVT),
+ Result);
+ }
+
+ return Result;
+}
+
+SDValue M68kTargetLowering::LowerExternalSymbol(SDValue Op,
+ SelectionDAG &DAG) const {
+ const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+
+ // In PIC mode (unless we're in PCRel PIC mode) we add an offset to the
+ // global base reg.
+ const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
+ unsigned char OpFlag = Subtarget.classifyExternalReference(*Mod);
+
+ unsigned WrapperKind = M68kISD::Wrapper;
+ if (M68kII::isPCRelGlobalReference(OpFlag)) {
+ WrapperKind = M68kISD::WrapperPC;
+ }
+
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
+
+ SDLoc DL(Op);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (M68kII::isGlobalRelativeToPICBase(OpFlag)) {
+ Result = DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(M68kISD::GLOBAL_BASE_REG, SDLoc(), PtrVT),
+ Result);
+ }
+
+ // For symbols that require a load from a stub to get the address, emit the
+ // load.
+ if (M68kII::isGlobalStubReference(OpFlag)) {
+ Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ }
+
+ return Result;
+}
+
+SDValue M68kTargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned char OpFlags = Subtarget.classifyBlockAddressReference();
+ const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
+ SDLoc DL(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // Create the TargetBlockAddressAddress node.
+ SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
+
+ if (M68kII::isPCRelBlockReference(OpFlags)) {
+ Result = DAG.getNode(M68kISD::WrapperPC, DL, PtrVT, Result);
+ } else {
+ Result = DAG.getNode(M68kISD::Wrapper, DL, PtrVT, Result);
+ }
+
+ // With PIC, the address is actually $g + Offset.
+ if (M68kII::isGlobalRelativeToPICBase(OpFlags)) {
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(M68kISD::GLOBAL_BASE_REG, DL, PtrVT), Result);
+ }
+
+ return Result;
+}
+
+SDValue M68kTargetLowering::LowerGlobalAddress(const GlobalValue *GV,
+ const SDLoc &DL, int64_t Offset,
+ SelectionDAG &DAG) const {
+ unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // Create the TargetGlobalAddress node, folding in the constant
+ // offset if it is legal.
+ SDValue Result;
+ if (M68kII::isDirectGlobalReference(OpFlags)) {
+ Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Offset);
+ Offset = 0;
+ } else {
+ Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
+ }
+
+ if (M68kII::isPCRelGlobalReference(OpFlags))
+ Result = DAG.getNode(M68kISD::WrapperPC, DL, PtrVT, Result);
+ else
+ Result = DAG.getNode(M68kISD::Wrapper, DL, PtrVT, Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (M68kII::isGlobalRelativeToPICBase(OpFlags)) {
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(M68kISD::GLOBAL_BASE_REG, DL, PtrVT), Result);
+ }
+
+ // For globals that require a load from a stub to get the address, emit the
+ // load.
+ if (M68kII::isGlobalStubReference(OpFlags)) {
+ Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ }
+
+ // If there was a non-zero offset that we didn't fold, create an explicit
+ // addition for it.
+ if (Offset != 0) {
+ Result = DAG.getNode(ISD::ADD, DL, PtrVT, Result,
+ DAG.getConstant(Offset, DL, PtrVT));
+ }
+
+ return Result;
+}
+
+SDValue M68kTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+ return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// Custom Lower Jump Table
+//===----------------------------------------------------------------------===//
+
+SDValue M68kTargetLowering::LowerJumpTable(SDValue Op,
+ SelectionDAG &DAG) const {
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+ // In PIC mode (unless we're in PCRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
+
+ unsigned WrapperKind = M68kISD::Wrapper;
+ if (M68kII::isPCRelGlobalReference(OpFlag)) {
+ WrapperKind = M68kISD::WrapperPC;
+ }
+
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
+ SDLoc DL(JT);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (M68kII::isGlobalRelativeToPICBase(OpFlag)) {
+ Result = DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(M68kISD::GLOBAL_BASE_REG, SDLoc(), PtrVT),
+ Result);
+ }
+
+ return Result;
+}
+
+unsigned M68kTargetLowering::getJumpTableEncoding() const {
+ return Subtarget.getJumpTableEncoding();
+}
+
+const MCExpr *M68kTargetLowering::LowerCustomJumpTableEntry(
+ const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
+ unsigned uid, MCContext &Ctx) const {
+ return MCSymbolRefExpr::create(MBB->getSymbol(), MCSymbolRefExpr::VK_GOTOFF,
+ Ctx);
+}
+
+SDValue M68kTargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ if (getJumpTableEncoding() == MachineJumpTableInfo::EK_Custom32)
+ return DAG.getNode(M68kISD::GLOBAL_BASE_REG, SDLoc(),
+ getPointerTy(DAG.getDataLayout()));
+
+ // MachineJumpTableInfo::EK_LabelDifference32 entry
+ return Table;
+}
+
+// NOTE: This is only used for MachineJumpTableInfo::EK_LabelDifference32
+// entries.
+const MCExpr *M68kTargetLowering::getPICJumpTableRelocBaseExpr(
+ const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const {
+ return MCSymbolRefExpr::create(MF->getJTISymbol(JTI, Ctx), Ctx);
+}
+
+M68kTargetLowering::ConstraintType
+M68kTargetLowering::getConstraintType(StringRef Constraint) const {
+ if (Constraint.size() > 0) {
+ switch (Constraint[0]) {
+ case 'a':
+ case 'd':
+ return C_RegisterClass;
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'O':
+ case 'P':
+ return C_Immediate;
+ case 'C':
+ if (Constraint.size() == 2)
+ switch (Constraint[1]) {
+ case '0':
+ case 'i':
+ case 'j':
+ return C_Immediate;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+void M68kTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result;
+
+ if (Constraint.size() == 1) {
+ // Constant constraints
+ switch (Constraint[0]) {
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'O':
+ case 'P': {
+ auto *C = dyn_cast<ConstantSDNode>(Op);
+ if (!C)
+ return;
+
+ int64_t Val = C->getSExtValue();
+ switch (Constraint[0]) {
+ case 'I': // constant integer in the range [1,8]
+ if (Val > 0 && Val <= 8)
+ break;
+ return;
+ case 'J': // constant signed 16-bit integer
+ if (isInt<16>(Val))
+ break;
+ return;
+ case 'K': // constant that is NOT in the range of [-0x80, 0x80)
+ if (Val < -0x80 || Val >= 0x80)
+ break;
+ return;
+ case 'L': // constant integer in the range [-8,-1]
+ if (Val < 0 && Val >= -8)
+ break;
+ return;
+    case 'M': // constant that is NOT in the range of [-0x100, 0x100)
+ if (Val < -0x100 || Val >= 0x100)
+ break;
+ return;
+ case 'N': // constant integer in the range [24,31]
+ if (Val >= 24 && Val <= 31)
+ break;
+ return;
+ case 'O': // constant integer 16
+ if (Val == 16)
+ break;
+ return;
+ case 'P': // constant integer in the range [8,15]
+ if (Val >= 8 && Val <= 15)
+ break;
+ return;
+ default:
+ llvm_unreachable("Unhandled constant constraint");
+ }
+
+ Result = DAG.getTargetConstant(Val, SDLoc(Op), Op.getValueType());
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (Constraint.size() == 2) {
+ switch (Constraint[0]) {
+ case 'C':
+ // Constant constraints start with 'C'
+ switch (Constraint[1]) {
+ case '0':
+ case 'i':
+ case 'j': {
+ auto *C = dyn_cast<ConstantSDNode>(Op);
+ if (!C)
+ break;
+
+ int64_t Val = C->getSExtValue();
+ switch (Constraint[1]) {
+ case '0': // constant integer 0
+ if (!Val)
+ break;
+ return;
+ case 'i': // constant integer
+ break;
+ case 'j': // integer constant that doesn't fit in 16 bits
+ if (!isInt<16>(C->getSExtValue()))
+ break;
+ return;
+ default:
+ llvm_unreachable("Unhandled constant constraint");
+ }
+
+ Result = DAG.getTargetConstant(Val, SDLoc(Op), Op.getValueType());
+ break;
+ }
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+M68kTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'r':
+ case 'd':
+ switch (VT.SimpleTy) {
+ case MVT::i8:
+ return std::make_pair(0U, &M68k::DR8RegClass);
+ case MVT::i16:
+ return std::make_pair(0U, &M68k::DR16RegClass);
+ case MVT::i32:
+ return std::make_pair(0U, &M68k::DR32RegClass);
+ default:
+ break;
+ }
+ break;
+ case 'a':
+ switch (VT.SimpleTy) {
+ case MVT::i16:
+ return std::make_pair(0U, &M68k::AR16RegClass);
+ case MVT::i32:
+ return std::make_pair(0U, &M68k::AR32RegClass);
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+/// Determines whether the callee is required to pop its own arguments.
+/// Callee pop is necessary to support tail calls.
+bool M68k::isCalleePop(CallingConv::ID CallingConv, bool IsVarArg,
+ bool GuaranteeTCO) {
+ return false;
+}
+
+// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
+// together with other CMOV pseudo-opcodes into a single basic-block with
+// conditional jump around it.
+static bool isCMOVPseudo(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case M68k::CMOV8d:
+ case M68k::CMOV16d:
+ case M68k::CMOV32r:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+// The CCR operand of SelectItr might be missing a kill marker
+// because there were multiple uses of CCR, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateCCRKill(MachineBasicBlock::iterator SelectItr,
+ MachineBasicBlock *BB,
+ const TargetRegisterInfo *TRI) {
+ // Scan forward through BB for a use/def of CCR.
+ MachineBasicBlock::iterator miI(std::next(SelectItr));
+ for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+ const MachineInstr &mi = *miI;
+ if (mi.readsRegister(M68k::CCR))
+ return false;
+ if (mi.definesRegister(M68k::CCR))
+ break; // Should have kill-flag - update below.
+ }
+
+ // If we hit the end of the block, check whether CCR is live into a
+ // successor.
+ if (miI == BB->end())
+ for (const auto *SBB : BB->successors())
+ if (SBB->isLiveIn(M68k::CCR))
+ return false;
+
+ // We found a def, or hit the end of the basic block and CCR wasn't live
+ // out. SelectMI should have a kill flag on CCR.
+ SelectItr->addRegisterKilled(M68k::CCR, TRI);
+ return true;
+}
+
+MachineBasicBlock *
+M68kTargetLowering::EmitLoweredSelect(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator It = ++MBB->getIterator();
+
+ // ThisMBB:
+ // ...
+ // TrueVal = ...
+ // cmp ccX, r1, r2
+ // bcc Copy1MBB
+ // fallthrough --> Copy0MBB
+ MachineBasicBlock *ThisMBB = MBB;
+ MachineFunction *F = MBB->getParent();
+
+ // This code lowers all pseudo-CMOV instructions. Generally it lowers these
+  // as described above, by inserting an MBB, and then making a PHI at the join
+ // point to select the true and false operands of the CMOV in the PHI.
+ //
+ // The code also handles two different cases of multiple CMOV opcodes
+ // in a row.
+ //
+ // Case 1:
+  // In this case, there are multiple CMOVs in a row, all of which are based on
+ // the same condition setting (or the exact opposite condition setting).
+ // In this case we can lower all the CMOVs using a single inserted MBB, and
+ // then make a number of PHIs at the join point to model the CMOVs. The only
+  // trickiness here is that in a case like:
+ //
+ // t2 = CMOV cond1 t1, f1
+ // t3 = CMOV cond1 t2, f2
+ //
+ // when rewriting this into PHIs, we have to perform some renaming on the
+ // temps since you cannot have a PHI operand refer to a PHI result earlier
+ // in the same block. The "simple" but wrong lowering would be:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t2(BB1), f2(BB2)
+ //
+ // but clearly t2 is not defined in BB1, so that is incorrect. The proper
+ // renaming is to note that on the path through BB1, t2 is really just a
+ // copy of t1, and do that renaming, properly generating:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t1(BB1), f2(BB2)
+ //
+  // In case 2, we lower cascaded CMOVs such as
+ //
+ // (CMOV (CMOV F, T, cc1), T, cc2)
+ //
+  // to two successive branches.
+ MachineInstr *CascadedCMOV = nullptr;
+ MachineInstr *LastCMOV = &MI;
+ M68k::CondCode CC = M68k::CondCode(MI.getOperand(3).getImm());
+ M68k::CondCode OppCC = M68k::GetOppositeBranchCondition(CC);
+ MachineBasicBlock::iterator NextMIIt =
+ std::next(MachineBasicBlock::iterator(MI));
+
+ // Check for case 1, where there are multiple CMOVs with the same condition
+ // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
+ // number of jumps the most.
+
+ if (isCMOVPseudo(MI)) {
+ // See if we have a string of CMOVS with the same condition.
+ while (NextMIIt != MBB->end() && isCMOVPseudo(*NextMIIt) &&
+ (NextMIIt->getOperand(3).getImm() == CC ||
+ NextMIIt->getOperand(3).getImm() == OppCC)) {
+ LastCMOV = &*NextMIIt;
+ ++NextMIIt;
+ }
+ }
+
+  // This checks for case 2, but only if we didn't already find case 1, as
+  // indicated by LastCMOV == &MI.
+ if (LastCMOV == &MI && NextMIIt != MBB->end() &&
+ NextMIIt->getOpcode() == MI.getOpcode() &&
+ NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
+ NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
+ NextMIIt->getOperand(1).isKill()) {
+ CascadedCMOV = &*NextMIIt;
+ }
+
+ MachineBasicBlock *Jcc1MBB = nullptr;
+
+ // If we have a cascaded CMOV, we lower it to two successive branches to
+ // the same block. CCR is used by both, so mark it as live in the second.
+ if (CascadedCMOV) {
+ Jcc1MBB = F->CreateMachineBasicBlock(BB);
+ F->insert(It, Jcc1MBB);
+ Jcc1MBB->addLiveIn(M68k::CCR);
+ }
+
+ MachineBasicBlock *Copy0MBB = F->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
+ F->insert(It, Copy0MBB);
+ F->insert(It, SinkMBB);
+
+ // If the CCR register isn't dead in the terminator, then claim that it's
+ // live into the sink and copy blocks.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ MachineInstr *LastCCRSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
+ if (!LastCCRSUser->killsRegister(M68k::CCR) &&
+ !checkAndUpdateCCRKill(LastCCRSUser, MBB, TRI)) {
+ Copy0MBB->addLiveIn(M68k::CCR);
+ SinkMBB->addLiveIn(M68k::CCR);
+ }
+
+ // Transfer the remainder of MBB and its successor edges to SinkMBB.
+ SinkMBB->splice(SinkMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
+ SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // Add the true and fallthrough blocks as its successors.
+ if (CascadedCMOV) {
+ // The fallthrough block may be Jcc1MBB, if we have a cascaded CMOV.
+ MBB->addSuccessor(Jcc1MBB);
+
+    // In that case, Jcc1MBB will itself fall through to Copy0MBB, or
+    // conditionally jump to SinkMBB.
+ Jcc1MBB->addSuccessor(Copy0MBB);
+ Jcc1MBB->addSuccessor(SinkMBB);
+ } else {
+ MBB->addSuccessor(Copy0MBB);
+ }
+
+ // The true block target of the first (or only) branch is always SinkMBB.
+ MBB->addSuccessor(SinkMBB);
+
+ // Create the conditional branch instruction.
+ unsigned Opc = M68k::GetCondBranchFromCond(CC);
+ BuildMI(MBB, DL, TII->get(Opc)).addMBB(SinkMBB);
+
+ if (CascadedCMOV) {
+ unsigned Opc2 = M68k::GetCondBranchFromCond(
+ (M68k::CondCode)CascadedCMOV->getOperand(3).getImm());
+ BuildMI(Jcc1MBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
+ }
+
+ // Copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to SinkMBB
+ Copy0MBB->addSuccessor(SinkMBB);
+
+ // SinkMBB:
+ // %Result = phi [ %FalseValue, Copy0MBB ], [ %TrueValue, ThisMBB ]
+ // ...
+ MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock::iterator MIItEnd =
+ std::next(MachineBasicBlock::iterator(LastCMOV));
+ MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+ MachineInstrBuilder MIB;
+
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later CMOVs may reference the results of earlier CMOVs, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+ // That also means that PHI construction must work forward from earlier to
+ // later, and that the code must maintain a mapping from earlier PHI's
+ // destination registers, and the registers that went into the PHI.
+
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+ unsigned DestReg = MIIt->getOperand(0).getReg();
+ unsigned Op1Reg = MIIt->getOperand(1).getReg();
+ unsigned Op2Reg = MIIt->getOperand(2).getReg();
+
+ // If this CMOV we are generating is the opposite condition from
+ // the jump we generated, then we have to swap the operands for the
+ // PHI that is going to be generated.
+ if (MIIt->getOperand(3).getImm() == OppCC)
+ std::swap(Op1Reg, Op2Reg);
+
+ if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
+ Op1Reg = RegRewriteTable[Op1Reg].first;
+
+ if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
+ Op2Reg = RegRewriteTable[Op2Reg].second;
+
+ MIB =
+ BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(M68k::PHI), DestReg)
+ .addReg(Op1Reg)
+ .addMBB(Copy0MBB)
+ .addReg(Op2Reg)
+ .addMBB(ThisMBB);
+
+ // Add this PHI to the rewrite table.
+ RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+ }
+
+ // If we have a cascaded CMOV, the second Jcc provides the same incoming
+ // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
+ if (CascadedCMOV) {
+ MIB.addReg(MI.getOperand(2).getReg()).addMBB(Jcc1MBB);
+ // Copy the PHI result to the register defined by the second CMOV.
+ BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
+ DL, TII->get(TargetOpcode::COPY),
+ CascadedCMOV->getOperand(0).getReg())
+ .addReg(MI.getOperand(0).getReg());
+ CascadedCMOV->eraseFromParent();
+ }
+
+ // Now remove the CMOV(s).
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;)
+ (MIIt++)->eraseFromParent();
+
+ return SinkMBB;
+}
+
+MachineBasicBlock *
+M68kTargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ llvm_unreachable("Cannot lower Segmented Stack Alloca with stack-split on");
+}
+
+MachineBasicBlock *
+M68kTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instr type to insert");
+ case M68k::CMOV8d:
+ case M68k::CMOV16d:
+ case M68k::CMOV32r:
+ return EmitLoweredSelect(MI, BB);
+ case M68k::SALLOCA:
+ return EmitLoweredSegAlloca(MI, BB);
+ }
+}
+
+SDValue M68kTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ M68kMachineFunctionInfo *FuncInfo = MF.getInfo<M68kMachineFunctionInfo>();
+
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ SDLoc DL(Op);
+
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+ return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+ MachinePointerInfo(SV));
+}
+
+// Lower dynamic stack allocation. When segmented stacks are in use, the
+// requested size is handed to the SEG_ALLOCA node; otherwise the stack
+// pointer is adjusted directly and the result is realigned if the requested
+// alignment exceeds the default stack alignment.
+SDValue M68kTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool SplitStack = MF.shouldSplitStack();
+
+ SDLoc DL(Op);
+
+ // Get the inputs.
+ SDNode *Node = Op.getNode();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ EVT VT = Node->getValueType(0);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
+
+ SDValue Result;
+ if (SplitStack) {
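+    // Segmented-stack path: pass the requested size to the SEG_ALLOCA node
+    // through a virtual register of the pointer register class.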
+ auto &MRI = MF.getRegInfo();
+ auto SPTy = getPointerTy(DAG.getDataLayout());
+ auto *ARClass = getRegClassFor(SPTy);
+ unsigned Vreg = MRI.createVirtualRegister(ARClass);
+ Chain = DAG.getCopyToReg(Chain, DL, Vreg, Size);
+ Result = DAG.getNode(M68kISD::SEG_ALLOCA, DL, SPTy, Chain,
+ DAG.getRegister(Vreg, SPTy));
+ } else {
+ auto &TLI = DAG.getTargetLoweringInfo();
+ unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+ " not tell us which reg is the stack pointer!");
+
+ SDValue SP = DAG.getCopyFromReg(Chain, DL, SPReg, VT);
+ Chain = SP.getValue(1);
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
+ unsigned StackAlign = TFI.getStackAlignment();
+ Result = DAG.getNode(ISD::SUB, DL, VT, SP, Size); // Value
+ if (Align > StackAlign)
+ Result = DAG.getNode(ISD::AND, DL, VT, Result,
+ DAG.getConstant(-(uint64_t)Align, DL, VT));
+ Chain = DAG.getCopyToReg(Chain, DL, SPReg, Result); // Output chain
+ }
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
+ DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);
+
+ SDValue Ops[2] = {Result, Chain};
+ return DAG.getMergeValues(Ops, DL);
+}
+
+//===----------------------------------------------------------------------===//
+// DAG Combine
+//===----------------------------------------------------------------------===//
+
+static SDValue getSETCC(M68k::CondCode Cond, SDValue CCR, const SDLoc &dl,
+ SelectionDAG &DAG) {
+ return DAG.getNode(M68kISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(Cond, dl, MVT::i8), CCR);
+}
+
+// When legalizing carry, we create carries via add X, -1.
+// If that comes from an actual carry, via setcc, we use the
+// carry directly.
+static SDValue combineCarryThroughADD(SDValue CCR) {
+ if (CCR.getOpcode() == M68kISD::ADD) {
+ if (isAllOnesConstant(CCR.getOperand(1))) {
+ SDValue Carry = CCR.getOperand(0);
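+      // Look through truncates, extensions, and AND-with-1, none of which
+      // change the carry bit being tracked.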
+ while (Carry.getOpcode() == ISD::TRUNCATE ||
+ Carry.getOpcode() == ISD::ZERO_EXTEND ||
+ Carry.getOpcode() == ISD::SIGN_EXTEND ||
+ Carry.getOpcode() == ISD::ANY_EXTEND ||
+ (Carry.getOpcode() == ISD::AND &&
+ isOneConstant(Carry.getOperand(1))))
+ Carry = Carry.getOperand(0);
+ if (Carry.getOpcode() == M68kISD::SETCC ||
+ Carry.getOpcode() == M68kISD::SETCC_CARRY) {
+ if (Carry.getConstantOperandVal(0) == M68k::COND_CS)
+ return Carry.getOperand(1);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// Optimize a CCR definition used according to the condition code \p CC into
+/// a simpler CCR value, potentially returning a new \p CC and replacing uses
+/// of chain values.
+static SDValue combineSetCCCCR(SDValue CCR, M68k::CondCode &CC,
+ SelectionDAG &DAG,
+ const M68kSubtarget &Subtarget) {
+ if (CC == M68k::COND_CS)
+ if (SDValue Flags = combineCarryThroughADD(CCR))
+ return Flags;
+
+ return SDValue();
+}
+
+// Optimize RES = M68kISD::SETCC CONDCODE, CCR_INPUT
+static SDValue combineM68kSetCC(SDNode *N, SelectionDAG &DAG,
+ const M68kSubtarget &Subtarget) {
+ SDLoc DL(N);
+ M68k::CondCode CC = M68k::CondCode(N->getConstantOperandVal(0));
+ SDValue CCR = N->getOperand(1);
+
+ // Try to simplify the CCR and condition code operands.
+ if (SDValue Flags = combineSetCCCCR(CCR, CC, DAG, Subtarget))
+ return getSETCC(CC, Flags, DL, DAG);
+
+ return SDValue();
+}
+static SDValue combineM68kBrCond(SDNode *N, SelectionDAG &DAG,
+ const M68kSubtarget &Subtarget) {
+ SDLoc DL(N);
+ M68k::CondCode CC = M68k::CondCode(N->getConstantOperandVal(2));
+ SDValue CCR = N->getOperand(3);
+
+ // Try to simplify the CCR and condition code operands.
+ // Make sure to not keep references to operands, as combineSetCCCCR can
+ // RAUW them under us.
+ if (SDValue Flags = combineSetCCCCR(CCR, CC, DAG, Subtarget)) {
+ SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
+ return DAG.getNode(M68kISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
+ N->getOperand(1), Cond, Flags);
+ }
+
+ return SDValue();
+}
+
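+// Optimize RES, CCR = M68kISD::SUBX LHS, RHS, CCR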
+static SDValue combineSUBX(SDNode *N, SelectionDAG &DAG) {
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+ MVT VT = N->getSimpleValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ return DAG.getNode(M68kISD::SUBX, SDLoc(N), VTs, N->getOperand(0),
+ N->getOperand(1), Flags);
+ }
+
+ return SDValue();
+}
+
+// Optimize RES, CCR = M68kISD::ADDX LHS, RHS, CCR
+static SDValue combineADDX(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+ MVT VT = N->getSimpleValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ return DAG.getNode(M68kISD::ADDX, SDLoc(N), VTs, N->getOperand(0),
+ N->getOperand(1), Flags);
+ }
+
+ return SDValue();
+}
+
+SDValue M68kTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ switch (N->getOpcode()) {
+ case M68kISD::SUBX:
+ return combineSUBX(N, DAG);
+ case M68kISD::ADDX:
+ return combineADDX(N, DAG, DCI);
+ case M68kISD::SETCC:
+ return combineM68kSetCC(N, DAG, Subtarget);
+ case M68kISD::BRCOND:
+ return combineM68kBrCond(N, DAG, Subtarget);
+ }
+
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// M68kISD Node Names
+//===----------------------------------------------------------------------===//
+const char *M68kTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ case M68kISD::CALL:
+ return "M68kISD::CALL";
+ case M68kISD::TAIL_CALL:
+ return "M68kISD::TAIL_CALL";
+ case M68kISD::RET:
+ return "M68kISD::RET";
+ case M68kISD::TC_RETURN:
+ return "M68kISD::TC_RETURN";
+ case M68kISD::ADD:
+ return "M68kISD::ADD";
+ case M68kISD::SUB:
+ return "M68kISD::SUB";
+ case M68kISD::ADDX:
+ return "M68kISD::ADDX";
+ case M68kISD::SUBX:
+ return "M68kISD::SUBX";
+ case M68kISD::SMUL:
+ return "M68kISD::SMUL";
+ case M68kISD::UMUL:
+ return "M68kISD::UMUL";
+ case M68kISD::OR:
+ return "M68kISD::OR";
+ case M68kISD::XOR:
+ return "M68kISD::XOR";
+ case M68kISD::AND:
+ return "M68kISD::AND";
+ case M68kISD::CMP:
+ return "M68kISD::CMP";
+ case M68kISD::BT:
+ return "M68kISD::BT";
+ case M68kISD::SELECT:
+ return "M68kISD::SELECT";
+ case M68kISD::CMOV:
+ return "M68kISD::CMOV";
+ case M68kISD::BRCOND:
+ return "M68kISD::BRCOND";
+ case M68kISD::SETCC:
+ return "M68kISD::SETCC";
+ case M68kISD::SETCC_CARRY:
+ return "M68kISD::SETCC_CARRY";
+ case M68kISD::GLOBAL_BASE_REG:
+ return "M68kISD::GLOBAL_BASE_REG";
+ case M68kISD::Wrapper:
+ return "M68kISD::Wrapper";
+ case M68kISD::WrapperPC:
+ return "M68kISD::WrapperPC";
+ case M68kISD::SEG_ALLOCA:
+ return "M68kISD::SEG_ALLOCA";
+ default:
+    return nullptr;
+ }
+}
+
+CCAssignFn *M68kTargetLowering::getCCAssignFn(CallingConv::ID CC, bool Return,
+ bool IsVarArg) const {
+ if (Return)
+ return RetCC_M68k_C;
+ else
+ return CC_M68k_C;
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kISelLowering.h b/src/llvm-project/llvm/lib/Target/M68k/M68kISelLowering.h
new file mode 100644
index 0000000..6a5a40a
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kISelLowering.h
@@ -0,0 +1,279 @@
+//===-- M68kISelLowering.h - M68k DAG Lowering Interface ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the interfaces that M68k uses to lower LLVM code into a
+/// selection DAG.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_M68KISELLOWERING_H
+#define LLVM_LIB_TARGET_M68K_M68KISELLOWERING_H
+
+#include "M68k.h"
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/Function.h"
+
+#include <deque>
+
+namespace llvm {
+namespace M68kISD {
+
+/// M68k Specific DAG nodes
+enum NodeType {
+ /// Start the numbering from where ISD NodeType finishes.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
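+  /// Call, return, and tail-call pseudo nodes.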
+ CALL,
+ RET,
+ TAIL_CALL,
+ TC_RETURN,
+
+ /// M68k compare and logical compare instructions. Subtracts the source
+ /// operand from the destination data register and sets the condition
+ /// codes according to the result. Immediate always goes first.
+ CMP,
+
+ /// M68k bit-test instructions.
+ BT,
+
+ /// M68k Select
+ SELECT,
+
+  /// M68k SetCC. Operand 0 is the condition code, and operand 1 is the CCR
+ /// operand, usually produced by a CMP instruction.
+ SETCC,
+
+  // Same as SETCC except it's materialized with a subx and the value is all
+  // ones or all zeros.
+ SETCC_CARRY, // R = carry_bit ? ~0 : 0
+
+ /// M68k conditional moves. Operand 0 and operand 1 are the two values
+ /// to select from. Operand 2 is the condition code, and operand 3 is the
+ /// flag operand produced by a CMP or TEST instruction. It also writes a
+ /// flag result.
+ CMOV,
+
+ /// M68k conditional branches. Operand 0 is the chain operand, operand 1
+  /// is the block to branch to if the condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+
+ // Arithmetic operations with CCR results.
+ ADD,
+ SUB,
+ ADDX,
+ SUBX,
+ SMUL,
+ UMUL,
+ OR,
+ XOR,
+ AND,
+
+  // The PIC global base register.
+ GLOBAL_BASE_REG,
+
+ /// A wrapper node for TargetConstantPool,
+ /// TargetExternalSymbol, and TargetGlobalAddress.
+ Wrapper,
+
+ /// Special wrapper used under M68k PIC mode for PC
+ /// relative displacements.
+ WrapperPC,
+
+  // For allocating variable amounts of stack space when using
+  // segmented stacks. Checks whether the current stacklet has enough space,
+  // and falls back to heap allocation if not.
+ SEG_ALLOCA,
+};
+} // namespace M68kISD
+
+/// Define some predicates that are used for node matching.
+namespace M68k {
+
+/// Determines whether the callee is required to pop its
+/// own arguments. Callee pop is necessary to support tail calls.
+bool isCalleePop(CallingConv::ID CallingConv, bool IsVarArg, bool GuaranteeTCO);
+
+} // end namespace M68k
+
+//===--------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===--------------------------------------------------------------------===//
+
+class M68kMachineFunctionInfo;
+class M68kSubtarget;
+
+class M68kTargetLowering : public TargetLowering {
+ const M68kSubtarget &Subtarget;
+ const M68kTargetMachine &TM;
+
+public:
+ explicit M68kTargetLowering(const M68kTargetMachine &TM,
+ const M68kSubtarget &STI);
+
+ static const M68kTargetLowering *create(const M68kTargetMachine &TM,
+ const M68kSubtarget &STI);
+
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ /// Return the value type to use for ISD::SETCC.
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+  /// EVT is not used in-tree, but is used by out-of-tree targets.
+ virtual MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
+
+ /// Provide custom lowering hooks for some operations.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ /// Return the entry encoding for a jump table in the current function.
+ /// The returned value is a member of the MachineJumpTableInfo::JTEntryKind
+ /// enum.
+ unsigned getJumpTableEncoding() const override;
+
+ const MCExpr *LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid,
+ MCContext &Ctx) const override;
+
+ /// Returns relocation base for the given PIC jumptable.
+ SDValue getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const override;
+
+ /// This returns the relocation base for the given PIC jumptable,
+ /// the same as getPICJumpTableRelocBase, but as an MCExpr.
+ const MCExpr *getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ unsigned JTI,
+ MCContext &Ctx) const override;
+
+ ConstraintType getConstraintType(StringRef ConstraintStr) const override;
+
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+  // Lower operands with the C_Immediate and C_Other constraint types.
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
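+  /// Return the calling-convention assignment function to use for the given
+  /// calling convention, either for arguments or for return values.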
+ CCAssignFn *getCCAssignFn(CallingConv::ID CC, bool Return,
+ bool IsVarArg) const;
+
+private:
+ unsigned GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG &DAG) const;
+
+ SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+ /// Emit a load of return address if tail call
+ /// optimization is performed and it is required.
+ SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+ SDValue Chain, bool IsTailCall, int FPDiff,
+ const SDLoc &DL) const;
+
+ /// Emit a store of the return address if tail call
+ /// optimization is performed and it is required (FPDiff!=0).
+ SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+ SDValue Chain, SDValue RetAddrFrIdx,
+ EVT PtrVT, unsigned SlotSize, int FPDiff,
+ const SDLoc &DL) const;
+
+ SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+ const SDLoc &DL, SelectionDAG &DAG,
+ const CCValAssign &VA, MachineFrameInfo &MFI,
+ unsigned ArgIdx) const;
+
+ SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+ const SDLoc &DL, SelectionDAG &DAG,
+ const CCValAssign &VA, ISD::ArgFlagsTy Flags) const;
+
+ SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &DL,
+ SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
+ int64_t Offset, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ /// LowerFormalArguments - transform physical registers into virtual
+  /// registers and generate load operations for arguments placed on the stack.
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CCID,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ /// Lower the result values of a call into the
+ /// appropriate copies out of appropriate physical registers.
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CCID, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+
+ bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const override;
+
+ MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
+ MachineBasicBlock *MBB) const;
+ MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ /// Emit nodes that will be selected as "test Op0,Op0", or something
+ /// equivalent, for use with the given M68k condition code.
+ SDValue EmitTest(SDValue Op0, unsigned M68kCC, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+
+ /// Emit nodes that will be selected as "cmp Op0,Op1", or something
+ /// equivalent, for use with the given M68k condition code.
+ SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned M68kCC, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+
+ /// Check whether the call is eligible for tail call optimization. Targets
+ /// that want to do tail call optimization should implement this function.
+ bool IsEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
+ bool IsCalleeStructRet, bool IsCallerStructRet, Type *RetTy,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+};
+} // namespace llvm
+
+#endif // M68kISELLOWERING_H
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrArithmetic.td
new file mode 100644
index 0000000..f65ad57
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrArithmetic.td
@@ -0,0 +1,880 @@
+//===-- M68kInstrArithmetic.td - Integer Arith Instrs ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes the integer arithmetic instructions in the M68k
+/// architecture. Here is the current status of the file:
+///
+/// Machine:
+///
+/// ADD [~] ADDA [~] ADDI [~] ADDQ [ ] ADDX [~]
+/// CLR [ ] CMP [~] CMPA [~] CMPI [~] CMPM [ ]
+/// CMP2 [ ] DIVS/DIVU [~] DIVSL/DIVUL [ ] EXT [~] EXTB [ ]
+/// MULS/MULU [~] NEG [~] NEGX [~] SUB [~] SUBA [~]
+/// SUBI [~] SUBQ [ ] SUBX [~]
+///
+/// Map:
+///
+/// [ ] - was not touched at all
+///  [!] - requires external stuff implemented
+/// [~] - functional implementation
+/// [X] - complete implementation
+///
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Encoding
+//===----------------------------------------------------------------------===//
+
+/// Encoding for Normal forms
+/// ----------------------------------------------------
+/// F E D C | B A 9 | 8 7 6 | 5 4 3 | 2 1 0
+/// ----------------------------------------------------
+/// | | | EFFECTIVE ADDRESS
+/// x x x x | REG | OP MODE | MODE | REG
+/// ----------------------------------------------------
+class MxArithEncoding<MxBead4Bits CMD, MxEncOpMode OPMODE, MxBead REG,
+ MxEncEA EA, MxEncExt EXT>
+ : MxEncoding<EA.Reg, EA.DA, EA.Mode, OPMODE.B0, OPMODE.B1, OPMODE.B2, REG,
+                CMD, EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>;
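+/// For example, assuming the standard M68k ADD encoding (CMD = 0xD, see the
+/// ADD defm below), `add.l %d1, %d0` maps onto this layout as
+///   1101 | 000 | 010 | 000 | 001  =  0xd081
+/// i.e. REG = %d0 (destination), OP MODE = 010 (long, <ea> + Dn -> Dn),
+/// MODE/REG = data register direct %d1.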
+
+/// Encoding for Extended forms
+/// ------------------------------------------------------
+/// F E D C | B A 9 | 8 | 7 6 | 5 4 | 3 | 2 1 0
+/// ------------------------------------------------------
+/// x x x x | REG Rx | 1 | SIZE | 0 0 | M | REG Ry
+/// ------------------------------------------------------
+/// Rx - destination
+/// Ry - source
+/// M - address mode switch
+class MxArithXEncoding<MxBead4Bits CMD, MxEncSize SIZE, MxBead1Bit MODE,
+ MxBeadDReg SRC, MxBeadDReg DST>
+ : MxEncoding<SRC, MODE, MxBead2Bits<0b00>, SIZE, MxBead1Bit<0b1>, DST, CMD>;
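+/// For example, assuming the standard ADDX encoding (CMD = 0xD),
+/// `addx.l %d1, %d0` maps onto this layout as
+///   1101 | 000 | 1 | 10 | 00 | 0 | 001  =  0xd181
+/// i.e. Rx = %d0, SIZE = 10 (long), M = 0 (register form), Ry = %d1.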
+
+/// Encoding for Immediate forms
+/// ---------------------------------------------------
+/// F E D C B A 9 8 | 7 6 | 5 4 3 | 2 1 0
+/// ---------------------------------------------------
+/// | | EFFECTIVE ADDRESS
+/// x x x x x x x x | SIZE | MODE | REG
+/// ---------------------------------------------------
+/// 16-BIT WORD DATA | 8-BIT BYTE DATA
+/// ---------------------------------------------------
+/// 32-BIT LONG DATA
+/// ---------------------------------------------------
+/// NOTE This form is used to store an immediate to memory; imm-to-reg
+/// operations are handled with the normal version.
+class MxArithImmEncoding<MxBead4Bits CMD, MxEncSize SIZE,
+ MxEncEA DST_EA, MxEncExt DST_EXT, MxEncExt SRC_EXT>
+ : MxEncoding<DST_EA.Reg, DST_EA.DA, DST_EA.Mode, SIZE, CMD, MxBead4Bits<0>,
+ // Source
+ SRC_EXT.Imm, SRC_EXT.B8, SRC_EXT.Scale,
+ SRC_EXT.WL, SRC_EXT.DAReg,
+ // Destination
+ DST_EXT.Imm, DST_EXT.B8, DST_EXT.Scale,
+ DST_EXT.WL, DST_EXT.DAReg>;
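+/// For example, assuming the standard ADDI encoding (CMD = 0x6, see the ADD
+/// defm below), `addi.w #5, (%a0)` maps onto this layout as
+///   0000 0110 | 01 | 010 | 000  =  0x0650
+/// followed by the 16-bit word data 0x0005.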
+
+
+//===----------------------------------------------------------------------===//
+// Add/Sub
+//===----------------------------------------------------------------------===//
+
+let Defs = [CCR] in {
+let Constraints = "$src = $dst" in {
+
+// $reg, $ccr <- $reg op $reg
+class MxBiArOp_RFRR_xEA<string MN, SDNode NODE, MxType TYPE, bits<4> CMD, MxBead REG>
+ : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.ROp:$opd),
+ MN#"."#TYPE.Prefix#"\t$opd, $dst",
+ [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.VT:$opd))],
+ MxArithEncoding<MxBead4Bits<CMD>,
+ !cast<MxEncOpMode>("MxOpMode"#TYPE.Size#TYPE.RLet#"EA"),
+ REG,
+ !cast<MxEncEA>("MxEncEA"#TYPE.RLet#"_2"),
+ MxExtEmpty>>;
+
+/// This Op is similar to the one above except that it uses the reversed opmode;
+/// some commands (e.g. eor) do not support the dEA or rEA modes and require EAd
+/// for register-only operations.
+/// NOTE When using dd commands it appears irrelevant which opmode is used, but
+/// some opcodes support address registers and some do not, which creates this
+/// mess.
+class MxBiArOp_RFRR_EAd<string MN, SDNode NODE, MxType TYPE, bits<4> CMD>
+ : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.ROp:$opd),
+ MN#"."#TYPE.Prefix#"\t$opd, $dst",
+ [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.VT:$opd))],
+ MxArithEncoding<MxBead4Bits<CMD>,
+ !cast<MxEncOpMode>("MxOpMode"#TYPE.Size#"EAd"),
+ MxBeadDReg<2>, MxEncEAd_0, MxExtEmpty>>;
+
+// $reg <- $reg op $imm
+class MxBiArOp_RFRI_xEA<string MN, SDNode NODE, MxType TYPE, bits<4> CMD>
+ : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.IOp:$opd),
+ MN#"."#TYPE.Prefix#"\t$opd, $dst",
+ [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.IPat:$opd))],
+ MxArithEncoding<MxBead4Bits<CMD>,
+ !cast<MxEncOpMode>("MxOpMode"#TYPE.Size#TYPE.RLet#"EA"),
+ MxBeadDReg<0>, MxEncEAi,
+ !cast<MxEncExt>("MxExtI"#TYPE.Size#"_2")>>;
+
+// Again, there are two ways to write an immediate to a Dn register: either the
+// dEA opmode or the *I encoding. And again, some instructions also support
+// address registers while some do not.
+class MxBiArOp_RFRI<string MN, SDNode NODE, MxType TYPE, bits<4> CMD>
+ : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.IOp:$opd),
+ MN#"i."#TYPE.Prefix#"\t$opd, $dst",
+ [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.IPat:$opd))],
+ MxArithImmEncoding<MxBead4Bits<CMD>, !cast<MxEncSize>("MxEncSize"#TYPE.Size),
+ !cast<MxEncEA>("MxEncEA"#TYPE.RLet#"_0"), MxExtEmpty,
+ !cast<MxEncExt>("MxExtI"#TYPE.Size#"_2")>>;
+
+let mayLoad = 1 in
+class MxBiArOp_RFRM<string MN, SDNode NODE, MxType TYPE, MxOperand OPD, ComplexPattern PAT,
+ bits<4> CMD, MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, OPD:$opd),
+ MN#"."#TYPE.Prefix#"\t$opd, $dst",
+ [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, (TYPE.Load PAT:$opd)))],
+ MxArithEncoding<MxBead4Bits<CMD>,
+ !cast<MxEncOpMode>("MxOpMode"#TYPE.Size#TYPE.RLet#"EA"),
+ MxBeadDReg<0>, EA, EXT>>;
+
+} // Constraints
+
+let mayLoad = 1, mayStore = 1 in {
+
+// FIXME MxBiArOp_FMR/FMI cannot consume CCR from MxAdd/MxSub, which lets
+// MxAdd survive the match and subsequently mismatch.
+class MxBiArOp_FMR<string MN, SDNode NODE, MxType TYPE,
+ MxOperand MEMOpd, ComplexPattern MEMPat,
+ bits<4> CMD, MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs), (ins MEMOpd:$dst, TYPE.ROp:$opd),
+ MN#"."#TYPE.Prefix#"\t$opd, $dst",
+ [],
+ MxArithEncoding<MxBead4Bits<CMD>,
+ !cast<MxEncOpMode>("MxOpMode"#TYPE.Size#"EA"#TYPE.RLet),
+ MxBeadDReg<1>, EA, EXT>>;
+
+class MxBiArOp_FMI<string MN, SDNode NODE, MxType TYPE,
+ MxOperand MEMOpd, ComplexPattern MEMPat,
+ bits<4> CMD, MxEncEA MEMEA, MxEncExt MEMExt>
+ : MxInst<(outs), (ins MEMOpd:$dst, TYPE.IOp:$opd),
+ MN#"."#TYPE.Prefix#"\t$opd, $dst",
+ [],
+ MxArithImmEncoding<MxBead4Bits<CMD>,
+ !cast<MxEncSize>("MxEncSize"#TYPE.Size),
+ MEMEA, MEMExt,
+ !cast<MxEncExt>("MxExtI"#TYPE.Size#"_1")>>;
+} // mayLoad, mayStore
+} // Defs = [CCR]
+
+multiclass MxBiArOp_DF<string MN, SDNode NODE, bit isComm,
+ bits<4> CMD, bits<4> CMDI> {
+
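+  // Note on naming: the leading number in each def below is the operation size
+  // in bits and the trailing letters name the destination and source operand
+  // kinds, e.g. "16dp" is a 16-bit op with a data-register destination and an
+  // (i,An) memory source, while "32di" is a 32-bit op with a data-register
+  // destination and an immediate source; j/p/f/q/k correspond to the J/P/F/Q/K
+  // operands of the MxType in use.
+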
+ // op $mem, $reg
+ def NAME#"8dk" : MxBiArOp_RFRM<MN, NODE, MxType8d, MxType8.KOp, MxType8.KPat,
+ CMD, MxEncEAk, MxExtBrief_2>;
+ def NAME#"16dk" : MxBiArOp_RFRM<MN, NODE, MxType16d, MxType16.KOp, MxType16.KPat,
+ CMD, MxEncEAk, MxExtBrief_2>;
+ def NAME#"32dk" : MxBiArOp_RFRM<MN, NODE, MxType32d, MxType32.KOp, MxType32.KPat,
+ CMD, MxEncEAk, MxExtBrief_2>;
+
+ def NAME#"8dq" : MxBiArOp_RFRM<MN, NODE, MxType8d, MxType8.QOp, MxType8.QPat,
+ CMD, MxEncEAq, MxExtI16_2>;
+ def NAME#"16dq" : MxBiArOp_RFRM<MN, NODE, MxType16d, MxType16.QOp, MxType16.QPat,
+ CMD, MxEncEAq, MxExtI16_2>;
+ def NAME#"32dq" : MxBiArOp_RFRM<MN, NODE, MxType32d, MxType32.QOp, MxType32.QPat,
+ CMD, MxEncEAq, MxExtI16_2>;
+
+ def NAME#"8dp" : MxBiArOp_RFRM<MN, NODE, MxType8d, MxType8.POp, MxType8.PPat,
+ CMD, MxEncEAp_2, MxExtI16_2>;
+ def NAME#"16dp" : MxBiArOp_RFRM<MN, NODE, MxType16d, MxType16.POp, MxType16.PPat,
+ CMD, MxEncEAp_2, MxExtI16_2>;
+ def NAME#"32dp" : MxBiArOp_RFRM<MN, NODE, MxType32d, MxType32.POp, MxType32.PPat,
+ CMD, MxEncEAp_2, MxExtI16_2>;
+
+ def NAME#"8df" : MxBiArOp_RFRM<MN, NODE, MxType8d, MxType8.FOp, MxType8.FPat,
+ CMD, MxEncEAf_2, MxExtBrief_2>;
+ def NAME#"16df" : MxBiArOp_RFRM<MN, NODE, MxType16d, MxType16.FOp, MxType16.FPat,
+ CMD, MxEncEAf_2, MxExtBrief_2>;
+ def NAME#"32df" : MxBiArOp_RFRM<MN, NODE, MxType32d, MxType32.FOp, MxType32.FPat,
+ CMD, MxEncEAf_2, MxExtBrief_2>;
+
+ def NAME#"8dj" : MxBiArOp_RFRM<MN, NODE, MxType8d, MxType8.JOp, MxType8.JPat,
+ CMD, MxEncEAj_2, MxExtEmpty>;
+ def NAME#"16dj" : MxBiArOp_RFRM<MN, NODE, MxType16d, MxType16.JOp, MxType16.JPat,
+ CMD, MxEncEAj_2, MxExtEmpty>;
+ def NAME#"32dj" : MxBiArOp_RFRM<MN, NODE, MxType32d, MxType32.JOp, MxType32.JPat,
+ CMD, MxEncEAj_2, MxExtEmpty>;
+
+ // op $imm, $reg
+ def NAME#"8di" : MxBiArOp_RFRI_xEA<MN, NODE, MxType8d, CMD>;
+ def NAME#"16di" : MxBiArOp_RFRI_xEA<MN, NODE, MxType16d, CMD>;
+ def NAME#"32di" : MxBiArOp_RFRI_xEA<MN, NODE, MxType32d, CMD>;
+
+ // op $reg, $mem
+ def NAME#"8pd" : MxBiArOp_FMR<MN, NODE, MxType8d, MxType8.POp, MxType8.PPat,
+ CMD, MxEncEAp_0, MxExtI16_0>;
+ def NAME#"16pd" : MxBiArOp_FMR<MN, NODE, MxType16d, MxType16.POp, MxType16.PPat,
+ CMD, MxEncEAp_0, MxExtI16_0>;
+ def NAME#"32pd" : MxBiArOp_FMR<MN, NODE, MxType32d, MxType32.POp, MxType32.PPat,
+ CMD, MxEncEAp_0, MxExtI16_0>;
+
+ def NAME#"8fd" : MxBiArOp_FMR<MN, NODE, MxType8d, MxType8.FOp, MxType8.FPat,
+ CMD, MxEncEAf_0, MxExtBrief_0>;
+ def NAME#"16fd" : MxBiArOp_FMR<MN, NODE, MxType16d, MxType16.FOp, MxType16.FPat,
+ CMD, MxEncEAf_0, MxExtBrief_0>;
+ def NAME#"32fd" : MxBiArOp_FMR<MN, NODE, MxType32d, MxType32.FOp, MxType32.FPat,
+ CMD, MxEncEAf_0, MxExtBrief_0>;
+
+ def NAME#"8jd" : MxBiArOp_FMR<MN, NODE, MxType8d, MxType8.JOp, MxType8.JPat,
+ CMD, MxEncEAj_0, MxExtEmpty>;
+ def NAME#"16jd" : MxBiArOp_FMR<MN, NODE, MxType16d, MxType16.JOp, MxType16.JPat,
+ CMD, MxEncEAj_0, MxExtEmpty>;
+ def NAME#"32jd" : MxBiArOp_FMR<MN, NODE, MxType32d, MxType32.JOp, MxType32.JPat,
+ CMD, MxEncEAj_0, MxExtEmpty>;
+
+ // op $imm, $mem
+ def NAME#"8pi" : MxBiArOp_FMI<MN, NODE, MxType8, MxType8.POp, MxType8.PPat,
+ CMDI, MxEncEAp_0, MxExtI16_0>;
+ def NAME#"16pi" : MxBiArOp_FMI<MN, NODE, MxType16, MxType16.POp, MxType16.PPat,
+ CMDI, MxEncEAp_0, MxExtI16_0>;
+ def NAME#"32pi" : MxBiArOp_FMI<MN, NODE, MxType32, MxType32.POp, MxType32.PPat,
+ CMDI, MxEncEAp_0, MxExtI16_0>;
+
+ def NAME#"8fi" : MxBiArOp_FMI<MN, NODE, MxType8, MxType8.FOp, MxType8.FPat,
+ CMDI, MxEncEAf_0, MxExtBrief_0>;
+ def NAME#"16fi" : MxBiArOp_FMI<MN, NODE, MxType16, MxType16.FOp, MxType16.FPat,
+ CMDI, MxEncEAf_0, MxExtBrief_0>;
+ def NAME#"32fi" : MxBiArOp_FMI<MN, NODE, MxType32, MxType32.FOp, MxType32.FPat,
+ CMDI, MxEncEAf_0, MxExtBrief_0>;
+
+ def NAME#"8ji" : MxBiArOp_FMI<MN, NODE, MxType8, MxType8.JOp, MxType8.JPat,
+ CMDI, MxEncEAj_0, MxExtEmpty>;
+ def NAME#"16ji" : MxBiArOp_FMI<MN, NODE, MxType16, MxType16.JOp, MxType16.JPat,
+ CMDI, MxEncEAj_0, MxExtEmpty>;
+ def NAME#"32ji" : MxBiArOp_FMI<MN, NODE, MxType32, MxType32.JOp, MxType32.JPat,
+ CMDI, MxEncEAj_0, MxExtEmpty>;
+
+ let isCommutable = isComm in {
+
+ def NAME#"8dd" : MxBiArOp_RFRR_xEA<MN, NODE, MxType8d, CMD, MxBeadDReg<0>>;
+ def NAME#"16dd" : MxBiArOp_RFRR_xEA<MN, NODE, MxType16d, CMD, MxBeadDReg<0>>;
+ def NAME#"32dd" : MxBiArOp_RFRR_xEA<MN, NODE, MxType32d, CMD, MxBeadDReg<0>>;
+
+ } // isComm
+
+} // MxBiArOp_DF
+
+
+// These special snowflakes are allowed to match address registers, but since *A
+// operations do not produce CCR we should not match them against Mx nodes that
+// produce it.
+let Pattern = [(null_frag)] in
+multiclass MxBiArOp_AF<string MN, SDNode NODE, bit isComm,
+ bits<4> CMD, bits<4> CMDI> {
+
+ def NAME#"32rk" : MxBiArOp_RFRM<MN, NODE, MxType32r, MxType32.KOp, MxType32.KPat,
+ CMD, MxEncEAk, MxExtBrief_2>;
+ def NAME#"32rq" : MxBiArOp_RFRM<MN, NODE, MxType32r, MxType32.QOp, MxType32.QPat,
+ CMD, MxEncEAq, MxExtI16_2>;
+ def NAME#"32rf" : MxBiArOp_RFRM<MN, NODE, MxType32r, MxType32.FOp, MxType32.FPat,
+ CMD, MxEncEAf_2, MxExtBrief_2>;
+ def NAME#"32rp" : MxBiArOp_RFRM<MN, NODE, MxType32r, MxType32.POp, MxType32.PPat,
+ CMD, MxEncEAp_2, MxExtI16_2>;
+ def NAME#"32rj" : MxBiArOp_RFRM<MN, NODE, MxType32r, MxType32.JOp, MxType32.JPat,
+ CMD, MxEncEAj_2, MxExtEmpty>;
+ def NAME#"32ri" : MxBiArOp_RFRI_xEA<MN, NODE, MxType32r, CMD>;
+
+ let isCommutable = isComm in
+ def NAME#"32rr" : MxBiArOp_RFRR_xEA<MN, NODE, MxType32r, CMD, MxBeadReg<0>>;
+
+} // MxBiArOp_AF
+
+// NOTE These naturally produce CCR
+
+defm ADD : MxBiArOp_DF<"add", MxAdd, 1, 0xD, 0x6>;
+defm ADD : MxBiArOp_AF<"add", MxAdd, 1, 0xD, 0x6>;
+defm SUB : MxBiArOp_DF<"sub", MxSub, 0, 0x9, 0x4>;
+defm SUB : MxBiArOp_AF<"sub", MxSub, 0, 0x9, 0x4>;
+
+
+let Uses = [CCR], Defs = [CCR] in {
+let Constraints = "$src = $dst" in {
+
+// $reg, ccr <- $reg op $reg op ccr
+class MxBiArOp_RFRRF<string MN, SDNode NODE, MxType TYPE, bits<4> CMD>
+ : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.ROp:$opd),
+ MN#"."#TYPE.Prefix#"\t$opd, $dst",
+ [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.VT:$opd, CCR))],
+ MxArithXEncoding<MxBead4Bits<CMD>,
+ !cast<MxEncSize>("MxEncSize"#TYPE.Size),
+ MxBead1Bit<0>, MxBeadDReg<2>, MxBeadDReg<0>>>;
+
+} // Constraints
+} // Uses, Defs
+
+multiclass MxBiArOp_RFF<string MN, SDNode NODE, bit isComm, bits<4> CMD> {
+
+let isCommutable = isComm in {
+
+ def NAME#"8dd" : MxBiArOp_RFRRF<MN, NODE, MxType8d, CMD>;
+ def NAME#"16dd" : MxBiArOp_RFRRF<MN, NODE, MxType16d, CMD>;
+ def NAME#"32dd" : MxBiArOp_RFRRF<MN, NODE, MxType32d, CMD>;
+
+} // isComm
+
+} // MxBiArOp_RFF
+
+// NOTE These consume and produce CCR
+defm ADDX : MxBiArOp_RFF<"addx", MxAddX, 1, 0xD>;
+defm SUBX : MxBiArOp_RFF<"subx", MxSubX, 0, 0x9>;
+
+
+//===----------------------------------------------------------------------===//
+// And/Xor/Or
+//===----------------------------------------------------------------------===//
+
+defm AND : MxBiArOp_DF<"and", MxAnd, 1, 0xC, 0x2>;
+defm OR : MxBiArOp_DF<"or", MxOr, 1, 0x8, 0x0>;
+
+multiclass MxBiArOp_DF_EAd<string MN, SDNode NODE, bits<4> CMD, bits<4> CMDI> {
+
+ let isCommutable = 1 in {
+
+ def NAME#"8dd" : MxBiArOp_RFRR_EAd<MN, NODE, MxType8d, CMD>;
+ def NAME#"16dd" : MxBiArOp_RFRR_EAd<MN, NODE, MxType16d, CMD>;
+ def NAME#"32dd" : MxBiArOp_RFRR_EAd<MN, NODE, MxType32d, CMD>;
+
+ } // isCommutable = 1
+
+ def NAME#"8di" : MxBiArOp_RFRI<MN, NODE, MxType8d, CMDI>;
+ def NAME#"16di" : MxBiArOp_RFRI<MN, NODE, MxType16d, CMDI>;
+ def NAME#"32di" : MxBiArOp_RFRI<MN, NODE, MxType32d, CMDI>;
+
+} // MxBiArOp_DF_EAd
+
+defm XOR : MxBiArOp_DF_EAd<"eor", MxXor, 0xB, 0xA>;
+
+
+//===----------------------------------------------------------------------===//
+// CMP
+//===----------------------------------------------------------------------===//
+
+let Defs = [CCR] in {
+class MxCmp_RR<MxType TYPE>
+ : MxInst<(outs), (ins TYPE.ROp:$lhs, TYPE.ROp:$rhs),
+ "cmp."#TYPE.Prefix#"\t$lhs, $rhs",
+ [(set CCR, (MxCmp TYPE.VT:$lhs, TYPE.VT:$rhs))],
+ MxArithEncoding<MxBead4Bits<0xB>,
+ !cast<MxEncOpMode>("MxOpMode"#TYPE.Size#"dEA"),
+ MxBeadDReg<1>, MxEncEAd_0, MxExtEmpty>>;
+
+class MxCmp_RI<MxType TYPE>
+ : MxInst<(outs), (ins TYPE.IOp:$imm, TYPE.ROp:$reg),
+ "cmpi."#TYPE.Prefix#"\t$imm, $reg",
+ [(set CCR, (MxCmp TYPE.IPat:$imm, TYPE.VT:$reg))],
+ MxArithImmEncoding<MxBead4Bits<0xC>,
+ !cast<MxEncSize>("MxEncSize"#TYPE.Size),
+ MxEncEAd_1, MxExtEmpty,
+ !cast<MxEncExt>("MxExtI"#TYPE.Size#"_0")>>;
+
+let mayLoad = 1 in {
+
+class MxCmp_MI<MxType TYPE, MxOperand MEMOpd, ComplexPattern MEMPat,
+ MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs), (ins TYPE.IOp:$imm, MEMOpd:$mem),
+ "cmpi."#TYPE.Prefix#"\t$imm, $mem",
+ [(set CCR, (MxCmp TYPE.IPat:$imm, (load MEMPat:$mem)))],
+ MxArithImmEncoding<MxBead4Bits<0xC>,
+ !cast<MxEncSize>("MxEncSize"#TYPE.Size),
+ EA, EXT,
+ !cast<MxEncExt>("MxExtI"#TYPE.Size#"_0")>>;
+
+class MxCmp_BI<MxType TYPE>
+ : MxInst<(outs), (ins TYPE.IOp:$imm, MxAL32:$abs),
+ "cmpi."#TYPE.Prefix#"\t$imm, $abs",
+ [(set CCR, (MxCmp TYPE.IPat:$imm,
+ (load (i32 (MxWrapper tglobaladdr:$abs)))))],
+ MxArithImmEncoding<MxBead4Bits<0xC>,
+ !cast<MxEncSize>("MxEncSize"#TYPE.Size),
+ MxEncEAb, MxExtI32_1,
+ !cast<MxEncExt>("MxExtI"#TYPE.Size#"_0")>>;
+
+class MxCmp_RM<MxType TYPE, MxOperand MEMOpd, ComplexPattern MEMPat,
+ MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs), (ins TYPE.ROp:$reg, MEMOpd:$mem),
+ "cmp."#TYPE.Prefix#"\t$mem, $reg",
+ [(set CCR, (MxCmp (load MEMPat:$mem), TYPE.ROp:$reg))],
+ MxArithEncoding<MxBead4Bits<0xB>,
+ !cast<MxEncOpMode>("MxOpMode"#TYPE.Size#"dEA"),
+ MxBeadDReg<0>, EA, EXT>>;
+} // let mayLoad = 1
+
+} // let Defs = [CCR]
+
+multiclass MMxCmp_RM<MxType TYPE> {
+ def NAME#TYPE.KOp.Letter : MxCmp_RM<TYPE, TYPE.KOp, TYPE.KPat, MxEncEAk,
+ MxExtBrief_1>;
+ def NAME#TYPE.QOp.Letter : MxCmp_RM<TYPE, TYPE.QOp, TYPE.QPat, MxEncEAq,
+ MxExtI16_1>;
+ def NAME#TYPE.POp.Letter : MxCmp_RM<TYPE, TYPE.POp, TYPE.PPat, MxEncEAp_1,
+ MxExtI16_1>;
+ def NAME#TYPE.FOp.Letter : MxCmp_RM<TYPE, TYPE.FOp, TYPE.FPat, MxEncEAf_1,
+ MxExtBrief_1>;
+ def NAME#TYPE.JOp.Letter : MxCmp_RM<TYPE, TYPE.JOp, TYPE.JPat, MxEncEAj_1,
+ MxExtEmpty>;
+}
+
+multiclass MMxCmp_MI<MxType TYPE> {
+ def NAME#TYPE.KOp.Letter#"i" : MxCmp_MI<TYPE, TYPE.KOp, TYPE.KPat, MxEncEAk,
+ MxExtBrief_1>;
+ def NAME#TYPE.QOp.Letter#"i" : MxCmp_MI<TYPE, TYPE.QOp, TYPE.QPat, MxEncEAq,
+ MxExtI16_1>;
+ def NAME#TYPE.POp.Letter#"i" : MxCmp_MI<TYPE, TYPE.POp, TYPE.PPat, MxEncEAp_1,
+ MxExtI16_1>;
+ def NAME#TYPE.FOp.Letter#"i" : MxCmp_MI<TYPE, TYPE.FOp, TYPE.FPat, MxEncEAf_1,
+ MxExtBrief_1>;
+ def NAME#TYPE.JOp.Letter#"i" : MxCmp_MI<TYPE, TYPE.JOp, TYPE.JPat, MxEncEAj_1,
+ MxExtEmpty>;
+}
+
+foreach S = [8, 16, 32] in {
+ def CMP#S#dd : MxCmp_RR<!cast<MxType>("MxType"#S#"d")>;
+ def CMP#S#di : MxCmp_RI<!cast<MxType>("MxType"#S#"d")>;
+ def CMP#S#bi : MxCmp_BI<!cast<MxType>("MxType"#S#"d")>;
+} // foreach
+
+// cmp mem, Dn
+defm CMP8d : MMxCmp_RM<MxType8d>;
+defm CMP16d : MMxCmp_RM<MxType16d>;
+defm CMP32d : MMxCmp_RM<MxType32d>;
+
+// cmp #imm, mem
+defm CMP8 : MMxCmp_MI<MxType8d>;
+defm CMP16 : MMxCmp_MI<MxType16d>;
+defm CMP32 : MMxCmp_MI<MxType32d>;
+
+
+//===----------------------------------------------------------------------===//
+// EXT
+//===----------------------------------------------------------------------===//
+
+def MxExtOpmode_wb : MxBead3Bits<0b010>;
+def MxExtOpmode_lw : MxBead3Bits<0b011>;
+def MxExtOpmode_lb : MxBead3Bits<0b111>;
+
+/// ---------------------------------------------------
+/// F E D C B A 9 | 8 7 6 | 5 4 3 | 2 1 0
+/// ---------------------------------------------------
+/// 0 1 0 0 1 0 0 | OPMODE | 0 0 0 | REG
+/// ---------------------------------------------------
+class MxExtEncoding<MxBead3Bits OPMODE>
+ : MxEncoding<MxBeadDReg<0>, MxBead3Bits<0b000>, OPMODE,
+ MxBead3Bits<0b100>, MxBead4Bits<0b0100>>;
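+/// For example, with the opmodes above `ext.w %d0` maps onto this layout as
+/// 0100 100 | 010 | 000 | 000 = 0x4880, and `ext.l %d0` as
+/// 0100 100 | 011 | 000 | 000 = 0x48c0.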
+
+let Defs = [CCR] in
+let Constraints = "$src = $dst" in
+class MxExt<MxType TO, MxType FROM>
+ : MxInst<(outs TO.ROp:$dst), (ins TO.ROp:$src),
+ "ext."#TO.Prefix#"\t$src", [],
+ MxExtEncoding<!cast<MxBead3Bits>("MxExtOpmode_"#TO.Prefix#FROM.Prefix)>>;
+
+def EXT16 : MxExt<MxType16d, MxType8d>;
+def EXT32 : MxExt<MxType32d, MxType16d>;
+
+def : Pat<(sext_inreg i16:$src, i8), (EXT16 $src)>;
+def : Pat<(sext_inreg i32:$src, i16), (EXT32 $src)>;
+def : Pat<(sext_inreg i32:$src, i8),
+ (EXT32 (MOVXd32d16 (EXT16 (EXTRACT_SUBREG $src, MxSubRegIndex16Lo))))>;
+
+
+//===----------------------------------------------------------------------===//
+// DIV/MUL
+//===----------------------------------------------------------------------===//
+
+def MxSDiMuOpmode : MxBead3Bits<0b111>;
+def MxUDiMuOpmode : MxBead3Bits<0b011>;
+
+/// Word operation:
+/// ----------------------------------------------------
+/// F E D C | B A 9 | 8 7 6 | 5 4 3 | 2 1 0
+/// ----------------------------------------------------
+/// | | | EFFECTIVE ADDRESS
+/// x x x x | REG | OP MODE | MODE | REG
+/// ----------------------------------------------------
+class MxDiMuEncoding<MxBead4Bits CMD, MxBead3Bits OPMODE, MxEncEA EA, MxEncExt EXT>
+ : MxEncoding<EA.Reg, EA.DA, EA.Mode, OPMODE, MxBeadDReg<0>, CMD,
+ EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>;
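+/// For example, assuming the standard DIVS encoding (CMD = 0x8, OPMODE = 111),
+/// `divs.w %d1, %d0` maps onto this layout as
+///   1000 | 000 | 111 | 000 | 001  =  0x81c1.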
+
+let Defs = [CCR] in {
+let Constraints = "$src = $dst" in {
+// $reg <- $reg op $reg
+class MxDiMuOp_DD<string MN, bits<4> CMD, MxBead3Bits OPMODE,
+ MxOperand DST, MxOperand OPD>
+ : MxInst<(outs DST:$dst), (ins DST:$src, OPD:$opd), MN#"\t$opd, $dst", [],
+ MxDiMuEncoding<MxBead4Bits<CMD>, OPMODE, MxEncEAd_2, MxExtEmpty>>;
+
+// $reg <- $reg op $imm
+class MxDiMuOp_DI<string MN, bits<4> CMD, MxBead3Bits OPMODE,
+ MxOperand DST, MxOperand OPD>
+ : MxInst<(outs DST:$dst), (ins DST:$src, OPD:$opd), MN#"\t$opd, $dst", [],
+ MxDiMuEncoding<MxBead4Bits<CMD>, OPMODE, MxEncEAi, MxExtI16_2>>;
+} // let Constraints
+} // Defs = [CCR]
+
+multiclass MxDiMuOp<string MN, bits<4> CMD, bit isComm = 0> {
+
+ let isCommutable = isComm in {
+ def "S"#NAME#"d32d16" : MxDiMuOp_DD<MN#"s", CMD, MxSDiMuOpmode, MxDRD32,
+ MxDRD16>;
+ def "U"#NAME#"d32d16" : MxDiMuOp_DD<MN#"u", CMD, MxUDiMuOpmode, MxDRD32,
+ MxDRD16>;
+ }
+
+ def "S"#NAME#"d32i16" : MxDiMuOp_DI<MN#"s", CMD, MxSDiMuOpmode, MxDRD32,
+ Mxi16imm>;
+ def "U"#NAME#"d32i16" : MxDiMuOp_DI<MN#"u", CMD, MxUDiMuOpmode, MxDRD32,
+ Mxi16imm>;
+
+}
+
+defm DIV : MxDiMuOp<"div", 0x8>;
+
+// This is used to cast immediates to 16 bits for operations which don't
+// support smaller immediate sizes.
+def as_i16imm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16);
+}]>;
+
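+// In the patterns below DIVS/DIVU d32d16 leave the 16-bit quotient in the low
+// word and the 16-bit remainder in the high word of the 32-bit result, so the
+// srem/urem patterns shift the result right by 16 (as two 8-bit shifts, since
+// the immediate shift count is limited to the 1-8 range) to extract the
+// remainder.
+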
+// RR i8
+def : Pat<(sdiv i8:$dst, i8:$opd),
+ (EXTRACT_SUBREG
+ (SDIVd32d16 (MOVSXd32d8 $dst), (MOVSXd16d8 $opd)),
+ MxSubRegIndex8Lo)>;
+
+def : Pat<(udiv i8:$dst, i8:$opd),
+ (EXTRACT_SUBREG
+ (UDIVd32d16 (MOVZXd32d8 $dst), (MOVZXd16d8 $opd)),
+ MxSubRegIndex8Lo)>;
+
+def : Pat<(srem i8:$dst, i8:$opd),
+ (EXTRACT_SUBREG
+ (ASR32di (ASR32di (SDIVd32d16 (MOVSXd32d8 $dst), (MOVSXd16d8 $opd)), 8), 8),
+ MxSubRegIndex8Lo)>;
+
+def : Pat<(urem i8:$dst, i8:$opd),
+ (EXTRACT_SUBREG
+ (LSR32di (LSR32di (UDIVd32d16 (MOVZXd32d8 $dst), (MOVZXd16d8 $opd)), 8), 8),
+ MxSubRegIndex8Lo)>;
+
+// RR i16
+def : Pat<(sdiv i16:$dst, i16:$opd),
+ (EXTRACT_SUBREG
+ (SDIVd32d16 (MOVSXd32d16 $dst), $opd),
+ MxSubRegIndex16Lo)>;
+
+def : Pat<(udiv i16:$dst, i16:$opd),
+ (EXTRACT_SUBREG
+ (UDIVd32d16 (MOVZXd32d16 $dst), $opd),
+ MxSubRegIndex16Lo)>;
+
+def : Pat<(srem i16:$dst, i16:$opd),
+ (EXTRACT_SUBREG
+ (ASR32di (ASR32di (SDIVd32d16 (MOVSXd32d16 $dst), $opd), 8), 8),
+ MxSubRegIndex16Lo)>;
+
+def : Pat<(urem i16:$dst, i16:$opd),
+ (EXTRACT_SUBREG
+ (LSR32di (LSR32di (UDIVd32d16 (MOVZXd32d16 $dst), $opd), 8), 8),
+ MxSubRegIndex16Lo)>;
+
+
+// RI i8
+def : Pat<(sdiv i8:$dst, MximmSExt8:$opd),
+ (EXTRACT_SUBREG
+ (SDIVd32i16 (MOVSXd32d8 $dst), (as_i16imm $opd)),
+ MxSubRegIndex8Lo)>;
+
+def : Pat<(udiv i8:$dst, MximmSExt8:$opd),
+ (EXTRACT_SUBREG
+ (UDIVd32i16 (MOVZXd32d8 $dst), (as_i16imm $opd)),
+ MxSubRegIndex8Lo)>;
+
+def : Pat<(srem i8:$dst, MximmSExt8:$opd),
+ (EXTRACT_SUBREG
+ (ASR32di (ASR32di (SDIVd32i16 (MOVSXd32d8 $dst), (as_i16imm $opd)), 8), 8),
+ MxSubRegIndex8Lo)>;
+
+def : Pat<(urem i8:$dst, MximmSExt8:$opd),
+ (EXTRACT_SUBREG
+ (LSR32di (LSR32di (UDIVd32i16 (MOVZXd32d8 $dst), (as_i16imm $opd)), 8), 8),
+ MxSubRegIndex8Lo)>;
+
+// RI i16
+def : Pat<(sdiv i16:$dst, MximmSExt16:$opd),
+ (EXTRACT_SUBREG
+ (SDIVd32i16 (MOVSXd32d16 $dst), imm:$opd),
+ MxSubRegIndex16Lo)>;
+
+def : Pat<(udiv i16:$dst, MximmSExt16:$opd),
+ (EXTRACT_SUBREG
+ (UDIVd32i16 (MOVZXd32d16 $dst), imm:$opd),
+ MxSubRegIndex16Lo)>;
+
+def : Pat<(srem i16:$dst, MximmSExt16:$opd),
+ (EXTRACT_SUBREG
+ (ASR32di (ASR32di (SDIVd32i16 (MOVSXd32d16 $dst), imm:$opd), 8), 8),
+ MxSubRegIndex16Lo)>;
+
+def : Pat<(urem i16:$dst, MximmSExt16:$opd),
+ (EXTRACT_SUBREG
+ (LSR32di (LSR32di (UDIVd32i16 (MOVZXd32d16 $dst), imm:$opd), 8), 8),
+ MxSubRegIndex16Lo)>;
+
+
+defm MUL : MxDiMuOp<"mul", 0xC, 1>;
+
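+// SMUL/UMUL d32d16 produce a full 32-bit product, so the mulhs/mulhu patterns
+// below extract the high word with the same pair of 8-bit right shifts used
+// for the division remainders above.
+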
+// RR
+def : Pat<(mul i16:$dst, i16:$opd),
+ (EXTRACT_SUBREG
+ (SMULd32d16 (MOVXd32d16 $dst), $opd),
+ MxSubRegIndex16Lo)>;
+
+def : Pat<(mulhs i16:$dst, i16:$opd),
+ (EXTRACT_SUBREG
+ (ASR32di (ASR32di (SMULd32d16 (MOVXd32d16 $dst), $opd), 8), 8),
+ MxSubRegIndex16Lo)>;
+
+def : Pat<(mulhu i16:$dst, i16:$opd),
+ (EXTRACT_SUBREG
+ (LSR32di (LSR32di (UMULd32d16 (MOVXd32d16 $dst), $opd), 8), 8),
+ MxSubRegIndex16Lo)>;
+
+
+// RI
+def : Pat<(mul i16:$dst, MximmSExt16:$opd),
+ (EXTRACT_SUBREG
+ (SMULd32i16 (MOVXd32d16 $dst), imm:$opd),
+ MxSubRegIndex16Lo)>;
+
+def : Pat<(mulhs i16:$dst, MximmSExt16:$opd),
+ (EXTRACT_SUBREG
+ (ASR32di (ASR32di (SMULd32i16 (MOVXd32d16 $dst), imm:$opd), 8), 8),
+ MxSubRegIndex16Lo)>;
+
+def : Pat<(mulhu i16:$dst, MximmSExt16:$opd),
+ (EXTRACT_SUBREG
+ (LSR32di (LSR32di (UMULd32i16 (MOVXd32d16 $dst), imm:$opd), 8), 8),
+ MxSubRegIndex16Lo)>;
+
+
+//===----------------------------------------------------------------------===//
+// NEG/NEGX
+//===----------------------------------------------------------------------===//
+
+/// ------------+------------+------+---------+---------
+/// F E D C | B A 9 8 | 7 6 | 5 4 3 | 2 1 0
+/// ------------+------------+------+-------------------
+/// | | | EFFECTIVE ADDRESS
+/// 0 1 0 0 | x x x x | SIZE | MODE | REG
+/// ------------+------------+------+---------+---------
+class MxNEGEncoding<MxBead4Bits CMD, MxEncSize SIZE, MxEncEA EA, MxEncExt EXT>
+ : MxEncoding<EA.Reg, EA.DA, EA.Mode, SIZE, CMD, MxBead4Bits<0b0100>,
+ EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>;
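+/// For example, with the CMD values used below (0x4 for NEG, 0x0 for NEGX),
+/// `neg.l %d0` maps onto this layout as 0100 | 0100 | 10 | 000 | 000 = 0x4480
+/// and `negx.l %d0` as 0100 | 0000 | 10 | 000 | 000 = 0x4080.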
+
+let Defs = [CCR] in {
+let Constraints = "$src = $dst" in {
+
+class MxNeg_D<MxType TYPE>
+ : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src),
+ "neg."#TYPE.Prefix#"\t$dst",
+ [(set TYPE.VT:$dst, (ineg TYPE.VT:$src))],
+ MxNEGEncoding<MxBead4Bits<0x4>,
+ !cast<MxEncSize>("MxEncSize"#TYPE.Size),
+ MxEncEAd_0, MxExtEmpty>>;
+
+let Uses = [CCR] in {
+class MxNegX_D<MxType TYPE>
+ : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src),
+ "negx."#TYPE.Prefix#"\t$dst",
+ [(set TYPE.VT:$dst, (MxSubX 0, TYPE.VT:$src, CCR))],
+ MxNEGEncoding<MxBead4Bits<0x0>,
+ !cast<MxEncSize>("MxEncSize"#TYPE.Size),
+ MxEncEAd_0, MxExtEmpty>>;
+}
+
+} // let Constraints
+} // let Defs = [CCR]
+
+foreach S = [8, 16, 32] in {
+ def NEG#S#d : MxNeg_D<!cast<MxType>("MxType"#S#"d")>;
+ def NEGX#S#d : MxNegX_D<!cast<MxType>("MxType"#S#"d")>;
+}
+
+def : Pat<(MxSub 0, i8 :$src), (NEG8d MxDRD8 :$src)>;
+def : Pat<(MxSub 0, i16:$src), (NEG16d MxDRD16:$src)>;
+def : Pat<(MxSub 0, i32:$src), (NEG32d MxDRD32:$src)>;
+
+//===----------------------------------------------------------------------===//
+// no-CCR Patterns
+//===----------------------------------------------------------------------===//
+
+/// Basically the reason for this stuff is that add and addc share the same
+/// operand type constraints for whatever reason, so I had to define common
+/// MxAdd and MxSub instructions that produce CCR and then pattern-map add and
+/// addc to them.
+/// NOTE On the other hand I see no reason why I cannot just drop the explicit
+/// CCR result. Anyway, this works for now; hopefully I will understand better
+/// how this stuff is designed later.
+foreach N = ["add", "addc"] in {
+
+ // add reg, reg
+ def : Pat<(!cast<SDNode>(N) i8 :$src, i8 :$opd),
+ (ADD8dd MxDRD8 :$src, MxDRD8 :$opd)>;
+ def : Pat<(!cast<SDNode>(N) i16:$src, i16:$opd),
+ (ADD16dd MxDRD16:$src, MxDRD16:$opd)>;
+ def : Pat<(!cast<SDNode>(N) i32:$src, i32:$opd),
+ (ADD32rr MxXRD32:$src, MxXRD32:$opd)>;
+
+ // add (An), reg
+ def : Pat<(!cast<SDNode>(N) MxType8.VT:$src, (Mxloadi8 MxType8.JPat:$opd)),
+ (ADD8dj MxDRD8:$src, MxType8.JOp:$opd)>;
+ def : Pat<(!cast<SDNode>(N) MxType16.VT:$src, (Mxloadi16 MxType16.JPat:$opd)),
+ (ADD16dj MxDRD16:$src, MxType16.JOp:$opd)>;
+ def : Pat<(!cast<SDNode>(N) MxType32.VT:$src, (Mxloadi32 MxType32.JPat:$opd)),
+ (ADD32rj MxXRD32:$src, MxType32.JOp:$opd)>;
+
+ // add (i,An), reg
+ def : Pat<(!cast<SDNode>(N) MxType8.VT:$src, (Mxloadi8 MxType8.PPat:$opd)),
+ (ADD8dp MxDRD8:$src, MxType8.POp:$opd)>;
+ def : Pat<(!cast<SDNode>(N) MxType16.VT:$src, (Mxloadi16 MxType16.PPat:$opd)),
+ (ADD16dp MxDRD16:$src, MxType16.POp:$opd)>;
+ def : Pat<(!cast<SDNode>(N) MxType32.VT:$src, (Mxloadi32 MxType32.PPat:$opd)),
+ (ADD32rp MxXRD32:$src, MxType32.POp:$opd)>;
+
+ // add (i,An,Xn), reg
+ def : Pat<(!cast<SDNode>(N) MxType8.VT:$src, (Mxloadi8 MxType8.FPat:$opd)),
+ (ADD8df MxDRD8:$src, MxType8.FOp:$opd)>;
+ def : Pat<(!cast<SDNode>(N) MxType16.VT:$src, (Mxloadi16 MxType16.FPat:$opd)),
+ (ADD16df MxDRD16:$src, MxType16.FOp:$opd)>;
+ def : Pat<(!cast<SDNode>(N) MxType32.VT:$src, (Mxloadi32 MxType32.FPat:$opd)),
+ (ADD32rf MxXRD32:$src, MxType32.FOp:$opd)>;
+
+ // add reg, imm
+ def : Pat<(!cast<SDNode>(N) i8: $src, MximmSExt8:$opd),
+ (ADD8di MxDRD8 :$src, imm:$opd)>;
+ def : Pat<(!cast<SDNode>(N) i16:$src, MximmSExt16:$opd),
+ (ADD16di MxDRD16:$src, imm:$opd)>;
+
+  // LEAp is more complex and thus will be selected over a normal ADD32ri, but it
+  // cannot be used with data registers; by adding complexity to the simple
+  // ADD32ri instruction here we make sure it will be selected over LEAp.
+ let AddedComplexity = 15 in {
+ def : Pat<(!cast<SDNode>(N) i32:$src, MximmSExt32:$opd),
+ (ADD32ri MxXRD32:$src, imm:$opd)>;
+ } // AddedComplexity = 15
+
+ // add imm, (An)
+ def : Pat<(store (!cast<SDNode>(N) (load MxType8.JPat:$dst), MxType8.IPat:$opd),
+ MxType8.JPat:$dst),
+ (ADD8ji MxType8.JOp:$dst, imm:$opd)>;
+ def : Pat<(store (!cast<SDNode>(N) (load MxType16.JPat:$dst), MxType16.IPat:$opd),
+ MxType16.JPat:$dst),
+ (ADD16ji MxType16.JOp:$dst, imm:$opd)>;
+ def : Pat<(store (!cast<SDNode>(N) (load MxType32.JPat:$dst), MxType32.IPat:$opd),
+ MxType32.JPat:$dst),
+ (ADD32ji MxType32.JOp:$dst, imm:$opd)>;
+
+} // foreach add, addc
+
+def : Pat<(adde i8 :$src, i8 :$opd), (ADDX8dd MxDRD8 :$src, MxDRD8 :$opd)>;
+def : Pat<(adde i16:$src, i16:$opd), (ADDX16dd MxDRD16:$src, MxDRD16:$opd)>;
+def : Pat<(adde i32:$src, i32:$opd), (ADDX32dd MxDRD32:$src, MxDRD32:$opd)>;
+
+
+
+foreach N = ["sub", "subc"] in {
+
+ // sub reg, reg
+ def : Pat<(!cast<SDNode>(N) i8 :$src, i8 :$opd),
+ (SUB8dd MxDRD8 :$src, MxDRD8 :$opd)>;
+ def : Pat<(!cast<SDNode>(N) i16:$src, i16:$opd),
+ (SUB16dd MxDRD16:$src, MxDRD16:$opd)>;
+ def : Pat<(!cast<SDNode>(N) i32:$src, i32:$opd),
+ (SUB32rr MxXRD32:$src, MxXRD32:$opd)>;
+
+
+ // sub (An), reg
+ def : Pat<(!cast<SDNode>(N) MxType8.VT:$src, (Mxloadi8 MxType8.JPat:$opd)),
+ (SUB8dj MxDRD8:$src, MxType8.JOp:$opd)>;
+ def : Pat<(!cast<SDNode>(N) MxType16.VT:$src, (Mxloadi16 MxType16.JPat:$opd)),
+ (SUB16dj MxDRD16:$src, MxType16.JOp:$opd)>;
+ def : Pat<(!cast<SDNode>(N) MxType32.VT:$src, (Mxloadi32 MxType32.JPat:$opd)),
+ (SUB32rj MxXRD32:$src, MxType32.JOp:$opd)>;
+
+ // sub (i,An), reg
+ def : Pat<(!cast<SDNode>(N) MxType8.VT:$src, (Mxloadi8 MxType8.PPat:$opd)),
+ (SUB8dp MxDRD8:$src, MxType8.POp:$opd)>;
+ def : Pat<(!cast<SDNode>(N) MxType16.VT:$src, (Mxloadi16 MxType16.PPat:$opd)),
+ (SUB16dp MxDRD16:$src, MxType16.POp:$opd)>;
+ def : Pat<(!cast<SDNode>(N) MxType32.VT:$src, (Mxloadi32 MxType32.PPat:$opd)),
+ (SUB32rp MxXRD32:$src, MxType32.POp:$opd)>;
+
+ // sub (i,An,Xn), reg
+ def : Pat<(!cast<SDNode>(N) MxType8.VT:$src, (Mxloadi8 MxType8.FPat:$opd)),
+ (SUB8df MxDRD8:$src, MxType8.FOp:$opd)>;
+ def : Pat<(!cast<SDNode>(N) MxType16.VT:$src, (Mxloadi16 MxType16.FPat:$opd)),
+ (SUB16df MxDRD16:$src, MxType16.FOp:$opd)>;
+ def : Pat<(!cast<SDNode>(N) MxType32.VT:$src, (Mxloadi32 MxType32.FPat:$opd)),
+ (SUB32rf MxXRD32:$src, MxType32.FOp:$opd)>;
+
+ // sub reg, imm
+ def : Pat<(!cast<SDNode>(N) i8 :$src, MximmSExt8 :$opd),
+ (SUB8di MxDRD8 :$src, imm:$opd)>;
+ def : Pat<(!cast<SDNode>(N) i16:$src, MximmSExt16:$opd),
+ (SUB16di MxDRD16:$src, imm:$opd)>;
+ def : Pat<(!cast<SDNode>(N) i32:$src, MximmSExt32:$opd),
+ (SUB32ri MxXRD32:$src, imm:$opd)>;
+
+ // sub imm, (An)
+ def : Pat<(store (!cast<SDNode>(N) (load MxType8.JPat:$dst), MxType8.IPat:$opd),
+ MxType8.JPat:$dst),
+ (SUB8ji MxType8.JOp:$dst, imm:$opd)>;
+ def : Pat<(store (!cast<SDNode>(N) (load MxType16.JPat:$dst), MxType16.IPat:$opd),
+ MxType16.JPat:$dst),
+ (SUB16ji MxType16.JOp:$dst, imm:$opd)>;
+ def : Pat<(store (!cast<SDNode>(N) (load MxType32.JPat:$dst), MxType32.IPat:$opd),
+ MxType32.JPat:$dst),
+ (SUB32ji MxType32.JOp:$dst, imm:$opd)>;
+
+} // foreach sub, subc
+
+def : Pat<(sube i8 :$src, i8 :$opd), (SUBX8dd MxDRD8 :$src, MxDRD8 :$opd)>;
+def : Pat<(sube i16:$src, i16:$opd), (SUBX16dd MxDRD16:$src, MxDRD16:$opd)>;
+def : Pat<(sube i32:$src, i32:$opd), (SUBX32dd MxDRD32:$src, MxDRD32:$opd)>;
+
+multiclass BitwisePat<string INST, SDNode OP> {
+ // op reg, reg
+ def : Pat<(OP i8 :$src, i8 :$opd),
+ (!cast<MxInst>(INST#"8dd") MxDRD8 :$src, MxDRD8 :$opd)>;
+ def : Pat<(OP i16:$src, i16:$opd),
+ (!cast<MxInst>(INST#"16dd") MxDRD16:$src, MxDRD16:$opd)>;
+ def : Pat<(OP i32:$src, i32:$opd),
+ (!cast<MxInst>(INST#"32dd") MxDRD32:$src, MxDRD32:$opd)>;
+ // op reg, imm
+ def : Pat<(OP i8: $src, MximmSExt8 :$opd),
+ (!cast<MxInst>(INST#"8di") MxDRD8 :$src, imm:$opd)>;
+ def : Pat<(OP i16:$src, MximmSExt16:$opd),
+ (!cast<MxInst>(INST#"16di") MxDRD16:$src, imm:$opd)>;
+ def : Pat<(OP i32:$src, MximmSExt32:$opd),
+ (!cast<MxInst>(INST#"32di") MxDRD32:$src, imm:$opd)>;
+}
+
+defm : BitwisePat<"AND", and>;
+defm : BitwisePat<"OR", or>;
+defm : BitwisePat<"XOR", xor>;
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kInstrBits.td b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrBits.td
new file mode 100644
index 0000000..d97ca50
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrBits.td
@@ -0,0 +1,100 @@
+//===------- M68kInstrBits.td - Bit Manipulation Instrs --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes the bit manipulation instructions in the M68k
+/// architecture. Here is the current status of the file:
+///
+/// Machine:
+///
+///  BCHG [ ]   BCLR [ ]   BSET [ ]    BTST [~]
+///
+/// Map:
+///
+/// [ ] - was not touched at all
+///  [!] - requires external stuff implemented
+/// [~] - in progress but usable
+/// [x] - done
+///
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// BTST
+//===----------------------------------------------------------------------===//
+
+/// ------------+---------+---------+---------+---------
+/// F E D C | B A 9 | 8 7 6 | 5 4 3 | 2 1 0
+/// ------------+---------+---------+---------+---------
+/// 0 0 0 0 | REG | 1 0 0 | MODE | REG
+/// ------------+---------+---------+---------+---------
+class MxBTSTEnc_R<MxBeadDReg REG, MxEncEA EA, MxEncExt EXT>
+ : MxEncoding<EA.Reg, EA.DA, EA.Mode, MxBead3Bits<0b100>, REG, MxBead4Bits<0b0000>,
+ EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>;
+
+/// -------------------------------+---------+---------
+/// F E D C B A 9 8 . 7 6 | 5 4 3 | 2 1 0
+/// -------------------------------+---------+---------
+/// 0 0 0 0 1 0 0 0 . 0 0 | MODE | REG
+/// ------------------------+------+---------+---------
+/// 0 0 0 0 0 0 0 0 | BIT NUMBER
+/// ------------------------+--------------------------
+class MxBTSTEnc_I<MxBead8Imm IMM, MxEncEA EA, MxEncExt EXT>
+ : MxEncoding<EA.Reg, EA.DA, EA.Mode, MxBead2Bits<0b00>,
+ MxBead4Bits<0b1000>, MxBead4Bits<0b0000>, IMM,
+ EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>;
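+/// For example, with these layouts `btst %d1, %d0` maps onto the register form
+/// as 0000 | 001 | 100 | 000 | 000 = 0x0300, and `btst #7, %d0` onto the
+/// immediate form as 0000 1000 00 | 000 | 000 = 0x0800 followed by the
+/// bit-number word 0x0007.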
+
+let Defs = [CCR] in {
+class MxBTST_RR<MxType TYPE>
+ : MxInst<(outs), (ins TYPE.ROp:$dst, TYPE.ROp:$bitno), "btst\t$bitno, $dst",
+ [(set CCR, (MxBt TYPE.VT:$dst, TYPE.VT:$bitno))],
+ MxBTSTEnc_R<MxBeadDReg<1>, MxEncEAd_0, MxExtEmpty>>;
+
+class MxBTST_RI<MxType TYPE>
+ : MxInst<(outs), (ins TYPE.ROp:$dst, TYPE.IOp:$bitno), "btst\t$bitno, $dst",
+ [(set CCR, (MxBt TYPE.VT:$dst, TYPE.IPat:$bitno))],
+ MxBTSTEnc_I<MxBead8Imm<1>, MxEncEAd_0, MxExtEmpty>>;
+
+class MxBTST_MR<MxType TYPE, MxOperand MEMOpd, ComplexPattern MEMPat,
+ MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs), (ins MEMOpd:$dst, TYPE.ROp:$bitno), "btst\t$bitno, $dst",
+ [(set CCR, (MxBt (TYPE.Load MEMPat:$dst), TYPE.VT:$bitno))],
+ MxBTSTEnc_R<MxBeadDReg<1>, EA, EXT>>;
+
+class MxBTST_MI<MxType TYPE, MxOperand MEMOpd, ComplexPattern MEMPat,
+ MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs), (ins MEMOpd:$dst, TYPE.IOp:$bitno), "btst\t$bitno, $dst",
+ [(set CCR, (MxBt (TYPE.Load MEMPat:$dst), TYPE.IPat:$bitno))],
+ MxBTSTEnc_I<MxBead8Imm<1>, EA, EXT>>;
+} // Defs = [CCR]
+
+// Register BTST limited to 32 bits only
+def BTST32dd : MxBTST_RR<MxType32d>;
+def BTST32di : MxBTST_RI<MxType32d>;
+
+// Memory BTST limited to 8 bits only
+def BTST8jd : MxBTST_MR<MxType8d, MxType8.JOp, MxType8.JPat,
+ MxEncEAj_0, MxExtEmpty>;
+def BTST8pd : MxBTST_MR<MxType8d, MxType8.POp, MxType8.PPat,
+ MxEncEAp_0, MxExtI16_0>;
+def BTST8fd : MxBTST_MR<MxType8d, MxType8.FOp, MxType8.FPat,
+ MxEncEAf_0, MxExtBrief_0>;
+def BTST8qd : MxBTST_MR<MxType8d, MxType8.QOp, MxType8.QPat,
+ MxEncEAq, MxExtI16_0>;
+def BTST8kd : MxBTST_MR<MxType8d, MxType8.KOp, MxType8.KPat,
+ MxEncEAk, MxExtBrief_0>;
+
+def BTST8ji : MxBTST_MI<MxType8d, MxType8.JOp, MxType8.JPat,
+ MxEncEAj_0, MxExtEmpty>;
+def BTST8pi : MxBTST_MI<MxType8d, MxType8.POp, MxType8.PPat,
+ MxEncEAp_0, MxExtI16_0>;
+def BTST8fi : MxBTST_MI<MxType8d, MxType8.FOp, MxType8.FPat,
+ MxEncEAf_0, MxExtBrief_0>;
+def BTST8qi : MxBTST_MI<MxType8d, MxType8.QOp, MxType8.QPat,
+ MxEncEAq, MxExtI16_0>;
+def BTST8ki : MxBTST_MI<MxType8d, MxType8.KOp, MxType8.KPat,
+ MxEncEAk, MxExtBrief_0>;
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kInstrBuilder.h b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrBuilder.h
new file mode 100644
index 0000000..e32b1b0
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrBuilder.h
@@ -0,0 +1,94 @@
+//===-- M68kInstrBuilder.h - Functions to build M68k insts --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file exposes functions that may be used with BuildMI from the
+/// MachineInstrBuilder.h file to handle M68k'isms in a clean way.
+///
+/// TODO The BuildMem function may be used with the BuildMI function to add
+/// entire memory references in a single, typed function call. M68k memory
+/// references can be very complex expressions (described in the README), so
+/// wrapping them up behind an easier-to-use interface makes sense.
+/// Descriptions of the functions are included below.
+///
+/// For reference, the order of operands for memory references is:
+/// (Operand), Base, Scale, Index, Displacement.
+///
+//===----------------------------------------------------------------------===//
+//
+#ifndef LLVM_LIB_TARGET_M68K_M68KINSTRBUILDER_H
+#define LLVM_LIB_TARGET_M68K_M68KINSTRBUILDER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+
+#include <cassert>
+
+namespace llvm {
+namespace M68k {
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
+ return MIB.addImm(Offset);
+}
+
+/// addRegIndirectWithDisp - This function is used to add a memory reference
+/// of the form (Offset, Base), i.e., one with no scale or index, but with a
+/// displacement. An example is: (4,D0).
+static inline const MachineInstrBuilder &
+addRegIndirectWithDisp(const MachineInstrBuilder &MIB, Register Reg,
+ bool IsKill, int Offset) {
+ return MIB.addImm(Offset).addReg(Reg, getKillRegState(IsKill));
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. This
+/// reference uses the FrameIndex as its base register until it is resolved.
+/// This allows a constant offset to be specified as well...
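+///
+/// A minimal usage sketch (the MOVxxx opcode name is purely illustrative, not
+/// an actual M68k opcode): loading spill slot FrameIdx into DstReg would look
+/// roughly like
+///   addFrameReference(BuildMI(MBB, MI, DL, TII->get(M68k::MOVxxx), DstReg),
+///                     FrameIdx);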
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+ MachineInstr *MI = MIB;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const MCInstrDesc &MCID = MI->getDesc();
+ auto Flags = MachineMemOperand::MONone;
+ if (MCID.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+ if (MCID.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+ return MIB.addImm(Offset).addFrameIndex(FI).addMemOperand(MMO);
+}
+
+static inline const MachineInstrBuilder &
+addMemOperand(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+ MachineInstr *MI = MIB;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const MCInstrDesc &MCID = MI->getDesc();
+ auto Flags = MachineMemOperand::MONone;
+ if (MCID.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+ if (MCID.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+ return MIB.addMemOperand(MMO);
+}
+} // end namespace M68k
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_M68K_M68KINSTRBUILDER_H
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kInstrCompiler.td b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrCompiler.td
new file mode 100644
index 0000000..bcb815d
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrCompiler.td
@@ -0,0 +1,126 @@
+//===-- M68kInstrCompiler.td - Pseudos and Patterns ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes the various pseudo instructions used by the compiler,
+/// as well as Pat patterns used during instruction selection.
+///
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ConstantPool, GlobalAddress, ExternalSymbol, and JumpTable
+//===----------------------------------------------------------------------===//
+
+def : Pat<(i32 (MxWrapper tconstpool :$src)), (MOV32ri tconstpool :$src)>;
+def : Pat<(i32 (MxWrapper tglobaladdr :$src)), (MOV32ri tglobaladdr :$src)>;
+def : Pat<(i32 (MxWrapper texternalsym :$src)), (MOV32ri texternalsym :$src)>;
+def : Pat<(i32 (MxWrapper tjumptable :$src)), (MOV32ri tjumptable :$src)>;
+def : Pat<(i32 (MxWrapper tblockaddress :$src)), (MOV32ri tblockaddress :$src)>;
+
+def : Pat<(add MxDRD32:$src, (MxWrapper tconstpool:$opd)),
+ (ADD32ri MxDRD32:$src, tconstpool:$opd)>;
+def : Pat<(add MxARD32:$src, (MxWrapper tjumptable:$opd)),
+ (ADD32ri MxARD32:$src, tjumptable:$opd)>;
+def : Pat<(add MxARD32:$src, (MxWrapper tglobaladdr :$opd)),
+ (ADD32ri MxARD32:$src, tglobaladdr:$opd)>;
+def : Pat<(add MxARD32:$src, (MxWrapper texternalsym:$opd)),
+ (ADD32ri MxARD32:$src, texternalsym:$opd)>;
+def : Pat<(add MxARD32:$src, (MxWrapper tblockaddress:$opd)),
+ (ADD32ri MxARD32:$src, tblockaddress:$opd)>;
+
+def : Pat<(store (i32 (MxWrapper tglobaladdr:$src)), iPTR:$dst),
+ (MOV32ji MxARI32:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (MxWrapper texternalsym:$src)), iPTR:$dst),
+ (MOV32ji MxARI32:$dst, texternalsym:$src)>;
+def : Pat<(store (i32 (MxWrapper tblockaddress:$src)), iPTR:$dst),
+ (MOV32ji MxARI32:$dst, tblockaddress:$src)>;
+
+def : Pat<(i32 (MxWrapperPC tconstpool :$src)), (LEA32q tconstpool :$src)>;
+def : Pat<(i32 (MxWrapperPC tglobaladdr :$src)), (LEA32q tglobaladdr :$src)>;
+def : Pat<(i32 (MxWrapperPC texternalsym :$src)), (LEA32q texternalsym :$src)>;
+def : Pat<(i32 (MxWrapperPC tjumptable :$src)), (LEA32q tjumptable :$src)>;
+def : Pat<(i32 (MxWrapperPC tblockaddress :$src)), (LEA32q tblockaddress :$src)>;
+
+
+//===----------------------------------------------------------------------===//
+// Conditional Move Pseudo Instructions
+//
+// CMOV* - Used to implement the SELECT DAG operation. Expanded after
+// instruction selection into a branch sequence.
+//===----------------------------------------------------------------------===//
+
+let usesCustomInserter = 1, Uses = [CCR] in
+class MxCMove<MxType TYPE>
+ : MxPseudo<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$t, TYPE.ROp:$f, i8imm:$cond),
+ [(set TYPE.VT:$dst,
+ (TYPE.VT (MxCmov TYPE.VT:$t, TYPE.VT:$f, imm:$cond, CCR)))]>;
+
+def CMOV8d : MxCMove<MxType8d>;
+def CMOV16d : MxCMove<MxType16d>;
+def CMOV32r : MxCMove<MxType32r>;
+
+
+//===----------------------------------------------------------------------===//
+// Calls
+//===----------------------------------------------------------------------===//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def %SP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber CCR.
+let Defs = [SP, CCR], Uses = [SP] in {
+
+ def ADJCALLSTACKDOWN
+ : MxPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(MxCallSeqStart timm:$amt1, timm:$amt2)]>;
+
+ def ADJCALLSTACKUP
+ : MxPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(MxCallSeqEnd timm:$amt1, timm:$amt2)]>;
+
+} // Defs
+
+//===----------------------------------------------------------------------===//
+// Tail Call
+//===----------------------------------------------------------------------===//
+
+// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
+// can never use callee-saved registers. That is the purpose of the XR32_TC
+// register classes.
+
+// FIXME TC is disabled for PIC mode because the global base
+// register, which is part of the address mode, may be assigned a
+// callee-saved register.
+def : Pat<(MxTCRet (load MxCP_ARII:$dst), imm:$adj),
+ (TCRETURNj (MOV32af_TC MxARII32:$dst), imm:$adj)>,
+ Requires<[IsNotPIC]>;
+
+def : Pat<(MxTCRet AR32_TC:$dst, imm:$adj),
+ (TCRETURNj MxARI32_TC:$dst, imm:$adj)>;
+
+def : Pat<(MxTCRet (i32 tglobaladdr:$dst), imm:$adj),
+ (TCRETURNq MxPCD32:$dst, imm:$adj)>;
+
+def : Pat<(MxTCRet (i32 texternalsym:$dst), imm:$adj),
+ (TCRETURNq MxPCD32:$dst, imm:$adj)>;
+
+
+//===----------------------------------------------------------------------===//
+// Segmented Stack
+//
+// When using segmented stacks these are lowered into instructions which first
+// check if the current stacklet has enough free memory. If it does, memory is
+// allocated by bumping the stack pointer. Otherwise memory is allocated from
+// the heap.
+//===----------------------------------------------------------------------===//
+
+let Defs = [SP, CCR], Uses = [SP] in
+let usesCustomInserter = 1 in
+def SALLOCA : MxPseudo<(outs MxARD32:$dst), (ins MxARD32:$size),
+ [(set iPTR:$dst, (MxSegAlloca iPTR:$size))]>;
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kInstrControl.td b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrControl.td
new file mode 100644
index 0000000..7084747
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrControl.td
@@ -0,0 +1,317 @@
+//===-- M68kInstrControl.td - Control Flow Instructions --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes the M68k jump, return, call, and related instructions.
+/// Here is the current status of the file:
+///
+/// Machine:
+///
+/// BRA [x] BSR [ ] Bcc [ ] DBcc [ ] FBcc [ ]
+/// FDBcc [ ] FNOP [ ] FPn [ ] FScc [ ] FTST [ ]
+/// JMP [~] JSR [x] NOP [x] RTD [!] RTR [ ]
+/// RTS [x] Scc [x] TST [ ]
+///
+/// Pseudo:
+///
+/// RET [x]
+/// TCRETURNj [x] TCRETURNq [x]
+/// TAILJMPj [x] TAILJMPq [x]
+///
+/// Map:
+///
+/// [ ] - was not touched at all
+///  [!] - requires external stuff implemented
+/// [~] - in progress but usable
+/// [x] - done
+///
+///
+/// NOTE
+/// Though branch and jump instructions use memory operands, they
+/// DO NOT read the jump address from memory; they just calculate the EA
+/// and jump there.
+///
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// NOP
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in {
+ def NOP : MxInst<(outs), (ins), "nop", [], MxEncFixed<0x4E71>>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Conditions
+//===----------------------------------------------------------------------===//
+
+/// CC—Carry clear GE—Greater than or equal
+/// LS—Lower or same PL—Plus
+/// CS—Carry set GT—Greater than
+/// LT—Less than T—Always true*
+/// EQ—Equal HI—Higher
+/// MI—Minus VC—Overflow clear
+/// F—Never true* LE—Less than or equal
+/// NE—Not equal VS—Overflow set
+///
+/// *Not applicable to the Bcc instructions.
+def MxCCt : MxBead4Bits<0b0000>;
+def MxCCf : MxBead4Bits<0b0001>;
+def MxCChi : MxBead4Bits<0b0010>;
+def MxCCls : MxBead4Bits<0b0011>;
+def MxCCcc : MxBead4Bits<0b0100>;
+def MxCCcs : MxBead4Bits<0b0101>;
+def MxCCne : MxBead4Bits<0b0110>;
+def MxCCeq : MxBead4Bits<0b0111>;
+def MxCCvc : MxBead4Bits<0b1000>;
+def MxCCvs : MxBead4Bits<0b1001>;
+def MxCCpl : MxBead4Bits<0b1010>;
+def MxCCmi : MxBead4Bits<0b1011>;
+def MxCCge : MxBead4Bits<0b1100>;
+def MxCClt : MxBead4Bits<0b1101>;
+def MxCCgt : MxBead4Bits<0b1110>;
+def MxCCle : MxBead4Bits<0b1111>;
+
+/// --------------------------------+---------+---------
+/// F E D C | B A 9 8 | 7 6 | 5 4 3 | 2 1 0
+/// --------------------------------+---------+---------
+/// 0 1 0 1 | CONDITION | 1 1 | MODE | REG
+/// ----------------------------------------------------
+class MxSccEncoding<MxEncEA EA, MxEncExt EXT, MxBead4Bits CC>
+ : MxEncoding<EA.Reg, EA.DA, EA.Mode, MxBead2Bits<0b11>, CC, MxBead4Bits<0b0101>,
+ EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>;
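+/// For example, with MxCCne = 0110 above, `sne %d0` maps onto this layout as
+/// 0101 | 0110 | 11 | 000 | 000 = 0x56c0.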
+
+let Uses = [CCR] in {
+class MxSccR<string CC>
+ : MxInst<(outs MxDRD8:$dst), (ins), "s"#CC#"\t$dst",
+ [(set i8:$dst, (MxSetCC !cast<PatLeaf>("MxCOND"#CC), CCR))],
+ MxSccEncoding<MxEncEAd_0, MxExtEmpty,
+ !cast<MxBead4Bits>("MxCC"#CC)>>;
+
+class MxSccM<string CC, MxOperand MEMOpd, ComplexPattern MEMPat,
+ MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs), (ins MEMOpd:$dst), "s"#CC#"\t$dst",
+ [(store (MxSetCC !cast<PatLeaf>("MxCOND"#CC), CCR), MEMPat:$dst)],
+ MxSccEncoding<EA, EXT, !cast<MxBead4Bits>("MxCC"#CC)>>;
+}
+
+foreach cc = [ "cc", "ls", "lt", "eq", "mi", "f", "ne", "ge",
+ "cs", "pl", "gt", "t", "hi", "vc", "le", "vs"] in {
+def SET#"d8"#cc : MxSccR<cc>;
+def SET#"j8"#cc : MxSccM<cc, MxType8.JOp, MxType8.JPat, MxEncEAj_0, MxExtEmpty>;
+def SET#"p8"#cc : MxSccM<cc, MxType8.POp, MxType8.PPat, MxEncEAp_0, MxExtI16_0>;
+}
+
+//===----------------------------------------------------------------------===//
+// Jumps
+//===----------------------------------------------------------------------===//
+
+///------------------------------+---------+---------
+/// F E D C B A 9 8 7 6 | 5 4 3 | 2 1 0
+///------------------------------+---------+---------
+/// 0 1 0 0 1 1 1 0 1 1 | MODE | REG
+///------------------------------+---------+---------
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
+class MxJMP<MxOperand LOCOp, ComplexPattern LOCPat, MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs), (ins LOCOp:$dst), "jmp\t$dst", [(brind iPTR:$dst)],
+ MxEncoding<EA.Reg, EA.DA, EA.Mode, MxBead2Bits<0b11>,
+ MxBead4Bits<0b1110>, MxBead4Bits<0b0100>,
+ EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>>;
+
+def JMP32j : MxJMP<MxARI32, MxCP_ARI, MxEncEAj_0, MxExtEmpty>;
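+/// For example, `jmp (%a0)` (JMP32j, address register indirect, MODE = 010)
+/// maps onto this layout as 0100 1110 11 | 010 | 000 = 0x4ed0.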
+
+
+// FIXME Support 16-bit indirect jumps.
+// Currently M68k does not allow 16-bit indirect jumps; use sext operands.
+// def JMP16r : MxInst<(outs), (ins M68k_ARI16:$dst),
+// "jmp\t$dst",
+// [(brind AR16:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// Branches
+//===----------------------------------------------------------------------===//
+
+/// --------------------------------------------------
+/// F E D C | B A 9 8 | 7 6 5 4 3 2 1 0
+/// --------------------------------------------------
+/// 0 1 1 0 | CONDITION | 8-BIT DISPLACEMENT
+/// --------------------------------------------------
+/// 16-BIT DISPLACEMENT IF 8-BIT DISPLACEMENT = $00
+/// --------------------------------------------------
+/// 32-BIT DISPLACEMENT IF 8-BIT DISPLACEMENT = $FF
+/// --------------------------------------------------
+let isBranch = 1, isTerminator = 1, Uses = [CCR] in
+class MxBcc<string cc, Operand TARGET, MxType TYPE, MxEncoding ENC = MxEncEmpty>
+ : MxInst<(outs), (ins TARGET:$dst), "b"#cc#"\t$dst", [], ENC>;
+
+foreach cc = [ "cc", "ls", "lt", "eq", "mi", "ne", "ge",
+ "cs", "pl", "gt", "hi", "vc", "le", "vs"] in {
+ def B#cc#"8"
+ : MxBcc<cc, MxBrTarget8, MxType8,
+ MxEncoding<MxBead8Disp<0>,
+ !cast<MxBead4Bits>("MxCC"#cc), MxBead4Bits<0x6>>>;
+ def B#cc#"16"
+ : MxBcc<cc, MxBrTarget16, MxType16,
+ MxEncoding<MxBead4Bits<0x0>,
+ MxBead4Bits<0x0>, !cast<MxBead4Bits>("MxCC"#cc),
+ MxBead4Bits<0x6>, MxBead16Imm<0>>>;
+}
+
+foreach cc = [ "cc", "ls", "lt", "eq", "mi", "ne", "ge",
+ "cs", "pl", "gt", "hi", "vc", "le", "vs"] in {
+def : Pat<(MxBrCond bb:$target, !cast<PatLeaf>("MxCOND"#cc), CCR),
+ (!cast<Instruction>("B"#cc#"8") MxBrTarget8:$target)>;
+}
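+// For example, with cc = "ne" the pattern above selects MxBrCond on MxCONDne
+// into Bne8. Note that only the 8-bit form is selected here; presumably the
+// 16-bit form is only reached later (e.g. via relaxation) when the 8-bit
+// displacement does not fit; that part is not covered by this file.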
+
+/// -------------------------------------------------
+/// F E D C B A 9 8 | 7 6 5 4 3 2 1 0
+/// -------------------------------------------------
+/// 0 1 1 0 0 0 0 0 | 8-BIT DISPLACEMENT
+/// -------------------------------------------------
+/// 16-BIT DISPLACEMENT IF 8-BIT DISPLACEMENT = $00
+/// -------------------------------------------------
+/// 32-BIT DISPLACEMENT IF 8-BIT DISPLACEMENT = $FF
+/// -------------------------------------------------
+let isBranch = 1, isTerminator = 1, isBarrier=1 in
+class MxBra<Operand TARGET, MxType TYPE, MxEncoding ENC = MxEncEmpty>
+ : MxInst<(outs), (ins TARGET:$dst), "bra\t$dst", [], ENC>;
+
+def BRA8 : MxBra<MxBrTarget8, MxType8,
+ MxEncoding<MxBead8Disp<0>, MxBead4Bits<0x0>,
+ MxBead4Bits<0x6>>>;
+def BRA16 : MxBra<MxBrTarget16, MxType16,
+ MxEncoding<MxBead4Bits<0x0>, MxBead4Bits<0x0>,
+ MxBead4Bits<0x0>, MxBead4Bits<0x6>,
+ MxBead16Imm<0>>>;
+
+def : Pat<(br bb:$target), (BRA8 MxBrTarget8:$target)>;
+
+
+//===----------------------------------------------------------------------===//
+// Call
+//===----------------------------------------------------------------------===//
+
+// All calls clobber the non-callee saved registers. %SP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead. Uses for argument
+// registers are added manually.
+let Uses = [SP] in
+let isCall = 1 in
+///------------------------------+---------+---------
+/// F E D C B A 9 8 7 6 | 5 4 3 | 2 1 0
+///------------------------------+---------+---------
+/// 0 1 0 0 1 1 1 0 1 0 | MODE | REG
+///------------------------------+---------+---------
+class MxCall<MxOperand LOCOp, MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs), (ins LOCOp:$dst), "jsr\t$dst", [],
+ MxEncoding<EA.Reg, EA.DA, EA.Mode, MxBead2Bits<0b10>,
+ MxBead4Bits<0b1110>, MxBead4Bits<0b0100>,
+ EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>>;
+
+def CALLk : MxCall<MxPCI32, MxEncEAk, MxExtBrief_0>;
+def CALLq : MxCall<MxPCD32, MxEncEAq, MxExtI16_0>;
+def CALLb : MxCall<MxAL32, MxEncEAb, MxExtI32_0>;
+def CALLj : MxCall<MxARI32, MxEncEAj_0, MxExtEmpty>;
+
+multiclass CallPat<MxCall callOp, Predicate pred> {
+ let Predicates = [pred] in {
+ def : Pat<(MxCall (i32 tglobaladdr:$dst)), (callOp tglobaladdr:$dst)>;
+ def : Pat<(MxCall (i32 texternalsym:$dst)), (callOp texternalsym:$dst)>;
+ def : Pat<(MxCall (i32 imm:$dst)), (callOp imm:$dst)>;
+ }
+}
+
+defm : CallPat<CALLq, IsPIC>;
+defm : CallPat<CALLb, IsNotPIC>;
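+// For example, 'defm : CallPat<CALLq, IsPIC>' instantiates the three patterns
+// above with Predicates = [IsPIC], so under PIC direct calls to global
+// addresses, external symbols and constant addresses all select the
+// pc-relative CALLq form.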
+
+def : Pat<(MxCall iPTR:$dst), (CALLj MxARI32:$dst)>;
+
+//===----------------------------------------------------------------------===//
+// Tail Call
+//===----------------------------------------------------------------------===//
+
+let isCodeGenOnly = 1 in {
+let Uses = [SP] in {
+let isCall = 1, isTerminator = 1, isBarrier = 1 in {
+
+let isReturn = 1 in
+def TCRETURNq : MxPseudo<(outs), (ins MxPCD32:$dst, i32imm:$adj)>;
+def TAILJMPq : MxPseudo<(outs), (ins MxPCD32:$dst)>;
+
+// NOTE The 'j' suffix does not mean load-and-jump: the M68k jmp just computes
+// the EA and jumps, and it uses the memory form (An), hence the 'j' letter.
+let isReturn = 1 in
+def TCRETURNj : MxPseudo<(outs), (ins MxARI32_TC:$dst, i32imm:$adj)>;
+def TAILJMPj : MxPseudo<(outs), (ins MxARI32_TC:$dst)>;
+} // isCall = 1, isTerminator = 1, isBarrier = 1
+} // Uses = [SP]
+} // isCodeGenOnly = 1
+
+//===----------------------------------------------------------------------===//
+// Return
+//===----------------------------------------------------------------------===//
+
+// TODO Implement LINK/UNLK
+
+let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in {
+
+def RTS : MxInst<(outs), (ins), "rts", [], MxEncFixed<0x4E75>>;
+
+let isCodeGenOnly = 1 in
+def RET : MxPseudo<(outs), (ins i32imm:$adj, variable_ops),
+ [(MxRet timm:$adj)]>;
+} // isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1
+
+//===----------------------------------------------------------------------===//
+// SETCC_C Patterns
+//===----------------------------------------------------------------------===//
+
+// Use subx to materialize carry bit.
+let Uses = [CCR], Defs = [CCR], isPseudo = 1 in {
+// FIXME These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+def SETCS_C8d : MxPseudo<(outs MxDRD8:$dst), (ins),
+ [(set MxDRD8:$dst, (MxSetCC_C MxCONDcs, CCR))]>;
+def SETCS_C16d : MxPseudo<(outs MxDRD16:$dst), (ins),
+ [(set MxDRD16:$dst, (MxSetCC_C MxCONDcs, CCR))]>;
+def SETCS_C32d : MxPseudo<(outs MxXRD32:$dst), (ins),
+ [(set MxXRD32:$dst, (MxSetCC_C MxCONDcs, CCR))]>;
+} // Uses = [CCR], Defs = [CCR], isPseudo = 1
+
+
+def : Pat<(i16 (anyext (i8 (MxSetCC_C MxCONDcs, CCR)))), (SETCS_C16d)>;
+def : Pat<(i32 (anyext (i8 (MxSetCC_C MxCONDcs, CCR)))), (SETCS_C32d)>;
+
+def : Pat<(i16 (sext (i8 (MxSetCC_C MxCONDcs, CCR)))), (SETCS_C16d)>;
+def : Pat<(i32 (sext (i8 (MxSetCC_C MxCONDcs, CCR)))), (SETCS_C32d)>;
+
+// We canonicalize 'scs' to "(and (subx reg,reg), 1)" in the hope that the and
+// will be eliminated and that the subx can be extended up to a wider type. When
+// this happens, it is great. However, if we are left with an 8-bit subx and an
+// and, we might as well just match it as a setb.
+def : Pat<(and (i8 (MxSetCC_C MxCONDcs, CCR)), 1), (SETd8cs)>;
+
+// (add OP, SETB) -> (addx OP, (move 0))
+def : Pat<(add (and (i8 (MxSetCC_C MxCONDcs, CCR)), 1), MxDRD8:$op),
+ (ADDX8dd MxDRD8:$op, (MOV8di 0))>;
+def : Pat<(add (and (i32 (MxSetCC_C MxCONDcs, CCR)), 1), MxXRD32:$op),
+ (ADDX32dd MxDRD32:$op, (MOV32ri 0))>;
+
+// (sub OP, SETB) -> (subx OP, (move 0))
+def : Pat<(sub MxDRD8:$op, (and (i8 (MxSetCC_C MxCONDcs, CCR)), 1)),
+ (SUBX8dd MxDRD8:$op, (MOV8di 0))>;
+def : Pat<(sub MxXRD32:$op, (and (i32 (MxSetCC_C MxCONDcs, CCR)), 1)),
+ (SUBX32dd MxDRD32:$op, (MOV32ri 0))>;
+
+// (sub OP, SETCC_CARRY) -> (addx OP, (move 0))
+def : Pat<(sub MxDRD8:$op, (i8 (MxSetCC_C MxCONDcs, CCR))),
+ (ADDX8dd MxDRD8:$op, (MOV8di 0))>;
+def : Pat<(sub MxXRD32:$op, (i32 (MxSetCC_C MxCONDcs, CCR))),
+ (ADDX32dd MxDRD32:$op, (MOV32ri 0))>;
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kInstrData.td b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrData.td
new file mode 100644
index 0000000..40b9e4a
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrData.td
@@ -0,0 +1,712 @@
+//===-- M68kInstrData.td - M68k Data Movement Instructions -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes the Motorola 680x0 data movement instructions which are
+/// the basic means of transferring and storing addresses and data. Here is the
+/// current status of the file:
+///
+/// Machine:
+///
+/// EXG [ ] FMOVE [ ] FSMOVE [ ] FDMOVE [ ] FMOVEM [ ]
+/// LEA [~] PEA [ ] MOVE [~] MOVE16 [ ] MOVEA [ ]
+/// MOVEM [ ] MOVEP [ ] MOVEQ [ ] LINK [ ] UNLK [ ]
+///
+/// Pseudo:
+///
+/// MOVSX [x] MOVZX [x] MOVX [x]
+///
+/// Map:
+///
+/// [ ] - was not touched at all
+/// [!] - requires external stuff implemented
+/// [~] - in progress but usable
+/// [x] - done
+///
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MOVE
+//===----------------------------------------------------------------------===//
+
+/// -----------------------------------------------------
+/// F E | D C | B A 9 | 8 7 6 | 5 4 3 | 2 1 0
+/// -----------------------------------------------------
+/// | | DESTINATION | SOURCE
+/// 0 0 | SIZE | REG | MODE | MODE | REG
+/// -----------------------------------------------------
+///
+/// NOTE MOVE requires the EA X version for a direct register destination (0)
+class MxMoveEncoding<MxBead2Bits size,
+ MxEncEA srcEA, MxEncExt srcExt,
+ MxEncEA dstEA, MxEncExt dstExt>
+ : MxEncoding<srcEA.Reg, srcEA.DA, srcEA.Mode, dstEA.DA, dstEA.Mode, dstEA.Reg,
+ size, MxBead2Bits<0b00>,
+ srcExt.Imm, srcExt.B8, srcExt.Scale, srcExt.WL, srcExt.DAReg,
+ dstExt.Imm, dstExt.B8, dstExt.Scale, dstExt.WL, dstExt.DAReg>;
+
+/// MOVE has alternate size encoding
+class MxMoveSize<bits<2> value> : MxBead2Bits<value>;
+def MxMoveSize8 : MxMoveSize<0b01>;
+def MxMoveSize16 : MxMoveSize<0b11>;
+def MxMoveSize32 : MxMoveSize<0b10>;
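+/// For reference, this differs from the common size field defined in
+/// M68kInstrFormats.td (MxEncSize), which encodes byte/word/long as
+/// 0b00/0b01/0b10; MOVE uses 0b01/0b11/0b10 as defined above.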
+
+let Defs = [CCR] in
+class MxMove<string size, dag outs, dag ins, list<dag> pattern, MxEncoding enc>
+ : MxInst<outs, ins, "move."#size#"\t$src, $dst", pattern, enc>;
+
+class MxMove_RR<MxType DST, MxType SRC, MxMoveEncoding ENC>
+ : MxMove<DST.Prefix, (outs DST.ROp:$dst), (ins SRC.ROp:$src),
+ [(null_frag)], ENC>;
+
+let mayStore = 1 in {
+class MxMove_MR<MxOperand MEMOpd, ComplexPattern MEMPat, MxType REG,
+ MxMoveEncoding ENC>
+ : MxMove<REG.Prefix, (outs), (ins MEMOpd:$dst, REG.ROp:$src),
+ [(store REG.VT:$src, MEMPat:$dst)], ENC>;
+
+class MxMove_MI<MxOperand MEMOpd, ComplexPattern MEMPat, MxType TYPE,
+ MxMoveEncoding ENC>
+ : MxMove<TYPE.Prefix, (outs), (ins MEMOpd:$dst, TYPE.IOp:$src),
+ [(store TYPE.IPat:$src, MEMPat:$dst)], ENC>;
+} // let mayStore = 1
+
+class MxMove_RI<MxType DST, MxMoveEncoding ENC>
+ : MxMove<DST.Prefix, (outs DST.ROp:$dst), (ins DST.IOp:$src),
+ [(set DST.VT:$dst, DST.IPat:$src)], ENC>;
+
+
+let mayLoad = 1 in
+class MxMove_RM<MxType REG, MxOperand MEMOpd, ComplexPattern MEMPat,
+ MxBead2Bits SIZE,
+ MxEncEA SRCEA, MxEncExt SRCEXT,
+ MxEncEA DSTEA, MxEncExt DSTEXT>
+ : MxMove<REG.Prefix, (outs REG.ROp:$dst), (ins MEMOpd:$src),
+ [(set REG.VT:$dst, (REG.Load MEMPat:$src))],
+ MxMoveEncoding<SIZE, SRCEA, SRCEXT, DSTEA, DSTEXT>>;
+
+multiclass MMxMove_RM<MxType REG, MxMoveSize SIZE, MxEncEA EA_0> {
+
+ // REG <- (An)+
+ def NAME#REG.OOp.Letter#REG.Postfix : MxMove_RM<REG, REG.OOp, REG.OPat,
+ SIZE, MxEncEAo_1, MxExtEmpty, EA_0, MxExtEmpty>;
+
+ // REG <- -(An)
+ def NAME#REG.EOp.Letter#REG.Postfix : MxMove_RM<REG, REG.EOp, REG.EPat,
+ SIZE, MxEncEAe_1, MxExtEmpty, EA_0, MxExtEmpty>;
+
+ // REG <- (i,PC,Xn)
+ def NAME#REG.KOp.Letter#REG.Postfix : MxMove_RM<REG, REG.KOp, REG.KPat,
+ SIZE, MxEncEAk, MxExtBrief_1, EA_0, MxExtEmpty>;
+
+ // REG <- (i,PC)
+ def NAME#REG.QOp.Letter#REG.Postfix : MxMove_RM<REG, REG.QOp, REG.QPat,
+ SIZE, MxEncEAq, MxExtI16_1, EA_0, MxExtEmpty>;
+
+ // REG <- (i,An,Xn)
+ def NAME#REG.FOp.Letter#REG.Postfix : MxMove_RM<REG, REG.FOp, REG.FPat,
+ SIZE, MxEncEAf_1, MxExtBrief_1, EA_0, MxExtEmpty>;
+
+ // REG <- (i,An)
+ def NAME#REG.POp.Letter#REG.Postfix : MxMove_RM<REG, REG.POp, REG.PPat,
+ SIZE, MxEncEAp_1, MxExtI16_1, EA_0, MxExtEmpty>;
+
+ // REG <- (ABS)
+ def NAME#REG.BOp.Letter#REG.Postfix : MxMove_RM<REG, REG.BOp, REG.BPat,
+ SIZE, MxEncEAb, MxExtI32_1, EA_0, MxExtEmpty>;
+
+ // REG <- (An)
+ def NAME#REG.JOp.Letter#REG.Postfix : MxMove_RM<REG, REG.JOp, REG.JPat,
+ SIZE, MxEncEAj_1, MxExtEmpty, EA_0, MxExtEmpty>;
+}
+
+let mayLoad = 1, mayStore = 1 in {
+class MxMove_MM<string SIZE, PatFrag LOAD,
+ MxOperand DSTOpd, ComplexPattern DSTPat,
+ MxOperand SRCOpd, ComplexPattern SRCPat,
+ MxBead2Bits ESIZE,
+ MxEncEA SRCEA, MxEncExt SRCEXT,
+ MxEncEA DSTEA, MxEncExt DSTEXT>
+ : MxMove<SIZE, (outs), (ins DSTOpd:$dst, SRCOpd:$src),
+ [(store (LOAD SRCPat:$src), DSTPat:$dst)],
+ MxMoveEncoding<ESIZE, SRCEA, SRCEXT, DSTEA, DSTEXT>>;
+} // let mayLoad = 1, mayStore = 1
+
+multiclass MMxMove_MM<MxType TYPE, MxOperand DSTOpd, ComplexPattern DSTPat,
+ MxMoveSize SIZE, MxEncEA EA_0, MxEncExt EXT_0> {
+
+ // MEM <- (An)+
+ def NAME#TYPE.OOp.Letter#TYPE.Postfix
+ : MxMove_MM<TYPE.Prefix, TYPE.Load, DSTOpd, DSTPat, TYPE.OOp, TYPE.OPat,
+ SIZE, MxEncEAo_1, MxExtEmpty, EA_0, EXT_0>;
+
+ // MEM <- -(An)
+ def NAME#TYPE.EOp.Letter#TYPE.Postfix
+ : MxMove_MM<TYPE.Prefix, TYPE.Load, DSTOpd, DSTPat, TYPE.EOp, TYPE.EPat,
+ SIZE, MxEncEAe_1, MxExtEmpty, EA_0, EXT_0>;
+
+ // MEM <- (i,An)
+ def NAME#TYPE.POp.Letter#TYPE.Postfix
+ : MxMove_MM<TYPE.Prefix, TYPE.Load, DSTOpd, DSTPat, TYPE.POp, TYPE.PPat,
+ SIZE, MxEncEAp_1, MxExtI16_1, EA_0, EXT_0>;
+
+ // MEM <- (i,An,Xn)
+ def NAME#TYPE.FOp.Letter#TYPE.Postfix
+ : MxMove_MM<TYPE.Prefix, TYPE.Load, DSTOpd, DSTPat, TYPE.FOp, TYPE.FPat,
+ SIZE, MxEncEAf_1, MxExtBrief_1, EA_0, EXT_0>;
+
+ // MEM <- (i,PC,Xn)
+ def NAME#TYPE.KOp.Letter#TYPE.Postfix
+ : MxMove_MM<TYPE.Prefix, TYPE.Load, DSTOpd, DSTPat, TYPE.KOp, TYPE.KPat,
+ SIZE, MxEncEAk, MxExtBrief_1, EA_0, EXT_0>;
+
+ // MEM <- (i,PC)
+ def NAME#TYPE.QOp.Letter#TYPE.Postfix
+ : MxMove_MM<TYPE.Prefix, TYPE.Load, DSTOpd, DSTPat, TYPE.QOp, TYPE.QPat,
+ SIZE, MxEncEAq, MxExtI16_1, EA_0, EXT_0>;
+
+ // MEM <- (ABS)
+ def NAME#TYPE.BOp.Letter#TYPE.Postfix
+ : MxMove_MM<TYPE.Prefix, TYPE.Load, DSTOpd, DSTPat, TYPE.BOp, TYPE.BPat,
+ SIZE, MxEncEAb, MxExtI32_1, EA_0, EXT_0>;
+
+ // MEM <- (An)
+ def NAME#TYPE.JOp.Letter#TYPE.Postfix
+ : MxMove_MM<TYPE.Prefix, TYPE.Load, DSTOpd, DSTPat, TYPE.JOp, TYPE.JPat,
+ SIZE, MxEncEAj_1, MxExtEmpty, EA_0, EXT_0>;
+}
+
+def MOV8dd
+ : MxMove_RR<MxType8d, MxType8d,
+ MxMoveEncoding<MxMoveSize8, MxEncEAd_1, MxExtEmpty, MxEncEAd_0, MxExtEmpty>>;
+
+// M <- R
+def MOV8fd : MxMove_MR<MxType8.FOp, MxType8.FPat, MxType8d,
+ MxMoveEncoding<MxMoveSize8,
+ /*src*/ MxEncEAd_1, MxExtEmpty,
+ /*dst*/ MxEncEAf_0, MxExtBrief_0>>;
+
+def MOV8pd : MxMove_MR<MxType8.POp, MxType8.PPat, MxType8d,
+ MxMoveEncoding<MxMoveSize8,
+ /*src*/ MxEncEAd_1, MxExtEmpty,
+ /*dst*/ MxEncEAp_0, MxExtI16_0>>;
+
+def MOV8ed : MxMove_MR<MxType8.EOp, MxType8.EPat, MxType8d,
+ MxMoveEncoding<MxMoveSize8,
+ /*src*/ MxEncEAd_1, MxExtEmpty,
+ /*dst*/ MxEncEAe_0, MxExtEmpty>>;
+
+def MOV8od : MxMove_MR<MxType8.OOp, MxType8.OPat, MxType8d,
+ MxMoveEncoding<MxMoveSize8,
+ /*src*/ MxEncEAd_1, MxExtEmpty,
+ /*dst*/ MxEncEAo_0, MxExtEmpty>>;
+
+def MOV8bd : MxMove_MR<MxType8.BOp, MxType8.BPat, MxType8d,
+ MxMoveEncoding<MxMoveSize8,
+ /*src*/ MxEncEAd_1, MxExtEmpty,
+ /*dst*/ MxEncEAb, MxExtI32_0>>;
+
+def MOV8jd : MxMove_MR<MxType8.JOp, MxType8.JPat, MxType8d,
+ MxMoveEncoding<MxMoveSize8,
+ /*src*/ MxEncEAd_1, MxExtEmpty,
+ /*dst*/ MxEncEAj_0, MxExtEmpty>>;
+
+
+// R <- I
+def MOV8di : MxMove_RI<MxType8d,
+ MxMoveEncoding<MxMoveSize8, MxEncEAi, MxExtI8_1, MxEncEAd_0, MxExtEmpty>>;
+
+foreach S = [16, 32] in {
+ foreach D = [ "r", "a" ] in {
+
+ foreach O = [ "r", "a" ] in {
+ def MOV#S#D#O : MxMove_RR<
+ !cast<MxType>("MxType"#S#D),
+ !cast<MxType>("MxType"#S#O),
+ MxMoveEncoding<!cast<MxMoveSize>("MxMoveSize"#S),
+ !cast<MxEncEA>("MxEncEA"#D#"_1"), MxExtEmpty,
+ !cast<MxEncEA>("MxEncEA"#D#"_0_reflected"), MxExtEmpty>>;
+ }
+
+ // M <- R
+ def MOV#S#"f"#D : MxMove_MR<
+ !cast<MxType>("MxType"#S).FOp,
+ !cast<MxType>("MxType"#S).FPat,
+ !cast<MxType>("MxType"#S#D),
+ MxMoveEncoding<!cast<MxMoveSize>("MxMoveSize"#S),
+ !cast<MxEncEA>("MxEncEA"#D#"_1"), MxExtEmpty,
+ MxEncEAf_0, MxExtBrief_0>>;
+
+ def MOV#S#"p"#D : MxMove_MR<
+ !cast<MxType>("MxType"#S).POp,
+ !cast<MxType>("MxType"#S).PPat,
+ !cast<MxType>("MxType"#S#D),
+ MxMoveEncoding<!cast<MxMoveSize>("MxMoveSize"#S),
+ !cast<MxEncEA>("MxEncEA"#D#"_1"), MxExtEmpty,
+ MxEncEAp_0, MxExtI16_0>>;
+
+ def MOV#S#"e"#D : MxMove_MR<
+ !cast<MxType>("MxType"#S).EOp,
+ !cast<MxType>("MxType"#S).EPat,
+ !cast<MxType>("MxType"#S#D),
+ MxMoveEncoding<!cast<MxMoveSize>("MxMoveSize"#S),
+ !cast<MxEncEA>("MxEncEA"#D#"_1"), MxExtEmpty,
+ MxEncEAe_0, MxExtEmpty>>;
+
+ def MOV#S#"o"#D : MxMove_MR<
+ !cast<MxType>("MxType"#S).OOp,
+ !cast<MxType>("MxType"#S).OPat,
+ !cast<MxType>("MxType"#S#D),
+ MxMoveEncoding<!cast<MxMoveSize>("MxMoveSize"#S),
+ !cast<MxEncEA>("MxEncEA"#D#"_1"), MxExtEmpty,
+ MxEncEAo_0, MxExtEmpty>>;
+
+ def MOV#S#"b"#D : MxMove_MR<
+ !cast<MxType>("MxType"#S).BOp,
+ !cast<MxType>("MxType"#S).BPat,
+ !cast<MxType>("MxType"#S#D),
+ MxMoveEncoding<!cast<MxMoveSize>("MxMoveSize"#S),
+ !cast<MxEncEA>("MxEncEA"#D#"_1"), MxExtEmpty,
+ MxEncEAb, MxExtI32_0>>;
+
+ def MOV#S#"j"#D : MxMove_MR<
+ !cast<MxType>("MxType"#S).JOp,
+ !cast<MxType>("MxType"#S).JPat,
+ !cast<MxType>("MxType"#S#D),
+ MxMoveEncoding<!cast<MxMoveSize>("MxMoveSize"#S),
+ !cast<MxEncEA>("MxEncEA"#D#"_1"), MxExtEmpty,
+ MxEncEAj_0, MxExtEmpty>>;
+
+
+ // R <- I
+ def MOV#S#D#"i" : MxMove_RI<
+ !cast<MxType>("MxType"#S#D),
+ MxMoveEncoding<!cast<MxMoveSize>("MxMoveSize"#S),
+ MxEncEAi, !cast<MxEncExt>("MxExtI"#S#"_1"),
+ !cast<MxEncEA>("MxEncEA"#D#"_0_reflected"), MxExtEmpty>>;
+ }
+}
+
+// M <- I
+foreach S = [8, 16, 32] in {
+ def MOV#S#"f"#"i" : MxMove_MI<
+ !cast<MxType>("MxType"#S).FOp,
+ !cast<MxType>("MxType"#S).FPat,
+ !cast<MxType>("MxType"#S),
+ MxMoveEncoding<!cast<MxMoveSize>("MxMoveSize"#S),
+ MxEncEAi, !cast<MxEncExt>("MxExtI"#S#"_1"),
+ MxEncEAf_0, MxExtBrief_0>>;
+
+ def MOV#S#"p"#"i" : MxMove_MI<
+ !cast<MxType>("MxType"#S).POp,
+ !cast<MxType>("MxType"#S).PPat,
+ !cast<MxType>("MxType"#S),
+ MxMoveEncoding<!cast<MxMoveSize>("MxMoveSize"#S),
+ MxEncEAi, !cast<MxEncExt>("MxExtI"#S#"_1"),
+ MxEncEAp_0, MxExtI16_0>>;
+
+ def MOV#S#"b"#"i" : MxMove_MI<
+ !cast<MxType>("MxType"#S).BOp,
+ !cast<MxType>("MxType"#S).BPat,
+ !cast<MxType>("MxType"#S),
+ MxMoveEncoding<!cast<MxMoveSize>("MxMoveSize"#S),
+ MxEncEAi, !cast<MxEncExt>("MxExtI"#S#"_1"),
+ MxEncEAb, MxExtI32_0>>;
+
+ def MOV#S#"j"#"i" : MxMove_MI<
+ !cast<MxType>("MxType"#S).JOp,
+ !cast<MxType>("MxType"#S).JPat,
+ !cast<MxType>("MxType"#S),
+ MxMoveEncoding<!cast<MxMoveSize>("MxMoveSize"#S),
+ MxEncEAi, !cast<MxEncExt>("MxExtI"#S#"_1"),
+ MxEncEAj_0, MxExtEmpty>>;
+}
+
+// Store ABS (basically a pointer) as an immediate to memory
+def : Pat<(store MxType32.BPat :$src, MxType32.PPat :$dst),
+ (MOV32pi MxType32.POp :$dst, MxType32.IOp :$src)>;
+
+def : Pat<(store MxType32.BPat :$src, MxType32.FPat :$dst),
+ (MOV32fi MxType32.FOp :$dst, MxType32.IOp :$src)>;
+
+def : Pat<(store MxType32.BPat :$src, MxType32.BPat :$dst),
+ (MOV32bi MxType32.BOp :$dst, MxType32.IOp :$src)>;
+
+def : Pat<(store MxType32.BPat :$src, MxType32.JPat :$dst),
+ (MOV32ji MxType32.JOp :$dst, MxType32.IOp :$src)>;
+
+// R <- M
+defm MOV8d : MMxMove_RM<MxType8d, MxMoveSize8, MxEncEAd_0>;
+
+defm MOV16r : MMxMove_RM<MxType16r, MxMoveSize16, MxEncEAr_0_reflected>;
+defm MOV16a : MMxMove_RM<MxType16a, MxMoveSize16, MxEncEAa_0>;
+
+defm MOV32r : MMxMove_RM<MxType32r, MxMoveSize32, MxEncEAr_0_reflected>;
+defm MOV32a : MMxMove_RM<MxType32a, MxMoveSize32, MxEncEAa_0>;
+
+let Pattern = [(null_frag)] in {
+defm MOV16r : MMxMove_RM<MxType16r_TC, MxMoveSize16, MxEncEAr_0_reflected>;
+defm MOV16a : MMxMove_RM<MxType16a_TC, MxMoveSize16, MxEncEAa_0>;
+
+defm MOV32r : MMxMove_RM<MxType32r_TC, MxMoveSize32, MxEncEAr_0_reflected>;
+defm MOV32a : MMxMove_RM<MxType32a_TC, MxMoveSize32, MxEncEAa_0>;
+} // Pattern
+
+// M <- M
+defm MOV8p : MMxMove_MM<MxType8, MxType8.POp, MxType8.PPat,
+ MxMoveSize8, MxEncEAp_0, MxExtI16_0>;
+defm MOV16p : MMxMove_MM<MxType16, MxType16.POp, MxType16.PPat,
+ MxMoveSize16, MxEncEAp_0, MxExtI16_0>;
+defm MOV32p : MMxMove_MM<MxType32, MxType32.POp, MxType32.PPat,
+ MxMoveSize32, MxEncEAp_0, MxExtI16_0>;
+
+defm MOV8f : MMxMove_MM<MxType8, MxType8.FOp, MxType8.FPat,
+ MxMoveSize8, MxEncEAf_0, MxExtBrief_0>;
+defm MOV16f : MMxMove_MM<MxType16, MxType16.FOp, MxType16.FPat,
+ MxMoveSize16, MxEncEAf_0, MxExtBrief_0>;
+defm MOV32f : MMxMove_MM<MxType32, MxType32.FOp, MxType32.FPat,
+ MxMoveSize32, MxEncEAf_0, MxExtBrief_0>;
+
+defm MOV8b : MMxMove_MM<MxType8, MxType8.BOp, MxType8.BPat,
+ MxMoveSize8, MxEncEAb, MxExtI32_0>;
+defm MOV16b : MMxMove_MM<MxType16, MxType16.BOp, MxType16.BPat,
+ MxMoveSize16, MxEncEAb, MxExtI32_0>;
+defm MOV32b : MMxMove_MM<MxType32, MxType32.BOp, MxType32.BPat,
+ MxMoveSize32, MxEncEAb, MxExtI32_0>;
+
+defm MOV8e : MMxMove_MM<MxType8, MxType8.EOp, MxType8.EPat,
+ MxMoveSize8, MxEncEAe_0, MxExtEmpty>;
+defm MOV16e : MMxMove_MM<MxType16, MxType16.EOp, MxType16.EPat,
+ MxMoveSize16, MxEncEAe_0, MxExtEmpty>;
+defm MOV32e : MMxMove_MM<MxType32, MxType32.EOp, MxType32.EPat,
+ MxMoveSize32, MxEncEAe_0, MxExtEmpty>;
+
+defm MOV8o : MMxMove_MM<MxType8, MxType8.OOp, MxType8.OPat,
+ MxMoveSize8, MxEncEAo_0, MxExtEmpty>;
+defm MOV16o : MMxMove_MM<MxType16, MxType16.OOp, MxType16.OPat,
+ MxMoveSize16, MxEncEAo_0, MxExtEmpty>;
+defm MOV32o : MMxMove_MM<MxType32, MxType32.OOp, MxType32.OPat,
+ MxMoveSize32, MxEncEAo_0, MxExtEmpty>;
+
+defm MOV8j : MMxMove_MM<MxType8, MxType8.JOp, MxType8.JPat,
+ MxMoveSize8, MxEncEAj_0, MxExtEmpty>;
+defm MOV16j : MMxMove_MM<MxType16, MxType16.JOp, MxType16.JPat,
+ MxMoveSize16, MxEncEAj_0, MxExtEmpty>;
+defm MOV32j : MMxMove_MM<MxType32, MxType32.JOp, MxType32.JPat,
+ MxMoveSize32, MxEncEAj_0, MxExtEmpty>;
+
+//===----------------------------------------------------------------------===//
+// MOVEM
+//
+// The mask is already pre-processed by the save/restore spill hook
+//===----------------------------------------------------------------------===//
+
+// Direction
+def MxMOVEM_MR : MxBead1Bit<0>;
+def MxMOVEM_RM : MxBead1Bit<1>;
+
+// Size
+def MxMOVEM_W : MxBead1Bit<0>;
+def MxMOVEM_L : MxBead1Bit<1>;
+
+/// ---------------+-------------+-------------+---------
+/// F E D C B | A | 9 8 7 | 6 | 5 4 3 | 2 1 0
+/// ---------------+---+---------+---+---------+---------
+/// 0 1 0 0 1 | D | 0 0 1 | S | MODE | REG
+/// ---------------+---+---------+---+---------+---------
+/// REGISTER LIST MASK
+/// -----------------------------------------------------
+/// D - direction (RM, MR)
+/// S - size (W, L)
+class MxMOVEMEncoding<MxEncEA EA, MxEncExt EXT, MxBead1Bit SIZE, MxBead1Bit DIR,
+ MxBead16Imm IMM>
+ : MxEncoding<EA.Reg, EA.DA, EA.Mode, SIZE, MxBead3Bits<0b001>, DIR,
+ MxBead1Bit<1>, MxBead4Bits<0b0100>, IMM,
+ EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>;
+
+let mayStore = 1 in
+class MxMOVEM_MR<MxType TYPE, MxBead1Bit SIZE,
+ MxOperand MEMOp, MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs), (ins MEMOp:$dst, MxMoveMask:$mask),
+ "movem."#TYPE.Prefix#"\t$mask, $dst", [],
+ MxMOVEMEncoding<EA, EXT, SIZE, MxMOVEM_MR, MxBead16Imm<1>>>;
+
+let mayLoad = 1 in
+class MxMOVEM_RM<MxType TYPE, MxBead1Bit SIZE,
+ MxOperand MEMOp, MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs), (ins MxMoveMask:$mask, MEMOp:$src),
+ "movem."#TYPE.Prefix#"\t$src, $mask", [],
+ MxMOVEMEncoding<EA, EXT, SIZE, MxMOVEM_RM, MxBead16Imm<0>>>;
+
+def MOVM32jm : MxMOVEM_MR<MxType32, MxMOVEM_L, MxType32.JOp, MxEncEAj_0, MxExtEmpty>;
+def MOVM32pm : MxMOVEM_MR<MxType32, MxMOVEM_L, MxType32.POp, MxEncEAp_0, MxExtI16_0>;
+
+def MOVM32mj : MxMOVEM_RM<MxType32, MxMOVEM_L, MxType32.JOp, MxEncEAj_1, MxExtEmpty>;
+def MOVM32mp : MxMOVEM_RM<MxType32, MxMOVEM_L, MxType32.POp, MxEncEAp_1, MxExtI16_1>;
+
+// Pseudo versions. These are required by virtual register spill/restore since
+// the mask requires a real register to encode. These instructions will be
+// expanded into real MOVEMs after RA finishes.
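+// A sketch of the intended expansion (the expansion pass itself is not part of
+// this file): spilling a 32-bit virtual register through MOVM32pm_P would,
+// once that vreg is assigned a physical register such as %d3 (hypothetical),
+// become a real MOVM32pm whose mask word selects only D3.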
+let mayStore = 1 in
+class MxMOVEM_MR_Pseudo<MxType TYPE, MxOperand MEMOp>
+ : MxPseudo<(outs), (ins MEMOp:$dst, TYPE.ROp:$reg)>;
+let mayLoad = 1 in
+class MxMOVEM_RM_Pseudo<MxType TYPE, MxOperand MEMOp>
+ : MxPseudo<(outs TYPE.ROp:$dst), (ins MEMOp:$src)>;
+
+// Mem <- Reg
+def MOVM8jm_P : MxMOVEM_MR_Pseudo<MxType8d, MxType8.JOp>;
+def MOVM16jm_P : MxMOVEM_MR_Pseudo<MxType16r, MxType16.JOp>;
+def MOVM32jm_P : MxMOVEM_MR_Pseudo<MxType32r, MxType32.JOp>;
+
+def MOVM8pm_P : MxMOVEM_MR_Pseudo<MxType8d, MxType8.POp>;
+def MOVM16pm_P : MxMOVEM_MR_Pseudo<MxType16r, MxType16.POp>;
+def MOVM32pm_P : MxMOVEM_MR_Pseudo<MxType32r, MxType32.POp>;
+
+// Reg <- Mem
+def MOVM8mj_P : MxMOVEM_RM_Pseudo<MxType8d, MxType8.JOp>;
+def MOVM16mj_P : MxMOVEM_RM_Pseudo<MxType16r, MxType16.JOp>;
+def MOVM32mj_P : MxMOVEM_RM_Pseudo<MxType32r, MxType32.JOp>;
+
+def MOVM8mp_P : MxMOVEM_RM_Pseudo<MxType8d, MxType8.POp>;
+def MOVM16mp_P : MxMOVEM_RM_Pseudo<MxType16r, MxType16.POp>;
+def MOVM32mp_P : MxMOVEM_RM_Pseudo<MxType32r, MxType32.POp>;
+
+
+//===----------------------------------------------------------------------===//
+// MOVE to/from SR/CCR
+//
+// Special care must be taken when working with moves to/from CCR: CCR is
+// basically the word-size SR register truncated for user mode, thus it only
+// supports word-size instructions. Moreover, the original M68000 does not
+// support moves from CCR at all. So in order to use CCR effectively one MUST
+// use the proper byte-size pseudo instructions, which are resolved sometime
+// after the RA pass.
+//===----------------------------------------------------------------------===//
+
+/// --------------------------------------------------
+/// F E D C B A 9 8 7 6 | 5 4 3 | 2 1 0
+/// --------------------------------------------------
+/// | EFFECTIVE ADDRESS
+/// 0 1 0 0 0 1 0 0 1 1 | MODE | REG
+/// --------------------------------------------------
+let Defs = [CCR] in
+class MxMoveToCCR<dag INS, MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs CCRC:$dst), INS, "move.w\t$src, $dst", [],
+ MxEncoding<EA.Reg, EA.DA, EA.Mode,
+ MxBead4Bits<0b0011>, MxBead4Bits<0b0001>, MxBead2Bits<0b01>,
+ EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>>;
+
+class MxMoveToCCRPseudo<dag INS> : MxPseudo<(outs CCRC:$dst), INS>;
+
+let mayLoad = 1 in {
+def MOV16cp : MxMoveToCCR<(ins MxType16d.POp:$src), MxEncEAp_1, MxExtI16_1>;
+def MOV8cp : MxMoveToCCRPseudo<(ins MxType8d.POp:$src)>;
+} // let mayLoad = 1
+
+def MOV16cd : MxMoveToCCR<(ins MxType16d.ROp:$src), MxEncEAd_1, MxExtEmpty>;
+def MOV8cd : MxMoveToCCRPseudo<(ins MxType8d.ROp:$src)>;
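+// A sketch of how the byte-size pseudos above are presumably resolved after
+// RA (the expansion pass is not part of this file): MOV8cd on a data register
+// would simply be rewritten to its word-size counterpart MOV16cd, since only
+// word-size moves to CCR exist in hardware.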
+
+/// Move from CCR
+/// --------------------------------------------------
+/// F E D C B A 9 8 7 6 | 5 4 3 | 2 1 0
+/// --------------------------------------------------
+/// | EFFECTIVE ADDRESS
+/// 0 1 0 0 0 0 1 0 1 1 | MODE | REG
+/// --------------------------------------------------
+let Uses = [CCR] in
+class MxMoveFromCCR<dag OUTS, dag INS, MxEncEA EA, MxEncExt EXT>
+ : MxInst<OUTS, INS, "move.w\t$src, $dst", [],
+ MxEncoding<EA.Reg, EA.DA, EA.Mode,
+ MxBead4Bits<0b1011>, MxBead4Bits<0b0000>, MxBead2Bits<0b01>,
+ EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>>,
+ Requires<[ IsM68010 ]>;
+
+class MxMoveFromCCRPseudo<dag INS> : MxPseudo<(outs), INS>;
+
+let mayStore = 1 in {
+def MOV16pc
+ : MxMoveFromCCR<(outs), (ins MxType16d.POp:$dst, CCRC:$src), MxEncEAp_0, MxExtI16_0>;
+def MOV8pc : MxMoveFromCCRPseudo<(ins MxType8d.POp:$dst, CCRC:$src)>;
+} // let mayStore = 1
+
+def MOV16dc
+ : MxMoveFromCCR<(outs MxType16d.ROp:$dst), (ins CCRC:$src), MxEncEAd_0, MxExtEmpty>;
+
+def MOV8dc : MxMoveFromCCRPseudo<(ins MxType8d.ROp:$dst, CCRC:$src)>;
+
+
+//===----------------------------------------------------------------------===//
+// LEA
+//===----------------------------------------------------------------------===//
+
+/// ----------------------------------------------------
+/// F E D C | B A 9 | 8 7 6 | 5 4 3 | 2 1 0
+/// ----------------------------------------------------
+/// 0 1 0 0 | DST REG | 1 1 1 | MODE | REG
+/// ----------------------------------------------------
+class MxLEA<MxOperand SRCOpd, ComplexPattern SRCPat, MxEncEA EA, MxEncExt EXT>
+ : MxInst<(outs MxARD32:$dst), (ins SRCOpd:$src),
+ "lea\t$src, $dst", [(set i32:$dst, SRCPat:$src)],
+ MxEncoding<EA.Reg, EA.DA, EA.Mode,
+ MxBead3Bits<0b111>, MxBeadReg<0>, MxBead4Bits<0x4>,
+ EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>>;
+
+def LEA32p : MxLEA<MxARID32, MxCP_ARID, MxEncEAp_1, MxExtI16_1>;
+def LEA32f : MxLEA<MxARII32, MxCP_ARII, MxEncEAf_1, MxExtBrief_1>;
+def LEA32q : MxLEA<MxPCD32, MxCP_PCD, MxEncEAq, MxExtI16_1>;
+def LEA32b : MxLEA<MxAL32, MxCP_AL, MxEncEAb, MxExtI32_1>;
+
+
+//===----------------------------------------------------------------------===//
+// Pseudos
+//===----------------------------------------------------------------------===//
+
+/// Push/Pop to/from SP for simplicity
+let Uses = [SP], Defs = [SP], hasSideEffects = 0 in {
+
+// SP <- SP - <size>; (SP) <- Dn
+let mayStore = 1 in {
+def PUSH8d : MxPseudo<(outs), (ins DR8:$reg)>;
+def PUSH16d : MxPseudo<(outs), (ins DR16:$reg)>;
+def PUSH32r : MxPseudo<(outs), (ins XR32:$reg)>;
+} // let mayStore = 1
+
+// Dn <- (SP); SP <- SP + <size>
+let mayLoad = 1 in {
+def POP8d : MxPseudo<(outs DR8:$reg), (ins)>;
+def POP16d : MxPseudo<(outs DR16:$reg), (ins)>;
+def POP32r : MxPseudo<(outs XR32:$reg), (ins)>;
+} // let mayLoad = 1
+
+} // let Uses/Defs = [SP], hasSideEffects = 0
+
+
+let Defs = [CCR] in {
+class MxPseudoMove_RR<MxType DST, MxType SRC, list<dag> PAT = []>
+ : MxPseudo<(outs DST.ROp:$dst), (ins SRC.ROp:$src), PAT>;
+
+class MxPseudoMove_RM<MxType DST, MxOperand SRCOpd, list<dag> PAT = []>
+ : MxPseudo<(outs DST.ROp:$dst), (ins SRCOpd:$src), PAT>;
+}
+
+/// This group of pseudos is analogous to the real x86 extending moves, but
+/// since M68k does not have those we need to emulate them. These instructions
+/// will be expanded right after RA completes because we need to know precisely
+/// which registers are allocated for the operands: if they overlap we just
+/// extend the value in place; if the registers are completely different we
+/// need to move first.
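+/// A sketch of the idea with hypothetical register assignments (the actual
+/// expansion lives in the post-RA expansion pass, not in this file):
+/// MOVSXd32d8 with $dst = %d1 and $src = %d0 would become roughly
+///   move.b %d0, %d1
+///   ext.w  %d1
+///   ext.l  %d1
+/// whereas with $dst = $src = %d0 only the two ext instructions remain; the
+/// MOVZX variants clear the upper bits instead of sign-extending.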
+foreach EXT = ["S", "Z"] in {
+ let hasSideEffects = 0 in {
+
+ def MOV#EXT#Xd16d8 : MxPseudoMove_RR<MxType16d, MxType8d>;
+ def MOV#EXT#Xd32d8 : MxPseudoMove_RR<MxType32d, MxType8d>;
+ def MOV#EXT#Xd32d16 : MxPseudoMove_RR<MxType32r, MxType16r>;
+
+ let mayLoad = 1 in {
+
+ def MOV#EXT#Xd16j8 : MxPseudoMove_RM<MxType16d, MxType8.JOp>;
+ def MOV#EXT#Xd32j8 : MxPseudoMove_RM<MxType32d, MxType8.JOp>;
+ def MOV#EXT#Xd32j16 : MxPseudoMove_RM<MxType32d, MxType16.JOp>;
+
+ def MOV#EXT#Xd16p8 : MxPseudoMove_RM<MxType16d, MxType8.POp>;
+ def MOV#EXT#Xd32p8 : MxPseudoMove_RM<MxType32d, MxType8.POp>;
+ def MOV#EXT#Xd32p16 : MxPseudoMove_RM<MxType32d, MxType16.POp>;
+
+ def MOV#EXT#Xd16f8 : MxPseudoMove_RM<MxType16d, MxType8.FOp>;
+ def MOV#EXT#Xd32f8 : MxPseudoMove_RM<MxType32d, MxType8.FOp>;
+ def MOV#EXT#Xd32f16 : MxPseudoMove_RM<MxType32d, MxType16.FOp>;
+
+ }
+ }
+}
+
+/// This group of instructions is similar to the group above but DOES NOT do
+/// any value extension; they just load a smaller register into the lower part
+/// of another register if the operands' real registers are different, or do
+/// nothing if they are the same.
+def MOVXd16d8 : MxPseudoMove_RR<MxType16d, MxType8d>;
+def MOVXd32d8 : MxPseudoMove_RR<MxType32d, MxType8d>;
+def MOVXd32d16 : MxPseudoMove_RR<MxType32r, MxType16r>;
+
+//===----------------------------------------------------------------------===//
+// Extend/Truncate Patterns
+//===----------------------------------------------------------------------===//
+
+// i16 <- sext i8
+def: Pat<(i16 (sext i8:$src)),
+ (EXTRACT_SUBREG (MOVSXd32d8 MxDRD8:$src), MxSubRegIndex16Lo)>;
+def: Pat<(MxSExtLoadi16i8 MxCP_ARI:$src),
+ (EXTRACT_SUBREG (MOVSXd32j8 MxARI8:$src), MxSubRegIndex16Lo)>;
+def: Pat<(MxSExtLoadi16i8 MxCP_ARID:$src),
+ (EXTRACT_SUBREG (MOVSXd32p8 MxARID8:$src), MxSubRegIndex16Lo)>;
+def: Pat<(MxSExtLoadi16i8 MxCP_ARII:$src),
+ (EXTRACT_SUBREG (MOVSXd32f8 MxARII8:$src), MxSubRegIndex16Lo)>;
+
+// i32 <- sext i8
+def: Pat<(i32 (sext i8:$src)), (MOVSXd32d8 MxDRD8:$src)>;
+def: Pat<(MxSExtLoadi32i8 MxCP_ARI :$src), (MOVSXd32j8 MxARI8 :$src)>;
+def: Pat<(MxSExtLoadi32i8 MxCP_ARID:$src), (MOVSXd32p8 MxARID8:$src)>;
+def: Pat<(MxSExtLoadi32i8 MxCP_ARII:$src), (MOVSXd32f8 MxARII8:$src)>;
+
+// i32 <- sext i16
+def: Pat<(i32 (sext i16:$src)), (MOVSXd32d16 MxDRD16:$src)>;
+def: Pat<(MxSExtLoadi32i16 MxCP_ARI :$src), (MOVSXd32j16 MxARI16 :$src)>;
+def: Pat<(MxSExtLoadi32i16 MxCP_ARID:$src), (MOVSXd32p16 MxARID16:$src)>;
+def: Pat<(MxSExtLoadi32i16 MxCP_ARII:$src), (MOVSXd32f16 MxARII16:$src)>;
+
+// i16 <- zext i8
+def: Pat<(i16 (zext i8:$src)),
+ (EXTRACT_SUBREG (MOVZXd32d8 MxDRD8:$src), MxSubRegIndex16Lo)>;
+def: Pat<(MxZExtLoadi16i8 MxCP_ARI:$src),
+ (EXTRACT_SUBREG (MOVZXd32j8 MxARI8:$src), MxSubRegIndex16Lo)>;
+def: Pat<(MxZExtLoadi16i8 MxCP_ARID:$src),
+ (EXTRACT_SUBREG (MOVZXd32p8 MxARID8:$src), MxSubRegIndex16Lo)>;
+def: Pat<(MxZExtLoadi16i8 MxCP_ARII:$src),
+ (EXTRACT_SUBREG (MOVZXd32f8 MxARII8:$src), MxSubRegIndex16Lo)>;
+
+// i32 <- zext i8
+def: Pat<(i32 (zext i8:$src)), (MOVZXd32d8 MxDRD8:$src)>;
+def: Pat<(MxZExtLoadi32i8 MxCP_ARI :$src), (MOVZXd32j8 MxARI8 :$src)>;
+def: Pat<(MxZExtLoadi32i8 MxCP_ARID:$src), (MOVZXd32p8 MxARID8:$src)>;
+def: Pat<(MxZExtLoadi32i8 MxCP_ARII:$src), (MOVZXd32f8 MxARII8:$src)>;
+
+// i32 <- zext i16
+def: Pat<(i32 (zext i16:$src)), (MOVZXd32d16 MxDRD16:$src)>;
+def: Pat<(MxZExtLoadi32i16 MxCP_ARI :$src), (MOVZXd32j16 MxARI16 :$src)>;
+def: Pat<(MxZExtLoadi32i16 MxCP_ARID:$src), (MOVZXd32p16 MxARID16:$src)>;
+def: Pat<(MxZExtLoadi32i16 MxCP_ARII:$src), (MOVZXd32f16 MxARII16:$src)>;
+
+// i16 <- anyext i8
+def: Pat<(i16 (anyext i8:$src)),
+ (EXTRACT_SUBREG (MOVZXd32d8 MxDRD8:$src), MxSubRegIndex16Lo)>;
+def: Pat<(MxExtLoadi16i8 MxCP_ARI:$src),
+ (EXTRACT_SUBREG (MOVZXd32j8 MxARI8:$src), MxSubRegIndex16Lo)>;
+def: Pat<(MxExtLoadi16i8 MxCP_ARID:$src),
+ (EXTRACT_SUBREG (MOVZXd32p8 MxARID8:$src), MxSubRegIndex16Lo)>;
+def: Pat<(MxExtLoadi16i8 MxCP_ARII:$src),
+ (EXTRACT_SUBREG (MOVZXd32f8 MxARII8:$src), MxSubRegIndex16Lo)>;
+
+// i32 <- anyext i8
+def: Pat<(i32 (anyext i8:$src)), (MOVZXd32d8 MxDRD8:$src)>;
+def: Pat<(MxExtLoadi32i8 MxCP_ARI :$src), (MOVZXd32j8 MxARI8 :$src)>;
+def: Pat<(MxExtLoadi32i8 MxCP_ARID:$src), (MOVZXd32p8 MxARID8:$src)>;
+def: Pat<(MxExtLoadi32i8 MxCP_ARII:$src), (MOVZXd32f8 MxARII8:$src)>;
+
+// i32 <- anyext i16
+def: Pat<(i32 (anyext i16:$src)), (MOVZXd32d16 MxDRD16:$src)>;
+def: Pat<(MxExtLoadi32i16 MxCP_ARI :$src), (MOVZXd32j16 MxARI16 :$src)>;
+def: Pat<(MxExtLoadi32i16 MxCP_ARID:$src), (MOVZXd32p16 MxARID16:$src)>;
+def: Pat<(MxExtLoadi32i16 MxCP_ARII:$src), (MOVZXd32f16 MxARII16:$src)>;
+
+// trunc patterns
+def : Pat<(i16 (trunc i32:$src)),
+ (EXTRACT_SUBREG MxXRD32:$src, MxSubRegIndex16Lo)>;
+def : Pat<(i8 (trunc i32:$src)),
+ (EXTRACT_SUBREG MxXRD32:$src, MxSubRegIndex8Lo)>;
+def : Pat<(i8 (trunc i16:$src)),
+ (EXTRACT_SUBREG MxXRD16:$src, MxSubRegIndex8Lo)>;
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kInstrFormats.td b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrFormats.td
new file mode 100644
index 0000000..1d950bd
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrFormats.td
@@ -0,0 +1,371 @@
+//=== M68kInstrFormats.td - M68k Instruction Formats ---*- tablegen -*-===//
+// The LLVM Compiler Infrastructure
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains M68k instruction formats.
+///
+/// Since M68k has quite a lot of memory addressing modes there are more
+/// instruction prefixes than just i, r and m:
+/// TSF Since Form Letter Description
+/// 00 M68000 Dn or An r any register
+/// 01 M68000 Dn d data register direct
+/// 02 M68000 An a address register direct
+/// 03 M68000 (An) j address register indirect
+/// 04 M68000 (An)+ o address register indirect with postincrement
+/// 05 M68000 -(An) e address register indirect with predecrement
+/// 06 M68000 (i,An) p address register indirect with displacement
+/// 10 M68000 (i,An,Xn.L) f address register indirect with index and scale = 1
+/// 07 M68000 (i,An,Xn.W) F address register indirect with index and scale = 1
+/// 12 M68020 (i,An,Xn.L,SCALE) g address register indirect with index
+/// 11 M68020 (i,An,Xn.W,SCALE) G address register indirect with index
+/// 14 M68020 ([bd,An],Xn.L,SCALE,od) u memory indirect postindexed mode
+/// 13 M68020 ([bd,An],Xn.W,SCALE,od) U memory indirect postindexed mode
+/// 16 M68020 ([bd,An,Xn.L,SCALE],od) v memory indirect preindexed mode
+/// 15 M68020 ([bd,An,Xn.W,SCALE],od) V memory indirect preindexed mode
+/// 20 M68000 abs.L b absolute long address
+/// 17 M68000 abs.W B absolute short address
+/// 21 M68000 (i,PC) q program counter with displacement
+/// 23 M68000 (i,PC,Xn.L) k program counter with index and scale = 1
+/// 22 M68000 (i,PC,Xn.W) K program counter with index and scale = 1
+/// 25 M68020 (i,PC,Xn.L,SCALE) l program counter with index
+/// 24 M68020 (i,PC,Xn.W,SCALE) L program counter with index
+/// 27 M68020 ([bd,PC],Xn.L,SCALE,od) x program counter memory indirect postindexed mode
+/// 26 M68020 ([bd,PC],Xn.W,SCALE,od) X program counter memory indirect postindexed mode
+/// 31 M68020 ([bd,PC,Xn.L,SCALE],od) y program counter memory indirect preindexed mode
+/// 30 M68020 ([bd,PC,Xn.W,SCALE],od) Y program counter memory indirect preindexed mode
+/// 32 M68000 #immediate i immediate data
+///
+/// NOTE that the long form is always lowercase; the word variants are capitalized
+///
+/// Operand can be qualified with size where appropriate to force a particular
+/// instruction encoding, e.g.:
+/// (i8,An,Xn.W) f8 1 extension word
+/// (i16,An,Xn.W) f16 2 extension words
+/// (i32,An,Xn.W) f32 3 extension words
+///
+/// Form without size qualifier will adapt to operand size automatically, e.g.:
+/// (i,An,Xn.W) f 1, 2 or 3 extension words
+///
+/// Some forms already imply a particular size of their operands, e.g.:
+/// (i,An) p 1 extension word and i is 16bit
+///
+/// Operand order follows x86 Intel order (destination before source), e.g.:
+/// MOV8df MOVE (4,A0,D0), D1
+///
+/// The number after the instruction mnemonic determines the size of the data
+///
+//===----------------------------------------------------------------------===//
+
+/// ??? Is it possible to use this stuff for disassembling?
+/// NOTE 1: In case of conditional beads (DA, DAReg), the conditional part is
+/// able to consume any bit, though a more general instruction must be chosen,
+/// e.g. d -> r, a -> r
+
+//===----------------------------------------------------------------------===//
+// Encoding primitives
+//===----------------------------------------------------------------------===//
+
+class MxBead<bits<4> type, bit b4 = 0, bit b5 = 0, bit b6 = 0, bit b7 = 0> {
+ bits<8> Value = 0b00000000;
+ let Value{3-0} = type;
+ let Value{4} = b4;
+ let Value{5} = b5;
+ let Value{6} = b6;
+ let Value{7} = b7;
+}
+
+/// System beads; they allow controlling the beading flow
+def MxBeadTerm : MxBead<0x0, 0, 0, 0, 0>;
+def MxBeadIgnore : MxBead<0x0, 1, 0, 0, 0>;
+
+/// Add plain bit to the instruction
+class MxBead1Bit <bits<1> b> : MxBead<0x1, b>;
+class MxBead2Bits <bits<2> b> : MxBead<0x2, b{0}, b{1}>;
+class MxBead3Bits <bits<3> b> : MxBead<0x3, b{0}, b{1}, b{2}>;
+class MxBead4Bits <bits<4> b> : MxBead<0x4, b{0}, b{1}, b{2}, b{3}>;
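+/// A worked example of the layout above: MxBead4Bits<0b0101> stores the type
+/// 0x4 in Value{3-0} and the payload LSB-first in Value{7-4}, giving
+/// Value = 0b01010100 (0x54).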
+
+/// bits<3> o - operand number
+/// bit a - use alternative, used to select index register or
+/// outer displacement/immediate
+/// suffix NP means non-padded
+class MxBeadDAReg <bits<3> o, bit a = 0> : MxBead<0x5, o{0}, o{1}, o{2}, a>;
+class MxBeadDA <bits<3> o, bit a = 0> : MxBead<0x6, o{0}, o{1}, o{2}, a>;
+class MxBeadReg <bits<3> o, bit a = 0> : MxBead<0x7, o{0}, o{1}, o{2}, a>;
+class MxBeadDReg <bits<3> o, bit a = 0> : MxBead<0x8, o{0}, o{1}, o{2}, a>;
+class MxBead8Disp <bits<3> o, bit a = 0> : MxBead<0x9, o{0}, o{1}, o{2}, a>;
+
+/// Add Immediate to the instruction. 8-bit version is padded with zeros to fit
+/// the word.
+class MxBead8Imm <bits<3> o, bit a = 0> : MxBead<0xA, o{0}, o{1}, o{2}, a>;
+class MxBead16Imm <bits<3> o, bit a = 0> : MxBead<0xB, o{0}, o{1}, o{2}, a>;
+class MxBead32Imm <bits<3> o, bit a = 0> : MxBead<0xC, o{0}, o{1}, o{2}, a>;
+
+/// Encodes an immediate 0-7 (alt. 1-8) into a 3-bit field
+class MxBead3Imm <bits<3> o, bit a = 0> : MxBead<0xD, o{0}, o{1}, o{2}, a>;
+
+
+class MxEncoding<MxBead n0 = MxBeadTerm, MxBead n1 = MxBeadTerm,
+ MxBead n2 = MxBeadTerm, MxBead n3 = MxBeadTerm,
+ MxBead n4 = MxBeadTerm, MxBead n5 = MxBeadTerm,
+ MxBead n6 = MxBeadTerm, MxBead n7 = MxBeadTerm,
+ MxBead n8 = MxBeadTerm, MxBead n9 = MxBeadTerm,
+ MxBead n10 = MxBeadTerm, MxBead n11 = MxBeadTerm,
+ MxBead n12 = MxBeadTerm, MxBead n13 = MxBeadTerm,
+ MxBead n14 = MxBeadTerm, MxBead n15 = MxBeadTerm,
+ MxBead n16 = MxBeadTerm, MxBead n17 = MxBeadTerm,
+ MxBead n18 = MxBeadTerm, MxBead n19 = MxBeadTerm,
+ MxBead n20 = MxBeadTerm, MxBead n21 = MxBeadTerm,
+ MxBead n22 = MxBeadTerm, MxBead n23 = MxBeadTerm> {
+ bits <192> Value;
+ let Value{7-0} = n0.Value;
+ let Value{15-8} = n1.Value;
+ let Value{23-16} = n2.Value;
+ let Value{31-24} = n3.Value;
+ let Value{39-32} = n4.Value;
+ let Value{47-40} = n5.Value;
+ let Value{55-48} = n6.Value;
+ let Value{63-56} = n7.Value;
+ let Value{71-64} = n8.Value;
+ let Value{79-72} = n9.Value;
+ let Value{87-80} = n10.Value;
+ let Value{95-88} = n11.Value;
+ let Value{103-96} = n12.Value;
+ let Value{111-104} = n13.Value;
+ let Value{119-112} = n14.Value;
+ let Value{127-120} = n15.Value;
+ let Value{135-128} = n16.Value;
+ let Value{143-136} = n17.Value;
+ let Value{151-144} = n18.Value;
+ let Value{159-152} = n19.Value;
+ let Value{167-160} = n20.Value;
+ let Value{175-168} = n21.Value;
+ let Value{183-176} = n22.Value;
+ let Value{191-184} = n23.Value;
+}
+
+class MxEncFixed<bits<16> value> : MxEncoding {
+ let Value{7-0} = MxBead4Bits<value{3-0}>.Value;
+ let Value{15-8} = MxBead4Bits<value{7-4}>.Value;
+ let Value{23-16} = MxBead4Bits<value{11-8}>.Value;
+ let Value{31-24} = MxBead4Bits<value{15-12}>.Value;
+}
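+// For example, RTS (defined earlier in this patch) uses MxEncFixed<0x4E75>,
+// which by the layout above is emitted as the four 4-bit beads 0x5, 0x7, 0xE,
+// 0x4 (low nibble first).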
+
+//===----------------------------------------------------------------------===//
+// Encoding composites
+//
+// These must be lowered to MxEncoding by instr specific wrappers
+//
+// HERE BE DRAGONS...
+//===----------------------------------------------------------------------===//
+
+class MxEncByte<bits<8> value> : MxEncoding {
+ MxBead4Bits LO = MxBead4Bits<value{3-0}>;
+ MxBead4Bits HI = MxBead4Bits<value{7-4}>;
+}
+
+def MxEncEmpty : MxEncoding;
+
+
+/// M68k Standard Effective Address layout:
+///
+/// :-------------------:
+/// | 5 4 3 | 2 1 0 |
+/// | mode | reg |
+/// :-------------------:
+///
+/// If the EA is a direct register mode, bits 4 and 5 are 0, and the register
+/// number will be encoded in bits 0-3. Since the first address register's
+/// (A0) register number is 8, we can easily tell data registers from
+/// address registers by only inspecting bit 3 (i.e. if bit 3 is set, it's an
+/// address register).
+///
+///
+/// But MOVE instruction uses reversed layout for destination EA:
+///
+/// :-------------------:
+/// | 5 4 3 | 2 1 0 |
+/// | reg | mode |
+/// :-------------------:
+///
+/// And this complicates things a bit because the DA bit is now separated from
+/// the register and we have to encode those separately using MxBeadDA<opN>
+///
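+/// As a concrete example of the standard layout: data register direct D2 is
+/// mode 000, reg 010, and address register indirect (A3) is mode 010, reg 011;
+/// in a MOVE destination the same (A3) operand has the two fields swapped,
+/// i.e. reg 011 followed by mode 010.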
+class MxEncEA<MxBead reg, MxBead mode, MxBead da = MxBeadIgnore> {
+ MxBead Reg = reg;
+ MxBead Mode = mode;
+ MxBead DA = da;
+}
+
+// FIXME: Is there a way to factor out the addressing mode suffix (i.e.
+// 'r', 'd', 'a' etc.) and use something like a multiclass to replace these?
+def MxEncEAr_0: MxEncEA<MxBeadDAReg<0>, MxBead2Bits<0b00>>;
+def MxEncEAd_0: MxEncEA<MxBeadDReg<0>, MxBead2Bits<0b00>, MxBead1Bit<0>>;
+def MxEncEAa_0: MxEncEA<MxBeadReg<0>, MxBead2Bits<0b00>, MxBead1Bit<1>>;
+def MxEncEAj_0: MxEncEA<MxBeadReg<0>, MxBead2Bits<0b01>, MxBead1Bit<0>>;
+def MxEncEAo_0: MxEncEA<MxBeadReg<0>, MxBead2Bits<0b01>, MxBead1Bit<1>>;
+def MxEncEAe_0: MxEncEA<MxBeadReg<0>, MxBead2Bits<0b10>, MxBead1Bit<0>>;
+def MxEncEAp_0: MxEncEA<MxBeadReg<0>, MxBead2Bits<0b10>, MxBead1Bit<1>>;
+def MxEncEAf_0: MxEncEA<MxBeadReg<0>, MxBead2Bits<0b11>, MxBead1Bit<0>>;
+
+def MxEncEAa_0_reflected : MxEncEA<MxBeadReg<0>, MxBead3Bits<0b001>>;
+def MxEncEAr_0_reflected : MxEncEA<MxBeadReg<0>, MxBead2Bits<0b00>, MxBeadDA<0>>;
+
+def MxEncEAr_1: MxEncEA<MxBeadDAReg<1>, MxBead2Bits<0b00>>;
+def MxEncEAd_1: MxEncEA<MxBeadDReg<1>, MxBead2Bits<0b00>, MxBead1Bit<0>>;
+def MxEncEAa_1: MxEncEA<MxBeadReg<1>, MxBead2Bits<0b00>, MxBead1Bit<1>>;
+def MxEncEAj_1: MxEncEA<MxBeadReg<1>, MxBead2Bits<0b01>, MxBead1Bit<0>>;
+def MxEncEAo_1: MxEncEA<MxBeadReg<1>, MxBead2Bits<0b01>, MxBead1Bit<1>>;
+def MxEncEAe_1: MxEncEA<MxBeadReg<1>, MxBead2Bits<0b10>, MxBead1Bit<0>>;
+def MxEncEAp_1: MxEncEA<MxBeadReg<1>, MxBead2Bits<0b10>, MxBead1Bit<1>>;
+def MxEncEAf_1: MxEncEA<MxBeadReg<1>, MxBead2Bits<0b11>, MxBead1Bit<0>>;
+
+def MxEncEAr_2: MxEncEA<MxBeadDAReg<2>, MxBead2Bits<0b00>>;
+def MxEncEAd_2: MxEncEA<MxBeadDReg<2>, MxBead2Bits<0b00>, MxBead1Bit<0>>;
+def MxEncEAa_2: MxEncEA<MxBeadReg<2>, MxBead2Bits<0b00>, MxBead1Bit<1>>;
+def MxEncEAj_2: MxEncEA<MxBeadReg<2>, MxBead2Bits<0b01>, MxBead1Bit<0>>;
+def MxEncEAo_2: MxEncEA<MxBeadReg<2>, MxBead2Bits<0b01>, MxBead1Bit<1>>;
+def MxEncEAe_2: MxEncEA<MxBeadReg<2>, MxBead2Bits<0b10>, MxBead1Bit<0>>;
+def MxEncEAp_2: MxEncEA<MxBeadReg<2>, MxBead2Bits<0b10>, MxBead1Bit<1>>;
+def MxEncEAf_2: MxEncEA<MxBeadReg<2>, MxBead2Bits<0b11>, MxBead1Bit<0>>;
+
+def MxEncEAb : MxEncEA<MxBead3Bits<0b001>, MxBead2Bits<0b11>, MxBead1Bit<1>>;
+def MxEncEAq : MxEncEA<MxBead3Bits<0b010>, MxBead2Bits<0b11>, MxBead1Bit<1>>;
+def MxEncEAk : MxEncEA<MxBead3Bits<0b011>, MxBead2Bits<0b11>, MxBead1Bit<1>>;
+def MxEncEAi : MxEncEA<MxBead3Bits<0b100>, MxBead2Bits<0b11>, MxBead1Bit<1>>;
+
+// Allows you to specify each bit of opcode
+class MxEncOpMode<MxBead b0, MxBead b1 = MxBeadIgnore, MxBead b2 = MxBeadIgnore> {
+ MxBead B0 = b0;
+ MxBead B1 = b1;
+ MxBead B2 = b2;
+}
+
+// op EA, Dn
+def MxOpMode8dEA : MxEncOpMode<MxBead3Bits<0b000>>;
+def MxOpMode16dEA : MxEncOpMode<MxBead3Bits<0b001>>;
+def MxOpMode32dEA : MxEncOpMode<MxBead3Bits<0b010>>;
+
+// op EA, An
+def MxOpMode16aEA : MxEncOpMode<MxBead3Bits<0b110>>;
+def MxOpMode32aEA : MxEncOpMode<MxBead3Bits<0b111>>;
+
+// op EA, Rn
+// As you might have noticed, this one is special... Since M68k differentiates
+// between Data and Address registers we are required to use different OPMODE
+// codes for Address register DST operands. One way of dealing with this is to
+// use separate tablegen instructions, but that would force the Register
+// Allocator to use specific Register Classes and eventually lead to
+// superfluous moves. Another approach is to use a reg-variadic encoding which
+// changes OPMODE based on the Register Class used. Luckily, all the bits that
+// differ go from 0 to 1 and can be encoded with MxBeadDA.
+// Basically, if the register used is of Data type these encodings are the
+// same as MxOpMode{16,32}dEA above and are used with regular instructions
+// (e.g. ADD, SUB), but if the register is of Address type the appropriate
+// bits flip and the instructions become of the *A type (e.g. ADDA, SUBA).
+def MxOpMode16rEA : MxEncOpMode<MxBead1Bit<1>, MxBeadDA<0>, MxBead1Bit<0>>;
+def MxOpMode32rEA : MxEncOpMode<MxBeadDA<0>, MxBead1Bit<1>, MxBeadDA<0>>;
+
+// op Dn, EA
+def MxOpMode8EAd : MxEncOpMode<MxBead3Bits<0b100>>;
+def MxOpMode16EAd : MxEncOpMode<MxBead3Bits<0b101>>;
+def MxOpMode32EAd : MxEncOpMode<MxBead3Bits<0b110>>;
+
+
+// Represents two types of extension word:
+// - Imm extension word
+// - Brief extension word
+class MxEncExt<MxBead imm = MxBeadIgnore, MxBead b8 = MxBeadIgnore,
+ MxBead scale = MxBeadIgnore, MxBead wl = MxBeadIgnore,
+ MxBead daReg = MxBeadIgnore> {
+ MxBead Imm = imm;
+ MxBead B8 = b8;
+ MxBead Scale = scale;
+ MxBead WL = wl;
+ MxBead DAReg = daReg;
+}
+
+def MxExtEmpty : MxEncExt;
+
+// These handle the encoding of displacement fields, absolute addresses and
+// immediate values; the encoding for these categories is mainly the same,
+// with the exception of some weird immediates.
+def MxExtI8_0 : MxEncExt<MxBead8Imm<0>>;
+def MxExtI16_0 : MxEncExt<MxBead16Imm<0>>;
+def MxExtI32_0 : MxEncExt<MxBead32Imm<0>>;
+
+def MxExtI8_1 : MxEncExt<MxBead8Imm<1>>;
+def MxExtI16_1 : MxEncExt<MxBead16Imm<1>>;
+def MxExtI32_1 : MxEncExt<MxBead32Imm<1>>;
+
+def MxExtI8_2 : MxEncExt<MxBead8Imm<2>>;
+def MxExtI16_2 : MxEncExt<MxBead16Imm<2>>;
+def MxExtI32_2 : MxEncExt<MxBead32Imm<2>>;
+
+// NOTE They are all using Long Xn
+def MxExtBrief_0 : MxEncExt<MxBead8Disp<0>, MxBead1Bit<0b0>,
+ MxBead2Bits<0b00>, MxBead1Bit<1>,
+ MxBeadDAReg<0, 1>>;
+
+def MxExtBrief_1 : MxEncExt<MxBead8Disp<1>, MxBead1Bit<0b0>,
+ MxBead2Bits<0b00>, MxBead1Bit<1>,
+ MxBeadDAReg<1, 1>>;
+
+def MxExtBrief_2 : MxEncExt<MxBead8Disp<2>, MxBead1Bit<0b0>,
+ MxBead2Bits<0b00>, MxBead1Bit<1>,
+ MxBeadDAReg<2, 1>>;
+
+def MxExtBrief_3 : MxEncExt<MxBead8Disp<3>, MxBead1Bit<0b0>,
+ MxBead2Bits<0b00>, MxBead1Bit<1>,
+ MxBeadDAReg<3, 1>>;
+
+def MxExtBrief_4 : MxEncExt<MxBead8Disp<4>, MxBead1Bit<0b0>,
+ MxBead2Bits<0b00>, MxBead1Bit<1>,
+ MxBeadDAReg<4, 1>>;
+
+class MxEncSize<bits<2> value> : MxBead2Bits<value>;
+def MxEncSize8 : MxEncSize<0b00>;
+def MxEncSize16 : MxEncSize<0b01>;
+def MxEncSize32 : MxEncSize<0b10>;
+def MxEncSize64 : MxEncSize<0b11>;
+
+// M68k INSTRUCTION. Most instructions specify the location of an operand by
+// using the effective address field in the operation word. The effective address
+// is composed of two 3-bit fields: the mode field and the register field. The
+// value in the mode field selects the different address modes. The register
+// field contains the number of a register. The effective address field may
+// require additional information to fully specify the operand. This additional
+// information, called the effective address extension, is contained in the
+// following word or words and is considered part of the instruction. The
+// effective address modes are grouped into three categories: register direct,
+// memory addressing, and special.
+class MxInst<dag outs, dag ins,
+ string asmStr = "",
+ list<dag> pattern = [],
+ MxEncoding beads = MxEncEmpty,
+ InstrItinClass itin = NoItinerary>
+ : Instruction {
+ let Namespace = "M68k";
+ let OutOperandList = outs;
+ let InOperandList = ins;
+ let AsmString = asmStr;
+ let Pattern = pattern;
+ let Itinerary = itin;
+
+ // Byte stream
+ field bits<192> Beads = beads.Value;
+
+ // Number of bytes
+ let Size = 0;
+
+ let UseLogicalOperandMappings = 1;
+}
+
+// M68k PSEUDO INSTRUCTION
+class MxPseudo<dag outs, dag ins, list<dag> pattern = []>
+ : MxInst<outs, ins, "; error: this should not be emitted", pattern> {
+ let isPseudo = 1;
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrInfo.cpp
new file mode 100644
index 0000000..0eddd8c
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrInfo.cpp
@@ -0,0 +1,869 @@
+//===-- M68kInstrInfo.cpp - M68k Instruction Information ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the M68k implementation of the TargetInstrInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68kInstrInfo.h"
+
+#include "M68kInstrBuilder.h"
+#include "M68kMachineFunction.h"
+#include "M68kTargetMachine.h"
+#include "MCTargetDesc/M68kMCCodeEmitter.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include <functional>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "M68k-instr-info"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "M68kGenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void M68kInstrInfo::anchor() {}
+
+M68kInstrInfo::M68kInstrInfo(const M68kSubtarget &STI)
+ : M68kGenInstrInfo(M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0,
+ M68k::RET),
+ Subtarget(STI), RI(STI) {}
+
+static M68k::CondCode getCondFromBranchOpc(unsigned BrOpc) {
+ switch (BrOpc) {
+ default:
+ return M68k::COND_INVALID;
+ case M68k::Beq8:
+ return M68k::COND_EQ;
+ case M68k::Bne8:
+ return M68k::COND_NE;
+ case M68k::Blt8:
+ return M68k::COND_LT;
+ case M68k::Ble8:
+ return M68k::COND_LE;
+ case M68k::Bgt8:
+ return M68k::COND_GT;
+ case M68k::Bge8:
+ return M68k::COND_GE;
+ case M68k::Bcs8:
+ return M68k::COND_CS;
+ case M68k::Bls8:
+ return M68k::COND_LS;
+ case M68k::Bhi8:
+ return M68k::COND_HI;
+ case M68k::Bcc8:
+ return M68k::COND_CC;
+ case M68k::Bmi8:
+ return M68k::COND_MI;
+ case M68k::Bpl8:
+ return M68k::COND_PL;
+ case M68k::Bvs8:
+ return M68k::COND_VS;
+ case M68k::Bvc8:
+ return M68k::COND_VC;
+ }
+}
+
+bool M68kInstrInfo::AnalyzeBranchImpl(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+
+ auto UncondBranch =
+ std::pair<MachineBasicBlock::reverse_iterator, MachineBasicBlock *>{
+ MBB.rend(), nullptr};
+
+  // Instructions collected in EraseList are erased at the end of the scope
+  // (only when modification is allowed).
+ std::vector<std::reference_wrapper<llvm::MachineInstr>> EraseList;
+ auto FinalizeOnReturn = llvm::make_scope_exit([&EraseList] {
+ std::for_each(EraseList.begin(), EraseList.end(),
+ [](auto &ref) { ref.get().eraseFromParent(); });
+ });
+
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ for (auto iter = MBB.rbegin(); iter != MBB.rend(); iter = std::next(iter)) {
+
+ unsigned Opcode = iter->getOpcode();
+
+ if (iter->isDebugInstr())
+ continue;
+
+ // Working from the bottom, when we see a non-terminator instruction, we're
+ // done.
+ if (!isUnpredicatedTerminator(*iter))
+ break;
+
+ // A terminator that isn't a branch can't easily be handled by this
+ // analysis.
+ if (!iter->isBranch())
+ return true;
+
+ // Handle unconditional branches.
+ if (Opcode == M68k::BRA8 || Opcode == M68k::BRA16) {
+ if (!iter->getOperand(0).isMBB())
+ return true;
+ UncondBranch = {iter, iter->getOperand(0).getMBB()};
+
+ // TBB is used to indicate the unconditional destination.
+ TBB = UncondBranch.second;
+
+ if (!AllowModify)
+ continue;
+
+      // If the block has any instructions after the unconditional branch,
+      // erase them.
+ EraseList.insert(EraseList.begin(), MBB.rbegin(), iter);
+
+ Cond.clear();
+ FBB = nullptr;
+
+      // Erase the branch if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(UncondBranch.second)) {
+ TBB = nullptr;
+ EraseList.push_back(*iter);
+ UncondBranch = {MBB.rend(), nullptr};
+ }
+
+ continue;
+ }
+
+ // Handle conditional branches.
+ auto BranchCode = M68k::GetCondFromBranchOpc(Opcode);
+
+ // Can't handle indirect branch.
+ if (BranchCode == M68k::COND_INVALID)
+ return true;
+
+    // In practice we should never have an undef CCR operand; if we do, abort
+    // here, as we are not prepared to preserve the flag.
+    // ??? Is this required?
+ // if (iter->getOperand(1).isUndef())
+ // return true;
+
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ if (!iter->getOperand(0).isMBB())
+ return true;
+ MachineBasicBlock *CondBranchTarget = iter->getOperand(0).getMBB();
+
+ // If we see something like this:
+ //
+ // bcc l1
+ // bra l2
+ // ...
+ // l1:
+ // ...
+ // l2:
+ if (UncondBranch.first != MBB.rend()) {
+
+ assert(std::next(UncondBranch.first) == iter && "Wrong block layout.");
+
+ // And we are allowed to modify the block and the target block of the
+ // conditional branch is the direct successor of this block:
+ //
+ // bcc l1
+ // bra l2
+ // l1:
+ // ...
+ // l2:
+ //
+ // we change it to this if allowed:
+ //
+ // bncc l2
+ // l1:
+ // ...
+ // l2:
+ //
+ // Which is a bit more efficient.
+ if (AllowModify && MBB.isLayoutSuccessor(CondBranchTarget)) {
+
+ BranchCode = GetOppositeBranchCondition(BranchCode);
+ unsigned BNCC = GetCondBranchFromCond(BranchCode);
+
+ BuildMI(MBB, *UncondBranch.first, MBB.rfindDebugLoc(iter), get(BNCC))
+ .addMBB(UncondBranch.second);
+
+ EraseList.push_back(*iter);
+ EraseList.push_back(*UncondBranch.first);
+
+ TBB = UncondBranch.second;
+ FBB = nullptr;
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+
+ // Otherwise preserve TBB, FBB and Cond as requested
+ } else {
+ TBB = CondBranchTarget;
+ FBB = UncondBranch.second;
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ }
+
+ UncondBranch = {MBB.rend(), nullptr};
+ continue;
+ }
+
+ TBB = CondBranchTarget;
+ FBB = nullptr;
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+
+ continue;
+ }
+
+    // Handle subsequent conditional branches. Only the case where every
+    // conditional branch has the same condition and branches to the same
+    // destination as the first one is handled.
+ assert(Cond.size() == 1);
+ assert(TBB);
+
+ // If the conditions are the same, we can leave them alone.
+ auto OldBranchCode = static_cast<M68k::CondCode>(Cond[0].getImm());
+ if (!iter->getOperand(0).isMBB())
+ return true;
+ auto NewTBB = iter->getOperand(0).getMBB();
+ if (OldBranchCode == BranchCode && TBB == NewTBB)
+ continue;
+
+ // If they differ we cannot do much here.
+ return true;
+ }
+
+ return false;
+}
+
+bool M68kInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, AllowModify);
+}
+
+unsigned M68kInstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const {
+ assert(!BytesRemoved && "code size not handled");
+
+ MachineBasicBlock::iterator I = MBB.end();
+ unsigned Count = 0;
+
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+ if (I->getOpcode() != M68k::BRA8 &&
+ getCondFromBranchOpc(I->getOpcode()) == M68k::COND_INVALID)
+ break;
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned M68kInstrInfo::insertBranch(
+ MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "M68k branch conditions have one component!");
+ assert(!BytesAdded && "code size not handled");
+
+ if (Cond.empty()) {
+ // Unconditional branch?
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, DL, get(M68k::BRA8)).addMBB(TBB);
+ return 1;
+ }
+
+ // If FBB is null, it is implied to be a fall-through block.
+ bool FallThru = FBB == nullptr;
+
+ // Conditional branch.
+ unsigned Count = 0;
+ M68k::CondCode CC = (M68k::CondCode)Cond[0].getImm();
+ unsigned Opc = GetCondBranchFromCond(CC);
+ BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
+ ++Count;
+ if (!FallThru) {
+ // Two-way Conditional branch. Insert the second branch.
+ BuildMI(&MBB, DL, get(M68k::BRA8)).addMBB(FBB);
+ ++Count;
+ }
+ return Count;
+}
+
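+// A rough sketch of the sequences the two helpers below emit (the register
+// name is illustrative):
+//   sign-extend  i8  -> i32 : EXT16 on the i16 subreg of %d0, then EXT32 %d0
+//   sign-extend  i16 -> i32 : EXT32 %d0
+//   zero-extend  i8  -> i32 : AND32di %d0, 0xFF
+//   zero-extend  i16 -> i32 : AND32di %d0, 0xFFFF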
+void M68kInstrInfo::AddSExt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned Reg, MVT From, MVT To) const {
+ if (From == MVT::i8) {
+ unsigned R = Reg;
+    // EXT16 requires an i16 register
+ if (To == MVT::i32) {
+ R = RI.getSubReg(Reg, M68k::MxSubRegIndex16Lo);
+ assert(R && "No viable SUB register available");
+ }
+ BuildMI(MBB, I, DL, get(M68k::EXT16), R).addReg(R);
+ }
+
+ if (To == MVT::i32)
+ BuildMI(MBB, I, DL, get(M68k::EXT32), Reg).addReg(Reg);
+}
+
+void M68kInstrInfo::AddZExt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned Reg, MVT From, MVT To) const {
+
+ unsigned Mask, And;
+ if (From == MVT::i8)
+ Mask = 0xFF;
+ else
+ Mask = 0xFFFF;
+
+ if (To == MVT::i16)
+ And = M68k::AND16di;
+ else // i32
+ And = M68k::AND32di;
+
+ // TODO use xor r,r to decrease size
+ BuildMI(MBB, I, DL, get(And), Reg).addReg(Reg).addImm(Mask);
+}
+
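+// Expand a MOVX (move across register classes without extension) pseudo: the
+// source is rewritten to the super ("MEGA") register matching the destination
+// size and the pseudo becomes a plain MOV16rr/MOV32rr, or is erased entirely
+// when that super register already is the destination.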
+bool M68kInstrInfo::ExpandMOVX_RR(MachineInstrBuilder &MIB, MVT MVTDst,
+ MVT MVTSrc) const {
+ unsigned Move = MVTDst == MVT::i16 ? M68k::MOV16rr : M68k::MOV32rr;
+ unsigned Dst = MIB->getOperand(0).getReg();
+ unsigned Src = MIB->getOperand(1).getReg();
+
+ assert(Dst != Src && "You cannot use the same Regs with MOVX_RR");
+
+ const auto &TRI = getRegisterInfo();
+
+ const auto *RCDst = TRI.getMaximalPhysRegClass(Dst, MVTDst);
+ const auto *RCSrc = TRI.getMaximalPhysRegClass(Src, MVTSrc);
+
+ assert(RCDst && RCSrc && "Wrong use of MOVX_RR");
+ assert(RCDst != RCSrc && "You cannot use the same Reg Classes with MOVX_RR");
+
+ // We need to find the super source register that matches the size of Dst
+ unsigned SSrc = RI.getMatchingMegaReg(Src, RCDst);
+ assert(SSrc && "No viable MEGA register available");
+
+ DebugLoc DL = MIB->getDebugLoc();
+
+  // If it happens that the super source register is the destination register,
+  // we do nothing.
+ if (Dst == SSrc) {
+ LLVM_DEBUG(dbgs() << "Remove " << *MIB.getInstr() << '\n');
+ MIB->eraseFromParent();
+ } else { // otherwise we need to MOV
+ LLVM_DEBUG(dbgs() << "Expand " << *MIB.getInstr() << " to MOV\n");
+ MIB->setDesc(get(Move));
+ MIB->getOperand(1).setReg(SSrc);
+ }
+
+ return true;
+}
+
+/// Expand SExt MOVE pseudos into a MOV and an EXT if the operands are two
+/// different registers, or into just an EXT if they are the same register.
+bool M68kInstrInfo::ExpandMOVSZX_RR(MachineInstrBuilder &MIB, bool IsSigned,
+ MVT MVTDst, MVT MVTSrc) const {
+ LLVM_DEBUG(dbgs() << "Expand " << *MIB.getInstr() << " to ");
+
+ unsigned Move;
+
+ if (MVTDst == MVT::i16)
+ Move = M68k::MOV16rr;
+ else // i32
+ Move = M68k::MOV32rr;
+
+ unsigned Dst = MIB->getOperand(0).getReg();
+ unsigned Src = MIB->getOperand(1).getReg();
+
+ assert(Dst != Src && "You cannot use the same Regs with MOVSX_RR");
+
+ const auto &TRI = getRegisterInfo();
+
+ const auto *RCDst = TRI.getMaximalPhysRegClass(Dst, MVTDst);
+ const auto *RCSrc = TRI.getMaximalPhysRegClass(Src, MVTSrc);
+
+ assert(RCDst && RCSrc && "Wrong use of MOVSX_RR");
+ assert(RCDst != RCSrc && "You cannot use the same Reg Classes with MOVSX_RR");
+
+ // We need to find the super source register that matches the size of Dst
+ unsigned SSrc = RI.getMatchingMegaReg(Src, RCDst);
+ assert(SSrc && "No viable MEGA register available");
+
+ MachineBasicBlock &MBB = *MIB->getParent();
+ DebugLoc DL = MIB->getDebugLoc();
+
+ if (Dst != SSrc) {
+ LLVM_DEBUG(dbgs() << "Move and " << '\n');
+ BuildMI(MBB, MIB.getInstr(), DL, get(Move), Dst).addReg(SSrc);
+ }
+
+ if (IsSigned) {
+ LLVM_DEBUG(dbgs() << "Sign Extend" << '\n');
+ AddSExt(MBB, MIB.getInstr(), DL, Dst, MVTSrc, MVTDst);
+ } else {
+ LLVM_DEBUG(dbgs() << "Zero Extend" << '\n');
+ AddZExt(MBB, MIB.getInstr(), DL, Dst, MVTSrc, MVTDst);
+ }
+
+ MIB->eraseFromParent();
+
+ return true;
+}
+
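+// Expand an extending-load pseudo: the pseudo itself becomes a plain load into
+// the subregister of Dst matching the memory size, followed by a sign or zero
+// extension emitted with AddSExt/AddZExt above.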
+bool M68kInstrInfo::ExpandMOVSZX_RM(MachineInstrBuilder &MIB, bool IsSigned,
+ const MCInstrDesc &Desc, MVT MVTDst,
+ MVT MVTSrc) const {
+ LLVM_DEBUG(dbgs() << "Expand " << *MIB.getInstr() << " to LOAD and ");
+
+ unsigned Dst = MIB->getOperand(0).getReg();
+
+  // We need the subreg of Dst to make the instruction verifier happy, because
+  // the real machine instruction consumes and produces values of the same size,
+  // while the registers that will be used here fall into different classes,
+  // which upsets the verifier. We could use a bigger operation instead, but
+  // that would put extra pressure on cache and memory, so we don't.
+ unsigned SubDst =
+ RI.getSubReg(Dst, MVTSrc == MVT::i8 ? M68k::MxSubRegIndex8Lo
+ : M68k::MxSubRegIndex16Lo);
+ assert(SubDst && "No viable SUB register available");
+
+ // Make this a plain move
+ MIB->setDesc(Desc);
+ MIB->getOperand(0).setReg(SubDst);
+
+ MachineBasicBlock::iterator I = MIB.getInstr();
+ I++;
+ MachineBasicBlock &MBB = *MIB->getParent();
+ DebugLoc DL = MIB->getDebugLoc();
+
+ if (IsSigned) {
+ LLVM_DEBUG(dbgs() << "Sign Extend" << '\n');
+ AddSExt(MBB, I, DL, Dst, MVTSrc, MVTDst);
+ } else {
+ LLVM_DEBUG(dbgs() << "Zero Extend" << '\n');
+ AddZExt(MBB, I, DL, Dst, MVTSrc, MVTDst);
+ }
+
+ return true;
+}
+
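+// Expand PUSH/POP pseudos into plain moves through the stack pointer: a push
+// becomes a move whose destination is the predecremented stack pointer, a pop
+// becomes a move whose source is the postincremented stack pointer (see the
+// MOV descriptions chosen in expandPostRAPseudo below).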
+bool M68kInstrInfo::ExpandPUSH_POP(MachineInstrBuilder &MIB,
+ const MCInstrDesc &Desc, bool IsPush) const {
+ MachineBasicBlock::iterator I = MIB.getInstr();
+ I++;
+ MachineBasicBlock &MBB = *MIB->getParent();
+ MachineOperand MO = MIB->getOperand(0);
+ DebugLoc DL = MIB->getDebugLoc();
+ if (IsPush)
+ BuildMI(MBB, I, DL, Desc).addReg(RI.getStackRegister()).add(MO);
+ else
+ BuildMI(MBB, I, DL, Desc, MO.getReg()).addReg(RI.getStackRegister());
+
+ MIB->eraseFromParent();
+ return true;
+}
+
+bool M68kInstrInfo::ExpandCCR(MachineInstrBuilder &MIB, bool IsToCCR) const {
+
+ // Replace the pseudo instruction with the real one
+ if (IsToCCR)
+ MIB->setDesc(get(M68k::MOV16cd));
+ else
+ // FIXME M68010 or later is required
+ MIB->setDesc(get(M68k::MOV16dc));
+
+ // Promote used register to the next class
+ auto &Opd = MIB->getOperand(1);
+ Opd.setReg(getRegisterInfo().getMatchingSuperReg(
+ Opd.getReg(), M68k::MxSubRegIndex8Lo, &M68k::DR16RegClass));
+
+ return true;
+}
+
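+// Expand a single-register MOVEM pseudo (used for spills and reloads) into the
+// real MOVEM instruction: the register is promoted to its 32-bit super register
+// if necessary and encoded as a one-bit register mask immediate.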
+bool M68kInstrInfo::ExpandMOVEM(MachineInstrBuilder &MIB,
+ const MCInstrDesc &Desc, bool IsRM) const {
+ int Reg = 0, Offset = 0, Base = 0;
+ auto XR32 = RI.getRegClass(M68k::XR32RegClassID);
+ auto DL = MIB->getDebugLoc();
+ auto MI = MIB.getInstr();
+ auto &MBB = *MIB->getParent();
+
+ if (IsRM) {
+ Reg = MIB->getOperand(0).getReg();
+ Offset = MIB->getOperand(1).getImm();
+ Base = MIB->getOperand(2).getReg();
+ } else {
+ Offset = MIB->getOperand(0).getImm();
+ Base = MIB->getOperand(1).getReg();
+ Reg = MIB->getOperand(2).getReg();
+ }
+
+  // If the register is not in XR32 then it is smaller than 32 bits, so we
+  // implicitly promote it to 32 bits.
+ if (!XR32->contains(Reg)) {
+ Reg = RI.getMatchingMegaReg(Reg, XR32);
+    assert(Reg && "Has no meaningful MEGA register");
+ }
+
+ unsigned Mask = 1 << RI.getSpillRegisterOrder(Reg);
+ if (IsRM) {
+ BuildMI(MBB, MI, DL, Desc)
+ .addImm(Mask)
+ .addImm(Offset)
+ .addReg(Base)
+ .addReg(Reg, RegState::ImplicitDefine)
+ .copyImplicitOps(*MIB);
+ } else {
+ BuildMI(MBB, MI, DL, Desc)
+ .addImm(Offset)
+ .addReg(Base)
+ .addImm(Mask)
+ .addReg(Reg, RegState::Implicit)
+ .copyImplicitOps(*MIB);
+ }
+
+ MIB->eraseFromParent();
+
+ return true;
+}
+
+/// Expand a single-def pseudo instruction to a two-addr
+/// instruction with two undef reads of the register being defined.
+/// This is used for mapping:
+/// %d0 = SETCS_C32d
+/// to:
+/// %d0 = SUBX32dd %d0<undef>, %d0<undef>
+///
+static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
+ const MCInstrDesc &Desc) {
+ assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+ unsigned Reg = MIB->getOperand(0).getReg();
+ MIB->setDesc(Desc);
+
+ // MachineInstr::addOperand() will insert explicit operands before any
+ // implicit operands.
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+ // But we don't trust that.
+ assert(MIB->getOperand(1).getReg() == Reg &&
+ MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
+ return true;
+}
+
+bool M68kInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+ switch (MI.getOpcode()) {
+ case M68k::PUSH8d:
+ return ExpandPUSH_POP(MIB, get(M68k::MOV8ed), true);
+ case M68k::PUSH16d:
+ return ExpandPUSH_POP(MIB, get(M68k::MOV16er), true);
+ case M68k::PUSH32r:
+ return ExpandPUSH_POP(MIB, get(M68k::MOV32er), true);
+
+ case M68k::POP8d:
+ return ExpandPUSH_POP(MIB, get(M68k::MOV8do), false);
+ case M68k::POP16d:
+ return ExpandPUSH_POP(MIB, get(M68k::MOV16ro), false);
+ case M68k::POP32r:
+ return ExpandPUSH_POP(MIB, get(M68k::MOV32ro), false);
+
+ case M68k::SETCS_C8d:
+ return Expand2AddrUndef(MIB, get(M68k::SUBX8dd));
+ case M68k::SETCS_C16d:
+ return Expand2AddrUndef(MIB, get(M68k::SUBX16dd));
+ case M68k::SETCS_C32d:
+ return Expand2AddrUndef(MIB, get(M68k::SUBX32dd));
+ }
+ return false;
+}
+
+bool M68kInstrInfo::isPCRelRegisterOperandLegal(
+ const MachineOperand &MO) const {
+ assert(MO.isReg());
+ const auto *MI = MO.getParent();
+ const uint8_t *Beads = M68k::getMCInstrBeads(MI->getOpcode());
+ assert(*Beads);
+
+  // Only addressing mode k has a (non-PC) register with PCRel,
+  // so we're looking for EA beads equal to
+  // `3Bits<011>_1Bit<1>_2Bits<11>`.
+  // FIXME: There is an important caveat and two assumptions here.
+  // The caveat is that the EA encoding always sits on the LSB.
+  // The assumptions are that if there is more than one operand,
+  // the EA encoding for the source operand always sits on the LSB,
+  // and that the k addressing mode cannot be used on a destination
+  // operand. The last assumption is rather fragile, so we need to find
+  // a way around it.
+ const uint8_t EncEAk[3] = {0b011, 0b1, 0b11};
+ for (const uint8_t Pat : EncEAk) {
+ uint8_t Bead = *(Beads++);
+ if (!Bead)
+ return false;
+
+ switch (Bead & 0xF) {
+ default:
+ return false;
+ case M68kBeads::Bits1:
+ case M68kBeads::Bits2:
+ case M68kBeads::Bits3: {
+ uint8_t Val = (Bead & 0xF0) >> 4;
+ if (Val != Pat)
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+void M68kInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, MCRegister DstReg,
+ MCRegister SrcReg, bool KillSrc) const {
+ unsigned Opc = 0;
+
+ // First deal with the normal symmetric copies.
+ if (M68k::XR32RegClass.contains(DstReg, SrcReg))
+ Opc = M68k::MOV32rr;
+ else if (M68k::XR16RegClass.contains(DstReg, SrcReg))
+ Opc = M68k::MOV16rr;
+ else if (M68k::DR8RegClass.contains(DstReg, SrcReg))
+ Opc = M68k::MOV8dd;
+
+ if (Opc) {
+ BuildMI(MBB, MI, DL, get(Opc), DstReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ // Now deal with asymmetrically sized copies. The cases that follow are upcast
+ // moves.
+ //
+ // NOTE
+  // These moves are not aware of the type nature of these values and thus
+  // won't do any SExt or ZExt; the upper bits will basically contain garbage.
+ MachineInstrBuilder MIB(*MBB.getParent(), MI);
+ if (M68k::DR8RegClass.contains(SrcReg)) {
+ if (M68k::XR16RegClass.contains(DstReg))
+ Opc = M68k::MOVXd16d8;
+ else if (M68k::XR32RegClass.contains(DstReg))
+ Opc = M68k::MOVXd32d8;
+ } else if (M68k::XR16RegClass.contains(SrcReg) &&
+ M68k::XR32RegClass.contains(DstReg))
+ Opc = M68k::MOVXd32d16;
+
+ if (Opc) {
+ BuildMI(MBB, MI, DL, get(Opc), DstReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ bool FromCCR = SrcReg == M68k::CCR;
+ bool FromSR = SrcReg == M68k::SR;
+ bool ToCCR = DstReg == M68k::CCR;
+ bool ToSR = DstReg == M68k::SR;
+
+ if (FromCCR) {
+ assert(M68k::DR8RegClass.contains(DstReg) &&
+ "Need DR8 register to copy CCR");
+ Opc = M68k::MOV8dc;
+ } else if (ToCCR) {
+ assert(M68k::DR8RegClass.contains(SrcReg) &&
+ "Need DR8 register to copy CCR");
+ Opc = M68k::MOV8cd;
+ } else if (FromSR || ToSR)
+ llvm_unreachable("Cannot emit SR copy instruction");
+
+ if (Opc) {
+ BuildMI(MBB, MI, DL, get(Opc), DstReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
+ << RI.getName(DstReg) << '\n');
+ llvm_unreachable("Cannot emit physreg copy instruction");
+}
+
+namespace {
+unsigned getLoadStoreRegOpcode(unsigned Reg, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ const M68kSubtarget &STI, bool load) {
+ switch (TRI->getRegSizeInBits(*RC)) {
+ default:
+ llvm_unreachable("Unknown spill size");
+ case 8:
+ if (M68k::DR8RegClass.hasSubClassEq(RC))
+ return load ? M68k::MOVM8mp_P : M68k::MOVM8pm_P;
+ if (M68k::CCRCRegClass.hasSubClassEq(RC))
+ return load ? M68k::MOV16cp : M68k::MOV16pc;
+
+ llvm_unreachable("Unknown 1-byte regclass");
+ case 16:
+ assert(M68k::XR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
+ return load ? M68k::MOVM16mp_P : M68k::MOVM16pm_P;
+ case 32:
+ assert(M68k::XR32RegClass.hasSubClassEq(RC) && "Unknown 4-byte regclass");
+ return load ? M68k::MOVM32mp_P : M68k::MOVM32pm_P;
+ }
+}
+
+unsigned getStoreRegOpcode(unsigned SrcReg, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ const M68kSubtarget &STI) {
+ return getLoadStoreRegOpcode(SrcReg, RC, TRI, STI, false);
+}
+
+unsigned getLoadRegOpcode(unsigned DstReg, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ const M68kSubtarget &STI) {
+ return getLoadStoreRegOpcode(DstReg, RC, TRI, STI, true);
+}
+} // end anonymous namespace
+
+bool M68kInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
+ unsigned SubIdx, unsigned &Size,
+ unsigned &Offset,
+ const MachineFunction &MF) const {
+ // The slot size must be the maximum size so we can easily use MOVEM.L
+ Size = 4;
+ Offset = 0;
+ return true;
+}
+
+void M68kInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ Register SrcReg, bool IsKill,
+ int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ const MachineFunction &MF = *MBB.getParent();
+ assert(MF.getFrameInfo().getObjectSize(FrameIndex) == 4 &&
+ "Stack slot too small for store");
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, TRI, Subtarget);
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ // (0,FrameIndex) <- $reg
+ M68k::addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIndex)
+ .addReg(SrcReg, getKillRegState(IsKill));
+}
+
+void M68kInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ Register DstReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ const MachineFunction &MF = *MBB.getParent();
+ assert(MF.getFrameInfo().getObjectSize(FrameIndex) == 4 &&
+ "Stack slot too small for store");
+ unsigned Opc = getLoadRegOpcode(DstReg, RC, TRI, Subtarget);
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ M68k::addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DstReg), FrameIndex);
+}
+
+/// Return a virtual register initialized with the global base register
+/// value. Output instructions required to initialize the register in the
+/// function entry block, if necessary.
+///
+/// TODO Move this function to M68kMachineFunctionInfo.
+unsigned M68kInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+ M68kMachineFunctionInfo *MxFI = MF->getInfo<M68kMachineFunctionInfo>();
+ unsigned GlobalBaseReg = MxFI->getGlobalBaseReg();
+ if (GlobalBaseReg != 0)
+ return GlobalBaseReg;
+
+ // Create the register. The code to initialize it is inserted later,
+ // by the CGBR pass (below).
+ //
+ // NOTE
+  // Normally M68k uses the A5 register as the global base pointer, but this
+  // would create an unnecessary spill if we use fewer than 4 registers in the
+  // code; since A5 is callee-saved anyway, we could try to allocate a
+  // caller-saved register first and, if lucky, get one; otherwise it does not
+  // really matter which callee-saved register to use.
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ GlobalBaseReg = RegInfo.createVirtualRegister(&M68k::AR32_NOSPRegClass);
+ MxFI->setGlobalBaseReg(GlobalBaseReg);
+ return GlobalBaseReg;
+}
+
+std::pair<unsigned, unsigned>
+M68kInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF, 0u);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+M68kInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace M68kII;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_ABSOLUTE_ADDRESS, "m68k-absolute"},
+ {MO_PC_RELATIVE_ADDRESS, "m68k-pcrel"},
+ {MO_GOT, "m68k-got"},
+ {MO_GOTOFF, "m68k-gotoff"},
+ {MO_GOTPCREL, "m68k-gotpcrel"},
+ {MO_PLT, "m68k-plt"}};
+ return makeArrayRef(TargetFlags);
+}
+
+namespace {
+/// Create Global Base Reg pass. This initializes the PIC global base register
+struct CGBR : public MachineFunctionPass {
+ static char ID;
+ CGBR() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ const M68kSubtarget &STI = MF.getSubtarget<M68kSubtarget>();
+ M68kMachineFunctionInfo *MxFI = MF.getInfo<M68kMachineFunctionInfo>();
+
+ unsigned GlobalBaseReg = MxFI->getGlobalBaseReg();
+
+ // If we didn't need a GlobalBaseReg, don't insert code.
+ if (GlobalBaseReg == 0)
+ return false;
+
+ // Insert the set of GlobalBaseReg into the first MBB of the function
+ MachineBasicBlock &FirstMBB = MF.front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+ const M68kInstrInfo *TII = STI.getInstrInfo();
+
+    // Generate lea (_GLOBAL_OFFSET_TABLE_,%PC), %GlobalBaseReg
+ BuildMI(FirstMBB, MBBI, DL, TII->get(M68k::LEA32q), GlobalBaseReg)
+ .addExternalSymbol("_GLOBAL_OFFSET_TABLE_", M68kII::MO_GOTPCREL);
+
+ return true;
+ }
+
+ StringRef getPassName() const override {
+ return "M68k PIC Global Base Reg Initialization";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+} // namespace
+
+char CGBR::ID = 0;
+FunctionPass *llvm::createM68kGlobalBaseRegPass() { return new CGBR(); }
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kInstrInfo.h b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrInfo.h
new file mode 100644
index 0000000..a503b02
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrInfo.h
@@ -0,0 +1,339 @@
+//===-- M68kInstrInfo.h - M68k Instruction Information ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the M68k implementation of the TargetInstrInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_M68KINSTRINFO_H
+#define LLVM_LIB_TARGET_M68K_M68KINSTRINFO_H
+
+#include "M68k.h"
+#include "M68kRegisterInfo.h"
+
+#include "MCTargetDesc/M68kBaseInfo.h"
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "M68kGenInstrInfo.inc"
+
+namespace llvm {
+
+class M68kSubtarget;
+
+namespace M68k {
+// These MUST be kept in sync with the condition code definitions in M68kInstrInfo.td
+enum CondCode {
+ COND_T = 0, // True
+ COND_F = 1, // False
+ COND_HI = 2, // High
+ COND_LS = 3, // Less or Same
+ COND_CC = 4, // Carry Clear
+ COND_CS = 5, // Carry Set
+ COND_NE = 6, // Not Equal
+ COND_EQ = 7, // Equal
+ COND_VC = 8, // Overflow Clear
+ COND_VS = 9, // Overflow Set
+ COND_PL = 10, // Plus
+ COND_MI = 11, // Minus
+ COND_GE = 12, // Greater or Equal
+ COND_LT = 13, // Less Than
+ COND_GT = 14, // Greater Than
+ COND_LE = 15, // Less or Equal
+ LAST_VALID_COND = COND_LE,
+ COND_INVALID
+};
+
+// FIXME: it would be nice for TableGen to generate these predicates and
+// converters, maybe tag based.
+
+static inline M68k::CondCode GetOppositeBranchCondition(M68k::CondCode CC) {
+ switch (CC) {
+ default:
+ llvm_unreachable("Illegal condition code!");
+ case M68k::COND_T:
+ return M68k::COND_F;
+ case M68k::COND_F:
+ return M68k::COND_T;
+ case M68k::COND_HI:
+ return M68k::COND_LS;
+ case M68k::COND_LS:
+ return M68k::COND_HI;
+ case M68k::COND_CC:
+ return M68k::COND_CS;
+ case M68k::COND_CS:
+ return M68k::COND_CC;
+ case M68k::COND_NE:
+ return M68k::COND_EQ;
+ case M68k::COND_EQ:
+ return M68k::COND_NE;
+ case M68k::COND_VC:
+ return M68k::COND_VS;
+ case M68k::COND_VS:
+ return M68k::COND_VC;
+ case M68k::COND_PL:
+ return M68k::COND_MI;
+ case M68k::COND_MI:
+ return M68k::COND_PL;
+ case M68k::COND_GE:
+ return M68k::COND_LT;
+ case M68k::COND_LT:
+ return M68k::COND_GE;
+ case M68k::COND_GT:
+ return M68k::COND_LE;
+ case M68k::COND_LE:
+ return M68k::COND_GT;
+ }
+}
+
+static inline unsigned GetCondBranchFromCond(M68k::CondCode CC) {
+ switch (CC) {
+ default:
+ llvm_unreachable("Illegal condition code!");
+ case M68k::COND_EQ:
+ return M68k::Beq8;
+ case M68k::COND_NE:
+ return M68k::Bne8;
+ case M68k::COND_LT:
+ return M68k::Blt8;
+ case M68k::COND_LE:
+ return M68k::Ble8;
+ case M68k::COND_GT:
+ return M68k::Bgt8;
+ case M68k::COND_GE:
+ return M68k::Bge8;
+ case M68k::COND_CS:
+ return M68k::Bcs8;
+ case M68k::COND_LS:
+ return M68k::Bls8;
+ case M68k::COND_HI:
+ return M68k::Bhi8;
+ case M68k::COND_CC:
+ return M68k::Bcc8;
+ case M68k::COND_MI:
+ return M68k::Bmi8;
+ case M68k::COND_PL:
+ return M68k::Bpl8;
+ case M68k::COND_VS:
+ return M68k::Bvs8;
+ case M68k::COND_VC:
+ return M68k::Bvc8;
+ }
+}
+
+static inline M68k::CondCode GetCondFromBranchOpc(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return M68k::COND_INVALID;
+ case M68k::Beq8:
+ return M68k::COND_EQ;
+ case M68k::Bne8:
+ return M68k::COND_NE;
+ case M68k::Blt8:
+ return M68k::COND_LT;
+ case M68k::Ble8:
+ return M68k::COND_LE;
+ case M68k::Bgt8:
+ return M68k::COND_GT;
+ case M68k::Bge8:
+ return M68k::COND_GE;
+ case M68k::Bcs8:
+ return M68k::COND_CS;
+ case M68k::Bls8:
+ return M68k::COND_LS;
+ case M68k::Bhi8:
+ return M68k::COND_HI;
+ case M68k::Bcc8:
+ return M68k::COND_CC;
+ case M68k::Bmi8:
+ return M68k::COND_MI;
+ case M68k::Bpl8:
+ return M68k::COND_PL;
+ case M68k::Bvs8:
+ return M68k::COND_VS;
+ case M68k::Bvc8:
+ return M68k::COND_VC;
+ }
+}
+
+static inline unsigned IsCMP(unsigned Op) {
+ switch (Op) {
+ default:
+ return false;
+ case M68k::CMP8dd:
+ case M68k::CMP8df:
+ case M68k::CMP8di:
+ case M68k::CMP8dj:
+ case M68k::CMP8dp:
+ case M68k::CMP16dd:
+ case M68k::CMP16df:
+ case M68k::CMP16di:
+ case M68k::CMP16dj:
+ case M68k::CMP16dp:
+ return true;
+ }
+}
+
+static inline bool IsSETCC(unsigned SETCC) {
+ switch (SETCC) {
+ default:
+ return false;
+ case M68k::SETd8eq:
+ case M68k::SETd8ne:
+ case M68k::SETd8lt:
+ case M68k::SETd8ge:
+ case M68k::SETd8le:
+ case M68k::SETd8gt:
+ case M68k::SETd8cs:
+ case M68k::SETd8cc:
+ case M68k::SETd8ls:
+ case M68k::SETd8hi:
+ case M68k::SETd8pl:
+ case M68k::SETd8mi:
+ case M68k::SETd8vc:
+ case M68k::SETd8vs:
+ case M68k::SETj8eq:
+ case M68k::SETj8ne:
+ case M68k::SETj8lt:
+ case M68k::SETj8ge:
+ case M68k::SETj8le:
+ case M68k::SETj8gt:
+ case M68k::SETj8cs:
+ case M68k::SETj8cc:
+ case M68k::SETj8ls:
+ case M68k::SETj8hi:
+ case M68k::SETj8pl:
+ case M68k::SETj8mi:
+ case M68k::SETj8vc:
+ case M68k::SETj8vs:
+ case M68k::SETp8eq:
+ case M68k::SETp8ne:
+ case M68k::SETp8lt:
+ case M68k::SETp8ge:
+ case M68k::SETp8le:
+ case M68k::SETp8gt:
+ case M68k::SETp8cs:
+ case M68k::SETp8cc:
+ case M68k::SETp8ls:
+ case M68k::SETp8hi:
+ case M68k::SETp8pl:
+ case M68k::SETp8mi:
+ case M68k::SETp8vc:
+ case M68k::SETp8vs:
+ return true;
+ }
+}
+
+} // namespace M68k
+
+class M68kInstrInfo : public M68kGenInstrInfo {
+ virtual void anchor();
+
+protected:
+ const M68kSubtarget &Subtarget;
+ const M68kRegisterInfo RI;
+
+public:
+ explicit M68kInstrInfo(const M68kSubtarget &STI);
+
+ static const M68kInstrInfo *create(M68kSubtarget &STI);
+
+ /// TargetInstrInfo is a superset of MRegister info. As such, whenever a
+ /// client has an instance of instruction info, it should always be able to
+ /// get register info as well (through this method).
+ const M68kRegisterInfo &getRegisterInfo() const { return RI; };
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ bool AnalyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
+ bool KillSrc) const override;
+
+ bool getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx,
+ unsigned &Size, unsigned &Offset,
+ const MachineFunction &MF) const override;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, Register SrcReg,
+ bool IsKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, Register DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ bool isPCRelRegisterOperandLegal(const MachineOperand &MO) const override;
+
+ /// Add appropriate SExt nodes
+ void AddSExt(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ DebugLoc DL, unsigned Reg, MVT From, MVT To) const;
+
+ /// Add appropriate ZExt nodes
+ void AddZExt(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ DebugLoc DL, unsigned Reg, MVT From, MVT To) const;
+
+ /// Move across register classes without extension
+ bool ExpandMOVX_RR(MachineInstrBuilder &MIB, MVT MVTDst, MVT MVTSrc) const;
+
+ /// Move from register and extend
+ bool ExpandMOVSZX_RR(MachineInstrBuilder &MIB, bool IsSigned, MVT MVTDst,
+ MVT MVTSrc) const;
+
+ /// Move from memory and extend
+ bool ExpandMOVSZX_RM(MachineInstrBuilder &MIB, bool IsSigned,
+ const MCInstrDesc &Desc, MVT MVTDst, MVT MVTSrc) const;
+
+ /// Push/Pop to/from stack
+ bool ExpandPUSH_POP(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
+ bool IsPush) const;
+
+ /// Moves to/from CCR
+ bool ExpandCCR(MachineInstrBuilder &MIB, bool IsToCCR) const;
+
+ /// Expand all MOVEM pseudos into real MOVEMs
+ bool ExpandMOVEM(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
+ bool IsRM) const;
+
+  /// Return a virtual register initialized with the global base register
+ /// value. Output instructions required to initialize the register in the
+ /// function entry block, if necessary.
+ unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kInstrInfo.td b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrInfo.td
new file mode 100644
index 0000000..e743213
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrInfo.td
@@ -0,0 +1,687 @@
+//== M68kInstrInfo.td - Main M68k Instruction Definition -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes the M68k instruction set, defining the instructions
+/// and properties of the instructions which are needed for code generation,
+/// machine code emission, and analysis.
+///
+//===----------------------------------------------------------------------===//
+
+include "M68kInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Profiles
+//===----------------------------------------------------------------------===//
+
+def MxSDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+def MxSDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+def MxSDT_Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def MxSDT_Ret : SDTypeProfile<0, -1, [
+ /* ADJ */ SDTCisVT<0, i32>
+]>;
+
+def MxSDT_TCRet : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def MxSDT_Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def MxSDT_UnArithCCROut : SDTypeProfile<2, 1, [
+ /* RES */ SDTCisInt<0>,
+ /* CCR */ SDTCisVT<1, i8>,
+ /* OPD */ SDTCisSameAs<0, 2>
+]>;
+
+// RES, CCR <- op LHS, RHS
+def MxSDT_BiArithCCROut : SDTypeProfile<2, 2, [
+ /* RES */ SDTCisInt<0>,
+ /* CCR */ SDTCisVT<1, i8>,
+ /* LHS */ SDTCisSameAs<0, 2>,
+ /* RHS */ SDTCisSameAs<0, 3>
+]>;
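+// For example, the MxAdd/MxSub/MxOr/MxXor/MxAnd nodes defined below use this
+// profile: they produce the arithmetic result together with an i8 CCR value.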
+
+// RES, CCR <- op LHS, RHS, CCR
+def MxSDT_BiArithCCRInOut : SDTypeProfile<2, 3, [
+ /* RES 1 */ SDTCisInt<0>,
+ /* CCR */ SDTCisVT<1, i8>,
+ /* LHS */ SDTCisSameAs<0, 2>,
+ /* RHS */ SDTCisSameAs<0, 3>,
+ /* CCR */ SDTCisSameAs<1, 4>
+]>;
+
+// RES1, RES2, CCR <- op LHS, RHS
+def MxSDT_2BiArithCCROut : SDTypeProfile<3, 2, [
+ /* RES 1 */ SDTCisInt<0>,
+ /* RES 2 */ SDTCisSameAs<0, 1>,
+ /* CCR */ SDTCisVT<1, i8>,
+ /* LHS */ SDTCisSameAs<0, 2>,
+ /* RHS */ SDTCisSameAs<0, 3>
+]>;
+
+def MxSDT_CmpTest : SDTypeProfile<1, 2, [
+ /* CCR */ SDTCisVT<0, i8>,
+ /* Ops */ SDTCisSameAs<1, 2>
+]>;
+
+def MxSDT_Cmov : SDTypeProfile<1, 4, [
+ /* ARG */ SDTCisSameAs<0, 1>,
+ /* ARG */ SDTCisSameAs<1, 2>,
+ /* Cond */ SDTCisVT<3, i8>,
+ /* CCR */ SDTCisVT<4, i8>
+]>;
+
+def MxSDT_BrCond : SDTypeProfile<0, 3, [
+ /* Dest */ SDTCisVT<0, OtherVT>,
+ /* Cond */ SDTCisVT<1, i8>,
+ /* CCR */ SDTCisVT<2, i8>
+]>;
+
+def MxSDT_SetCC : SDTypeProfile<1, 2, [
+ /* BOOL */ SDTCisVT<0, i8>,
+ /* Cond */ SDTCisVT<1, i8>,
+ /* CCR */ SDTCisVT<2, i8>
+]>;
+
+def MxSDT_SetCC_C : SDTypeProfile<1, 2, [
+ /* BOOL */ SDTCisInt<0>,
+ /* Cond */ SDTCisVT<1, i8>,
+ /* CCR */ SDTCisVT<2, i8>
+]>;
+
+
+def MxSDT_SEG_ALLOCA : SDTypeProfile<1, 1,[
+ /* MEM */ SDTCisVT<0, iPTR>,
+ /* SIZE */ SDTCisVT<1, iPTR>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// Nodes
+//===----------------------------------------------------------------------===//
+
+def MxCallSeqStart : SDNode<"ISD::CALLSEQ_START", MxSDT_CallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+
+def MxCallSeqEnd : SDNode<"ISD::CALLSEQ_END", MxSDT_CallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def MxCall : SDNode<"M68kISD::CALL", MxSDT_Call,
+ [SDNPHasChain, SDNPOutGlue,
+ SDNPOptInGlue, SDNPVariadic]>;
+
+def MxRet : SDNode<"M68kISD::RET", MxSDT_Ret,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def MxTCRet : SDNode<"M68kISD::TC_RETURN", MxSDT_TCRet,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def MxWrapper : SDNode<"M68kISD::Wrapper", MxSDT_Wrapper>;
+def MxWrapperPC : SDNode<"M68kISD::WrapperPC", MxSDT_Wrapper>;
+
+def MxAdd : SDNode<"M68kISD::ADD", MxSDT_BiArithCCROut, [SDNPCommutative]>;
+def MxSub : SDNode<"M68kISD::SUB", MxSDT_BiArithCCROut>;
+def MxOr : SDNode<"M68kISD::OR", MxSDT_BiArithCCROut, [SDNPCommutative]>;
+def MxXor : SDNode<"M68kISD::XOR", MxSDT_BiArithCCROut, [SDNPCommutative]>;
+def MxAnd : SDNode<"M68kISD::AND", MxSDT_BiArithCCROut, [SDNPCommutative]>;
+
+def MxAddX : SDNode<"M68kISD::ADDX", MxSDT_BiArithCCRInOut>;
+def MxSubX : SDNode<"M68kISD::SUBX", MxSDT_BiArithCCRInOut>;
+
+def MxSMul : SDNode<"M68kISD::SMUL", MxSDT_BiArithCCROut, [SDNPCommutative]>;
+def MxUMul : SDNode<"M68kISD::UMUL", MxSDT_2BiArithCCROut, [SDNPCommutative]>;
+
+def MxCmp : SDNode<"M68kISD::CMP", MxSDT_CmpTest>;
+def MxBt : SDNode<"M68kISD::BT", MxSDT_CmpTest>;
+
+def MxCmov : SDNode<"M68kISD::CMOV", MxSDT_Cmov>;
+def MxBrCond : SDNode<"M68kISD::BRCOND", MxSDT_BrCond, [SDNPHasChain]>;
+def MxSetCC : SDNode<"M68kISD::SETCC", MxSDT_SetCC>;
+def MxSetCC_C : SDNode<"M68kISD::SETCC_CARRY", MxSDT_SetCC_C>;
+
+
+def MxSegAlloca : SDNode<"M68kISD::SEG_ALLOCA", MxSDT_SEG_ALLOCA,
+ [SDNPHasChain]>;
+
+
+//===----------------------------------------------------------------------===//
+// Operands
+//===----------------------------------------------------------------------===//
+
+/// Size is the size of the data, either the width of a register in bits or the
+/// number of bits addressed in memory. The size id is a letter that identifies
+/// the size.
+class MxSize<int num, string id, string full> {
+ int Num = num;
+ string Id = id;
+ string Full = full;
+}
+
+def MxSize8 : MxSize<8, "b", "byte">;
+def MxSize16 : MxSize<16, "w", "word">;
+def MxSize32 : MxSize<32, "l", "long">;
+
+class MxOpClass<string name> : AsmOperandClass {
+ let Name = name;
+ let ParserMethod = "parseMemOp";
+}
+
+def MxRegClass : MxOpClass<"Reg">;
+
+class MxOperand<ValueType vt, MxSize size, string letter, RegisterClass rc, dag pat = (null_frag)> {
+ ValueType VT = vt;
+ string Letter = letter;
+ MxSize Size = size;
+ RegisterClass RC = rc;
+ dag Pat = pat;
+}
+
+class MxRegOp<ValueType vt,
+ RegisterClass rc,
+ MxSize size,
+ string letter,
+ string pm = "printOperand">
+ : RegisterOperand<rc, pm>,
+ MxOperand<vt, size, letter, rc> {
+ let ParserMatchClass = MxRegClass;
+}
+
+// REGISTER DIRECT. The operand is in the register specified by
+// the effective address register field.
+def MxXRD16 : MxRegOp<i16, XR16, MxSize16, "r">;
+def MxXRD32 : MxRegOp<i32, XR32, MxSize32, "r">;
+
+def MxXRD16_TC : MxRegOp<i16, XR16_TC, MxSize16, "r">;
+def MxXRD32_TC : MxRegOp<i32, XR32_TC, MxSize32, "r">;
+
+// DATA REGISTER DIRECT. The operand is in the data register specified by
+// the effective address register field.
+def MxDRD8 : MxRegOp<i8, DR8, MxSize8, "d">;
+def MxDRD16 : MxRegOp<i16, DR16, MxSize16, "d">;
+def MxDRD32 : MxRegOp<i32, DR32, MxSize32, "d">;
+
+def MxDRD16_TC : MxRegOp<i16, DR16_TC, MxSize16, "d">;
+def MxDRD32_TC : MxRegOp<i32, DR32_TC, MxSize32, "d">;
+
+// ADDRESS REGISTER DIRECT. The operand is in the address register specified by
+// the effective address register field.
+def MxARD16 : MxRegOp<i16, AR16, MxSize16, "a">;
+def MxARD32 : MxRegOp<i32, AR32, MxSize32, "a">;
+
+def MxARD16_TC : MxRegOp<i16, AR16_TC, MxSize16, "a">;
+def MxARD32_TC : MxRegOp<i32, AR32_TC, MxSize32, "a">;
+
+class MxMemOp<dag ops, MxSize size, string letter,
+ string printMethod = "printOperand",
+ AsmOperandClass parserMatchClass = ImmAsmOperand>
+ : Operand<iPTR>, MxOperand<iPTR, size, letter, ?> {
+ let PrintMethod = printMethod;
+ let MIOperandInfo = ops;
+ let ParserMatchClass = parserMatchClass;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// ADDRESS REGISTER INDIRECT. The address of the operand is in the address
+// register specified by the register field. The reference is classified as
+// a data reference with the exception of the jump and jump-to-subroutine
+// instructions.
+def MxARI : MxOpClass<"ARI">;
+def MxARI8 : MxMemOp<(ops AR32), MxSize8, "j", "printARI8Mem", MxARI>;
+def MxARI16 : MxMemOp<(ops AR32), MxSize16, "j", "printARI16Mem", MxARI>;
+def MxARI32 : MxMemOp<(ops AR32), MxSize32, "j", "printARI32Mem", MxARI>;
+
+def MxARI8_TC : MxMemOp<(ops AR32_TC), MxSize8, "j", "printARI8Mem", MxARI>;
+def MxARI16_TC : MxMemOp<(ops AR32_TC), MxSize16, "j", "printARI16Mem", MxARI>;
+def MxARI32_TC : MxMemOp<(ops AR32_TC), MxSize32, "j", "printARI32Mem", MxARI>;
+
+// ADDRESS REGISTER INDIRECT WITH POSTINCREMENT. The address of the operand is
+// in the address register specified by the register field. After the operand
+// address is used, it is incremented by one, two, or four depending upon whether
+// the size of the operand is byte, word, or long word. If the address register
+// is the stack pointer and the operand size is byte, the address is incremented
+// by two rather than one to keep the stack pointer on a word boundary.
+// The reference is classified as a data reference.
+def MxARIPI : MxOpClass<"ARIPI">;
+def MxARIPI8 : MxMemOp<(ops AR32), MxSize8, "o", "printARIPI8Mem", MxARIPI>;
+def MxARIPI16 : MxMemOp<(ops AR32), MxSize16, "o", "printARIPI16Mem", MxARIPI>;
+def MxARIPI32 : MxMemOp<(ops AR32), MxSize32, "o", "printARIPI32Mem", MxARIPI>;
+
+def MxARIPI8_TC : MxMemOp<(ops AR32_TC), MxSize8, "o", "printARIPI8Mem", MxARIPI>;
+def MxARIPI16_TC : MxMemOp<(ops AR32_TC), MxSize16, "o", "printARIPI16Mem", MxARIPI>;
+def MxARIPI32_TC : MxMemOp<(ops AR32_TC), MxSize32, "o", "printARIPI32Mem", MxARIPI>;
+
+// ADDRESS REGISTER INDIRECT WITH PREDECREMENT. The address of the operand is in
+// the address register specified by the register field. Before the operand
+// address is used, it is decremented by one, two, or four depending upon whether
+// the operand size is byte, word, or long word. If the address register is
+// the stack pointer and the operand size is byte, the address is decremented by
+// two rather than one to keep the stack pointer on a word boundary.
+// The reference is classified as a data reference.
+def MxARIPD : MxOpClass<"ARIPD">;
+def MxARIPD8 : MxMemOp<(ops AR32), MxSize8, "e", "printARIPD8Mem", MxARIPD>;
+def MxARIPD16 : MxMemOp<(ops AR32), MxSize16, "e", "printARIPD16Mem", MxARIPD>;
+def MxARIPD32 : MxMemOp<(ops AR32), MxSize32, "e", "printARIPD32Mem", MxARIPD>;
+
+def MxARIPD8_TC : MxMemOp<(ops AR32_TC), MxSize8, "e", "printARIPD8Mem", MxARIPD>;
+def MxARIPD16_TC : MxMemOp<(ops AR32_TC), MxSize16, "e", "printARIPD16Mem", MxARIPD>;
+def MxARIPD32_TC : MxMemOp<(ops AR32_TC), MxSize32, "e", "printARIPD32Mem", MxARIPD>;
+
+// ADDRESS REGISTER INDIRECT WITH DISPLACEMENT. This addressing mode requires one
+// word of extension. The address of the operand is the sum of the address in
+// the address register and the sign-extended 16-bit displacement integer in the
+// extension word. The reference is classified as a data reference with the
+// exception of the jump and jump-to-subroutine instructions.
+def MxARID : MxOpClass<"ARID">;
+def MxARID8 : MxMemOp<(ops i16imm, AR32), MxSize8, "p", "printARID8Mem", MxARID>;
+def MxARID16 : MxMemOp<(ops i16imm, AR32), MxSize16, "p", "printARID16Mem", MxARID>;
+def MxARID32 : MxMemOp<(ops i16imm, AR32), MxSize32, "p", "printARID32Mem", MxARID>;
+
+def MxARID8_TC : MxMemOp<(ops i16imm, AR32_TC), MxSize8, "p", "printARID8Mem", MxARID>;
+def MxARID16_TC : MxMemOp<(ops i16imm, AR32_TC), MxSize16, "p", "printARID16Mem", MxARID>;
+def MxARID32_TC : MxMemOp<(ops i16imm, AR32_TC), MxSize32, "p", "printARID32Mem", MxARID>;
+
+// ADDRESS REGISTER INDIRECT WITH INDEX. This addressing mode requires one word
+// of extension. The address of the operand is the sum of the address in the
+// address register, the sign-extended displacement integer in the low order eight
+// bits of the extension word, and the contents of the index register.
+// The reference is classified as a data reference with the exception of the
+// jump and jump-to-subroutine instructions
+def MxARII : MxOpClass<"ARII">;
+def MxARII8 : MxMemOp<(ops i8imm, AR32, XR32), MxSize8, "f", "printARII8Mem", MxARII>;
+def MxARII16 : MxMemOp<(ops i8imm, AR32, XR32), MxSize16, "f", "printARII16Mem", MxARII>;
+def MxARII32 : MxMemOp<(ops i8imm, AR32, XR32), MxSize32, "f", "printARII32Mem", MxARII>;
+
+def MxARII8_TC : MxMemOp<(ops i8imm, AR32_TC, XR32_TC), MxSize8, "f", "printARII8Mem", MxARII>;
+def MxARII16_TC : MxMemOp<(ops i8imm, AR32_TC, XR32_TC), MxSize16, "f", "printARII16Mem", MxARII>;
+def MxARII32_TC : MxMemOp<(ops i8imm, AR32_TC, XR32_TC), MxSize32, "f", "printARII32Mem", MxARII>;
+
+// ABSOLUTE SHORT ADDRESS. This addressing mode requires one word of extension.
+// The address of the operand is the extension word. The 16-bit address is sign
+// extended before it is used. The reference is classified as a data reference
+// with the exception of the jump and jump-to-subroutine instructions.
+def MxAddr : MxOpClass<"Addr">;
+def MxAS8 : MxMemOp<(ops OtherVT), MxSize8, "B", "printAS8Mem", MxAddr>;
+def MxAS16 : MxMemOp<(ops OtherVT), MxSize16, "B", "printAS16Mem", MxAddr>;
+def MxAS32 : MxMemOp<(ops OtherVT), MxSize32, "B", "printAS32Mem", MxAddr>;
+
+// ABSOLUTE LONG ADDRESS. This addressing mode requires two words of extension.
+// The address of the operand is developed by the concatenation of the extension
+// words. The high order part of the address is the first extension word; the low
+// order part of the address is the second extension word. The reference is
+// classified as a data reference with the exception of the jump and
+// jump-to-subroutine instructions.
+def MxAL8 : MxMemOp<(ops OtherVT), MxSize8, "b", "printAL8Mem", MxAddr>;
+def MxAL16 : MxMemOp<(ops OtherVT), MxSize16, "b", "printAL16Mem", MxAddr>;
+def MxAL32 : MxMemOp<(ops OtherVT), MxSize32, "b", "printAL32Mem", MxAddr>;
+
+def MxPCD : MxOpClass<"PCD">;
+def MxPCI : MxOpClass<"PCI">;
+
+let OperandType = "OPERAND_PCREL" in {
+// PROGRAM COUNTER WITH DISPLACEMENT. This addressing mode requires one word of
+// extension. The address of the operand is the sum of the address in the program
+// counter and the sign-extended 16-bit displacement integer in the extension
+// word. The value in the program counter is the address of the extension word.
+// The reference is classified as a program reference.
+def MxPCD8 : MxMemOp<(ops i16imm), MxSize8, "q", "printPCD8Mem", MxPCD>;
+def MxPCD16 : MxMemOp<(ops i16imm), MxSize16, "q", "printPCD16Mem", MxPCD>;
+def MxPCD32 : MxMemOp<(ops i16imm), MxSize32, "q", "printPCD32Mem", MxPCD>;
+
+// PROGRAM COUNTER WITH INDEX. This addressing mode requires one word of
+// extension. The address is the sum of the address in the program counter, the
+// sign-extended displacement integer in the lower eight bits of the extension
+// word, and the contents of the index register. The value in the program
+// counter is the address of the extension word. This reference is classified as
+// a program reference.
+def MxPCI8 : MxMemOp<(ops i8imm, XR32), MxSize8, "k", "printPCI8Mem", MxPCI>;
+def MxPCI16 : MxMemOp<(ops i8imm, XR32), MxSize16, "k", "printPCI16Mem", MxPCI>;
+def MxPCI32 : MxMemOp<(ops i8imm, XR32), MxSize32, "k", "printPCI32Mem", MxPCI>;
+} // OPERAND_PCREL
+
+def MxImm : AsmOperandClass {
+ let Name = "MxImm";
+ let PredicateMethod = "isImm";
+ let RenderMethod = "addImmOperands";
+ let ParserMethod = "parseImm";
+}
+
+class MxOp<ValueType vt, MxSize size, string letter>
+ : Operand<vt>,
+ MxOperand<vt, size, letter, ?> {
+ let ParserMatchClass = MxImm;
+}
+
+let OperandType = "OPERAND_IMMEDIATE",
+ PrintMethod = "printImmediate" in {
+// IMMEDIATE DATA. This addressing mode requires either one or two words of
+// extension depending on the size of the operation.
+// Byte Operation - operand is low order byte of extension word
+// Word Operation - operand is extension word
+// Long Word Operation - operand is in the two extension words,
+// high order 16 bits are in the first
+// extension word, low order 16 bits are
+// in the second extension word.
+def Mxi8imm : MxOp<i8, MxSize8, "i">;
+def Mxi16imm : MxOp<i16, MxSize16, "i">;
+def Mxi32imm : MxOp<i32, MxSize32, "i">;
+} // OPERAND_IMMEDIATE
+
+let OperandType = "OPERAND_PCREL",
+ ParserMatchClass = MxAddr,
+ PrintMethod = "printPCRelImm" in {
+
+// Branch targets have OtherVT type and print as pc-relative values.
+def MxBrTarget8 : Operand<OtherVT>;
+def MxBrTarget16 : Operand<OtherVT>;
+def MxBrTarget32 : Operand<OtherVT>;
+
+} // OPERAND_PCREL
+
+// Used with MOVEM
+def MxMoveMask : MxOp<i16, MxSize16, "m"> {
+ let OperandType = "OPERAND_IMMEDIATE";
+ let PrintMethod = "printMoveMask";
+}
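+// The mask is the MOVEM register list: bit i selects the register with spill
+// order i (M68kInstrInfo::ExpandMOVEM builds it as 1 << getSpillRegisterOrder).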
+
+//===----------------------------------------------------------------------===//
+// Predicates
+//===----------------------------------------------------------------------===//
+
+def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
+def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
+ "TM.getCodeModel() != CodeModel::Kernel">;
+def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
+ "TM.getCodeModel() == CodeModel::Kernel">;
+def IsPIC : Predicate<"TM.isPositionIndependent()">;
+def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
+def IsM68000 : Predicate<"Subtarget.IsM68000()">;
+def IsM68010 : Predicate<"Subtarget.IsM68010()">;
+def IsM68020 : Predicate<"Subtarget.IsM68020()">;
+def IsM68030 : Predicate<"Subtarget.IsM68030()">;
+def IsM68040 : Predicate<"Subtarget.IsM68040()">;
+
+
+//===----------------------------------------------------------------------===//
+// Condition Codes
+//
+// These MUST be kept in sync with the condition codes enum in M68kInstrInfo.h
+//===----------------------------------------------------------------------===//
+
+def MxCONDt : PatLeaf<(i8 0)>; // True
+def MxCONDf : PatLeaf<(i8 1)>; // False
+def MxCONDhi : PatLeaf<(i8 2)>; // High
+def MxCONDls : PatLeaf<(i8 3)>; // Less or Same
+def MxCONDcc : PatLeaf<(i8 4)>; // Carry Clear
+def MxCONDcs : PatLeaf<(i8 5)>; // Carry Set
+def MxCONDne : PatLeaf<(i8 6)>; // Not Equal
+def MxCONDeq : PatLeaf<(i8 7)>; // Equal
+def MxCONDvc : PatLeaf<(i8 8)>; // Overflow Clear
+def MxCONDvs : PatLeaf<(i8 9)>; // Overflow Set
+def MxCONDpl : PatLeaf<(i8 10)>; // Plus
+def MxCONDmi : PatLeaf<(i8 11)>; // Minus
+def MxCONDge : PatLeaf<(i8 12)>; // Greater or Equal
+def MxCONDlt : PatLeaf<(i8 13)>; // Less Than
+def MxCONDgt : PatLeaf<(i8 14)>; // Greater Than
+def MxCONDle : PatLeaf<(i8 15)>; // Less or Equal
+
+
+//===----------------------------------------------------------------------===//
+// Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// NOTE: Though this CP is not strictly necessary, it will simplify instruction
+// definitions.
+def MxCP_ARI : ComplexPattern<iPTR, 1, "SelectARI",
+ [], [SDNPWantParent]>;
+
+def MxCP_ARIPI : ComplexPattern<iPTR, 1, "SelectARIPI",
+ [], [SDNPWantParent]>;
+
+def MxCP_ARIPD : ComplexPattern<iPTR, 1, "SelectARIPD",
+ [], [SDNPWantParent]>;
+
+def MxCP_ARID : ComplexPattern<iPTR, 2, "SelectARID",
+ [add, sub, mul, or, shl, frameindex],
+ [SDNPWantParent]>;
+
+def MxCP_ARII : ComplexPattern<iPTR, 3, "SelectARII",
+ [add, sub, mul, or, shl, frameindex],
+ [SDNPWantParent]>;
+
+def MxCP_AL : ComplexPattern<iPTR, 1, "SelectAL",
+ [add, sub, mul, or, shl],
+ [SDNPWantParent]>;
+
+def MxCP_PCD : ComplexPattern<iPTR, 1, "SelectPCD",
+ [add, sub, mul, or, shl],
+ [SDNPWantParent]>;
+
+def MxCP_PCI : ComplexPattern<iPTR, 2, "SelectPCI",
+ [add, sub, mul, or, shl], [SDNPWantParent]>;
+
+
+//===----------------------------------------------------------------------===//
+// Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def MximmSExt8 : PatLeaf<(i8 imm)>;
+def MximmSExt16 : PatLeaf<(i16 imm)>;
+def MximmSExt32 : PatLeaf<(i32 imm)>;
+
+// Used for shifts and rotations, since M68k immediates in these instructions
+// are 1 <= i <= 8. Generally, if the immediate is bigger than 8 it will be
+// moved to a register and then the operation is performed.
+//
+// TODO: Need to evaluate whether splitting one big shift (or rotate)
+// into a few smaller ones is faster than doing a move; if so, do custom lowering.
+def Mximm8_1to8 : ImmLeaf<i8, [{ return Imm >= 1 && Imm <= 8; }]>;
+def Mximm16_1to8 : ImmLeaf<i16, [{ return Imm >= 1 && Imm <= 8; }]>;
+def Mximm32_1to8 : ImmLeaf<i32, [{ return Imm >= 1 && Imm <= 8; }]>;
+
+// Helper fragments for loads.
+// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def Mxloadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 2 && !LD->isSimple();
+ return false;
+}]>;
+
+def Mxloadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 4 && !LD->isSimple();
+ return false;
+}]>;
+
+def Mxloadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>;
+
+def MxSExtLoadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def MxSExtLoadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def MxSExtLoadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+
+def MxZExtLoadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>;
+def MxZExtLoadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def MxZExtLoadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def MxZExtLoadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def MxZExtLoadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def MxZExtLoadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+
+def MxExtLoadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>;
+def MxExtLoadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def MxExtLoadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def MxExtLoadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def MxExtLoadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def MxExtLoadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+
+
+//===----------------------------------------------------------------------===//
+// Type Fixtures
+//
+// Type Fixtures are ValueType related information sets that usually go together
+//===----------------------------------------------------------------------===//
+
+// TODO: make it folded like MxType8.F.Op and MxType8.F.Pat
+// TODO: move strings into a META subclass
+// vt: Type of data this fixture refers to
+// prefix: Prefix used to identify the type
+// postfix: Postfix used to qualify the type
+class MxType<ValueType vt, string prefix, string postfix,
+ // rLet: Register letter
+ // rOp: Supported any register operand
+ string rLet, MxOperand rOp,
+ // jOp: Supported ARI operand
+ // jPat: What ARI pattern to use
+ MxOperand jOp, ComplexPattern jPat,
+ // oOp: Supported ARIPI operand
+ // oPat: What ARIPI pattern is used
+ MxOperand oOp, ComplexPattern oPat,
+ // eOp: Supported ARIPD operand
+ // ePat: What ARIPD pattern is used
+ MxOperand eOp, ComplexPattern ePat,
+ // pOp: Supported ARID operand
+ // pPat: What ARID pattern is used
+ MxOperand pOp, ComplexPattern pPat,
+ // fOp: Supported ARII operand
+ // fPat: What ARII pattern is used
+ MxOperand fOp, ComplexPattern fPat,
+ // bOp: Supported absolute operand
+ // bPat: What absolute pattern is used
+ MxOperand bOp, ComplexPattern bPat,
+ // qOp: Supported PCD operand
+ // qPat: What PCD pattern is used
+ MxOperand qOp, ComplexPattern qPat,
+             // kOp: Supported PCI operand
+             // kPat: What PCI pattern is used
+ MxOperand kOp, ComplexPattern kPat,
+ // iOp: Supported immediate operand
+ // iPat: What immediate pattern is used
+ MxOperand iOp, PatFrag iPat,
+ // load: What load operation is used with MEM
+ PatFrag load> {
+ int Size = vt.Size;
+ ValueType VT = vt;
+ string Prefix = prefix;
+ string Postfix = postfix;
+
+ string RLet = rLet;
+ MxOperand ROp = rOp;
+
+ MxOperand JOp = jOp;
+ ComplexPattern JPat = jPat;
+
+ MxOperand OOp = oOp;
+ ComplexPattern OPat = oPat;
+
+ MxOperand EOp = eOp;
+ ComplexPattern EPat = ePat;
+
+ MxOperand POp = pOp;
+ ComplexPattern PPat = pPat;
+
+ MxOperand FOp = fOp;
+ ComplexPattern FPat = fPat;
+
+ MxOperand BOp = bOp;
+ ComplexPattern BPat = bPat;
+
+ MxOperand QOp = qOp;
+ ComplexPattern QPat = qPat;
+
+ MxOperand KOp = kOp;
+ ComplexPattern KPat = kPat;
+
+ MxOperand IOp = iOp;
+ PatFrag IPat = iPat;
+
+ PatFrag Load = load;
+}
+
+class MxType8Class<string rLet, MxOperand reg>
+ : MxType<i8, "b", "", rLet, reg,
+ MxARI8, MxCP_ARI,
+ MxARIPI8, MxCP_ARIPI,
+ MxARIPD8, MxCP_ARIPD,
+ MxARID8, MxCP_ARID,
+ MxARII8, MxCP_ARII,
+ MxAL8, MxCP_AL,
+ MxPCD8, MxCP_PCD,
+ MxPCI8, MxCP_PCI,
+ Mxi8imm, MximmSExt8,
+ Mxloadi8>;
+
+def MxType8 : MxType8Class<?,?>;
+
+class MxType16Class<string rLet, MxOperand reg>
+ : MxType<i16, "w", "", rLet, reg,
+ MxARI16, MxCP_ARI,
+ MxARIPI16, MxCP_ARIPI,
+ MxARIPD16, MxCP_ARIPD,
+ MxARID16, MxCP_ARID,
+ MxARII16, MxCP_ARII,
+ MxAL16, MxCP_AL,
+ MxPCD16, MxCP_PCD,
+ MxPCI16, MxCP_PCI,
+ Mxi16imm, MximmSExt16,
+ Mxloadi16>;
+
+def MxType16 : MxType16Class<?,?>;
+
+class MxType32Class<string rLet, MxOperand reg>
+ : MxType<i32, "l", "", rLet, reg,
+ MxARI32, MxCP_ARI,
+ MxARIPI32, MxCP_ARIPI,
+ MxARIPD32, MxCP_ARIPD,
+ MxARID32, MxCP_ARID,
+ MxARII32, MxCP_ARII,
+ MxAL32, MxCP_AL,
+ MxPCD32, MxCP_PCD,
+ MxPCI32, MxCP_PCI,
+ Mxi32imm, MximmSExt32,
+ Mxloadi32>;
+
+def MxType32 : MxType32Class<?,?>;
+
+
+def MxType8d : MxType8Class<"d", MxDRD8>;
+
+def MxType16d : MxType16Class<"d", MxDRD16>;
+def MxType16a : MxType16Class<"a", MxARD16>;
+def MxType16r : MxType16Class<"r", MxXRD16>;
+def MxType32d : MxType32Class<"d", MxDRD32>;
+def MxType32a : MxType32Class<"a", MxARD32>;
+def MxType32r : MxType32Class<"r", MxXRD32>;
+
+let Postfix = "_TC" in {
+def MxType16d_TC : MxType16Class<"d", MxDRD16_TC>;
+def MxType16a_TC : MxType16Class<"a", MxARD16_TC>;
+def MxType16r_TC : MxType16Class<"r", MxXRD16_TC>;
+def MxType32d_TC : MxType32Class<"d", MxDRD32_TC>;
+def MxType32a_TC : MxType32Class<"a", MxARD32_TC>;
+def MxType32r_TC : MxType32Class<"r", MxXRD32_TC>;
+}
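+// Instruction classes are parameterized over these fixtures and pick the
+// pieces they need by field, e.g. MxType32d.VT is i32, MxType32d.ROp is
+// MxDRD32 and MxType32d.Prefix is "l"; see MxSR_DD/MxSR_DI in
+// M68kInstrShiftRotate.td for classes written this way.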
+
+
+//===----------------------------------------------------------------------===//
+// Subsystems
+//===----------------------------------------------------------------------===//
+
+include "M68kInstrData.td"
+include "M68kInstrShiftRotate.td"
+include "M68kInstrBits.td"
+include "M68kInstrArithmetic.td"
+include "M68kInstrControl.td"
+
+include "M68kInstrCompiler.td"
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kInstrShiftRotate.td b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrShiftRotate.td
new file mode 100644
index 0000000..cab6876
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kInstrShiftRotate.td
@@ -0,0 +1,92 @@
+//===-- M68kInstrShiftRotate.td - Shift/Rotate Instrs -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes the shift and rotate instructions in the M68k architecture.
+/// Here is the current status of the file:
+///
+/// Machine:
+///
+/// SHL [~] ASR [~] LSR [~] SWAP [ ]
+/// ROL [~] ROR [~] ROXL [ ] ROXR [ ]
+///
+/// Map:
+///
+/// [ ] - was not touched at all
+/// [!] - requires external support to be implemented
+/// [~] - in progress but usable
+/// [x] - done
+///
+//===----------------------------------------------------------------------===//
+
+def MxRODI_R : MxBead1Bit<0>;
+def MxRODI_L : MxBead1Bit<1>;
+
+def MxROOP_AS : MxBead2Bits<0b00>;
+def MxROOP_LS : MxBead2Bits<0b01>;
+def MxROOP_ROX : MxBead2Bits<0b10>;
+def MxROOP_RO : MxBead2Bits<0b11>;
+
+/// ------------+---------+---+------+---+------+---------
+/// F E D C | B A 9 | 8 | 7 6 | 5 | 4 3 | 2 1 0
+/// ------------+---------+---+------+---+------+---------
+/// 1 1 1 0 | REG/IMM | D | SIZE |R/I| OP | REG
+/// ------------+---------+---+------+---+------+---------
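+///
+/// For example (an illustrative sketch, not verified against a reference
+/// assembler), an immediate form such as `lsl.l #2, %d1` would populate these
+/// fields with the shift count 2 in REG/IMM, D = 1 (MxRODI_L) for a left
+/// shift, the long-size encoding in SIZE, R/I = 0 for an immediate count,
+/// OP = 0b01 (MxROOP_LS), and the destination data register d1 in REG.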
+class MxSREncoding_R<MxBead1Bit DIRECTION, MxBead2Bits ROOP, MxEncSize SIZE>
+ : MxEncoding<MxBeadDReg<0>, ROOP, MxBead1Bit<1>, SIZE, DIRECTION,
+ MxBeadDReg<2>, MxBead4Bits<0b1110>>;
+
+class MxSREncoding_I<MxBead1Bit DIRECTION, MxBead2Bits ROOP, MxEncSize SIZE>
+ : MxEncoding<MxBeadDReg<0>, ROOP, MxBead1Bit<0>, SIZE, DIRECTION,
+ MxBead3Imm<2, 1>, MxBead4Bits<0b1110>>;
+
+// $reg <- $reg op $reg
+class MxSR_DD<string MN, MxType TYPE, SDNode NODE,
+ MxBead1Bit RODI, MxBead2Bits ROOP>
+ : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.ROp:$opd),
+ MN#"."#TYPE.Prefix#"\t$opd, $dst",
+ [(set TYPE.VT:$dst, (NODE TYPE.VT:$src, TYPE.VT:$opd))],
+ MxSREncoding_R<RODI, ROOP,
+ !cast<MxEncSize>("MxEncSize"#TYPE.Size)>>;
+
+// $reg <- $reg op $imm
+class MxSR_DI<string MN, MxType TYPE, SDNode NODE,
+ MxBead1Bit RODI, MxBead2Bits ROOP>
+ : MxInst<(outs TYPE.ROp:$dst),
+ (ins TYPE.ROp:$src, !cast<Operand>("Mxi"#TYPE.Size#"imm"):$opd),
+ MN#"."#TYPE.Prefix#"\t$opd, $dst",
+ [(set TYPE.VT:$dst,
+ (NODE TYPE.VT:$src,
+ !cast<ImmLeaf>("Mximm"#TYPE.Size#"_1to8"):$opd))],
+ MxSREncoding_I<RODI, ROOP,
+ !cast<MxEncSize>("MxEncSize"#TYPE.Size)>>;
+
+multiclass MxSROp<string MN, SDNode NODE, MxBead1Bit RODI, MxBead2Bits ROOP> {
+
+ let Defs = [CCR] in {
+ let Constraints = "$src = $dst" in {
+
+ def NAME#"8dd" : MxSR_DD<MN, MxType8d, NODE, RODI, ROOP>;
+ def NAME#"16dd" : MxSR_DD<MN, MxType16d, NODE, RODI, ROOP>;
+ def NAME#"32dd" : MxSR_DD<MN, MxType32d, NODE, RODI, ROOP>;
+
+ def NAME#"8di" : MxSR_DI<MN, MxType8d, NODE, RODI, ROOP>;
+ def NAME#"16di" : MxSR_DI<MN, MxType16d, NODE, RODI, ROOP>;
+ def NAME#"32di" : MxSR_DI<MN, MxType32d, NODE, RODI, ROOP>;
+
+ } // $src = $dst
+ } // Defs = [CCR]
+
+} // MxSROp
+
+defm SHL : MxSROp<"lsl", shl, MxRODI_L, MxROOP_LS>;
+defm LSR : MxSROp<"lsr", srl, MxRODI_R, MxROOP_LS>;
+defm ASR : MxSROp<"asr", sra, MxRODI_R, MxROOP_AS>;
+
+defm ROL : MxSROp<"rol", rotl, MxRODI_L, MxROOP_RO>;
+defm ROR : MxSROp<"ror", rotr, MxRODI_R, MxROOP_RO>;
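+
+// Each defm above expands into register-count and immediate-count variants for
+// all three sizes. A sketch of the generated record names, for reference:
+//
+//   SHL8dd, SHL16dd, SHL32dd   // $reg <- $reg op $reg
+//   SHL8di, SHL16di, SHL32di   // $reg <- $reg op $imm
+//
+// and likewise for LSR, ASR, ROL and ROR with their own direction and
+// operation beads.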
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kMCInstLower.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kMCInstLower.cpp
new file mode 100644
index 0000000..f143615
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kMCInstLower.cpp
@@ -0,0 +1,170 @@
+//===-- M68kMCInstLower.cpp - M68k MachineInstr to MCInst ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains code to lower M68k MachineInstrs to their
+/// corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68kMCInstLower.h"
+
+#include "M68kAsmPrinter.h"
+#include "M68kInstrInfo.h"
+
+#include "MCTargetDesc/M68kBaseInfo.h"
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "m68k-mc-inst-lower"
+
+M68kMCInstLower::M68kMCInstLower(MachineFunction &MF, M68kAsmPrinter &AP)
+ : Ctx(MF.getContext()), MF(MF), TM(MF.getTarget()), MAI(*TM.getMCAsmInfo()),
+ AsmPrinter(AP) {}
+
+MCSymbol *
+M68kMCInstLower::GetSymbolFromOperand(const MachineOperand &MO) const {
+ assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) &&
+ "Isn't a symbol reference");
+
+ const auto &TT = TM.getTargetTriple();
+ if (MO.isGlobal() && TT.isOSBinFormatELF())
+ return AsmPrinter.getSymbolPreferLocal(*MO.getGlobal());
+
+ const DataLayout &DL = MF.getDataLayout();
+
+ MCSymbol *Sym = nullptr;
+ SmallString<128> Name;
+ StringRef Suffix;
+
+ if (!Suffix.empty())
+ Name += DL.getPrivateGlobalPrefix();
+
+ if (MO.isGlobal()) {
+ const GlobalValue *GV = MO.getGlobal();
+ AsmPrinter.getNameWithPrefix(Name, GV);
+ } else if (MO.isSymbol()) {
+ Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
+ } else if (MO.isMBB()) {
+ assert(Suffix.empty());
+ Sym = MO.getMBB()->getSymbol();
+ }
+
+ Name += Suffix;
+ if (!Sym)
+ Sym = Ctx.getOrCreateSymbol(Name);
+
+ return Sym;
+}
+
+MCOperand M68kMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ // FIXME We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing. This FIXME originally comes from X86.
+ const MCExpr *Expr = nullptr;
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case M68kII::MO_NO_FLAG:
+ case M68kII::MO_ABSOLUTE_ADDRESS:
+ case M68kII::MO_PC_RELATIVE_ADDRESS:
+ break;
+ case M68kII::MO_GOTPCREL:
+ RefKind = MCSymbolRefExpr::VK_GOTPCREL;
+ break;
+ case M68kII::MO_GOT:
+ RefKind = MCSymbolRefExpr::VK_GOT;
+ break;
+ case M68kII::MO_GOTOFF:
+ RefKind = MCSymbolRefExpr::VK_GOTOFF;
+ break;
+ case M68kII::MO_PLT:
+ RefKind = MCSymbolRefExpr::VK_PLT;
+ break;
+ }
+
+ if (!Expr) {
+ Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
+ }
+
+ if (!MO.isJTI() && !MO.isMBB() && MO.getOffset()) {
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ }
+
+ return MCOperand::createExpr(Expr);
+}
+
+Optional<MCOperand>
+M68kMCInstLower::LowerOperand(const MachineInstr *MI,
+ const MachineOperand &MO) const {
+ switch (MO.getType()) {
+ default:
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
+ return None;
+ return MCOperand::createReg(MO.getReg());
+ case MachineOperand::MO_Immediate:
+ return MCOperand::createImm(MO.getImm());
+ case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ return LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+ case MachineOperand::MO_MCSymbol:
+ return LowerSymbolOperand(MO, MO.getMCSymbol());
+ case MachineOperand::MO_JumpTableIndex:
+ return LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
+ case MachineOperand::MO_ConstantPoolIndex:
+ return LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()));
+ case MachineOperand::MO_BlockAddress:
+ return LowerSymbolOperand(
+ MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
+ case MachineOperand::MO_RegisterMask:
+ // Ignore call clobbers.
+ return None;
+ }
+}
+
+void M68kMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+ unsigned Opcode = MI->getOpcode();
+ OutMI.setOpcode(Opcode);
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ Optional<MCOperand> MCOp = LowerOperand(MI, MO);
+
+ if (MCOp.hasValue() && MCOp.getValue().isValid())
+ OutMI.addOperand(MCOp.getValue());
+ }
+
+ // TAILJMPj, TAILJMPq - Lower to the correct jump instructions.
+ if (Opcode == M68k::TAILJMPj || Opcode == M68k::TAILJMPq) {
+ assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands");
+ switch (Opcode) {
+ case M68k::TAILJMPj:
+ Opcode = M68k::JMP32j;
+ break;
+ case M68k::TAILJMPq:
+ Opcode = M68k::BRA8;
+ break;
+ }
+ OutMI.setOpcode(Opcode);
+ }
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kMCInstLower.h b/src/llvm-project/llvm/lib/Target/M68k/M68kMCInstLower.h
new file mode 100644
index 0000000..d616062
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kMCInstLower.h
@@ -0,0 +1,54 @@
+//===-- M68kMCInstLower.h - Lower MachineInstr to MCInst -----*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains code to lower M68k MachineInstrs to their
+/// corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_M68KMCINSTLOWER_H
+#define LLVM_LIB_TARGET_M68K_M68KMCINSTLOWER_H
+
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class MCContext;
+class MCInst;
+class MCOperand;
+class MachineInstr;
+class MachineFunction;
+class M68kAsmPrinter;
+
+/// This class is used to lower a MachineInstr into an MCInst.
+class M68kMCInstLower {
+ typedef MachineOperand::MachineOperandType MachineOperandType;
+ MCContext &Ctx;
+ MachineFunction &MF;
+ const TargetMachine &TM;
+ const MCAsmInfo &MAI;
+ M68kAsmPrinter &AsmPrinter;
+
+public:
+ M68kMCInstLower(MachineFunction &MF, M68kAsmPrinter &AP);
+
+ /// Lower an MO_GlobalAddress or MO_ExternalSymbol operand to an MCSymbol.
+ MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
+
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+ Optional<MCOperand> LowerOperand(const MachineInstr *MI,
+ const MachineOperand &MO) const;
+
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+};
+} // namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kMachineFunction.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kMachineFunction.cpp
new file mode 100644
index 0000000..3d048df
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kMachineFunction.cpp
@@ -0,0 +1,20 @@
+//===-- M68kMachineFunction.cpp - M68k private data --------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "M68kMachineFunction.h"
+
+#include "M68kInstrInfo.h"
+#include "M68kSubtarget.h"
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+void M68kMachineFunctionInfo::anchor() {}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kMachineFunction.h b/src/llvm-project/llvm/lib/Target/M68k/M68kMachineFunction.h
new file mode 100644
index 0000000..5760bdd
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kMachineFunction.h
@@ -0,0 +1,114 @@
+//===-- M68kMachineFunction.h - M68k private data -------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the M68k specific subclass of MachineFunctionInfo.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_M68KMACHINEFUNCTION_H
+#define LLVM_LIB_TARGET_M68K_M68KMACHINEFUNCTION_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Support/MachineValueType.h"
+
+namespace llvm {
+
+class M68kMachineFunctionInfo : public MachineFunctionInfo {
+ MachineFunction &MF;
+
+  /// Non-zero if the function has a base pointer and makes a call to
+ /// llvm.eh.sjlj.setjmp. When non-zero, the value is a displacement from the
+ /// frame pointer to a slot where the base pointer is stashed.
+ signed char RestoreBasePointerOffset = 0;
+
+ /// Size of the callee-saved register portion of the stack frame in bytes.
+ unsigned CalleeSavedFrameSize = 0;
+
+  /// Number of bytes the function pops on return (in addition to the space
+  /// used by the return address). Used on Windows platforms for stdcall and
+  /// fastcall name decoration.
+ unsigned BytesToPopOnReturn = 0;
+
+ /// FrameIndex for return slot.
+ int ReturnAddrIndex = 0;
+
+ /// The number of bytes by which return address stack slot is moved as the
+ /// result of tail call optimization.
+ int TailCallReturnAddrDelta = 0;
+
+  /// Keeps track of the virtual register initialized for use as the global
+  /// base register. This is used by some PIC relocation models.
+ unsigned GlobalBaseReg = 0;
+
+ /// FrameIndex for start of varargs area.
+ int VarArgsFrameIndex = 0;
+
+ /// Keeps track of whether this function uses sequences of pushes to pass
+ /// function parameters.
+ bool HasPushSequences = false;
+
+ /// Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg = 0;
+
+ /// A list of virtual and physical registers that must be forwarded to every
+ /// musttail call.
+ SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
+
+ /// The number of bytes on stack consumed by the arguments being passed on
+ /// the stack.
+ unsigned ArgumentStackSize = 0;
+
+public:
+ explicit M68kMachineFunctionInfo(MachineFunction &MF) : MF(MF) {}
+
+ bool getRestoreBasePointer() const { return RestoreBasePointerOffset != 0; }
+ void setRestoreBasePointer(const MachineFunction *MF);
+ int getRestoreBasePointerOffset() const { return RestoreBasePointerOffset; }
+
+ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+ unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+ void setBytesToPopOnReturn(unsigned bytes) { BytesToPopOnReturn = bytes; }
+
+ int getRAIndex() const { return ReturnAddrIndex; }
+ void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+ int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+ void setTCReturnAddrDelta(int delta) { TailCallReturnAddrDelta = delta; }
+
+ unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+ int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+ bool getHasPushSequences() const { return HasPushSequences; }
+ void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
+
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+ unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+ void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+ SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
+ return ForwardedMustTailRegParms;
+ }
+
+private:
+ virtual void anchor();
+};
+
+} // end of namespace llvm
+
+#endif // LLVM_LIB_TARGET_M68K_M68KMACHINEFUNCTION_H
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kRegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kRegisterInfo.cpp
new file mode 100644
index 0000000..69d1603
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kRegisterInfo.cpp
@@ -0,0 +1,267 @@
+//===-- M68kRegisterInfo.cpp - M68k Register Information -----*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the M68k implementation of the TargetRegisterInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68kRegisterInfo.h"
+
+#include "M68k.h"
+#include "M68kMachineFunction.h"
+#include "M68kSubtarget.h"
+
+#include "MCTargetDesc/M68kMCTargetDesc.h"
+
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "M68kGenRegisterInfo.inc"
+
+#define DEBUG_TYPE "m68k-reg-info"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableBasePointer(
+ "m68k-use-base-pointer", cl::Hidden, cl::init(true),
+ cl::desc("Enable use of a base pointer for complex stack frames"));
+
+// Pin the vtable to this file.
+void M68kRegisterInfo::anchor() {}
+
+M68kRegisterInfo::M68kRegisterInfo(const M68kSubtarget &ST)
+    // FIXME x26: not sure if this is the correct value; it expects RA, but
+    // M68k passes IP anyway. How does this work?
+ : M68kGenRegisterInfo(M68k::A0, 0, 0, M68k::PC), Subtarget(ST) {
+ StackPtr = M68k::SP;
+ FramePtr = M68k::A6;
+ GlobalBasePtr = M68k::A5;
+ BasePtr = M68k::A4;
+}
+
+//===----------------------------------------------------------------------===//
+// Callee Saved Registers methods
+//===----------------------------------------------------------------------===//
+
+const MCPhysReg *
+M68kRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ return CSR_STD_SaveList;
+}
+
+const uint32_t *
+M68kRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const {
+ return CSR_STD_RegMask;
+}
+
+const TargetRegisterClass *
+M68kRegisterInfo::getRegsForTailCall(const MachineFunction &MF) const {
+ return &M68k::XR32_TCRegClass;
+}
+
+unsigned
+M68kRegisterInfo::getMatchingMegaReg(unsigned Reg,
+ const TargetRegisterClass *RC) const {
+ for (MCSuperRegIterator Super(Reg, this); Super.isValid(); ++Super)
+ if (RC->contains(*Super))
+ return *Super;
+ return 0;
+}
+
+const TargetRegisterClass *
+M68kRegisterInfo::getMaximalPhysRegClass(unsigned reg, MVT VT) const {
+ assert(Register::isPhysicalRegister(reg) &&
+ "reg must be a physical register");
+
+  // Pick the biggest register class of the right type that contains
+  // this physreg.
+ const TargetRegisterClass *BestRC = nullptr;
+ for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E;
+ ++I) {
+ const TargetRegisterClass *RC = *I;
+ if ((VT == MVT::Other || isTypeLegalForClass(*RC, VT)) &&
+ RC->contains(reg) &&
+ (!BestRC ||
+ (BestRC->hasSubClass(RC) && RC->getNumRegs() > BestRC->getNumRegs())))
+ BestRC = RC;
+ }
+
+ assert(BestRC && "Couldn't find the register class");
+ return BestRC;
+}
+
+int M68kRegisterInfo::getRegisterOrder(unsigned Reg,
+ const TargetRegisterClass &TRC) const {
+ for (unsigned i = 0; i < TRC.getNumRegs(); ++i) {
+ if (regsOverlap(Reg, TRC.getRegister(i))) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+int M68kRegisterInfo::getSpillRegisterOrder(unsigned Reg) const {
+ int Result = getRegisterOrder(Reg, *getRegClass(M68k::SPILLRegClassID));
+ assert(Result >= 0 && "Can not determine spill order");
+ return Result;
+}
+
+BitVector M68kRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ const M68kFrameLowering *TFI = getFrameLowering(MF);
+
+ BitVector Reserved(getNumRegs());
+
+  // Mark a register, its sub-registers and its aliases as reserved.
+ auto setBitVector = [&Reserved, this](unsigned Reg) {
+ for (MCRegAliasIterator I(Reg, this, /* self */ true); I.isValid(); ++I) {
+ Reserved.set(*I);
+ }
+ for (MCSubRegIterator I(Reg, this, /* self */ true); I.isValid(); ++I) {
+ Reserved.set(*I);
+ }
+ };
+
+ // Registers reserved by users
+ for (size_t Reg = 0, Total = getNumRegs(); Reg != Total; ++Reg) {
+ if (MF.getSubtarget<M68kSubtarget>().isRegisterReservedByUser(Reg))
+ setBitVector(Reg);
+ }
+
+ setBitVector(M68k::PC);
+ setBitVector(M68k::SP);
+
+ if (TFI->hasFP(MF)) {
+ setBitVector(FramePtr);
+ }
+
+ // Set the base-pointer register and its aliases as reserved if needed.
+ if (hasBasePointer(MF)) {
+ CallingConv::ID CC = MF.getFunction().getCallingConv();
+ const uint32_t *RegMask = getCallPreservedMask(MF, CC);
+ if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+      report_fatal_error("Stack realignment in presence of dynamic allocas is "
+                         "not supported with "
+                         "this calling convention.");
+
+ setBitVector(getBaseRegister());
+ }
+
+ return Reserved;
+}
+
+void M68kRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const M68kFrameLowering *TFI = getFrameLowering(MF);
+
+  // We have either the (i,An,Rn) or the (i,An) EA form.
+  // NOTE: Base contains the FI; we need to step back one operand to get Disp.
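+  // For example, a frame access here carries operands (..., Disp=<imm>,
+  // Base=<fi#N>, ...); below, Base is rewritten to a real base register and
+  // the frame object's offset is folded into Disp.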
+ MachineOperand &Disp = MI.getOperand(FIOperandNum - 1);
+ MachineOperand &Base = MI.getOperand(FIOperandNum);
+
+ int Imm = (int)(Disp.getImm());
+ int FIndex = (int)(Base.getIndex());
+
+ // FIXME tail call: implement jmp from mem
+ bool AfterFPPop = false;
+
+ unsigned BasePtr;
+ if (hasBasePointer(MF))
+ BasePtr = (FIndex < 0 ? FramePtr : getBaseRegister());
+ else if (hasStackRealignment(MF))
+ BasePtr = (FIndex < 0 ? FramePtr : StackPtr);
+ else if (AfterFPPop)
+ BasePtr = StackPtr;
+ else
+ BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr);
+
+ Base.ChangeToRegister(BasePtr, false);
+
+ // Now add the frame object offset to the offset from FP.
+ int64_t FIOffset;
+ Register IgnoredFrameReg;
+ if (AfterFPPop) {
+ // Tail call jmp happens after FP is popped.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ FIOffset = MFI.getObjectOffset(FIndex) - TFI->getOffsetOfLocalArea();
+ } else {
+ FIOffset =
+ TFI->getFrameIndexReference(MF, FIndex, IgnoredFrameReg).getFixed();
+ }
+
+ if (BasePtr == StackPtr)
+ FIOffset += SPAdj;
+
+ Disp.ChangeToImmediate(FIOffset + Imm);
+}
+
+bool M68kRegisterInfo::requiresRegisterScavenging(
+ const MachineFunction &MF) const {
+ return true;
+}
+
+bool M68kRegisterInfo::trackLivenessAfterRegAlloc(
+ const MachineFunction &MF) const {
+ return true;
+}
+
+static bool CantUseSP(const MachineFrameInfo &MFI) {
+ return MFI.hasVarSizedObjects() || MFI.hasOpaqueSPAdjustment();
+}
+
+bool M68kRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ if (!EnableBasePointer)
+ return false;
+
+ // When we need stack realignment, we can't address the stack from the frame
+ // pointer. When we have dynamic allocas or stack-adjusting inline asm, we
+ // can't address variables from the stack pointer. MS inline asm can
+ // reference locals while also adjusting the stack pointer. When we can't
+ // use both the SP and the FP, we need a separate base pointer register.
+ bool CantUseFP = hasStackRealignment(MF);
+ return CantUseFP && CantUseSP(MFI);
+}
+
+bool M68kRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ if (!TargetRegisterInfo::canRealignStack(MF))
+ return false;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+ // Stack realignment requires a frame pointer. If we already started
+ // register allocation with frame pointer elimination, it is too late now.
+ if (!MRI->canReserveReg(FramePtr))
+ return false;
+
+  // If a base pointer is necessary, check that it isn't too late to reserve it.
+ if (CantUseSP(MFI))
+ return MRI->canReserveReg(BasePtr);
+
+ return true;
+}
+
+Register M68kRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ return TFI->hasFP(MF) ? FramePtr : StackPtr;
+}
+
+const TargetRegisterClass *M68kRegisterInfo::intRegClass(unsigned size) const {
+ return &M68k::DR32RegClass;
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kRegisterInfo.h b/src/llvm-project/llvm/lib/Target/M68k/M68kRegisterInfo.h
new file mode 100644
index 0000000..51b9429
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kRegisterInfo.h
@@ -0,0 +1,109 @@
+//===-- M68kRegisterInfo.h - M68k Register Information Impl --*- C++ --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the M68k implementation of the TargetRegisterInfo
+/// class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_M68KREGISTERINFO_H
+#define LLVM_LIB_TARGET_M68K_M68KREGISTERINFO_H
+
+#include "M68k.h"
+
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "M68kGenRegisterInfo.inc"
+
+namespace llvm {
+class M68kSubtarget;
+class TargetInstrInfo;
+class Type;
+
+class M68kRegisterInfo : public M68kGenRegisterInfo {
+ virtual void anchor();
+
+ /// Physical register used as stack ptr.
+ unsigned StackPtr;
+
+ /// Physical register used as frame ptr.
+ unsigned FramePtr;
+
+  /// Physical register used as a base ptr in complex stack frames, i.e., when
+  /// we need a third base register, not just SP and FP, due to variable-sized
+  /// stack objects.
+ unsigned BasePtr;
+
+ /// Physical register used to store GOT address if needed.
+ unsigned GlobalBasePtr;
+
+protected:
+ const M68kSubtarget &Subtarget;
+
+public:
+ M68kRegisterInfo(const M68kSubtarget &Subtarget);
+
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+
+ /// Returns a register class with registers that can be used in forming tail
+ /// calls.
+ const TargetRegisterClass *
+ getRegsForTailCall(const MachineFunction &MF) const;
+
+  /// Return a mega-register of the specified register Reg, i.e. a
+  /// super-register of Reg that belongs to the given register class RC and
+  /// therefore shares a physical register with Reg. Returns 0 if RC contains
+  /// no such super-register.
+  /// NOTE: not sure about the "mega" term, though.
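+  /// For example (a sketch of the intent, based on the register definitions
+  /// in M68kRegisterInfo.td), given the 16-bit register WD0 and the DR32
+  /// register class this is expected to return D0.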
+ unsigned getMatchingMegaReg(unsigned Reg,
+ const TargetRegisterClass *RC) const;
+
+ /// Returns the Register Class of a physical register of the given type,
+ /// picking the biggest register class of the right type that contains this
+ /// physreg.
+ const TargetRegisterClass *getMaximalPhysRegClass(unsigned reg, MVT VT) const;
+
+  /// Return the index of a register within a register class, or -1 otherwise.
+ int getRegisterOrder(unsigned Reg, const TargetRegisterClass &TRC) const;
+
+  /// Return the spill order index of a register; asserts if there is none.
+ int getSpillRegisterOrder(unsigned Reg) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+  /// FrameIndex represents objects inside an abstract stack. We must replace
+  /// FrameIndex with a direct stack/frame pointer reference.
+ void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ bool hasBasePointer(const MachineFunction &MF) const;
+
+ /// True if the stack can be realigned for the target.
+ bool canRealignStack(const MachineFunction &MF) const override;
+
+ Register getFrameRegister(const MachineFunction &MF) const override;
+ unsigned getStackRegister() const { return StackPtr; }
+ unsigned getBaseRegister() const { return BasePtr; }
+ unsigned getGlobalBaseRegister() const { return GlobalBasePtr; }
+
+ const TargetRegisterClass *intRegClass(unsigned Size) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kRegisterInfo.td b/src/llvm-project/llvm/lib/Target/M68k/M68kRegisterInfo.td
new file mode 100644
index 0000000..e2ea296
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kRegisterInfo.td
@@ -0,0 +1,127 @@
+//== M68kRegisterInfo.td - M68k register definitions ----*- tablegen -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes the M68k Register file, defining the registers
+/// aliases between the registers, and the register classes built out of the
+/// registers.
+///
+//===----------------------------------------------------------------------===//
+
+class MxReg<string N, bits<16> ENC,
+ list<Register> SUBREGS = [], list<SubRegIndex> SUBIDX,
+ list<int> DWREGS = [], list<string> ALTNAMES = []>
+ : Register<N, ALTNAMES>, DwarfRegNum<DWREGS> {
+ let Namespace = "M68k";
+ let HWEncoding = ENC;
+ let SubRegs = SUBREGS;
+ let SubRegIndices = SUBIDX;
+}
+
+// Subregister indices.
+let Namespace = "M68k" in {
+ def MxSubRegIndex8Lo : SubRegIndex<8, 0>;
+ def MxSubRegIndex16Lo : SubRegIndex<16, 0>;
+}
+
+multiclass MxDataRegister<int INDEX, string REG_NAME, list<string> ALTNAMES = []> {
+ def "B"#NAME : MxReg<REG_NAME, INDEX, [], [], [INDEX], ALTNAMES>;
+ def "W"#NAME
+ : MxReg<REG_NAME, INDEX,
+ [!cast<Register>("B"#NAME)], [MxSubRegIndex8Lo],
+ [INDEX], ALTNAMES>;
+ def NAME
+ : MxReg<REG_NAME, INDEX,
+ [!cast<Register>("W"#NAME)], [MxSubRegIndex16Lo],
+ [INDEX], ALTNAMES>;
+}
+
+multiclass MxAddressRegister<int INDEX, string REG_NAME, list<string> ALTNAMES = []> {
+ def "W"#NAME
+ : MxReg<REG_NAME, INDEX, [], [], [!add(8,INDEX)], ALTNAMES>;
+ def NAME
+ : MxReg<REG_NAME, INDEX,
+ [!cast<Register>("W"#NAME)], [MxSubRegIndex16Lo],
+ [!add(8,INDEX)], ALTNAMES>;
+}
+
+defm D0 : MxDataRegister<0, "d0">;
+defm D1 : MxDataRegister<1, "d1">;
+defm D2 : MxDataRegister<2, "d2">;
+defm D3 : MxDataRegister<3, "d3">;
+defm D4 : MxDataRegister<4, "d4">;
+defm D5 : MxDataRegister<5, "d5">;
+defm D6 : MxDataRegister<6, "d6">;
+defm D7 : MxDataRegister<7, "d7">;
+
+defm A0 : MxAddressRegister<0, "a0">;
+defm A1 : MxAddressRegister<1, "a1">;
+defm A2 : MxAddressRegister<2, "a2">;
+defm A3 : MxAddressRegister<3, "a3">;
+defm A4 : MxAddressRegister<4, "a4">;
+defm A5 : MxAddressRegister<5, "a5", ["bp"]>;
+defm A6 : MxAddressRegister<6, "a6", ["fp"]>;
+defm SP : MxAddressRegister<7, "sp", ["usp", "ssp", "isp", "a7"]>;
+
+
+// Pseudo Registers
+class MxPseudoReg<string N, list<Register> SUBREGS = [], list<SubRegIndex> SUBIDX = []>
+ : MxReg<N, 0, SUBREGS, SUBIDX>;
+
+def CCR : MxPseudoReg<"ccr">;
+def SR : MxPseudoReg<"sr">;
+
+def PC : MxPseudoReg<"pc">;
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+class MxRegClass<list<ValueType> regTypes, int alignment, dag regList>
+ : RegisterClass<"M68k", regTypes, alignment, regList>;
+
+// Data Registers
+def DR8 : MxRegClass<[i8], 16, (sequence "BD%u", 0, 7)>;
+def DR16 : MxRegClass<[i16], 16, (sequence "WD%u", 0, 7)>;
+def DR32 : MxRegClass<[i32], 32, (sequence "D%u", 0, 7)>;
+
+// Address Registers
+def AR16 : MxRegClass<[i16], 16, (add (sequence "WA%u", 0, 6), WSP)>;
+def AR32 : MxRegClass<[i32], 32, (add (sequence "A%u", 0, 6), SP)>;
+
+def AR32_NOSP : MxRegClass<[i32], 32, (sequence "A%u", 0, 6)>;
+
+// Index Register Classes
+// FIXME try alternative ordering like `D0, D1, A0, A1, ...`
+def XR16 : MxRegClass<[i16], 16, (add DR16, AR16)>;
+def XR32 : MxRegClass<[i32], 32, (add DR32, AR32)>;
+
+def SPC : MxRegClass<[i32], 32, (add SP)>;
+
+let CopyCost = -1 in {
+ def CCRC : MxRegClass<[i8], 16, (add CCR)>;
+ def SRC : MxRegClass<[i16], 16, (add SR)>;
+}
+
+let isAllocatable = 0 in {
+ def PCC : MxRegClass<[i32], 32, (add PC)>;
+}
+
+// Registers used for tail calls.
+def DR16_TC : MxRegClass<[i16], 16, (add D0, D1)>;
+def DR32_TC : MxRegClass<[i32], 32, (add D0, D1)>;
+
+def AR16_TC : MxRegClass<[i16], 16, (add A0, A1)>;
+def AR32_TC : MxRegClass<[i32], 32, (add A0, A1)>;
+
+def XR16_TC : MxRegClass<[i16], 16, (add DR16_TC, AR16_TC)>;
+def XR32_TC : MxRegClass<[i32], 32, (add DR32_TC, AR32_TC)>;
+
+// These classes provide spill/restore order if used with the MOVEM instruction.
+def SPILL : MxRegClass<[i32], 32, (add XR32)>;
+def SPILL_R : MxRegClass<[i32], 32, (add SP, (sequence "A%u", 6, 0), (sequence "D%u", 7, 0))>;
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kSchedule.td b/src/llvm-project/llvm/lib/Target/M68k/M68kSchedule.td
new file mode 100644
index 0000000..a94cd8f
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kSchedule.td
@@ -0,0 +1,23 @@
+//===-- M68kSchedule.td - M68k Scheduling Definitions --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains M68k scheduler definitions.
+///
+//===----------------------------------------------------------------------===//
+
+/// This is a very general M68k scheduling model, best suited for the very
+/// first M68000 CPU; other models must override these characteristics.
+class M68kSchedModel : SchedMachineModel {
+ let LoadLatency = 4; // Word (Rn)
+ let HighLatency = 16; // Long ABS
+ let PostRAScheduler = 0;
+ let CompleteModel = 0;
+}
+
+def GenericM68kModel : M68kSchedModel;
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kSubtarget.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kSubtarget.cpp
new file mode 100644
index 0000000..963e83c
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kSubtarget.cpp
@@ -0,0 +1,259 @@
+//===-- M68kSubtarget.cpp - M68k Subtarget Information ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the M68k specific subclass of TargetSubtargetInfo.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68kSubtarget.h"
+#include "GlSel/M68kCallLowering.h"
+#include "GlSel/M68kLegalizerInfo.h"
+#include "GlSel/M68kRegisterBankInfo.h"
+
+#include "M68k.h"
+#include "M68kMachineFunction.h"
+#include "M68kRegisterInfo.h"
+#include "M68kTargetMachine.h"
+
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "m68k-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "M68kGenSubtargetInfo.inc"
+
+extern bool FixGlobalBaseReg;
+
+/// Select the M68k CPU for the given triple and cpu name.
+static StringRef selectM68kCPU(Triple TT, StringRef CPU) {
+ if (CPU.empty() || CPU == "generic") {
+ CPU = "M68000";
+ }
+ return CPU;
+}
+
+void M68kSubtarget::anchor() {}
+
+M68kSubtarget::M68kSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const M68kTargetMachine &TM)
+ : M68kGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
+ UserReservedRegister(M68k::NUM_TARGET_REGS), TM(TM), TSInfo(),
+ InstrInfo(initializeSubtargetDependencies(CPU, TT, FS, TM)),
+ FrameLowering(*this, this->getStackAlignment()), TLInfo(TM, *this),
+ TargetTriple(TT) {
+ CallLoweringInfo.reset(new M68kCallLowering(*getTargetLowering()));
+ Legalizer.reset(new M68kLegalizerInfo(*this));
+
+ auto *RBI = new M68kRegisterBankInfo(*getRegisterInfo());
+ RegBankInfo.reset(RBI);
+ InstSelector.reset(createM68kInstructionSelector(TM, *this, *RBI));
+}
+
+const CallLowering *M68kSubtarget::getCallLowering() const {
+ return CallLoweringInfo.get();
+}
+
+InstructionSelector *M68kSubtarget::getInstructionSelector() const {
+ return InstSelector.get();
+}
+
+const LegalizerInfo *M68kSubtarget::getLegalizerInfo() const {
+ return Legalizer.get();
+}
+
+const RegisterBankInfo *M68kSubtarget::getRegBankInfo() const {
+ return RegBankInfo.get();
+}
+
+bool M68kSubtarget::isPositionIndependent() const {
+ return TM.isPositionIndependent();
+}
+
+bool M68kSubtarget::isLegalToCallImmediateAddr() const { return true; }
+
+bool M68kSubtarget::abiUsesSoftFloat() const { return true; }
+
+M68kSubtarget &M68kSubtarget::initializeSubtargetDependencies(
+ StringRef CPU, Triple TT, StringRef FS, const M68kTargetMachine &TM) {
+ std::string CPUName = selectM68kCPU(TT, CPU).str();
+
+ // Parse features string.
+ ParseSubtargetFeatures(CPUName, CPUName, FS);
+
+ // Initialize scheduling itinerary for the specified CPU.
+ InstrItins = getInstrItineraryForCPU(CPUName);
+
+ stackAlignment = 8;
+
+ return *this;
+}
+
+//===----------------------------------------------------------------------===//
+// Code Model
+//
+// Key assumptions:
+// - Whenever possible we use pc-rel encoding since it is smaller (16 bit)
+//   than absolute (32 bit).
+// - GOT is reachable within 16 bit offset for both Small and Medium models.
+// - Code section is reachable within 16 bit offset for both models.
+//
+// ---------------------+-------------------------+--------------------------
+// | Small | Medium
+// +-------------------------+------------+-------------
+// | Static | PIC | Static | PIC
+// ---------------------+------------+------------+------------+-------------
+// branch | pc-rel | pc-rel | pc-rel | pc-rel
+// ---------------------+------------+------------+------------+-------------
+// call global | @PLT | @PLT | @PLT | @PLT
+// ---------------------+------------+------------+------------+-------------
+// call internal | pc-rel | pc-rel | pc-rel | pc-rel
+// ---------------------+------------+------------+------------+-------------
+// data local | pc-rel | pc-rel | ~pc-rel | ^pc-rel
+// ---------------------+------------+------------+------------+-------------
+// data local big* | pc-rel | pc-rel | absolute | @GOTOFF
+// ---------------------+------------+------------+------------+-------------
+// data global | pc-rel | @GOTPCREL | ~pc-rel | @GOTPCREL
+// ---------------------+------------+------------+------------+-------------
+// data global big* | pc-rel | @GOTPCREL | absolute | @GOTPCREL
+// ---------------------+------------+------------+------------+-------------
+//
+// * Big data potentially cannot be reached within a 16 bit offset and requires
+//   special handling for old (x00 and x10) CPUs. Normally these symbols go into
+//   a separate .ldata section which is mapped after the normal .data and .text,
+//   but I don't really know yet how this must be done for M68k; I will try to
+//   dig this info out from GCC. For now, CPUs prior to M68020 will use static
+//   references for the Static model and @GOT-based references for PIC.
+//
+// ~ These are absolute for older CPUs for now.
+// ^ These are @GOTOFF for older CPUs for now.
+//===----------------------------------------------------------------------===//
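+
+// For instance, under the scheme above a PIC Medium-model reference to local
+// data on a pre-M68020 CPU is expected to resolve to @GOTOFF, while the same
+// reference on an M68020 or better stays pc-rel (see classifyLocalReference
+// below).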
+
+/// Classify a blockaddress reference for the current subtarget according to how
+/// we should reference it in a non-pcrel context.
+unsigned char M68kSubtarget::classifyBlockAddressReference() const {
+  // Unless we start to support the Large code model, branching is always pc-rel.
+ return M68kII::MO_PC_RELATIVE_ADDRESS;
+}
+
+unsigned char
+M68kSubtarget::classifyLocalReference(const GlobalValue *GV) const {
+ switch (TM.getCodeModel()) {
+ default:
+ llvm_unreachable("Unsupported code model");
+ case CodeModel::Small:
+ case CodeModel::Kernel: {
+ return M68kII::MO_PC_RELATIVE_ADDRESS;
+ }
+ case CodeModel::Medium: {
+ if (isPositionIndependent()) {
+      // On M68020 and better we can fit any big data offset into the disp field.
+ if (atLeastM68020()) {
+ return M68kII::MO_PC_RELATIVE_ADDRESS;
+ }
+      // Otherwise we could check the data size and make sure it will fit into a
+      // 16 bit offset. For now we will be conservative and go with @GOTOFF.
+ return M68kII::MO_GOTOFF;
+ } else {
+ if (atLeastM68020()) {
+ return M68kII::MO_PC_RELATIVE_ADDRESS;
+ }
+ return M68kII::MO_ABSOLUTE_ADDRESS;
+ }
+ }
+ }
+}
+
+unsigned char M68kSubtarget::classifyExternalReference(const Module &M) const {
+ if (TM.shouldAssumeDSOLocal(M, nullptr))
+ return classifyLocalReference(nullptr);
+
+ if (isPositionIndependent())
+ return M68kII::MO_GOTPCREL;
+
+ return M68kII::MO_GOT;
+}
+
+unsigned char
+M68kSubtarget::classifyGlobalReference(const GlobalValue *GV) const {
+ return classifyGlobalReference(GV, *GV->getParent());
+}
+
+unsigned char M68kSubtarget::classifyGlobalReference(const GlobalValue *GV,
+ const Module &M) const {
+ if (TM.shouldAssumeDSOLocal(M, GV))
+ return classifyLocalReference(GV);
+
+ switch (TM.getCodeModel()) {
+ default:
+ llvm_unreachable("Unsupported code model");
+ case CodeModel::Small:
+ case CodeModel::Kernel: {
+ if (isPositionIndependent())
+ return M68kII::MO_GOTPCREL;
+ return M68kII::MO_PC_RELATIVE_ADDRESS;
+ }
+ case CodeModel::Medium: {
+ if (isPositionIndependent())
+ return M68kII::MO_GOTPCREL;
+
+ if (atLeastM68020())
+ return M68kII::MO_PC_RELATIVE_ADDRESS;
+
+ return M68kII::MO_ABSOLUTE_ADDRESS;
+ }
+ }
+}
+
+unsigned M68kSubtarget::getJumpTableEncoding() const {
+ if (isPositionIndependent()) {
+    // The only time we want to use GOTOFF (with EK_Custom32) is when the
+    // potential delta between the jump target and the table base can be larger
+    // than the displacement field, which is true for older CPUs (16 bit disp)
+    // in the Medium model (which can have large data way beyond 16 bit).
+ if (TM.getCodeModel() == CodeModel::Medium && !atLeastM68020())
+ return MachineJumpTableInfo::EK_Custom32;
+
+ return MachineJumpTableInfo::EK_LabelDifference32;
+ }
+
+  // In non-PIC modes, just use the address of a block.
+ return MachineJumpTableInfo::EK_BlockAddress;
+}
+
+unsigned char
+M68kSubtarget::classifyGlobalFunctionReference(const GlobalValue *GV) const {
+ return classifyGlobalFunctionReference(GV, *GV->getParent());
+}
+
+unsigned char
+M68kSubtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
+ const Module &M) const {
+  // Local symbols always use pc-rel referencing.
+ if (TM.shouldAssumeDSOLocal(M, GV))
+ return M68kII::MO_NO_FLAG;
+
+ // If the function is marked as non-lazy, generate an indirect call
+ // which loads from the GOT directly. This avoids run-time overhead
+ // at the cost of eager binding.
+ auto *F = dyn_cast_or_null<Function>(GV);
+ if (F && F->hasFnAttribute(Attribute::NonLazyBind)) {
+ return M68kII::MO_GOTPCREL;
+ }
+
+  // Otherwise the linker will figure this out.
+ return M68kII::MO_PLT;
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kSubtarget.h b/src/llvm-project/llvm/lib/Target/M68k/M68kSubtarget.h
new file mode 100644
index 0000000..f45cb7e
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kSubtarget.h
@@ -0,0 +1,182 @@
+//===-- M68kSubtarget.h - Define Subtarget for the M68k -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the M68k specific subclass of TargetSubtargetInfo.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_M68KSUBTARGET_H
+#define LLVM_LIB_TARGET_M68K_M68KSUBTARGET_H
+
+#include "M68kFrameLowering.h"
+#include "M68kISelLowering.h"
+#include "M68kInstrInfo.h"
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Alignment.h"
+
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "M68kGenSubtargetInfo.inc"
+
+extern bool M68kReserveGP;
+extern bool M68kNoCpload;
+
+namespace llvm {
+class StringRef;
+
+class M68kTargetMachine;
+
+class M68kSubtarget : public M68kGenSubtargetInfo {
+ virtual void anchor();
+
+protected:
+  // These define which ISA is supported. Since each Motorola M68k ISA is
+  // built on top of the previous one, whenever an ISA is selected the
+  // previous ones are selected as well.
+ enum SubtargetEnum { M00, M10, M20, M30, M40, M60 };
+ SubtargetEnum SubtargetKind = M00;
+
+ BitVector UserReservedRegister;
+
+ InstrItineraryData InstrItins;
+
+  /// True if the small section is used.
+ bool UseSmallSection = true;
+
+ const M68kTargetMachine &TM;
+
+ SelectionDAGTargetInfo TSInfo;
+ M68kInstrInfo InstrInfo;
+ M68kFrameLowering FrameLowering;
+ M68kTargetLowering TLInfo;
+
+ /// The minimum alignment known to hold of the stack frame on
+ /// entry to the function and which must be maintained by every function.
+ unsigned stackAlignment = 8;
+
+ Triple TargetTriple;
+
+public:
+  /// This constructor initializes the data members to match those
+  /// of the specified triple.
+ M68kSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const M68kTargetMachine &_TM);
+
+  /// Parses the feature string, setting the specified subtarget options. The
+  /// definition of this function is auto-generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+
+ bool atLeastM68000() const { return SubtargetKind >= M00; }
+ bool atLeastM68010() const { return SubtargetKind >= M10; }
+ bool atLeastM68020() const { return SubtargetKind >= M20; }
+ bool atLeastM68030() const { return SubtargetKind >= M30; }
+ bool atLeastM68040() const { return SubtargetKind >= M40; }
+ bool atLeastM68060() const { return SubtargetKind >= M60; }
+
+ bool useSmallSection() const { return UseSmallSection; }
+
+ bool abiUsesSoftFloat() const;
+
+ const Triple &getTargetTriple() const { return TargetTriple; }
+
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+
+  /// Return true if the subtarget allows calls to an immediate address.
+ bool isLegalToCallImmediateAddr() const;
+
+ bool isPositionIndependent() const;
+
+ bool isRegisterReservedByUser(Register R) const {
+ assert(R < M68k::NUM_TARGET_REGS && "Register out of range");
+ return UserReservedRegister[R];
+ }
+
+ /// Classify a global variable reference for the current subtarget according
+ /// to how we should reference it in a non-pcrel context.
+ unsigned char classifyLocalReference(const GlobalValue *GV) const;
+
+ /// Classify a global variable reference for the current subtarget according
+ /// to how we should reference it in a non-pcrel context.
+ unsigned char classifyGlobalReference(const GlobalValue *GV,
+ const Module &M) const;
+ unsigned char classifyGlobalReference(const GlobalValue *GV) const;
+
+  /// Classify an external variable reference for the current subtarget
+  /// according to how we should reference it in a non-pcrel context.
+ unsigned char classifyExternalReference(const Module &M) const;
+
+ /// Classify a global function reference for the current subtarget.
+ unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
+ const Module &M) const;
+ unsigned char classifyGlobalFunctionReference(const GlobalValue *GV) const;
+
+ /// Classify a blockaddress reference for the current subtarget according to
+ /// how we should reference it in a non-pcrel context.
+ unsigned char classifyBlockAddressReference() const;
+
+ unsigned getJumpTableEncoding() const;
+
+ /// TODO this must be controlled by options like -malign-int and -mshort
+ Align getStackAlignment() const { return Align(stackAlignment); }
+
+ /// getSlotSize - Stack slot size in bytes.
+ unsigned getSlotSize() const { return 4; }
+
+ M68kSubtarget &initializeSubtargetDependencies(StringRef CPU, Triple TT,
+ StringRef FS,
+ const M68kTargetMachine &TM);
+
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ const M68kInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
+ const M68kFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+
+ const M68kRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ const M68kTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
+protected:
+ // GlobalISel related APIs.
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+
+public:
+ const CallLowering *getCallLowering() const override;
+ InstructionSelector *getInstructionSelector() const override;
+ const LegalizerInfo *getLegalizerInfo() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
+};
+} // namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kTargetMachine.cpp
new file mode 100644
index 0000000..cb7d8f8
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kTargetMachine.cpp
@@ -0,0 +1,192 @@
+//===-- M68kTargetMachine.cpp - M68k target machine ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains implementation for M68k target machine.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68kTargetMachine.h"
+#include "M68k.h"
+#include "M68kSubtarget.h"
+#include "M68kTargetObjectFile.h"
+#include "TargetInfo/M68kTargetInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "m68k"
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kTarget() {
+ RegisterTargetMachine<M68kTargetMachine> X(getTheM68kTarget());
+ auto *PR = PassRegistry::getPassRegistry();
+ initializeGlobalISel(*PR);
+}
+
+namespace {
+
+std::string computeDataLayout(const Triple &TT, StringRef CPU,
+ const TargetOptions &Options) {
+ std::string Ret = "";
+ // M68k is Big Endian
+ Ret += "E";
+
+  // FIXME: how to wire this up with the object format in use?
+ Ret += "-m:e";
+
+ // M68k pointers are always 32 bit wide even for 16-bit CPUs.
+ // The ABI only specifies 16-bit alignment.
+ // On at least the 68020+ with a 32-bit bus, there is a performance benefit
+ // to having 32-bit alignment.
+ Ret += "-p:32:16:32";
+
+ // Bytes do not require special alignment, words are word aligned and
+ // long words are word aligned at minimum.
+ Ret += "-i8:8:8-i16:16:16-i32:16:32";
+
+ // FIXME no floats at the moment
+
+  // The registers can hold 8, 16, or 32 bits.
+ Ret += "-n8:16:32";
+
+ Ret += "-a:0:16-S16";
+
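+  // Altogether this yields
+  // "E-m:e-p:32:16:32-i8:8:8-i16:16:16-i32:16:32-n8:16:32-a:0:16-S16".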
+ return Ret;
+}
+
+Reloc::Model getEffectiveRelocModel(const Triple &TT,
+ Optional<Reloc::Model> RM) {
+  // If not defined, we default to static.
+ if (!RM.hasValue()) {
+ return Reloc::Static;
+ }
+
+ return *RM;
+}
+
+CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM,
+ bool JIT) {
+ if (!CM) {
+ return CodeModel::Small;
+ } else if (CM == CodeModel::Large) {
+ llvm_unreachable("Large code model is not supported");
+ } else if (CM == CodeModel::Kernel) {
+ llvm_unreachable("Kernel code model is not implemented yet");
+ }
+ return CM.getValue();
+}
+} // end anonymous namespace
+
+M68kTargetMachine::M68kTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT)
+ : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS,
+ Options, getEffectiveRelocModel(TT, RM),
+ ::getEffectiveCodeModel(CM, JIT), OL),
+ TLOF(std::make_unique<M68kELFTargetObjectFile>()),
+ Subtarget(TT, CPU, FS, *this) {
+ initAsmInfo();
+}
+
+M68kTargetMachine::~M68kTargetMachine() {}
+
+const M68kSubtarget *
+M68kTargetMachine::getSubtargetImpl(const Function &F) const {
+ Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ auto CPU = CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ auto FS = FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
+
+ auto &I = SubtargetMap[CPU + FS];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = std::make_unique<M68kSubtarget>(TargetTriple, CPU, FS, *this);
+ }
+ return I.get();
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+class M68kPassConfig : public TargetPassConfig {
+public:
+ M68kPassConfig(M68kTargetMachine &TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ M68kTargetMachine &getM68kTargetMachine() const {
+ return getTM<M68kTargetMachine>();
+ }
+
+ const M68kSubtarget &getM68kSubtarget() const {
+ return *getM68kTargetMachine().getSubtargetImpl();
+ }
+ bool addIRTranslator() override;
+ bool addLegalizeMachineIR() override;
+ bool addRegBankSelect() override;
+ bool addGlobalInstructionSelect() override;
+ bool addInstSelector() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *M68kTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new M68kPassConfig(*this, PM);
+}
+
+bool M68kPassConfig::addInstSelector() {
+ // Install an instruction selector.
+ addPass(createM68kISelDag(getM68kTargetMachine()));
+ addPass(createM68kGlobalBaseRegPass());
+ return false;
+}
+
+bool M68kPassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+
+bool M68kPassConfig::addLegalizeMachineIR() {
+ addPass(new Legalizer());
+ return false;
+}
+
+bool M68kPassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
+ return false;
+}
+
+bool M68kPassConfig::addGlobalInstructionSelect() {
+ addPass(new InstructionSelect());
+ return false;
+}
+
+void M68kPassConfig::addPreSched2() { addPass(createM68kExpandPseudoPass()); }
+
+void M68kPassConfig::addPreEmitPass() {
+ addPass(createM68kCollapseMOVEMPass());
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kTargetMachine.h b/src/llvm-project/llvm/lib/Target/M68k/M68kTargetMachine.h
new file mode 100644
index 0000000..34fae8e
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kTargetMachine.h
@@ -0,0 +1,56 @@
+//===-- M68kTargetMachine.h - Define TargetMachine for M68k ----- C++ -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the M68k specific subclass of TargetMachine.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_M68KTARGETMACHINE_H
+#define LLVM_LIB_TARGET_M68K_M68KTARGETMACHINE_H
+
+#include "M68kSubtarget.h"
+#include "MCTargetDesc/M68kMCTargetDesc.h"
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class formatted_raw_ostream;
+class M68kRegisterInfo;
+
+class M68kTargetMachine : public LLVMTargetMachine {
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ M68kSubtarget Subtarget;
+
+ mutable StringMap<std::unique_ptr<M68kSubtarget>> SubtargetMap;
+
+public:
+ M68kTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT);
+
+ ~M68kTargetMachine() override;
+
+ const M68kSubtarget *getSubtargetImpl() const { return &Subtarget; }
+
+ const M68kSubtarget *getSubtargetImpl(const Function &F) const override;
+
+ // Pass Pipeline Configuration
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+};
+} // namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kTargetObjectFile.cpp b/src/llvm-project/llvm/lib/Target/M68k/M68kTargetObjectFile.cpp
new file mode 100644
index 0000000..3e26b37
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kTargetObjectFile.cpp
@@ -0,0 +1,48 @@
+//===-- M68kELFTargetObjectFile.cpp - M68k Object Files -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains definitions for M68k ELF object file lowering.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68kTargetObjectFile.h"
+
+#include "M68kSubtarget.h"
+#include "M68kTargetMachine.h"
+
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> SSThreshold(
+ "m68k-ssection-threshold", cl::Hidden,
+ cl::desc("Small data and bss section threshold size (default=8)"),
+ cl::init(8));
+
+void M68kELFTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+
+ this->TM = &static_cast<const M68kTargetMachine &>(TM);
+
+ // FIXME do we need `.sdata` and `.sbss` explicitly?
+ SmallDataSection = getContext().getELFSection(
+ ".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+
+ SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC);
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/M68kTargetObjectFile.h b/src/llvm-project/llvm/lib/Target/M68k/M68kTargetObjectFile.h
new file mode 100644
index 0000000..dbc5375
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/M68kTargetObjectFile.h
@@ -0,0 +1,31 @@
+//===-- M68kELFTargetObjectFile.h - M68k Object Info ---------*- C++ -====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains declarations for M68k ELF object file lowering.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_M68KTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_M68K_M68KTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+class M68kTargetMachine;
+class M68kELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ const M68kTargetMachine *TM;
+ MCSection *SmallDataSection;
+ MCSection *SmallBSSSection;
+
+public:
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+} // end namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/CMakeLists.txt
new file mode 100644
index 0000000..acd8be9
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/CMakeLists.txt
@@ -0,0 +1,17 @@
+add_llvm_component_library(LLVMM68kDesc
+ M68kAsmBackend.cpp
+ M68kELFObjectWriter.cpp
+ M68kInstPrinter.cpp
+ M68kMCAsmInfo.cpp
+ M68kMCCodeEmitter.cpp
+ M68kMCTargetDesc.cpp
+
+ LINK_COMPONENTS
+ MC
+ MCDisassembler
+ Support
+ M68kInfo
+
+ ADD_TO_COMPONENT
+ M68k
+)
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
new file mode 100644
index 0000000..8a0f32b
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
@@ -0,0 +1,239 @@
+//===-- M68kAsmBackend.cpp - M68k Assembler Backend ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains definitions for M68k assembler backend.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/M68kBaseInfo.h"
+#include "MCTargetDesc/M68kFixupKinds.h"
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class M68kAsmBackend : public MCAsmBackend {
+
+public:
+ M68kAsmBackend(const Target &T) : MCAsmBackend(support::big) {}
+
+ unsigned getNumFixupKinds() const override { return 0; }
+
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override {
+ unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+
+ assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
+
+    // Check that the upper bits are either all zeros or all ones.
+ // Specifically ignore overflow/underflow as long as the leakage is
+ // limited to the lower bits. This is to remain compatible with
+ // other assemblers.
+ assert(isIntN(Size * 8 + 1, Value) &&
+ "Value does not fit in the Fixup field");
+
+ // Write in Big Endian
+ for (unsigned i = 0; i != Size; ++i)
+ Data[Fixup.getOffset() + i] = uint8_t(Value >> ((Size - i - 1) * 8));
+ }
+
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
+
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override;
+
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
+
+ /// Returns the minimum size of a nop in bytes on this target. The assembler
+ /// will use this to emit excess padding in situations where the padding
+ /// required for simple alignment would be less than the minimum nop size.
+ unsigned getMinimumNopSize() const override { return 2; }
+
+ /// Write a sequence of optimal nops to the output, covering \p Count bytes.
+ /// \return - true on success, false on failure
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+};
+} // end anonymous namespace
+
+/// Branch condition code mnemonics:
+///   CC - Carry clear             CS - Carry set
+///   EQ - Equal                   NE - Not equal
+///   GE - Greater than or equal   GT - Greater than
+///   HI - Higher                  LS - Lower or same
+///   LE - Less than or equal      LT - Less than
+///   MI - Minus                   PL - Plus
+///   VC - Overflow clear          VS - Overflow set
+static unsigned getRelaxedOpcodeBranch(const MCInst &Inst) {
+ unsigned Op = Inst.getOpcode();
+ switch (Op) {
+ default:
+ return Op;
+ case M68k::BRA8:
+ return M68k::BRA16;
+ case M68k::Bcc8:
+ return M68k::Bcc16;
+ case M68k::Bls8:
+ return M68k::Bls16;
+ case M68k::Blt8:
+ return M68k::Blt16;
+ case M68k::Beq8:
+ return M68k::Beq16;
+ case M68k::Bmi8:
+ return M68k::Bmi16;
+ case M68k::Bne8:
+ return M68k::Bne16;
+ case M68k::Bge8:
+ return M68k::Bge16;
+ case M68k::Bcs8:
+ return M68k::Bcs16;
+ case M68k::Bpl8:
+ return M68k::Bpl16;
+ case M68k::Bgt8:
+ return M68k::Bgt16;
+ case M68k::Bhi8:
+ return M68k::Bhi16;
+ case M68k::Bvc8:
+ return M68k::Bvc16;
+ case M68k::Ble8:
+ return M68k::Ble16;
+ case M68k::Bvs8:
+ return M68k::Bvs16;
+ }
+}
+
+static unsigned getRelaxedOpcodeArith(const MCInst &Inst) {
+ unsigned Op = Inst.getOpcode();
+ // NOTE there will be some relaxations for PCD and ARD mem for x20
+ return Op;
+}
+
+static unsigned getRelaxedOpcode(const MCInst &Inst) {
+ unsigned R = getRelaxedOpcodeArith(Inst);
+ if (R != Inst.getOpcode())
+ return R;
+ return getRelaxedOpcodeBranch(Inst);
+}
+
+bool M68kAsmBackend::mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+ // Branches can always be relaxed in either mode.
+ if (getRelaxedOpcodeBranch(Inst) != Inst.getOpcode())
+ return true;
+
+ // Check if this instruction is ever relaxable.
+ if (getRelaxedOpcodeArith(Inst) == Inst.getOpcode())
+ return false;
+
+ // Check if the relaxable operand has an expression. For the current set of
+ // relaxable instructions, the relaxable operand is always the last operand.
+ // NOTE will change for x20 mem
+ unsigned RelaxableOp = Inst.getNumOperands() - 1;
+ if (Inst.getOperand(RelaxableOp).isExpr())
+ return true;
+
+ return false;
+}
+
+bool M68kAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const {
+ // TODO Newer CPU can use 32 bit offsets, so check for this when ready
+ if (!isInt<16>(Value)) {
+ llvm_unreachable("Cannot relax the instruction, value does not fit");
+ }
+ // Relax if the value is too big for a (signed) i8. This means that byte-wide
+  // instructions have to be matched by default.
+ //
+ // NOTE
+ // A branch to the immediately following instruction automatically
+ // uses the 16-bit displacement format because the 8-bit
+ // displacement field contains $00 (zero offset).
+ return Value == 0 || !isInt<8>(Value);
+}
+
+// NOTE Can tblgen help at all here to verify there aren't other instructions
+// we can relax?
+void M68kAsmBackend::relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+  // The only relaxation M68k does is from a 1-byte PCRel to a 2-byte PCRel.
+ unsigned RelaxedOp = getRelaxedOpcode(Inst);
+
+ if (RelaxedOp == Inst.getOpcode()) {
+ SmallString<256> Tmp;
+ raw_svector_ostream OS(Tmp);
+ Inst.dump_pretty(OS);
+ OS << "\n";
+ report_fatal_error("unexpected instruction to relax: " + OS.str());
+ }
+
+ Inst.setOpcode(RelaxedOp);
+}
+
+bool M68kAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+  // Cannot emit NOPs whose size is not a multiple of 16 bits.
+ if (Count % 2 != 0)
+ return false;
+
+ uint64_t NumNops = Count / 2;
+ for (uint64_t i = 0; i != NumNops; ++i) {
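+    // 0x4E71 is the encoding of the M68k NOP instruction.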
+ OS << "\x4E\x71";
+ }
+
+ return true;
+}
+
+namespace {
+
+class M68kELFAsmBackend : public M68kAsmBackend {
+public:
+ uint8_t OSABI;
+ M68kELFAsmBackend(const Target &T, uint8_t OSABI)
+ : M68kAsmBackend(T), OSABI(OSABI) {}
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createM68kELFObjectWriter(OSABI);
+ }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createM68kAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
+ const MCTargetOptions &Options) {
+ const Triple &TheTriple = STI.getTargetTriple();
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+ return new M68kELFAsmBackend(T, OSABI);
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h
new file mode 100644
index 0000000..7c56cfd
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h
@@ -0,0 +1,247 @@
+//===-- M68kBaseInfo.h - Top level definitions for M68k MC --*- C++ -*-----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains small standalone helper functions and enum definitions
+/// for the M68k target useful for the compiler back-end and the MC
+/// libraries. As such, it deliberately does not include references to LLVM
+/// core code gen types, passes, etc..
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KBASEINFO_H
+#define LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KBASEINFO_H
+
+#include "M68kMCTargetDesc.h"
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define GET_INSTRINFO_MI_OPS_INFO
+#define GET_INSTRINFO_OPERAND_TYPES_ENUM
+#define GET_INSTRINFO_LOGICAL_OPERAND_SIZE_MAP
+#include "M68kGenInstrInfo.inc"
+
+namespace llvm {
+
+namespace M68k {
+
+/// Enums for memory operand decoding. Supports these forms:
+/// (d,An)
+/// (d,An,Xn)
+/// ([bd,An],Xn,od)
+/// ([bd,An,Xn],od)
+/// TODO Implement scaling other than 1
+enum { MemDisp = 0, MemBase = 1, MemIndex = 2, MemOuter = 3 };
+
+/// Enums for pc-relative memory operand decoding. Supports these forms:
+/// (d,PC)
+/// (d,PC,Xn)
+/// ([bd,PC],Xn,od)
+/// ([bd,PC,Xn],od)
+enum { PCRelDisp = 0, PCRelIndex = 1, PCRelOuter = 2 };
+} // namespace M68k
+
+namespace M68kBeads {
+enum {
+ Ctrl = 0x0,
+ Bits1 = 0x1,
+ Bits2 = 0x2,
+ Bits3 = 0x3,
+ Bits4 = 0x4,
+ DAReg = 0x5,
+ DA = 0x6,
+ Reg = 0x7,
+ DReg = 0x8,
+ Disp8 = 0x9,
+ Imm8 = 0xA,
+ Imm16 = 0xB,
+ Imm32 = 0xC,
+ Imm3 = 0xD,
+};
+
+// Ctrl payload
+enum {
+ Term = 0x0,
+ Ignore = 0x1,
+};
+} // namespace M68kBeads
+
+/// This namespace holds all of the target specific flags that instruction info
+/// tracks.
+namespace M68kII {
+/// Target Operand Flag enum.
+enum TOF {
+
+ MO_NO_FLAG,
+
+ /// On a symbol operand this indicates that the immediate is the absolute
+ /// address of the symbol.
+ MO_ABSOLUTE_ADDRESS,
+
+ /// On a symbol operand this indicates that the immediate is the pc-relative
+ /// address of the symbol.
+ MO_PC_RELATIVE_ADDRESS,
+
+ /// On a symbol operand this indicates that the immediate is the offset to
+ /// the GOT entry for the symbol name from the base of the GOT.
+ ///
+ /// name@GOT
+ MO_GOT,
+
+ /// On a symbol operand this indicates that the immediate is the offset to
+ /// the location of the symbol name from the base of the GOT.
+ ///
+ /// name@GOTOFF
+ MO_GOTOFF,
+
+ /// On a symbol operand this indicates that the immediate is offset to the
+ /// GOT entry for the symbol name from the current code location.
+ ///
+ /// name@GOTPCREL
+ MO_GOTPCREL,
+
+ /// On a symbol operand this indicates that the immediate is offset to the
+ /// PLT entry of symbol name from the current code location.
+ ///
+ /// name@PLT
+ MO_PLT,
+}; // enum TOF
+
+/// Return true if the specified TargetFlag operand is a reference to a stub
+/// for a global, not the global itself.
+inline static bool isGlobalStubReference(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ default:
+ return false;
+ case M68kII::MO_GOTPCREL: // pc-relative GOT reference.
+ case M68kII::MO_GOT: // normal GOT reference.
+ return true;
+ }
+}
+
+/// Return True if the specified GlobalValue is a direct reference for a
+/// symbol.
+inline static bool isDirectGlobalReference(unsigned char Flag) {
+ switch (Flag) {
+ default:
+ return false;
+ case M68kII::MO_NO_FLAG:
+ case M68kII::MO_ABSOLUTE_ADDRESS:
+ case M68kII::MO_PC_RELATIVE_ADDRESS:
+ return true;
+ }
+}
+
+/// Return true if the specified global value reference is relative to a 32-bit
+/// PIC base (M68kISD::GLOBAL_BASE_REG). If this is true, the addressing mode
+/// has the PIC base register added in.
+inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ default:
+ return false;
+ case M68kII::MO_GOTOFF: // isPICStyleGOT: local global.
+ case M68kII::MO_GOT: // isPICStyleGOT: other global.
+ return true;
+ }
+}
+
+/// Return True if the specified GlobalValue requires PC addressing mode.
+inline static bool isPCRelGlobalReference(unsigned char Flag) {
+ switch (Flag) {
+ default:
+ return false;
+ case M68kII::MO_GOTPCREL:
+ case M68kII::MO_PC_RELATIVE_ADDRESS:
+ return true;
+ }
+}
+
+/// Return True if the Block is referenced using PC
+inline static bool isPCRelBlockReference(unsigned char Flag) {
+ switch (Flag) {
+ default:
+ return false;
+ case M68kII::MO_PC_RELATIVE_ADDRESS:
+ return true;
+ }
+}
+
+static inline bool isAddressRegister(unsigned RegNo) {
+ switch (RegNo) {
+ case M68k::WA0:
+ case M68k::WA1:
+ case M68k::WA2:
+ case M68k::WA3:
+ case M68k::WA4:
+ case M68k::WA5:
+ case M68k::WA6:
+ case M68k::WSP:
+ case M68k::A0:
+ case M68k::A1:
+ case M68k::A2:
+ case M68k::A3:
+ case M68k::A4:
+ case M68k::A5:
+ case M68k::A6:
+ case M68k::SP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool hasMultiMIOperands(unsigned Op, unsigned LogicalOpIdx) {
+ return M68k::getLogicalOperandSize(Op, LogicalOpIdx) > 1;
+}
+
+static inline unsigned getMaskedSpillRegister(unsigned order) {
+ switch (order) {
+ default:
+ return 0;
+ case 0:
+ return M68k::D0;
+ case 1:
+ return M68k::D1;
+ case 2:
+ return M68k::D2;
+ case 3:
+ return M68k::D3;
+ case 4:
+ return M68k::D4;
+ case 5:
+ return M68k::D5;
+ case 6:
+ return M68k::D6;
+ case 7:
+ return M68k::D7;
+ case 8:
+ return M68k::A0;
+ case 9:
+ return M68k::A1;
+ case 10:
+ return M68k::A2;
+ case 11:
+ return M68k::A3;
+ case 12:
+ return M68k::A4;
+ case 13:
+ return M68k::A5;
+ case 14:
+ return M68k::A6;
+ case 15:
+ return M68k::SP;
+ }
+}
+
+} // namespace M68kII
+
+} // namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
new file mode 100644
index 0000000..4c9a329
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
@@ -0,0 +1,120 @@
+//===---------- M68kELFObjectWriter.cpp - M68k ELF Writer ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains definitions for M68k ELF writers.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/M68kFixupKinds.h"
+#include "MCTargetDesc/M68kMCTargetDesc.h"
+
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class M68kELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ M68kELFObjectWriter(uint8_t OSABI);
+
+ ~M68kELFObjectWriter() override;
+
+protected:
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+};
+} // namespace
+
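+// The writer targets 32-bit ELF (Is64Bit = false) and emits RELA relocations.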
+M68kELFObjectWriter::M68kELFObjectWriter(uint8_t OSABI)
+ : MCELFObjectTargetWriter(false, OSABI, ELF::EM_68K, /* RELA */ true) {}
+
+M68kELFObjectWriter::~M68kELFObjectWriter() {}
+
+enum M68kRelType { RT_32, RT_16, RT_8 };
+
+static M68kRelType
+getType(unsigned Kind, MCSymbolRefExpr::VariantKind &Modifier, bool &IsPCRel) {
+ switch (Kind) {
+ case FK_Data_4:
+ case FK_PCRel_4:
+ return RT_32;
+ case FK_PCRel_2:
+ case FK_Data_2:
+ return RT_16;
+ case FK_PCRel_1:
+ case FK_Data_1:
+ return RT_8;
+ }
+ llvm_unreachable("Unimplemented");
+}
+
+unsigned M68kELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+ unsigned Kind = Fixup.getKind();
+ M68kRelType Type = getType(Kind, Modifier, IsPCRel);
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ switch (Type) {
+ case RT_32:
+ return IsPCRel ? ELF::R_68K_PC32 : ELF::R_68K_32;
+ case RT_16:
+ return IsPCRel ? ELF::R_68K_PC16 : ELF::R_68K_16;
+ case RT_8:
+ return IsPCRel ? ELF::R_68K_PC8 : ELF::R_68K_8;
+ }
+ llvm_unreachable("Unrecognized size");
+ case MCSymbolRefExpr::VK_GOTPCREL:
+ switch (Type) {
+ case RT_32:
+ return ELF::R_68K_GOTPCREL32;
+ case RT_16:
+ return ELF::R_68K_GOTPCREL16;
+ case RT_8:
+ return ELF::R_68K_GOTPCREL8;
+ }
+ llvm_unreachable("Unrecognized size");
+ case MCSymbolRefExpr::VK_GOTOFF:
+ assert(!IsPCRel);
+ switch (Type) {
+ case RT_32:
+ return ELF::R_68K_GOTOFF32;
+ case RT_16:
+ return ELF::R_68K_GOTOFF16;
+ case RT_8:
+ return ELF::R_68K_GOTOFF8;
+ }
+ llvm_unreachable("Unrecognized size");
+ case MCSymbolRefExpr::VK_PLT:
+ switch (Type) {
+ case RT_32:
+ return ELF::R_68K_PLT32;
+ case RT_16:
+ return ELF::R_68K_PLT16;
+ case RT_8:
+ return ELF::R_68K_PLT8;
+ }
+ llvm_unreachable("Unrecognized size");
+ }
+}
+
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createM68kELFObjectWriter(uint8_t OSABI) {
+ return std::make_unique<M68kELFObjectWriter>(OSABI);
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kFixupKinds.h b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kFixupKinds.h
new file mode 100644
index 0000000..2b760de
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kFixupKinds.h
@@ -0,0 +1,54 @@
+//===-- M68kFixupKinds.h - M68k Specific Fixup Entries ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains M68k specific fixup entries.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68k_MCTARGETDESC_M68kFIXUPKINDS_H
+#define LLVM_LIB_TARGET_M68k_MCTARGETDESC_M68kFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+static inline unsigned getFixupKindLog2Size(unsigned Kind) {
+ switch (Kind) {
+ case FK_PCRel_1:
+ case FK_SecRel_1:
+ case FK_Data_1:
+ return 0;
+ case FK_PCRel_2:
+ case FK_SecRel_2:
+ case FK_Data_2:
+ return 1;
+ case FK_PCRel_4:
+ case FK_SecRel_4:
+ case FK_Data_4:
+ return 2;
+ }
+ llvm_unreachable("invalid fixup kind!");
+}
+
+static inline MCFixupKind getFixupForSize(unsigned Size, bool isPCRel) {
+ switch (Size) {
+ case 8:
+ return isPCRel ? FK_PCRel_1 : FK_Data_1;
+ case 16:
+ return isPCRel ? FK_PCRel_2 : FK_Data_2;
+ case 32:
+ return isPCRel ? FK_PCRel_4 : FK_Data_4;
+ case 64:
+ return isPCRel ? FK_PCRel_8 : FK_Data_8;
+ }
+ llvm_unreachable("Invalid generic fixup size!");
+}
+
+} // namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
new file mode 100644
index 0000000..e5f5909
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
@@ -0,0 +1,219 @@
+//===-- M68kInstPrinter.cpp - Convert M68k MCInst to asm ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains definitions for an M68k MCInst printer.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO Conform with all supported Motorola ASM syntax variants.
+// Motorola's assembly has several syntax variants, especially for
+// addressing modes.
+// For example, PC indirect with displacement can be written either as
+// `x(%pc)`, where `x` is the displacement immediate, or as `(x,%pc)`.
+// Currently we pick a variant that differs from GCC's output, although
+// it is still recognized by GNU AS.
+// The impact of this is unclear (e.g. some older console toolchains
+// might not recognize this syntax, in which case our integrated
+// assembler cannot be used), but either way it would be great to
+// support all of the variants in the future.
+
+#include "M68kInstPrinter.h"
+#include "M68kBaseInfo.h"
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define PRINT_ALIAS_INSTR
+#include "M68kGenAsmWriter.inc"
+
+void M68kInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << "%" << getRegisterName(RegNo);
+}
+
+void M68kInstPrinter::printInst(const MCInst *MI, uint64_t Address,
+ StringRef Annot, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (!printAliasInstr(MI, Address, O))
+ printInstruction(MI, Address, O);
+
+ printAnnotation(O, Annot);
+}
+
+void M68kInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNo);
+ if (MO.isReg()) {
+ printRegName(O, MO.getReg());
+ return;
+ }
+
+ if (MO.isImm()) {
+ printImmediate(MI, OpNo, O);
+ return;
+ }
+
+ assert(MO.isExpr() && "Unknown operand kind in printOperand");
+ MO.getExpr()->print(O, &MAI);
+}
+
+void M68kInstPrinter::printImmediate(const MCInst *MI, unsigned opNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(opNum);
+ if (MO.isImm())
+ O << '#' << MO.getImm();
+ else if (MO.isExpr()) {
+ O << '#';
+ MO.getExpr()->print(O, &MAI);
+ } else
+ llvm_unreachable("Unknown immediate kind");
+}
+
+void M68kInstPrinter::printMoveMask(const MCInst *MI, unsigned opNum,
+ raw_ostream &O) {
+ unsigned Mask = MI->getOperand(opNum).getImm();
+ assert((Mask & 0xFFFF) == Mask && "Mask is always 16 bits");
+
+  // A move mask is split into two parts:
+ // bits 0 ~ 7 correspond to D0 ~ D7 regs
+ // bits 8 ~ 15 correspond to A0 ~ A7 regs
+ //
+ // In the assembly syntax, we want to use a dash to replace
+ // a continuous range of registers. For example, if the bit
+ // mask is 0b101110, we want to print "D1-D3,D5" instead of
+ // "D1,D2,D3,D4,D5".
+ //
+ // However, we don't want a dash to cross between data registers
+ // and address registers (i.e. there shouldn't be a dash crossing
+ // bit 7 and 8) since that is not really intuitive. So we simply
+ // print the data register part (bit 0~7) and address register part
+ // separately.
+ uint8_t HalfMask;
+ unsigned Reg;
+ for (int s = 0; s < 16; s += 8) {
+ HalfMask = (Mask >> s) & 0xFF;
+    // Print the separating comma only if both the data and address
+    // register parts have bit(s) set
+ if (s != 0 && (Mask & 0xFF) && HalfMask)
+ O << ',';
+
+ for (int i = 0; HalfMask; ++i) {
+ if ((HalfMask >> i) & 0b1) {
+ HalfMask ^= 0b1 << i;
+ Reg = M68kII::getMaskedSpillRegister(i + s);
+ printRegName(O, Reg);
+
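+        // Walk forward through the run of consecutive set bits so the whole
+        // run can be printed as a single `start-end` range.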
+ int j = i;
+ while ((HalfMask >> (j + 1)) & 0b1)
+ HalfMask ^= 0b1 << ++j;
+
+ if (j != i) {
+ O << '-';
+ Reg = M68kII::getMaskedSpillRegister(j + s);
+ printRegName(O, Reg);
+ }
+
+ i = j;
+
+ if (HalfMask)
+ O << ',';
+ }
+ }
+ }
+}
+
+void M68kInstPrinter::printDisp(const MCInst *MI, unsigned opNum,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(opNum);
+ if (Op.isImm()) {
+ O << Op.getImm();
+ return;
+ }
+ assert(Op.isExpr() && "Unknown operand kind in printOperand");
+ Op.getExpr()->print(O, &MAI);
+}
+
+void M68kInstPrinter::printARIMem(const MCInst *MI, unsigned opNum,
+ raw_ostream &O) {
+ O << '(';
+ printOperand(MI, opNum, O);
+ O << ')';
+}
+
+void M68kInstPrinter::printARIPIMem(const MCInst *MI, unsigned opNum,
+ raw_ostream &O) {
+ O << "(";
+ printOperand(MI, opNum, O);
+ O << ")+";
+}
+
+void M68kInstPrinter::printARIPDMem(const MCInst *MI, unsigned opNum,
+ raw_ostream &O) {
+ O << "-(";
+ printOperand(MI, opNum, O);
+ O << ")";
+}
+
+void M68kInstPrinter::printARIDMem(const MCInst *MI, unsigned opNum,
+ raw_ostream &O) {
+ O << '(';
+ printDisp(MI, opNum + M68k::MemDisp, O);
+ O << ',';
+ printOperand(MI, opNum + M68k::MemBase, O);
+ O << ')';
+}
+
+void M68kInstPrinter::printARIIMem(const MCInst *MI, unsigned opNum,
+ raw_ostream &O) {
+ O << '(';
+ printDisp(MI, opNum + M68k::MemDisp, O);
+ O << ',';
+ printOperand(MI, opNum + M68k::MemBase, O);
+ O << ',';
+ printOperand(MI, opNum + M68k::MemIndex, O);
+ O << ')';
+}
+
+// NOTE forcing (W,L) size available since M68020 only
+void M68kInstPrinter::printAbsMem(const MCInst *MI, unsigned opNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(opNum);
+
+ if (MO.isExpr()) {
+ MO.getExpr()->print(O, &MAI);
+ return;
+ }
+
+ assert(MO.isImm() && "absolute memory addressing needs an immediate");
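+  // Motorola syntax prints hexadecimal literals with a '$' prefix.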
+ O << format("$%0" PRIx64, (uint64_t)MO.getImm());
+}
+
+void M68kInstPrinter::printPCDMem(const MCInst *MI, uint64_t Address,
+ unsigned opNum, raw_ostream &O) {
+ O << '(';
+ printDisp(MI, opNum + M68k::PCRelDisp, O);
+ O << ",%pc)";
+}
+
+void M68kInstPrinter::printPCIMem(const MCInst *MI, uint64_t Address,
+ unsigned opNum, raw_ostream &O) {
+ O << '(';
+ printDisp(MI, opNum + M68k::PCRelDisp, O);
+ O << ",%pc,";
+ printOperand(MI, opNum + M68k::PCRelIndex, O);
+ O << ')';
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.h b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.h
new file mode 100644
index 0000000..ec26bc4
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.h
@@ -0,0 +1,169 @@
+//===-- M68kInstPrinter.h - Convert M68k MCInst to asm ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains declarations for an M68k MCInst printer.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_INSTPRINTER_M68KINSTPRINTER_H
+#define LLVM_LIB_TARGET_M68K_INSTPRINTER_M68KINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class TargetMachine;
+
+class M68kInstPrinter : public MCInstPrinter {
+public:
+ M68kInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
+ const MCSubtargetInfo &STI, raw_ostream &O) override;
+
+ bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx, raw_ostream &O);
+
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
+
+private:
+ void printOperand(const MCInst *MI, unsigned opNum, raw_ostream &O);
+ void printImmediate(const MCInst *MI, unsigned opNum, raw_ostream &O);
+ /// Print register mask for MOVEM instruction in order D0-D7,A0-A7
+ void printMoveMask(const MCInst *MI, unsigned opNum, raw_ostream &O);
+ /// Print register mask for MOVEM instruction in order A7-A0,D7-D0
+ void printMoveMaskR(const MCInst *MI, unsigned opNum, raw_ostream &O);
+ void printDisp(const MCInst *MI, unsigned opNum, raw_ostream &O);
+ void printARIMem(const MCInst *MI, unsigned opNum, raw_ostream &O);
+ void printARIPIMem(const MCInst *MI, unsigned opNum, raw_ostream &O);
+ void printARIPDMem(const MCInst *MI, unsigned opNum, raw_ostream &O);
+ void printARIDMem(const MCInst *MI, unsigned opNum, raw_ostream &O);
+ void printARIIMem(const MCInst *MI, unsigned opNum, raw_ostream &O);
+ void printAbsMem(const MCInst *MI, unsigned opNum, raw_ostream &O);
+ void printPCDMem(const MCInst *MI, uint64_t Address, unsigned opNum,
+ raw_ostream &O);
+ void printPCIMem(const MCInst *MI, uint64_t Address, unsigned opNum,
+ raw_ostream &O);
+
+ //===----------------------------------------------------------------------===//
+ // Specializations
+ //===----------------------------------------------------------------------===//
+ //
+ void printPCRelImm(const MCInst *MI, uint64_t Address, unsigned opNum,
+ raw_ostream &O) {
+ printAbsMem(MI, opNum, O);
+ }
+
+ void printARI8Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIMem(MI, opNum, O);
+ }
+ void printARI16Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIMem(MI, opNum, O);
+ }
+ void printARI32Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIMem(MI, opNum, O);
+ }
+
+ void printARIPI8Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIPIMem(MI, opNum, O);
+ }
+ void printARIPI16Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIPIMem(MI, opNum, O);
+ }
+ void printARIPI32Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIPIMem(MI, opNum, O);
+ }
+
+ void printARIPD8Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIPDMem(MI, opNum, O);
+ }
+ void printARIPD16Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIPDMem(MI, opNum, O);
+ }
+ void printARIPD32Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIPDMem(MI, opNum, O);
+ }
+
+ void printARID8Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIDMem(MI, opNum, O);
+ }
+ void printARID16Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIDMem(MI, opNum, O);
+ }
+ void printARID32Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIDMem(MI, opNum, O);
+ }
+
+ void printARII8Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIIMem(MI, opNum, O);
+ }
+ void printARII16Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIIMem(MI, opNum, O);
+ }
+ void printARII32Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printARIIMem(MI, opNum, O);
+ }
+
+ void printAS8Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printAbsMem(MI, opNum, O);
+ }
+ void printAS16Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printAbsMem(MI, opNum, O);
+ }
+ void printAS32Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printAbsMem(MI, opNum, O);
+ }
+
+ void printAL8Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printAbsMem(MI, opNum, O);
+ }
+ void printAL16Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printAbsMem(MI, opNum, O);
+ }
+ void printAL32Mem(const MCInst *MI, unsigned opNum, raw_ostream &O) {
+ printAbsMem(MI, opNum, O);
+ }
+
+ void printPCD8Mem(const MCInst *MI, uint64_t Address, unsigned opNum,
+ raw_ostream &O) {
+ printPCDMem(MI, Address, opNum, O);
+ }
+ void printPCD16Mem(const MCInst *MI, uint64_t Address, unsigned opNum,
+ raw_ostream &O) {
+ printPCDMem(MI, Address, opNum, O);
+ }
+ void printPCD32Mem(const MCInst *MI, uint64_t Address, unsigned opNum,
+ raw_ostream &O) {
+ printPCDMem(MI, Address, opNum, O);
+ }
+
+ void printPCI8Mem(const MCInst *MI, uint64_t Address, unsigned opNum,
+ raw_ostream &O) {
+ printPCIMem(MI, Address, opNum, O);
+ }
+ void printPCI16Mem(const MCInst *MI, uint64_t Address, unsigned opNum,
+ raw_ostream &O) {
+ printPCIMem(MI, Address, opNum, O);
+ }
+ void printPCI32Mem(const MCInst *MI, uint64_t Address, unsigned opNum,
+ raw_ostream &O) {
+ printPCIMem(MI, Address, opNum, O);
+ }
+};
+} // end namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
new file mode 100644
index 0000000..ee20410
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
@@ -0,0 +1,36 @@
+//===-- M68kMCAsmInfo.cpp - M68k Asm Properties -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the definitions of the M68k MCAsmInfo properties.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68kMCAsmInfo.h"
+
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void M68kELFMCAsmInfo::anchor() {}
+
+M68kELFMCAsmInfo::M68kELFMCAsmInfo(const Triple &T) {
+ CodePointerSize = 4;
+ CalleeSaveStackSlotSize = 4;
+
+ IsLittleEndian = false;
+
+ // Debug Information
+ SupportsDebugInformation = true;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ UseMotorolaIntegers = true;
+ CommentString = ";";
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h
new file mode 100644
index 0000000..b3a58cc
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- M68kMCAsmInfo.h - M68k Asm Info --------------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declarations of the M68k MCAsmInfo properties.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCASMINFO_H
+#define LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class M68kELFMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit M68kELFMCAsmInfo(const Triple &Triple);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp
new file mode 100644
index 0000000..9708aba
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp
@@ -0,0 +1,387 @@
+//===-- M68kMCCodeEmitter.cpp - Convert M68k code emitter ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains definitions for the M68k code emitter.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/M68kMCCodeEmitter.h"
+#include "MCTargetDesc/M68kBaseInfo.h"
+#include "MCTargetDesc/M68kFixupKinds.h"
+#include "MCTargetDesc/M68kMCTargetDesc.h"
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "m68k-mccodeemitter"
+
+namespace {
+class M68kMCCodeEmitter : public MCCodeEmitter {
+ M68kMCCodeEmitter(const M68kMCCodeEmitter &) = delete;
+ void operator=(const M68kMCCodeEmitter &) = delete;
+ const MCInstrInfo &MCII;
+ MCContext &Ctx;
+
+public:
+ M68kMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : MCII(mcii), Ctx(ctx) {}
+
+ ~M68kMCCodeEmitter() override {}
+
+ // TableGen'erated function
+ const uint8_t *getGenInstrBeads(const MCInst &MI) const {
+ return M68k::getMCInstrBeads(MI.getOpcode());
+ }
+
+ unsigned encodeBits(unsigned ThisByte, uint8_t Bead, const MCInst &MI,
+ const MCInstrDesc &Desc, uint64_t &Buffer,
+ unsigned Offset, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned encodeReg(unsigned ThisByte, uint8_t Bead, const MCInst &MI,
+ const MCInstrDesc &Desc, uint64_t &Buffer, unsigned Offset,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned encodeImm(unsigned ThisByte, uint8_t Bead, const MCInst &MI,
+ const MCInstrDesc &Desc, uint64_t &Buffer, unsigned Offset,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+};
+
+} // end anonymous namespace
+
+unsigned M68kMCCodeEmitter::encodeBits(unsigned ThisByte, uint8_t Bead,
+ const MCInst &MI,
+ const MCInstrDesc &Desc,
+ uint64_t &Buffer, unsigned Offset,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned Num = 0;
+ switch (Bead & 0xF) {
+ case M68kBeads::Bits1:
+ Num = 1;
+ break;
+ case M68kBeads::Bits2:
+ Num = 2;
+ break;
+ case M68kBeads::Bits3:
+ Num = 3;
+ break;
+ case M68kBeads::Bits4:
+ Num = 4;
+ break;
+ }
+ unsigned char Val = (Bead & 0xF0) >> 4;
+
+ LLVM_DEBUG(dbgs() << "\tEncodeBits"
+ << " Num: " << Num << " Val: 0x");
+ LLVM_DEBUG(dbgs().write_hex(Val) << "\n");
+
+ Buffer |= (Val << Offset);
+
+ return Num;
+}
+
+unsigned M68kMCCodeEmitter::encodeReg(unsigned ThisByte, uint8_t Bead,
+ const MCInst &MI, const MCInstrDesc &Desc,
+ uint64_t &Buffer, unsigned Offset,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ bool DA, Reg;
+ switch (Bead & 0xF) {
+ default:
+ llvm_unreachable("Unrecognized Bead code for register type");
+ case M68kBeads::DAReg:
+ Reg = true;
+ DA = true;
+ break;
+ case M68kBeads::DA:
+ Reg = false;
+ DA = true;
+ break;
+ case M68kBeads::DReg:
+ case M68kBeads::Reg:
+ Reg = true;
+ DA = false;
+ break;
+ }
+
+ unsigned Op = (Bead & 0x70) >> 4;
+ bool Alt = (Bead & 0x80);
+ LLVM_DEBUG(dbgs() << "\tEncodeReg"
+ << " Op: " << Op << ", DA: " << DA << ", Reg: " << Reg
+ << ", Alt: " << Alt << "\n");
+
+ auto MIOpIdx = M68k::getLogicalOperandIdx(MI.getOpcode(), Op);
+ bool IsPCRel = Desc.OpInfo[MIOpIdx].OperandType == MCOI::OPERAND_PCREL;
+
+ MCOperand MCO;
+ if (M68kII::hasMultiMIOperands(MI.getOpcode(), Op)) {
+ if (IsPCRel) {
+ assert(Alt &&
+ "PCRel addresses use Alt bead register encoding by default");
+ MCO = MI.getOperand(MIOpIdx + M68k::PCRelIndex);
+ } else {
+ MCO = MI.getOperand(MIOpIdx + (Alt ? M68k::MemIndex : M68k::MemBase));
+ }
+ } else {
+ assert(!Alt && "You cannot use Alt register with a simple operand");
+ MCO = MI.getOperand(MIOpIdx);
+ }
+
+ unsigned RegNum = MCO.getReg();
+ auto RI = Ctx.getRegisterInfo();
+
+ unsigned Written = 0;
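+  // The register index occupies 3 bits; when requested, a D/A bit above it
+  // marks whether this is an address register.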
+ if (Reg) {
+ uint32_t Val = RI->getEncodingValue(RegNum);
+ Buffer |= (Val & 7) << Offset;
+ Offset += 3;
+ Written += 3;
+ }
+
+ if (DA) {
+ Buffer |= (uint64_t)M68kII::isAddressRegister(RegNum) << Offset;
+ Written++;
+ }
+
+ return Written;
+}
+
+static unsigned EmitConstant(uint64_t Val, unsigned Size, unsigned Pad,
+ uint64_t &Buffer, unsigned Offset) {
+ assert(Size + Offset <= 64 && isUIntN(Size, Val) && "Value does not fit");
+
+ // Writing Value in host's endianness
+ Buffer |= (Val & ((1ULL << Size) - 1)) << Offset;
+ return Size + Pad;
+}
+
+unsigned M68kMCCodeEmitter::encodeImm(unsigned ThisByte, uint8_t Bead,
+ const MCInst &MI, const MCInstrDesc &Desc,
+ uint64_t &Buffer, unsigned Offset,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned ThisWord = ThisByte / 2;
+ unsigned Size = 0;
+ unsigned Pad = 0;
+ unsigned FixOffset = 0;
+ int64_t Addendum = 0;
+ bool NoExpr = false;
+
+ unsigned Type = Bead & 0xF;
+ unsigned Op = (Bead & 0x70) >> 4;
+ bool Alt = (Bead & 0x80);
+
+ auto MIOpIdx = M68k::getLogicalOperandIdx(MI.getOpcode(), Op);
+ bool IsPCRel = Desc.OpInfo[MIOpIdx].OperandType == MCOI::OPERAND_PCREL;
+
+  // Upon reading a short jump instruction the PC will already point to the
+  // next instruction, so we need to compensate by 2 bytes, which is the
+  // difference between the patch point and the PC.
+ if (IsPCRel && ThisWord == 0)
+ Addendum -= 2;
+
+ switch (Type) {
+ // ??? what happens if it is not byte aligned
+ // ??? is it even possible
+ case M68kBeads::Disp8:
+ Size = 8;
+ Pad = 0;
+ FixOffset = ThisByte + 1;
+ Addendum += 1;
+ break;
+ case M68kBeads::Imm8:
+ Size = 8;
+ Pad = 8;
+ FixOffset = ThisByte;
+ break;
+ case M68kBeads::Imm16:
+ Size = 16;
+ Pad = 0;
+ FixOffset = ThisByte;
+ break;
+ case M68kBeads::Imm32:
+ Size = 32;
+ Pad = 0;
+ FixOffset = ThisByte;
+ break;
+ case M68kBeads::Imm3:
+ Size = 3;
+ Pad = 0;
+ NoExpr = true;
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << "\tEncodeImm"
+ << " Op: " << Op << ", Size: " << Size << ", Alt: " << Alt
+ << "\n");
+
+ MCOperand MCO;
+ if (M68kII::hasMultiMIOperands(MI.getOpcode(), Op)) {
+
+ if (IsPCRel) {
+ assert(!Alt && "You cannot use ALT operand with PCRel");
+ MCO = MI.getOperand(MIOpIdx + M68k::PCRelDisp);
+ } else {
+ MCO = MI.getOperand(MIOpIdx + (Alt ? M68k::MemOuter : M68k::MemDisp));
+ }
+
+ if (MCO.isExpr()) {
+ assert(!NoExpr && "Cannot use expression here");
+ const MCExpr *Expr = MCO.getExpr();
+
+ // This only makes sense for PCRel instructions since PC points to the
+ // extension word and Disp8 for example is right justified and requires
+ // correction. E.g. R_68K_PC32 is calculated as S + A - P, P for Disp8
+ // will be EXTENSION_WORD + 1 thus we need to have A equal to 1 to
+ // compensate.
+ // TODO count extension words
+ if (IsPCRel && Addendum != 0) {
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(Addendum, Ctx), Ctx);
+ }
+
+ Fixups.push_back(MCFixup::create(
+ FixOffset, Expr, getFixupForSize(Size, IsPCRel), MI.getLoc()));
+ // Write zeros
+ return EmitConstant(0, Size, Pad, Buffer, Offset);
+ }
+
+ } else {
+ MCO = MI.getOperand(MIOpIdx);
+ if (MCO.isExpr()) {
+ assert(!NoExpr && "Cannot use expression here");
+ const MCExpr *Expr = MCO.getExpr();
+
+ if (Addendum != 0) {
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(Addendum, Ctx), Ctx);
+ }
+
+ Fixups.push_back(MCFixup::create(
+ FixOffset, Expr, getFixupForSize(Size, IsPCRel), MI.getLoc()));
+ // Write zeros
+ return EmitConstant(0, Size, Pad, Buffer, Offset);
+ }
+ }
+
+ int64_t I = MCO.getImm();
+
+ // Store 8 as 0, thus making range 1-8
+ if (Type == M68kBeads::Imm3 && Alt) {
+ assert(I && "Cannot encode Alt Imm3 zero value");
+ I %= 8;
+ } else {
+ assert(isIntN(Size, I));
+ }
+
+ uint64_t Imm = I;
+
+ // 32 bit Imm requires HI16 first then LO16
+ if (Size == 32) {
+ Offset += EmitConstant((Imm >> 16) & 0xFFFF, 16, Pad, Buffer, Offset);
+ EmitConstant(Imm & 0xFFFF, 16, Pad, Buffer, Offset);
+ return Size;
+ }
+
+ return EmitConstant(Imm & ((1ULL << Size) - 1), Size, Pad, Buffer, Offset);
+}
+
+#include "M68kGenMCCodeBeads.inc"
+
+void M68kMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+
+ LLVM_DEBUG(dbgs() << "EncodeInstruction: " << MCII.getName(Opcode) << "("
+ << Opcode << ")\n");
+
+ const uint8_t *Beads = getGenInstrBeads(MI);
+ if (!Beads || !*Beads) {
+ llvm_unreachable("*** Instruction does not have Beads defined");
+ }
+
+ uint64_t Buffer = 0;
+ unsigned Offset = 0;
+ unsigned ThisByte = 0;
+
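+  // Each instruction is encoded as a sequence of "beads"; the low nibble of a
+  // bead selects its kind and the high nibble carries its payload.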
+ for (uint8_t Bead = *Beads; Bead; Bead = *++Beads) {
+ // Check for control beads
+ if (!(Bead & 0xF)) {
+ switch (Bead >> 4) {
+ case M68kBeads::Ignore:
+ continue;
+ }
+ }
+
+ switch (Bead & 0xF) {
+ default:
+ llvm_unreachable("Unknown Bead code");
+ break;
+ case M68kBeads::Bits1:
+ case M68kBeads::Bits2:
+ case M68kBeads::Bits3:
+ case M68kBeads::Bits4:
+ Offset +=
+ encodeBits(ThisByte, Bead, MI, Desc, Buffer, Offset, Fixups, STI);
+ break;
+ case M68kBeads::DAReg:
+ case M68kBeads::DA:
+ case M68kBeads::DReg:
+ case M68kBeads::Reg:
+ Offset +=
+ encodeReg(ThisByte, Bead, MI, Desc, Buffer, Offset, Fixups, STI);
+ break;
+ case M68kBeads::Disp8:
+ case M68kBeads::Imm8:
+ case M68kBeads::Imm16:
+ case M68kBeads::Imm32:
+ case M68kBeads::Imm3:
+ Offset +=
+ encodeImm(ThisByte, Bead, MI, Desc, Buffer, Offset, Fixups, STI);
+ break;
+ }
+
+    // M68k is big endian, so flush each completed 16-bit instruction word.
+ while (Offset / 16) {
+ support::endian::write<uint16_t>(OS, Buffer, support::big);
+ Buffer >>= 16;
+ Offset -= 16;
+ ThisByte += 2;
+ }
+ }
+
+ assert(Offset == 0 && "M68k Instructions are % 2 bytes");
+ assert((ThisByte && !(ThisByte % 2)) && "M68k Instructions are % 2 bytes");
+}
+
+MCCodeEmitter *llvm::createM68kMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new M68kMCCodeEmitter(MCII, Ctx);
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h
new file mode 100644
index 0000000..242a129
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h
@@ -0,0 +1,28 @@
+//===-- M68kMCCodeEmitter.h - M68k Code Emitter ----------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declarations for the code emitter which are useful
+/// outside of the emitter itself.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCCODEEMITTER_H
+#define LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCCODEEMITTER_H
+
+#include <cstdint>
+
+namespace llvm {
+namespace M68k {
+
+const uint8_t *getMCInstrBeads(unsigned);
+
+} // namespace M68k
+} // namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp
new file mode 100644
index 0000000..0a438ea
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp
@@ -0,0 +1,135 @@
+//===-- M68kMCTargetDesc.cpp - M68k Target Descriptions -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides M68k target specific descriptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "M68kMCTargetDesc.h"
+#include "M68kInstPrinter.h"
+#include "M68kMCAsmInfo.h"
+#include "TargetInfo/M68kTargetInfo.h"
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "M68kGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "M68kGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "M68kGenRegisterInfo.inc"
+
+// TODO Implement feature set parsing logics
+static std::string ParseM68kTriple(const Triple &TT, StringRef CPU) {
+ return "";
+}
+
+static MCInstrInfo *createM68kMCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitM68kMCInstrInfo(X); // defined in M68kGenInstrInfo.inc
+ return X;
+}
+
+static MCRegisterInfo *createM68kMCRegisterInfo(const Triple &TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitM68kMCRegisterInfo(X, llvm::M68k::A0, 0, 0, llvm::M68k::PC);
+ return X;
+}
+
+static MCSubtargetInfo *createM68kMCSubtargetInfo(const Triple &TT,
+ StringRef CPU, StringRef FS) {
+ std::string ArchFS = ParseM68kTriple(TT, CPU);
+ if (!FS.empty()) {
+ if (!ArchFS.empty()) {
+ ArchFS = (ArchFS + "," + FS).str();
+ } else {
+ ArchFS = FS.str();
+ }
+ }
+ return createM68kMCSubtargetInfoImpl(TT, CPU, /*TuneCPU=*/CPU, ArchFS);
+}
+
+static MCAsmInfo *createM68kMCAsmInfo(const MCRegisterInfo &MRI,
+ const Triple &TT,
+ const MCTargetOptions &TO) {
+ MCAsmInfo *MAI = new M68kELFMCAsmInfo(TT);
+
+ // Initialize initial frame state.
+ // Calculate amount of bytes used for return address storing
+ int StackGrowth = -4;
+
+ // Initial state of the frame pointer is SP+StackGrowth.
+ // TODO: Add tests for `cfi_*` directives
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(
+ nullptr, MRI.getDwarfRegNum(llvm::M68k::SP, true), -StackGrowth);
+ MAI->addInitialFrameState(Inst);
+
+ // Add return address to move list
+ Inst = MCCFIInstruction::createOffset(
+ nullptr, MRI.getDwarfRegNum(M68k::PC, true), StackGrowth);
+ MAI->addInitialFrameState(Inst);
+
+ return MAI;
+}
+
+static MCRelocationInfo *createM68kMCRelocationInfo(const Triple &TheTriple,
+ MCContext &Ctx) {
+ // Default to the stock relocation info.
+ return llvm::createMCRelocationInfo(TheTriple, Ctx);
+}
+
+static MCInstPrinter *createM68kMCInstPrinter(const Triple &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ return new M68kInstPrinter(MAI, MII, MRI);
+}
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kTargetMC() {
+ Target &T = getTheM68kTarget();
+
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn X(T, createM68kMCAsmInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(T, createM68kMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(T, createM68kMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(T, createM68kMCSubtargetInfo);
+
+ // Register the code emitter.
+ TargetRegistry::RegisterMCCodeEmitter(T, createM68kMCCodeEmitter);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(T, createM68kMCInstPrinter);
+
+ // Register the MC relocation info.
+ TargetRegistry::RegisterMCRelocationInfo(T, createM68kMCRelocationInfo);
+
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(T, createM68kAsmBackend);
+}
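
LLVMInitializeM68kTargetMC() above only registers factory callbacks; clients obtain the actual MC objects by looking the target up in the TargetRegistry. The following is a minimal sketch of that lookup, assuming the standard Target/TargetRegistry factory API; the triple string and the standalone main() are illustrative assumptions, not taken from the imported sources.

// Illustrative only: fetch the MC components registered by
// LLVMInitializeM68kTargetMC() through the TargetRegistry.
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/TargetRegistry.h"
#include <memory>
#include <string>

extern "C" void LLVMInitializeM68kTargetInfo();
extern "C" void LLVMInitializeM68kTargetMC();

int main() {
  LLVMInitializeM68kTargetInfo();
  LLVMInitializeM68kTargetMC();

  const std::string TripleName = "m68k-unknown-linux-gnu"; // assumed triple
  std::string Error;
  const llvm::Target *T = llvm::TargetRegistry::lookupTarget(TripleName, Error);
  if (!T)
    return 1;

  // Each create* call goes through the callbacks registered above.
  std::unique_ptr<llvm::MCRegisterInfo> MRI(T->createMCRegInfo(TripleName));
  llvm::MCTargetOptions Options;
  std::unique_ptr<llvm::MCAsmInfo> MAI(
      T->createMCAsmInfo(*MRI, TripleName, Options));
  std::unique_ptr<llvm::MCInstrInfo> MII(T->createMCInstrInfo());
  std::unique_ptr<llvm::MCSubtargetInfo> STI(
      T->createMCSubtargetInfo(TripleName, /*CPU=*/"", /*Features=*/""));
  return MRI && MAI && MII && STI ? 0 : 1;
}
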
diff --git a/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h
new file mode 100644
index 0000000..a0ebca0
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h
@@ -0,0 +1,61 @@
+//===-- M68kMCTargetDesc.h - M68k Target Descriptions -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides M68k-specific target descriptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCTARGETDESC_H
+#define LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCTARGETDESC_H
+
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCRelocationInfo;
+class MCTargetOptions;
+class Target;
+class Triple;
+class StringRef;
+class raw_ostream;
+class raw_pwrite_stream;
+
+MCAsmBackend *createM68kAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
+ const MCTargetOptions &Options);
+
+MCCodeEmitter *createM68kMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+/// Construct an M68k ELF object writer.
+std::unique_ptr<MCObjectTargetWriter> createM68kELFObjectWriter(uint8_t OSABI);
+
+} // namespace llvm
+
+// Defines symbolic names for M68k registers. This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "M68kGenRegisterInfo.inc"
+
+// Defines symbolic names for the M68k instructions.
+#define GET_INSTRINFO_ENUM
+#include "M68kGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "M68kGenSubtargetInfo.inc"
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/M68k/TargetInfo/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/M68k/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..d376dcd
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/TargetInfo/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_llvm_component_library(LLVMM68kInfo
+ M68kTargetInfo.cpp
+
+ LINK_COMPONENTS
+ Support
+
+ ADD_TO_COMPONENT
+ M68k
+)
diff --git a/src/llvm-project/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp b/src/llvm-project/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp
new file mode 100644
index 0000000..5f08b90
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp
@@ -0,0 +1,27 @@
+//===-- M68kTargetInfo.cpp - M68k Target Implementation -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the M68k target initializer.
+///
+//===----------------------------------------------------------------------===//
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+namespace llvm {
+Target &getTheM68kTarget() {
+ static Target TheM68kTarget;
+ return TheM68kTarget;
+}
+} // namespace llvm
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kTargetInfo() {
+ RegisterTarget<Triple::m68k, /*HasJIT=*/true> X(
+ getTheM68kTarget(), "m68k", "Motorola 68000 family", "M68k");
+}
diff --git a/src/llvm-project/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.h b/src/llvm-project/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.h
new file mode 100644
index 0000000..018e865
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.h
@@ -0,0 +1,18 @@
+//===-- M68kTargetInfo.h - M68k Target Implementation -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_M68K_TARGETINFO_M68KTARGETINFO_H
+#define LLVM_LIB_TARGET_M68K_TARGETINFO_M68KTARGETINFO_H
+
+namespace llvm {
+class Target;
+
+Target &getTheM68kTarget();
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_M68K_TARGETINFO_M68KTARGETINFO_H
diff --git a/src/llvm-project/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/src/llvm-project/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index f32418c..4bad036 100644
--- a/src/llvm-project/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/src/llvm-project/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -327,7 +327,7 @@
bool MSP430AsmParser::parseJccInstruction(ParseInstructionInfo &Info,
StringRef Name, SMLoc NameLoc,
OperandVector &Operands) {
- if (!Name.startswith_lower("j"))
+ if (!Name.startswith_insensitive("j"))
return true;
auto CC = Name.drop_front().lower();
@@ -390,7 +390,7 @@
StringRef Name, SMLoc NameLoc,
OperandVector &Operands) {
// Drop .w suffix
- if (Name.endswith_lower(".w"))
+ if (Name.endswith_insensitive(".w"))
Name = Name.drop_back(2);
if (!parseJccInstruction(Info, Name, NameLoc, Operands))
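
Both hunks above track LLVM's rename of the case-insensitive StringRef predicates: startswith_lower/endswith_lower became startswith_insensitive/endswith_insensitive. A small self-contained sketch of the renamed calls follows; the looksLikeJcc helper is made up purely for illustration and is not part of the MSP430 parser.

#include "llvm/ADT/StringRef.h"
#include <cassert>

// Illustrative only: case-insensitive prefix/suffix checks with the renamed
// StringRef API used in the MSP430 asm parser hunks above.
static bool looksLikeJcc(llvm::StringRef Name) {
  // Strip an optional ".w" suffix, ignoring case, then test the "j" prefix.
  if (Name.endswith_insensitive(".w"))
    Name = Name.drop_back(2);
  return Name.startswith_insensitive("j");
}

int main() {
  assert(looksLikeJcc("JNE.W"));
  assert(looksLikeJcc("jeq"));
  assert(!looksLikeJcc("mov"));
  return 0;
}
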
diff --git a/src/llvm-project/llvm/lib/Target/MSP430/MSP430InstrInfo.td b/src/llvm-project/llvm/lib/Target/MSP430/MSP430InstrInfo.td
index aaca350..e7218ca 100644
--- a/src/llvm-project/llvm/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/MSP430/MSP430InstrInfo.td
@@ -443,7 +443,7 @@
//===----------------------------------------------------------------------===//
// Arithmetic Instructions
-multiclass Arith<bits<4> opcode, string asmstring, SDNode node,
+multiclass Arith<bits<4> opcode, string asmstring, SDPatternOperator node,
bit commutes, list<Register> uses> {
let Defs = [SR], Uses = uses in {
let Constraints = "$src2 = $rd" in {
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/src/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 9de34cc..b81ebed 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -740,9 +740,8 @@
return RegNo;
} else if (MO.isImm()) {
return static_cast<unsigned>(MO.getImm());
- } else if (MO.isFPImm()) {
- return static_cast<unsigned>(APFloat(MO.getFPImm())
- .bitcastToAPInt().getHiBits(32).getLimitedValue());
+ } else if (MO.isDFPImm()) {
+ return static_cast<unsigned>(bit_cast<double>(MO.getDFPImm()));
}
// MO must be an Expr.
assert(MO.isExpr());
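
The hunk above follows MCOperand's move from storing a double FP immediate directly (isFPImm/getFPImm) to storing its raw 64-bit pattern (isDFPImm/getDFPImm), which the emitter then reinterprets with bit_cast. A minimal sketch of that raw-bits round trip, independent of MCOperand and assuming llvm::bit_cast from llvm/ADT/bit.h:

#include "llvm/ADT/bit.h"
#include <cassert>
#include <cstdint>

int main() {
  // Illustrative only: the raw-bits round trip that getDFPImm()/bit_cast
  // perform in the Mips MC code emitter above.
  const double Value = 1.5;
  const uint64_t Bits = llvm::bit_cast<uint64_t>(Value); // what is stored
  const double Restored = llvm::bit_cast<double>(Bits);  // what the emitter reads
  assert(Restored == Value);
  return 0;
}
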
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/src/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
index a4a953b..befa883 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
@@ -35,7 +35,7 @@
// 1-byte long nor fixed length but it matches the value GAS emits.
MCSectionELF *Sec =
Context.getELFSection(".MIPS.options", ELF::SHT_MIPS_OPTIONS,
- ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, 1, "");
+ ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, 1);
MCA.registerSection(*Sec);
Sec->setAlignment(Align(8));
Streamer->SwitchSection(Sec);
@@ -53,7 +53,7 @@
Streamer->emitIntValue(ri_gp_value, 8);
} else {
MCSectionELF *Sec = Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO,
- ELF::SHF_ALLOC, 24, "");
+ ELF::SHF_ALLOC, 24);
MCA.registerSection(*Sec);
Sec->setAlignment(MTS->getABI().IsN32() ? Align(8) : Align(4));
Streamer->SwitchSection(Sec);
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/src/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 6ec8fe8..232d0eb 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1309,7 +1309,7 @@
MCContext &Context = MCA.getContext();
MCStreamer &OS = getStreamer();
MCSectionELF *Sec = Context.getELFSection(
- ".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, ELF::SHF_ALLOC, 24, "");
+ ".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, ELF::SHF_ALLOC, 24);
MCA.registerSection(*Sec);
Sec->setAlignment(Align(8));
OS.SwitchSection(Sec);
diff --git a/src/llvm-project/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/src/llvm-project/llvm/lib/Target/Mips/Mips16HardFloat.cpp
index cc1f72c..6c5f638 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/Mips16HardFloat.cpp
@@ -359,7 +359,7 @@
"llvm.log10.f32", "llvm.log10.f64",
"llvm.nearbyint.f32", "llvm.nearbyint.f64",
"llvm.pow.f32", "llvm.pow.f64",
- "llvm.powi.f32", "llvm.powi.f64",
+ "llvm.powi.f32.i32", "llvm.powi.f64.i32",
"llvm.rint.f32", "llvm.rint.f64",
"llvm.round.f32", "llvm.round.f64",
"llvm.sin.f32", "llvm.sin.f64",
diff --git a/src/llvm-project/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/src/llvm-project/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
index a3b86bd..136612c 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -156,7 +156,7 @@
}
bool Mips16TargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned, unsigned, MachineMemOperand::Flags, bool *Fast) const {
+ EVT VT, unsigned, Align, MachineMemOperand::Flags, bool *Fast) const {
return false;
}
diff --git a/src/llvm-project/llvm/lib/Target/Mips/Mips16ISelLowering.h b/src/llvm-project/llvm/lib/Target/Mips/Mips16ISelLowering.h
index 2002499..e8b5f60 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/Mips16ISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/Mips/Mips16ISelLowering.h
@@ -22,7 +22,7 @@
const MipsSubtarget &STI);
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
- unsigned Align,
+ Align Alignment,
MachineMemOperand::Flags Flags,
bool *Fast) const override;
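
These two hunks switch allowsMisalignedMemoryAccesses from a raw unsigned alignment to the llvm::Align type. A tiny sketch of Align and commonAlignment, the helpers this interface now leans on; the values below are arbitrary examples.

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  // Illustrative only: llvm::Align wraps a power-of-two alignment, and
  // commonAlignment() returns the largest power of two dividing both inputs.
  const llvm::Align StackAlign(8);
  assert(llvm::commonAlignment(StackAlign, /*Offset=*/12).value() == 4);
  assert(llvm::commonAlignment(StackAlign, /*Offset=*/16).value() == 8);
  return 0;
}
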
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsCCState.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsCCState.cpp
index fe3fe82..76acfa9 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsCCState.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsCCState.cpp
@@ -12,8 +12,7 @@
using namespace llvm;
-/// This function returns true if CallSym is a long double emulation routine.
-static bool isF128SoftLibCall(const char *CallSym) {
+bool MipsCCState::isF128SoftLibCall(const char *CallSym) {
const char *const LibCalls[] = {
"__addtf3", "__divtf3", "__eqtf2", "__extenddftf2",
"__extendsftf2", "__fixtfdi", "__fixtfsi", "__fixtfti",
@@ -37,7 +36,7 @@
/// This function returns true if Ty is fp128, {f128} or i128 which was
/// originally a fp128.
-static bool originalTypeIsF128(const Type *Ty, const char *Func) {
+bool MipsCCState::originalTypeIsF128(const Type *Ty, const char *Func) {
if (Ty->isFP128Ty())
return true;
@@ -47,11 +46,12 @@
// If the Ty is i128 and the function being called is a long double emulation
// routine, then the original type is f128.
+ // FIXME: This is unsound because these functions could be indirectly called
return (Func && Ty->isIntegerTy(128) && isF128SoftLibCall(Func));
}
/// Return true if the original type was vXfXX.
-static bool originalEVTTypeIsVectorFloat(EVT Ty) {
+bool MipsCCState::originalEVTTypeIsVectorFloat(EVT Ty) {
if (Ty.isVector() && Ty.getVectorElementType().isFloatingPoint())
return true;
@@ -59,7 +59,7 @@
}
/// Return true if the original type was vXfXX / vXfXX.
-static bool originalTypeIsVectorFloat(const Type * Ty) {
+bool MipsCCState::originalTypeIsVectorFloat(const Type *Ty) {
if (Ty->isVectorTy() && Ty->isFPOrFPVectorTy())
return true;
@@ -126,6 +126,18 @@
}
}
+void MipsCCState::PreAnalyzeReturnValue(EVT ArgVT) {
+ OriginalRetWasFloatVector.push_back(originalEVTTypeIsVectorFloat(ArgVT));
+}
+
+void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed,
+ const char *Func) {
+ OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, Func));
+ OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy());
+ OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy());
+ CallOperandIsFixed.push_back(IsFixed);
+}
+
/// Identify lowered values that originated from f128, float and sret to vXfXX
/// arguments and record this.
void MipsCCState::PreAnalyzeCallOperands(
@@ -142,6 +154,27 @@
}
}
+void MipsCCState::PreAnalyzeFormalArgument(const Type *ArgTy,
+ ISD::ArgFlagsTy Flags) {
+ // SRet arguments cannot originate from f128 or {f128} returns, so we just
+ // push false. We have to handle this specially since SRet arguments
+ // aren't mapped to an original argument.
+ if (Flags.isSRet()) {
+ OriginalArgWasF128.push_back(false);
+ OriginalArgWasFloat.push_back(false);
+ OriginalArgWasFloatVector.push_back(false);
+ return;
+ }
+
+ OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, nullptr));
+ OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy());
+
+ // The MIPS vector ABI has a quirk: if the first
+ // argument is actually an SRet pointer to a vector, then the next
+ // argument slot is $a2.
+ OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy());
+}
+
/// Identify lowered values that originated from f128, float and vXfXX arguments
/// and record this.
void MipsCCState::PreAnalyzeFormalArgumentsForF128(
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsCCState.h b/src/llvm-project/llvm/lib/Target/Mips/MipsCCState.h
index fd2fd97..bbb5225 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsCCState.h
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsCCState.h
@@ -26,6 +26,21 @@
getSpecialCallingConvForCallee(const SDNode *Callee,
const MipsSubtarget &Subtarget);
+ /// This function returns true if CallSym is a long double emulation routine.
+ ///
+ /// FIXME: Changing the ABI based on the callee name is unsound. The lib func
+ /// address could be captured.
+ static bool isF128SoftLibCall(const char *CallSym);
+
+ static bool originalTypeIsF128(const Type *Ty, const char *Func);
+ static bool originalEVTTypeIsVectorFloat(EVT Ty);
+ static bool originalTypeIsVectorFloat(const Type *Ty);
+
+ void PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed, const char *Func);
+
+ void PreAnalyzeFormalArgument(const Type *ArgTy, ISD::ArgFlagsTy Flags);
+ void PreAnalyzeReturnValue(EVT ArgVT);
+
private:
/// Identify lowered values that originated from f128 arguments and record
/// this for use by RetCC_MipsN.
@@ -85,17 +100,23 @@
SpecialCallingConvType SpecialCC = NoSpecialCallingConv)
: CCState(CC, isVarArg, MF, locs, C), SpecialCallingConv(SpecialCC) {}
+ void PreAnalyzeCallOperands(
+ const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn,
+ std::vector<TargetLowering::ArgListEntry> &FuncArgs, const char *Func) {
+ OriginalArgWasF128.clear();
+ OriginalArgWasFloat.clear();
+ OriginalArgWasFloatVector.clear();
+ CallOperandIsFixed.clear();
+ PreAnalyzeCallOperands(Outs, FuncArgs, Func);
+ }
+
void
AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
CCAssignFn Fn,
std::vector<TargetLowering::ArgListEntry> &FuncArgs,
const char *Func) {
- PreAnalyzeCallOperands(Outs, FuncArgs, Func);
+ PreAnalyzeCallOperands(Outs, Fn, FuncArgs, Func);
CCState::AnalyzeCallOperands(Outs, Fn);
- OriginalArgWasF128.clear();
- OriginalArgWasFloat.clear();
- OriginalArgWasFloatVector.clear();
- CallOperandIsFixed.clear();
}
// The AnalyzeCallOperands in the base class is not usable since we must
@@ -107,34 +128,56 @@
SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
CCAssignFn Fn) = delete;
- void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
- CCAssignFn Fn) {
- PreAnalyzeFormalArgumentsForF128(Ins);
- CCState::AnalyzeFormalArguments(Ins, Fn);
+ void PreAnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn Fn) {
OriginalArgWasFloat.clear();
OriginalArgWasF128.clear();
OriginalArgWasFloatVector.clear();
+ PreAnalyzeFormalArgumentsForF128(Ins);
+ }
+
+ void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn Fn) {
+ PreAnalyzeFormalArguments(Ins, Fn);
+ CCState::AnalyzeFormalArguments(Ins, Fn);
+ }
+
+ void PreAnalyzeCallResult(const Type *RetTy, const char *Func) {
+ OriginalArgWasF128.push_back(originalTypeIsF128(RetTy, Func));
+ OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy());
+ OriginalRetWasFloatVector.push_back(originalTypeIsVectorFloat(RetTy));
+ }
+
+ void PreAnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn Fn, const Type *RetTy,
+ const char *Func) {
+ OriginalArgWasFloat.clear();
+ OriginalArgWasF128.clear();
+ OriginalArgWasFloatVector.clear();
+ PreAnalyzeCallResultForF128(Ins, RetTy, Func);
+ PreAnalyzeCallResultForVectorFloat(Ins, RetTy);
}
void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
CCAssignFn Fn, const Type *RetTy,
const char *Func) {
- PreAnalyzeCallResultForF128(Ins, RetTy, Func);
- PreAnalyzeCallResultForVectorFloat(Ins, RetTy);
+ PreAnalyzeCallResult(Ins, Fn, RetTy, Func);
CCState::AnalyzeCallResult(Ins, Fn);
+ }
+
+ void PreAnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ CCAssignFn Fn) {
OriginalArgWasFloat.clear();
OriginalArgWasF128.clear();
OriginalArgWasFloatVector.clear();
+ PreAnalyzeReturnForF128(Outs);
+ PreAnalyzeReturnForVectorFloat(Outs);
}
void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
CCAssignFn Fn) {
- PreAnalyzeReturnForF128(Outs);
- PreAnalyzeReturnForVectorFloat(Outs);
+ PreAnalyzeReturn(Outs, Fn);
CCState::AnalyzeReturn(Outs, Fn);
- OriginalArgWasFloat.clear();
- OriginalArgWasF128.clear();
- OriginalArgWasFloatVector.clear();
}
bool CheckReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.cpp
index 377aa48..5c2549e 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.cpp
@@ -24,98 +24,89 @@
MipsCallLowering::MipsCallLowering(const MipsTargetLowering &TLI)
: CallLowering(&TLI) {}
-bool MipsCallLowering::MipsHandler::assign(Register VReg, const CCValAssign &VA,
- const EVT &VT) {
- if (VA.isRegLoc()) {
- assignValueToReg(VReg, VA, VT);
- } else if (VA.isMemLoc()) {
- assignValueToAddress(VReg, VA);
- } else {
- return false;
+struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner {
+ /// This is the name of the function being called
+ /// FIXME: Relying on this is unsound
+ const char *Func = nullptr;
+
+ /// Is this a return value, or an outgoing call operand.
+ bool IsReturn;
+
+ MipsOutgoingValueAssigner(CCAssignFn *AssignFn_, const char *Func,
+ bool IsReturn)
+ : OutgoingValueAssigner(AssignFn_), Func(Func), IsReturn(IsReturn) {}
+
+ bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
+ CCState &State_) override {
+ MipsCCState &State = static_cast<MipsCCState &>(State_);
+
+ if (IsReturn)
+ State.PreAnalyzeReturnValue(EVT::getEVT(Info.Ty));
+ else
+ State.PreAnalyzeCallOperand(Info.Ty, Info.IsFixed, Func);
+
+ return CallLowering::OutgoingValueAssigner::assignArg(
+ ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State);
}
- return true;
-}
+};
-bool MipsCallLowering::MipsHandler::assignVRegs(ArrayRef<Register> VRegs,
- ArrayRef<CCValAssign> ArgLocs,
- unsigned ArgLocsStartIndex,
- const EVT &VT) {
- for (unsigned i = 0; i < VRegs.size(); ++i)
- if (!assign(VRegs[i], ArgLocs[ArgLocsStartIndex + i], VT))
- return false;
- return true;
-}
+struct MipsIncomingValueAssigner : public CallLowering::IncomingValueAssigner {
+ /// This is the name of the function being called
+ /// FIXME: Relying on this is unsound
+ const char *Func = nullptr;
-void MipsCallLowering::MipsHandler::setLeastSignificantFirst(
- SmallVectorImpl<Register> &VRegs) {
- if (!MIRBuilder.getMF().getDataLayout().isLittleEndian())
- std::reverse(VRegs.begin(), VRegs.end());
-}
+ /// Is this a call return value, or an incoming function argument.
+ bool IsReturn;
-bool MipsCallLowering::MipsHandler::handle(
- ArrayRef<CCValAssign> ArgLocs, ArrayRef<CallLowering::ArgInfo> Args) {
- SmallVector<Register, 4> VRegs;
- unsigned SplitLength;
- const Function &F = MIRBuilder.getMF().getFunction();
- const DataLayout &DL = F.getParent()->getDataLayout();
- const MipsTargetLowering &TLI = *static_cast<const MipsTargetLowering *>(
- MIRBuilder.getMF().getSubtarget().getTargetLowering());
+ MipsIncomingValueAssigner(CCAssignFn *AssignFn_, const char *Func,
+ bool IsReturn)
+ : IncomingValueAssigner(AssignFn_), Func(Func), IsReturn(IsReturn) {}
- for (unsigned ArgsIndex = 0, ArgLocsIndex = 0; ArgsIndex < Args.size();
- ++ArgsIndex, ArgLocsIndex += SplitLength) {
- EVT VT = TLI.getValueType(DL, Args[ArgsIndex].Ty);
- SplitLength = TLI.getNumRegistersForCallingConv(F.getContext(),
- F.getCallingConv(), VT);
- assert(Args[ArgsIndex].Regs.size() == 1 && "Can't handle multple regs yet");
+ bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
+ CCState &State_) override {
+ MipsCCState &State = static_cast<MipsCCState &>(State_);
- if (SplitLength > 1) {
- VRegs.clear();
- MVT RegisterVT = TLI.getRegisterTypeForCallingConv(
- F.getContext(), F.getCallingConv(), VT);
- for (unsigned i = 0; i < SplitLength; ++i)
- VRegs.push_back(MRI.createGenericVirtualRegister(LLT{RegisterVT}));
+ if (IsReturn)
+ State.PreAnalyzeCallResult(Info.Ty, Func);
+ else
+ State.PreAnalyzeFormalArgument(Info.Ty, Flags);
- if (!handleSplit(VRegs, ArgLocs, ArgLocsIndex, Args[ArgsIndex].Regs[0],
- VT))
- return false;
- } else {
- if (!assign(Args[ArgsIndex].Regs[0], ArgLocs[ArgLocsIndex], VT))
- return false;
- }
+ return CallLowering::IncomingValueAssigner::assignArg(
+ ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State);
}
- return true;
-}
+};
namespace {
-class MipsIncomingValueHandler : public MipsCallLowering::MipsHandler {
+class MipsIncomingValueHandler : public CallLowering::IncomingValueHandler {
+ const MipsSubtarget &STI;
+
public:
MipsIncomingValueHandler(MachineIRBuilder &MIRBuilder,
MachineRegisterInfo &MRI)
- : MipsHandler(MIRBuilder, MRI) {}
+ : IncomingValueHandler(MIRBuilder, MRI),
+ STI(MIRBuilder.getMF().getSubtarget<MipsSubtarget>()) {}
private:
- void assignValueToReg(Register ValVReg, const CCValAssign &VA,
- const EVT &VT) override;
+ void assignValueToReg(Register ValVReg, Register PhysReg,
+ CCValAssign &VA) override;
- Register getStackAddress(const CCValAssign &VA,
- MachineMemOperand *&MMO) override;
+ Register getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override;
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
+ MachinePointerInfo &MPO, CCValAssign &VA) override;
- void assignValueToAddress(Register ValVReg, const CCValAssign &VA) override;
-
- bool handleSplit(SmallVectorImpl<Register> &VRegs,
- ArrayRef<CCValAssign> ArgLocs, unsigned ArgLocsStartIndex,
- Register ArgsReg, const EVT &VT) override;
+ unsigned assignCustomValue(CallLowering::ArgInfo &Arg,
+ ArrayRef<CCValAssign> VAs) override;
virtual void markPhysRegUsed(unsigned PhysReg) {
MIRBuilder.getMRI()->addLiveIn(PhysReg);
MIRBuilder.getMBB().addLiveIn(PhysReg);
}
-
- MachineInstrBuilder buildLoad(const DstOp &Res, const CCValAssign &VA) {
- MachineMemOperand *MMO;
- Register Addr = getStackAddress(VA, MMO);
- return MIRBuilder.buildLoad(Res, Addr, *MMO);
- }
};
class CallReturnHandler : public MipsIncomingValueHandler {
@@ -135,190 +126,154 @@
} // end anonymous namespace
void MipsIncomingValueHandler::assignValueToReg(Register ValVReg,
- const CCValAssign &VA,
- const EVT &VT) {
- Register PhysReg = VA.getLocReg();
- if (VT == MVT::f64 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
- const MipsSubtarget &STI =
- static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
- bool IsEL = STI.isLittle();
- LLT s32 = LLT::scalar(32);
- auto Lo = MIRBuilder.buildCopy(s32, Register(PhysReg + (IsEL ? 0 : 1)));
- auto Hi = MIRBuilder.buildCopy(s32, Register(PhysReg + (IsEL ? 1 : 0)));
- MIRBuilder.buildMerge(ValVReg, {Lo, Hi});
- markPhysRegUsed(PhysReg);
- markPhysRegUsed(PhysReg + 1);
- } else if (VT == MVT::f32 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
- MIRBuilder.buildCopy(ValVReg, PhysReg);
- markPhysRegUsed(PhysReg);
- } else {
- switch (VA.getLocInfo()) {
- case CCValAssign::LocInfo::SExt:
- case CCValAssign::LocInfo::ZExt:
- case CCValAssign::LocInfo::AExt: {
- auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
- MIRBuilder.buildTrunc(ValVReg, Copy);
- break;
- }
- default:
- MIRBuilder.buildCopy(ValVReg, PhysReg);
- break;
- }
- markPhysRegUsed(PhysReg);
- }
+ Register PhysReg,
+ CCValAssign &VA) {
+ markPhysRegUsed(PhysReg);
+ IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
}
-Register MipsIncomingValueHandler::getStackAddress(const CCValAssign &VA,
- MachineMemOperand *&MMO) {
+Register MipsIncomingValueHandler::getStackAddress(uint64_t Size,
+ int64_t Offset,
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) {
+
MachineFunction &MF = MIRBuilder.getMF();
- unsigned Size = alignTo(VA.getValVT().getSizeInBits(), 8) / 8;
- unsigned Offset = VA.getLocMemOffset();
MachineFrameInfo &MFI = MF.getFrameInfo();
+ // FIXME: This should only be immutable for non-byval memory arguments.
int FI = MFI.CreateFixedObject(Size, Offset, true);
- MachinePointerInfo MPO =
- MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
-
- const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
- Align Alignment = commonAlignment(TFL->getStackAlign(), Offset);
- MMO =
- MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size, Alignment);
+ MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
return MIRBuilder.buildFrameIndex(LLT::pointer(0, 32), FI).getReg(0);
}
void MipsIncomingValueHandler::assignValueToAddress(Register ValVReg,
- const CCValAssign &VA) {
- if (VA.getLocInfo() == CCValAssign::SExt ||
- VA.getLocInfo() == CCValAssign::ZExt ||
- VA.getLocInfo() == CCValAssign::AExt) {
- auto Load = buildLoad(LLT::scalar(32), VA);
- MIRBuilder.buildTrunc(ValVReg, Load);
- } else
- buildLoad(ValVReg, VA);
+ Register Addr, LLT MemTy,
+ MachinePointerInfo &MPO,
+ CCValAssign &VA) {
+ MachineFunction &MF = MIRBuilder.getMF();
+ auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, MemTy,
+ inferAlignFromPtrInfo(MF, MPO));
+ MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
-bool MipsIncomingValueHandler::handleSplit(SmallVectorImpl<Register> &VRegs,
- ArrayRef<CCValAssign> ArgLocs,
- unsigned ArgLocsStartIndex,
- Register ArgsReg, const EVT &VT) {
- if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex, VT))
- return false;
- setLeastSignificantFirst(VRegs);
- MIRBuilder.buildMerge(ArgsReg, VRegs);
- return true;
+/// Handle cases where an f64 is split into 2 32-bit GPRs. This is a custom
+/// assignment because generic code assumes getNumRegistersForCallingConv is
+/// accurate; in this case it is not, because the type and register count
+/// depend on the other arguments.
+unsigned
+MipsIncomingValueHandler::assignCustomValue(CallLowering::ArgInfo &Arg,
+ ArrayRef<CCValAssign> VAs) {
+ const CCValAssign &VALo = VAs[0];
+ const CCValAssign &VAHi = VAs[1];
+
+ assert(VALo.getLocVT() == MVT::i32 && VAHi.getLocVT() == MVT::i32 &&
+ VALo.getValVT() == MVT::f64 && VAHi.getValVT() == MVT::f64 &&
+ "unexpected custom value");
+
+ auto CopyLo = MIRBuilder.buildCopy(LLT::scalar(32), VALo.getLocReg());
+ auto CopyHi = MIRBuilder.buildCopy(LLT::scalar(32), VAHi.getLocReg());
+ if (!STI.isLittle())
+ std::swap(CopyLo, CopyHi);
+
+ Arg.OrigRegs.assign(Arg.Regs.begin(), Arg.Regs.end());
+ Arg.Regs = { CopyLo.getReg(0), CopyHi.getReg(0) };
+ MIRBuilder.buildMerge(Arg.OrigRegs[0], {CopyLo, CopyHi});
+
+ markPhysRegUsed(VALo.getLocReg());
+ markPhysRegUsed(VAHi.getLocReg());
+ return 2;
}
namespace {
-class MipsOutgoingValueHandler : public MipsCallLowering::MipsHandler {
+class MipsOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
+ const MipsSubtarget &STI;
+
public:
MipsOutgoingValueHandler(MachineIRBuilder &MIRBuilder,
MachineRegisterInfo &MRI, MachineInstrBuilder &MIB)
- : MipsHandler(MIRBuilder, MRI), MIB(MIB) {}
+ : OutgoingValueHandler(MIRBuilder, MRI),
+ STI(MIRBuilder.getMF().getSubtarget<MipsSubtarget>()), MIB(MIB) {}
private:
- void assignValueToReg(Register ValVReg, const CCValAssign &VA,
- const EVT &VT) override;
+ void assignValueToReg(Register ValVReg, Register PhysReg,
+ CCValAssign &VA) override;
- Register getStackAddress(const CCValAssign &VA,
- MachineMemOperand *&MMO) override;
+ Register getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override;
- void assignValueToAddress(Register ValVReg, const CCValAssign &VA) override;
-
- bool handleSplit(SmallVectorImpl<Register> &VRegs,
- ArrayRef<CCValAssign> ArgLocs, unsigned ArgLocsStartIndex,
- Register ArgsReg, const EVT &VT) override;
-
- Register extendRegister(Register ValReg, const CCValAssign &VA);
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
+ MachinePointerInfo &MPO, CCValAssign &VA) override;
+ unsigned assignCustomValue(CallLowering::ArgInfo &Arg,
+ ArrayRef<CCValAssign> VAs) override;
MachineInstrBuilder &MIB;
};
} // end anonymous namespace
void MipsOutgoingValueHandler::assignValueToReg(Register ValVReg,
- const CCValAssign &VA,
- const EVT &VT) {
- Register PhysReg = VA.getLocReg();
- if (VT == MVT::f64 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
- const MipsSubtarget &STI =
- static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
- bool IsEL = STI.isLittle();
- auto Unmerge = MIRBuilder.buildUnmerge(LLT::scalar(32), ValVReg);
- MIRBuilder.buildCopy(Register(PhysReg + (IsEL ? 0 : 1)), Unmerge.getReg(0));
- MIRBuilder.buildCopy(Register(PhysReg + (IsEL ? 1 : 0)), Unmerge.getReg(1));
- } else if (VT == MVT::f32 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
- MIRBuilder.buildCopy(PhysReg, ValVReg);
- } else {
- Register ExtReg = extendRegister(ValVReg, VA);
- MIRBuilder.buildCopy(PhysReg, ExtReg);
- MIB.addUse(PhysReg, RegState::Implicit);
- }
+ Register PhysReg,
+ CCValAssign &VA) {
+ Register ExtReg = extendRegister(ValVReg, VA);
+ MIRBuilder.buildCopy(PhysReg, ExtReg);
+ MIB.addUse(PhysReg, RegState::Implicit);
}
-Register MipsOutgoingValueHandler::getStackAddress(const CCValAssign &VA,
- MachineMemOperand *&MMO) {
+Register MipsOutgoingValueHandler::getStackAddress(uint64_t Size,
+ int64_t Offset,
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) {
MachineFunction &MF = MIRBuilder.getMF();
- const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
+ MPO = MachinePointerInfo::getStack(MF, Offset);
LLT p0 = LLT::pointer(0, 32);
LLT s32 = LLT::scalar(32);
auto SPReg = MIRBuilder.buildCopy(p0, Register(Mips::SP));
- unsigned Offset = VA.getLocMemOffset();
auto OffsetReg = MIRBuilder.buildConstant(s32, Offset);
-
auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg);
-
- MachinePointerInfo MPO =
- MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
- unsigned Size = alignTo(VA.getValVT().getSizeInBits(), 8) / 8;
- Align Alignment = commonAlignment(TFL->getStackAlign(), Offset);
- MMO =
- MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, Size, Alignment);
-
return AddrReg.getReg(0);
}
void MipsOutgoingValueHandler::assignValueToAddress(Register ValVReg,
- const CCValAssign &VA) {
- MachineMemOperand *MMO;
- Register Addr = getStackAddress(VA, MMO);
+ Register Addr, LLT MemTy,
+ MachinePointerInfo &MPO,
+ CCValAssign &VA) {
+ MachineFunction &MF = MIRBuilder.getMF();
+ uint64_t LocMemOffset = VA.getLocMemOffset();
+
+ auto MMO = MF.getMachineMemOperand(
+ MPO, MachineMemOperand::MOStore, MemTy,
+ commonAlignment(STI.getStackAlignment(), LocMemOffset));
+
Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
-Register MipsOutgoingValueHandler::extendRegister(Register ValReg,
- const CCValAssign &VA) {
- LLT LocTy{VA.getLocVT()};
- switch (VA.getLocInfo()) {
- case CCValAssign::SExt: {
- return MIRBuilder.buildSExt(LocTy, ValReg).getReg(0);
- }
- case CCValAssign::ZExt: {
- return MIRBuilder.buildZExt(LocTy, ValReg).getReg(0);
- }
- case CCValAssign::AExt: {
- return MIRBuilder.buildAnyExt(LocTy, ValReg).getReg(0);
- }
- // TODO : handle upper extends
- case CCValAssign::Full:
- return ValReg;
- default:
- break;
- }
- llvm_unreachable("unable to extend register");
-}
+unsigned
+MipsOutgoingValueHandler::assignCustomValue(CallLowering::ArgInfo &Arg,
+ ArrayRef<CCValAssign> VAs) {
+ const CCValAssign &VALo = VAs[0];
+ const CCValAssign &VAHi = VAs[1];
-bool MipsOutgoingValueHandler::handleSplit(SmallVectorImpl<Register> &VRegs,
- ArrayRef<CCValAssign> ArgLocs,
- unsigned ArgLocsStartIndex,
- Register ArgsReg, const EVT &VT) {
- MIRBuilder.buildUnmerge(VRegs, ArgsReg);
- setLeastSignificantFirst(VRegs);
- if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex, VT))
- return false;
+ assert(VALo.getLocVT() == MVT::i32 && VAHi.getLocVT() == MVT::i32 &&
+ VALo.getValVT() == MVT::f64 && VAHi.getValVT() == MVT::f64 &&
+ "unexpected custom value");
- return true;
+ auto Unmerge =
+ MIRBuilder.buildUnmerge({LLT::scalar(32), LLT::scalar(32)}, Arg.Regs[0]);
+ Register Lo = Unmerge.getReg(0);
+ Register Hi = Unmerge.getReg(1);
+
+ Arg.OrigRegs.assign(Arg.Regs.begin(), Arg.Regs.end());
+ Arg.Regs = { Lo, Hi };
+ if (!STI.isLittle())
+ std::swap(Lo, Hi);
+
+ MIRBuilder.buildCopy(VALo.getLocReg(), Lo);
+ MIRBuilder.buildCopy(VAHi.getLocReg(), Hi);
+ return 2;
}
static bool isSupportedArgumentType(Type *T) {
@@ -343,36 +298,6 @@
return false;
}
-static CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT,
- const ISD::ArgFlagsTy &Flags) {
- // > does not mean loss of information as type RegisterVT can't hold type VT,
- // it means that type VT is split into multiple registers of type RegisterVT
- if (VT.getFixedSizeInBits() >= RegisterVT.getFixedSizeInBits())
- return CCValAssign::LocInfo::Full;
- if (Flags.isSExt())
- return CCValAssign::LocInfo::SExt;
- if (Flags.isZExt())
- return CCValAssign::LocInfo::ZExt;
- return CCValAssign::LocInfo::AExt;
-}
-
-template <typename T>
-static void setLocInfo(SmallVectorImpl<CCValAssign> &ArgLocs,
- const SmallVectorImpl<T> &Arguments) {
- for (unsigned i = 0; i < ArgLocs.size(); ++i) {
- const CCValAssign &VA = ArgLocs[i];
- CCValAssign::LocInfo LocInfo = determineLocInfo(
- Arguments[i].VT, Arguments[i].ArgVT, Arguments[i].Flags);
- if (VA.isMemLoc())
- ArgLocs[i] =
- CCValAssign::getMem(VA.getValNo(), VA.getValVT(),
- VA.getLocMemOffset(), VA.getLocVT(), LocInfo);
- else
- ArgLocs[i] = CCValAssign::getReg(VA.getValNo(), VA.getValVT(),
- VA.getLocReg(), VA.getLocVT(), LocInfo);
- }
-}
-
bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val, ArrayRef<Register> VRegs,
FunctionLoweringInfo &FLI) const {
@@ -389,26 +314,29 @@
const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
SmallVector<ArgInfo, 8> RetInfos;
- SmallVector<unsigned, 8> OrigArgIndices;
- ArgInfo ArgRetInfo(VRegs, Val->getType());
+ ArgInfo ArgRetInfo(VRegs, *Val, 0);
setArgFlags(ArgRetInfo, AttributeList::ReturnIndex, DL, F);
- splitToValueTypes(DL, ArgRetInfo, 0, RetInfos, OrigArgIndices);
-
- SmallVector<ISD::OutputArg, 8> Outs;
- subTargetRegTypeForCallingConv(F, RetInfos, OrigArgIndices, Outs);
+ splitToValueTypes(ArgRetInfo, RetInfos, DL, F.getCallingConv());
SmallVector<CCValAssign, 16> ArgLocs;
+ SmallVector<ISD::OutputArg, 8> Outs;
+
MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
F.getContext());
- CCInfo.AnalyzeReturn(Outs, TLI.CCAssignFnForReturn());
- setLocInfo(ArgLocs, Outs);
MipsOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret);
- if (!RetHandler.handle(ArgLocs, RetInfos)) {
+ std::string FuncName = F.getName().str();
+ MipsOutgoingValueAssigner Assigner(TLI.CCAssignFnForReturn(),
+ FuncName.c_str(), /*IsReturn*/ true);
+
+ if (!determineAssignments(Assigner, RetInfos, CCInfo))
return false;
- }
+
+ if (!handleAssignments(RetHandler, RetInfos, CCInfo, ArgLocs, MIRBuilder))
+ return false;
}
+
MIRBuilder.insertInstr(Ret);
return true;
}
@@ -432,18 +360,16 @@
const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
SmallVector<ArgInfo, 8> ArgInfos;
- SmallVector<unsigned, 8> OrigArgIndices;
unsigned i = 0;
for (auto &Arg : F.args()) {
- ArgInfo AInfo(VRegs[i], Arg.getType());
+ ArgInfo AInfo(VRegs[i], Arg, i);
setArgFlags(AInfo, i + AttributeList::FirstArgIndex, DL, F);
- ArgInfos.push_back(AInfo);
- OrigArgIndices.push_back(i);
+
+ splitToValueTypes(AInfo, ArgInfos, DL, F.getCallingConv());
++i;
}
SmallVector<ISD::InputArg, 8> Ins;
- subTargetRegTypeForCallingConv(F, ArgInfos, OrigArgIndices, Ins);
SmallVector<CCValAssign, 16> ArgLocs;
MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
@@ -454,11 +380,15 @@
const MipsABIInfo &ABI = TM.getABI();
CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(F.getCallingConv()),
Align(1));
- CCInfo.AnalyzeFormalArguments(Ins, TLI.CCAssignFnForCall());
- setLocInfo(ArgLocs, Ins);
+
+ const std::string FuncName = F.getName().str();
+ MipsIncomingValueAssigner Assigner(TLI.CCAssignFnForCall(), FuncName.c_str(),
+ /*IsReturn*/ false);
+ if (!determineAssignments(Assigner, ArgInfos, CCInfo))
+ return false;
MipsIncomingValueHandler Handler(MIRBuilder, MF.getRegInfo());
- if (!Handler.handle(ArgLocs, ArgInfos))
+ if (!handleAssignments(Handler, ArgInfos, CCInfo, ArgLocs, MIRBuilder))
return false;
if (F.isVarArg()) {
@@ -481,15 +411,16 @@
for (unsigned I = Idx; I < ArgRegs.size(); ++I, VaArgOffset += RegSize) {
MIRBuilder.getMBB().addLiveIn(ArgRegs[I]);
-
+ LLT RegTy = LLT::scalar(RegSize * 8);
MachineInstrBuilder Copy =
- MIRBuilder.buildCopy(LLT::scalar(RegSize * 8), Register(ArgRegs[I]));
+ MIRBuilder.buildCopy(RegTy, Register(ArgRegs[I]));
FI = MFI.CreateFixedObject(RegSize, VaArgOffset, true);
MachinePointerInfo MPO = MachinePointerInfo::getFixedStack(MF, FI);
- MachineInstrBuilder FrameIndex =
- MIRBuilder.buildFrameIndex(LLT::pointer(MPO.getAddrSpace(), 32), FI);
+
+ const LLT PtrTy = LLT::pointer(MPO.getAddrSpace(), 32);
+ auto FrameIndex = MIRBuilder.buildFrameIndex(PtrTy, FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
- MPO, MachineMemOperand::MOStore, RegSize, Align(RegSize));
+ MPO, MachineMemOperand::MOStore, RegTy, Align(RegSize));
MIRBuilder.buildStore(Copy, FrameIndex, *MMO);
}
}
@@ -543,27 +474,14 @@
} else
MIB.add(Info.Callee);
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- MIB.addRegMask(TRI->getCallPreservedMask(MF, F.getCallingConv()));
+ MIB.addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv));
TargetLowering::ArgListTy FuncOrigArgs;
FuncOrigArgs.reserve(Info.OrigArgs.size());
SmallVector<ArgInfo, 8> ArgInfos;
- SmallVector<unsigned, 8> OrigArgIndices;
- unsigned i = 0;
- for (auto &Arg : Info.OrigArgs) {
-
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = Arg.Ty;
- FuncOrigArgs.push_back(Entry);
-
- ArgInfos.push_back(Arg);
- OrigArgIndices.push_back(i);
- ++i;
- }
-
- SmallVector<ISD::OutputArg, 8> Outs;
- subTargetRegTypeForCallingConv(F, ArgInfos, OrigArgIndices, Outs);
+ for (auto &Arg : Info.OrigArgs)
+ splitToValueTypes(Arg, ArgInfos, DL, Info.CallConv);
SmallVector<CCValAssign, 8> ArgLocs;
bool IsCalleeVarArg = false;
@@ -571,24 +489,33 @@
const Function *CF = static_cast<const Function *>(Info.Callee.getGlobal());
IsCalleeVarArg = CF->isVarArg();
}
- MipsCCState CCInfo(F.getCallingConv(), IsCalleeVarArg, MF, ArgLocs,
+
+ // FIXME: Should use MipsCCState::getSpecialCallingConvForCallee, but it
+ // depends on looking directly at the call target.
+ MipsCCState CCInfo(Info.CallConv, IsCalleeVarArg, MF, ArgLocs,
F.getContext());
CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(Info.CallConv),
Align(1));
+
const char *Call =
Info.Callee.isSymbol() ? Info.Callee.getSymbolName() : nullptr;
- CCInfo.AnalyzeCallOperands(Outs, TLI.CCAssignFnForCall(), FuncOrigArgs, Call);
- setLocInfo(ArgLocs, Outs);
- MipsOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), MIB);
- if (!RetHandler.handle(ArgLocs, ArgInfos)) {
+ MipsOutgoingValueAssigner Assigner(TLI.CCAssignFnForCall(), Call,
+ /*IsReturn*/ false);
+ if (!determineAssignments(Assigner, ArgInfos, CCInfo))
return false;
- }
+
+ MipsOutgoingValueHandler ArgHandler(MIRBuilder, MF.getRegInfo(), MIB);
+ if (!handleAssignments(ArgHandler, ArgInfos, CCInfo, ArgLocs, MIRBuilder))
+ return false;
unsigned NextStackOffset = CCInfo.getNextStackOffset();
- const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
- unsigned StackAlignment = TFL->getStackAlignment();
+ unsigned StackAlignment = F.getParent()->getOverrideStackAlignment();
+ if (!StackAlignment) {
+ const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
+ StackAlignment = TFL->getStackAlignment();
+ }
NextStackOffset = alignTo(NextStackOffset, StackAlignment);
CallSeqStart.addImm(NextStackOffset).addImm(0);
@@ -608,23 +535,25 @@
if (!Info.OrigRet.Ty->isVoidTy()) {
ArgInfos.clear();
- SmallVector<unsigned, 8> OrigRetIndices;
- splitToValueTypes(DL, Info.OrigRet, 0, ArgInfos, OrigRetIndices);
+ CallLowering::splitToValueTypes(Info.OrigRet, ArgInfos, DL,
+ F.getCallingConv());
+ const std::string FuncName = F.getName().str();
SmallVector<ISD::InputArg, 8> Ins;
- subTargetRegTypeForCallingConv(F, ArgInfos, OrigRetIndices, Ins);
-
SmallVector<CCValAssign, 8> ArgLocs;
+ MipsIncomingValueAssigner Assigner(TLI.CCAssignFnForReturn(),
+ FuncName.c_str(),
+ /*IsReturn*/ true);
+ CallReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), MIB);
+
MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
F.getContext());
- CCInfo.AnalyzeCallResult(Ins, TLI.CCAssignFnForReturn(), Info.OrigRet.Ty,
- Call);
- setLocInfo(ArgLocs, Ins);
+ if (!determineAssignments(Assigner, ArgInfos, CCInfo))
+ return false;
- CallReturnHandler Handler(MIRBuilder, MF.getRegInfo(), MIB);
- if (!Handler.handle(ArgLocs, ArgInfos))
+ if (!handleAssignments(RetHandler, ArgInfos, CCInfo, ArgLocs, MIRBuilder))
return false;
}
@@ -632,54 +561,3 @@
return true;
}
-
-template <typename T>
-void MipsCallLowering::subTargetRegTypeForCallingConv(
- const Function &F, ArrayRef<ArgInfo> Args,
- ArrayRef<unsigned> OrigArgIndices, SmallVectorImpl<T> &ISDArgs) const {
- const DataLayout &DL = F.getParent()->getDataLayout();
- const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
-
- unsigned ArgNo = 0;
- for (auto &Arg : Args) {
-
- EVT VT = TLI.getValueType(DL, Arg.Ty);
- MVT RegisterVT = TLI.getRegisterTypeForCallingConv(F.getContext(),
- F.getCallingConv(), VT);
- unsigned NumRegs = TLI.getNumRegistersForCallingConv(
- F.getContext(), F.getCallingConv(), VT);
-
- for (unsigned i = 0; i < NumRegs; ++i) {
- ISD::ArgFlagsTy Flags = Arg.Flags[0];
-
- if (i == 0)
- Flags.setOrigAlign(TLI.getABIAlignmentForCallingConv(Arg.Ty, DL));
- else
- Flags.setOrigAlign(Align(1));
-
- ISDArgs.emplace_back(Flags, RegisterVT, VT, true, OrigArgIndices[ArgNo],
- 0);
- }
- ++ArgNo;
- }
-}
-
-void MipsCallLowering::splitToValueTypes(
- const DataLayout &DL, const ArgInfo &OrigArg, unsigned OriginalIndex,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- SmallVectorImpl<unsigned> &SplitArgsOrigIndices) const {
-
- SmallVector<EVT, 4> SplitEVTs;
- SmallVector<Register, 4> SplitVRegs;
- const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
- LLVMContext &Ctx = OrigArg.Ty->getContext();
-
- ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitEVTs);
-
- for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
- ArgInfo Info = ArgInfo{OrigArg.Regs[i], SplitEVTs[i].getTypeForEVT(Ctx)};
- Info.Flags = OrigArg.Flags;
- SplitArgs.push_back(Info);
- SplitArgsOrigIndices.push_back(OriginalIndex);
- }
-}
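
The rewrite above retires the Mips-specific MipsHandler in favor of LLVM's generic value-assigner/value-handler split: the assigner (driven by determineAssignments) decides where each piece of an argument lives, and the handler (driven by handleAssignments) emits the copies, loads and stores for those locations. The following plain-C++ toy mirrors only that division of labor; it uses no LLVM types, and the four "argument registers" merely mimic $a0-$a3 for illustration.

#include <iostream>
#include <string>
#include <vector>

struct Loc { bool InReg; unsigned RegOrOffset; };

struct Assigner {                       // analogue of a ValueAssigner
  unsigned NextReg = 0, NextStack = 0;
  Loc assign(unsigned Size) {
    if (NextReg < 4)                    // four "argument registers"
      return {true, NextReg++};
    Loc L{false, NextStack};            // everything else goes to the stack
    NextStack += Size;
    return L;
  }
};

struct Handler {                        // analogue of a ValueHandler
  void emit(const std::string &Val, Loc L) {
    if (L.InReg)
      std::cout << "copy " << Val << " -> $a" << L.RegOrOffset << "\n";
    else
      std::cout << "store " << Val << " -> sp+" << L.RegOrOffset << "\n";
  }
};

int main() {
  Assigner A;
  Handler H;
  const std::vector<std::string> Args = {"x", "y", "z", "w", "extra"};
  for (const auto &Arg : Args)
    H.emit(Arg, A.assign(/*Size=*/4));  // "extra" spills to the stack
  return 0;
}
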
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.h b/src/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.h
index 1c1c208..1d1406d 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.h
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.h
@@ -22,45 +22,7 @@
class MipsTargetLowering;
class MipsCallLowering : public CallLowering {
-
public:
- class MipsHandler {
- public:
- MipsHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
- : MIRBuilder(MIRBuilder), MRI(MRI) {}
-
- virtual ~MipsHandler() = default;
-
- bool handle(ArrayRef<CCValAssign> ArgLocs,
- ArrayRef<CallLowering::ArgInfo> Args);
-
- protected:
- bool assignVRegs(ArrayRef<Register> VRegs, ArrayRef<CCValAssign> ArgLocs,
- unsigned ArgLocsStartIndex, const EVT &VT);
-
- void setLeastSignificantFirst(SmallVectorImpl<Register> &VRegs);
-
- MachineIRBuilder &MIRBuilder;
- MachineRegisterInfo &MRI;
-
- private:
- bool assign(Register VReg, const CCValAssign &VA, const EVT &VT);
-
- virtual Register getStackAddress(const CCValAssign &VA,
- MachineMemOperand *&MMO) = 0;
-
- virtual void assignValueToReg(Register ValVReg, const CCValAssign &VA,
- const EVT &VT) = 0;
-
- virtual void assignValueToAddress(Register ValVReg,
- const CCValAssign &VA) = 0;
-
- virtual bool handleSplit(SmallVectorImpl<Register> &VRegs,
- ArrayRef<CCValAssign> ArgLocs,
- unsigned ArgLocsStartIndex, Register ArgsReg,
- const EVT &VT) = 0;
- };
-
MipsCallLowering(const MipsTargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
@@ -73,22 +35,6 @@
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;
-
-private:
- /// Based on registers available on target machine split or extend
- /// type if needed, also change pointer type to appropriate integer
- /// type.
- template <typename T>
- void subTargetRegTypeForCallingConv(const Function &F, ArrayRef<ArgInfo> Args,
- ArrayRef<unsigned> OrigArgIndices,
- SmallVectorImpl<T> &ISDArgs) const;
-
- /// Split structures and arrays, save original argument indices since
- /// Mips calling convention needs info about original argument type.
- void splitToValueTypes(const DataLayout &DL, const ArgInfo &OrigArg,
- unsigned OriginalIndex,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- SmallVectorImpl<unsigned> &SplitArgsOrigIndices) const;
};
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsFastISel.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsFastISel.cpp
index 8a847ea..e963185 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -228,14 +228,13 @@
unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill);
+ unsigned Op0, unsigned Op1);
// for some reason, this default is not generated by tablegen
// so we explicitly generate it here.
unsigned fastEmitInst_riir(uint64_t inst, const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill, uint64_t imm1,
- uint64_t imm2, unsigned Op3, bool Op3IsKill) {
+ unsigned Op0, uint64_t imm1, uint64_t imm2,
+ unsigned Op3) {
return 0;
}
@@ -2122,8 +2121,7 @@
unsigned MipsFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill) {
+ unsigned Op0, unsigned Op1) {
// We treat the MUL instruction in a special way because it clobbers
// the HI0 & LO0 registers. The TableGen definition of this instruction can
// mark these registers only as implicitly defined. As a result, the
@@ -2136,15 +2134,14 @@
Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
- .addReg(Op0, getKillRegState(Op0IsKill))
- .addReg(Op1, getKillRegState(Op1IsKill))
+ .addReg(Op0)
+ .addReg(Op1)
.addReg(Mips::HI0, RegState::ImplicitDefine | RegState::Dead)
.addReg(Mips::LO0, RegState::ImplicitDefine | RegState::Dead);
return ResultReg;
}
- return FastISel::fastEmitInst_rr(MachineInstOpcode, RC, Op0, Op0IsKill, Op1,
- Op1IsKill);
+ return FastISel::fastEmitInst_rr(MachineInstOpcode, RC, Op0, Op1);
}
namespace llvm {
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsFrameLowering.cpp
index 8d5eabf..99d225f 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsFrameLowering.cpp
@@ -95,15 +95,15 @@
const TargetRegisterInfo *TRI = STI.getRegisterInfo();
return MF.getTarget().Options.DisableFramePointerElim(MF) ||
- MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
- TRI->needsStackRealignment(MF);
+ MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+ TRI->hasStackRealignment(MF);
}
bool MipsFrameLowering::hasBP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *TRI = STI.getRegisterInfo();
- return MFI.hasVarSizedObjects() && TRI->needsStackRealignment(MF);
+ return MFI.hasVarSizedObjects() && TRI->hasStackRealignment(MF);
}
// Estimate the size of the stack, including the incoming arguments. We need to
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 8b599bc..9399c94 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -122,9 +122,7 @@
CallingConv::ID CC,
EVT VT) const {
if (VT.isVector())
- return std::max(((unsigned)VT.getSizeInBits() /
- (Subtarget.isABI_O32() ? 32 : 64)),
- 1U);
+ return divideCeil(VT.getSizeInBits(), Subtarget.isABI_O32() ? 32 : 64);
return MipsTargetLowering::getNumRegisters(Context, VT);
}
@@ -134,10 +132,10 @@
// Break down vector types to either 2 i64s or 4 i32s.
RegisterVT = getRegisterTypeForCallingConv(Context, CC, VT);
IntermediateVT = RegisterVT;
- NumIntermediates = VT.getFixedSizeInBits() < RegisterVT.getFixedSizeInBits()
- ? VT.getVectorNumElements()
- : VT.getSizeInBits() / RegisterVT.getSizeInBits();
-
+ NumIntermediates =
+ VT.getFixedSizeInBits() < RegisterVT.getFixedSizeInBits()
+ ? VT.getVectorNumElements()
+ : divideCeil(VT.getSizeInBits(), RegisterVT.getSizeInBits());
return NumIntermediates;
}
@@ -2928,13 +2926,23 @@
Reg = State.AllocateReg(IntRegs);
LocVT = MVT::i32;
} else if (ValVT == MVT::f64 && AllocateFloatsInIntReg) {
+ LocVT = MVT::i32;
+
// Allocate int register and shadow next int register. If first
// available register is Mips::A1 or Mips::A3, shadow it too.
Reg = State.AllocateReg(IntRegs);
if (Reg == Mips::A1 || Reg == Mips::A3)
Reg = State.AllocateReg(IntRegs);
- State.AllocateReg(IntRegs);
- LocVT = MVT::i32;
+
+ if (Reg) {
+ State.addLoc(
+ CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ MCRegister HiReg = State.AllocateReg(IntRegs);
+ assert(HiReg);
+ State.addLoc(
+ CCValAssign::getCustomReg(ValNo, ValVT, HiReg, LocVT, LocInfo));
+ return false;
+ }
} else if (ValVT.isFloatingPoint() && !AllocateFloatsInIntReg) {
// we are guaranteed to find an available float register
if (ValVT == MVT::f32) {
@@ -2994,12 +3002,6 @@
// Call Calling Convention Implementation
//===----------------------------------------------------------------------===//
-// Return next O32 integer argument register.
-static unsigned getNextIntArgReg(unsigned Reg) {
- assert((Reg == Mips::A0) || (Reg == Mips::A2));
- return (Reg == Mips::A0) ? Mips::A1 : Mips::A3;
-}
-
SDValue MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
SDValue Chain, SDValue Arg,
const SDLoc &DL, bool IsTailCall,
@@ -3251,11 +3253,11 @@
CCInfo.rewindByValRegsInfo();
// Walk the register/memloc assignments, inserting copies/loads.
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- SDValue Arg = OutVals[i];
+ for (unsigned i = 0, e = ArgLocs.size(), OutIdx = 0; i != e; ++i, ++OutIdx) {
+ SDValue Arg = OutVals[OutIdx];
CCValAssign &VA = ArgLocs[i];
MVT ValVT = VA.getValVT(), LocVT = VA.getLocVT();
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ ISD::ArgFlagsTy Flags = Outs[OutIdx].Flags;
bool UseUpperBits = false;
// ByVal Arg.
@@ -3293,8 +3295,11 @@
Arg, DAG.getConstant(1, DL, MVT::i32));
if (!Subtarget.isLittle())
std::swap(Lo, Hi);
+
+ assert(VA.needsCustom());
+
Register LocRegLo = VA.getLocReg();
- unsigned LocRegHigh = getNextIntArgReg(LocRegLo);
+ Register LocRegHigh = ArgLocs[++i].getLocReg();
RegsToPass.push_back(std::make_pair(LocRegLo, Lo));
RegsToPass.push_back(std::make_pair(LocRegHigh, Hi));
continue;
@@ -3325,7 +3330,7 @@
}
if (UseUpperBits) {
- unsigned ValSizeInBits = Outs[i].ArgVT.getSizeInBits();
+ unsigned ValSizeInBits = Outs[OutIdx].ArgVT.getSizeInBits();
unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
Arg = DAG.getNode(
ISD::SHL, DL, VA.getLocVT(), Arg,
@@ -3636,18 +3641,18 @@
unsigned CurArgIdx = 0;
CCInfo.rewindByValRegsInfo();
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ for (unsigned i = 0, e = ArgLocs.size(), InsIdx = 0; i != e; ++i, ++InsIdx) {
CCValAssign &VA = ArgLocs[i];
- if (Ins[i].isOrigArg()) {
- std::advance(FuncArg, Ins[i].getOrigArgIndex() - CurArgIdx);
- CurArgIdx = Ins[i].getOrigArgIndex();
+ if (Ins[InsIdx].isOrigArg()) {
+ std::advance(FuncArg, Ins[InsIdx].getOrigArgIndex() - CurArgIdx);
+ CurArgIdx = Ins[InsIdx].getOrigArgIndex();
}
EVT ValVT = VA.getValVT();
- ISD::ArgFlagsTy Flags = Ins[i].Flags;
+ ISD::ArgFlagsTy Flags = Ins[InsIdx].Flags;
bool IsRegLoc = VA.isRegLoc();
if (Flags.isByVal()) {
- assert(Ins[i].isOrigArg() && "Byval arguments cannot be implicit");
+ assert(Ins[InsIdx].isOrigArg() && "Byval arguments cannot be implicit");
unsigned FirstByValReg, LastByValReg;
unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);
@@ -3672,7 +3677,8 @@
unsigned Reg = addLiveIn(DAG.getMachineFunction(), ArgReg, RC);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
- ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG);
+ ArgValue =
+ UnpackFromArgumentSlot(ArgValue, VA, Ins[InsIdx].ArgVT, DL, DAG);
// Handle floating point arguments passed in integer registers and
// long double arguments passed in floating point registers.
@@ -3682,8 +3688,10 @@
ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue);
else if (ABI.IsO32() && RegVT == MVT::i32 &&
ValVT == MVT::f64) {
- unsigned Reg2 = addLiveIn(DAG.getMachineFunction(),
- getNextIntArgReg(ArgReg), RC);
+ assert(VA.needsCustom() && "Expected custom argument for f64 split");
+ CCValAssign &NextVA = ArgLocs[++i];
+ unsigned Reg2 =
+ addLiveIn(DAG.getMachineFunction(), NextVA.getLocReg(), RC);
SDValue ArgValue2 = DAG.getCopyFromReg(Chain, DL, Reg2, RegVT);
if (!Subtarget.isLittle())
std::swap(ArgValue, ArgValue2);
@@ -3695,6 +3703,8 @@
} else { // VA.isRegLoc()
MVT LocVT = VA.getLocVT();
+ assert(!VA.needsCustom() && "unexpected custom memory argument");
+
if (ABI.IsO32()) {
// We ought to be able to use LocVT directly but O32 sets it to i32
// when allocating floating point values to integer registers.
@@ -3718,17 +3728,24 @@
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
OutChains.push_back(ArgValue.getValue(1));
- ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG);
+ ArgValue =
+ UnpackFromArgumentSlot(ArgValue, VA, Ins[InsIdx].ArgVT, DL, DAG);
InVals.push_back(ArgValue);
}
}
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ for (unsigned i = 0, e = ArgLocs.size(), InsIdx = 0; i != e; ++i, ++InsIdx) {
+
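+ // A custom location marks an f64 split across two registers; skip the extra
+ // ArgLocs entry for the high half, which has no Ins entry of its own.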
+ if (ArgLocs[i].needsCustom()) {
+ ++i;
+ continue;
+ }
+
// The mips ABIs for returning structs by value require that we copy
// the sret argument into $v0 for the return. Save the argument into
// a virtual register so that we can access it from the return points.
- if (Ins[i].Flags.isSRet()) {
+ if (Ins[InsIdx].Flags.isSRet()) {
unsigned Reg = MipsFI->getSRetReturnReg();
if (!Reg) {
Reg = MF.getRegInfo().createVirtualRegister(
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.h b/src/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.h
index 3820c42..3905a18 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -303,7 +303,7 @@
/// Return the correct alignment for the current calling convention.
Align getABIAlignmentForCallingConv(Type *ArgTy,
- DataLayout DL) const override {
+ const DataLayout &DL) const override {
const Align ABIAlign = DL.getABITypeAlign(ArgTy);
if (ArgTy->isVectorTy())
return std::min(ABIAlign, Align(8));
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
index 2692c08..588b7e85 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -37,7 +37,7 @@
static bool
CheckTy0Ty1MemSizeAlign(const LegalityQuery &Query,
std::initializer_list<TypesAndMemOps> SupportedValues) {
- unsigned QueryMemSize = Query.MMODescrs[0].SizeInBits;
+ unsigned QueryMemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
// Non power of two memory access is never legal.
if (!isPowerOf2_64(QueryMemSize))
@@ -60,22 +60,21 @@
static bool CheckTyN(unsigned N, const LegalityQuery &Query,
std::initializer_list<LLT> SupportedValues) {
- for (auto &Val : SupportedValues)
- if (Val == Query.Types[N])
- return true;
- return false;
+ return llvm::is_contained(SupportedValues, Query.Types[N]);
}
MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
using namespace TargetOpcode;
const LLT s1 = LLT::scalar(1);
+ const LLT s8 = LLT::scalar(8);
+ const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
- const LLT v16s8 = LLT::vector(16, 8);
- const LLT v8s16 = LLT::vector(8, 16);
- const LLT v4s32 = LLT::vector(4, 32);
- const LLT v2s64 = LLT::vector(2, 64);
+ const LLT v16s8 = LLT::fixed_vector(16, 8);
+ const LLT v8s16 = LLT::fixed_vector(8, 16);
+ const LLT v4s32 = LLT::fixed_vector(4, 32);
+ const LLT v2s64 = LLT::fixed_vector(2, 64);
const LLT p0 = LLT::pointer(0, 32);
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
@@ -128,13 +127,13 @@
return false;
unsigned Size = Query.Types[0].getSizeInBits();
- unsigned QueryMemSize = Query.MMODescrs[0].SizeInBits;
+ unsigned QueryMemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
assert(QueryMemSize <= Size && "Scalar can't hold MemSize");
if (Size > 64 || QueryMemSize > 64)
return false;
- if (!isPowerOf2_64(Query.MMODescrs[0].SizeInBits))
+ if (!isPowerOf2_64(Query.MMODescrs[0].MemoryTy.getSizeInBits()))
return true;
if (!ST.systemSupportsUnalignedAccess() &&
@@ -146,7 +145,8 @@
return false;
})
- .minScalar(0, s32);
+ .minScalar(0, s32)
+ .lower();
getActionDefinitionsBuilder(G_IMPLICIT_DEF)
.legalFor({s32, s64});
@@ -158,8 +158,8 @@
.legalFor({{s64, s32}});
getActionDefinitionsBuilder({G_ZEXTLOAD, G_SEXTLOAD})
- .legalForTypesWithMemDesc({{s32, p0, 8, 8},
- {s32, p0, 16, 8}})
+ .legalForTypesWithMemDesc({{s32, p0, s8, 8},
+ {s32, p0, s16, 8}})
.clampScalar(0, s32, s32);
getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
@@ -324,7 +324,7 @@
getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();
- computeTables();
+ getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
@@ -516,13 +516,14 @@
}
case Intrinsic::vacopy: {
MachinePointerInfo MPO;
+ LLT PtrTy = LLT::pointer(0, 32);
auto Tmp =
- MIRBuilder.buildLoad(LLT::pointer(0, 32), MI.getOperand(2),
+ MIRBuilder.buildLoad(PtrTy, MI.getOperand(2),
*MI.getMF()->getMachineMemOperand(
- MPO, MachineMemOperand::MOLoad, 4, Align(4)));
+ MPO, MachineMemOperand::MOLoad, PtrTy, Align(4)));
MIRBuilder.buildStore(Tmp, MI.getOperand(1),
*MI.getMF()->getMachineMemOperand(
- MPO, MachineMemOperand::MOStore, 4, Align(4)));
+ MPO, MachineMemOperand::MOStore, PtrTy, Align(4)));
MI.eraseFromParent();
return true;
}
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsMSAInstrInfo.td b/src/llvm-project/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
index 3e32574..301f1c1 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -365,18 +365,14 @@
}]>;
def vbclr_b : PatFrag<(ops node:$ws, node:$wt),
- (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
- immAllOnesV))>;
+ (and node:$ws, (vnot (shl vsplat_imm_eq_1, node:$wt)))>;
def vbclr_h : PatFrag<(ops node:$ws, node:$wt),
- (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
- immAllOnesV))>;
+ (and node:$ws, (vnot (shl vsplat_imm_eq_1, node:$wt)))>;
def vbclr_w : PatFrag<(ops node:$ws, node:$wt),
- (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
- immAllOnesV))>;
+ (and node:$ws, (vnot (shl vsplat_imm_eq_1, node:$wt)))>;
def vbclr_d : PatFrag<(ops node:$ws, node:$wt),
- (and node:$ws, (xor (shl (v2i64 vsplati64_imm_eq_1),
- node:$wt),
- (bitconvert (v4i32 immAllOnesV))))>;
+ (and node:$ws, (vnot (shl (v2i64 vsplati64_imm_eq_1),
+ node:$wt)))>;
def vbneg_b : PatFrag<(ops node:$ws, node:$wt),
(xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
@@ -3884,21 +3880,17 @@
defm : MSABitPats<xor, "BNEG">;
defm : MSABitPats<or, "BSET">;
-def : MSAPat<(and v16i8:$ws, (xor (shl vsplat_imm_eq_1,
- (vsplati8imm7 v16i8:$wt)),
- immAllOnesV)),
+def : MSAPat<(and v16i8:$ws, (vnot (shl vsplat_imm_eq_1,
+ (vsplati8imm7 v16i8:$wt)))),
(v16i8 (BCLR_B v16i8:$ws, v16i8:$wt))>;
-def : MSAPat<(and v8i16:$ws, (xor (shl vsplat_imm_eq_1,
- (vsplati16imm15 v8i16:$wt)),
- immAllOnesV)),
+def : MSAPat<(and v8i16:$ws, (vnot (shl vsplat_imm_eq_1,
+ (vsplati16imm15 v8i16:$wt)))),
(v8i16 (BCLR_H v8i16:$ws, v8i16:$wt))>;
-def : MSAPat<(and v4i32:$ws, (xor (shl vsplat_imm_eq_1,
- (vsplati32imm31 v4i32:$wt)),
- immAllOnesV)),
+def : MSAPat<(and v4i32:$ws, (vnot (shl vsplat_imm_eq_1,
+ (vsplati32imm31 v4i32:$wt)))),
(v4i32 (BCLR_W v4i32:$ws, v4i32:$wt))>;
-def : MSAPat<(and v2i64:$ws, (xor (shl (v2i64 vsplati64_imm_eq_1),
- (vsplati64imm63 v2i64:$wt)),
- (bitconvert (v4i32 immAllOnesV)))),
+def : MSAPat<(and v2i64:$ws, (vnot (shl (v2i64 vsplati64_imm_eq_1),
+ (vsplati64imm63 v2i64:$wt)))),
(v2i64 (BCLR_D v2i64:$ws, v2i64:$wt))>;
// Vector extraction with fixed index.
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
index 310e54b..2ad9ffe 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
@@ -42,6 +42,8 @@
switch (MI.getOpcode()) {
default:
return false;
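+ // G_MEMCPY_INLINE has no libcall fallback, so it must be expanded here.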
+ case TargetOpcode::G_MEMCPY_INLINE:
+ return Helper.tryEmitMemcpyInline(MI);
case TargetOpcode::G_LOAD:
case TargetOpcode::G_SEXTLOAD:
case TargetOpcode::G_ZEXTLOAD: {
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
index 3101820..04b69c6 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -716,10 +716,11 @@
static void
combineAwayG_UNMERGE_VALUES(LegalizationArtifactCombiner &ArtCombiner,
- MachineInstr &MI, GISelChangeObserver &Observer) {
+ GUnmerge &MI, GISelChangeObserver &Observer) {
SmallVector<Register, 4> UpdatedDefs;
SmallVector<MachineInstr *, 2> DeadInstrs;
- ArtCombiner.tryCombineUnmergeValues(MI, DeadInstrs, UpdatedDefs, Observer);
+ ArtCombiner.tryCombineUnmergeValues(MI, DeadInstrs,
+ UpdatedDefs, Observer);
for (MachineInstr *DeadMI : DeadInstrs)
DeadMI->eraseFromParent();
}
@@ -750,8 +751,8 @@
// This is a new G_UNMERGE that was created during narrowScalar and will
// not be considered for regbank selection. RegBankSelect for mips
// visits/makes corresponding G_MERGE first. Combine them here.
- if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES)
- combineAwayG_UNMERGE_VALUES(ArtCombiner, *NewMI, NewInstrObserver);
+ if (auto *Unmerge = dyn_cast<GUnmerge>(NewMI))
+ combineAwayG_UNMERGE_VALUES(ArtCombiner, *Unmerge, NewInstrObserver);
// This G_MERGE will be combined away when its corresponding G_UNMERGE
// gets regBankSelected.
else if (NewMI->getOpcode() == TargetOpcode::G_MERGE_VALUES)
@@ -763,7 +764,8 @@
return;
}
case TargetOpcode::G_UNMERGE_VALUES:
- combineAwayG_UNMERGE_VALUES(ArtCombiner, MI, NewInstrObserver);
+ combineAwayG_UNMERGE_VALUES(ArtCombiner, cast<GUnmerge>(MI),
+ NewInstrObserver);
return;
default:
break;
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
index 3452bf4..7cba311 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -198,8 +198,7 @@
// Reserve the base register if we need to both realign the stack and
// allocate variable-sized objects at runtime. This should test the
// same conditions as MipsFrameLowering::hasBP().
- if (needsStackRealignment(MF) &&
- MF.getFrameInfo().hasVarSizedObjects()) {
+ if (hasStackRealignment(MF) && MF.getFrameInfo().hasVarSizedObjects()) {
Reserved.set(Mips::S7);
Reserved.set(Mips::S7_64);
}
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
index f31ba06..bb4b9c6 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -535,7 +535,7 @@
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
- if (RegInfo.needsStackRealignment(MF)) {
+ if (RegInfo.hasStackRealignment(MF)) {
// addiu $Reg, $zero, -MaxAlignment
// andi $sp, $sp, $Reg
Register VR = MF.getRegInfo().createVirtualRegister(RC);
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/src/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.h
index bed2776..226f787 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.h
@@ -10,8 +10,6 @@
#define LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
#include "MipsFrameLowering.h"
-#include "llvm/Support/TypeSize.h"
-#include <vector>
namespace llvm {
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index 4a448a5..37d4313 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -422,7 +422,7 @@
}
bool MipsSETargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned, unsigned, MachineMemOperand::Flags, bool *Fast) const {
+ EVT VT, unsigned, Align, MachineMemOperand::Flags, bool *Fast) const {
MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
if (Subtarget.systemSupportsUnalignedAccess()) {
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.h b/src/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.h
index 433d019..0ee36ae 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.h
@@ -41,7 +41,7 @@
const TargetRegisterClass *RC);
bool allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AS = 0, unsigned Align = 1,
+ EVT VT, unsigned AS = 0, Align Alignment = Align(1),
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *Fast = nullptr) const override;
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
index a48088c..b05e9ad 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -180,7 +180,7 @@
if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI ||
IsISRRegFI)
FrameReg = ABI.GetStackPtr();
- else if (RegInfo->needsStackRealignment(MF)) {
+ else if (RegInfo->hasStackRealignment(MF)) {
if (MFI.hasVarSizedObjects() && !MFI.isFixedObjectIndex(FrameIndex))
FrameReg = ABI.GetBasePtr();
else if (MFI.isFixedObjectIndex(FrameIndex))
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.cpp
index 7e2c431..7dd030f 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -120,15 +120,11 @@
getEffectiveCodeModel(CM, CodeModel::Small), OL),
isLittle(isLittle), TLOF(std::make_unique<MipsTargetObjectFile>()),
ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)),
- Subtarget(nullptr),
- DefaultSubtarget(TT, CPU, FS, isLittle, *this,
- MaybeAlign(Options.StackAlignmentOverride)),
+ Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this, None),
NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16",
- isLittle, *this,
- MaybeAlign(Options.StackAlignmentOverride)),
+ isLittle, *this, None),
Mips16Subtarget(TT, CPU, FS.empty() ? "+mips16" : FS.str() + ",+mips16",
- isLittle, *this,
- MaybeAlign(Options.StackAlignmentOverride)) {
+ isLittle, *this, None) {
Subtarget = &DefaultSubtarget;
initAsmInfo();
@@ -176,9 +172,7 @@
// FIXME: This is related to the code below to reset the target options,
// we need to know whether or not the soft float flag is set on the
// function, so we can enable it as a subtarget feature.
- bool softFloat =
- F.hasFnAttribute("use-soft-float") &&
- F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ bool softFloat = F.getFnAttribute("use-soft-float").getValueAsBool();
if (hasMips16Attr)
FS += FS.empty() ? "+mips16" : ",+mips16";
@@ -199,7 +193,7 @@
resetTargetOptions(F);
I = std::make_unique<MipsSubtarget>(
TargetTriple, CPU, FS, isLittle, *this,
- MaybeAlign(Options.StackAlignmentOverride));
+ MaybeAlign(F.getParent()->getOverrideStackAlignment()));
}
return I.get();
}
@@ -335,6 +329,6 @@
}
bool MipsPassConfig::addGlobalInstructionSelect() {
- addPass(new InstructionSelect());
+ addPass(new InstructionSelect(getOptLevel()));
return false;
}
diff --git a/src/llvm-project/llvm/lib/Target/Mips/MipsTargetStreamer.h b/src/llvm-project/llvm/lib/Target/Mips/MipsTargetStreamer.h
index f4282f5..44615b9 100644
--- a/src/llvm-project/llvm/lib/Target/Mips/MipsTargetStreamer.h
+++ b/src/llvm-project/llvm/lib/Target/Mips/MipsTargetStreamer.h
@@ -19,6 +19,8 @@
namespace llvm {
+class formatted_raw_ostream;
+
class MipsTargetStreamer : public MCTargetStreamer {
public:
MipsTargetStreamer(MCStreamer &S);
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/NVPTX/CMakeLists.txt
index 6a678ec..4db593b 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -12,6 +12,7 @@
set(NVPTXCodeGen_sources
NVPTXAllocaHoisting.cpp
+ NVPTXAtomicLower.cpp
NVPTXAsmPrinter.cpp
NVPTXAssignValidGlobalNames.cpp
NVPTXFrameLowering.cpp
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/src/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
index cdb70ff..fe335f1 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -94,7 +94,8 @@
outputDwarfFileDirectives();
OS << "\t.section";
Section->PrintSwitchToSection(*getStreamer().getContext().getAsmInfo(),
- FI->getTargetTriple(), OS, SubSection);
+ getStreamer().getContext().getTargetTriple(),
+ OS, SubSection);
// DWARF sections are enclosed into braces - emit the open one.
OS << "\t{\n";
HasSections = true;
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTX.td b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTX.td
index 2b39e9f4..2b0972b 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTX.td
@@ -57,6 +57,8 @@
"Target SM 7.5">;
def SM80 : SubtargetFeature<"sm_80", "SmVersion", "80",
"Target SM 8.0">;
+def SM86 : SubtargetFeature<"sm_86", "SmVersion", "86",
+ "Target SM 8.6">;
// PTX Versions
def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
@@ -83,6 +85,10 @@
"Use PTX version 6.5">;
def PTX70 : SubtargetFeature<"ptx70", "PTXVersion", "70",
"Use PTX version 7.0">;
+def PTX71 : SubtargetFeature<"ptx71", "PTXVersion", "71",
+ "Use PTX version 7.1">;
+def PTX72 : SubtargetFeature<"ptx72", "PTXVersion", "72",
+ "Use PTX version 7.2">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
@@ -107,6 +113,7 @@
def : Proc<"sm_72", [SM72, PTX61]>;
def : Proc<"sm_75", [SM75, PTX63]>;
def : Proc<"sm_80", [SM80, PTX70]>;
+def : Proc<"sm_86", [SM86, PTX71]>;
def NVPTXInstrInfo : InstrInfo {
}
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp
new file mode 100644
index 0000000..10bf56f
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp
@@ -0,0 +1,70 @@
+//===-- NVPTXAtomicLower.cpp - Lower atomics of local memory ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower atomics of local memory to simple load/stores
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXAtomicLower.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Transforms/Scalar/LowerAtomic.h"
+
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+using namespace llvm;
+
+namespace {
+// Lower atomic read-modify-write operations on local memory into simple
+// load/op/store sequences.
+class NVPTXAtomicLower : public FunctionPass {
+public:
+ static char ID; // Pass ID
+ NVPTXAtomicLower() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override {
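+ // The byval argument can be read directly from parameter space only if every
+ // transitive user is a load, GEP, bitcast, or addrspacecast to param space.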
+ return "NVPTX lower atomics of local memory";
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
+bool NVPTXAtomicLower::runOnFunction(Function &F) {
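+ // Collect RMW atomics on .local memory; that address space is private to a
+ // thread, so they can safely be expanded to non-atomic load/modify/store.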
+ SmallVector<AtomicRMWInst *> LocalMemoryAtomics;
+ for (Instruction &I : instructions(F))
+ if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&I))
+ if (RMWI->getPointerAddressSpace() == ADDRESS_SPACE_LOCAL)
+ LocalMemoryAtomics.push_back(RMWI);
+
+ bool Changed = false;
+ for (AtomicRMWInst *RMWI : LocalMemoryAtomics)
+ Changed |= lowerAtomicRMWInst(RMWI);
+ return Changed;
+}
+
+char NVPTXAtomicLower::ID = 0;
+
+namespace llvm {
+void initializeNVPTXAtomicLowerPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(NVPTXAtomicLower, "nvptx-atomic-lower",
+ "Lower atomics of local memory to simple load/stores", false,
+ false)
+
+FunctionPass *llvm::createNVPTXAtomicLowerPass() {
+ return new NVPTXAtomicLower();
+}
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXAtomicLower.h b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXAtomicLower.h
new file mode 100644
index 0000000..faf5765
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXAtomicLower.h
@@ -0,0 +1,22 @@
+//===-- NVPTXAtomicLower.h - Lower atomics of local memory ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower atomics of local memory to simple load/stores
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXATOMICLOWER_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXATOMICLOWER_H
+
+namespace llvm {
+class FunctionPass;
+
+extern FunctionPass *createNVPTXAtomicLowerPass();
+} // end namespace llvm
+
+#endif
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 08f4ab8..00913e9 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -850,7 +850,7 @@
if (!LoadedVT.isSimple())
return false;
- AtomicOrdering Ordering = LD->getOrdering();
+ AtomicOrdering Ordering = LD->getSuccessOrdering();
// In order to lower atomic loads with stronger guarantees we would need to
// use load.acquire or insert fences. However these features were only added
// with PTX ISA 6.0 / sm_70.
@@ -1717,7 +1717,7 @@
if (!StoreVT.isSimple())
return false;
- AtomicOrdering Ordering = ST->getOrdering();
+ AtomicOrdering Ordering = ST->getSuccessOrdering();
// In order to lower atomic stores with stronger guarantees we would need to
// use store.release or insert fences. However these features were only added
// with PTX ISA 6.0 / sm_70.
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8860e90..d4842c9 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1175,7 +1175,8 @@
TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
- if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
+ if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
+ VT.getScalarType() == MVT::i1)
return TypeSplitVector;
if (VT == MVT::v2f16)
return TypeLegal;
@@ -2652,7 +2653,7 @@
if (!isABI)
return Chain;
- const DataLayout DL = DAG.getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
@@ -3489,6 +3490,10 @@
case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
@@ -3496,7 +3501,11 @@
case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: {
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v2i32;
Info.ptrVal = I.getArgOperand(0);
@@ -3514,6 +3523,14 @@
case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
+ case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
+ case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
+ case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
@@ -3522,7 +3539,15 @@
case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: {
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
+ case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
+ case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
+ case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v4i32;
Info.ptrVal = I.getArgOperand(0);
@@ -3602,7 +3627,11 @@
case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
+ case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
+ case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v8f32;
Info.ptrVal = I.getArgOperand(0);
@@ -3612,6 +3641,16 @@
return true;
}
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
+
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
+
case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
@@ -3650,6 +3689,37 @@
return true;
}
+ case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
+ case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
+ case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
+ case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
+
+ case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
+ case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
+ case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
+ case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::f64;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align = Align(8);
+ return true;
+ }
+
+ case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
+ case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
+ case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
+ case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::v2f64;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align = Align(16);
+ return true;
+ }
+
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
@@ -3682,7 +3752,11 @@
case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
+ case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
+ case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::v8f32;
Info.ptrVal = I.getArgOperand(0);
@@ -3730,6 +3804,19 @@
return true;
}
+ case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
+ case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
+ case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
+ case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::v2f64;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOStore;
+ Info.align = Align(16);
+ return true;
+ }
+
case Intrinsic::nvvm_atomic_load_inc_32:
case Intrinsic::nvvm_atomic_load_dec_32:
@@ -4304,14 +4391,7 @@
// Allow unsafe math if unsafe-fp-math attribute explicitly says so.
const Function &F = MF.getFunction();
- if (F.hasFnAttribute("unsafe-fp-math")) {
- Attribute Attr = F.getFnAttribute("unsafe-fp-math");
- StringRef Val = Attr.getValueAsString();
- if (Val == "true")
- return true;
- }
-
- return false;
+ return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
}
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 381ed4d..4834985 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -144,11 +144,15 @@
def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
+def hasPTX65 : Predicate<"Subtarget->getPTXVersion() >= 65">;
+def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">;
+def hasPTX71 : Predicate<"Subtarget->getPTXVersion() >= 71">;
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
+def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">;
// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
@@ -1039,39 +1043,39 @@
[(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
Requires<[allowUnsafeFPMath]>;
-// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)),
+// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
// i.e. "poor man's fmod()"
// frem - f32 FTZ
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
(FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
- (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRMI_FTZ),
+ (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
Float32Regs:$y))>,
Requires<[doF32FTZ]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
(FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
- (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRMI_FTZ),
+ (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
fpimm:$y))>,
Requires<[doF32FTZ]>;
// frem - f32
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
(FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
- (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRMI),
+ (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
Float32Regs:$y))>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
(FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
- (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRMI),
+ (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
fpimm:$y))>;
// frem - f64
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
(FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
- (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRMI),
+ (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
Float64Regs:$y))>;
def : Pat<(frem Float64Regs:$x, fpimm:$y),
(FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
- (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRMI),
+ (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
fpimm:$y))>;
//-----------------------------------
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 8ccd47c..de4bf2e 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -274,6 +274,22 @@
defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<Int64Regs, "b64", int_nvvm_match_all_sync_i64p,
i64imm>;
+multiclass REDUX_SYNC<string BinOp, string PTXType, Intrinsic Intrin> {
+ def : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask),
+ "redux.sync." # BinOp # "." # PTXType # " $dst, $src, $mask;",
+ [(set Int32Regs:$dst, (Intrin Int32Regs:$src, Int32Regs:$mask))]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm REDUX_SYNC_UMIN : REDUX_SYNC<"min", "u32", int_nvvm_redux_sync_umin>;
+defm REDUX_SYNC_UMAX : REDUX_SYNC<"max", "u32", int_nvvm_redux_sync_umax>;
+defm REDUX_SYNC_ADD : REDUX_SYNC<"add", "s32", int_nvvm_redux_sync_add>;
+defm REDUX_SYNC_MIN : REDUX_SYNC<"min", "s32", int_nvvm_redux_sync_min>;
+defm REDUX_SYNC_MAX : REDUX_SYNC<"max", "s32", int_nvvm_redux_sync_max>;
+defm REDUX_SYNC_AND : REDUX_SYNC<"and", "b32", int_nvvm_redux_sync_and>;
+defm REDUX_SYNC_XOR : REDUX_SYNC<"xor", "b32", int_nvvm_redux_sync_xor>;
+defm REDUX_SYNC_OR : REDUX_SYNC<"or", "b32", int_nvvm_redux_sync_or>;
+
} // isConvergent = true
//-----------------------------------
@@ -289,6 +305,211 @@
//-----------------------------------
+// Async Copy Functions
+//-----------------------------------
+
+multiclass CP_ASYNC_MBARRIER_ARRIVE<string NoInc, string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
+ !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
+ [(Intrin Int32Regs:$addr)]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+ !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
+ [(Intrin Int64Regs:$addr)]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_MBARRIER_ARRIVE :
+ CP_ASYNC_MBARRIER_ARRIVE<"", "", int_nvvm_cp_async_mbarrier_arrive>;
+defm CP_ASYNC_MBARRIER_ARRIVE_SHARED :
+ CP_ASYNC_MBARRIER_ARRIVE<"", ".shared", int_nvvm_cp_async_mbarrier_arrive_shared>;
+defm CP_ASYNC_MBARRIER_ARRIVE_NOINC :
+ CP_ASYNC_MBARRIER_ARRIVE<".noinc", "", int_nvvm_cp_async_mbarrier_arrive_noinc>;
+defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
+ CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>;
+
+multiclass CP_ASYNC_CA_SHARED_GLOBAL_I<string cpsize, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
+ !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
+ [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
+ !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
+ [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_4 :
+ CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>;
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_8 :
+ CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>;
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_16 :
+ CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>;
+
+multiclass CP_ASYNC_CG_SHARED_GLOBAL<string cpsize, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
+ !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
+ [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
+ !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
+ [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_CG_SHARED_GLOBAL_16 :
+ CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>;
+
+def CP_ASYNC_COMMIT_GROUP :
+ NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>,
+ Requires<[hasPTX70, hasSM80]>;
+
+def CP_ASYNC_WAIT_GROUP :
+ NVPTXInst<(outs), (ins i32imm:$n), "cp.async.wait_group $n;",
+ [(int_nvvm_cp_async_wait_group (i32 timm:$n))]>,
+ Requires<[hasPTX70, hasSM80]>;
+
+def CP_ASYNC_WAIT_ALL :
+ NVPTXInst<(outs), (ins), "cp.async.wait_all;",
+ [(int_nvvm_cp_async_wait_all)]>,
+ Requires<[hasPTX70, hasSM80]>;
+
+//-----------------------------------
+// MBarrier Functions
+//-----------------------------------
+
+multiclass MBARRIER_INIT<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr, Int32Regs:$count),
+ !strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
+ [(Intrin Int32Regs:$addr, Int32Regs:$count)]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr, Int32Regs:$count),
+ !strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
+ [(Intrin Int64Regs:$addr, Int32Regs:$count)]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_INIT : MBARRIER_INIT<"", int_nvvm_mbarrier_init>;
+defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared",
+ int_nvvm_mbarrier_init_shared>;
+
+multiclass MBARRIER_INVAL<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
+ !strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
+ [(Intrin Int32Regs:$addr)]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+ !strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
+ [(Intrin Int64Regs:$addr)]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_INVAL : MBARRIER_INVAL<"", int_nvvm_mbarrier_inval>;
+defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared",
+ int_nvvm_mbarrier_inval_shared>;
+
+multiclass MBARRIER_ARRIVE<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
+ !strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
+ [(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
+ !strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
+ [(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE : MBARRIER_ARRIVE<"", int_nvvm_mbarrier_arrive>;
+defm MBARRIER_ARRIVE_SHARED :
+ MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>;
+
+multiclass MBARRIER_ARRIVE_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs Int64Regs:$state),
+ (ins Int32Regs:$addr, Int32Regs:$count),
+ !strconcat("mbarrier.arrive.noComplete", AddrSpace,
+ ".b64 $state, [$addr], $count;"),
+ [(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs Int64Regs:$state),
+ (ins Int64Regs:$addr, Int32Regs:$count),
+ !strconcat("mbarrier.arrive.noComplete", AddrSpace,
+ ".b64 $state, [$addr], $count;"),
+ [(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE_NOCOMPLETE :
+ MBARRIER_ARRIVE_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_noComplete>;
+defm MBARRIER_ARRIVE_NOCOMPLETE_SHARED :
+ MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>;
+
+multiclass MBARRIER_ARRIVE_DROP<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
+ !strconcat("mbarrier.arrive_drop", AddrSpace,
+ ".b64 $state, [$addr];"),
+ [(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
+ !strconcat("mbarrier.arrive_drop", AddrSpace,
+ ".b64 $state, [$addr];"),
+ [(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE_DROP :
+ MBARRIER_ARRIVE_DROP<"", int_nvvm_mbarrier_arrive_drop>;
+defm MBARRIER_ARRIVE_DROP_SHARED :
+ MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>;
+
+multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs Int64Regs:$state),
+ (ins Int32Regs:$addr, Int32Regs:$count),
+ !strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
+ ".b64 $state, [$addr], $count;"),
+ [(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs Int64Regs:$state),
+ (ins Int64Regs:$addr, Int32Regs:$count),
+ !strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
+ ".b64 $state, [$addr], $count;"),
+ [(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE_DROP_NOCOMPLETE :
+ MBARRIER_ARRIVE_DROP_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_drop_noComplete>;
+defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED :
+ MBARRIER_ARRIVE_DROP_NOCOMPLETE<".shared",
+ int_nvvm_mbarrier_arrive_drop_noComplete_shared>;
+
+multiclass MBARRIER_TEST_WAIT<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs Int1Regs:$res), (ins Int32Regs:$addr, Int64Regs:$state),
+ !strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
+ [(set Int1Regs:$res, (Intrin Int32Regs:$addr, Int64Regs:$state))]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs Int1Regs:$res), (ins Int64Regs:$addr, Int64Regs:$state),
+ !strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
+ [(set Int1Regs:$res, (Intrin Int64Regs:$addr, Int64Regs:$state))]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_TEST_WAIT :
+ MBARRIER_TEST_WAIT<"", int_nvvm_mbarrier_test_wait>;
+defm MBARRIER_TEST_WAIT_SHARED :
+ MBARRIER_TEST_WAIT<".shared", int_nvvm_mbarrier_test_wait_shared>;
+
+class MBARRIER_PENDING_COUNT<Intrinsic Intrin> :
+ NVPTXInst<(outs Int32Regs:$res), (ins Int64Regs:$state),
+ "mbarrier.pending_count.b64 $res, $state;",
+ [(set Int32Regs:$res, (Intrin Int64Regs:$state))]>,
+ Requires<[hasPTX70, hasSM80]>;
+
+def MBARRIER_PENDING_COUNT :
+ MBARRIER_PENDING_COUNT<int_nvvm_mbarrier_pending_count>;
+
+//-----------------------------------
// Math Functions
//-----------------------------------
@@ -1722,21 +1943,21 @@
!strconcat("ldu.global.", TyStr), []>;
}
-multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
+multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins Int32Regs:$src),
+ regclass:$dst4), (ins Int32Regs:$src),
!strconcat("ldu.global.", TyStr), []>;
def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins Int64Regs:$src),
+ regclass:$dst4), (ins Int64Regs:$src),
!strconcat("ldu.global.", TyStr), []>;
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins MEMri:$src),
+ regclass:$dst4), (ins MEMri:$src),
!strconcat("ldu.global.", TyStr), []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins MEMri64:$src),
+ regclass:$dst4), (ins MEMri64:$src),
!strconcat("ldu.global.", TyStr), []>;
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins imemAny:$src),
+ regclass:$dst4), (ins imemAny:$src),
!strconcat("ldu.global.", TyStr), []>;
}
@@ -1776,7 +1997,7 @@
//-----------------------------------
-// Support for ldg on sm_35 or later
+// Support for ldg on sm_35 or later
//-----------------------------------
// Don't annotate ld.global.nc as mayLoad, because these loads go through the
@@ -1824,7 +2045,7 @@
// vector
-// Elementized vector ldg
+// Elementized vector ldg
multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins Int32Regs:$src),
@@ -1843,21 +2064,21 @@
!strconcat("ld.global.nc.", TyStr), []>;
}
-multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
+multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins Int32Regs:$src),
+ regclass:$dst4), (ins Int32Regs:$src),
!strconcat("ld.global.nc.", TyStr), []>;
def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins Int64Regs:$src),
+ regclass:$dst4), (ins Int64Regs:$src),
!strconcat("ld.global.nc.", TyStr), []>;
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins MEMri:$src),
+ regclass:$dst4), (ins MEMri:$src),
!strconcat("ld.global.nc.", TyStr), []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins MEMri64:$src),
+ regclass:$dst4), (ins MEMri64:$src),
!strconcat("ld.global.nc.", TyStr), []>;
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins imemAny:$src),
+ regclass:$dst4), (ins imemAny:$src),
!strconcat("ld.global.nc.", TyStr), []>;
}
@@ -7347,12 +7568,15 @@
// In addition to target-independent fields provided by WMMA_REGS, it adds
// the fields commonly used to implement specific PTX instruction -- register
// types and names, constraints, parts of assembly, etc.
-class WMMA_REGINFO<WMMA_REGS r>
+class WMMA_REGINFO<WMMA_REGS r, string op>
: WMMA_REGS<r.geom, r.frag, r.ptx_elt_type> {
// NVPTX register types used to carry fragment data.
NVPTXRegClass regclass = !cond(
!eq(ptx_elt_type, "f16") : Float16x2Regs,
!eq(ptx_elt_type, "f32") : Float32Regs,
+ !eq(ptx_elt_type, "f64") : Float64Regs,
+ !eq(ptx_elt_type, "bf16") : Int32Regs,
+ !eq(ptx_elt_type, "tf32") : Int32Regs,
!eq(ptx_elt_type, "s32") : Int32Regs,
!eq(ptx_elt_type, "s8") : Int32Regs,
!eq(ptx_elt_type, "u8") : Int32Regs,
@@ -7381,6 +7605,9 @@
!or(!eq(ptx_elt_type, "f16"),
!eq(ptx_elt_type, "f32"))) : [hasSM70, hasPTX60],
+ !and(!eq(geom,"m8n8k4"),
+ !eq(ptx_elt_type, "f64")) : [hasSM80, hasPTX70],
+
// fp16 -> fp16/fp32 @ m8n32k16/m32n8k16
!and(!or(!eq(geom, "m8n32k16"),
!eq(geom, "m32n8k16")),
@@ -7395,11 +7622,46 @@
!eq(ptx_elt_type, "s8"),
!eq(ptx_elt_type, "s32"))) : [hasSM72, hasPTX63],
- // u4/s4/b1 -> s32 @ m8n8k32 (u4/s4), m8n8k128(b1)
- !or(!eq(geom,"m8n8k128"),
- !eq(geom,"m8n8k32")) : [hasSM75, hasPTX63],
+ !and(!or(!eq(geom,"m16n16k16"),
+ !eq(geom,"m8n32k16"),
+ !eq(geom,"m32n8k16")),
+ !eq(ptx_elt_type, "bf16")) : [hasSM80, hasPTX70],
- !eq(geom, "m8n8k4") : [hasSM70, hasPTX64]);
+ !and(!eq(geom,"m16n16k8"),
+ !eq(ptx_elt_type, "tf32")) : [hasSM80, hasPTX70],
+
+ !and(!eq(geom,"m16n16k8"),
+ !eq(ptx_elt_type, "f32")) : [hasSM80, hasPTX70],
+
+ // b1 -> s32 @ m8n8k128(b1)
+ !and(!ne(op,"mma"),
+ !eq(geom,"m8n8k128")) : [hasSM75, hasPTX63],
+
+ // u4/s4 -> s32 @ m8n8k32 (u4/s4)
+ !and(!ne(op,"mma"),
+ !eq(geom,"m8n8k32")) : [hasSM75, hasPTX63],
+
+ !or(!eq(geom,"m16n8k8"),
+ !eq(geom,"m8n8k16")) : [hasSM75, hasPTX65],
+
+ !and(!ne(ptx_elt_type,"f64"),
+ !eq(geom, "m8n8k4")) : [hasSM70, hasPTX64],
+
+ // mma m8n8k32 requires higher PTX version
+ !and(!eq(op,"mma"),
+ !eq(geom,"m8n8k32")) : [hasSM75, hasPTX65],
+
+ !and(!eq(ptx_elt_type,"f64"),
+ !eq(geom, "m8n8k4")) : [hasSM80, hasPTX70],
+
+ !and(!eq(op,"mma"),
+ !or(!eq(geom, "m16n8k16"),
+ !eq(geom, "m16n8k4"),
+ !eq(geom, "m16n8k32"),
+ !eq(geom, "m16n8k64"),
+ !eq(geom, "m8n8k128"),
+ !eq(geom, "m16n8k128"),
+ !eq(geom, "m16n8k256"))) : [hasSM80, hasPTX70]);
// template DAGs for instruction inputs/output.
dag Outs = !dag(outs, ptx_regs, reg_names);
@@ -7523,60 +7785,109 @@
foreach space = [".global", ".shared", ""] in {
foreach addr = [imem, Int32Regs, Int64Regs, MEMri, MEMri64] in {
foreach frag = NVVM_MMA_OPS.all_ld_ops in
- if NVVM_MMA_SUPPORTED<[frag], layout>.ret then
- def : WMMA_LOAD<WMMA_REGINFO<frag>, layout, space, stride, addr>;
+ if NVVM_WMMA_LDST_SUPPORTED<frag, layout>.ret then
+ def : WMMA_LOAD<WMMA_REGINFO<frag, "load">, layout, space, stride, addr>;
foreach frag = NVVM_MMA_OPS.all_st_ops in
- if NVVM_MMA_SUPPORTED<[frag], layout>.ret then
- def : WMMA_STORE_D<WMMA_REGINFO<frag>, layout, space, stride, addr>;
+ if NVVM_WMMA_LDST_SUPPORTED<frag, layout>.ret then
+ def : WMMA_STORE_D<WMMA_REGINFO<frag, "store">, layout, space, stride, addr>;
} // addr
} // space
} // stride
} // layout
} // defset
+// B1 instruction variants need extra constraints.
+class MMA_OP_PREDICATES<WMMA_REGINFO FragA, string b1op> {
+ string Op = b1op;
+ WMMA_REGINFO Frag = FragA;
+ list<Predicate> ret = !listconcat(
+ FragA.Predicates,
+ !if(!eq(b1op, ".and.popc"), [hasSM80,hasPTX71],[])
+ );
+}
// WMMA.MMA
class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
WMMA_REGINFO FragC, WMMA_REGINFO FragD,
- string ALayout, string BLayout, int Satfinite>
- : WMMA_INSTR<WMMA_NAME_MMA<ALayout, BLayout, Satfinite, FragA, FragB, FragC, FragD>.record,
- [FragA.Ins, FragB.Ins, FragC.Ins]>,
+ string ALayout, string BLayout, int Satfinite, string rnd, string b1op>
+ : WMMA_INSTR<WMMA_NAME<ALayout, BLayout, Satfinite, rnd, b1op, FragA, FragB, FragC, FragD>.record,
+ [FragA.Ins, FragB.Ins, FragC.Ins]>,
// Requires does not seem to have effect on Instruction w/o Patterns.
// We set it here anyways and propagate to the Pat<> we construct below.
- Requires<FragA.Predicates> {
+ Requires<MMA_OP_PREDICATES<FragA, b1op>.ret> {
let OutOperandList = FragD.Outs;
let InOperandList = !con(Args, (ins MmaCode:$ptx));
string TypeList = !cond(
- !eq(FragD.geom, "m8n8k4") : "." # FragD.ptx_elt_type
- # ".f16.f16."
- # FragC.ptx_elt_type,
- !eq(FragD.ptx_elt_type, "s32") : ".s32"
- # "." # FragA.ptx_elt_type
- # "." # FragB.ptx_elt_type
- # ".s32",
- 1: "." # FragD.ptx_elt_type # "." # FragC.ptx_elt_type,
+ !eq(FragA.ptx_elt_type, "f16") : "." # FragD.ptx_elt_type
+ # "." # FragC.ptx_elt_type,
+ 1: "." # FragD.ptx_elt_type
+ # "." # FragA.ptx_elt_type
+ # "." # FragB.ptx_elt_type
+ # "." # FragC.ptx_elt_type,
);
- let AsmString = !if(!eq(FragA.geom, "m8n8k4"),
- "mma.sync.aligned.m8n8k4"
- # "." # ALayout
- # "." # BLayout
- # TypeList # "\n\t\t"
- # FragD.regstring # ",\n\t\t"
- # FragA.regstring # ",\n\t\t"
- # FragB.regstring # ",\n\t\t"
- # FragC.regstring # ";",
- "wmma.mma"
- # !if(!eq(FragA.ptx_elt_type, "b1"), ".xor.popc", "")
- # ".sync"
- # "${ptx:aligned}"
- # "." # ALayout
- # "." # BLayout
- # "." # FragA.geom
- # TypeList
- # !if(Satfinite, ".satfinite", "") # "\n\t\t"
- # FragD.regstring # ",\n\t\t"
- # FragA.regstring # ",\n\t\t"
- # FragB.regstring # ",\n\t\t"
- # FragC.regstring # ";");
+ let AsmString = "wmma.mma"
+ # b1op
+ # ".sync"
+ # "${ptx:aligned}"
+ # "." # ALayout
+ # "." # BLayout
+ # "." # FragA.geom
+ # !if(!ne(rnd, ""), !strconcat(".", rnd), "")
+ # TypeList
+ # !if(Satfinite, ".satfinite", "") # "\n\t\t"
+ # FragD.regstring # ",\n\t\t"
+ # FragA.regstring # ",\n\t\t"
+ # FragB.regstring # ",\n\t\t"
+ # FragC.regstring # ";";
+}
+
+defset list<WMMA_INSTR> WMMAs = {
+ foreach layout_a = ["row", "col"] in {
+ foreach layout_b = ["row", "col"] in {
+ foreach satf = [0, 1] in {
+ foreach rnd = ["", "rn", "rz", "rm", "rp"] in {
+ foreach op = NVVM_MMA_OPS.all_wmma_ops in {
+ foreach b1op = NVVM_MMA_B1OPS<op>.ret in {
+ if NVVM_WMMA_SUPPORTED<op, layout_a, layout_b, satf, rnd>.ret then {
+ def : WMMA_MMA<WMMA_REGINFO<op[0], "wmma.mma">,
+ WMMA_REGINFO<op[1], "wmma.mma">,
+ WMMA_REGINFO<op[2], "wmma.mma">,
+ WMMA_REGINFO<op[3], "wmma.mma">,
+ layout_a, layout_b, satf, rnd, b1op>;
+ }
+ } // b1op
+ } // op
+ } // rnd
+ } // satf
+ } // layout_b
+ } // layout_a
+} // defset
+
+// MMA
+class MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
+ WMMA_REGINFO FragC, WMMA_REGINFO FragD,
+ string ALayout, string BLayout, int Satfinite, string b1op>
+ : WMMA_INSTR<MMA_NAME<ALayout, BLayout, Satfinite, b1op, FragA, FragB, FragC, FragD>.record,
+ [FragA.Ins, FragB.Ins, FragC.Ins]>,
+ // Requires does not seem to have effect on Instruction w/o Patterns.
+ // We set it here anyways and propagate to the Pat<> we construct below.
+ Requires<MMA_OP_PREDICATES<FragA, b1op>.ret> {
+ let OutOperandList = FragD.Outs;
+ let InOperandList = !con(Args, (ins MmaCode:$ptx));
+ string TypeList = "." # FragD.ptx_elt_type
+ # "." # FragA.ptx_elt_type
+ # "." # FragB.ptx_elt_type
+ # "." # FragC.ptx_elt_type;
+ let AsmString = "mma.sync.aligned."
+ # FragA.geom
+ # "." # ALayout
+ # "." # BLayout
+ # !if(Satfinite, ".satfinite", "")
+ # TypeList
+ # b1op # "\n\t\t"
+ # FragD.regstring # ",\n\t\t"
+ # FragA.regstring # ",\n\t\t"
+ # FragB.regstring # ",\n\t\t"
+ # FragC.regstring # ";";
}
defset list<WMMA_INSTR> MMAs = {
@@ -7584,13 +7895,15 @@
foreach layout_b = ["row", "col"] in {
foreach satf = [0, 1] in {
foreach op = NVVM_MMA_OPS.all_mma_ops in {
- if NVVM_MMA_SUPPORTED<op, layout_a, layout_b, satf>.ret then {
- def : WMMA_MMA<WMMA_REGINFO<op[0]>,
- WMMA_REGINFO<op[1]>,
- WMMA_REGINFO<op[2]>,
- WMMA_REGINFO<op[3]>,
- layout_a, layout_b, satf>;
- }
+ foreach b1op = NVVM_MMA_B1OPS<op>.ret in {
+ if NVVM_MMA_SUPPORTED<op, layout_a, layout_b, satf>.ret then {
+ def : MMA<WMMA_REGINFO<op[0], "mma">,
+ WMMA_REGINFO<op[1], "mma">,
+ WMMA_REGINFO<op[2], "mma">,
+ WMMA_REGINFO<op[3], "mma">,
+ layout_a, layout_b, satf, b1op>;
+ }
+ } // b1op
} // op
} // satf
} // layout_b
@@ -7601,12 +7914,12 @@
// Constructing non-flat DAGs is still a pain. I can't !subst a dag node with a
// dag, so the ptx.version must be appended *after* foreach replaces 'ins' with
// the instruction record.
-class WMMA_PAT<WMMA_INSTR wi>
+class MMA_PAT<WMMA_INSTR wi>
: Pat<wi.IntrinsicPattern,
!con(!foreach(tmp, wi.Args, !subst(ins, wi, tmp)),
(wi ptx.version))>,
Requires<wi.Predicates>;
// Build intrinsic->instruction patterns for all MMA instructions.
-foreach mma = !listconcat(MMAs, MMA_LDSTs) in
- def : WMMA_PAT<mma>;
+foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs) in
+ def : MMA_PAT<mma>;
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
index cf488b06..1bd0255 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -70,7 +70,7 @@
for (auto &I : BB) {
if (auto allocaInst = dyn_cast<AllocaInst>(&I)) {
Changed = true;
- auto ETy = cast<PointerType>(allocaInst->getType())->getElementType();
+ auto ETy = allocaInst->getAllocatedType();
auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, "");
auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index fd58ff1..ddb7f09 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -99,6 +99,8 @@
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
+#define DEBUG_TYPE "nvptx-lower-args"
+
using namespace llvm;
namespace llvm {
@@ -140,6 +142,7 @@
// =============================================================================
// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
+// and we can't guarantee that the only accesses are loads,
// then add the following instructions to the first basic block:
//
// %temp = alloca %struct.x, align 8
@@ -150,7 +153,79 @@
// The above code allocates some space in the stack and copies the incoming
// struct from param space to local space.
// Then replace all occurrences of %d by %temp.
+//
+// In case we know that all users are GEPs or Loads, replace them with the same
+// ones in parameter AS, so we can access them using ld.param.
// =============================================================================
+
+// Replaces the \p OldUser instruction with the same in parameter AS.
+// Only Load and GEP are supported.
+static void convertToParamAS(Value *OldUser, Value *Param) {
+ Instruction *I = dyn_cast<Instruction>(OldUser);
+ assert(I && "OldUser must be an instruction");
+ struct IP {
+ Instruction *OldInstruction;
+ Value *NewParam;
+ };
+ SmallVector<IP> ItemsToConvert = {{I, Param}};
+ SmallVector<Instruction *> InstructionsToDelete;
+
+ auto CloneInstInParamAS = [](const IP &I) -> Value * {
+ if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
+ LI->setOperand(0, I.NewParam);
+ return LI;
+ }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I.OldInstruction)) {
+ SmallVector<Value *, 4> Indices(GEP->indices());
+ auto *NewGEP = GetElementPtrInst::Create(GEP->getSourceElementType(),
+ I.NewParam, Indices,
+ GEP->getName(), GEP);
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ return NewGEP;
+ }
+ if (auto *BC = dyn_cast<BitCastInst>(I.OldInstruction)) {
+ auto *NewBCType = PointerType::getWithSamePointeeType(
+ cast<PointerType>(BC->getType()), ADDRESS_SPACE_PARAM);
+ return BitCastInst::Create(BC->getOpcode(), I.NewParam, NewBCType,
+ BC->getName(), BC);
+ }
+ if (auto *ASC = dyn_cast<AddrSpaceCastInst>(I.OldInstruction)) {
+ assert(ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM);
+ (void)ASC;
+ // Just pass through the argument, the old ASC is no longer needed.
+ return I.NewParam;
+ }
+ llvm_unreachable("Unsupported instruction");
+ };
+
+ while (!ItemsToConvert.empty()) {
+ IP I = ItemsToConvert.pop_back_val();
+ Value *NewInst = CloneInstInParamAS(I);
+
+ if (NewInst && NewInst != I.OldInstruction) {
+ // We've created a new instruction. Queue users of the old instruction to
+ // be converted and the instruction itself to be deleted. We can't delete
+ // the old instruction yet, because it's still in use by a load somewhere.
+ llvm::for_each(
+ I.OldInstruction->users(), [NewInst, &ItemsToConvert](Value *V) {
+ ItemsToConvert.push_back({cast<Instruction>(V), NewInst});
+ });
+
+ InstructionsToDelete.push_back(I.OldInstruction);
+ }
+ }
+
+ // Now we know that all argument loads are using addresses in parameter space
+ // and we can finally remove the old instructions in generic AS. Instructions
+ // scheduled for removal should be processed in reverse order so the ones
+ // closest to the load are deleted first. Otherwise they may still be in use.
+ // E.g. if we have Value = Load(BitCast(GEP(arg))), InstructionsToDelete will
+ // have {GEP,BitCast}. GEP can't be deleted first, because it's still used by
+ // the BitCast.
+ llvm::for_each(reverse(InstructionsToDelete),
+ [](Instruction *I) { I->eraseFromParent(); });
+}
+
void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
Function *Func = Arg->getParent();
Instruction *FirstInst = &(Func->getEntryBlock().front());
@@ -159,6 +234,49 @@
assert(PType && "Expecting pointer type in handleByValParam");
Type *StructType = PType->getElementType();
+
+ auto IsALoadChain = [&](Value *Start) {
+ SmallVector<Value *, 16> ValuesToCheck = {Start};
+ auto IsALoadChainInstr = [](Value *V) -> bool {
+ if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V) || isa<LoadInst>(V))
+ return true;
+ // ASCs to param space are OK, too -- we'll just strip them.
+ if (auto *ASC = dyn_cast<AddrSpaceCastInst>(V)) {
+ if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
+ return true;
+ }
+ return false;
+ };
+
+ while (!ValuesToCheck.empty()) {
+ Value *V = ValuesToCheck.pop_back_val();
+ if (!IsALoadChainInstr(V)) {
+ LLVM_DEBUG(dbgs() << "Need a copy of " << *Arg << " because of " << *V
+ << "\n");
+ (void)Arg;
+ return false;
+ }
+ if (!isa<LoadInst>(V))
+ llvm::append_range(ValuesToCheck, V->users());
+ }
+ return true;
+ };
+
+ if (llvm::all_of(Arg->users(), IsALoadChain)) {
+ // Convert all loads and intermediate operations to use parameter AS and
+ // skip creation of a local copy of the argument.
+ SmallVector<User *, 16> UsersToUpdate(Arg->users());
+ Value *ArgInParamAS = new AddrSpaceCastInst(
+ Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
+ FirstInst);
+ llvm::for_each(UsersToUpdate, [ArgInParamAS](Value *V) {
+ convertToParamAS(V, ArgInParamAS);
+ });
+ LLVM_DEBUG(dbgs() << "No need to copy " << *Arg << "\n");
+ return;
+ }
+
+ // Otherwise we have to create a temporary copy.
const DataLayout &DL = Func->getParent()->getDataLayout();
unsigned AS = DL.getAllocaAddrSpace();
AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
@@ -198,8 +316,9 @@
}
Instruction *PtrInGlobal = new AddrSpaceCastInst(
- Ptr, PointerType::get(Ptr->getType()->getPointerElementType(),
- ADDRESS_SPACE_GLOBAL),
+ Ptr,
+ PointerType::getWithSamePointeeType(cast<PointerType>(Ptr->getType()),
+ ADDRESS_SPACE_GLOBAL),
Ptr->getName(), &*InsertPt);
Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),
Ptr->getName(), &*InsertPt);
@@ -231,6 +350,7 @@
}
}
+ LLVM_DEBUG(dbgs() << "Lowering kernel args of " << F.getName() << "\n");
for (Argument &Arg : F.args()) {
if (Arg.getType()->isPointerTy()) {
if (Arg.hasByValAttr())
@@ -244,6 +364,7 @@
// Device functions only need to copy byval args into local memory.
bool NVPTXLowerArgs::runOnDeviceFunction(Function &F) {
+ LLVM_DEBUG(dbgs() << "Lowering function args of " << F.getName() << "\n");
for (Argument &Arg : F.args())
if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
handleByValParam(&Arg);
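
The NVPTXLowerArgs change above picks between two strategies for a byval pointer argument: rewrite all users into the param address space when every user only reads through the pointer, or fall back to a local alloca copy otherwise. As a rough, self-contained illustration of that decision (not part of the patch; the real check walks Value::users() recursively and also rewrites the surviving instructions), a toy model might look like:

// Toy model of the handleByValParam decision, assuming the users of the
// argument can be summarized as a flat list of opcodes.
#include <cassert>
#include <vector>

enum class Op { Load, GEP, BitCast, AddrSpaceCastToParam, Store, Call };

static bool isLoadChainOp(Op O) {
  return O == Op::Load || O == Op::GEP || O == Op::BitCast ||
         O == Op::AddrSpaceCastToParam;
}

// True when every user only reads through the pointer, so the argument can be
// accessed with ld.param directly; false means a local copy is still required.
static bool canAccessViaLdParam(const std::vector<Op> &Users) {
  for (Op O : Users)
    if (!isLoadChainOp(O))
      return false;
  return true;
}

int main() {
  assert(canAccessViaLdParam({Op::GEP, Op::Load, Op::BitCast, Op::Load}));
  assert(!canAccessViaLdParam({Op::GEP, Op::Store})); // written to -> copy
  return 0;
}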
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 756355f..8e2299e 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -65,16 +65,25 @@
// way with simply the frame index and offset rather than any
// target-specific addressing mode.
if (MI.isDebugValue()) {
- assert(i == 0 && "Frame indices can only appear as the first "
- "operand of a DBG_VALUE machine instruction");
+ MachineOperand &Op = MI.getOperand(i);
+ assert(
+ MI.isDebugOperand(&Op) &&
+ "Frame indices can only appear as a debug operand in a DBG_VALUE*"
+ " machine instruction");
Register Reg;
- int64_t Offset =
- TFI.getFrameIndexReference(MF, MI.getOperand(0).getIndex(), Reg)
- .getFixed();
- MI.getOperand(0).ChangeToRegister(Reg, /*isDef=*/false);
- MI.getOperand(0).setIsDebug();
- auto *DIExpr = DIExpression::prepend(
- MI.getDebugExpression(), DIExpression::ApplyOffset, Offset);
+ auto Offset =
+ TFI.getFrameIndexReference(MF, Op.getIndex(), Reg);
+ Op.ChangeToRegister(Reg, /*isDef=*/false);
+ Op.setIsDebug();
+ const DIExpression *DIExpr = MI.getDebugExpression();
+ if (MI.isNonListDebugValue()) {
+ DIExpr = TRI.prependOffsetExpression(MI.getDebugExpression(), DIExpression::ApplyOffset, Offset);
+ } else {
+ SmallVector<uint64_t, 3> Ops;
+ TRI.getOffsetOpcodes(Offset, Ops);
+ unsigned OpIdx = MI.getDebugOperandIndex(&Op);
+ DIExpr = DIExpression::appendOpsToArg(DIExpr, Ops, OpIdx);
+ }
MI.getDebugExpressionOp().setMetadata(DIExpr);
continue;
}
@@ -229,7 +238,7 @@
// value.
Align StackAlign;
if (MFI.adjustsStack() || MFI.hasVarSizedObjects() ||
- (RegInfo->needsStackRealignment(Fn) && MFI.getObjectIndexEnd() != 0))
+ (RegInfo->hasStackRealignment(Fn) && MFI.getObjectIndexEnd() != 0))
StackAlign = TFI.getStackAlign();
else
StackAlign = TFI.getTransientStackAlign();
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index f1a82f1..e3515f3 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -13,6 +13,7 @@
#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
+#include "NVPTXAtomicLower.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
@@ -65,6 +66,7 @@
void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
+void initializeNVPTXAtomicLowerPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
@@ -86,6 +88,7 @@
initializeGenericToNVVMPass(PR);
initializeNVPTXAllocaHoistingPass(PR);
initializeNVPTXAssignValidGlobalNamesPass(PR);
+ initializeNVPTXAtomicLowerPass(PR);
initializeNVPTXLowerArgsPass(PR);
initializeNVPTXLowerAllocaPass(PR);
initializeNVPTXLowerAggrCopiesPass(PR);
@@ -206,8 +209,7 @@
});
}
-void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
- bool DebugPassManager) {
+void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerPipelineParsingCallback(
[](StringRef PassName, FunctionPassManager &PM,
ArrayRef<PassBuilder::PipelineElement>) {
@@ -223,11 +225,12 @@
});
PB.registerPipelineStartEPCallback(
- [this, DebugPassManager](ModulePassManager &PM,
- PassBuilder::OptimizationLevel Level) {
- FunctionPassManager FPM(DebugPassManager);
+ [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
+ FunctionPassManager FPM;
FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
- FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
+ // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
+ // investigate and re-enable.
+ // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
});
}
@@ -250,6 +253,7 @@
addPass(createSROAPass());
addPass(createNVPTXLowerAllocaPass());
addPass(createInferAddressSpacesPass());
+ addPass(createNVPTXAtomicLowerPass());
}
void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
index bef541c..39647eb 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -62,8 +62,7 @@
}
void adjustPassManager(PassManagerBuilder &) override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool DebugPassManager) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
TargetTransformInfo getTargetTransformInfo(const Function &F) override;
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index d4b2ae3..20bd227 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -368,14 +368,14 @@
return None;
}
-int NVPTXTTIImpl::getArithmeticInstrCost(
+InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
- TTI::OperandValueKind Opd1Info,
- TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
diff --git a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 6f07104..d5a52d4 100644
--- a/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -71,7 +71,9 @@
// Only <2 x half> should be vectorized, so always return 32 for the vector
// register size.
- unsigned getRegisterBitWidth(bool Vector) const { return 32; }
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ return TypeSize::getFixed(32);
+ }
unsigned getMinVectorRegisterBitWidth() const { return 32; }
// We don't want to prevent inlining because of target-cpu and -features
@@ -86,7 +88,7 @@
// calls are particularly expensive in NVPTX.
unsigned getInliningThresholdMultiplier() { return 5; }
- int getArithmeticInstrCost(
+ InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 197fd3c..7631bb4 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -294,6 +294,11 @@
return (unsigned) Imm.Val >> 1;
}
+ unsigned getG8pReg() const {
+ assert(isEvenRegNumber() && "Invalid access!");
+ return (unsigned)Imm.Val;
+ }
+
unsigned getCCReg() const {
assert(isCCRegNumber() && "Invalid access!");
return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
@@ -359,6 +364,15 @@
bool isS16ImmX4() const { return Kind == Expression ||
(Kind == Immediate && isInt<16>(getImm()) &&
(getImm() & 3) == 0); }
+
+ bool isHashImmX8() const {
+ // The Hash Imm form is used for instructions that check or store a hash.
+ // These instructions have a small immediate range that spans from
+ // -512 to -8, in multiples of 8.
+ return (Kind == Immediate && getImm() <= -8 && getImm() >= -512 &&
+ (getImm() & 7) == 0);
+ }
+
bool isS16ImmX16() const { return Kind == Expression ||
(Kind == Immediate && isInt<16>(getImm()) &&
(getImm() & 15) == 0); }
@@ -423,6 +437,9 @@
&& isUInt<5>(getExprCRVal())) ||
(Kind == Immediate
&& isUInt<5>(getImm())); }
+
+ bool isEvenRegNumber() const { return isRegNumber() && (getImm() & 1) == 0; }
+
bool isCRBitMask() const { return Kind == Immediate && isUInt<8>(getImm()) &&
isPowerOf2_32(getImm()); }
bool isATBitsAsHint() const { return false; }
@@ -453,6 +470,11 @@
Inst.addOperand(MCOperand::createReg(XRegsNoX0[getReg()]));
}
+ void addRegG8pRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(XRegs[getG8pReg()]));
+ }
+
void addRegGxRCOperands(MCInst &Inst, unsigned N) const {
if (isPPC64())
addRegG8RCOperands(Inst, N);
@@ -1132,29 +1154,6 @@
}
break;
}
- case PPC::CP_COPYx:
- case PPC::CP_COPY_FIRST: {
- MCInst TmpInst;
- TmpInst.setOpcode(PPC::CP_COPY);
- TmpInst.addOperand(Inst.getOperand(0));
- TmpInst.addOperand(Inst.getOperand(1));
- TmpInst.addOperand(MCOperand::createImm(Opcode == PPC::CP_COPYx ? 0 : 1));
-
- Inst = TmpInst;
- break;
- }
- case PPC::CP_PASTEx :
- case PPC::CP_PASTE_LAST: {
- MCInst TmpInst;
- TmpInst.setOpcode(Opcode == PPC::CP_PASTEx ? PPC::CP_PASTE
- : PPC::CP_PASTE_rec);
- TmpInst.addOperand(Inst.getOperand(0));
- TmpInst.addOperand(Inst.getOperand(1));
- TmpInst.addOperand(MCOperand::createImm(Opcode == PPC::CP_PASTEx ? 0 : 1));
-
- Inst = TmpInst;
- break;
- }
}
}
@@ -1208,28 +1207,28 @@
return true;
StringRef Name = getParser().getTok().getString();
- if (Name.equals_lower("lr")) {
+ if (Name.equals_insensitive("lr")) {
RegNo = isPPC64() ? PPC::LR8 : PPC::LR;
IntVal = 8;
- } else if (Name.equals_lower("ctr")) {
+ } else if (Name.equals_insensitive("ctr")) {
RegNo = isPPC64() ? PPC::CTR8 : PPC::CTR;
IntVal = 9;
- } else if (Name.equals_lower("vrsave")) {
+ } else if (Name.equals_insensitive("vrsave")) {
RegNo = PPC::VRSAVE;
IntVal = 256;
- } else if (Name.startswith_lower("r") &&
+ } else if (Name.startswith_insensitive("r") &&
!Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
RegNo = isPPC64() ? XRegs[IntVal] : RRegs[IntVal];
- } else if (Name.startswith_lower("f") &&
+ } else if (Name.startswith_insensitive("f") &&
!Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
RegNo = FRegs[IntVal];
- } else if (Name.startswith_lower("vs") &&
+ } else if (Name.startswith_insensitive("vs") &&
!Name.substr(2).getAsInteger(10, IntVal) && IntVal < 64) {
RegNo = VSRegs[IntVal];
- } else if (Name.startswith_lower("v") &&
+ } else if (Name.startswith_insensitive("v") &&
!Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
RegNo = VRegs[IntVal];
- } else if (Name.startswith_lower("cr") &&
+ } else if (Name.startswith_insensitive("cr") &&
!Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) {
RegNo = CRRegs[IntVal];
} else
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/PowerPC/CMakeLists.txt
index 6da1601..195eebf 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/CMakeLists.txt
@@ -27,6 +27,7 @@
PPCCallingConv.cpp
PPCCCState.cpp
PPCCTRLoops.cpp
+ PPCExpandAtomicPseudoInsts.cpp
PPCHazardRecognizers.cpp
PPCInstrInfo.cpp
PPCISelDAGToDAG.cpp
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 3e9286f..94416fc 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -160,6 +160,12 @@
return decodeRegisterClass(Inst, RegNo, XRegs);
}
+static DecodeStatus DecodeG8pRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, XRegs);
+}
+
static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
@@ -279,6 +285,23 @@
return MCDisassembler::Success;
}
+static DecodeStatus decodeMemRIHashOperands(MCInst &Inst, uint64_t Imm,
+ int64_t Address,
+ const void *Decoder) {
+ // Decode the memrix field for a hash store or hash check operation.
+ // The field is composed of a register and an immediate value that is 6 bits
+ // and covers the range -512 to -8. The immediate is always negative and
+ // stored in two's complement, which is why we sign extend a 7-bit value.
+ const uint64_t Base = Imm >> 6;
+ const int64_t Disp = SignExtend64<7>((Imm & 0x3F) + 64) * 8;
+
+ assert(Base < 32 && "Invalid base register");
+
+ Inst.addOperand(MCOperand::createImm(Disp));
+ Inst.addOperand(MCOperand::createReg(RRegs[Base]));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus decodeMemRIX16Operands(MCInst &Inst, uint64_t Imm,
int64_t Address, const void *Decoder) {
// Decode the memrix16 field (imm, reg), which has the low 12-bits as the
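
The hash displacement field added above appears three times in this patch: the isHashImmX8 assembler predicate, getMemRIHashEncoding, and decodeMemRIHashOperands. A standalone sketch of the encode/decode round trip, assuming only the bit layout those comments describe (displacements that are multiples of 8 in [-512, -8], stored in a 6-bit field), makes the sign-extension arithmetic easier to verify:

// Sketch of the hash displacement encode/decode; this mirrors the code above
// but is written as a self-contained round-trip check.
#include <cassert>
#include <cstdint>

static uint64_t encodeHashDisp(int64_t Disp) {
  assert(Disp >= -512 && Disp <= -8 && (Disp & 7) == 0);
  return (uint64_t)(Disp >> 3) & 0x3F; // matches getMemRIHashEncoding's DX field
}

static int64_t decodeHashDisp(uint64_t Field) {
  int64_t Biased = (int64_t)(Field & 0x3F) + 64; // 64..127
  if (Biased & 0x40)                             // sign-extend from 7 bits
    Biased -= 128;                               // -64..-1
  return Biased * 8;                             // -512..-8
}

int main() {
  for (int64_t D = -512; D <= -8; D += 8)
    assert(decodeHashDisp(encodeHashDisp(D)) == D);
  return 0;
}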
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp
index e8f8cbf..22731bb 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp
@@ -13,7 +13,13 @@
//===----------------------------------------------------------------------===//
#include "PPCCallLowering.h"
+#include "PPCISelLowering.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "ppc-call-lowering"
@@ -36,18 +42,82 @@
return true;
}
+bool PPCCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const {
+ return false;
+}
+
bool PPCCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<ArrayRef<Register>> VRegs,
FunctionLoweringInfo &FLI) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const auto &DL = F.getParent()->getDataLayout();
+ auto &TLI = *getTLI<PPCTargetLowering>();
- // If VRegs is empty, then there are no formal arguments to lower and thus can
- // always return true. If there are formal arguments, we currently do not
- // handle them and thus return false.
- return VRegs.empty();
+ // Loop over each arg, set flags and split to single value types
+ SmallVector<ArgInfo, 8> SplitArgs;
+ unsigned I = 0;
+ for (const auto &Arg : F.args()) {
+ if (DL.getTypeStoreSize(Arg.getType()).isZero())
+ continue;
+
+ ArgInfo OrigArg{VRegs[I], Arg, I};
+ setArgFlags(OrigArg, I + AttributeList::FirstArgIndex, DL, F);
+ splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv());
+ ++I;
+ }
+
+ CCAssignFn *AssignFn =
+ TLI.ccAssignFnForCall(F.getCallingConv(), false, F.isVarArg());
+ IncomingValueAssigner ArgAssigner(AssignFn);
+ FormalArgHandler ArgHandler(MIRBuilder, MRI);
+ return determineAndHandleAssignments(ArgHandler, ArgAssigner, SplitArgs,
+ MIRBuilder, F.getCallingConv(),
+ F.isVarArg());
}
-bool PPCCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
- CallLoweringInfo &Info) const {
- return false;
+void PPCIncomingValueHandler::assignValueToReg(Register ValVReg,
+ Register PhysReg,
+ CCValAssign &VA) {
+ markPhysRegUsed(PhysReg);
+ IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
+}
+
+void PPCIncomingValueHandler::assignValueToAddress(Register ValVReg,
+ Register Addr, LLT MemTy,
+ MachinePointerInfo &MPO,
+ CCValAssign &VA) {
+ // Define a lambda expression that loads a value.
+ auto BuildLoad = [](MachineIRBuilder &MIRBuilder, MachinePointerInfo &MPO,
+ LLT MemTy, const DstOp &Res, Register Addr) {
+ MachineFunction &MF = MIRBuilder.getMF();
+ auto *MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, MemTy,
+ inferAlignFromPtrInfo(MF, MPO));
+ return MIRBuilder.buildLoad(Res, Addr, *MMO);
+ };
+
+ BuildLoad(MIRBuilder, MPO, MemTy, ValVReg, Addr);
+}
+
+Register PPCIncomingValueHandler::getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) {
+ auto &MFI = MIRBuilder.getMF().getFrameInfo();
+ const bool IsImmutable = !Flags.isByVal();
+ int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
+ MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+
+ // Build Frame Index based on whether the machine is 32-bit or 64-bit
+ llvm::LLT FramePtr = LLT::pointer(
+ 0, MIRBuilder.getMF().getDataLayout().getPointerSizeInBits());
+ MachineInstrBuilder AddrReg = MIRBuilder.buildFrameIndex(FramePtr, FI);
+ StackUsed = std::max(StackUsed, Size + Offset);
+ return AddrReg.getReg(0);
+}
+
+void FormalArgHandler::markPhysRegUsed(unsigned PhysReg) {
+ MIRBuilder.getMRI()->addLiveIn(PhysReg);
+ MIRBuilder.getMBB().addLiveIn(PhysReg);
}
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h b/src/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h
index 5a449f4..b045032 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h
@@ -35,6 +35,38 @@
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;
};
+
+class PPCIncomingValueHandler : public CallLowering::IncomingValueHandler {
+public:
+ PPCIncomingValueHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI)
+ : CallLowering::IncomingValueHandler(MIRBuilder, MRI) {}
+
+ uint64_t StackUsed;
+
+private:
+ void assignValueToReg(Register ValVReg, Register PhysReg,
+ CCValAssign &VA) override;
+
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
+ MachinePointerInfo &MPO, CCValAssign &VA) override;
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override;
+
+ virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+};
+
+class FormalArgHandler : public PPCIncomingValueHandler {
+
+ void markPhysRegUsed(unsigned PhysReg) override;
+
+public:
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+ : PPCIncomingValueHandler(MIRBuilder, MRI) {}
+};
+
} // end namespace llvm
#endif
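
The handler classes declared above follow a common GlobalISel CallLowering shape: the shared incoming-value plumbing lives in a base class, while the subclass only decides how a used physical register is recorded. A minimal sketch of that layering, with purely invented names that do not correspond to real LLVM APIs, is:

// Illustrative-only sketch of the handler layering above.
#include <iostream>

struct IncomingHandlerSketch {
  virtual ~IncomingHandlerSketch() = default;
  void assignValueToReg(unsigned PhysReg) {
    markPhysRegUsed(PhysReg); // subclass decides what "used" means
    // ... shared copy-from-physreg logic would follow here ...
  }
protected:
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
};

struct FormalArgHandlerSketch : IncomingHandlerSketch {
protected:
  void markPhysRegUsed(unsigned PhysReg) override {
    // Stands in for MRI.addLiveIn(PhysReg) and MBB.addLiveIn(PhysReg).
    std::cout << "live-in physreg " << PhysReg << "\n";
  }
};

int main() {
  FormalArgHandlerSketch H;
  H.assignValueToReg(3); // e.g. the first GPR argument register
  return 0;
}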
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp
index c16bcae..5d196df 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp
@@ -17,4 +17,6 @@
using namespace llvm;
using namespace LegalizeActions;
-PPCLegalizerInfo::PPCLegalizerInfo(const PPCSubtarget &ST) { computeTables(); }
+PPCLegalizerInfo::PPCLegalizerInfo(const PPCSubtarget &ST) {
+ getLegacyLegalizerInfo().computeTables();
+}
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
index 75c81e5..6433111 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
@@ -9,6 +9,7 @@
PPCELFObjectWriter.cpp
PPCXCOFFObjectWriter.cpp
PPCELFStreamer.cpp
+ PPCXCOFFStreamer.cpp
LINK_COMPONENTS
MC
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 7240166..28294b4 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -247,12 +247,19 @@
#define ELF_RELOC(X, Y) .Case(#X, Y)
#include "llvm/BinaryFormat/ELFRelocs/PowerPC64.def"
#undef ELF_RELOC
+ .Case("BFD_RELOC_NONE", ELF::R_PPC64_NONE)
+ .Case("BFD_RELOC_16", ELF::R_PPC64_ADDR16)
+ .Case("BFD_RELOC_32", ELF::R_PPC64_ADDR32)
+ .Case("BFD_RELOC_64", ELF::R_PPC64_ADDR64)
.Default(-1u);
} else {
Type = llvm::StringSwitch<unsigned>(Name)
#define ELF_RELOC(X, Y) .Case(#X, Y)
#include "llvm/BinaryFormat/ELFRelocs/PowerPC.def"
#undef ELF_RELOC
+ .Case("BFD_RELOC_NONE", ELF::R_PPC_NONE)
+ .Case("BFD_RELOC_16", ELF::R_PPC_ADDR16)
+ .Case("BFD_RELOC_32", ELF::R_PPC_ADDR32)
.Default(-1u);
}
if (Type != -1u)
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
index a291a34..3f6497a 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
@@ -514,6 +514,15 @@
O << ')';
}
+void PPCInstPrinter::printMemRegImmHash(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ O << MI->getOperand(OpNo).getImm();
+ O << '(';
+ printOperand(MI, OpNo + 1, STI, O);
+ O << ')';
+}
+
void PPCInstPrinter::printMemRegImm34PCRel(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h
index 5e9b014..8f676da 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h
@@ -97,6 +97,8 @@
void printMemRegImm(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMemRegImmHash(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printMemRegImm34PCRel(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printMemRegImm34(const MCInst *MI, unsigned OpNo,
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 2b76af2..21b368e 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -63,4 +63,13 @@
// A size of 8 is only supported by the assembler under 64-bit.
Data64bitsDirective = Is64Bit ? "\t.vbyte\t8, " : nullptr;
+
+ // Debug Information
+ SupportsDebugInformation = true;
+
+ // Set up DWARF directives
+ MinInstAlignment = 4;
+
+ // Support $ as PC in inline asm
+ DollarIsPC = true;
}
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 5f0769f..4dfa7d5 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -203,6 +203,24 @@
return RegBits;
}
+unsigned
+PPCMCCodeEmitter::getMemRIHashEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Encode (imm, reg) for the hash load/store to stack for the ROP Protection
+ // instructions.
+ const MCOperand &RegMO = MI.getOperand(OpNo + 1);
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ assert(RegMO.isReg() && "Base address must be a register.");
+ assert(MO.isImm() && "Expecting an immediate operand.");
+ assert(!(MO.getImm() % 8) && "Expecting offset to be 8 byte aligned.");
+
+ unsigned RegBits = getMachineOpValue(MI, RegMO, Fixups, STI) << 6;
+ unsigned DX = (MO.getImm() >> 3) & 0x3F;
+ return RegBits | DX;
+}
+
uint64_t
PPCMCCodeEmitter::getMemRI34PCRelEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
index 347e163..39b2f12 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
@@ -69,6 +69,9 @@
unsigned getMemRIX16Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ unsigned getMemRIHashEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint64_t getMemRI34PCRelEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index bf9c6fe..e9fc056 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -15,6 +15,7 @@
#include "MCTargetDesc/PPCMCAsmInfo.h"
#include "PPCELFStreamer.h"
#include "PPCTargetStreamer.h"
+#include "PPCXCOFFStreamer.h"
#include "TargetInfo/PowerPCTargetInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringRef.h"
@@ -112,15 +113,23 @@
return MAI;
}
-static MCStreamer *createPPCMCStreamer(const Triple &T, MCContext &Context,
- std::unique_ptr<MCAsmBackend> &&MAB,
- std::unique_ptr<MCObjectWriter> &&OW,
- std::unique_ptr<MCCodeEmitter> &&Emitter,
- bool RelaxAll) {
+static MCStreamer *
+createPPCELFStreamer(const Triple &T, MCContext &Context,
+ std::unique_ptr<MCAsmBackend> &&MAB,
+ std::unique_ptr<MCObjectWriter> &&OW,
+ std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll) {
return createPPCELFStreamer(Context, std::move(MAB), std::move(OW),
std::move(Emitter));
}
+static MCStreamer *createPPCXCOFFStreamer(
+ const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> &&MAB,
+ std::unique_ptr<MCObjectWriter> &&OW,
+ std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll) {
+ return createPPCXCOFFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter));
+}
+
namespace {
class PPCTargetAsmStreamer : public PPCTargetStreamer {
@@ -130,12 +139,22 @@
PPCTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS)
: PPCTargetStreamer(S), OS(OS) {}
- void emitTCEntry(const MCSymbol &S) override {
+ void emitTCEntry(const MCSymbol &S,
+ MCSymbolRefExpr::VariantKind Kind) override {
if (const MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(&S)) {
MCSymbolXCOFF *TCSym =
cast<MCSectionXCOFF>(Streamer.getCurrentSectionOnly())
->getQualNameSymbol();
- OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << '\n';
+ // If the variant kind is VK_PPC_AIX_TLSGDM, the entry represents the
+ // region handle for the symbol, so we add the relocation specifier @m.
+ // If the variant kind is VK_PPC_AIX_TLSGD, the entry represents the
+ // variable offset for the symbol, so we add the relocation specifier @gd.
+ if (Kind == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSGD ||
+ Kind == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSGDM)
+ OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << "@"
+ << MCSymbolRefExpr::getVariantKindName(Kind) << '\n';
+ else
+ OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << '\n';
if (TCSym->hasRename())
Streamer.emitXCOFFRenameDirective(TCSym, TCSym->getSymbolTableName());
@@ -172,7 +191,8 @@
return static_cast<MCELFStreamer &>(Streamer);
}
- void emitTCEntry(const MCSymbol &S) override {
+ void emitTCEntry(const MCSymbol &S,
+ MCSymbolRefExpr::VariantKind Kind) override {
// Creates a R_PPC64_TOC relocation
Streamer.emitValueToAlignment(8);
Streamer.emitSymbolValue(&S, 8);
@@ -276,7 +296,8 @@
public:
PPCTargetMachOStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
- void emitTCEntry(const MCSymbol &S) override {
+ void emitTCEntry(const MCSymbol &S,
+ MCSymbolRefExpr::VariantKind Kind) override {
llvm_unreachable("Unknown pseudo-op: .tc");
}
@@ -298,11 +319,13 @@
public:
PPCTargetXCOFFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
- void emitTCEntry(const MCSymbol &S) override {
+ void emitTCEntry(const MCSymbol &S,
+ MCSymbolRefExpr::VariantKind Kind) override {
const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
const unsigned PointerSize = MAI->getCodePointerSize();
Streamer.emitValueToAlignment(PointerSize);
- Streamer.emitSymbolValue(&S, PointerSize);
+ Streamer.emitValue(MCSymbolRefExpr::create(&S, Kind, Streamer.getContext()),
+ PointerSize);
}
void emitMachine(StringRef CPU) override {
@@ -367,7 +390,10 @@
TargetRegistry::RegisterMCAsmBackend(*T, createPPCAsmBackend);
// Register the elf streamer.
- TargetRegistry::RegisterELFStreamer(*T, createPPCMCStreamer);
+ TargetRegistry::RegisterELFStreamer(*T, createPPCELFStreamer);
+
+ // Register the XCOFF streamer.
+ TargetRegistry::RegisterXCOFFStreamer(*T, createPPCXCOFFStreamer);
// Register the object target streamer.
TargetRegistry::RegisterObjectTargetStreamer(*T,
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
index 77b0331..64e11db 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
@@ -75,7 +75,18 @@
// Branches are 4 byte aligned, so the 24 bits we encode in
// the instruction actually represents a 26 bit offset.
return {XCOFF::RelocationType::R_RBR, EncodedSignednessIndicator | 25};
+ case PPC::fixup_ppc_br24abs:
+ return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25};
case FK_Data_4:
- return {XCOFF::RelocationType::R_POS, EncodedSignednessIndicator | 31};
+ switch (Modifier) {
+ default:
+ report_fatal_error("Unsupported modifier");
+ case MCSymbolRefExpr::VK_PPC_AIX_TLSGD:
+ return {XCOFF::RelocationType::R_TLS, EncodedSignednessIndicator | 31};
+ case MCSymbolRefExpr::VK_PPC_AIX_TLSGDM:
+ return {XCOFF::RelocationType::R_TLSM, EncodedSignednessIndicator | 31};
+ case MCSymbolRefExpr::VK_None:
+ return {XCOFF::RelocationType::R_POS, EncodedSignednessIndicator | 31};
+ }
}
}
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp
new file mode 100644
index 0000000..e582ddf
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp
@@ -0,0 +1,78 @@
+//===-------- PPCXCOFFStreamer.cpp - XCOFF Object Output ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCXCOFFStreamer for PowerPC.
+//
+// The purpose of the custom XCOFF streamer is to allow us to intercept
+// instructions as they are being emitted and align all 8 byte instructions
+// to a 64 byte boundary if required (by adding a 4 byte nop). This is important
+// because 8 byte instructions are not allowed to cross 64 byte boundaries,
+// and by aligning anything that is within 4 bytes of the boundary we can
+// guarantee that the 8 byte instructions do not cross that boundary.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCXCOFFStreamer.h"
+#include "PPCMCCodeEmitter.h"
+#include "llvm/BinaryFormat/XCOFF.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionXCOFF.h"
+#include "llvm/MC/MCSymbolXCOFF.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+PPCXCOFFStreamer::PPCXCOFFStreamer(MCContext &Context,
+ std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter)
+ : MCXCOFFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter)) {}
+
+void PPCXCOFFStreamer::emitPrefixedInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ // Prefixed instructions must not cross a 64-byte boundary (i.e. prefix is
+ // before the boundary and the remaining 4 bytes are after the boundary). In
+ // order to achieve this, a nop is added prior to any such boundary-crossing
+ // prefixed instruction. Align to 64 bytes if possible but add a maximum of 4
+ // bytes when trying to do that. If alignment requires adding more than 4
+ // bytes then the instruction won't be aligned.
+ emitCodeAlignment(64, 4);
+
+ // Emit the instruction.
+ // Since the previous emit created a new fragment, adding this instruction
+ // also forces the addition of a new fragment. Inst is now the first
+ // instruction in that new fragment.
+ MCXCOFFStreamer::emitInstruction(Inst, STI);
+}
+
+void PPCXCOFFStreamer::emitInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ PPCMCCodeEmitter *Emitter =
+ static_cast<PPCMCCodeEmitter *>(getAssembler().getEmitterPtr());
+
+ // Special handling is only for prefixed instructions.
+ if (!Emitter->isPrefixedInstruction(Inst)) {
+ MCXCOFFStreamer::emitInstruction(Inst, STI);
+ return;
+ }
+ emitPrefixedInstruction(Inst, STI);
+}
+
+MCXCOFFStreamer *
+llvm::createPPCXCOFFStreamer(MCContext &Context,
+ std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter) {
+ return new PPCXCOFFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter));
+}
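
The alignment call above can be checked with a little arithmetic: with 4-byte aligned offsets, an 8-byte prefixed instruction straddles a 64-byte boundary only when it would start at offset 60 mod 64, so at most one 4-byte nop is ever needed. A small sketch under those assumptions:

// Sketch of the boundary rule enforced by emitPrefixedInstruction: a 4-byte
// nop is inserted only when the 8-byte instruction would straddle a 64-byte
// boundary. Offsets are assumed to be 4-byte aligned already.
#include <cassert>
#include <cstdint>

static unsigned paddingBeforePrefixedInst(uint64_t Offset) {
  uint64_t ToBoundary = (64 - Offset % 64) % 64;
  if (ToBoundary == 0 || ToBoundary >= 8)
    return 0;          // instruction does not cross the boundary
  assert(ToBoundary == 4 && "offsets are assumed 4-byte aligned");
  return 4;            // one 4-byte nop, the most emitCodeAlignment(64, 4) adds
}

int main() {
  assert(paddingBeforePrefixedInst(56) == 0); // ends exactly at the boundary
  assert(paddingBeforePrefixedInst(60) == 4); // would straddle -> pad with nop
  assert(paddingBeforePrefixedInst(64) == 0); // already aligned
  return 0;
}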
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.h b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.h
new file mode 100644
index 0000000..f6eb5ed
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.h
@@ -0,0 +1,39 @@
+//===- PPCXCOFFStreamer.h - XCOFF Object Output -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCXCOFFStreamer for PowerPC.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_PPC_MCXCOFFSTREAMER_PPCXCOFFSTREAMER_H
+#define LLVM_LIB_TARGET_PPC_MCXCOFFSTREAMER_PPCXCOFFSTREAMER_H
+
+#include "llvm/MC/MCXCOFFStreamer.h"
+
+namespace llvm {
+
+class PPCXCOFFStreamer : public MCXCOFFStreamer {
+public:
+ PPCXCOFFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter);
+
+ void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+
+private:
+ void emitPrefixedInstruction(const MCInst &Inst, const MCSubtargetInfo &STI);
+};
+
+MCXCOFFStreamer *createPPCXCOFFStreamer(MCContext &Context,
+ std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter);
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_PPC_MCXCOFFSTREAMER_PPCXCOFFSTREAMER_H
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td b/src/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td
index 63531f7..76663acf 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td
@@ -138,6 +138,7 @@
(instregex "SUBF(E|ME|ZE)?(8)?(O)?(_rec)?$"),
(instregex "NEG(8)?(O)?(_rec)?$"),
(instregex "POPCNTB$"),
+ (instregex "POPCNTB8$"),
(instregex "ADD(I|IS)?(8)?$"),
(instregex "LI(S)?(8)?$"),
(instregex "(X)?OR(I|IS)?(8)?(_rec)?$"),
@@ -733,7 +734,6 @@
(instregex "DCBZ(L)?(EP)?$"),
(instregex "DCBTST(EP)?$"),
(instregex "CP_COPY(8)?$"),
- (instregex "CP_PASTE(8)?$"),
(instregex "ICBI(EP)?$"),
(instregex "ICBT(LS)?$"),
(instregex "LBARX(L)?$"),
@@ -1079,7 +1079,8 @@
DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
(instregex "MTFSF(b|_rec)?$"),
- (instregex "MTFSFI(_rec)?$")
+ (instregex "MTFSFI(_rec)?$"),
+ MTFSFIb
)>;
// Cracked instruction made of two ALU ops.
@@ -1410,6 +1411,7 @@
(instregex "NOP_GT_PWR(6|7)$"),
(instregex "TLB(IA|IVAX|SX|SX2|SX2D|LD|LI|RE|RE2|WE|WE2)$"),
(instregex "WRTEE(I)?$"),
+ (instregex "HASH(ST|STP|CHK|CHKP)$"),
ATTN,
CLRBHRB,
MFBHRBE,
@@ -1427,5 +1429,6 @@
DCBA,
DCBI,
DCCCI,
- ICCCI
+ ICCCI,
+ ADDEX
)> { let Unsupported = 1; }
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPC.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPC.h
index 264582b..7235a87 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPC.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPC.h
@@ -52,6 +52,7 @@
FunctionPass *createPPCBoolRetToIntPass();
FunctionPass *createPPCExpandISELPass();
FunctionPass *createPPCPreEmitPeepholePass();
+ FunctionPass *createPPCExpandAtomicPseudoPass();
void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AsmPrinter &AP);
bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
@@ -75,6 +76,7 @@
void initializePPCPreEmitPeepholePass(PassRegistry &);
void initializePPCTLSDynamicCallPass(PassRegistry &);
void initializePPCMIPeepholePass(PassRegistry&);
+ void initializePPCExpandAtomicPseudoPass(PassRegistry &);
extern char &PPCVSXFMAMutateID;
@@ -116,7 +118,8 @@
MO_PCREL_OPT_FLAG = 16,
/// MO_TLSGD_FLAG - If this bit is set the symbol reference is relative to
- /// TLS General Dynamic model.
+ /// TLS General Dynamic model for Linux and the variable offset of TLS
+ /// General Dynamic model for AIX.
MO_TLSGD_FLAG = 32,
/// MO_TPREL_FLAG - If this bit is set the symbol reference is relative to
@@ -127,6 +130,10 @@
/// TLS Local Dynamic model.
MO_TLSLD_FLAG = 128,
+ /// MO_TLSGDM_FLAG - If this bit is set the symbol reference is relative
+ /// to the region handle of TLS General Dynamic model for AIX.
+ MO_TLSGDM_FLAG = 256,
+
/// MO_GOT_TLSGD_PCREL_FLAG - A combination of flags, if these bits are set
/// they should produce the relocation @got@tlsgd@pcrel.
/// Fix up is VK_PPC_GOT_TLSGD_PCREL
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPC.td b/src/llvm-project/llvm/lib/Target/PowerPC/PPC.td
index 1e6ded23..ce43ced 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPC.td
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPC.td
@@ -161,6 +161,9 @@
def FeaturePartwordAtomic : SubtargetFeature<"partword-atomics",
"HasPartwordAtomics", "true",
"Enable l[bh]arx and st[bh]cx.">;
+def FeatureQuadwordAtomic : SubtargetFeature<"quadword-atomics",
+ "HasQuadwordAtomics", "true",
+ "Enable lqarx and stqcx.">;
def FeatureInvariantFunctionDescriptors :
SubtargetFeature<"invariant-function-descriptors",
"HasInvariantFunctionDescriptors", "true",
@@ -210,9 +213,13 @@
def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true",
"Treat vector data stream cache control instructions as deprecated">;
+def FeatureISA2_07 : SubtargetFeature<"isa-v207-instructions", "IsISA2_07",
+ "true",
+ "Enable instructions in ISA 2.07.">;
def FeatureISA3_0 : SubtargetFeature<"isa-v30-instructions", "IsISA3_0",
"true",
- "Enable instructions in ISA 3.0.">;
+ "Enable instructions in ISA 3.0.",
+ [FeatureISA2_07]>;
def FeatureISA3_1 : SubtargetFeature<"isa-v31-instructions", "IsISA3_1",
"true",
"Enable instructions in ISA 3.1.",
@@ -252,6 +259,13 @@
"Enable MMA instructions",
[FeatureP8Vector, FeatureP9Altivec,
FeaturePairedVectorMemops]>;
+def FeatureROPProtect :
+ SubtargetFeature<"rop-protect", "HasROPProtect", "true",
+ "Add ROP protect">;
+
+def FeaturePrivileged :
+ SubtargetFeature<"privileged", "HasPrivileged", "true",
+ "Add privileged instructions">;
def FeaturePredictableSelectIsExpensive :
SubtargetFeature<"predictable-select-expensive",
@@ -320,7 +334,9 @@
FeatureDirectMove,
FeatureICBT,
FeaturePartwordAtomic,
- FeaturePredictableSelectIsExpensive
+ FeatureQuadwordAtomic,
+ FeaturePredictableSelectIsExpensive,
+ FeatureISA2_07
];
list<SubtargetFeature> P8SpecificFeatures = [FeatureAddiLoadFusion,
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 6257709..d0109f9 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -61,6 +61,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/TargetRegistry.h"
@@ -78,11 +79,48 @@
#define DEBUG_TYPE "asmprinter"
+static cl::opt<bool> EnableSSPCanaryBitInTB(
+ "aix-ssp-tb-bit", cl::init(false),
+ cl::desc("Enable Passing SSP Canary info in Trackback on AIX"), cl::Hidden);
+
+// Specialize DenseMapInfo to allow
+// std::pair<const MCSymbol *, MCSymbolRefExpr::VariantKind> in DenseMap.
+// This specialization is needed here because that type is used as keys in the
+// map representing TOC entries.
+namespace llvm {
+template <>
+struct DenseMapInfo<std::pair<const MCSymbol *, MCSymbolRefExpr::VariantKind>> {
+ using TOCKey = std::pair<const MCSymbol *, MCSymbolRefExpr::VariantKind>;
+
+ static inline TOCKey getEmptyKey() {
+ return {nullptr, MCSymbolRefExpr::VariantKind::VK_None};
+ }
+ static inline TOCKey getTombstoneKey() {
+ return {nullptr, MCSymbolRefExpr::VariantKind::VK_Invalid};
+ }
+ static unsigned getHashValue(const TOCKey &PairVal) {
+ return detail::combineHashValue(
+ DenseMapInfo<const MCSymbol *>::getHashValue(PairVal.first),
+ DenseMapInfo<int>::getHashValue(PairVal.second));
+ }
+ static bool isEqual(const TOCKey &A, const TOCKey &B) { return A == B; }
+};
+} // end namespace llvm
+
namespace {
class PPCAsmPrinter : public AsmPrinter {
protected:
- MapVector<const MCSymbol *, MCSymbol *> TOC;
+ // For TLS on AIX, we need to be able to identify TOC entries of specific
+ // VariantKind so we can add the right relocations when we generate the
+ // entries. So each entry is represented by a pair of MCSymbol and
+ // VariantKind. For example, we need to be able to identify the following
+ // entry as a TLSGD entry so we can add the @m relocation:
+ // .tc .i[TC],i[TL]@m
+ // By default, VK_None is used for the VariantKind.
+ MapVector<std::pair<const MCSymbol *, MCSymbolRefExpr::VariantKind>,
+ MCSymbol *>
+ TOC;
const PPCSubtarget *Subtarget = nullptr;
StackMaps SM;
@@ -93,7 +131,9 @@
StringRef getPassName() const override { return "PowerPC Assembly Printer"; }
- MCSymbol *lookUpOrCreateTOCEntry(const MCSymbol *Sym);
+ MCSymbol *lookUpOrCreateTOCEntry(const MCSymbol *Sym,
+ MCSymbolRefExpr::VariantKind Kind =
+ MCSymbolRefExpr::VariantKind::VK_None);
bool doInitialization(Module &M) override {
if (!TOC.empty())
@@ -158,14 +198,18 @@
/// sinit/sterm function names.
std::string FormatIndicatorAndUniqueModId;
- static void ValidateGV(const GlobalVariable *GV);
// Record a list of GlobalAlias associated with a GlobalObject.
// This is used for AIX's extra-label-at-definition aliasing strategy.
DenseMap<const GlobalObject *, SmallVector<const GlobalAlias *, 1>>
GOAliasMap;
+ uint16_t getNumberOfVRSaved();
void emitTracebackTable();
+ SmallVector<const GlobalVariable *, 8> TOCDataGlobalVars;
+
+ void emitGlobalVariableHelper(const GlobalVariable *);
+
public:
PPCAIXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
: PPCAsmPrinter(TM, std::move(Streamer)) {
@@ -350,8 +394,10 @@
/// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry
/// exists for it. If not, create one. Then return a symbol that references
/// the TOC entry.
-MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(const MCSymbol *Sym) {
- MCSymbol *&TOCEntry = TOC[Sym];
+MCSymbol *
+PPCAsmPrinter::lookUpOrCreateTOCEntry(const MCSymbol *Sym,
+ MCSymbolRefExpr::VariantKind Kind) {
+ MCSymbol *&TOCEntry = TOC[{Sym, Kind}];
if (!TOCEntry)
TOCEntry = createTempSymbol("C");
return TOCEntry;
@@ -492,12 +538,20 @@
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
}
+/// This helper function creates the TlsGetAddr MCSymbol for AIX. We will
+/// create the csect and use the qual-name symbol instead of creating just the
+/// external symbol.
+static MCSymbol *createMCSymbolForTlsGetAddr(MCContext &Ctx) {
+ return Ctx
+ .getXCOFFSection(".__tls_get_addr", SectionKind::getText(),
+ XCOFF::CsectProperties(XCOFF::XMC_PR, XCOFF::XTY_ER))
+ ->getQualNameSymbol();
+}
+
/// EmitTlsCall -- Given a GETtls[ld]ADDR[32] instruction, print a
/// call to __tls_get_addr to the current output stream.
void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
MCSymbolRefExpr::VariantKind VK) {
- StringRef Name = "__tls_get_addr";
- MCSymbol *TlsGetAddr = OutContext.getOrCreateSymbol(Name);
MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
unsigned Opcode = PPC::BL8_NOP_TLS;
@@ -518,6 +572,25 @@
(!Subtarget->isPPC64() && MI->getOperand(1).getReg() == PPC::R3)) &&
"GETtls[ld]ADDR[32] must read GPR3");
+ if (Subtarget->isAIXABI()) {
+ // On AIX, the variable offset should already be in R4 and the region handle
+ // should already be in R3.
+ // For TLSGD, which currently is the only supported access model, we only
+ // need to generate an absolute branch to .__tls_get_addr.
+ Register VarOffsetReg = Subtarget->isPPC64() ? PPC::X4 : PPC::R4;
+ (void)VarOffsetReg;
+ assert(MI->getOperand(2).isReg() &&
+ MI->getOperand(2).getReg() == VarOffsetReg &&
+ "GETtls[ld]ADDR[32] must read GPR4");
+ MCSymbol *TlsGetAddr = createMCSymbolForTlsGetAddr(OutContext);
+ const MCExpr *TlsRef = MCSymbolRefExpr::create(
+ TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BLA).addExpr(TlsRef));
+ return;
+ }
+
+ MCSymbol *TlsGetAddr = OutContext.getOrCreateSymbol("__tls_get_addr");
+
if (Subtarget->is32BitELFABI() && isPositionIndependent())
Kind = MCSymbolRefExpr::VK_PLT;
@@ -608,9 +681,11 @@
auto getTOCEntryLoadingExprForXCOFF =
[IsPPC64, getTOCRelocAdjustedExprForXCOFF,
- this](const MCSymbol *MOSymbol, const MCExpr *Expr) -> const MCExpr * {
+ this](const MCSymbol *MOSymbol, const MCExpr *Expr,
+ MCSymbolRefExpr::VariantKind VK =
+ MCSymbolRefExpr::VariantKind::VK_None) -> const MCExpr * {
const unsigned EntryByteSize = IsPPC64 ? 8 : 4;
- const auto TOCEntryIter = TOC.find(MOSymbol);
+ const auto TOCEntryIter = TOC.find({MOSymbol, VK});
assert(TOCEntryIter != TOC.end() &&
"Could not find the TOC entry for this symbol.");
const ptrdiff_t EntryDistanceFromTOCBase =
@@ -622,6 +697,16 @@
return Expr;
};
+ auto GetVKForMO = [&](const MachineOperand &MO) {
+ // For GD TLS access on AIX, we have two TOC entries for the symbol (one for
+ // the variable offset and the other for the region handle). They are
+ // differentiated by MO_TLSGD_FLAG and MO_TLSGDM_FLAG.
+ if (MO.getTargetFlags() & PPCII::MO_TLSGDM_FLAG)
+ return MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSGDM;
+ if (MO.getTargetFlags() & PPCII::MO_TLSGD_FLAG)
+ return MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSGD;
+ return MCSymbolRefExpr::VariantKind::VK_None;
+ };
// Lower multi-instruction pseudo operations.
switch (MI->getOpcode()) {
@@ -755,10 +840,12 @@
return;
}
+ MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO);
+
// Otherwise, use the TOC. 'TOCEntry' is a label used to reference the
// storage allocated in the TOC which contains the address of
// 'MOSymbol'. Said TOC entry will be synthesized later.
- MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+ MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol, VK);
const MCExpr *Exp =
MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_None, OutContext);
@@ -769,8 +856,12 @@
assert(
TM.getCodeModel() == CodeModel::Small &&
"This pseudo should only be selected for 32-bit small code model.");
- Exp = getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp);
+ Exp = getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp, VK);
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+
+ // Print MO for better readability
+ if (isVerbose())
+ OutStreamer->GetCommentOS() << MO << '\n';
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
@@ -784,6 +875,30 @@
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
+ case PPC::ADDItoc: {
+ assert(IsAIX && TM.getCodeModel() == CodeModel::Small &&
+ "Operand only valid in AIX 32 bit mode");
+
+ // Transform %rN = ADDItoc @op1, %r2.
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
+
+ // Change the opcode to load address.
+ TmpInst.setOpcode(PPC::LA);
+
+ const MachineOperand &MO = MI->getOperand(1);
+ assert(MO.isGlobal() && "Invalid operand for ADDItoc.");
+
+ // Map the operand to its corresponding MCSymbol.
+ const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
+
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_None, OutContext);
+
+ TmpInst.getOperand(1) = TmpInst.getOperand(2);
+ TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
case PPC::LDtocJTI:
case PPC::LDtocCPT:
case PPC::LDtocBA:
@@ -801,17 +916,22 @@
// Map the operand to its corresponding MCSymbol.
const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
+ MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO);
+
// Map the machine operand to its corresponding MCSymbol, then map the
// global address operand to be a reference to the TOC entry we will
// synthesize later.
- MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+ MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol, VK);
- const MCSymbolRefExpr::VariantKind VK =
+ MCSymbolRefExpr::VariantKind VKExpr =
IsAIX ? MCSymbolRefExpr::VK_None : MCSymbolRefExpr::VK_PPC_TOC;
- const MCExpr *Exp =
- MCSymbolRefExpr::create(TOCEntry, VK, OutContext);
+ const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry, VKExpr, OutContext);
TmpInst.getOperand(1) = MCOperand::createExpr(
- IsAIX ? getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp) : Exp);
+ IsAIX ? getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp, VK) : Exp);
+
+ // Print MO for better readability
+ if (isVerbose() && IsAIX)
+ OutStreamer->GetCommentOS() << MO << '\n';
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
@@ -833,11 +953,13 @@
// Map the machine operand to its corresponding MCSymbol.
MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
+ MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO);
+
// Always use TOC on AIX. Map the global address operand to be a reference
// to the TOC entry we will synthesize later. 'TOCEntry' is a label used to
// reference the storage allocated in the TOC which contains the address of
// 'MOSymbol'.
- MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+ MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol, VK);
const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry,
MCSymbolRefExpr::VK_PPC_U,
OutContext);
@@ -863,11 +985,13 @@
// Map the machine operand to its corresponding MCSymbol.
MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
+ MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO);
+
// Always use TOC on AIX. Map the global address operand to be a reference
// to the TOC entry we will synthesize later. 'TOCEntry' is a label used to
// reference the storage allocated in the TOC which contains the address of
// 'MOSymbol'.
- MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+ MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol, VK);
const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry,
MCSymbolRefExpr::VK_PPC_L,
OutContext);
@@ -891,14 +1015,15 @@
const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
+ MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO);
+
const bool GlobalToc =
MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal());
if (GlobalToc || MO.isJTI() || MO.isBlockAddress() ||
(MO.isCPI() && TM.getCodeModel() == CodeModel::Large))
- MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+ MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, VK);
- const MCSymbolRefExpr::VariantKind VK =
- IsAIX ? MCSymbolRefExpr::VK_PPC_U : MCSymbolRefExpr::VK_PPC_TOC_HA;
+ VK = IsAIX ? MCSymbolRefExpr::VK_PPC_U : MCSymbolRefExpr::VK_PPC_TOC_HA;
const MCExpr *Exp =
MCSymbolRefExpr::create(MOSymbol, VK, OutContext);
@@ -935,11 +1060,12 @@
const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
- if (!MO.isCPI() || TM.getCodeModel() == CodeModel::Large)
- MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+ MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO);
- const MCSymbolRefExpr::VariantKind VK =
- IsAIX ? MCSymbolRefExpr::VK_PPC_L : MCSymbolRefExpr::VK_PPC_TOC_LO;
+ if (!MO.isCPI() || TM.getCodeModel() == CodeModel::Large)
+ MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, VK);
+
+ VK = IsAIX ? MCSymbolRefExpr::VK_PPC_L : MCSymbolRefExpr::VK_PPC_TOC_LO;
const MCExpr *Exp =
MCSymbolRefExpr::create(MOSymbol, VK, OutContext);
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
@@ -1088,6 +1214,11 @@
// Transform: %x3 = GETtlsADDR %x3, @sym
// Into: BL8_NOP_TLS __tls_get_addr(sym at tlsgd)
case PPC::GETtlsADDRPCREL:
+ case PPC::GETtlsADDR32AIX:
+ case PPC::GETtlsADDR64AIX:
+ // Transform: %r3 = GETtlsADDRNNAIX %r3, %r4 (for NN == 32/64).
+ // Into: BLA .__tls_get_addr()
+ // Unlike on Linux, there is no symbol or relocation needed for this call.
case PPC::GETtlsADDR32: {
// Transform: %r3 = GETtlsADDR32 %r3, @sym
// Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT
@@ -1241,6 +1372,16 @@
// Now process the instruction normally.
break;
}
+ case PPC::PseudoEIEIO: {
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(PPC::ORI).addReg(PPC::X2).addReg(PPC::X2).addImm(0));
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(PPC::ORI).addReg(PPC::X2).addReg(PPC::X2).addImm(0));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::EnforceIEIO));
+ return;
+ }
}
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
@@ -1291,8 +1432,7 @@
unsigned RetOpcode = MI->getOperand(0).getImm();
MCInst RetInst;
RetInst.setOpcode(RetOpcode);
- for (const auto &MO :
- make_range(std::next(MI->operands_begin()), MI->operands_end())) {
+ for (const auto &MO : llvm::drop_begin(MI->operands())) {
MCOperand MCOp;
if (LowerPPCMachineOperandToMCOperand(MO, MCOp, *this))
RetInst.addOperand(MCOp);
@@ -1511,12 +1651,12 @@
OutStreamer->emitValueToAlignment(4);
for (const auto &TOCMapPair : TOC) {
- const MCSymbol *const TOCEntryTarget = TOCMapPair.first;
+ const MCSymbol *const TOCEntryTarget = TOCMapPair.first.first;
MCSymbol *const TOCEntryLabel = TOCMapPair.second;
OutStreamer->emitLabel(TOCEntryLabel);
if (isPPC64 && TS != nullptr)
- TS->emitTCEntry(*TOCEntryTarget);
+ TS->emitTCEntry(*TOCEntryTarget, TOCMapPair.first.second);
else
OutStreamer->emitSymbolValue(TOCEntryTarget, 4);
}
@@ -1742,12 +1882,54 @@
return AsmPrinter::SetupMachineFunction(MF);
}
+uint16_t PPCAIXAsmPrinter::getNumberOfVRSaved() {
+  // Calculate the number of VRs to be saved.
+ // Vector registers 20 through 31 are marked as reserved and cannot be used
+ // in the default ABI.
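+  // Callee-saved VRs are spilled as a contiguous block ending at V31, so the
+  // lowest modified register in the V20-V31 range determines how many are
+  // saved.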
+ const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
+ if (Subtarget.isAIXABI() && Subtarget.hasAltivec() &&
+ TM.getAIXExtendedAltivecABI()) {
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ for (unsigned Reg = PPC::V20; Reg <= PPC::V31; ++Reg)
+ if (MRI.isPhysRegModified(Reg))
+ // Number of VRs saved.
+ return PPC::V31 - Reg + 1;
+ }
+ return 0;
+}
+
void PPCAIXAsmPrinter::emitFunctionBodyEnd() {
if (!TM.getXCOFFTracebackTable())
return;
emitTracebackTable();
+
+  // If ShouldEmitEHBlock returns true, then the EH info table
+  // will be emitted via `AIXException::endFunction`. Otherwise, we
+  // need to emit a dummy EH info table when VRs are saved. We cannot
+  // consolidate these two places into one because there is no easy way
+  // to access register information in the `AIXException` class.
+ if (!TargetLoweringObjectFileXCOFF::ShouldEmitEHBlock(MF) &&
+ (getNumberOfVRSaved() > 0)) {
+ // Emit dummy EH Info Table.
+ OutStreamer->SwitchSection(getObjFileLowering().getCompactUnwindSection());
+ MCSymbol *EHInfoLabel =
+ TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(MF);
+ OutStreamer->emitLabel(EHInfoLabel);
+
+ // Version number.
+ OutStreamer->emitInt32(0);
+
+ const DataLayout &DL = MMI->getModule()->getDataLayout();
+ const unsigned PointerSize = DL.getPointerSize();
+    // Add the necessary padding in 64-bit mode.
+ OutStreamer->emitValueToAlignment(PointerSize);
+
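+    // The two pointer-sized fields (which normally hold the LSDA and the
+    // personality routine addresses) are left zeroed in this dummy table.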
+ OutStreamer->emitIntValue(0, PointerSize);
+ OutStreamer->emitIntValue(0, PointerSize);
+ OutStreamer->SwitchSection(MF->getSection());
+ }
}
void PPCAIXAsmPrinter::emitTracebackTable() {
@@ -1802,7 +1984,7 @@
// Check the function uses floating-point processor instructions or not
for (unsigned Reg = PPC::F0; Reg <= PPC::F31; ++Reg) {
- if (MRI.isPhysRegUsed(Reg)) {
+ if (MRI.isPhysRegUsed(Reg, /* SkipRegMaskTest */ true)) {
FirstHalfOfMandatoryField |= TracebackTable::IsFloatingPointPresentMask;
break;
}
@@ -1842,7 +2024,8 @@
FirstHalfOfMandatoryField |= TracebackTable::IsFunctionNamePresentMask;
static_assert(XCOFF::AllocRegNo == 31, "Unexpected register usage!");
- if (MRI.isPhysRegUsed(Subtarget->isPPC64() ? PPC::X31 : PPC::R31))
+ if (MRI.isPhysRegUsed(Subtarget->isPPC64() ? PPC::X31 : PPC::R31,
+ /* SkipRegMaskTest */ true))
FirstHalfOfMandatoryField |= TracebackTable::IsAllocaUsedMask;
const SmallVectorImpl<Register> &MustSaveCRs = FI->getMustSaveCRs();
@@ -1887,7 +2070,24 @@
(SecondHalfOfMandatoryField & 0xff000000) >> 24, 1);
// Set the 6th byte of mandatory field.
- bool ShouldEmitEHBlock = TargetLoweringObjectFileXCOFF::ShouldEmitEHBlock(MF);
+
+  // Check whether the function uses any vector instructions; we only treat
+  // instructions that use vector registers as vector instructions.
+ bool HasVectorInst = false;
+ for (unsigned Reg = PPC::V0; Reg <= PPC::V31; ++Reg)
+ if (MRI.isPhysRegUsed(Reg, /* SkipRegMaskTest */ true)) {
+ // Has VMX instruction.
+ HasVectorInst = true;
+ break;
+ }
+
+ if (FI->hasVectorParms() || HasVectorInst)
+ SecondHalfOfMandatoryField |= TracebackTable::HasVectorInfoMask;
+
+ uint16_t NumOfVRSaved = getNumberOfVRSaved();
+ bool ShouldEmitEHBlock =
+ TargetLoweringObjectFileXCOFF::ShouldEmitEHBlock(MF) || NumOfVRSaved > 0;
+
if (ShouldEmitEHBlock)
SecondHalfOfMandatoryField |= TracebackTable::HasExtensionTableMask;
@@ -1907,17 +2107,17 @@
SecondHalfOfMandatoryField |= (GPRSaved << TracebackTable::GPRSavedShift) &
TracebackTable::GPRSavedMask;
- GENBOOLCOMMENT("", SecondHalfOfMandatoryField, HasVectorInfo);
- GENBOOLCOMMENT(", ", SecondHalfOfMandatoryField, HasExtensionTable);
+ GENBOOLCOMMENT("", SecondHalfOfMandatoryField, HasExtensionTable);
+ GENBOOLCOMMENT(", ", SecondHalfOfMandatoryField, HasVectorInfo);
GENVALUECOMMENT(", NumOfGPRsSaved", SecondHalfOfMandatoryField, GPRSaved);
EmitComment();
OutStreamer->emitIntValueInHexWithPadding(
(SecondHalfOfMandatoryField & 0x00ff0000) >> 16, 1);
// Set the 7th byte of mandatory field.
- uint32_t NumberOfFixedPara = FI->getFixedParamNum();
+ uint32_t NumberOfFixedParms = FI->getFixedParmsNum();
SecondHalfOfMandatoryField |=
- (NumberOfFixedPara << TracebackTable::NumberOfFixedParmsShift) &
+ (NumberOfFixedParms << TracebackTable::NumberOfFixedParmsShift) &
TracebackTable::NumberOfFixedParmsMask;
GENVALUECOMMENT("NumberOfFixedParms", SecondHalfOfMandatoryField,
NumberOfFixedParms);
@@ -1930,9 +2130,9 @@
// Always set parameter on stack.
SecondHalfOfMandatoryField |= TracebackTable::HasParmsOnStackMask;
- uint32_t NumberOfFPPara = FI->getFloatingPointParamNum();
+ uint32_t NumberOfFPParms = FI->getFloatingPointParmsNum();
SecondHalfOfMandatoryField |=
- (NumberOfFPPara << TracebackTable::NumberOfFloatingPointParmsShift) &
+ (NumberOfFPParms << TracebackTable::NumberOfFloatingPointParmsShift) &
TracebackTable::NumberOfFloatingPointParmsMask;
GENVALUECOMMENT("NumberOfFPParms", SecondHalfOfMandatoryField,
@@ -1945,18 +2145,25 @@
// Generate the optional fields of traceback table.
// Parameter type.
- if (NumberOfFixedPara || NumberOfFPPara) {
- assert((SecondHalfOfMandatoryField & TracebackTable::HasVectorInfoMask) ==
- 0 &&
- "VectorInfo has not been implemented.");
- uint32_t ParaType = FI->getParameterType();
- CommentOS << "Parameter type = "
- << XCOFF::parseParmsType(ParaType,
- NumberOfFixedPara + NumberOfFPPara);
- EmitComment();
- OutStreamer->emitIntValueInHexWithPadding(ParaType, sizeof(ParaType));
- }
+ if (NumberOfFixedParms || NumberOfFPParms) {
+ uint32_t ParmsTypeValue = FI->getParmsType();
+ Expected<SmallString<32>> ParmsType =
+ FI->hasVectorParms()
+ ? XCOFF::parseParmsTypeWithVecInfo(
+ ParmsTypeValue, NumberOfFixedParms, NumberOfFPParms,
+ FI->getVectorParmsNum())
+ : XCOFF::parseParmsType(ParmsTypeValue, NumberOfFixedParms,
+ NumberOfFPParms);
+
+ assert(ParmsType && toString(ParmsType.takeError()).c_str());
+ if (ParmsType) {
+ CommentOS << "Parameter type = " << ParmsType.get();
+ EmitComment();
+ }
+ OutStreamer->emitIntValueInHexWithPadding(ParmsTypeValue,
+ sizeof(ParmsTypeValue));
+ }
// Traceback table offset.
OutStreamer->AddComment("Function size");
if (FirstHalfOfMandatoryField & TracebackTable::HasTraceBackTableOffsetMask) {
@@ -1988,10 +2195,66 @@
OutStreamer->emitIntValueInHex(AllocReg, sizeof(AllocReg));
}
+ if (SecondHalfOfMandatoryField & TracebackTable::HasVectorInfoMask) {
+ uint16_t VRData = 0;
+ if (NumOfVRSaved) {
+ // Number of VRs saved.
+ VRData |= (NumOfVRSaved << TracebackTable::NumberOfVRSavedShift) &
+ TracebackTable::NumberOfVRSavedMask;
+      // This bit is supposed to be set only when the special register
+      // VRSAVE is saved on the stack.
+      // However, the IBM XL compiler sets the bit when any vector registers
+      // are saved on the stack. We follow XL's behavior on AIX
+      // so that we don't get a surprising behavior change for C code.
+ VRData |= TracebackTable::IsVRSavedOnStackMask;
+ }
+
+ // Set has_varargs.
+ if (FI->getVarArgsFrameIndex())
+ VRData |= TracebackTable::HasVarArgsMask;
+
+ // Vector parameters number.
+ unsigned VectorParmsNum = FI->getVectorParmsNum();
+ VRData |= (VectorParmsNum << TracebackTable::NumberOfVectorParmsShift) &
+ TracebackTable::NumberOfVectorParmsMask;
+
+ if (HasVectorInst)
+ VRData |= TracebackTable::HasVMXInstructionMask;
+
+ GENVALUECOMMENT("NumOfVRsSaved", VRData, NumberOfVRSaved);
+ GENBOOLCOMMENT(", ", VRData, IsVRSavedOnStack);
+ GENBOOLCOMMENT(", ", VRData, HasVarArgs);
+ EmitComment();
+ OutStreamer->emitIntValueInHexWithPadding((VRData & 0xff00) >> 8, 1);
+
+ GENVALUECOMMENT("NumOfVectorParams", VRData, NumberOfVectorParms);
+ GENBOOLCOMMENT(", ", VRData, HasVMXInstruction);
+ EmitComment();
+ OutStreamer->emitIntValueInHexWithPadding(VRData & 0x00ff, 1);
+
+ uint32_t VecParmTypeValue = FI->getVecExtParmsType();
+
+ Expected<SmallString<32>> VecParmsType =
+ XCOFF::parseVectorParmsType(VecParmTypeValue, VectorParmsNum);
+ assert(VecParmsType && toString(VecParmsType.takeError()).c_str());
+ if (VecParmsType) {
+ CommentOS << "Vector Parameter type = " << VecParmsType.get();
+ EmitComment();
+ }
+ OutStreamer->emitIntValueInHexWithPadding(VecParmTypeValue,
+ sizeof(VecParmTypeValue));
+ // Padding 2 bytes.
+ CommentOS << "Padding";
+ EmitCommentAndValue(0, 2);
+ }
+
uint8_t ExtensionTableFlag = 0;
if (SecondHalfOfMandatoryField & TracebackTable::HasExtensionTableMask) {
if (ShouldEmitEHBlock)
ExtensionTableFlag |= ExtendedTBTableFlag::TB_EH_INFO;
+ if (EnableSSPCanaryBitInTB &&
+ TargetLoweringObjectFileXCOFF::ShouldSetSSPCanaryBitInTB(MF))
+ ExtensionTableFlag |= ExtendedTBTableFlag::TB_SSP_CANARY;
CommentOS << "ExtensionTableFlag = "
<< getExtendedTBTableFlagString(ExtensionTableFlag);
@@ -2015,20 +2278,10 @@
OutStreamer->AddComment("EHInfo Table");
OutStreamer->emitValue(Exp, DL.getPointerSize());
}
-
#undef GENBOOLCOMMENT
#undef GENVALUECOMMENT
}
-void PPCAIXAsmPrinter::ValidateGV(const GlobalVariable *GV) {
- // Early error checking limiting what is supported.
- if (GV->isThreadLocal())
- report_fatal_error("Thread local not yet supported on AIX.");
-
- if (GV->hasComdat())
- report_fatal_error("COMDAT not yet supported by AIX.");
-}
-
static bool isSpecialLLVMGlobalArrayToSkip(const GlobalVariable *GV) {
return GV->hasAppendingLinkage() &&
StringSwitch<bool>(GV->getName())
@@ -2052,9 +2305,22 @@
if (isSpecialLLVMGlobalArrayToSkip(GV) || isSpecialLLVMGlobalArrayForStaticInit(GV))
return;
+ // If the Global Variable has the toc-data attribute, it needs to be emitted
+ // when we emit the .toc section.
+ if (GV->hasAttribute("toc-data")) {
+ TOCDataGlobalVars.push_back(GV);
+ return;
+ }
+
+ emitGlobalVariableHelper(GV);
+}
+
+void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) {
assert(!GV->getName().startswith("llvm.") &&
"Unhandled intrinsic global variable.");
- ValidateGV(GV);
+
+ if (GV->hasComdat())
+ report_fatal_error("COMDAT not yet supported by AIX.");
MCSymbolXCOFF *GVSym = cast<MCSymbolXCOFF>(getSymbol(GV));
@@ -2064,10 +2330,20 @@
}
SectionKind GVKind = getObjFileLowering().getKindForGlobal(GV, TM);
- if (!GVKind.isGlobalWriteableData() && !GVKind.isReadOnly())
+ if (!GVKind.isGlobalWriteableData() && !GVKind.isReadOnly() &&
+ !GVKind.isThreadLocal()) // Checks for both ThreadData and ThreadBSS.
report_fatal_error("Encountered a global variable kind that is "
"not supported yet.");
+ // Print GV in verbose mode
+ if (isVerbose()) {
+ if (GV->hasInitializer()) {
+ GV->printAsOperand(OutStreamer->GetCommentOS(),
+ /*PrintType=*/false, GV->getParent());
+ OutStreamer->GetCommentOS() << '\n';
+ }
+ }
+
MCSectionXCOFF *Csect = cast<MCSectionXCOFF>(
getObjFileLowering().SectionForGlobal(GV, GVKind, TM));
@@ -2076,14 +2352,15 @@
const DataLayout &DL = GV->getParent()->getDataLayout();
- // Handle common symbols.
- if (GVKind.isCommon() || GVKind.isBSSLocal()) {
+ // Handle common and zero-initialized local symbols.
+ if (GV->hasCommonLinkage() || GVKind.isBSSLocal() ||
+ GVKind.isThreadBSSLocal()) {
Align Alignment = GV->getAlign().getValueOr(DL.getPreferredAlign(GV));
- uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
+ uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
GVSym->setStorageClass(
TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV));
- if (GVKind.isBSSLocal())
+ if (GVKind.isBSSLocal() || GVKind.isThreadBSSLocal())
OutStreamer->emitXCOFFLocalCommonSymbol(
OutContext.getOrCreateSymbol(GVSym->getSymbolTableName()), Size,
GVSym, Alignment.value());
@@ -2155,9 +2432,9 @@
}
void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
- // If there are no functions in this module, we will never need to reference
- // the TOC base.
- if (M.empty())
+ // If there are no functions and there are no toc-data definitions in this
+ // module, we will never need to reference the TOC base.
+ if (M.empty() && TOCDataGlobalVars.empty())
return;
// Switch to section to emit TOC base.
@@ -2167,15 +2444,31 @@
static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
for (auto &I : TOC) {
- // Setup the csect for the current TC entry.
- MCSectionXCOFF *TCEntry = cast<MCSectionXCOFF>(
- getObjFileLowering().getSectionForTOCEntry(I.first, TM));
+ MCSectionXCOFF *TCEntry;
+    // Set up the csect for the current TC entry. If the variant kind is
+    // VK_PPC_AIX_TLSGDM, the entry represents the region handle, so we create
+    // a new symbol to prefix the name with a dot.
+ if (I.first.second == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSGDM) {
+ SmallString<128> Name;
+ StringRef Prefix = ".";
+ Name += Prefix;
+ Name += I.first.first->getName();
+ MCSymbol *S = OutContext.getOrCreateSymbol(Name);
+ TCEntry = cast<MCSectionXCOFF>(
+ getObjFileLowering().getSectionForTOCEntry(S, TM));
+ } else {
+ TCEntry = cast<MCSectionXCOFF>(
+ getObjFileLowering().getSectionForTOCEntry(I.first.first, TM));
+ }
OutStreamer->SwitchSection(TCEntry);
OutStreamer->emitLabel(I.second);
if (TS != nullptr)
- TS->emitTCEntry(*I.first);
+ TS->emitTCEntry(*I.first.first, I.first.second);
}
+
+ for (const auto *GV : TOCDataGlobalVars)
+ emitGlobalVariableHelper(GV);
}
bool PPCAIXAsmPrinter::doInitialization(Module &M) {
@@ -2248,6 +2541,14 @@
switch (MI->getOpcode()) {
default:
break;
+ case PPC::GETtlsADDR64AIX:
+ case PPC::GETtlsADDR32AIX: {
+ // The reference to .__tls_get_addr is unknown to the assembler
+ // so we need to emit an external symbol reference.
+ MCSymbol *TlsGetAddr = createMCSymbolForTlsGetAddr(OutContext);
+ ExtSymSDNodeSymbols.insert(TlsGetAddr);
+ break;
+ }
case PPC::BL8:
case PPC::BL:
case PPC::BL8_NOP:
@@ -2273,11 +2574,28 @@
if (MI->getOperand(0).isSymbol())
report_fatal_error("Tail call for extern symbol not yet supported.");
break;
+ case PPC::DST:
+ case PPC::DST64:
+ case PPC::DSTT:
+ case PPC::DSTT64:
+ case PPC::DSTST:
+ case PPC::DSTST64:
+ case PPC::DSTSTT:
+ case PPC::DSTSTT64:
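+    // Lower these data-stream-touch hints to a no-op (ori r0, r0, 0).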
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(PPC::ORI).addReg(PPC::R0).addReg(PPC::R0).addImm(0));
+ return;
}
return PPCAsmPrinter::emitInstruction(MI);
}
bool PPCAIXAsmPrinter::doFinalization(Module &M) {
+ // Do streamer related finalization for DWARF.
+ if (!MAI->usesDwarfFileAndLocDirectives() && MMI->hasDebugInfo())
+ OutStreamer->doFinalizationAtSectionEnd(
+ OutStreamer->getContext().getObjectFileInfo()->getTextSection());
+
for (MCSymbol *Sym : ExtSymSDNodeSymbols)
OutStreamer->emitSymbolAttribute(Sym, MCSA_Extern);
return PPCAsmPrinter::doFinalization(M);
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
index 47b9e97..fa6713d 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -415,5 +415,5 @@
}
BlockSizes.clear();
- return true;
+ return EverMadeChange;
}
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCCCState.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPCCCState.h
index e349959..b0e50b2 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCCCState.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCCCState.h
@@ -10,6 +10,7 @@
#define PPCCCSTATE_H
#include "PPCISelLowering.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
@@ -36,6 +37,37 @@
bool WasOriginalArgPPCF128(unsigned ValNo) { return OriginalArgWasPPCF128[ValNo]; }
void clearWasPPCF128() { OriginalArgWasPPCF128.clear(); }
};
-}
+
+class AIXCCState : public CCState {
+private:
+ BitVector IsFixed;
+
+public:
+ AIXCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF,
+ SmallVectorImpl<CCValAssign> &Locs, LLVMContext &C)
+ : CCState(CC, IsVarArg, MF, Locs, C) {}
+
+ void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn Fn) {
+ // All formal arguments are fixed.
+ IsFixed.resize(Ins.size(), true);
+ CCState::AnalyzeFormalArguments(Ins, Fn);
+ }
+
+ void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ CCAssignFn Fn) {
+ // Record whether the call operand was a fixed argument.
+ IsFixed.resize(Outs.size(), false);
+ for (unsigned ValNo = 0, E = Outs.size(); ValNo != E; ++ValNo)
+ if (Outs[ValNo].IsFixed)
+ IsFixed.set(ValNo);
+
+ CCState::AnalyzeCallOperands(Outs, Fn);
+ }
+
+ bool isFixed(unsigned ValNo) const { return IsFixed.test(ValNo); }
+};
+
+} // end namespace llvm
#endif
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCCallingConv.td b/src/llvm-project/llvm/lib/Target/PowerPC/PPCCallingConv.td
index cc34867..1e81276 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCCallingConv.td
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCCallingConv.td
@@ -355,6 +355,11 @@
def CSR_64_AllRegs_Altivec : CalleeSavedRegs<(add CSR_64_AllRegs,
(sequence "V%u", 0, 31))>;
+def CSR_64_AllRegs_AIX_Dflt_Altivec : CalleeSavedRegs<(add CSR_64_AllRegs,
+ (sequence "V%u", 0, 19))>;
+
def CSR_64_AllRegs_VSX : CalleeSavedRegs<(add CSR_64_AllRegs_Altivec,
(sequence "VSL%u", 0, 31))>;
+def CSR_64_AllRegs_AIX_Dflt_VSX : CalleeSavedRegs<(add CSR_64_AllRegs_Altivec,
+ (sequence "VSL%u", 0, 19))>;
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
new file mode 100644
index 0000000..9daef26
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
@@ -0,0 +1,306 @@
+//===-- PPCExpandAtomicPseudoInsts.cpp - Expand atomic pseudo instrs. -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands atomic pseudo instructions into
+// target instructions post-RA. With this approach, the LL/SC loop is treated
+// as a single blob, which makes spills inside the loop unlikely.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCTargetMachine.h"
+
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-atomic-expand"
+
+namespace {
+
+class PPCExpandAtomicPseudo : public MachineFunctionPass {
+public:
+ const PPCInstrInfo *TII;
+ const PPCRegisterInfo *TRI;
+ static char ID;
+
+ PPCExpandAtomicPseudo() : MachineFunctionPass(ID) {
+ initializePPCExpandAtomicPseudoPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ bool expandMI(MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineBasicBlock::iterator &NMBBI);
+ bool expandAtomicRMW128(MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineBasicBlock::iterator &NMBBI);
+ bool expandAtomicCmpSwap128(MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineBasicBlock::iterator &NMBBI);
+};
+
+static void PairedCopy(const PPCInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ Register Dest0, Register Dest1, Register Src0,
+ Register Src1) {
+ const MCInstrDesc &OR = TII->get(PPC::OR8);
+ const MCInstrDesc &XOR = TII->get(PPC::XOR8);
+ if (Dest0 == Src1 && Dest1 == Src0) {
+ // The most tricky case, swapping values.
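+    // Classic three-XOR swap: the two registers are exchanged without needing
+    // a scratch register.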
+ BuildMI(MBB, MBBI, DL, XOR, Dest0).addReg(Dest0).addReg(Dest1);
+ BuildMI(MBB, MBBI, DL, XOR, Dest1).addReg(Dest0).addReg(Dest1);
+ BuildMI(MBB, MBBI, DL, XOR, Dest0).addReg(Dest0).addReg(Dest1);
+ } else if (Dest0 != Src0 || Dest1 != Src1) {
+ if (Dest0 == Src1 || Dest1 != Src0) {
+ BuildMI(MBB, MBBI, DL, OR, Dest1).addReg(Src1).addReg(Src1);
+ BuildMI(MBB, MBBI, DL, OR, Dest0).addReg(Src0).addReg(Src0);
+ } else {
+ BuildMI(MBB, MBBI, DL, OR, Dest0).addReg(Src0).addReg(Src0);
+ BuildMI(MBB, MBBI, DL, OR, Dest1).addReg(Src1).addReg(Src1);
+ }
+ }
+}
+
+bool PPCExpandAtomicPseudo::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ TII = static_cast<const PPCInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TRI = &TII->getRegisterInfo();
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ MachineBasicBlock &MBB = *I;
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), MBBE = MBB.end();
+ MBBI != MBBE;) {
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ Changed |= expandMI(MBB, MI, NMBBI);
+ MBBI = NMBBI;
+ }
+ }
+ if (Changed)
+ MF.RenumberBlocks();
+ return Changed;
+}
+
+bool PPCExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineBasicBlock::iterator &NMBBI) {
+ switch (MI.getOpcode()) {
+ case PPC::ATOMIC_SWAP_I128:
+ case PPC::ATOMIC_LOAD_ADD_I128:
+ case PPC::ATOMIC_LOAD_SUB_I128:
+ case PPC::ATOMIC_LOAD_XOR_I128:
+ case PPC::ATOMIC_LOAD_NAND_I128:
+ case PPC::ATOMIC_LOAD_AND_I128:
+ case PPC::ATOMIC_LOAD_OR_I128:
+ return expandAtomicRMW128(MBB, MI, NMBBI);
+ case PPC::ATOMIC_CMP_SWAP_I128:
+ return expandAtomicCmpSwap128(MBB, MI, NMBBI);
+ default:
+ return false;
+ }
+}
+
+bool PPCExpandAtomicPseudo::expandAtomicRMW128(
+ MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineBasicBlock::iterator &NMBBI) {
+ const MCInstrDesc &LL = TII->get(PPC::LQARX);
+ const MCInstrDesc &SC = TII->get(PPC::STQCX);
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ const BasicBlock *BB = MBB.getBasicBlock();
+ // Create layout of control flow.
+ MachineFunction::iterator MFI = ++MBB.getIterator();
+ MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(MFI, LoopMBB);
+ MF->insert(MFI, ExitMBB);
+ ExitMBB->splice(ExitMBB->begin(), &MBB, std::next(MI.getIterator()),
+ MBB.end());
+ ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ MBB.addSuccessor(LoopMBB);
+
+  // For non-min/max operations, the control flow looks like this:
+ // MBB:
+ // ...
+ // LoopMBB:
+ // lqarx in, ptr
+ // addc out.sub_x1, in.sub_x1, op.sub_x1
+ // adde out.sub_x0, in.sub_x0, op.sub_x0
+ // stqcx out, ptr
+ // bne- LoopMBB
+ // ExitMBB:
+ // ...
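+  // The lqarx/stqcx. pair forms the LL/SC loop: stqcx. sets CR0.EQ on success,
+  // and the bne- branch retries the loop when the store-conditional fails.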
+ Register Old = MI.getOperand(0).getReg();
+ Register OldHi = TRI->getSubReg(Old, PPC::sub_gp8_x0);
+ Register OldLo = TRI->getSubReg(Old, PPC::sub_gp8_x1);
+ Register Scratch = MI.getOperand(1).getReg();
+ Register ScratchHi = TRI->getSubReg(Scratch, PPC::sub_gp8_x0);
+ Register ScratchLo = TRI->getSubReg(Scratch, PPC::sub_gp8_x1);
+ Register RA = MI.getOperand(2).getReg();
+ Register RB = MI.getOperand(3).getReg();
+ Register IncrLo = MI.getOperand(4).getReg();
+ Register IncrHi = MI.getOperand(5).getReg();
+ unsigned RMWOpcode = MI.getOpcode();
+
+ MachineBasicBlock *CurrentMBB = LoopMBB;
+ BuildMI(CurrentMBB, DL, LL, Old).addReg(RA).addReg(RB);
+
+ switch (RMWOpcode) {
+ case PPC::ATOMIC_SWAP_I128:
+ PairedCopy(TII, *CurrentMBB, CurrentMBB->end(), DL, ScratchHi, ScratchLo,
+ IncrHi, IncrLo);
+ break;
+ case PPC::ATOMIC_LOAD_ADD_I128:
+ BuildMI(CurrentMBB, DL, TII->get(PPC::ADDC8), ScratchLo)
+ .addReg(IncrLo)
+ .addReg(OldLo);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::ADDE8), ScratchHi)
+ .addReg(IncrHi)
+ .addReg(OldHi);
+ break;
+ case PPC::ATOMIC_LOAD_SUB_I128:
+ BuildMI(CurrentMBB, DL, TII->get(PPC::SUBFC8), ScratchLo)
+ .addReg(IncrLo)
+ .addReg(OldLo);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::SUBFE8), ScratchHi)
+ .addReg(IncrHi)
+ .addReg(OldHi);
+ break;
+
+#define TRIVIAL_ATOMICRMW(Opcode, Instr) \
+ case Opcode: \
+ BuildMI(CurrentMBB, DL, TII->get((Instr)), ScratchLo) \
+ .addReg(IncrLo) \
+ .addReg(OldLo); \
+ BuildMI(CurrentMBB, DL, TII->get((Instr)), ScratchHi) \
+ .addReg(IncrHi) \
+ .addReg(OldHi); \
+ break
+
+ TRIVIAL_ATOMICRMW(PPC::ATOMIC_LOAD_OR_I128, PPC::OR8);
+ TRIVIAL_ATOMICRMW(PPC::ATOMIC_LOAD_XOR_I128, PPC::XOR8);
+ TRIVIAL_ATOMICRMW(PPC::ATOMIC_LOAD_AND_I128, PPC::AND8);
+ TRIVIAL_ATOMICRMW(PPC::ATOMIC_LOAD_NAND_I128, PPC::NAND8);
+#undef TRIVIAL_ATOMICRMW
+ default:
+ llvm_unreachable("Unhandled atomic RMW operation");
+ }
+ BuildMI(CurrentMBB, DL, SC).addReg(Scratch).addReg(RA).addReg(RB);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE)
+ .addReg(PPC::CR0)
+ .addMBB(LoopMBB);
+ CurrentMBB->addSuccessor(LoopMBB);
+ CurrentMBB->addSuccessor(ExitMBB);
+ recomputeLiveIns(*LoopMBB);
+ recomputeLiveIns(*ExitMBB);
+ NMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
+bool PPCExpandAtomicPseudo::expandAtomicCmpSwap128(
+ MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineBasicBlock::iterator &NMBBI) {
+ const MCInstrDesc &LL = TII->get(PPC::LQARX);
+ const MCInstrDesc &SC = TII->get(PPC::STQCX);
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ const BasicBlock *BB = MBB.getBasicBlock();
+ Register Old = MI.getOperand(0).getReg();
+ Register OldHi = TRI->getSubReg(Old, PPC::sub_gp8_x0);
+ Register OldLo = TRI->getSubReg(Old, PPC::sub_gp8_x1);
+ Register Scratch = MI.getOperand(1).getReg();
+ Register ScratchHi = TRI->getSubReg(Scratch, PPC::sub_gp8_x0);
+ Register ScratchLo = TRI->getSubReg(Scratch, PPC::sub_gp8_x1);
+ Register RA = MI.getOperand(2).getReg();
+ Register RB = MI.getOperand(3).getReg();
+ Register CmpLo = MI.getOperand(4).getReg();
+ Register CmpHi = MI.getOperand(5).getReg();
+ Register NewLo = MI.getOperand(6).getReg();
+ Register NewHi = MI.getOperand(7).getReg();
+ // Create layout of control flow.
+ // loop:
+ // old = lqarx ptr
+ // <compare old, cmp>
+ // bne 0, fail
+ // succ:
+ // stqcx new ptr
+ // bne 0, loop
+ // b exit
+ // fail:
+ // stqcx old ptr
+ // exit:
+ // ....
+ MachineFunction::iterator MFI = ++MBB.getIterator();
+ MachineBasicBlock *LoopCmpMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *CmpSuccMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *CmpFailMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(MFI, LoopCmpMBB);
+ MF->insert(MFI, CmpSuccMBB);
+ MF->insert(MFI, CmpFailMBB);
+ MF->insert(MFI, ExitMBB);
+ ExitMBB->splice(ExitMBB->begin(), &MBB, std::next(MI.getIterator()),
+ MBB.end());
+ ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ MBB.addSuccessor(LoopCmpMBB);
+ // Build loop.
+ MachineBasicBlock *CurrentMBB = LoopCmpMBB;
+ BuildMI(CurrentMBB, DL, LL, Old).addReg(RA).addReg(RB);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::XOR8), ScratchLo)
+ .addReg(OldLo)
+ .addReg(CmpLo);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::XOR8), ScratchHi)
+ .addReg(OldHi)
+ .addReg(CmpHi);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::OR8_rec), ScratchLo)
+ .addReg(ScratchLo)
+ .addReg(ScratchHi);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE)
+ .addReg(PPC::CR0)
+ .addMBB(CmpFailMBB);
+ CurrentMBB->addSuccessor(CmpSuccMBB);
+ CurrentMBB->addSuccessor(CmpFailMBB);
+ // Build succ.
+ CurrentMBB = CmpSuccMBB;
+ PairedCopy(TII, *CurrentMBB, CurrentMBB->end(), DL, ScratchHi, ScratchLo,
+ NewHi, NewLo);
+ BuildMI(CurrentMBB, DL, SC).addReg(Scratch).addReg(RA).addReg(RB);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE)
+ .addReg(PPC::CR0)
+ .addMBB(LoopCmpMBB);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::B)).addMBB(ExitMBB);
+ CurrentMBB->addSuccessor(LoopCmpMBB);
+ CurrentMBB->addSuccessor(ExitMBB);
+ CurrentMBB = CmpFailMBB;
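+  // On the failure path, conditionally store back the value that was just
+  // loaded; this leaves memory unchanged and (presumably) clears the
+  // reservation before falling through to the exit block.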
+ BuildMI(CurrentMBB, DL, SC).addReg(Old).addReg(RA).addReg(RB);
+ CurrentMBB->addSuccessor(ExitMBB);
+
+ recomputeLiveIns(*LoopCmpMBB);
+ recomputeLiveIns(*CmpSuccMBB);
+ recomputeLiveIns(*CmpFailMBB);
+ recomputeLiveIns(*ExitMBB);
+ NMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
+} // namespace
+
+INITIALIZE_PASS(PPCExpandAtomicPseudo, DEBUG_TYPE, "PowerPC Expand Atomic",
+ false, false)
+
+char PPCExpandAtomicPseudo::ID = 0;
+FunctionPass *llvm::createPPCExpandAtomicPseudoPass() {
+ return new PPCExpandAtomicPseudo();
+}
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index c181816..dfb2c1e 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -63,7 +63,7 @@
namespace {
-typedef struct Address {
+struct Address {
enum {
RegBase,
FrameIndexBase
@@ -81,7 +81,7 @@
: BaseType(RegBase), Offset(0) {
Base.Reg = 0;
}
-} Address;
+};
class PPCFastISel final : public FastISel {
@@ -112,15 +112,12 @@
unsigned fastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm) override;
unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- uint64_t Imm);
+ unsigned Op0, uint64_t Imm);
unsigned fastEmitInst_r(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill);
+ const TargetRegisterClass *RC, unsigned Op0);
unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill);
+ unsigned Op0, unsigned Op1);
bool fastLowerCall(CallLoweringInfo &CLI) override;
@@ -2426,7 +2423,7 @@
// where those regs have another meaning.
unsigned PPCFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
+ unsigned Op0,
uint64_t Imm) {
if (MachineInstOpcode == PPC::ADDI)
MRI.setRegClass(Op0, &PPC::GPRC_and_GPRC_NOR0RegClass);
@@ -2437,8 +2434,7 @@
(RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
(RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
- return FastISel::fastEmitInst_ri(MachineInstOpcode, UseRC,
- Op0, Op0IsKill, Imm);
+ return FastISel::fastEmitInst_ri(MachineInstOpcode, UseRC, Op0, Imm);
}
// Override for instructions with one register operand to avoid use of
@@ -2446,12 +2442,12 @@
// we must be conservative.
unsigned PPCFastISel::fastEmitInst_r(unsigned MachineInstOpcode,
const TargetRegisterClass* RC,
- unsigned Op0, bool Op0IsKill) {
+ unsigned Op0) {
const TargetRegisterClass *UseRC =
(RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
(RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
- return FastISel::fastEmitInst_r(MachineInstOpcode, UseRC, Op0, Op0IsKill);
+ return FastISel::fastEmitInst_r(MachineInstOpcode, UseRC, Op0);
}
// Override for instructions with two register operands to avoid use
@@ -2459,14 +2455,12 @@
// so we must be conservative.
unsigned PPCFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
const TargetRegisterClass* RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill) {
+ unsigned Op0, unsigned Op1) {
const TargetRegisterClass *UseRC =
(RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
(RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
- return FastISel::fastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op0IsKill,
- Op1, Op1IsKill);
+ return FastISel::fastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op1);
}
namespace llvm {
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 16536bf..1de6b63 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -642,6 +642,8 @@
bool HasFP = hasFP(MF);
bool HasBP = RegInfo->hasBasePointer(MF);
bool HasRedZone = isPPC64 || !isSVR4ABI;
+ bool HasROPProtect = Subtarget.hasROPProtect();
+ bool HasPrivileged = Subtarget.hasPrivileged();
Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
Register BPReg = RegInfo->getBaseRegister(MF);
@@ -672,6 +674,8 @@
const MCInstrDesc &MoveFromCondRegInst = TII.get(isPPC64 ? PPC::MFCR8
: PPC::MFCR);
const MCInstrDesc &StoreWordInst = TII.get(isPPC64 ? PPC::STW8 : PPC::STW);
+ const MCInstrDesc &HashST =
+ TII.get(HasPrivileged ? PPC::HASHSTP : PPC::HASHST);
// Regarding this assert: Even though LR is saved in the caller's frame (i.e.,
// LROffset is positive), that slot is callee-owned. Because PPC32 SVR4 has no
@@ -733,6 +737,22 @@
if (stackUpdateCanBeMoved(MF)) {
const std::vector<CalleeSavedInfo> &Info = MFI.getCalleeSavedInfo();
for (CalleeSavedInfo CSI : Info) {
+      // If the callee-saved register is spilled to a register instead of the
+      // stack, then the spill no longer uses the stack pointer.
+ // This can lead to two consequences:
+ // 1) We no longer need to update the stack because the function does not
+ // spill any callee saved registers to stack.
+ // 2) We have a situation where we still have to update the stack pointer
+ // even though some registers are spilled to other registers. In
+ // this case the current code moves the stack update to an incorrect
+ // position.
+ // In either case we should abort moving the stack update operation.
+ if (CSI.isSpilledToReg()) {
+ StackUpdateLoc = MBBI;
+ MovingStackUpdateDown = false;
+ break;
+ }
+
int FrIdx = CSI.getFrameIdx();
// If the frame index is not negative the callee saved info belongs to a
// stack object that is not a fixed stack object. We ignore non-fixed
@@ -817,11 +837,34 @@
.addReg(SPReg);
}
- if (MustSaveLR)
+  // Generate the instruction to store the LR. When ROP protection is required,
+  // the register holding the LR must not be killed, as it will also be used by
+  // the hash store instruction.
+ if (MustSaveLR) {
BuildMI(MBB, StackUpdateLoc, dl, StoreInst)
- .addReg(ScratchReg, getKillRegState(true))
- .addImm(LROffset)
- .addReg(SPReg);
+ .addReg(ScratchReg, getKillRegState(!HasROPProtect))
+ .addImm(LROffset)
+ .addReg(SPReg);
+
+ // Add the ROP protection Hash Store instruction.
+ // NOTE: This is technically a violation of the ABI. The hash can be saved
+ // up to 512 bytes into the Protected Zone. This can be outside of the
+ // initial 288 byte volatile program storage region in the Protected Zone.
+ // However, this restriction will be removed in an upcoming revision of the
+ // ABI.
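+    // HASHST (or the privileged HASHSTP) computes a hash from the saved LR and
+    // stores it at ImmOffset(r1); the matching HASHCHK in the epilogue
+    // recomputes the hash and traps on a mismatch.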
+ if (HasROPProtect) {
+ const int SaveIndex = FI->getROPProtectionHashSaveIndex();
+ const int ImmOffset = MFI.getObjectOffset(SaveIndex);
+ assert((ImmOffset <= -8 && ImmOffset >= -512) &&
+ "ROP hash save offset out of range.");
+ assert(((ImmOffset & 0x7) == 0) &&
+ "ROP hash save offset must be 8 byte aligned.");
+ BuildMI(MBB, StackUpdateLoc, dl, HashST)
+ .addReg(ScratchReg, getKillRegState(true))
+ .addImm(ImmOffset)
+ .addReg(SPReg);
+ }
+ }
if (MustSaveCR &&
!(SingleScratchReg && MustSaveLR)) {
@@ -1521,6 +1564,8 @@
bool HasFP = hasFP(MF);
bool HasBP = RegInfo->hasBasePointer(MF);
bool HasRedZone = Subtarget.isPPC64() || !Subtarget.isSVR4ABI();
+ bool HasROPProtect = Subtarget.hasROPProtect();
+ bool HasPrivileged = Subtarget.hasPrivileged();
Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
Register BPReg = RegInfo->getBaseRegister(MF);
@@ -1545,6 +1590,8 @@
: PPC::LWZ);
const MCInstrDesc& MoveToCRInst = TII.get( isPPC64 ? PPC::MTOCRF8
: PPC::MTOCRF);
+ const MCInstrDesc &HashChk =
+ TII.get(HasPrivileged ? PPC::HASHCHKP : PPC::HASHCHK);
int LROffset = getReturnSaveOffset();
int FPOffset = 0;
@@ -1630,6 +1677,12 @@
if (stackUpdateCanBeMoved(MF)) {
const std::vector<CalleeSavedInfo> & Info = MFI.getCalleeSavedInfo();
for (CalleeSavedInfo CSI : Info) {
+      // If the callee-saved register is spilled to another register, abort the
+      // stack update movement.
+ if (CSI.isSpilledToReg()) {
+ StackUpdateLoc = MBBI;
+ break;
+ }
int FrIdx = CSI.getFrameIdx();
// If the frame index is not negative the callee saved info belongs to a
// stack object that is not a fixed stack object. We ignore non-fixed
@@ -1807,8 +1860,23 @@
BuildMI(MBB, MBBI, dl, MoveToCRInst, MustSaveCRs[i])
.addReg(TempReg, getKillRegState(i == e-1));
- if (MustSaveLR)
+ if (MustSaveLR) {
+ // If ROP protection is required, an extra instruction is added to compute a
+ // hash and then compare it to the hash stored in the prologue.
+ if (HasROPProtect) {
+ const int SaveIndex = FI->getROPProtectionHashSaveIndex();
+ const int ImmOffset = MFI.getObjectOffset(SaveIndex);
+ assert((ImmOffset <= -8 && ImmOffset >= -512) &&
+ "ROP hash check location offset out of range.");
+ assert(((ImmOffset & 0x7) == 0) &&
+ "ROP hash check location offset must be 8 byte aligned.");
+ BuildMI(MBB, StackUpdateLoc, dl, HashChk)
+ .addReg(ScratchReg)
+ .addImm(ImmOffset)
+ .addReg(SPReg);
+ }
BuildMI(MBB, StackUpdateLoc, dl, MTLRInst).addReg(ScratchReg);
+ }
// Callee pop calling convention. Pop parameter/linkage area. Used for tail
// call optimization
@@ -2257,30 +2325,39 @@
BVCalleeSaved.set(CSRegs[i]);
for (unsigned Reg : BVAllocatable.set_bits()) {
- // Set to 0 if the register is not a volatile VF/F8 register, or if it is
+ // Set to 0 if the register is not a volatile VSX register, or if it is
// used in the function.
- if (BVCalleeSaved[Reg] ||
- (!PPC::F8RCRegClass.contains(Reg) &&
- !PPC::VFRCRegClass.contains(Reg)) ||
- (MF.getRegInfo().isPhysRegUsed(Reg)))
+ if (BVCalleeSaved[Reg] || !PPC::VSRCRegClass.contains(Reg) ||
+ MF.getRegInfo().isPhysRegUsed(Reg))
BVAllocatable.reset(Reg);
}
bool AllSpilledToReg = true;
+ unsigned LastVSRUsedForSpill = 0;
for (auto &CS : CSI) {
if (BVAllocatable.none())
return false;
unsigned Reg = CS.getReg();
- if (!PPC::G8RCRegClass.contains(Reg) && !PPC::GPRCRegClass.contains(Reg)) {
+
+ if (!PPC::G8RCRegClass.contains(Reg)) {
AllSpilledToReg = false;
continue;
}
+ // For P9, we can reuse LastVSRUsedForSpill to spill two GPRs
+ // into one VSR using the mtvsrdd instruction.
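+    // mtvsrdd packs two 64-bit GPRs into the two doublewords of a single VSR;
+    // they are recovered on restore with mfvsrd/mfvsrld.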
+ if (LastVSRUsedForSpill != 0) {
+ CS.setDstReg(LastVSRUsedForSpill);
+ BVAllocatable.reset(LastVSRUsedForSpill);
+ LastVSRUsedForSpill = 0;
+ continue;
+ }
+
unsigned VolatileVFReg = BVAllocatable.find_first();
if (VolatileVFReg < BVAllocatable.size()) {
CS.setDstReg(VolatileVFReg);
- BVAllocatable.reset(VolatileVFReg);
+ LastVSRUsedForSpill = VolatileVFReg;
} else {
AllSpilledToReg = false;
}
@@ -2299,6 +2376,24 @@
DebugLoc DL;
bool CRSpilled = false;
MachineInstrBuilder CRMIB;
+ BitVector Spilled(TRI->getNumRegs());
+
+ VSRContainingGPRs.clear();
+
+  // Map each VSR to the GPRs to be spilled into it. A single VSR can contain
+  // one or two GPRs, so we need a table to record the information for later
+  // save/restore.
+ llvm::for_each(CSI, [&](const CalleeSavedInfo &Info) {
+ if (Info.isSpilledToReg()) {
+ auto &SpilledVSR =
+ VSRContainingGPRs.FindAndConstruct(Info.getDstReg()).second;
+ assert(SpilledVSR.second == 0 &&
+ "Can't spill more than two GPRs into VSR!");
+ if (SpilledVSR.first == 0)
+ SpilledVSR.first = Info.getReg();
+ else
+ SpilledVSR.second = Info.getReg();
+ }
+ });
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
@@ -2348,9 +2443,31 @@
}
} else {
if (CSI[i].isSpilledToReg()) {
- NumPESpillVSR++;
- BuildMI(MBB, MI, DL, TII.get(PPC::MTVSRD), CSI[i].getDstReg())
- .addReg(Reg, getKillRegState(true));
+ unsigned Dst = CSI[i].getDstReg();
+
+ if (Spilled[Dst])
+ continue;
+
+ if (VSRContainingGPRs[Dst].second != 0) {
+ assert(Subtarget.hasP9Vector() &&
+ "mtvsrdd is unavailable on pre-P9 targets.");
+
+ NumPESpillVSR += 2;
+ BuildMI(MBB, MI, DL, TII.get(PPC::MTVSRDD), Dst)
+ .addReg(VSRContainingGPRs[Dst].first, getKillRegState(true))
+ .addReg(VSRContainingGPRs[Dst].second, getKillRegState(true));
+ } else if (VSRContainingGPRs[Dst].second == 0) {
+ assert(Subtarget.hasP8Vector() &&
+ "Can't move GPR to VSR on pre-P8 targets.");
+
+ ++NumPESpillVSR;
+ BuildMI(MBB, MI, DL, TII.get(PPC::MTVSRD),
+ TRI->getSubReg(Dst, PPC::sub_64))
+ .addReg(VSRContainingGPRs[Dst].first, getKillRegState(true));
+ } else {
+ llvm_unreachable("More than two GPRs spilled to a VSR!");
+ }
+ Spilled.set(Dst);
} else {
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
// Use !IsLiveIn for the kill flag.
@@ -2454,6 +2571,7 @@
bool CR3Spilled = false;
bool CR4Spilled = false;
unsigned CSIIndex = 0;
+ BitVector Restored(TRI->getNumRegs());
// Initialize insertion-point logic; we will be restoring in reverse
// order of spill.
@@ -2498,9 +2616,32 @@
if (CSI[i].isSpilledToReg()) {
DebugLoc DL;
- NumPEReloadVSR++;
- BuildMI(MBB, I, DL, TII.get(PPC::MFVSRD), Reg)
- .addReg(CSI[i].getDstReg(), getKillRegState(true));
+ unsigned Dst = CSI[i].getDstReg();
+
+ if (Restored[Dst])
+ continue;
+
+ if (VSRContainingGPRs[Dst].second != 0) {
+ assert(Subtarget.hasP9Vector());
+ NumPEReloadVSR += 2;
+ BuildMI(MBB, I, DL, TII.get(PPC::MFVSRLD),
+ VSRContainingGPRs[Dst].second)
+ .addReg(Dst);
+ BuildMI(MBB, I, DL, TII.get(PPC::MFVSRD),
+ VSRContainingGPRs[Dst].first)
+ .addReg(TRI->getSubReg(Dst, PPC::sub_64), getKillRegState(true));
+ } else if (VSRContainingGPRs[Dst].second == 0) {
+ assert(Subtarget.hasP8Vector());
+ ++NumPEReloadVSR;
+ BuildMI(MBB, I, DL, TII.get(PPC::MFVSRD),
+ VSRContainingGPRs[Dst].first)
+ .addReg(TRI->getSubReg(Dst, PPC::sub_64), getKillRegState(true));
+ } else {
+ llvm_unreachable("More than two GPRs spilled to a VSR!");
+ }
+
+ Restored.set(Dst);
+
} else {
// Default behavior for non-CR saves.
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
@@ -2554,6 +2695,5 @@
bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
if (MF.getInfo<PPCFunctionInfo>()->shrinkWrapDisabled())
return false;
- return (MF.getSubtarget<PPCSubtarget>().isSVR4ABI() &&
- MF.getSubtarget<PPCSubtarget>().isPPC64());
+ return !MF.getSubtarget<PPCSubtarget>().is32BitELFABI();
}
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.h
index 8bf52c0..b378c27 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.h
@@ -28,6 +28,10 @@
const unsigned BasePointerSaveOffset;
const unsigned CRSaveOffset;
+  // Map each group of one or two GPRs to the corresponding VSR used for
+  // spilling.
+  // TODO: Use a local table in the methods to avoid this mutable member.
+ mutable DenseMap<unsigned, std::pair<Register, Register>> VSRContainingGPRs;
+
/**
* Find register[s] that can be used in function prologue and epilogue
*
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 2604218..606aae6 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -152,6 +152,15 @@
GlobalBaseReg = 0;
Subtarget = &MF.getSubtarget<PPCSubtarget>();
PPCLowering = Subtarget->getTargetLowering();
+ if (Subtarget->hasROPProtect()) {
+ // Create a place on the stack for the ROP Protection Hash.
+ // The ROP Protection Hash will always be 8 bytes and aligned to 8
+ // bytes.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ const int Result = MFI.CreateStackObject(8, Align(8), false);
+ FI->setROPProtectionHashSaveIndex(Result);
+ }
SelectionDAGISel::runOnMachineFunction(MF);
return true;
@@ -229,6 +238,45 @@
return false;
}
+ /// SelectDSForm - Returns true if address N can be represented by the
+ /// addressing mode of DSForm instructions (a base register, plus a signed
+  /// 16-bit displacement that is a multiple of 4).
+ bool SelectDSForm(SDNode *Parent, SDValue N, SDValue &Disp, SDValue &Base) {
+ return PPCLowering->SelectOptimalAddrMode(Parent, N, Disp, Base, *CurDAG,
+ Align(4)) == PPC::AM_DSForm;
+ }
+
+ /// SelectDQForm - Returns true if address N can be represented by the
+ /// addressing mode of DQForm instructions (a base register, plus a signed
+  /// 16-bit displacement that is a multiple of 16).
+ bool SelectDQForm(SDNode *Parent, SDValue N, SDValue &Disp, SDValue &Base) {
+ return PPCLowering->SelectOptimalAddrMode(Parent, N, Disp, Base, *CurDAG,
+ Align(16)) == PPC::AM_DQForm;
+ }
+
+ /// SelectDForm - Returns true if address N can be represented by
+ /// the addressing mode of DForm instructions (a base register, plus a
+  /// signed 16-bit immediate).
+ bool SelectDForm(SDNode *Parent, SDValue N, SDValue &Disp, SDValue &Base) {
+ return PPCLowering->SelectOptimalAddrMode(Parent, N, Disp, Base, *CurDAG,
+ None) == PPC::AM_DForm;
+ }
+
+ /// SelectXForm - Returns true if address N can be represented by the
+ /// addressing mode of XForm instructions (an indexed [r+r] operation).
+ bool SelectXForm(SDNode *Parent, SDValue N, SDValue &Disp, SDValue &Base) {
+ return PPCLowering->SelectOptimalAddrMode(Parent, N, Disp, Base, *CurDAG,
+ None) == PPC::AM_XForm;
+ }
+
+ /// SelectForceXForm - Given the specified address, force it to be
+ /// represented as an indexed [r+r] operation (an XForm instruction).
+ bool SelectForceXForm(SDNode *Parent, SDValue N, SDValue &Disp,
+ SDValue &Base) {
+ return PPCLowering->SelectForceXFormMode(N, Disp, Base, *CurDAG) ==
+ PPC::AM_XForm;
+ }
+
/// SelectAddrIdx - Given the specified address, check to see if it can be
/// represented as an indexed [r+r] operation.
/// This is for xform instructions whose associated displacement form is D.
@@ -433,6 +481,60 @@
.getNode();
}
+// Check if an SDValue has the toc-data attribute.
+static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) {
+ GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Val);
+ if (!GA)
+ return false;
+
+ const GlobalVariable *GV = dyn_cast_or_null<GlobalVariable>(GA->getGlobal());
+ if (!GV)
+ return false;
+
+ if (!GV->hasAttribute("toc-data"))
+ return false;
+
+ // TODO: These asserts should be updated as more support for the toc data
+ // transformation is added (64 bit, struct support, etc.).
+
+ assert(PointerSize == 4 && "Only 32 Bit Codegen is currently supported by "
+ "the toc data transformation.");
+
+ assert(PointerSize >= GV->getAlign().valueOrOne().value() &&
+         "GlobalVariables with an alignment requirement stricter than 4 bytes "
+ "not supported by the toc data transformation.");
+
+ Type *GVType = GV->getValueType();
+
+ assert(GVType->isSized() && "A GlobalVariable's size must be known to be "
+ "supported by the toc data transformation.");
+
+ if (GVType->isVectorTy())
+ report_fatal_error("A GlobalVariable of Vector type is not currently "
+ "supported by the toc data transformation.");
+
+ if (GVType->isArrayTy())
+ report_fatal_error("A GlobalVariable of Array type is not currently "
+ "supported by the toc data transformation.");
+
+ if (GVType->isStructTy())
+ report_fatal_error("A GlobalVariable of Struct type is not currently "
+ "supported by the toc data transformation.");
+
+ assert(GVType->getPrimitiveSizeInBits() <= PointerSize * 8 &&
+ "A GlobalVariable with size larger than 32 bits is not currently "
+ "supported by the toc data transformation.");
+
+ if (GV->hasLocalLinkage() || GV->hasPrivateLinkage())
+ report_fatal_error("A GlobalVariable with private or local linkage is not "
+ "currently supported by the toc data transformation.");
+
+ assert(!GV->hasCommonLinkage() &&
+ "Tentative definitions cannot have the mapping class XMC_TD.");
+
+ return true;
+}
+
/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
/// operand. If so Imm will receive the 32-bit value.
static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
@@ -941,7 +1043,7 @@
// 63 0 63 0
if ((Shift = findContiguousZerosAtLeast(Imm, 49)) ||
(Shift = findContiguousZerosAtLeast(~Imm, 49))) {
- uint64_t RotImm = (Imm >> Shift) | (Imm << (64 - Shift));
+ uint64_t RotImm = APInt(64, Imm).rotr(Shift).getZExtValue();
Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64,
getI32Imm(RotImm & 0xffff));
return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0),
@@ -1019,7 +1121,7 @@
// This is similar to Pattern 2-6, please refer to the diagram there.
if ((Shift = findContiguousZerosAtLeast(Imm, 33)) ||
(Shift = findContiguousZerosAtLeast(~Imm, 33))) {
- uint64_t RotImm = (Imm >> Shift) | (Imm << (64 - Shift));
+ uint64_t RotImm = APInt(64, Imm).rotr(Shift).getZExtValue();
uint64_t ImmHi16 = (RotImm >> 16) & 0xffff;
unsigned Opcode = ImmHi16 ? PPC::LIS8 : PPC::LI8;
Result = CurDAG->getMachineNode(Opcode, dl, MVT::i64, getI32Imm(ImmHi16));
@@ -1033,12 +1135,159 @@
return nullptr;
}
+// Try to select instructions to generate a 64-bit immediate using prefixed as
+// well as non-prefixed instructions. The function returns the SDNode that
+// materializes the constant, or nullptr if it does not find one. The variable
+// InstCnt is set to the number of instructions that were selected.
+static SDNode *selectI64ImmDirectPrefix(SelectionDAG *CurDAG, const SDLoc &dl,
+ uint64_t Imm, unsigned &InstCnt) {
+ unsigned TZ = countTrailingZeros<uint64_t>(Imm);
+ unsigned LZ = countLeadingZeros<uint64_t>(Imm);
+ unsigned TO = countTrailingOnes<uint64_t>(Imm);
+ unsigned FO = countLeadingOnes<uint64_t>(LZ == 64 ? 0 : (Imm << LZ));
+ unsigned Hi32 = Hi_32(Imm);
+ unsigned Lo32 = Lo_32(Imm);
+
+ auto getI32Imm = [CurDAG, dl](unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+ };
+
+ auto getI64Imm = [CurDAG, dl](uint64_t Imm) {
+ return CurDAG->getTargetConstant(Imm, dl, MVT::i64);
+ };
+
+ // Following patterns use 1 instruction to materialize Imm.
+ InstCnt = 1;
+
+ // The pli instruction can materialize up to 34 bits directly.
+  // If a constant fits within 34 bits, emit the pli instruction here.
+ if (isInt<34>(Imm))
+ return CurDAG->getMachineNode(PPC::PLI8, dl, MVT::i64,
+ CurDAG->getTargetConstant(Imm, dl, MVT::i64));
+
+ // Require at least two instructions.
+ InstCnt = 2;
+ SDNode *Result = nullptr;
+ // Patterns : {zeros}{ones}{33-bit value}{zeros}
+ // {zeros}{33-bit value}{zeros}
+ // {zeros}{ones}{33-bit value}
+ // {ones}{33-bit value}{zeros}
+ // We can take advantage of PLI's sign-extension semantics to generate leading
+ // ones, and then use RLDIC to mask off the ones on both sides after rotation.
+ if ((LZ + FO + TZ) > 30) {
+ APInt SignedInt34 = APInt(34, (Imm >> TZ) & 0x3ffffffff);
+ APInt Extended = SignedInt34.sext(64);
+ Result = CurDAG->getMachineNode(PPC::PLI8, dl, MVT::i64,
+ getI64Imm(*Extended.getRawData()));
+ return CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(TZ), getI32Imm(LZ));
+ }
+ // Pattern : {zeros}{33-bit value}{ones}
+  // Shift the Imm right by (30 - LZ) bits to construct a negative 34-bit
+  // value, so that we can take advantage of PLI's sign-extension semantics,
+  // and then mask the ones off after rotation.
+ //
+ // +--LZ--||-33-bit-||--TO--+ +-------------|--34-bit--+
+ // |00000001bbbbbbbbb1111111| -> |00000000000001bbbbbbbbb1|
+ // +------------------------+ +------------------------+
+ // 63 0 63 0
+ //
+ // +----sext-----|--34-bit--+ +clear-|-----------------+
+ // |11111111111111bbbbbbbbb1| -> |00000001bbbbbbbbb1111111|
+ // +------------------------+ +------------------------+
+ // 63 0 63 0
+ if ((LZ + TO) > 30) {
+ APInt SignedInt34 = APInt(34, (Imm >> (30 - LZ)) & 0x3ffffffff);
+ APInt Extended = SignedInt34.sext(64);
+ Result = CurDAG->getMachineNode(PPC::PLI8, dl, MVT::i64,
+ getI64Imm(*Extended.getRawData()));
+ return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(30 - LZ), getI32Imm(LZ));
+ }
+ // Patterns : {zeros}{ones}{33-bit value}{ones}
+ // {ones}{33-bit value}{ones}
+  // Similar to LI, we can take advantage of PLI's sign-extension semantics to
+  // generate leading ones, and then use RLDICL to mask off the ones on the
+  // left side (if required) after rotation.
+ if ((LZ + FO + TO) > 30) {
+ APInt SignedInt34 = APInt(34, (Imm >> TO) & 0x3ffffffff);
+ APInt Extended = SignedInt34.sext(64);
+ Result = CurDAG->getMachineNode(PPC::PLI8, dl, MVT::i64,
+ getI64Imm(*Extended.getRawData()));
+ return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(TO), getI32Imm(LZ));
+ }
+ // Patterns : {******}{31 zeros}{******}
+ // : {******}{31 ones}{******}
+  // If Imm contains 31 consecutive zeros/ones, then the remaining bit count
+  // is 33. Rotate the Imm right to construct an int<33> value; we can use PLI
+  // for the int<33> value and then use RLDICL without a mask to rotate it
+  // back.
+ //
+ // +------|--ones--|------+ +---ones--||---33 bit--+
+ // |bbbbbb1111111111aaaaaa| -> |1111111111aaaaaabbbbbb|
+ // +----------------------+ +----------------------+
+ // 63 0 63 0
+ for (unsigned Shift = 0; Shift < 63; ++Shift) {
+ uint64_t RotImm = APInt(64, Imm).rotr(Shift).getZExtValue();
+ if (isInt<34>(RotImm)) {
+ Result =
+ CurDAG->getMachineNode(PPC::PLI8, dl, MVT::i64, getI64Imm(RotImm));
+ return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Shift),
+ getI32Imm(0));
+ }
+ }
+
+ // Patterns : High word == Low word
+ // This is basically a splat of a 32 bit immediate.
+ if (Hi32 == Lo32) {
+ Result = CurDAG->getMachineNode(PPC::PLI8, dl, MVT::i64, getI64Imm(Hi32));
+ SDValue Ops[] = {SDValue(Result, 0), SDValue(Result, 0), getI32Imm(32),
+ getI32Imm(0)};
+ return CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops);
+ }
+
+ InstCnt = 3;
+ // Catch-all
+ // This pattern can form any 64 bit immediate in 3 instructions.
+ SDNode *ResultHi =
+ CurDAG->getMachineNode(PPC::PLI8, dl, MVT::i64, getI64Imm(Hi32));
+ SDNode *ResultLo =
+ CurDAG->getMachineNode(PPC::PLI8, dl, MVT::i64, getI64Imm(Lo32));
+ SDValue Ops[] = {SDValue(ResultLo, 0), SDValue(ResultHi, 0), getI32Imm(32),
+ getI32Imm(0)};
+ return CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops);
+}
+
static SDNode *selectI64Imm(SelectionDAG *CurDAG, const SDLoc &dl, uint64_t Imm,
unsigned *InstCnt = nullptr) {
unsigned InstCntDirect = 0;
// No more than 3 instructions is used if we can select the i64 immediate
// directly.
SDNode *Result = selectI64ImmDirect(CurDAG, dl, Imm, InstCntDirect);
+
+ const PPCSubtarget &Subtarget =
+ CurDAG->getMachineFunction().getSubtarget<PPCSubtarget>();
+
+  // If we have prefixed instructions and there is a chance we can
+  // materialize the constant in fewer instructions using the prefixed forms
+  // than without them, try that.
+ if (Subtarget.hasPrefixInstrs() && InstCntDirect != 1) {
+ unsigned InstCntDirectP = 0;
+ SDNode *ResultP = selectI64ImmDirectPrefix(CurDAG, dl, Imm, InstCntDirectP);
+    // Use the prefixed sequence in either of two cases:
+ // 1) We have no result from the non-prefix case to use.
+ // 2) The non-prefix case uses more instructions than the prefix case.
+ // If the prefix and non-prefix cases use the same number of instructions
+ // we will prefer the non-prefix case.
+ if (ResultP && (!Result || InstCntDirectP < InstCntDirect)) {
+ if (InstCnt)
+ *InstCnt = InstCntDirectP;
+ return ResultP;
+ }
+ }
+
if (Result) {
if (InstCnt)
*InstCnt = InstCntDirect;
@@ -3836,7 +4085,7 @@
Opc = Subtarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
} else {
assert(LHS.getValueType() == MVT::f128 && "Unknown vt!");
- assert(Subtarget->hasVSX() && "__float128 requires VSX");
+ assert(Subtarget->hasP9Vector() && "XSCMPUQP requires Power9 Vector");
Opc = PPC::XSCMPUQP;
}
if (Chain)
@@ -4734,6 +4983,18 @@
break;
case ISD::INTRINSIC_WO_CHAIN: {
+ // We emit the PPC::FSELS instruction here because of type conflicts with
+ // the comparison operand. The FSELS instruction is defined to use an 8-byte
+ // comparison like the FSELD version. The fsels intrinsic takes a 4-byte
+    // value for the comparison. When selecting through a .td file, a type
+    // error is raised. We must check for this intrinsic first so that we do
+    // not bail out early on the !Subtarget->isISA3_1() check below.
+ if (N->getConstantOperandVal(0) == Intrinsic::ppc_fsels) {
+ SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3)};
+ CurDAG->SelectNodeTo(N, PPC::FSELS, MVT::f32, Ops);
+ return;
+ }
+
if (!Subtarget->isISA3_1())
break;
unsigned Opcode = 0;
@@ -5399,12 +5660,12 @@
// Handle 32-bit small code model.
if (!isPPC64) {
- // Transforms the ISD::TOC_ENTRY node to a PPCISD::LWZtoc.
- auto replaceWithLWZtoc = [this, &dl](SDNode *TocEntry) {
+ // Transforms the ISD::TOC_ENTRY node to passed in Opcode, either
+ // PPC::ADDItoc, or PPC::LWZtoc
+ auto replaceWith = [this, &dl](unsigned OpCode, SDNode *TocEntry) {
SDValue GA = TocEntry->getOperand(0);
SDValue TocBase = TocEntry->getOperand(1);
- SDNode *MN = CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA,
- TocBase);
+ SDNode *MN = CurDAG->getMachineNode(OpCode, dl, MVT::i32, GA, TocBase);
transferMemOperands(TocEntry, MN);
ReplaceNode(TocEntry, MN);
};
@@ -5414,12 +5675,17 @@
"32-bit ELF can only have TOC entries in position independent"
" code.");
// 32-bit ELF always uses a small code model toc access.
- replaceWithLWZtoc(N);
+ replaceWith(PPC::LWZtoc, N);
return;
}
if (isAIXABI && CModel == CodeModel::Small) {
- replaceWithLWZtoc(N);
+ if (hasTocDataAttr(N->getOperand(0),
+ CurDAG->getDataLayout().getPointerSize()))
+ replaceWith(PPC::ADDItoc, N);
+ else
+ replaceWith(PPC::LWZtoc, N);
+
return;
}
}
@@ -6682,6 +6948,105 @@
CurDAG->RemoveDeadNodes();
}
+static bool isVSXSwap(SDValue N) {
+ if (!N->isMachineOpcode())
+ return false;
+ unsigned Opc = N->getMachineOpcode();
+
+  // A doubleword swap is either a single-operand XXPERMDI with immediate 2, or
+  // an XXPERMDI/XXSLDWI whose source operands are identical and whose immediate
+  // operand is 2.
+ if (Opc == PPC::XXPERMDIs) {
+ return isa<ConstantSDNode>(N->getOperand(1)) &&
+ N->getConstantOperandVal(1) == 2;
+ } else if (Opc == PPC::XXPERMDI || Opc == PPC::XXSLDWI) {
+ return N->getOperand(0) == N->getOperand(1) &&
+ isa<ConstantSDNode>(N->getOperand(2)) &&
+ N->getConstantOperandVal(2) == 2;
+ }
+
+ return false;
+}
+
+// TODO: Make this complete and replace with a table-gen bit.
+static bool isLaneInsensitive(SDValue N) {
+ if (!N->isMachineOpcode())
+ return false;
+ unsigned Opc = N->getMachineOpcode();
+
+ switch (Opc) {
+ default:
+ return false;
+ case PPC::VAVGSB:
+ case PPC::VAVGUB:
+ case PPC::VAVGSH:
+ case PPC::VAVGUH:
+ case PPC::VAVGSW:
+ case PPC::VAVGUW:
+ case PPC::VMAXFP:
+ case PPC::VMAXSB:
+ case PPC::VMAXUB:
+ case PPC::VMAXSH:
+ case PPC::VMAXUH:
+ case PPC::VMAXSW:
+ case PPC::VMAXUW:
+ case PPC::VMINFP:
+ case PPC::VMINSB:
+ case PPC::VMINUB:
+ case PPC::VMINSH:
+ case PPC::VMINUH:
+ case PPC::VMINSW:
+ case PPC::VMINUW:
+ case PPC::VADDFP:
+ case PPC::VADDUBM:
+ case PPC::VADDUHM:
+ case PPC::VADDUWM:
+ case PPC::VSUBFP:
+ case PPC::VSUBUBM:
+ case PPC::VSUBUHM:
+ case PPC::VSUBUWM:
+ case PPC::VAND:
+ case PPC::VANDC:
+ case PPC::VOR:
+ case PPC::VORC:
+ case PPC::VXOR:
+ case PPC::VNOR:
+ case PPC::VMULUWM:
+ return true;
+ }
+}
+
+// Try to simplify (xxswap (vec-op (xxswap) (xxswap))) where vec-op is
+// lane-insensitive.
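+// Swapping the two doublewords twice is the identity, and a lane-insensitive
+// operation commutes with the swap, so all three swaps can be removed.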
+static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
+  // Our desired xxswap might be the source of a COPY_TO_REGCLASS.
+  // TODO: Can we put this in a common helper for the DAG?
+ auto SkipRCCopy = [](SDValue V) {
+ while (V->isMachineOpcode() &&
+ V->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS) {
+      // All values in the chain should have a single use.
+ if (V->use_empty() || !V->use_begin()->isOnlyUserOf(V.getNode()))
+ return SDValue();
+ V = V->getOperand(0);
+ }
+ return V.hasOneUse() ? V : SDValue();
+ };
+
+ SDValue VecOp = SkipRCCopy(N->getOperand(0));
+ if (!VecOp || !isLaneInsensitive(VecOp))
+ return;
+
+ SDValue LHS = SkipRCCopy(VecOp.getOperand(0)),
+ RHS = SkipRCCopy(VecOp.getOperand(1));
+ if (!LHS || !RHS || !isVSXSwap(LHS) || !isVSXSwap(RHS))
+ return;
+
+  // These swaps may still have chain uses here; rely on dead-code elimination
+  // in later passes to remove them.
+ DAG->ReplaceAllUsesOfValueWith(LHS, LHS.getOperand(0));
+ DAG->ReplaceAllUsesOfValueWith(RHS, RHS.getOperand(0));
+ DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
+}
+
void PPCDAGToDAGISel::PeepholePPC64() {
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
@@ -6691,6 +7056,9 @@
if (N->use_empty() || !N->isMachineOpcode())
continue;
+ if (isVSXSwap(SDValue(N, 0)))
+ reduceVSXSwap(N, CurDAG);
+
unsigned FirstOp;
unsigned StorageOpcode = N->getMachineOpcode();
bool RequiresMod4Offset = false;
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 26dc3af..3735817 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -121,10 +121,10 @@
static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
cl::desc("use absolute jump tables on ppc"), cl::Hidden);
-// TODO - Remove this option if soft fp128 has been fully supported .
-static cl::opt<bool>
- EnableSoftFP128("enable-soft-fp128",
- cl::desc("temp option to enable soft fp128"), cl::Hidden);
+static cl::opt<bool> EnableQuadwordAtomics(
+ "ppc-quadword-atomics",
+ cl::desc("enable quadword lock-free atomic operations"), cl::init(false),
+ cl::Hidden);
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
@@ -135,12 +135,18 @@
static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
+static const char AIXSSPCanaryWordName[] = "__ssp_canary_word";
+
// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;
PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
const PPCSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
+ // Initialize map that relates the PPC addressing modes to the computed flags
+ // of a load/store instruction. The map is used to determine the optimal
+ // addressing mode when selecting load and stores.
+ initializeAddrModeMap();
// On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
// arguments are at least 4/8 bytes aligned.
bool isPPC64 = Subtarget.isPPC64();
@@ -328,14 +334,18 @@
setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
- setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
- setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);
+
+ if (!Subtarget.hasSPE()) {
+ setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);
+ }
+
if (Subtarget.hasVSX()) {
setOperationAction(ISD::STRICT_FRINT, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FRINT, MVT::f64, Legal);
@@ -417,7 +427,7 @@
// to speed up scalar BSWAP64.
// CTPOP or CTTZ were introduced in P8/P9 respectively
setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
- if (Subtarget.hasP9Vector())
+ if (Subtarget.hasP9Vector() && Subtarget.isPPC64())
setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
else
setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
@@ -481,6 +491,10 @@
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
+
+ // SPE supports signaling compare of f32/f64.
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
} else {
// PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
@@ -882,6 +896,7 @@
if (Subtarget.hasVSX()) {
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
}
if (Subtarget.hasP8Altivec())
@@ -1069,8 +1084,7 @@
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Legal);
- if (Subtarget.hasDirectMove())
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
// Handle constrained floating-point operations of vector.
@@ -1105,6 +1119,23 @@
setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);
addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
+ addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
+
+ for (MVT FPT : MVT::fp_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
+
+ // Expand the SELECT to SELECT_CC
+ setOperationAction(ISD::SELECT, MVT::f128, Expand);
+
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+
+ // No implementation for these ops for PowerPC.
+ setOperationAction(ISD::FSIN, MVT::f128, Expand);
+ setOperationAction(ISD::FCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FPOW, MVT::f128, Expand);
+ setOperationAction(ISD::FPOWI, MVT::f128, Expand);
+ setOperationAction(ISD::FREM, MVT::f128, Expand);
}
if (Subtarget.hasP8Altivec()) {
@@ -1123,15 +1154,12 @@
setOperationAction(ISD::SRL, MVT::v1i128, Legal);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
- addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
setOperationAction(ISD::FADD, MVT::f128, Legal);
setOperationAction(ISD::FSUB, MVT::f128, Legal);
setOperationAction(ISD::FDIV, MVT::f128, Legal);
setOperationAction(ISD::FMUL, MVT::f128, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
- // No extending loads to f128 on PPC.
- for (MVT FPT : MVT::fp_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
+
setOperationAction(ISD::FMA, MVT::f128, Legal);
setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
@@ -1147,18 +1175,9 @@
setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
setOperationAction(ISD::FROUND, MVT::f128, Legal);
- setOperationAction(ISD::SELECT, MVT::f128, Expand);
setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
- setTruncStoreAction(MVT::f128, MVT::f64, Expand);
- setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i128, Custom);
- // No implementation for these ops for PowerPC.
- setOperationAction(ISD::FSIN, MVT::f128, Expand);
- setOperationAction(ISD::FCOS, MVT::f128, Expand);
- setOperationAction(ISD::FPOW, MVT::f128, Expand);
- setOperationAction(ISD::FPOWI, MVT::f128, Expand);
- setOperationAction(ISD::FREM, MVT::f128, Expand);
// Handle constrained floating-point operations of fp128
setOperationAction(ISD::STRICT_FADD, MVT::f128, Legal);
@@ -1181,12 +1200,7 @@
setOperationAction(ISD::BSWAP, MVT::v4i32, Legal);
setOperationAction(ISD::BSWAP, MVT::v2i64, Legal);
setOperationAction(ISD::BSWAP, MVT::v1i128, Legal);
- } else if (Subtarget.hasAltivec() && EnableSoftFP128) {
- addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
-
- for (MVT FPT : MVT::fp_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
-
+ } else if (Subtarget.hasVSX()) {
setOperationAction(ISD::LOAD, MVT::f128, Promote);
setOperationAction(ISD::STORE, MVT::f128, Promote);
@@ -1202,18 +1216,10 @@
setOperationAction(ISD::FDIV, MVT::f128, Expand);
setOperationAction(ISD::FNEG, MVT::f128, Expand);
setOperationAction(ISD::FABS, MVT::f128, Expand);
- setOperationAction(ISD::FSIN, MVT::f128, Expand);
- setOperationAction(ISD::FCOS, MVT::f128, Expand);
- setOperationAction(ISD::FPOW, MVT::f128, Expand);
- setOperationAction(ISD::FPOWI, MVT::f128, Expand);
- setOperationAction(ISD::FREM, MVT::f128, Expand);
setOperationAction(ISD::FSQRT, MVT::f128, Expand);
setOperationAction(ISD::FMA, MVT::f128, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
- setTruncStoreAction(MVT::f128, MVT::f64, Expand);
- setTruncStoreAction(MVT::f128, MVT::f32, Expand);
-
// Expand the fp_extend if the target type is fp128.
setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Expand);
@@ -1223,6 +1229,19 @@
setOperationAction(ISD::FP_ROUND, VT, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
}
+
+ setOperationAction(ISD::SETCC, MVT::f128, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f128, Expand);
+
+ // Lower following f128 select_cc pattern:
+ // select_cc x, y, tv, fv, cc -> select_cc (setcc x, y, cc), 0, tv, fv, NE
+ setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+
+ // We need to handle f128 SELECT_CC with integer result type.
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, isPPC64 ? Custom : Expand);
}
if (Subtarget.hasP9Altivec()) {
@@ -1237,6 +1256,9 @@
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
}
+
+ if (Subtarget.isISA3_1())
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
}
if (Subtarget.pairedVectorMemops()) {
@@ -1264,6 +1286,9 @@
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
}
+ if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics())
+ setMaxAtomicSizeInBitsSupported(128);
+
setBooleanContents(ZeroOrOneBooleanContent);
if (Subtarget.hasAltivec()) {
@@ -1416,6 +1441,84 @@
PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive();
}
+// *********************************** NOTE ************************************
+// For selecting load and store instructions, the addressing modes are defined
+// as ComplexPatterns in PPCInstrInfo.td, which are then utilized in the TD
+// patterns to match the load and store instructions.
+//
+// The TD definitions for the addressing modes correspond to their respective
+// Select<AddrMode>Form() function in PPCISelDAGToDAG.cpp. These functions rely
+// on SelectOptimalAddrMode(), which calls computeMOFlags() to compute the
+// address mode flags of a particular node. Afterwards, the computed address
+// flags are passed into getAddrModeForFlags() in order to retrieve the optimal
+// addressing mode. SelectOptimalAddrMode() then sets the Base and Displacement
+// accordingly, based on the preferred addressing mode.
+//
+// Within PPCISelLowering.h, there are two enums: MemOpFlags and AddrMode.
+// MemOpFlags contains all the possible flags that can be used to compute the
+// optimal addressing mode for load and store instructions.
+// AddrMode contains all the possible load and store addressing modes available
+// on Power (such as DForm, DSForm, DQForm, XForm, etc.)
+//
+// When adding new load and store instructions, it is possible that new address
+// flags may need to be added to MemOpFlags, and a new addressing mode will
+// need to be added to AddrMode. An entry for the new addressing mode
+// (consisting of the minimal and main distinguishing address flags for the new
+// load/store instructions) will need to be added to initializeAddrModeMap()
+// below. Finally, when adding new addressing modes, getAddrModeForFlags() will
+// need to be updated to account for selecting the optimal addressing mode.
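+// For example, a zero-extending word load whose address is reg + simm16 (an
+// LWZ) has computed flags that include MOF_ZExt, MOF_RPlusSImm16 and
+// MOF_WordInt, matching the first AM_DForm entry below.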
+// *****************************************************************************
+/// Initialize the map that relates the different addressing modes of the load
+/// and store instructions to a set of flags. This ensures the load/store
+/// instruction is correctly matched during instruction selection.
+void PPCTargetLowering::initializeAddrModeMap() {
+ AddrModesMap[PPC::AM_DForm] = {
+ // LWZ, STW
+ PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_WordInt,
+ PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_WordInt,
+ PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,
+ PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,
+ // LBZ, LHZ, STB, STH
+ PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,
+ PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,
+ PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,
+ PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,
+ // LHA
+ PPC::MOF_SExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,
+ PPC::MOF_SExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,
+ PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,
+ PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,
+ // LFS, LFD, STFS, STFD
+ PPC::MOF_RPlusSImm16 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
+ PPC::MOF_RPlusLo | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
+ PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
+ PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
+ };
+ AddrModesMap[PPC::AM_DSForm] = {
+ // LWA
+ PPC::MOF_SExt | PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_WordInt,
+ PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,
+ PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,
+ // LD, STD
+ PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_DoubleWordInt,
+ PPC::MOF_NotAddNorCst | PPC::MOF_DoubleWordInt,
+ PPC::MOF_AddrIsSImm32 | PPC::MOF_DoubleWordInt,
+ // DFLOADf32, DFLOADf64, DSTOREf32, DSTOREf64
+ PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
+ PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
+ PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
+ };
+ AddrModesMap[PPC::AM_DQForm] = {
+ // LXV, STXV
+ PPC::MOF_RPlusSImm16Mult16 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
+ PPC::MOF_NotAddNorCst | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
+ PPC::MOF_AddrIsSImm32 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
+ PPC::MOF_RPlusSImm16Mult16 | PPC::MOF_Vector256 | PPC::MOF_SubtargetP10,
+ PPC::MOF_NotAddNorCst | PPC::MOF_Vector256 | PPC::MOF_SubtargetP10,
+ PPC::MOF_AddrIsSImm32 | PPC::MOF_Vector256 | PPC::MOF_SubtargetP10,
+ };
+}
+
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
@@ -1571,6 +1674,7 @@
case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
+ case PPCISD::TLSGD_AIX: return "PPCISD::TLSGD_AIX";
case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
@@ -2427,6 +2531,20 @@
return isIntS16Immediate(Op.getNode(), Imm);
}
+/// Used when computing address flags for selecting loads and stores.
+/// If we have an OR, check if the LHS and RHS are provably disjoint.
+/// An OR of two provably disjoint values is equivalent to an ADD.
+/// Most PPC load/store instructions compute the effective address as a sum,
+/// so doing this conversion is useful.
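+/// For example, in (or (shl X, 4), 3) every bit is known zero in at least one
+/// operand, so the OR can never produce a carry and behaves like an ADD.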
+static bool provablyDisjointOr(SelectionDAG &DAG, const SDValue &N) {
+ if (N.getOpcode() != ISD::OR)
+ return false;
+ KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
+ if (!LHSKnown.Zero.getBoolValue())
+ return false;
+ KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1));
+ return (~(LHSKnown.Zero | RHSKnown.Zero) == 0);
+}
/// SelectAddressEVXRegReg - Given the specified address, check to see if it can
/// be represented as an indexed [r+r] operation.
@@ -3116,6 +3234,41 @@
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
+ if (Subtarget.isAIXABI())
+ return LowerGlobalTLSAddressAIX(Op, DAG);
+
+ return LowerGlobalTLSAddressLinux(Op, DAG);
+}
+
+SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
+ SelectionDAG &DAG) const {
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+ if (DAG.getTarget().useEmulatedTLS())
+ report_fatal_error("Emulated TLS is not yet supported on AIX");
+
+ SDLoc dl(GA);
+ const GlobalValue *GV = GA->getGlobal();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // The general-dynamic model is the only access model supported for now, so
+ // all the GlobalTLSAddress nodes are lowered with this model.
+ // We need to generate two TOC entries, one for the variable offset, one for
+ // the region handle. The global address for the TOC entry of the region
+ // handle is created with the MO_TLSGDM_FLAG flag and the global address
+ // for the TOC entry of the variable offset is created with MO_TLSGD_FLAG.
+ SDValue VariableOffsetTGA =
+ DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLSGD_FLAG);
+ SDValue RegionHandleTGA =
+ DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLSGDM_FLAG);
+ SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);
+ SDValue RegionHandle = getTOCEntry(DAG, dl, RegionHandleTGA);
+ return DAG.getNode(PPCISD::TLSGD_AIX, dl, PtrVT, VariableOffset,
+ RegionHandle);
+}
+
+SDValue PPCTargetLowering::LowerGlobalTLSAddressLinux(SDValue Op,
+ SelectionDAG &DAG) const {
// FIXME: TLS addresses currently use medium model code sequences,
// which is the most useful form. Eventually support for small and
// large models could be added if users need it, at the cost of
@@ -3294,21 +3447,43 @@
}
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ bool IsStrict = Op->isStrictFPOpcode();
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
+ SDValue LHS = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue RHS = Op.getOperand(IsStrict ? 2 : 1);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ EVT LHSVT = LHS.getValueType();
SDLoc dl(Op);
+  // Soften the setcc with a libcall if it is fp128.
+ if (LHSVT == MVT::f128) {
+ assert(!Subtarget.hasP9Vector() &&
+ "SETCC for f128 is already legal under Power9!");
+ softenSetCCOperands(DAG, LHSVT, LHS, RHS, CC, dl, LHS, RHS, Chain,
+ Op->getOpcode() == ISD::STRICT_FSETCCS);
+ if (RHS.getNode())
+ LHS = DAG.getNode(ISD::SETCC, dl, Op.getValueType(), LHS, RHS,
+ DAG.getCondCode(CC));
+ if (IsStrict)
+ return DAG.getMergeValues({LHS, Chain}, dl);
+ return LHS;
+ }
+
+ assert(!IsStrict && "Don't know how to handle STRICT_FSETCC!");
+
if (Op.getValueType() == MVT::v2i64) {
// When the operands themselves are v2i64 values, we need to do something
// special because VSX has no underlying comparison operations for these.
- if (Op.getOperand(0).getValueType() == MVT::v2i64) {
+ if (LHS.getValueType() == MVT::v2i64) {
// Equality can be handled by casting to the legal type for Altivec
// comparisons, everything else needs to be expanded.
if (CC == ISD::SETEQ || CC == ISD::SETNE) {
- return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
- DAG.getSetCC(dl, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
- DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
- CC));
+ return DAG.getNode(
+ ISD::BITCAST, dl, MVT::v2i64,
+ DAG.getSetCC(dl, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, LHS),
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, RHS), CC));
}
return SDValue();
@@ -3324,7 +3499,7 @@
if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
return V;
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
// Leave comparisons against 0 and -1 alone for now, since they're usually
// optimized. FIXME: revisit this when we can custom lower all setcc
// optimizations.
@@ -3337,11 +3512,9 @@
// condition register, reading it back out, and masking the correct bit. The
// normal approach here uses sub to do this instead of xor. Using xor exposes
// the result to other bit-twiddling opportunities.
- EVT LHSVT = Op.getOperand(0).getValueType();
if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
EVT VT = Op.getValueType();
- SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
- Op.getOperand(1));
+ SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, LHS, RHS);
return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC);
}
return SDValue();
@@ -4979,7 +5152,7 @@
if (isPatchPoint)
return false;
- if (isFunctionGlobalAddress(Callee) || dyn_cast<ExternalSymbolSDNode>(Callee))
+ if (isFunctionGlobalAddress(Callee) || isa<ExternalSymbolSDNode>(Callee))
return false;
// Darwin, and 32-bit ELF can use a BLA. The descriptor based ABIs can not
@@ -5104,8 +5277,8 @@
const auto getExternalFunctionEntryPointSymbol = [&](StringRef SymName) {
auto &Context = DAG.getMachineFunction().getMMI().getContext();
MCSectionXCOFF *Sec = Context.getXCOFFSection(
- (Twine(".") + Twine(SymName)).str(), XCOFF::XMC_PR, XCOFF::XTY_ER,
- SectionKind::getMetadata());
+ (Twine(".") + Twine(SymName)).str(), SectionKind::getMetadata(),
+ XCOFF::CsectProperties(XCOFF::XMC_PR, XCOFF::XTY_ER));
return Sec->getQualNameSymbol();
};
@@ -6312,21 +6485,49 @@
Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
+// Returns true when the shadow of a general purpose argument register
+// in the parameter save area is aligned to at least 'RequiredAlign'.
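+// For example, in 64-bit mode the parameter save area starts at a 16-byte
+// aligned offset, so X3's shadow is 16-byte aligned while X4's shadow, eight
+// bytes further in, is only 8-byte aligned.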
+static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) {
+ assert(RequiredAlign.value() <= 16 &&
+ "Required alignment greater than stack alignment.");
+ switch (Reg) {
+ default:
+ report_fatal_error("called on invalid register.");
+ case PPC::R5:
+ case PPC::R9:
+ case PPC::X3:
+ case PPC::X5:
+ case PPC::X7:
+ case PPC::X9:
+    // These registers are 16-byte aligned, which is the strictest alignment
+    // we can support.
+ return true;
+ case PPC::R3:
+ case PPC::R7:
+ case PPC::X4:
+ case PPC::X6:
+ case PPC::X8:
+ case PPC::X10:
+    // The shadow of these registers in the PSA is 8-byte aligned.
+ return RequiredAlign <= 8;
+ case PPC::R4:
+ case PPC::R6:
+ case PPC::R8:
+ case PPC::R10:
+ return RequiredAlign <= 4;
+ }
+}
+
static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State) {
-
+ CCState &S) {
+ AIXCCState &State = static_cast<AIXCCState &>(S);
const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
State.getMachineFunction().getSubtarget());
const bool IsPPC64 = Subtarget.isPPC64();
const Align PtrAlign = IsPPC64 ? Align(8) : Align(4);
const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
- if (ValVT.isVector() && !State.getMachineFunction()
- .getTarget()
- .Options.EnableAIXExtendedAltivecABI)
- report_fatal_error("the default Altivec AIX ABI is not yet supported");
-
if (ValVT == MVT::f128)
report_fatal_error("f128 is unimplemented on AIX.");
@@ -6447,15 +6648,96 @@
case MVT::v2i64:
case MVT::v2f64:
case MVT::v1i128: {
- if (State.isVarArg())
- report_fatal_error(
- "variadic arguments for vector types are unimplemented for AIX");
+ const unsigned VecSize = 16;
+ const Align VecAlign(VecSize);
- if (unsigned VReg = State.AllocateReg(VR))
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
- else {
- report_fatal_error(
- "passing vector parameters to the stack is unimplemented for AIX");
+ if (!State.isVarArg()) {
+ // If there are vector registers remaining we don't consume any stack
+ // space.
+ if (unsigned VReg = State.AllocateReg(VR)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
+ return false;
+ }
+ // Vectors passed on the stack do not shadow GPRs or FPRs even though they
+ // might be allocated in the portion of the PSA that is shadowed by the
+ // GPRs.
+ const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return false;
+ }
+
+ const unsigned PtrSize = IsPPC64 ? 8 : 4;
+ ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;
+
+ unsigned NextRegIndex = State.getFirstUnallocated(GPRs);
+ // Burn any underaligned registers and their shadowed stack space until
+ // we reach the required alignment.
+ while (NextRegIndex != GPRs.size() &&
+ !isGPRShadowAligned(GPRs[NextRegIndex], VecAlign)) {
+ // Shadow allocate register and its stack shadow.
+ unsigned Reg = State.AllocateReg(GPRs);
+ State.AllocateStack(PtrSize, PtrAlign);
+ assert(Reg && "Allocating register unexpectedly failed.");
+ (void)Reg;
+ NextRegIndex = State.getFirstUnallocated(GPRs);
+ }
+
+    // Vectors that are passed as fixed arguments are handled differently.
+    // They are passed in VRs if any are available (unlike arguments passed
+    // through the ellipsis) and shadow GPRs (unlike arguments to non-vararg
+    // functions).
+ if (State.isFixed(ValNo)) {
+ if (unsigned VReg = State.AllocateReg(VR)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
+ // Shadow allocate GPRs and stack space even though we pass in a VR.
+ for (unsigned I = 0; I != VecSize; I += PtrSize)
+ State.AllocateReg(GPRs);
+ State.AllocateStack(VecSize, VecAlign);
+ return false;
+ }
+ // No vector registers remain so pass on the stack.
+ const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return false;
+ }
+
+    // If all GPRs are consumed then we pass the argument fully on the stack.
+ if (NextRegIndex == GPRs.size()) {
+ const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return false;
+ }
+
+    // Corner case for 32-bit codegen: only two registers (R9 and R10) remain
+    // to pass the first half of the argument; the remaining half is passed on
+    // the stack.
+ if (GPRs[NextRegIndex] == PPC::R9) {
+ const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
+ State.addLoc(
+ CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+
+ const unsigned FirstReg = State.AllocateReg(PPC::R9);
+ const unsigned SecondReg = State.AllocateReg(PPC::R10);
+ assert(FirstReg && SecondReg &&
+ "Allocating R9 or R10 unexpectedly failed.");
+ State.addLoc(
+ CCValAssign::getCustomReg(ValNo, ValVT, FirstReg, RegVT, LocInfo));
+ State.addLoc(
+ CCValAssign::getCustomReg(ValNo, ValVT, SecondReg, RegVT, LocInfo));
+ return false;
+ }
+
+ // We have enough GPRs to fully pass the vector argument, and we have
+ // already consumed any underaligned registers. Start with the custom
+ // MemLoc and then the custom RegLocs.
+ const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
+ State.addLoc(
+ CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ for (unsigned I = 0; I != VecSize; I += PtrSize) {
+ const unsigned Reg = State.AllocateReg(GPRs);
+ assert(Reg && "Failed to allocated register for vararg vector argument");
+ State.addLoc(
+ CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
}
return false;
}
@@ -6463,8 +6745,11 @@
return true;
}
+// So far, this function is only used by LowerFormalArguments_AIX()
static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT,
- bool IsPPC64) {
+ bool IsPPC64,
+ bool HasP8Vector,
+ bool HasVSX) {
assert((IsPPC64 || SVT != MVT::i64) &&
"i64 should have been split for 32-bit codegen.");
@@ -6476,9 +6761,9 @@
case MVT::i64:
return IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
case MVT::f32:
- return &PPC::F4RCRegClass;
+ return HasP8Vector ? &PPC::VSSRCRegClass : &PPC::F4RCRegClass;
case MVT::f64:
- return &PPC::F8RCRegClass;
+ return HasVSX ? &PPC::VSFRCRegClass : &PPC::F8RCRegClass;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
@@ -6590,7 +6875,7 @@
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+ AIXCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
const EVT PtrVT = getPointerTy(MF.getDataLayout());
// Reserve space for the linkage area on the stack.
@@ -6603,27 +6888,116 @@
for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {
CCValAssign &VA = ArgLocs[I++];
MVT LocVT = VA.getLocVT();
+ MVT ValVT = VA.getValVT();
ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
- if (VA.isMemLoc() && VA.getValVT().isVector())
- report_fatal_error(
- "passing vector parameters to the stack is unimplemented for AIX");
-
// For compatibility with the AIX XL compiler, the float args in the
// parameter save area are initialized even if the argument is available
// in register. The caller is required to initialize both the register
// and memory, however, the callee can choose to expect it in either.
// The memloc is dismissed here because the argument is retrieved from
// the register.
- if (VA.isMemLoc() && VA.needsCustom())
+ if (VA.isMemLoc() && VA.needsCustom() && ValVT.isFloatingPoint())
continue;
+ auto HandleMemLoc = [&]() {
+ const unsigned LocSize = LocVT.getStoreSize();
+ const unsigned ValSize = ValVT.getStoreSize();
+ assert((ValSize <= LocSize) &&
+ "Object size is larger than size of MemLoc");
+ int CurArgOffset = VA.getLocMemOffset();
+ // Objects are right-justified because AIX is big-endian.
+ if (LocSize > ValSize)
+ CurArgOffset += LocSize - ValSize;
+ // Potential tail calls could cause overwriting of argument stack slots.
+ const bool IsImmutable =
+ !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+ (CallConv == CallingConv::Fast));
+ int FI = MFI.CreateFixedObject(ValSize, CurArgOffset, IsImmutable);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ SDValue ArgValue =
+ DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo());
+ InVals.push_back(ArgValue);
+ };
+
+ // Vector arguments to VaArg functions are passed both on the stack, and
+ // in any available GPRs. Load the value from the stack and add the GPRs
+ // as live ins.
+ if (VA.isMemLoc() && VA.needsCustom()) {
+ assert(ValVT.isVector() && "Unexpected Custom MemLoc type.");
+ assert(isVarArg && "Only use custom memloc for vararg.");
+      // Remember the ValNo of the custom MemLoc so we can compare it to the
+      // ValNo of the matching custom RegLocs.
+ const unsigned OriginalValNo = VA.getValNo();
+ (void)OriginalValNo;
+
+ auto HandleCustomVecRegLoc = [&]() {
+ assert(I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
+ "Missing custom RegLoc.");
+ VA = ArgLocs[I++];
+ assert(VA.getValVT().isVector() &&
+ "Unexpected Val type for custom RegLoc.");
+ assert(VA.getValNo() == OriginalValNo &&
+ "ValNo mismatch between custom MemLoc and RegLoc.");
+ MVT::SimpleValueType SVT = VA.getLocVT().SimpleTy;
+ MF.addLiveIn(VA.getLocReg(),
+ getRegClassForSVT(SVT, IsPPC64, Subtarget.hasP8Vector(),
+ Subtarget.hasVSX()));
+ };
+
+ HandleMemLoc();
+      // In 64-bit there will be exactly 2 custom RegLocs that follow, and in
+      // 32-bit there will be 2 custom RegLocs if we are passing the vector in
+      // R9 and R10.
+ HandleCustomVecRegLoc();
+ HandleCustomVecRegLoc();
+
+ // If we are targeting 32-bit, there might be 2 extra custom RegLocs if
+ // we passed the vector in R5, R6, R7 and R8.
+ if (I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom()) {
+ assert(!IsPPC64 &&
+ "Only 2 custom RegLocs expected for 64-bit codegen.");
+ HandleCustomVecRegLoc();
+ HandleCustomVecRegLoc();
+ }
+
+ continue;
+ }
+
if (VA.isRegLoc()) {
if (VA.getValVT().isScalarInteger())
FuncInfo->appendParameterType(PPCFunctionInfo::FixedType);
- else if (VA.getValVT().isFloatingPoint() && !VA.getValVT().isVector())
- FuncInfo->appendParameterType(VA.getValVT().SimpleTy == MVT::f32
- ? PPCFunctionInfo::ShortFloatPoint
- : PPCFunctionInfo::LongFloatPoint);
+ else if (VA.getValVT().isFloatingPoint() && !VA.getValVT().isVector()) {
+ switch (VA.getValVT().SimpleTy) {
+ default:
+ report_fatal_error("Unhandled value type for argument.");
+ case MVT::f32:
+ FuncInfo->appendParameterType(PPCFunctionInfo::ShortFloatingPoint);
+ break;
+ case MVT::f64:
+ FuncInfo->appendParameterType(PPCFunctionInfo::LongFloatingPoint);
+ break;
+ }
+ } else if (VA.getValVT().isVector()) {
+ switch (VA.getValVT().SimpleTy) {
+ default:
+ report_fatal_error("Unhandled value type for argument.");
+ case MVT::v16i8:
+ FuncInfo->appendParameterType(PPCFunctionInfo::VectorChar);
+ break;
+ case MVT::v8i16:
+ FuncInfo->appendParameterType(PPCFunctionInfo::VectorShort);
+ break;
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v1i128:
+ FuncInfo->appendParameterType(PPCFunctionInfo::VectorInt);
+ break;
+ case MVT::v4f32:
+ case MVT::v2f64:
+ FuncInfo->appendParameterType(PPCFunctionInfo::VectorFloat);
+ break;
+ }
+ }
}
if (Flags.isByVal() && VA.isMemLoc()) {
@@ -6704,11 +7078,12 @@
continue;
}
- EVT ValVT = VA.getValVT();
if (VA.isRegLoc() && !VA.needsCustom()) {
- MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy;
- unsigned VReg =
- MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
+ MVT::SimpleValueType SVT = ValVT.SimpleTy;
+ Register VReg =
+ MF.addLiveIn(VA.getLocReg(),
+ getRegClassForSVT(SVT, IsPPC64, Subtarget.hasP8Vector(),
+ Subtarget.hasVSX()));
SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
if (ValVT.isScalarInteger() &&
(ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits())) {
@@ -6719,23 +7094,7 @@
continue;
}
if (VA.isMemLoc()) {
- const unsigned LocSize = LocVT.getStoreSize();
- const unsigned ValSize = ValVT.getStoreSize();
- assert((ValSize <= LocSize) &&
- "Object size is larger than size of MemLoc");
- int CurArgOffset = VA.getLocMemOffset();
- // Objects are right-justified because AIX is big-endian.
- if (LocSize > ValSize)
- CurArgOffset += LocSize - ValSize;
- // Potential tail calls could cause overwriting of argument stack slots.
- const bool IsImmutable =
- !(getTargetMachine().Options.GuaranteedTailCallOpt &&
- (CallConv == CallingConv::Fast));
- int FI = MFI.CreateFixedObject(ValSize, CurArgOffset, IsImmutable);
- SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- SDValue ArgValue =
- DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo());
- InVals.push_back(ArgValue);
+ HandleMemLoc();
continue;
}
}
@@ -6816,8 +7175,8 @@
MachineFunction &MF = DAG.getMachineFunction();
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
- *DAG.getContext());
+ AIXCCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
+ *DAG.getContext());
// Reserve space for the linkage save area (LSA) on the stack.
// In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
@@ -6958,10 +7317,6 @@
const MVT LocVT = VA.getLocVT();
const MVT ValVT = VA.getValVT();
- if (VA.isMemLoc() && VA.getValVT().isVector())
- report_fatal_error(
- "passing vector parameters to the stack is unimplemented for AIX");
-
switch (VA.getLocInfo()) {
default:
report_fatal_error("Unexpected argument extension type.");
@@ -6980,6 +7335,52 @@
continue;
}
+ // Vector arguments passed to VarArg functions need custom handling when
+ // they are passed (at least partially) in GPRs.
+ if (VA.isMemLoc() && VA.needsCustom() && ValVT.isVector()) {
+ assert(CFlags.IsVarArg && "Custom MemLocs only used for Vector args.");
+ // Store value to its stack slot.
+ SDValue PtrOff =
+ DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+ SDValue Store =
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
+ MemOpChains.push_back(Store);
+ const unsigned OriginalValNo = VA.getValNo();
+ // Then load the GPRs from the stack
+ unsigned LoadOffset = 0;
+ auto HandleCustomVecRegLoc = [&]() {
+ assert(I != E && "Unexpected end of CCvalAssigns.");
+ assert(ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
+ "Expected custom RegLoc.");
+ CCValAssign RegVA = ArgLocs[I++];
+ assert(RegVA.getValNo() == OriginalValNo &&
+ "Custom MemLoc ValNo and custom RegLoc ValNo must match.");
+ SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+ DAG.getConstant(LoadOffset, dl, PtrVT));
+ SDValue Load = DAG.getLoad(PtrVT, dl, Store, Add, MachinePointerInfo());
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(RegVA.getLocReg(), Load));
+ LoadOffset += PtrByteSize;
+ };
+
+      // In 64-bit there will be exactly 2 custom RegLocs that follow, and in
+      // 32-bit there will be 2 custom RegLocs if we are passing the vector in
+      // R9 and R10.
+ HandleCustomVecRegLoc();
+ HandleCustomVecRegLoc();
+
+ if (I != E && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
+ ArgLocs[I].getValNo() == OriginalValNo) {
+ assert(!IsPPC64 &&
+ "Only 2 custom RegLocs expected for 64-bit codegen.");
+ HandleCustomVecRegLoc();
+ HandleCustomVecRegLoc();
+ }
+
+ continue;
+ }
+
if (VA.isMemLoc()) {
SDValue PtrOff =
DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
@@ -6990,11 +7391,15 @@
continue;
}
+ if (!ValVT.isFloatingPoint())
+ report_fatal_error(
+ "Unexpected register handling for calling convention.");
+
// Custom handling is used for GPR initializations for vararg float
// arguments.
assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
- ValVT.isFloatingPoint() && LocVT.isInteger() &&
- "Unexpected register handling for calling convention.");
+ LocVT.isInteger() &&
+ "Custom register handling only expected for VarArg.");
SDValue ArgAsInt =
DAG.getBitcast(MVT::getIntegerVT(ValVT.getSizeInBits()), Arg);
@@ -7425,18 +7830,29 @@
/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
/// possible.
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
- // Not FP, or using SPE? Not a fsel.
- if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
- !Op.getOperand(2).getValueType().isFloatingPoint() || Subtarget.hasSPE())
- return Op;
-
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
-
EVT ResVT = Op.getValueType();
EVT CmpVT = Op.getOperand(0).getValueType();
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
SDLoc dl(Op);
+
+  // Without power9-vector, we don't have a native instruction for f128
+  // comparison, so the setcc has to go through a libcall. Transform:
+  // select_cc lhs, rhs, tv, fv, cc -> select_cc (setcc lhs, rhs, cc), 0, tv, fv, NE
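+  // The f128 setcc created below is itself softened to a libcall returning an
+  // integer, so the select only needs to compare that result against zero.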
+ if (!Subtarget.hasP9Vector() && CmpVT == MVT::f128) {
+ SDValue Z = DAG.getSetCC(
+ dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT),
+ LHS, RHS, CC);
+ SDValue Zero = DAG.getConstant(0, dl, Z.getValueType());
+ return DAG.getSelectCC(dl, Z, Zero, TV, FV, ISD::SETNE);
+ }
+
+ // Not FP, or using SPE? Not a fsel.
+ if (!CmpVT.isFloatingPoint() || !TV.getValueType().isFloatingPoint() ||
+ Subtarget.hasSPE())
+ return Op;
+
SDNodeFlags Flags = Op.getNode()->getFlags();
// We have xsmaxcdp/xsmincdp which are OK to emit even in the
@@ -8638,6 +9054,18 @@
return Success;
}
+// Nondestructive check for convertToNonDenormSingle.
+bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) {
+  // Only report success if the conversion is exact and the result is not a
+  // denormal, since that is the case XXSPLTIDP can handle.
+ APFloat APFloatToConvert = ArgAPFloat;
+ bool LosesInfo = true;
+ APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
+ &LosesInfo);
+
+ return (!LosesInfo && !APFloatToConvert.isDenormal());
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -9338,7 +9766,8 @@
// which is strictly wider than the loaded value by 8 bytes. So we need to
// adjust the splat index to point to the correct address in memory.
if (IsPermutedLoad) {
- assert(isLittleEndian && "Unexpected permuted load on big endian target");
+ assert((isLittleEndian || IsFourByte) &&
+ "Unexpected size for permuted load on big endian target");
SplatIdx += IsFourByte ? 2 : 1;
assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
"Splat of a value outside of the loaded memory");
@@ -9353,6 +9782,11 @@
else
Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
+ // If the width of the load is the same as the width of the splat,
+ // loading with an offset would load the wrong memory.
+ if (LD->getValueType(0).getSizeInBits() == (IsFourByte ? 32 : 64))
+ Offset = 0;
+
SDValue BasePtr = LD->getBasePtr();
if (Offset != 0)
BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
@@ -9646,7 +10080,7 @@
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpequd_p:
- if (Subtarget.hasP8Altivec()) {
+ if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
CompareOpc = 199;
isDot = true;
} else
@@ -9706,7 +10140,7 @@
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtsd_p:
- if (Subtarget.hasP8Altivec()) {
+ if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
CompareOpc = 967;
isDot = true;
} else
@@ -9725,7 +10159,7 @@
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtud_p:
- if (Subtarget.hasP8Altivec()) {
+ if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
CompareOpc = 711;
isDot = true;
} else
@@ -9929,7 +10363,7 @@
PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, WideVec,
DAG.getConstant(Subtarget.isLittleEndian() ? NumVecs - 1 - VecNo
: VecNo,
- dl, MVT::i64));
+ dl, getPointerTy(DAG.getDataLayout())));
RetOps.push_back(Extract);
}
return DAG.getMergeValues(RetOps, dl);
@@ -10024,6 +10458,8 @@
// Lower scalar BSWAP64 to xxbrd.
SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
+ if (!Subtarget.isPPC64())
+ return Op;
// MTVSRDD
Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
Op.getOperand(0));
@@ -10096,14 +10532,39 @@
"Should only be called for ISD::INSERT_VECTOR_ELT");
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
- // We have legal lowering for constant indices but not for variable ones.
- if (!C)
- return SDValue();
EVT VT = Op.getValueType();
SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
+ SDValue V3 = Op.getOperand(2);
+
+ if (VT == MVT::v2f64 && C)
+ return Op;
+
+ if (Subtarget.isISA3_1()) {
+ if ((VT == MVT::v2i64 || VT == MVT::v2f64) && !Subtarget.isPPC64())
+ return SDValue();
+ // On P10, we have legal lowering for constant and variable indices for
+ // integer vectors.
+ if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ VT == MVT::v2i64)
+ return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, V2, V3);
+ // For f32 and f64 vectors, we have legal lowering for variable indices.
+ // For f32 we also have legal lowering when the element is loaded from
+ // memory.
+ if (VT == MVT::v4f32 || VT == MVT::v2f64) {
+ if (!C || (VT == MVT::v4f32 && dyn_cast<LoadSDNode>(V2)))
+ return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, V2, V3);
+ return Op;
+ }
+ }
+
+ // Before P10, we have legal lowering for constant indices but not for
+ // variable ones.
+ if (!C)
+ return SDValue();
+
// We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
if (VT == MVT::v8i16 || VT == MVT::v16i8) {
SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
@@ -10193,7 +10654,7 @@
for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx;
SDValue Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value,
- DAG.getConstant(VecNum, dl, MVT::i64));
+ DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout())));
SDValue Store =
DAG.getStore(StoreChain, dl, Elt, BasePtr,
SN->getPointerInfo().getWithOffset(Idx * 16),
@@ -10367,6 +10828,8 @@
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS:
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
@@ -10525,7 +10988,7 @@
// Other Lowering Code
//===----------------------------------------------------------------------===//
-static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
+static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Function *Func = Intrinsic::getDeclaration(M, Id);
return Builder.CreateCall(Func, {});
@@ -10533,7 +10996,7 @@
// The mappings for emitLeading/TrailingFence is taken from
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
-Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
@@ -10543,7 +11006,7 @@
return nullptr;
}
-Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
@@ -10806,6 +11269,7 @@
Register Tmp3Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
+ Register SrwDestReg = RegInfo.createVirtualRegister(GPRC);
Register Ptr1Reg;
Register TmpReg =
(!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC);
@@ -10833,7 +11297,8 @@
// stwcx. tmp4, ptr
// bne- loopMBB
// fallthrough --> exitMBB
- // srw dest, tmpDest, shift
+ // srw SrwDest, tmpDest, shift
+  // rlwinm dest, SrwDest, 0, 24 [16], 31
if (ptrA != ZeroReg) {
Ptr1Reg = RegInfo.createVirtualRegister(RC);
BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
@@ -10935,7 +11400,14 @@
// exitMBB:
// ...
BB = exitMBB;
- BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest)
+ // Since the shift amount is not a constant, we need to clear
+ // the upper bits with a separate RLWINM.
+ BuildMI(*BB, BB->begin(), dl, TII->get(PPC::RLWINM), dest)
+ .addReg(SrwDestReg)
+ .addImm(0)
+ .addImm(is8bit ? 24 : 16)
+ .addImm(31);
+ BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), SrwDestReg)
.addReg(TmpDestReg)
.addReg(ShiftReg);
return BB;
@@ -12164,6 +12636,17 @@
} else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||
MI.getOpcode() == PPC::PROBED_ALLOCA_64) {
return emitProbedAlloca(MI, BB);
+ } else if (MI.getOpcode() == PPC::SPLIT_QUADWORD) {
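+    // Expand the SPLIT_QUADWORD pseudo into two subregister copies: the
+    // sub_gp8_x1 half becomes Lo and the sub_gp8_x0 half becomes Hi.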
+ DebugLoc DL = MI.getDebugLoc();
+ Register Src = MI.getOperand(2).getReg();
+ Register Lo = MI.getOperand(0).getReg();
+ Register Hi = MI.getOperand(1).getReg();
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
+ .addDef(Lo)
+ .addUse(Src, 0, PPC::sub_gp8_x1);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
+ .addDef(Hi)
+ .addUse(Src, 0, PPC::sub_gp8_x0);
} else {
llvm_unreachable("Unexpected instr type to insert");
}
@@ -13495,7 +13978,7 @@
if (Operand.getOpcode() != ISD::LOAD)
return SDValue();
- LoadSDNode *LD = dyn_cast<LoadSDNode>(Operand);
+ auto *LD = cast<LoadSDNode>(Operand);
EVT MemoryType = LD->getMemoryVT();
 // This transformation is only valid if we are loading either a byte,
@@ -13872,6 +14355,9 @@
(Op1VT == MVT::i32 || Op1VT == MVT::i64 ||
(Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));
+ if (ResVT == MVT::f128 && !Subtarget.hasP9Vector())
+ return SDValue();
+
if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Vector() ||
cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
return SDValue();
@@ -13950,13 +14436,24 @@
return SDValue();
}
+// Fix up the shuffle mask to account for the fact that the result of
+// scalar_to_vector is not in lane zero. This just takes all values in
+// the ranges specified by the min/max indices and adds the number of
+// elements required to ensure each element comes from the respective
+// position in the valid lane.
+// On little endian, that's just the corresponding element in the other
+// half of the vector. On big endian, it is in the same half but right
+// justified rather than left justified in that half.
static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV,
int LHSMaxIdx, int RHSMinIdx,
- int RHSMaxIdx, int HalfVec) {
+ int RHSMaxIdx, int HalfVec,
+ unsigned ValidLaneWidth,
+ const PPCSubtarget &Subtarget) {
for (int i = 0, e = ShuffV.size(); i < e; i++) {
int Idx = ShuffV[i];
if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx))
- ShuffV[i] += HalfVec;
+ ShuffV[i] +=
+ Subtarget.isLittleEndian() ? HalfVec : HalfVec - ValidLaneWidth;
}
}
@@ -13965,7 +14462,8 @@
// (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C))))
// In such a case, just change the shuffle mask to extract the element
// from the permuted index.
-static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG) {
+static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget) {
SDLoc dl(OrigSToV);
EVT VT = OrigSToV.getValueType();
assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR &&
@@ -13979,8 +14477,14 @@
// Can't handle non-const element indices or different vector types
// for the input to the extract and the output of the scalar_to_vector.
if (Idx && VT == OrigVector.getValueType()) {
- SmallVector<int, 16> NewMask(VT.getVectorNumElements(), -1);
- NewMask[VT.getVectorNumElements() / 2] = Idx->getZExtValue();
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(
+ NumElts > 1 &&
+ "Cannot produce a permuted scalar_to_vector for one element vector");
+ SmallVector<int, 16> NewMask(NumElts, -1);
+ unsigned ResultInElt = NumElts / 2;
+ ResultInElt -= Subtarget.isLittleEndian() ? 0 : 1;
+ NewMask[ResultInElt] = Idx->getZExtValue();
return DAG.getVectorShuffle(VT, dl, OrigVector, OrigVector, NewMask);
}
}
@@ -13996,6 +14500,10 @@
// Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute
// to put the value into element zero. Adjust the shuffle mask so that the
// vector can remain in permuted form (to prevent a swap prior to a shuffle).
+// On big endian targets, this is still useful for SCALAR_TO_VECTOR
+// nodes with elements smaller than doubleword because all the ways
+// of getting scalar data into a vector register put the value in the
+// rightmost element of the left half of the vector.
SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG) const {
SDValue LHS = SVN->getOperand(0);
@@ -14004,10 +14512,14 @@
int NumElts = LHS.getValueType().getVectorNumElements();
SDValue Res(SVN, 0);
SDLoc dl(SVN);
+ bool IsLittleEndian = Subtarget.isLittleEndian();
- // None of these combines are useful on big endian systems since the ISA
- // already has a big endian bias.
- if (!Subtarget.isLittleEndian() || !Subtarget.hasVSX())
+ // On big endian targets this is only useful for subtargets with direct moves.
+ // On little endian targets it would be useful for all subtargets with VSX.
+ // However adding special handling for LE subtargets without direct moves
+ // would be wasted effort since the minimum arch for LE is ISA 2.07 (Power8)
+ // which includes direct moves.
+ if (!Subtarget.hasDirectMove())
return Res;
// If this is not a shuffle of a shuffle and the first element comes from
@@ -14030,6 +14542,15 @@
int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements()
: SToVRHS.getValueType().getVectorNumElements();
int NumEltsOut = ShuffV.size();
+ // The width of the "valid lane" (i.e. the lane that contains the value that
+ // is vectorized) needs to be expressed in terms of the number of elements
+ // of the shuffle. It is thereby the ratio of the values before and after
+ // any bitcast.
+ unsigned ValidLaneWidth =
+ SToVLHS ? SToVLHS.getValueType().getScalarSizeInBits() /
+ LHS.getValueType().getScalarSizeInBits()
+ : SToVRHS.getValueType().getScalarSizeInBits() /
+ RHS.getValueType().getScalarSizeInBits();
// Initially assume that neither input is permuted. These will be adjusted
// accordingly if either input is.
@@ -14040,18 +14561,26 @@
// Get the permuted scalar to vector nodes for the source(s) that come from
// ISD::SCALAR_TO_VECTOR.
+ // On big endian systems, this only makes sense for element sizes smaller
+ // than 64 bits since for 64-bit elements, all instructions already put
+ // the value into element zero. Since the scalar sizes of LHS and RHS may
+ // differ after isScalarToVec, each is checked using its own size.
if (SToVLHS) {
+ if (!IsLittleEndian && SToVLHS.getValueType().getScalarSizeInBits() >= 64)
+ return Res;
// Set up the values for the shuffle vector fixup.
LHSMaxIdx = NumEltsOut / NumEltsIn;
- SToVLHS = getSToVPermuted(SToVLHS, DAG);
+ SToVLHS = getSToVPermuted(SToVLHS, DAG, Subtarget);
if (SToVLHS.getValueType() != LHS.getValueType())
SToVLHS = DAG.getBitcast(LHS.getValueType(), SToVLHS);
LHS = SToVLHS;
}
if (SToVRHS) {
+ if (!IsLittleEndian && SToVRHS.getValueType().getScalarSizeInBits() >= 64)
+ return Res;
RHSMinIdx = NumEltsOut;
RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx;
- SToVRHS = getSToVPermuted(SToVRHS, DAG);
+ SToVRHS = getSToVPermuted(SToVRHS, DAG, Subtarget);
if (SToVRHS.getValueType() != RHS.getValueType())
SToVRHS = DAG.getBitcast(RHS.getValueType(), SToVRHS);
RHS = SToVRHS;
@@ -14061,10 +14590,9 @@
// The minimum and maximum indices that correspond to element zero for both
// the LHS and RHS are computed and will control which shuffle mask entries
// are to be changed. For example, if the RHS is permuted, any shuffle mask
- // entries in the range [RHSMinIdx,RHSMaxIdx) will be incremented by
- // HalfVec to refer to the corresponding element in the permuted vector.
+ // entries in the range [RHSMinIdx,RHSMaxIdx) will be adjusted.
fixupShuffleMaskForPermutedSToV(ShuffV, LHSMaxIdx, RHSMinIdx, RHSMaxIdx,
- HalfVec);
+ HalfVec, ValidLaneWidth, Subtarget);
Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
// We may have simplified away the shuffle. We won't be able to do anything
@@ -14074,12 +14602,13 @@
Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
}
+ SDValue TheSplat = IsLittleEndian ? RHS : LHS;
// The common case after we commuted the shuffle is that the RHS is a splat
// and we have elements coming in from the splat at indices that are not
// conducive to using a merge.
// Example:
// vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>
- if (!isSplatBV(RHS))
+ if (!isSplatBV(TheSplat))
return Res;
// We are looking for a mask such that all even elements are from
@@ -14089,24 +14618,41 @@
// Adjust the mask so we are pulling in the same index from the splat
// as the index from the interesting vector in consecutive elements.
- // Example (even elements from first vector):
- // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
- if (Mask[0] < NumElts)
- for (int i = 1, e = Mask.size(); i < e; i += 2)
- ShuffV[i] = (ShuffV[i - 1] + NumElts);
- // Example (odd elements from first vector):
- // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
- else
- for (int i = 0, e = Mask.size(); i < e; i += 2)
- ShuffV[i] = (ShuffV[i + 1] + NumElts);
+ if (IsLittleEndian) {
+ // Example (even elements from first vector):
+ // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
+ if (Mask[0] < NumElts)
+ for (int i = 1, e = Mask.size(); i < e; i += 2)
+ ShuffV[i] = (ShuffV[i - 1] + NumElts);
+ // Example (odd elements from first vector):
+ // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
+ else
+ for (int i = 0, e = Mask.size(); i < e; i += 2)
+ ShuffV[i] = (ShuffV[i + 1] + NumElts);
+ } else {
+ // Example (even elements from first vector):
+ // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> <zero>, t1
+ if (Mask[0] < NumElts)
+ for (int i = 0, e = Mask.size(); i < e; i += 2)
+ ShuffV[i] = ShuffV[i + 1] - NumElts;
+ // Example (odd elements from first vector):
+ // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> <zero>, t1
+ else
+ for (int i = 1, e = Mask.size(); i < e; i += 2)
+ ShuffV[i] = ShuffV[i - 1] - NumElts;
+ }
// If the splat input has undefs, we need to remove them since we may have
// created a shuffle that adds those instead of the splat value.
- SDValue SplatVal = cast<BuildVectorSDNode>(RHS.getNode())->getSplatValue();
- RHS = DAG.getSplatBuildVector(RHS.getValueType(), dl, SplatVal);
+ SDValue SplatVal =
+ cast<BuildVectorSDNode>(TheSplat.getNode())->getSplatValue();
+ TheSplat = DAG.getSplatBuildVector(TheSplat.getValueType(), dl, SplatVal);
- Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
- return Res;
+ if (IsLittleEndian)
+ RHS = TheSplat;
+ else
+ LHS = TheSplat;
+ return DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
}
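// Illustrative sketch, not part of the patch: the splat-lane adjustment done
// above for the little-endian, even-elements-from-the-first-vector case. The
// mask below is a hypothetical v8i16 shuffle mask (NumElts = 8) whose odd
// lanes come from a splat in the second operand.
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8;
  std::vector<int> ShuffV = {0, 9, 1, 11, 2, 13, 3, 15};
  // Pull the splat lane that matches the preceding element so consecutive
  // entries form pairs (i, i + NumElts), which is amenable to a merge.
  for (size_t i = 1; i < ShuffV.size(); i += 2)
    ShuffV[i] = ShuffV[i - 1] + NumElts;
  for (int Idx : ShuffV)
    printf("%d ", Idx); // prints: 0 8 1 9 2 10 3 11
  printf("\n");
  return 0;
}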
SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
@@ -14145,7 +14691,15 @@
return SDValue();
if (LSBase->getOpcode() == ISD::LOAD) {
- SDLoc dl(SVN);
+ // If result value 0 of the load has any user other than the
+ // shufflevector instruction, it is not profitable to replace the
+ // shufflevector with a reverse load.
+ for (SDNode::use_iterator UI = LSBase->use_begin(), UE = LSBase->use_end();
+ UI != UE; ++UI)
+ if (UI.getUse().getResNo() == 0 && UI->getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+
+ SDLoc dl(LSBase);
SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
return DAG.getMemIntrinsicNode(
PPCISD::LOAD_VEC_BE, dl, DAG.getVTList(VT, MVT::Other), LoadOps,
@@ -14153,6 +14707,12 @@
}
if (LSBase->getOpcode() == ISD::STORE) {
+ // If there are other uses of the shuffle, the swap cannot be avoided.
+ // Forcing the use of an X-Form (since swapped stores only have
+ // X-Forms) without removing the swap is unprofitable.
+ if (!SVN->hasOneUse())
+ return SDValue();
+
SDLoc dl(LSBase);
SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(0),
LSBase->getBasePtr()};
@@ -14667,13 +15227,17 @@
}
}
break;
- case ISD::BSWAP:
+ case ISD::BSWAP: {
// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
- if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
- N->getOperand(0).hasOneUse() &&
+ // For subtargets without LDBRX, we can still do better than the default
+ // expansion even for 64-bit BSWAP (LOAD).
+ bool Is64BitBswapOn64BitTgt =
+ Subtarget.isPPC64() && N->getValueType(0) == MVT::i64;
+ bool IsSingleUseNormalLd = ISD::isNormalLoad(N->getOperand(0).getNode()) &&
+ N->getOperand(0).hasOneUse();
+ if (IsSingleUseNormalLd &&
(N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
- (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
- N->getValueType(0) == MVT::i64))) {
+ (Subtarget.hasLDBRX() && Is64BitBswapOn64BitTgt))) {
SDValue Load = N->getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(Load);
// Create the byte-swapping load.
@@ -14704,7 +15268,37 @@
// Return N so it doesn't get rechecked!
return SDValue(N, 0);
}
- break;
+ // Convert this to two 32-bit bswap loads and a BUILD_PAIR. Do this only
+ // before legalization so that the BUILD_PAIR is handled correctly.
+ if (!DCI.isBeforeLegalize() || !Is64BitBswapOn64BitTgt ||
+ !IsSingleUseNormalLd)
+ return SDValue();
+ LoadSDNode *LD = cast<LoadSDNode>(N->getOperand(0));
+
+ // Can't split volatile or atomic loads.
+ if (!LD->isSimple())
+ return SDValue();
+ SDValue BasePtr = LD->getBasePtr();
+ SDValue Lo = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr,
+ LD->getPointerInfo(), LD->getAlignment());
+ Lo = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Lo);
+ BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getIntPtrConstant(4, dl));
+ MachineMemOperand *NewMMO = DAG.getMachineFunction().getMachineMemOperand(
+ LD->getMemOperand(), 4, 4);
+ SDValue Hi = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr, NewMMO);
+ Hi = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Hi);
+ SDValue Res;
+ if (Subtarget.isLittleEndian())
+ Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Hi, Lo);
+ else
+ Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+ SDValue TF =
+ DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Hi.getOperand(0).getValue(1), Lo.getOperand(0).getValue(1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), TF);
+ return Res;
+ }
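// Illustrative sketch, not part of the patch: why byte-swapping the two
// 32-bit halves and pairing them (high result word taken from the load at the
// base pointer on a little-endian target) equals a full 64-bit byte swap.
// Plain memcpy loads stand in for the DAG nodes; the check below holds on a
// little-endian host.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint8_t Mem[8] = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
  uint32_t Lo, Hi;
  memcpy(&Lo, Mem, 4);     // load at the base pointer
  memcpy(&Hi, Mem + 4, 4); // load at base + 4
  uint64_t Paired = ((uint64_t)__builtin_bswap32(Lo) << 32) |
                    __builtin_bswap32(Hi);
  uint64_t Full;
  memcpy(&Full, Mem, 8);
  printf("%d\n", Paired == __builtin_bswap64(Full)); // prints: 1
  return 0;
}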
case PPCISD::VCMP:
// If a VCMP_rec node already exists with exactly the same operands as this
// node, use its result instead of this node (VCMP_rec computes both a CR6
@@ -15141,7 +15735,13 @@
} else if ((Constraint == "wa" || Constraint == "wd" ||
Constraint == "wf" || Constraint == "wi") &&
Subtarget.hasVSX()) {
- return std::make_pair(0U, &PPC::VSRCRegClass);
+ // A VSX register for either a scalar (FP) or vector. There is no
+ // support for single precision scalars on subtargets prior to Power8.
+ if (VT.isVector())
+ return std::make_pair(0U, &PPC::VSRCRegClass);
+ if (VT == MVT::f32 && Subtarget.hasP8Vector())
+ return std::make_pair(0U, &PPC::VSSRCRegClass);
+ return std::make_pair(0U, &PPC::VSFRCRegClass);
} else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {
if (VT == MVT::f32 && Subtarget.hasP8Vector())
return std::make_pair(0U, &PPC::VSSRCRegClass);
@@ -15202,10 +15802,19 @@
&PPC::G8RCRegClass);
// GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
- if (!R.second && StringRef("{cc}").equals_lower(Constraint)) {
+ if (!R.second && StringRef("{cc}").equals_insensitive(Constraint)) {
R.first = PPC::CR0;
R.second = &PPC::CRRCRegClass;
}
+ // FIXME: This warning should ideally be emitted in the front end.
+ const auto &TM = getTargetMachine();
+ if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI()) {
+ if (((R.first >= PPC::V20 && R.first <= PPC::V31) ||
+ (R.first >= PPC::VF20 && R.first <= PPC::VF31)) &&
+ (R.second == &PPC::VSRCRegClass || R.second == &PPC::VSFRCRegClass))
+ errs() << "warning: vector registers 20 to 32 are reserved in the "
+ "default AIX AltiVec ABI and cannot be used\n";
+ }
return R;
}
@@ -15452,6 +16061,22 @@
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
+ case Intrinsic::ppc_atomicrmw_xchg_i128:
+ case Intrinsic::ppc_atomicrmw_add_i128:
+ case Intrinsic::ppc_atomicrmw_sub_i128:
+ case Intrinsic::ppc_atomicrmw_nand_i128:
+ case Intrinsic::ppc_atomicrmw_and_i128:
+ case Intrinsic::ppc_atomicrmw_or_i128:
+ case Intrinsic::ppc_atomicrmw_xor_i128:
+ case Intrinsic::ppc_cmpxchg_i128:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i128;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = Align(16);
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile;
+ return true;
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_altivec_lvebx:
@@ -15622,9 +16247,7 @@
return isInt<16>(Imm) || isUInt<16>(Imm);
}
-bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned,
- unsigned,
+bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Align,
MachineMemOperand::Flags,
bool *Fast) const {
if (DisablePPCUnaligned)
@@ -15894,12 +16517,24 @@
return true;
}
-// Override to disable global variable loading on Linux.
+// Override to disable global variable loading on Linux and insert AIX canary
+// word declaration.
void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
+ if (Subtarget.isAIXABI()) {
+ M.getOrInsertGlobal(AIXSSPCanaryWordName,
+ Type::getInt8PtrTy(M.getContext()));
+ return;
+ }
if (!Subtarget.isTargetLinux())
return TargetLowering::insertSSPDeclarations(M);
}
+Value *PPCTargetLowering::getSDagStackGuard(const Module &M) const {
+ if (Subtarget.isAIXABI())
+ return M.getGlobalVariable(AIXSSPCanaryWordName);
+ return TargetLowering::getSDagStackGuard(M);
+}
+
bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
if (!VT.isSimple() || !Subtarget.hasVSX())
@@ -15913,10 +16548,8 @@
case MVT::f32:
case MVT::f64:
if (Subtarget.hasPrefixInstrs()) {
- // With prefixed instructions, we can materialize anything that can be
- // represented with a 32-bit immediate, not just positive zero.
- APFloat APFloatOfImm = Imm;
- return convertToNonDenormSingle(APFloatOfImm);
+ // We can materialize all immediates via XXSPLTI32DX and XXSPLTIDP.
+ return true;
}
LLVM_FALLTHROUGH;
case MVT::ppcf128:
@@ -16047,9 +16680,7 @@
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue);
SDValue Cmp = RHS.getOperand(0);
SDValue Z = Cmp.getOperand(0);
- auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
-
- assert(Constant && "Constant Should not be a null pointer.");
+ auto *Constant = cast<ConstantSDNode>(Cmp.getOperand(1));
int64_t NegConstant = 0 - Constant->getSExtValue();
switch(cast<CondCodeSDNode>(Cmp.getOperand(2))->get()) {
@@ -16498,3 +17129,463 @@
return SDValue();
}
+
+/// getAddrModeForFlags - Based on the set of address flags, select the
+/// optimal instruction format to match against.
+PPC::AddrMode PPCTargetLowering::getAddrModeForFlags(unsigned Flags) const {
+ // This is not a node we should be handling here.
+ if (Flags == PPC::MOF_None)
+ return PPC::AM_None;
+ // Unaligned D-Forms are tried first, followed by the aligned D-Forms.
+ for (auto FlagSet : AddrModesMap.at(PPC::AM_DForm))
+ if ((Flags & FlagSet) == FlagSet)
+ return PPC::AM_DForm;
+ for (auto FlagSet : AddrModesMap.at(PPC::AM_DSForm))
+ if ((Flags & FlagSet) == FlagSet)
+ return PPC::AM_DSForm;
+ for (auto FlagSet : AddrModesMap.at(PPC::AM_DQForm))
+ if ((Flags & FlagSet) == FlagSet)
+ return PPC::AM_DQForm;
+ // If no other forms are selected, return an X-Form as it is the most
+ // general addressing mode.
+ return PPC::AM_XForm;
+}
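// Illustrative sketch, not part of the patch: the subset test used above.
// "(Flags & FlagSet) == FlagSet" holds only when every bit a candidate
// addressing mode requires is present in the computed flags. The constants
// below mirror a few of the PPC::MOF_* values declared in PPCISelLowering.h.
#include <cstdio>

enum : unsigned {
  SketchSExt = 1u << 0,
  SketchZExt = 1u << 1,
  SketchRPlusSImm16 = 1u << 6,
  SketchWordInt = 1u << 16,
  SketchSubtargetP9 = 1u << 23,
};

static bool matchesMode(unsigned Flags, unsigned Required) {
  return (Flags & Required) == Required;
}

int main() {
  unsigned Flags = SketchZExt | SketchRPlusSImm16 | SketchWordInt |
                   SketchSubtargetP9;
  printf("%d\n", matchesMode(Flags, SketchRPlusSImm16 | SketchWordInt)); // 1
  printf("%d\n", matchesMode(Flags, SketchRPlusSImm16 | SketchSExt));    // 0
  return 0;
}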
+
+/// Set alignment flags based on whether or not the Frame Index is aligned.
+/// Utilized when computing flags for address computation when selecting
+/// load and store instructions.
+static void setAlignFlagsForFI(SDValue N, unsigned &FlagSet,
+ SelectionDAG &DAG) {
+ bool IsAdd = ((N.getOpcode() == ISD::ADD) || (N.getOpcode() == ISD::OR));
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(IsAdd ? N.getOperand(0) : N);
+ if (!FI)
+ return;
+ const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ unsigned FrameIndexAlign = MFI.getObjectAlign(FI->getIndex()).value();
+ // If this is (add $FI, $S16Imm), the alignment flags are already set
+ // based on the immediate. We just need to clear the alignment flags
+ // if the FI alignment is weaker.
+ if ((FrameIndexAlign % 4) != 0)
+ FlagSet &= ~PPC::MOF_RPlusSImm16Mult4;
+ if ((FrameIndexAlign % 16) != 0)
+ FlagSet &= ~PPC::MOF_RPlusSImm16Mult16;
+ // If the address is a plain FrameIndex, set alignment flags based on
+ // FI alignment.
+ if (!IsAdd) {
+ if ((FrameIndexAlign % 4) == 0)
+ FlagSet |= PPC::MOF_RPlusSImm16Mult4;
+ if ((FrameIndexAlign % 16) == 0)
+ FlagSet |= PPC::MOF_RPlusSImm16Mult16;
+ }
+}
+
+/// Given a node, compute flags that are used for address computation when
+/// selecting load and store instructions. The flags computed are stored in
+/// FlagSet. This function takes into account whether the node is a constant,
+/// an ADD, a provably disjoint OR, or neither, and computes the address
+/// flags accordingly.
+static void computeFlagsForAddressComputation(SDValue N, unsigned &FlagSet,
+ SelectionDAG &DAG) {
+ // Set the alignment flags for the node depending on if the node is
+ // 4-byte or 16-byte aligned.
+ auto SetAlignFlagsForImm = [&](uint64_t Imm) {
+ if ((Imm & 0x3) == 0)
+ FlagSet |= PPC::MOF_RPlusSImm16Mult4;
+ if ((Imm & 0xf) == 0)
+ FlagSet |= PPC::MOF_RPlusSImm16Mult16;
+ };
+
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+ // All 32-bit constants can be computed as LIS + Disp.
+ const APInt &ConstImm = CN->getAPIntValue();
+ if (ConstImm.isSignedIntN(32)) { // Flag to handle 32-bit constants.
+ FlagSet |= PPC::MOF_AddrIsSImm32;
+ SetAlignFlagsForImm(ConstImm.getZExtValue());
+ setAlignFlagsForFI(N, FlagSet, DAG);
+ }
+ if (ConstImm.isSignedIntN(34)) // Flag to handle 34-bit constants.
+ FlagSet |= PPC::MOF_RPlusSImm34;
+ else // Let constant materialization handle large constants.
+ FlagSet |= PPC::MOF_NotAddNorCst;
+ } else if (N.getOpcode() == ISD::ADD || provablyDisjointOr(DAG, N)) {
+ // This address can be represented as an addition of:
+ // - Register + Imm16 (possibly a multiple of 4/16)
+ // - Register + Imm34
+ // - Register + PPCISD::Lo
+ // - Register + Register
+ // In any case, we won't have to match this as Base + Zero.
+ SDValue RHS = N.getOperand(1);
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(RHS)) {
+ const APInt &ConstImm = CN->getAPIntValue();
+ if (ConstImm.isSignedIntN(16)) {
+ FlagSet |= PPC::MOF_RPlusSImm16; // Signed 16-bit immediates.
+ SetAlignFlagsForImm(ConstImm.getZExtValue());
+ setAlignFlagsForFI(N, FlagSet, DAG);
+ }
+ if (ConstImm.isSignedIntN(34))
+ FlagSet |= PPC::MOF_RPlusSImm34; // Signed 34-bit immediates.
+ else
+ FlagSet |= PPC::MOF_RPlusR; // Register.
+ } else if (RHS.getOpcode() == PPCISD::Lo &&
+ !cast<ConstantSDNode>(RHS.getOperand(1))->getZExtValue())
+ FlagSet |= PPC::MOF_RPlusLo; // PPCISD::Lo.
+ else
+ FlagSet |= PPC::MOF_RPlusR;
+ } else { // The address computation is not a constant or an addition.
+ setAlignFlagsForFI(N, FlagSet, DAG);
+ FlagSet |= PPC::MOF_NotAddNorCst;
+ }
+}
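// Illustrative sketch, not part of the patch: the displacement alignment
// classification applied by the SetAlignFlagsForImm lambda above. A multiple
// of 16 qualifies for both the Mult4 and Mult16 flags (DS- and DQ-Form
// candidates); a multiple of 4 qualifies only for Mult4.
#include <cstdio>

int main() {
  const unsigned long long Imms[] = {32, 12, 6};
  for (unsigned long long Imm : Imms) {
    bool Mult4 = (Imm & 0x3) == 0;
    bool Mult16 = (Imm & 0xf) == 0;
    printf("imm=%llu mult4=%d mult16=%d\n", Imm, Mult4, Mult16);
  }
  // Output: imm=32 mult4=1 mult16=1
  //         imm=12 mult4=1 mult16=0
  //         imm=6  mult4=0 mult16=0
  return 0;
}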
+
+/// computeMOFlags - Given a node N and its Parent (a MemSDNode), compute
+/// the address flags of the load/store instruction that is to be matched.
+unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
+ SelectionDAG &DAG) const {
+ unsigned FlagSet = PPC::MOF_None;
+
+ // Compute subtarget flags.
+ if (!Subtarget.hasP9Vector())
+ FlagSet |= PPC::MOF_SubtargetBeforeP9;
+ else {
+ FlagSet |= PPC::MOF_SubtargetP9;
+ if (Subtarget.hasPrefixInstrs())
+ FlagSet |= PPC::MOF_SubtargetP10;
+ }
+ if (Subtarget.hasSPE())
+ FlagSet |= PPC::MOF_SubtargetSPE;
+
+ // Mark this as something we don't want to handle here if it is an atomic
+ // or a pre-increment instruction.
+ if (const LSBaseSDNode *LSB = dyn_cast<LSBaseSDNode>(Parent))
+ if (LSB->isIndexed())
+ return PPC::MOF_None;
+
+ // Compute in-memory type flags. This is based on if there are scalars,
+ // floats or vectors.
+ const MemSDNode *MN = dyn_cast<MemSDNode>(Parent);
+ assert(MN && "Parent should be a MemSDNode!");
+ EVT MemVT = MN->getMemoryVT();
+ unsigned Size = MemVT.getSizeInBits();
+ if (MemVT.isScalarInteger()) {
+ assert(Size <= 64 && "Not expecting scalar integers larger than 8 bytes!");
+ if (Size < 32)
+ FlagSet |= PPC::MOF_SubWordInt;
+ else if (Size == 32)
+ FlagSet |= PPC::MOF_WordInt;
+ else
+ FlagSet |= PPC::MOF_DoubleWordInt;
+ } else if (MemVT.isVector() && !MemVT.isFloatingPoint()) { // Integer vectors.
+ if (Size == 128)
+ FlagSet |= PPC::MOF_Vector;
+ else if (Size == 256)
+ FlagSet |= PPC::MOF_Vector256;
+ else
+ llvm_unreachable("Not expecting illegal vectors!");
+ } else { // Floating point type: can be scalar, f128 or vector types.
+ if (Size == 32 || Size == 64)
+ FlagSet |= PPC::MOF_ScalarFloat;
+ else if (MemVT == MVT::f128 || MemVT.isVector())
+ FlagSet |= PPC::MOF_Vector;
+ else
+ llvm_unreachable("Not expecting illegal scalar floats!");
+ }
+
+ // Compute flags for address computation.
+ computeFlagsForAddressComputation(N, FlagSet, DAG);
+
+ // Compute type extension flags.
+ if (const LoadSDNode *LN = dyn_cast<LoadSDNode>(Parent)) {
+ switch (LN->getExtensionType()) {
+ case ISD::SEXTLOAD:
+ FlagSet |= PPC::MOF_SExt;
+ break;
+ case ISD::EXTLOAD:
+ case ISD::ZEXTLOAD:
+ FlagSet |= PPC::MOF_ZExt;
+ break;
+ case ISD::NON_EXTLOAD:
+ FlagSet |= PPC::MOF_NoExt;
+ break;
+ }
+ } else
+ FlagSet |= PPC::MOF_NoExt;
+
+ // For integers, no extension is the same as zero extension.
+ // We set the extension mode to zero extension so we don't have
+ // to add separate entries in AddrModesMap for loads and stores.
+ if (MemVT.isScalarInteger() && (FlagSet & PPC::MOF_NoExt)) {
+ FlagSet |= PPC::MOF_ZExt;
+ FlagSet &= ~PPC::MOF_NoExt;
+ }
+
+ // If we don't have prefixed instructions, 34-bit constants should be
+ // treated as PPC::MOF_NotAddNorCst so they can match D-Forms.
+ bool IsNonP1034BitConst =
+ ((PPC::MOF_RPlusSImm34 | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubtargetP10) &
+ FlagSet) == PPC::MOF_RPlusSImm34;
+ if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::OR &&
+ IsNonP1034BitConst)
+ FlagSet |= PPC::MOF_NotAddNorCst;
+
+ return FlagSet;
+}
+
+/// SelectForceXFormMode - Given the specified address, force it to be
+/// represented as an indexed [r+r] operation (an XForm instruction).
+PPC::AddrMode PPCTargetLowering::SelectForceXFormMode(SDValue N, SDValue &Disp,
+ SDValue &Base,
+ SelectionDAG &DAG) const {
+
+ PPC::AddrMode Mode = PPC::AM_XForm;
+ int16_t ForceXFormImm = 0;
+ if (provablyDisjointOr(DAG, N) &&
+ !isIntS16Immediate(N.getOperand(1), ForceXFormImm)) {
+ Disp = N.getOperand(0);
+ Base = N.getOperand(1);
+ return Mode;
+ }
+
+ // If the address is the result of an add, we will utilize the fact that the
+ // address calculation includes an implicit add. However, we can reduce
+ // register pressure if we do not materialize a constant just for use as the
+ // index register. We therefore only fold the add away (using its operands
+ // directly as base and index) when it is not an add of a value and a
+ // 16-bit signed constant where both operands have a single use.
+ if (N.getOpcode() == ISD::ADD &&
+ (!isIntS16Immediate(N.getOperand(1), ForceXFormImm) ||
+ !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
+ Disp = N.getOperand(0);
+ Base = N.getOperand(1);
+ return Mode;
+ }
+
+ // Otherwise, use R0 as the base register.
+ Disp = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+ N.getValueType());
+ Base = N;
+
+ return Mode;
+}
+
+// If we happen to match to an aligned D-Form, check if the Frame Index is
+// adequately aligned. If it is not, reset the mode to match to X-Form.
+static void setXFormForUnalignedFI(SDValue N, unsigned Flags,
+ PPC::AddrMode &Mode) {
+ if (!isa<FrameIndexSDNode>(N))
+ return;
+ if ((Mode == PPC::AM_DSForm && !(Flags & PPC::MOF_RPlusSImm16Mult4)) ||
+ (Mode == PPC::AM_DQForm && !(Flags & PPC::MOF_RPlusSImm16Mult16)))
+ Mode = PPC::AM_XForm;
+}
+
+/// SelectOptimalAddrMode - Based on a node N and its Parent (a MemSDNode),
+/// compute the address flags of the node, get the optimal address mode based
+/// on the flags, and set the Base and Disp based on the address mode.
+PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
+ SDValue N, SDValue &Disp,
+ SDValue &Base,
+ SelectionDAG &DAG,
+ MaybeAlign Align) const {
+ SDLoc DL(Parent);
+
+ // Compute the address flags.
+ unsigned Flags = computeMOFlags(Parent, N, DAG);
+
+ // Get the optimal address mode based on the Flags.
+ PPC::AddrMode Mode = getAddrModeForFlags(Flags);
+
+ // If the address mode is DS-Form or DQ-Form, check if the FI is aligned.
+ // Select an X-Form load if it is not.
+ setXFormForUnalignedFI(N, Flags, Mode);
+
+ // Set Base and Disp accordingly depending on the address mode.
+ switch (Mode) {
+ case PPC::AM_DForm:
+ case PPC::AM_DSForm:
+ case PPC::AM_DQForm: {
+ // This is a register plus a 16-bit immediate. The base will be the
+ // register and the displacement will be the immediate unless it
+ // isn't sufficiently aligned.
+ if (Flags & PPC::MOF_RPlusSImm16) {
+ SDValue Op0 = N.getOperand(0);
+ SDValue Op1 = N.getOperand(1);
+ int16_t Imm = cast<ConstantSDNode>(Op1)->getAPIntValue().getZExtValue();
+ if (!Align || isAligned(*Align, Imm)) {
+ Disp = DAG.getTargetConstant(Imm, DL, N.getValueType());
+ Base = Op0;
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op0)) {
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+ }
+ break;
+ }
+ }
+ // This is a register plus the @lo relocation. The base is the register
+ // and the displacement is the global address.
+ else if (Flags & PPC::MOF_RPlusLo) {
+ Disp = N.getOperand(1).getOperand(0); // The global address.
+ assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+ Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
+ Disp.getOpcode() == ISD::TargetConstantPool ||
+ Disp.getOpcode() == ISD::TargetJumpTable);
+ Base = N.getOperand(0);
+ break;
+ }
+ // This is a constant address of at most 32 bits. The base will be
+ // zero or load-immediate-shifted and the displacement will be
+ // the low 16 bits of the address.
+ else if (Flags & PPC::MOF_AddrIsSImm32) {
+ auto *CN = cast<ConstantSDNode>(N);
+ EVT CNType = CN->getValueType(0);
+ uint64_t CNImm = CN->getZExtValue();
+ // If this address fits entirely in a 16-bit sext immediate field, codegen
+ // this as "d, 0".
+ int16_t Imm;
+ if (isIntS16Immediate(CN, Imm) && (!Align || isAligned(*Align, Imm))) {
+ Disp = DAG.getTargetConstant(Imm, DL, CNType);
+ Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+ CNType);
+ break;
+ }
+ // Handle 32-bit sext immediate with LIS + Addr mode.
+ if ((CNType == MVT::i32 || isInt<32>(CNImm)) &&
+ (!Align || isAligned(*Align, CNImm))) {
+ int32_t Addr = (int32_t)CNImm;
+ // Otherwise, break this down into LIS + Disp.
+ Disp = DAG.getTargetConstant((int16_t)Addr, DL, MVT::i32);
+ Base =
+ DAG.getTargetConstant((Addr - (int16_t)Addr) >> 16, DL, MVT::i32);
+ uint32_t LIS = CNType == MVT::i32 ? PPC::LIS : PPC::LIS8;
+ Base = SDValue(DAG.getMachineNode(LIS, DL, CNType, Base), 0);
+ break;
+ }
+ }
+ // Otherwise, the PPC::MOF_NotAddNorCst flag is set. The load/store is
+ // non-foldable.
+ Disp = DAG.getTargetConstant(0, DL, getPointerTy(DAG.getDataLayout()));
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+ } else
+ Base = N;
+ break;
+ }
+ case PPC::AM_None:
+ break;
+ default: { // By default, X-Form is always available to be selected.
+ // When a frame index is not aligned, we also match by XForm.
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N);
+ Base = FI ? N : N.getOperand(1);
+ Disp = FI ? DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+ N.getValueType())
+ : N.getOperand(0);
+ break;
+ }
+ }
+ return Mode;
+}
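// Illustrative sketch, not part of the patch: the LIS + displacement split of
// a 32-bit constant address used above. Subtracting the sign-extended low 16
// bits before shifting compensates for the D-Form displacement being sign
// extended, so (high << 16) + sext(low) reproduces the original address. The
// address value is an arbitrary example.
#include <cstdint>
#include <cstdio>

int main() {
  int32_t Addr = 0x12348000; // low half becomes negative when sign extended
  int16_t Low = (int16_t)Addr;       // -32768
  int32_t High = (Addr - Low) >> 16; // 0x1235 rather than 0x1234
  int32_t Rebuilt = (High << 16) + Low;
  printf("%d\n", Rebuilt == Addr); // prints: 1
  return 0;
}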
+
+CCAssignFn *PPCTargetLowering::ccAssignFnForCall(CallingConv::ID CC,
+ bool Return,
+ bool IsVarArg) const {
+ switch (CC) {
+ case CallingConv::Cold:
+ return (Return ? RetCC_PPC_Cold : CC_PPC64_ELF_FIS);
+ default:
+ return CC_PPC64_ELF_FIS;
+ }
+}
+
+TargetLowering::AtomicExpansionKind
+PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+ if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && Size == 128)
+ return AtomicExpansionKind::MaskedIntrinsic;
+ return TargetLowering::shouldExpandAtomicRMWInIR(AI);
+}
+
+TargetLowering::AtomicExpansionKind
+PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
+ unsigned Size = AI->getPointerOperand()
+ ->getType()
+ ->getPointerElementType()
+ ->getPrimitiveSizeInBits();
+ if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && Size == 128)
+ return AtomicExpansionKind::MaskedIntrinsic;
+ return TargetLowering::shouldExpandAtomicCmpXchgInIR(AI);
+}
+
+static Intrinsic::ID
+getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) {
+ switch (BinOp) {
+ default:
+ llvm_unreachable("Unexpected AtomicRMW BinOp");
+ case AtomicRMWInst::Xchg:
+ return Intrinsic::ppc_atomicrmw_xchg_i128;
+ case AtomicRMWInst::Add:
+ return Intrinsic::ppc_atomicrmw_add_i128;
+ case AtomicRMWInst::Sub:
+ return Intrinsic::ppc_atomicrmw_sub_i128;
+ case AtomicRMWInst::And:
+ return Intrinsic::ppc_atomicrmw_and_i128;
+ case AtomicRMWInst::Or:
+ return Intrinsic::ppc_atomicrmw_or_i128;
+ case AtomicRMWInst::Xor:
+ return Intrinsic::ppc_atomicrmw_xor_i128;
+ case AtomicRMWInst::Nand:
+ return Intrinsic::ppc_atomicrmw_nand_i128;
+ }
+}
+
+Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
+ IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
+ Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
+ assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() &&
+ "Only support quadword now");
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ Type *ValTy = cast<PointerType>(AlignedAddr->getType())->getElementType();
+ assert(ValTy->getPrimitiveSizeInBits() == 128);
+ Function *RMW = Intrinsic::getDeclaration(
+ M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation()));
+ Type *Int64Ty = Type::getInt64Ty(M->getContext());
+ Value *IncrLo = Builder.CreateTrunc(Incr, Int64Ty, "incr_lo");
+ Value *IncrHi =
+ Builder.CreateTrunc(Builder.CreateLShr(Incr, 64), Int64Ty, "incr_hi");
+ Value *Addr =
+ Builder.CreateBitCast(AlignedAddr, Type::getInt8PtrTy(M->getContext()));
+ Value *LoHi = Builder.CreateCall(RMW, {Addr, IncrLo, IncrHi});
+ Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+ Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+ Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+ Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+ return Builder.CreateOr(
+ Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
+}
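// Illustrative sketch, not part of the patch: reassembling the {lo, hi} pair
// returned by the quadword atomic intrinsics into a single 128-bit value, as
// the zext/shl/or sequence above does at the IR level. __int128 requires a
// 64-bit target; the values are arbitrary examples.
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Lo = 0x1122334455667788ULL;
  uint64_t Hi = 0x99aabbccddeeff00ULL;
  unsigned __int128 Val =
      (unsigned __int128)Lo | ((unsigned __int128)Hi << 64);
  printf("%d\n", (uint64_t)(Val >> 64) == Hi && (uint64_t)Val == Lo); // 1
  return 0;
}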
+
+Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
+ IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
+ Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
+ assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() &&
+ "Only support quadword now");
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ Type *ValTy = cast<PointerType>(AlignedAddr->getType())->getElementType();
+ assert(ValTy->getPrimitiveSizeInBits() == 128);
+ Function *IntCmpXchg =
+ Intrinsic::getDeclaration(M, Intrinsic::ppc_cmpxchg_i128);
+ Type *Int64Ty = Type::getInt64Ty(M->getContext());
+ Value *CmpLo = Builder.CreateTrunc(CmpVal, Int64Ty, "cmp_lo");
+ Value *CmpHi =
+ Builder.CreateTrunc(Builder.CreateLShr(CmpVal, 64), Int64Ty, "cmp_hi");
+ Value *NewLo = Builder.CreateTrunc(NewVal, Int64Ty, "new_lo");
+ Value *NewHi =
+ Builder.CreateTrunc(Builder.CreateLShr(NewVal, 64), Int64Ty, "new_hi");
+ Value *Addr =
+ Builder.CreateBitCast(AlignedAddr, Type::getInt8PtrTy(M->getContext()));
+ emitLeadingFence(Builder, CI, Ord);
+ Value *LoHi =
+ Builder.CreateCall(IntCmpXchg, {Addr, CmpLo, CmpHi, NewLo, NewHi});
+ emitTrailingFence(Builder, CI, Ord);
+ Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+ Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+ Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+ Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+ return Builder.CreateOr(
+ Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
+}
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 836c52b..87579ba 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -355,6 +355,15 @@
/// register assignment.
ADDI_TLSGD_L_ADDR,
+ /// GPRC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY
+ /// G8RC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY
+ /// Op that combines two register copies of TOC entries
+ /// (region handle into R3 and variable offset into R4) followed by a
+ /// GET_TLS_ADDR node which will be expanded to a call to __tls_get_addr.
+ /// This node is used in 64-bit mode as well (in which case the result is
+ /// G8RC and inputs are X3/X4).
+ TLSGD_AIX,
+
/// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS
/// model, produces an ADDIS8 instruction that adds the GOT base
/// register to sym\@got\@tlsld\@ha.
@@ -662,6 +671,49 @@
/// the number of bytes of each element [124] -> [bhw].
SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+ // Flags for computing the optimal addressing mode for loads and stores.
+ enum MemOpFlags {
+ MOF_None = 0,
+
+ // Extension mode for integer loads.
+ MOF_SExt = 1,
+ MOF_ZExt = 1 << 1,
+ MOF_NoExt = 1 << 2,
+
+ // Address computation flags.
+ MOF_NotAddNorCst = 1 << 5, // Not const. or sum of ptr and scalar.
+ MOF_RPlusSImm16 = 1 << 6, // Reg plus signed 16-bit constant.
+ MOF_RPlusLo = 1 << 7, // Reg plus signed 16-bit relocation
+ MOF_RPlusSImm16Mult4 = 1 << 8, // Reg plus 16-bit signed multiple of 4.
+ MOF_RPlusSImm16Mult16 = 1 << 9, // Reg plus 16-bit signed multiple of 16.
+ MOF_RPlusSImm34 = 1 << 10, // Reg plus 34-bit signed constant.
+ MOF_RPlusR = 1 << 11, // Sum of two variables.
+ MOF_PCRel = 1 << 12, // PC-Relative relocation.
+ MOF_AddrIsSImm32 = 1 << 13, // A simple 32-bit constant.
+
+ // The in-memory type.
+ MOF_SubWordInt = 1 << 15,
+ MOF_WordInt = 1 << 16,
+ MOF_DoubleWordInt = 1 << 17,
+ MOF_ScalarFloat = 1 << 18, // Scalar single or double precision.
+ MOF_Vector = 1 << 19, // Vector types and quad precision scalars.
+ MOF_Vector256 = 1 << 20,
+
+ // Subtarget features.
+ MOF_SubtargetBeforeP9 = 1 << 22,
+ MOF_SubtargetP9 = 1 << 23,
+ MOF_SubtargetP10 = 1 << 24,
+ MOF_SubtargetSPE = 1 << 25
+ };
+
+ // The addressing modes for loads and stores.
+ enum AddrMode {
+ AM_None,
+ AM_DForm,
+ AM_DSForm,
+ AM_DQForm,
+ AM_XForm,
+ };
} // end namespace PPC
class PPCTargetLowering : public TargetLowering {
@@ -690,7 +742,8 @@
/// then the VPERM for the shuffle. All in all a very slow sequence.
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)
const override {
- if (VT.getVectorNumElements() != 1 && VT.getScalarSizeInBits() % 8 == 0)
+ if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
+ VT.getScalarSizeInBits() % 8 == 0)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
@@ -737,7 +790,7 @@
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
- /// Return true if target always beneficiates from combining into FMA for a
+ /// Return true if target always benefits from combining into FMA for a
/// given value type. This must typically return false on targets where FMA
/// takes more cycles to execute than FADD.
bool enableAggressiveFMAFusion(EVT VT) const override;
@@ -818,11 +871,28 @@
return true;
}
- Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
+ TargetLowering::AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+ TargetLowering::AtomicExpansionKind
+ shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
+ Value *emitMaskedAtomicRMWIntrinsic(IRBuilderBase &Builder,
+ AtomicRMWInst *AI, Value *AlignedAddr,
+ Value *Incr, Value *Mask,
+ Value *ShiftAmt,
+ AtomicOrdering Ord) const override;
+ Value *emitMaskedAtomicCmpXchgIntrinsic(IRBuilderBase &Builder,
+ AtomicCmpXchgInst *CI,
+ Value *AlignedAddr, Value *CmpVal,
+ Value *NewVal, Value *Mask,
+ AtomicOrdering Ord) const override;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
@@ -880,8 +950,6 @@
getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "es")
return InlineAsm::Constraint_es;
- else if (ConstraintCode == "o")
- return InlineAsm::Constraint_o;
else if (ConstraintCode == "Q")
return InlineAsm::Constraint_Q;
else if (ConstraintCode == "Z")
@@ -961,7 +1029,7 @@
/// Is unaligned memory access allowed for the given type, and is it fast
/// relative to software emulation.
bool allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace, unsigned Align = 1,
+ EVT VT, unsigned AddrSpace, Align Alignment = Align(1),
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *Fast = nullptr) const override;
@@ -998,7 +1066,8 @@
/// Returns true if an argument of type Ty needs to be passed in a
/// contiguous block of registers in calling convention CallConv.
bool functionArgumentNeedsConsecutiveRegisters(
- Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override {
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg,
+ const DataLayout &DL) const override {
// We support any array type as "consecutive" block in the parameter
// save area. The element type defines the alignment requirement and
// whether the argument should go in GPRs, FPRs, or VRs if available.
@@ -1022,6 +1091,7 @@
/// Override to support customized stack guard loading.
bool useLoadStackGuardNode() const override;
void insertSSPDeclarations(Module &M) const override;
+ Value *getSDagStackGuard(const Module &M) const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
@@ -1034,6 +1104,18 @@
unsigned JTI,
MCContext &Ctx) const override;
+ /// SelectOptimalAddrMode - Based on a node N and its Parent (a MemSDNode),
+ /// compute the address flags of the node, get the optimal address mode
+ /// based on the flags, and set the Base and Disp based on the address mode.
+ PPC::AddrMode SelectOptimalAddrMode(const SDNode *Parent, SDValue N,
+ SDValue &Disp, SDValue &Base,
+ SelectionDAG &DAG,
+ MaybeAlign Align) const;
+ /// SelectForceXFormMode - Given the specified address, force it to be
+ /// represented as an indexed [r+r] operation (an XForm instruction).
+ PPC::AddrMode SelectForceXFormMode(SDValue N, SDValue &Disp, SDValue &Base,
+ SelectionDAG &DAG) const;
+
/// Structure that collects some common arguments that get passed around
/// between the functions for call lowering.
struct CallFlags {
@@ -1052,6 +1134,9 @@
HasNest(HasNest), NoMerge(NoMerge) {}
};
+ CCAssignFn *ccAssignFnForCall(CallingConv::ID CC, bool Return,
+ bool IsVarArg) const;
+
private:
struct ReuseLoadInfo {
SDValue Ptr;
@@ -1076,6 +1161,10 @@
}
};
+ // Map that relates a set of common address flags to PPC addressing modes.
+ std::map<PPC::AddrMode, SmallVector<unsigned, 16>> AddrModesMap;
+ void initializeAddrModeMap();
+
bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI,
SelectionDAG &DAG,
ISD::LoadExtType ET = ISD::NON_EXTLOAD) const;
@@ -1123,6 +1212,8 @@
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddressAIX(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddressLinux(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
@@ -1306,6 +1397,17 @@
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
bool hasBitPreservingFPLogic(EVT VT) const override;
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
+ /// getAddrModeForFlags - Based on the set of address flags, select the
+ /// optimal instruction format to match against.
+ PPC::AddrMode getAddrModeForFlags(unsigned Flags) const;
+
+ /// computeMOFlags - Given a node N and its Parent (a MemSDNode), compute
+ /// the address flags of the load/store instruction that is to be matched.
+ /// The address flags are stored in a map, which is then searched
+ /// through to determine the optimal load/store instruction format.
+ unsigned computeMOFlags(const SDNode *Parent, SDValue N,
+ SelectionDAG &DAG) const;
}; // end class PPCTargetLowering
namespace PPC {
@@ -1322,6 +1424,7 @@
bool convertToNonDenormSingle(APInt &ArgAPInt);
bool convertToNonDenormSingle(APFloat &ArgAPFloat);
+ bool checkConvertToNonDenormSingle(APFloat &ArgAPFloat);
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 03e9d69..92712c5 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -229,53 +229,64 @@
let Defs = [CR0] in {
def ATOMIC_LOAD_ADD_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64",
- [(set i64:$dst, (atomic_load_add_64 xoaddr:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_add_64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_SUB_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64",
- [(set i64:$dst, (atomic_load_sub_64 xoaddr:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_sub_64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_OR_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64",
- [(set i64:$dst, (atomic_load_or_64 xoaddr:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_or_64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_XOR_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64",
- [(set i64:$dst, (atomic_load_xor_64 xoaddr:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_xor_64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_AND_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_i64",
- [(set i64:$dst, (atomic_load_and_64 xoaddr:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_and_64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_NAND_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64",
- [(set i64:$dst, (atomic_load_nand_64 xoaddr:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_nand_64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_MIN_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MIN_I64",
- [(set i64:$dst, (atomic_load_min_64 xoaddr:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_min_64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_MAX_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MAX_I64",
- [(set i64:$dst, (atomic_load_max_64 xoaddr:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_max_64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_UMIN_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMIN_I64",
- [(set i64:$dst, (atomic_load_umin_64 xoaddr:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_umin_64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_UMAX_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMAX_I64",
- [(set i64:$dst, (atomic_load_umax_64 xoaddr:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_umax_64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_CMP_SWAP_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64",
- [(set i64:$dst, (atomic_cmp_swap_64 xoaddr:$ptr, i64:$old, i64:$new))]>;
+ [(set i64:$dst, (atomic_cmp_swap_64 ForceXForm:$ptr, i64:$old, i64:$new))]>;
def ATOMIC_SWAP_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64",
- [(set i64:$dst, (atomic_swap_64 xoaddr:$ptr, i64:$new))]>;
+ [(set i64:$dst, (atomic_swap_64 ForceXForm:$ptr, i64:$new))]>;
}
// Instructions to support atomic operations
let mayLoad = 1, hasSideEffects = 0 in {
def LDARX : XForm_1_memOp<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
"ldarx $rD, $ptr", IIC_LdStLDARX, []>;
+// TODO: Add scheduling info.
+let hasNoSchedulingInfo = 1 in
+def LQARX : XForm_1_memOp<31, 276, (outs g8prc:$RTp), (ins memrr:$ptr),
+ "lqarx $RTp, $ptr", IIC_LdStLQARX, []>, isPPC64;
// Instruction to support lock versions of atomics
// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
def LDARXL : XForm_1<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
"ldarx $rD, $ptr, 1", IIC_LdStLDARX, []>, isRecordForm;
+// TODO: Add scheduling info.
+let hasNoSchedulingInfo = 1 in
+// FIXME: We need to find a way to remove isRecordForm since
+// LQARXL does not really alter CR0.
+def LQARXL : XForm_1<31, 276, (outs g8prc:$RTp), (ins memrr:$ptr),
+ "lqarx $RTp, $ptr, 1", IIC_LdStLQARX, []>,
+ isPPC64, isRecordForm;
let hasExtraDefRegAllocReq = 1 in
def LDAT : X_RD5_RS5_IM5<31, 614, (outs g8rc:$rD), (ins g8rc:$rA, u5imm:$FC),
@@ -283,9 +294,97 @@
Requires<[IsISA3_0]>;
}
-let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
+let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
def STDCX : XForm_1_memOp<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
"stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isRecordForm;
+// TODO: Add scheduling info.
+let hasNoSchedulingInfo = 1 in
+def STQCX : XForm_1_memOp<31, 182, (outs), (ins g8prc:$RSp, memrr:$dst),
+ "stqcx. $RSp, $dst", IIC_LdStSTQCX, []>,
+ isPPC64, isRecordForm;
+}
+
+def SPLIT_QUADWORD : PPCCustomInserterPseudo<(outs g8rc:$lo, g8rc:$hi),
+ (ins g8prc:$src),
+ "#SPLIT_QUADWORD", []>;
+class AtomicRMW128<string asmstr>
+ : PPCPostRAExpPseudo<(outs g8prc:$RTp, g8prc:$scratch),
+ (ins memrr:$ptr, g8rc:$incr_lo, g8rc:$incr_hi),
+ asmstr, []>;
+// The values in the MI's uses must be preserved across the LL/SC loop,
+// so mark both $RTp and $scratch as earlyclobber.
+let mayStore = 1, mayLoad = 1,
+ Defs = [CR0],
+ Constraints = "@earlyclobber $scratch,@earlyclobber $RTp" in {
+// Atomic pseudo instructions expanded post-ra.
+def ATOMIC_SWAP_I128 : AtomicRMW128<"#ATOMIC_SWAP_I128">;
+def ATOMIC_LOAD_ADD_I128 : AtomicRMW128<"#ATOMIC_LOAD_ADD_I128">;
+def ATOMIC_LOAD_SUB_I128 : AtomicRMW128<"#ATOMIC_LOAD_SUB_I128">;
+def ATOMIC_LOAD_AND_I128 : AtomicRMW128<"#ATOMIC_LOAD_AND_I128">;
+def ATOMIC_LOAD_XOR_I128 : AtomicRMW128<"#ATOMIC_LOAD_XOR_I128">;
+def ATOMIC_LOAD_OR_I128 : AtomicRMW128<"#ATOMIC_LOAD_OR_I128">;
+def ATOMIC_LOAD_NAND_I128 : AtomicRMW128<"#ATOMIC_LOAD_NAND_I128">;
+
+def ATOMIC_CMP_SWAP_I128 : PPCPostRAExpPseudo<
+ (outs g8prc:$RTp, g8prc:$scratch),
+ (ins memrr:$ptr, g8rc:$cmp_lo, g8rc:$cmp_hi,
+ g8rc:$new_lo, g8rc:$new_hi),
+ "#ATOMIC_CMP_SWAP_I128", []>;
+}
+
+def : Pat<(int_ppc_atomicrmw_add_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_LOAD_ADD_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_atomicrmw_sub_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_LOAD_SUB_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_atomicrmw_xor_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_LOAD_XOR_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_atomicrmw_and_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_LOAD_AND_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_atomicrmw_nand_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_LOAD_NAND_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_atomicrmw_or_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_LOAD_OR_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_atomicrmw_xchg_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_SWAP_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_cmpxchg_i128 ForceXForm:$ptr,
+ i64:$cmp_lo,
+ i64:$cmp_hi,
+ i64:$new_lo,
+ i64:$new_hi),
+ (SPLIT_QUADWORD (ATOMIC_CMP_SWAP_I128
+ memrr:$ptr,
+ g8rc:$cmp_lo,
+ g8rc:$cmp_hi,
+ g8rc:$new_lo,
+ g8rc:$new_hi))>;
let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC),
@@ -703,11 +802,11 @@
"cmpldi $dst, $src1, $src2",
IIC_IntCompare>, isPPC64;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
- def CMPRB8 : X_BF3_L1_RS5_RS5<31, 192, (outs crbitrc:$BF),
+ def CMPRB8 : X_BF3_L1_RS5_RS5<31, 192, (outs crrc:$BF),
(ins u1imm:$L, g8rc:$rA, g8rc:$rB),
"cmprb $BF, $L, $rA, $rB", IIC_IntCompare, []>,
Requires<[IsISA3_0]>;
- def CMPEQB : X_BF3_RS5_RS5<31, 224, (outs crbitrc:$BF),
+ def CMPEQB : X_BF3_RS5_RS5<31, 224, (outs crrc:$BF),
(ins g8rc:$rA, g8rc:$rB), "cmpeqb $BF, $rA, $rB",
IIC_IntCompare, []>, Requires<[IsISA3_0]>;
}
@@ -810,9 +909,10 @@
"popcntw $rA, $rS", IIC_IntGeneral,
[(set i32:$rA, (ctpop i32:$rS))]>;
-def POPCNTB : XForm_11<31, 122, (outs g8rc:$rA), (ins g8rc:$rS),
- "popcntb $rA, $rS", IIC_IntGeneral,
- [(set i64:$rA, (int_ppc_popcntb i64:$rS))]>;
+let isCodeGenOnly = 1 in
+def POPCNTB8 : XForm_11<31, 122, (outs g8rc:$rA), (ins g8rc:$rS),
+ "popcntb $rA, $rS", IIC_IntGeneral,
+ [(set i64:$rA, (int_ppc_popcntb i64:$rS))]>;
defm DIVD : XOForm_1rcr<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"divd", "$rT, $rA, $rB", IIC_IntDivD,
@@ -835,8 +935,6 @@
"maddld $RT, $RA, $RB, $RC", IIC_IntMulHD,
[(set i32:$RT, (add_without_simm16 (mul_without_simm16 i32:$RA, i32:$RB), i32:$RC))]>,
isPPC64;
-def SETB : XForm_44<31, 128, (outs gprc:$RT), (ins crrc:$BFA),
- "setb $RT, $BFA", IIC_IntGeneral>, isPPC64;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
def MADDLD8 : VAForm_1a<51,
(outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
@@ -989,9 +1087,9 @@
// Disable these aliases on AIX for now because the system assembler does not
// support them.
let Predicates = [ModernAs] in {
- def : InstAlias<"mtudscr $Rx", (MTSPR8 3, g8rc:$Rx)>;
- def : InstAlias<"mfudscr $Rx", (MFSPR8 g8rc:$Rx, 3)>;
-}
+
+def : InstAlias<"mtudscr $Rx", (MTSPR8 3, g8rc:$Rx)>;
+def : InstAlias<"mfudscr $Rx", (MFSPR8 g8rc:$Rx, 3)>;
def : InstAlias<"mfrtcu $Rx", (MFSPR8 g8rc:$Rx, 4)>;
def : InstAlias<"mfrtcl $Rx", (MFSPR8 g8rc:$Rx, 5)>;
@@ -1050,6 +1148,8 @@
def : InstAlias<"mfspefscr $Rx", (MFSPR8 g8rc:$Rx, 512)>;
def : InstAlias<"mtspefscr $Rx", (MTSPR8 512, g8rc:$Rx)>;
+}
+
//===----------------------------------------------------------------------===//
// Load/Store instructions.
//
@@ -1060,21 +1160,21 @@
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src),
"lha $rD, $src", IIC_LdStLHA,
- [(set i64:$rD, (sextloadi16 iaddr:$src))]>,
+ [(set i64:$rD, (sextloadi16 DForm:$src))]>,
PPC970_DGroup_Cracked;
def LWA : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src),
"lwa $rD, $src", IIC_LdStLWA,
[(set i64:$rD,
- (DSFormSextLoadi32 iaddrX4:$src))]>, isPPC64,
+ (sextloadi32 DSForm:$src))]>, isPPC64,
PPC970_DGroup_Cracked;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
def LHAX8: XForm_1_memOp<31, 343, (outs g8rc:$rD), (ins memrr:$src),
"lhax $rD, $src", IIC_LdStLHA,
- [(set i64:$rD, (sextloadi16 xaddr:$src))]>,
+ [(set i64:$rD, (sextloadi16 XForm:$src))]>,
PPC970_DGroup_Cracked;
def LWAX : XForm_1_memOp<31, 341, (outs g8rc:$rD), (ins memrr:$src),
"lwax $rD, $src", IIC_LdStLHA,
- [(set i64:$rD, (sextloadi32 xaddrX4:$src))]>, isPPC64,
+ [(set i64:$rD, (sextloadi32 XForm:$src))]>, isPPC64,
PPC970_DGroup_Cracked;
// For fast-isel:
let isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0 in {
@@ -1115,23 +1215,23 @@
let PPC970_Unit = 2 in {
def LBZ8 : DForm_1<34, (outs g8rc:$rD), (ins memri:$src),
"lbz $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (zextloadi8 iaddr:$src))]>;
+ [(set i64:$rD, (zextloadi8 DForm:$src))]>;
def LHZ8 : DForm_1<40, (outs g8rc:$rD), (ins memri:$src),
"lhz $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (zextloadi16 iaddr:$src))]>;
+ [(set i64:$rD, (zextloadi16 DForm:$src))]>;
def LWZ8 : DForm_1<32, (outs g8rc:$rD), (ins memri:$src),
"lwz $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
+ [(set i64:$rD, (zextloadi32 DForm:$src))]>, isPPC64;
def LBZX8 : XForm_1_memOp<31, 87, (outs g8rc:$rD), (ins memrr:$src),
"lbzx $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (zextloadi8 xaddr:$src))]>;
+ [(set i64:$rD, (zextloadi8 XForm:$src))]>;
def LHZX8 : XForm_1_memOp<31, 279, (outs g8rc:$rD), (ins memrr:$src),
"lhzx $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (zextloadi16 xaddr:$src))]>;
+ [(set i64:$rD, (zextloadi16 XForm:$src))]>;
def LWZX8 : XForm_1_memOp<31, 23, (outs g8rc:$rD), (ins memrr:$src),
"lwzx $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (zextloadi32 xaddr:$src))]>;
+ [(set i64:$rD, (zextloadi32 XForm:$src))]>;
// Update forms.
@@ -1176,7 +1276,7 @@
let PPC970_Unit = 2 in {
def LD : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src),
"ld $rD, $src", IIC_LdStLD,
- [(set i64:$rD, (DSFormLoad iaddrX4:$src))]>, isPPC64;
+ [(set i64:$rD, (load DSForm:$src))]>, isPPC64;
// The following four definitions are selected for small code model only.
// Otherwise, we need to create two instructions to form a 32-bit offset,
// so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select().
@@ -1199,10 +1299,10 @@
def LDX : XForm_1_memOp<31, 21, (outs g8rc:$rD), (ins memrr:$src),
"ldx $rD, $src", IIC_LdStLD,
- [(set i64:$rD, (load xaddrX4:$src))]>, isPPC64;
+ [(set i64:$rD, (load XForm:$src))]>, isPPC64;
def LDBRX : XForm_1_memOp<31, 532, (outs g8rc:$rD), (ins memrr:$src),
"ldbrx $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
+ [(set i64:$rD, (PPClbrx ForceXForm:$src, i64))]>, isPPC64;
let mayLoad = 1, hasSideEffects = 0, isCodeGenOnly = 1 in {
def LHBRX8 : XForm_1_memOp<31, 790, (outs g8rc:$rD), (ins memrr:$src),
@@ -1228,6 +1328,22 @@
"ldmx $rD, $src", IIC_LdStLD, []>, isPPC64,
Requires<[IsISA3_0]>;
}
+
+let mayLoad = 1, hasNoSchedulingInfo = 1 in {
+// Full 16-byte load.
+// Early clobber $RTp to avoid it being assigned the same register as RA.
+// TODO: Add scheduling info.
+def LQ : DQForm_RTp5_RA17_MEM<56, 0,
+ (outs g8prc:$RTp),
+ (ins memrix16:$src),
+ "lq $RTp, $src", IIC_LdStLQ,
+ []>,
+ RegConstraint<"@earlyclobber $RTp">,
+ isPPC64;
+def RESTORE_QUADWORD : PPCEmitTimePseudo<(outs g8prc:$RTp), (ins memrix:$src),
+ "#RESTORE_QUADWORD", []>;
+}
+
}
// Support for medium and large code model.
@@ -1299,6 +1415,18 @@
def GETtlsldADDR : GETtlsldADDRPseudo <"#GETtlsldADDR">;
let Defs = [X0,X2,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
def GETtlsldADDRPCREL : GETtlsldADDRPseudo <"#GETtlsldADDRPCREL">;
+
+// On AIX, the call to __tls_get_addr needs two inputs in X3/X4 for the
+// region handle and variable offset respectively. The call is not followed
+// by a nop
+// so we don't need to mark it with a size of 8 bytes. Finally, the assembly
+// manual mentions this exact set of registers as the clobbered set, others
+// are guaranteed not to be clobbered.
+let Defs = [X0,X4,X5,X11,LR8,CR0] in
+def GETtlsADDR64AIX :
+ PPCEmitTimePseudo<(outs g8rc:$rD),(ins g8rc:$offset, g8rc:$handle),
+ "GETtlsADDR64AIX",
+ [(set i64:$rD,
+ (PPCgetTlsAddr i64:$offset, i64:$handle))]>, isPPC64;
}
// Combined op for ADDItlsgdL and GETtlsADDR, late expanded. X3 and LR8
@@ -1324,6 +1452,13 @@
[(set i64:$rD,
(PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
isPPC64;
+// This pseudo is expanded to two copies to put the variable offset in R4 and
+// the region handle in R3 and GETtlsADDR64AIX.
+def TLSGDAIX8 :
+ PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$offset, g8rc:$handle),
+ "#TLSGDAIX8",
+ [(set i64:$rD,
+ (PPCTlsgdAIX i64:$offset, i64:$handle))]>;
// Combined op for ADDItlsldL and GETtlsADDR, late expanded. X3 and LR8
// are true defines, while the rest of the Defs are clobbers.
let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
@@ -1359,39 +1494,50 @@
// Truncating stores.
def STB8 : DForm_1<38, (outs), (ins g8rc:$rS, memri:$src),
"stb $rS, $src", IIC_LdStStore,
- [(truncstorei8 i64:$rS, iaddr:$src)]>;
+ [(truncstorei8 i64:$rS, DForm:$src)]>;
def STH8 : DForm_1<44, (outs), (ins g8rc:$rS, memri:$src),
"sth $rS, $src", IIC_LdStStore,
- [(truncstorei16 i64:$rS, iaddr:$src)]>;
+ [(truncstorei16 i64:$rS, DForm:$src)]>;
def STW8 : DForm_1<36, (outs), (ins g8rc:$rS, memri:$src),
"stw $rS, $src", IIC_LdStStore,
- [(truncstorei32 i64:$rS, iaddr:$src)]>;
+ [(truncstorei32 i64:$rS, DForm:$src)]>;
def STBX8 : XForm_8_memOp<31, 215, (outs), (ins g8rc:$rS, memrr:$dst),
"stbx $rS, $dst", IIC_LdStStore,
- [(truncstorei8 i64:$rS, xaddr:$dst)]>,
+ [(truncstorei8 i64:$rS, XForm:$dst)]>,
PPC970_DGroup_Cracked;
def STHX8 : XForm_8_memOp<31, 407, (outs), (ins g8rc:$rS, memrr:$dst),
"sthx $rS, $dst", IIC_LdStStore,
- [(truncstorei16 i64:$rS, xaddr:$dst)]>,
+ [(truncstorei16 i64:$rS, XForm:$dst)]>,
PPC970_DGroup_Cracked;
def STWX8 : XForm_8_memOp<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
"stwx $rS, $dst", IIC_LdStStore,
- [(truncstorei32 i64:$rS, xaddr:$dst)]>,
+ [(truncstorei32 i64:$rS, XForm:$dst)]>,
PPC970_DGroup_Cracked;
} // Interpretation64Bit
// Normal 8-byte stores.
def STD : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst),
"std $rS, $dst", IIC_LdStSTD,
- [(DSFormStore i64:$rS, iaddrX4:$dst)]>, isPPC64;
+ [(store i64:$rS, DSForm:$dst)]>, isPPC64;
def STDX : XForm_8_memOp<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
"stdx $rS, $dst", IIC_LdStSTD,
- [(store i64:$rS, xaddrX4:$dst)]>, isPPC64,
+ [(store i64:$rS, XForm:$dst)]>, isPPC64,
PPC970_DGroup_Cracked;
def STDBRX: XForm_8_memOp<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
"stdbrx $rS, $dst", IIC_LdStStore,
- [(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64,
+ [(PPCstbrx i64:$rS, ForceXForm:$dst, i64)]>, isPPC64,
PPC970_DGroup_Cracked;
+
+let mayStore = 1, hasNoSchedulingInfo = 1 in {
+// Normal 16-byte stores.
+// TODO: Add scheduling info.
+def STQ : DSForm_1<62, 2, (outs), (ins g8prc:$RSp, memrix:$dst),
+ "stq $RSp, $dst", IIC_LdStSTQ,
+ []>, isPPC64;
+def SPILL_QUADWORD : PPCEmitTimePseudo<(outs), (ins g8prc:$RSp, memrix:$dst),
+ "#SPILL_QUADWORD", []>;
+}
+
}
// Stores with Update (pre-inc).
@@ -1500,6 +1646,29 @@
[(set f64:$frD, (PPCany_fctiwuz f64:$frB))]>, isPPC64;
}
+// These instructions store a hash computed from the value of the link register
+// and the value of the stack pointer.
+let mayStore = 1 in {
+def HASHST : XForm_XD6_RA5_RB5<31, 722, (outs),
+ (ins g8rc:$RB, memrihash:$D_RA_XD),
+ "hashst $RB, $D_RA_XD", IIC_IntGeneral, []>;
+def HASHSTP : XForm_XD6_RA5_RB5<31, 658, (outs),
+ (ins g8rc:$RB, memrihash:$D_RA_XD),
+ "hashstp $RB, $D_RA_XD", IIC_IntGeneral, []>;
+}
+
+// These instructions check a hash computed from the value of the link register
+// and the value of the stack pointer. The hasSideEffects flag is needed as the
+// instruction may TRAP if the hash does not match the hash stored at the
+// specified address.
+let mayLoad = 1, hasSideEffects = 1 in {
+def HASHCHK : XForm_XD6_RA5_RB5<31, 754, (outs),
+ (ins g8rc:$RB, memrihash:$D_RA_XD),
+ "hashchk $RB, $D_RA_XD", IIC_IntGeneral, []>;
+def HASHCHKP : XForm_XD6_RA5_RB5<31, 690, (outs),
+ (ins g8rc:$RB, memrihash:$D_RA_XD),
+ "hashchkp $RB, $D_RA_XD", IIC_IntGeneral, []>;
+}
//===----------------------------------------------------------------------===//
// Instruction Patterns
@@ -1523,26 +1692,26 @@
(i64not $in)>;
// Extending loads with i64 targets.
-def : Pat<(zextloadi1 iaddr:$src),
- (LBZ8 iaddr:$src)>;
-def : Pat<(zextloadi1 xaddr:$src),
- (LBZX8 xaddr:$src)>;
-def : Pat<(extloadi1 iaddr:$src),
- (LBZ8 iaddr:$src)>;
-def : Pat<(extloadi1 xaddr:$src),
- (LBZX8 xaddr:$src)>;
-def : Pat<(extloadi8 iaddr:$src),
- (LBZ8 iaddr:$src)>;
-def : Pat<(extloadi8 xaddr:$src),
- (LBZX8 xaddr:$src)>;
-def : Pat<(extloadi16 iaddr:$src),
- (LHZ8 iaddr:$src)>;
-def : Pat<(extloadi16 xaddr:$src),
- (LHZX8 xaddr:$src)>;
-def : Pat<(extloadi32 iaddr:$src),
- (LWZ8 iaddr:$src)>;
-def : Pat<(extloadi32 xaddr:$src),
- (LWZX8 xaddr:$src)>;
+def : Pat<(zextloadi1 DForm:$src),
+ (LBZ8 DForm:$src)>;
+def : Pat<(zextloadi1 XForm:$src),
+ (LBZX8 XForm:$src)>;
+def : Pat<(extloadi1 DForm:$src),
+ (LBZ8 DForm:$src)>;
+def : Pat<(extloadi1 XForm:$src),
+ (LBZX8 XForm:$src)>;
+def : Pat<(extloadi8 DForm:$src),
+ (LBZ8 DForm:$src)>;
+def : Pat<(extloadi8 XForm:$src),
+ (LBZX8 XForm:$src)>;
+def : Pat<(extloadi16 DForm:$src),
+ (LHZ8 DForm:$src)>;
+def : Pat<(extloadi16 XForm:$src),
+ (LHZX8 XForm:$src)>;
+def : Pat<(extloadi32 DForm:$src),
+ (LWZ8 DForm:$src)>;
+def : Pat<(extloadi32 XForm:$src),
+ (LWZX8 XForm:$src)>;
// Standard shifts. These are represented separately from the real shifts above
// so that we can distinguish between shifts that allow 6-bit and 7-bit shift
@@ -1592,21 +1761,43 @@
def : Pat<(add i64:$in, (PPChi tblockaddress:$g, 0)),
(ADDIS8 $in, tblockaddress:$g)>;
-// Patterns to match r+r indexed loads and stores for
-// addresses without at least 4-byte alignment.
-def : Pat<(i64 (NonDSFormSextLoadi32 xoaddr:$src)),
- (LWAX xoaddr:$src)>;
-def : Pat<(i64 (NonDSFormLoad xoaddr:$src)),
- (LDX xoaddr:$src)>;
-def : Pat<(NonDSFormStore i64:$rS, xoaddr:$dst),
- (STDX $rS, xoaddr:$dst)>;
+// AIX 64-bit small code model TLS access.
+def : Pat<(i64 (PPCtoc_entry tglobaltlsaddr:$disp, i64:$reg)),
+ (i64 (LDtoc tglobaltlsaddr:$disp, i64:$reg))>;
// 64-bits atomic loads and stores
-def : Pat<(atomic_load_64 iaddrX4:$src), (LD memrix:$src)>;
-def : Pat<(atomic_load_64 xaddrX4:$src), (LDX memrr:$src)>;
+def : Pat<(atomic_load_64 DSForm:$src), (LD memrix:$src)>;
+def : Pat<(atomic_load_64 XForm:$src), (LDX memrr:$src)>;
-def : Pat<(atomic_store_64 iaddrX4:$ptr, i64:$val), (STD g8rc:$val, memrix:$ptr)>;
-def : Pat<(atomic_store_64 xaddrX4:$ptr, i64:$val), (STDX g8rc:$val, memrr:$ptr)>;
+def : Pat<(atomic_store_64 DSForm:$ptr, i64:$val), (STD g8rc:$val, memrix:$ptr)>;
+def : Pat<(atomic_store_64 XForm:$ptr, i64:$val), (STDX g8rc:$val, memrr:$ptr)>;
+
+let Predicates = [IsISA3_0, In64BitMode] in {
+def : Pat<(i64 (int_ppc_cmpeqb g8rc:$a, g8rc:$b)),
+ (i64 (SETB8 (CMPEQB $a, $b)))>;
+def : Pat<(i64 (int_ppc_setb g8rc:$a, g8rc:$b)),
+ (i64 (SETB8 (CMPD $a, $b)))>;
+def : Pat<(i64 (int_ppc_maddhd g8rc:$a, g8rc:$b, g8rc:$c)),
+ (i64 (MADDHD $a, $b, $c))>;
+def : Pat<(i64 (int_ppc_maddhdu g8rc:$a, g8rc:$b, g8rc:$c)),
+ (i64 (MADDHDU $a, $b, $c))>;
+def : Pat<(i64 (int_ppc_maddld g8rc:$a, g8rc:$b, g8rc:$c)),
+ (i64 (MADDLD8 $a, $b, $c))>;
+}
+
+let Predicates = [In64BitMode] in {
+def : Pat<(i64 (int_ppc_mulhd g8rc:$a, g8rc:$b)),
+ (i64 (MULHD $a, $b))>;
+def : Pat<(i64 (int_ppc_mulhdu g8rc:$a, g8rc:$b)),
+ (i64 (MULHDU $a, $b))>;
+def : Pat<(int_ppc_load8r ForceXForm:$ptr),
+ (LDBRX ForceXForm:$ptr)>;
+def : Pat<(int_ppc_store8r g8rc:$a, ForceXForm:$ptr),
+ (STDBRX g8rc:$a, ForceXForm:$ptr)>;
+}
+
+def : Pat<(i64 (int_ppc_cmpb g8rc:$a, g8rc:$b)),
+ (i64 (CMPB8 $a, $b))>;
let Predicates = [IsISA3_0] in {
// DARN (deliver random number)
@@ -1615,14 +1806,20 @@
def : Pat<(int_ppc_darn), (DARN 1)>;
def : Pat<(int_ppc_darnraw), (DARN 2)>;
+class X_RA5_RB5<bits<6> opcode, bits<10> xo, string opc, RegisterOperand ty,
+ InstrItinClass itin, list<dag> pattern>
+ : X_L1_RS5_RS5<opcode, xo, (outs), (ins ty:$rA, ty:$rB, u1imm:$L),
+ !strconcat(opc, " $rA, $rB"), itin, pattern>{
+ let L = 1;
+}
+
class X_L1_RA5_RB5<bits<6> opcode, bits<10> xo, string opc, RegisterOperand ty,
InstrItinClass itin, list<dag> pattern>
: X_L1_RS5_RS5<opcode, xo, (outs), (ins ty:$rA, ty:$rB, u1imm:$L),
!strconcat(opc, " $rA, $rB, $L"), itin, pattern>;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
-def CP_COPY8 : X_L1_RA5_RB5<31, 774, "copy" , g8rc, IIC_LdStCOPY, []>;
-def CP_PASTE8 : X_L1_RA5_RB5<31, 902, "paste" , g8rc, IIC_LdStPASTE, []>;
+def CP_COPY8 : X_RA5_RB5<31, 774, "copy" , g8rc, IIC_LdStCOPY, []>;
def CP_PASTE8_rec : X_L1_RA5_RB5<31, 902, "paste.", g8rc, IIC_LdStPASTE, []>,isRecordForm;
}
@@ -1633,3 +1830,16 @@
def SLBSYNC : XForm_0<31, 338, (outs), (ins), "slbsync", IIC_SprSLBSYNC, []>;
} // IsISA3_0
+
+def : Pat<(int_ppc_stdcx ForceXForm:$dst, g8rc:$A),
+ (STDCX g8rc:$A, ForceXForm:$dst)>;
+def : Pat<(int_ppc_tdw g8rc:$A, g8rc:$B, i32:$IMM),
+ (TD $IMM, $A, $B)>;
+
+// trapd
+def : Pat<(int_ppc_trapd g8rc:$A),
+ (TDI 24, $A, 0)>;
+def : Pat<(i64 (int_ppc_mfspr timm:$SPR)),
+ (MFSPR8 $SPR)>;
+def : Pat<(int_ppc_mtspr timm:$SPR, g8rc:$RT),
+ (MTSPR8 $SPR, $RT)>;
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
index 1a34aa0..2bc7fb2 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -30,11 +30,6 @@
// Altivec transformation functions and pattern fragments.
//
-// Since we canonicalize buildvectors to v16i8, all vnots "-1" operands will be
-// of that type.
-def vnot_ppc : PatFrag<(ops node:$in),
- (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>;
-
def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
@@ -352,7 +347,7 @@
}
def DSSALL : DSS_Form<1, 822, (outs), (ins),
- "dssall", IIC_LdStLoad /*FIXME*/, [(int_ppc_altivec_dssall)]>,
+ "dssall", IIC_LdStLoad /*FIXME*/, []>,
Deprecated<DeprecatedDST> {
let STRM = 0;
let A = 0;
@@ -416,46 +411,46 @@
let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { // Loads.
def LVEBX: XForm_1_memOp<31, 7, (outs vrrc:$vD), (ins memrr:$src),
"lvebx $vD, $src", IIC_LdStLoad,
- [(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
+ [(set v16i8:$vD, (int_ppc_altivec_lvebx ForceXForm:$src))]>;
def LVEHX: XForm_1_memOp<31, 39, (outs vrrc:$vD), (ins memrr:$src),
"lvehx $vD, $src", IIC_LdStLoad,
- [(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>;
+ [(set v8i16:$vD, (int_ppc_altivec_lvehx ForceXForm:$src))]>;
def LVEWX: XForm_1_memOp<31, 71, (outs vrrc:$vD), (ins memrr:$src),
"lvewx $vD, $src", IIC_LdStLoad,
- [(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>;
+ [(set v4i32:$vD, (int_ppc_altivec_lvewx ForceXForm:$src))]>;
def LVX : XForm_1_memOp<31, 103, (outs vrrc:$vD), (ins memrr:$src),
"lvx $vD, $src", IIC_LdStLoad,
- [(set v4i32:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>;
+ [(set v4i32:$vD, (int_ppc_altivec_lvx ForceXForm:$src))]>;
def LVXL : XForm_1_memOp<31, 359, (outs vrrc:$vD), (ins memrr:$src),
"lvxl $vD, $src", IIC_LdStLoad,
- [(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>;
+ [(set v4i32:$vD, (int_ppc_altivec_lvxl ForceXForm:$src))]>;
}
def LVSL : XForm_1_memOp<31, 6, (outs vrrc:$vD), (ins memrr:$src),
"lvsl $vD, $src", IIC_LdStLoad,
- [(set v16i8:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>,
+ [(set v16i8:$vD, (int_ppc_altivec_lvsl ForceXForm:$src))]>,
PPC970_Unit_LSU;
def LVSR : XForm_1_memOp<31, 38, (outs vrrc:$vD), (ins memrr:$src),
"lvsr $vD, $src", IIC_LdStLoad,
- [(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
+ [(set v16i8:$vD, (int_ppc_altivec_lvsr ForceXForm:$src))]>,
PPC970_Unit_LSU;
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { // Stores.
def STVEBX: XForm_8_memOp<31, 135, (outs), (ins vrrc:$rS, memrr:$dst),
"stvebx $rS, $dst", IIC_LdStStore,
- [(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>;
+ [(int_ppc_altivec_stvebx v16i8:$rS, ForceXForm:$dst)]>;
def STVEHX: XForm_8_memOp<31, 167, (outs), (ins vrrc:$rS, memrr:$dst),
"stvehx $rS, $dst", IIC_LdStStore,
- [(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>;
+ [(int_ppc_altivec_stvehx v8i16:$rS, ForceXForm:$dst)]>;
def STVEWX: XForm_8_memOp<31, 199, (outs), (ins vrrc:$rS, memrr:$dst),
"stvewx $rS, $dst", IIC_LdStStore,
- [(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>;
+ [(int_ppc_altivec_stvewx v4i32:$rS, ForceXForm:$dst)]>;
def STVX : XForm_8_memOp<31, 231, (outs), (ins vrrc:$rS, memrr:$dst),
"stvx $rS, $dst", IIC_LdStStore,
- [(int_ppc_altivec_stvx v4i32:$rS, xoaddr:$dst)]>;
+ [(int_ppc_altivec_stvx v4i32:$rS, ForceXForm:$dst)]>;
def STVXL : XForm_8_memOp<31, 487, (outs), (ins vrrc:$rS, memrr:$dst),
"stvxl $rS, $dst", IIC_LdStStore,
- [(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>;
+ [(int_ppc_altivec_stvxl v4i32:$rS, ForceXForm:$dst)]>;
}
let PPC970_Unit = 5 in { // VALU Operations.
@@ -521,7 +516,7 @@
def VANDC : VXForm_1<1092, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vandc $vD, $vA, $vB", IIC_VecFP,
[(set v4i32:$vD, (and v4i32:$vA,
- (vnot_ppc v4i32:$vB)))]>;
+ (vnot v4i32:$vB)))]>;
def VCFSX : VXForm_1<842, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
"vcfsx $vD, $vB, $UIMM", IIC_VecFP,
@@ -684,8 +679,8 @@
def VNOR : VXForm_1<1284, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vnor $vD, $vA, $vB", IIC_VecFP,
- [(set v4i32:$vD, (vnot_ppc (or v4i32:$vA,
- v4i32:$vB)))]>;
+ [(set v4i32:$vD, (vnot (or v4i32:$vA,
+ v4i32:$vB)))]>;
let isCommutable = 1 in {
def VOR : VXForm_1<1156, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vor $vD, $vA, $vB", IIC_VecFP,
@@ -870,6 +865,13 @@
def : InstAlias<"vmr $vD, $vA", (VOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>;
def : InstAlias<"vnot $vD, $vA", (VNOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>;
+// This is a nop on all supported architectures and the AIX assembler
+// doesn't support it (and will not be updated to support it).
+let Predicates = [IsAIX] in
+def : Pat<(int_ppc_altivec_dssall), (NOP)>;
+let Predicates = [NotAIX] in
+def : Pat<(int_ppc_altivec_dssall), (DSSALL)>;
+
// Rotates.
def : Pat<(v16i8 (rotl v16i8:$vA, v16i8:$vB)),
(v16i8 (VRLB v16i8:$vA, v16i8:$vB))>;
@@ -899,11 +901,11 @@
def : Pat<(v4i32 (usubsat v4i32:$vA, v4i32:$vB)), (v4i32 (VSUBUWS $vA, $vB))>;
// Loads.
-def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
+def : Pat<(v4i32 (load ForceXForm:$src)), (LVX ForceXForm:$src)>;
// Stores.
-def : Pat<(store v4i32:$rS, xoaddr:$dst),
- (STVX $rS, xoaddr:$dst)>;
+def : Pat<(store v4i32:$rS, ForceXForm:$dst),
+ (STVX $rS, ForceXForm:$dst)>;
// Bit conversions.
def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>;
@@ -1034,11 +1036,11 @@
(VMRGHW $vB, $vA)>;
// Logical Operations
-def : Pat<(vnot_ppc v4i32:$vA), (VNOR $vA, $vA)>;
+def : Pat<(vnot v4i32:$vA), (VNOR $vA, $vA)>;
-def : Pat<(vnot_ppc (or v4i32:$A, v4i32:$B)),
+def : Pat<(vnot (or v4i32:$A, v4i32:$B)),
(VNOR $A, $B)>;
-def : Pat<(and v4i32:$A, (vnot_ppc v4i32:$B)),
+def : Pat<(and v4i32:$A, (vnot v4i32:$B)),
(VANDC $A, $B)>;
def : Pat<(fmul v4f32:$vA, v4f32:$vB),
@@ -1140,19 +1142,21 @@
(VSEL $vC, $vB, $vA)>;
def : Pat<(v2f64 (vselect v2i64:$vA, v2f64:$vB, v2f64:$vC)),
(VSEL $vC, $vB, $vA)>;
+def : Pat<(v1i128 (vselect v1i128:$vA, v1i128:$vB, v1i128:$vC)),
+ (VSEL $vC, $vB, $vA)>;
// Vector Integer Average Instructions
-def : Pat<(v4i32 (sra (sub v4i32:$vA, (vnot_ppc v4i32:$vB)),
+def : Pat<(v4i32 (sra (sub v4i32:$vA, (vnot v4i32:$vB)),
(v4i32 (immEQOneV)))), (v4i32 (VAVGSW $vA, $vB))>;
-def : Pat<(v8i16 (sra (sub v8i16:$vA, (v8i16 (bitconvert(vnot_ppc v4i32:$vB)))),
+def : Pat<(v8i16 (sra (sub v8i16:$vA, (v8i16 (bitconvert(vnot v4i32:$vB)))),
(v8i16 (immEQOneV)))), (v8i16 (VAVGSH $vA, $vB))>;
-def : Pat<(v16i8 (sra (sub v16i8:$vA, (v16i8 (bitconvert(vnot_ppc v4i32:$vB)))),
+def : Pat<(v16i8 (sra (sub v16i8:$vA, (v16i8 (bitconvert(vnot v4i32:$vB)))),
(v16i8 (immEQOneV)))), (v16i8 (VAVGSB $vA, $vB))>;
-def : Pat<(v4i32 (srl (sub v4i32:$vA, (vnot_ppc v4i32:$vB)),
+def : Pat<(v4i32 (srl (sub v4i32:$vA, (vnot v4i32:$vB)),
(v4i32 (immEQOneV)))), (v4i32 (VAVGUW $vA, $vB))>;
-def : Pat<(v8i16 (srl (sub v8i16:$vA, (v8i16 (bitconvert(vnot_ppc v4i32:$vB)))),
+def : Pat<(v8i16 (srl (sub v8i16:$vA, (v8i16 (bitconvert(vnot v4i32:$vB)))),
(v8i16 (immEQOneV)))), (v8i16 (VAVGUH $vA, $vB))>;
-def : Pat<(v16i8 (srl (sub v16i8:$vA, (v16i8 (bitconvert(vnot_ppc v4i32:$vB)))),
+def : Pat<(v16i8 (srl (sub v16i8:$vA, (v16i8 (bitconvert(vnot v4i32:$vB)))),
(v16i8 (immEQOneV)))), (v16i8 (VAVGUB $vA, $vB))>;
} // end HasAltivec
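As a quick sanity check of the VAVG patterns above: in two's complement ~b = -b - 1, so the matched sub/shift sequence computes the rounded average:

    a - ~b = a - (-b - 1) = a + b + 1
    (a - ~b) >> 1 = floor((a + b + 1) / 2)

with the shift arithmetic for the signed variants (VAVGS*) and logical for the unsigned ones (VAVGU*), up to the extra internal precision the hardware uses.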
@@ -1299,16 +1303,16 @@
// if we find situations where Altivec is really preferred over VSX.
def VEQV : VXForm_1<1668, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"veqv $vD, $vA, $vB", IIC_VecGeneral,
- [(set v4i32:$vD, (vnot_ppc (xor v4i32:$vA, v4i32:$vB)))]>;
+ [(set v4i32:$vD, (vnot (xor v4i32:$vA, v4i32:$vB)))]>;
def VNAND : VXForm_1<1412, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vnand $vD, $vA, $vB", IIC_VecGeneral,
- [(set v4i32:$vD, (vnot_ppc (and v4i32:$vA, v4i32:$vB)))]>;
+ [(set v4i32:$vD, (vnot (and v4i32:$vA, v4i32:$vB)))]>;
} // isCommutable
def VORC : VXForm_1<1348, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vorc $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (or v4i32:$vA,
- (vnot_ppc v4i32:$vB)))]>;
+ (vnot v4i32:$vB)))]>;
// i64 element comparisons.
def VCMPEQUD : VCMP <199, "vcmpequd $vD, $vA, $vB" , v2i64>;
@@ -1503,8 +1507,7 @@
def VNEGD : VX_VT5_EO5_VB5<1538, 7, "vnegd",
[(set v2i64:$vD,
- (sub (v2i64 (bitconvert (v4i32 immAllZerosV))),
- v2i64:$vB))]>;
+ (sub (v2i64 immAllZerosV), v2i64:$vB))]>;
// Vector Parity Byte
def VPRTYBW : VX_VT5_EO5_VB5<1538, 8, "vprtybw", [(set v4i32:$vD,
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index 646efe6..91b507e 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -422,6 +422,20 @@
let Inst{29-31} = xo;
}
+class DQForm_RTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RTp;
+ bits<17> DQ_RA;
+ let Pattern = pattern;
+
+ let Inst{6-10} = RTp{4-0};
+ let Inst{11-15} = DQ_RA{16-12}; // Register #
+ let Inst{16-27} = DQ_RA{11-0}; // Displacement.
+ let Inst{28-31} = xo;
+}
+
// 1.7.6 X-Form
class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
@@ -1195,6 +1209,21 @@
let Inst{31} = XT{5};
}
+class XForm_XD6_RA5_RB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<11> D_RA_XD;
+ bits<5> RB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = D_RA_XD{4-0}; // D
+ let Inst{11-15} = D_RA_XD{10-6}; // RA
+ let Inst{16-20} = RB;
+ let Inst{21-30} = xo;
+ let Inst{31} = D_RA_XD{5}; // DX
+}
+
class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: I<opcode, OOL, IOL, asmstr, itin> {
@@ -2112,6 +2141,24 @@
let Inst{31} = RC;
}
+class Z23Form_RTAB5_CY2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<5> RA;
+ bits<5> RB;
+ bits<2> CY;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-22} = CY;
+ let Inst{23-30} = xo;
+ let Inst{31} = 0;
+}
+
//===----------------------------------------------------------------------===//
// EmitTimePseudo won't have encoding information for the [MC]CodeEmitter
// stuff
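A hypothetical standalone C++ sketch of the DQForm_RTp5_RA17_MEM layout added above; the field slices follow the TableGen class, and the x16 scaling of the displacement assumes the lq convention of appending four zero bits to DQ:

    #include <cstdint>
    #include <cstdio>

    struct DQFormFields { unsigned RTp, RA, XO; int32_t ByteOffset; };

    // Extract a field given IBM bit numbers (bit 0 is the MSB), matching the
    // Inst{hi-lo} slices in the class above.
    static unsigned field(uint32_t Inst, unsigned Hi, unsigned Lo) {
      return (Inst >> (31 - Lo)) & ((1u << (Lo - Hi + 1)) - 1);
    }

    static DQFormFields decodeDQForm(uint32_t Inst) {
      DQFormFields F;
      F.RTp = field(Inst, 6, 10);                              // Inst{6-10}  = RTp
      F.RA  = field(Inst, 11, 15);                             // Inst{11-15} = register #
      int32_t DQ = static_cast<int32_t>(field(Inst, 16, 27));  // Inst{16-27} = displacement
      if (DQ & 0x800)                                          // sign-extend the 12-bit DQ
        DQ -= 0x1000;
      F.ByteOffset = DQ * 16;                                  // DQ || 0b0000
      F.XO = field(Inst, 28, 31);                              // Inst{28-31} = xo
      return F;
    }

    int main() {
      // 0xE0810010 encodes opcode 56, RTp=4, RA=1, DQ=1, xo=0, i.e. lq r4, 16(r1).
      DQFormFields F = decodeDQForm(0xE0810010);
      std::printf("RTp=%u RA=%u offset=%d xo=%u\n", F.RTp, F.RA, F.ByteOffset, F.XO);
    }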
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 9e3c6c5..9dd35d5 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -344,7 +344,7 @@
//
// 2: Reduce register pressure.
// Try to reassociate FMA with FSUB and a constant like below:
-// C is a floatint point const.
+// C is a floating point const.
//
// Pattern 1:
// A = FSUB X, Y (Leaf)
@@ -362,7 +362,7 @@
//
// Before the transformation, A must be assigned with different hardware
// register with D. After the transformation, A and D must be assigned with
-// same hardware register due to TIE attricute of FMA instructions.
+// same hardware register due to TIE attribute of FMA instructions.
//
bool PPCInstrInfo::getFMAPatterns(
MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
@@ -1096,6 +1096,8 @@
break;
case PPC::LI:
case PPC::LI8:
+ case PPC::PLI:
+ case PPC::PLI8:
case PPC::LIS:
case PPC::LIS8:
case PPC::ADDIStocHA:
@@ -1106,6 +1108,7 @@
case PPC::XXLXORspz:
case PPC::XXLXORdpz:
case PPC::XXLEQVOnes:
+ case PPC::XXSPLTI32DX:
case PPC::V_SET0B:
case PPC::V_SET0H:
case PPC::V_SET0:
@@ -1256,8 +1259,10 @@
}
/// Return the noop instruction to use for a noop.
-void PPCInstrInfo::getNoop(MCInst &NopInst) const {
- NopInst.setOpcode(PPC::NOP);
+MCInst PPCInstrInfo::getNop() const {
+ MCInst Nop;
+ Nop.setOpcode(PPC::NOP);
+ return Nop;
}
// Branch analysis.
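A hypothetical call site for the changed hook above, for illustration only: instead of filling an out-parameter via getNoop(MCInst&), callers now take the MCInst by value.

    #include "llvm/CodeGen/TargetInstrInfo.h"
    #include "llvm/MC/MCInst.h"
    #include "llvm/MC/MCStreamer.h"
    #include "llvm/MC/MCSubtargetInfo.h"
    using namespace llvm;

    // Emit Count target nops; on PowerPC getNop() returns PPC::NOP.
    static void emitNops(MCStreamer &OS, const MCSubtargetInfo &STI,
                         const TargetInstrInfo &TII, unsigned Count) {
      MCInst Nop = TII.getNop();
      for (unsigned I = 0; I < Count; ++I)
        OS.emitInstruction(Nop, STI);
    }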
@@ -1829,6 +1834,22 @@
if (SrcPrimed && !KillSrc)
BuildMI(MBB, I, DL, get(PPC::XXMTACC), SrcReg).addReg(SrcReg);
return;
+ } else if (PPC::G8pRCRegClass.contains(DestReg) &&
+ PPC::G8pRCRegClass.contains(SrcReg)) {
+ // TODO: Handle G8RC to G8pRC (and vice versa) copy.
+ unsigned DestRegIdx = DestReg - PPC::G8p0;
+ MCRegister DestRegSub0 = PPC::X0 + 2 * DestRegIdx;
+ MCRegister DestRegSub1 = PPC::X0 + 2 * DestRegIdx + 1;
+ unsigned SrcRegIdx = SrcReg - PPC::G8p0;
+ MCRegister SrcRegSub0 = PPC::X0 + 2 * SrcRegIdx;
+ MCRegister SrcRegSub1 = PPC::X0 + 2 * SrcRegIdx + 1;
+ BuildMI(MBB, I, DL, get(PPC::OR8), DestRegSub0)
+ .addReg(SrcRegSub0)
+ .addReg(SrcRegSub0, getKillRegState(KillSrc));
+ BuildMI(MBB, I, DL, get(PPC::OR8), DestRegSub1)
+ .addReg(SrcRegSub1)
+ .addReg(SrcRegSub1, getKillRegState(KillSrc));
+ return;
} else
llvm_unreachable("Impossible reg-to-reg copy");
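The arithmetic in the new branch above works because the register enums number the G8p pairs and the X registers contiguously; a small standalone sketch of the same mapping (hypothetical enum values, for illustration only):

    #include <cstdio>

    // Hypothetical contiguous numbering: G8p0 is backed by X0/X1, G8p1 by
    // X2/X3, and so on -- the same even/odd pairing the copy expansion uses.
    enum { X0 = 0, G8p0 = 200 };

    static void g8pSubRegs(unsigned G8pReg, unsigned &Sub0, unsigned &Sub1) {
      unsigned Idx = G8pReg - G8p0;   // index of the pair register
      Sub0 = X0 + 2 * Idx;            // first (even-numbered) register of the pair
      Sub1 = X0 + 2 * Idx + 1;        // second (odd-numbered) register of the pair
    }

    int main() {
      unsigned S0, S1;
      g8pSubRegs(G8p0 + 3, S0, S1);                // G8p3
      std::printf("G8p3 -> X%u, X%u\n", S0, S1);   // prints X6, X7
    }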
@@ -1881,6 +1902,8 @@
assert(Subtarget.pairedVectorMemops() &&
"Register unexpected when paired memops are disabled.");
OpcodeIndex = SOK_PairedVecSpill;
+ } else if (PPC::G8pRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_PairedG8Spill;
} else {
llvm_unreachable("Unknown regclass!");
}
@@ -2888,6 +2911,7 @@
{MO_TLSGD_FLAG, "ppc-tlsgd"},
{MO_TLSLD_FLAG, "ppc-tlsld"},
{MO_TPREL_FLAG, "ppc-tprel"},
+ {MO_TLSGDM_FLAG, "ppc-tlsgdm"},
{MO_GOT_TLSGD_PCREL_FLAG, "ppc-got-tlsgd-pcrel"},
{MO_GOT_TLSLD_PCREL_FLAG, "ppc-got-tlsld-pcrel"},
{MO_GOT_TPREL_PCREL_FLAG, "ppc-got-tprel-pcrel"}};
@@ -3136,11 +3160,11 @@
Register InUseReg = MI.getOperand(OpNo).getReg();
MI.getOperand(OpNo).ChangeToImmediate(Imm);
- if (MI.implicit_operands().empty())
- return;
-
// We need to make sure that the MI didn't have any implicit use
- // of this REG any more.
+ // of this REG any more. We don't call MI.implicit_operands().empty() to
+ // return early, since MI's MCID might be changed in the calling context; as
+ // a result, its number of explicit operands may change, and so the start of
+ // the implicit operands changes.
const TargetRegisterInfo *TRI = &getRegisterInfo();
int UseOpIdx = MI.findRegisterUseOperandIdx(InUseReg, false, TRI);
if (UseOpIdx >= 0) {
@@ -4412,21 +4436,17 @@
// Sign-extend to 64-bits.
// DefMI may be folded with another imm form instruction, the result Imm is
// the sum of Imm of DefMI and BaseImm which is from imm form instruction.
+ APInt ActualValue(64, ImmMO.getImm() + BaseImm, true);
+ if (III.SignedImm && !ActualValue.isSignedIntN(III.ImmWidth))
+ return false;
+ if (!III.SignedImm && !ActualValue.isIntN(III.ImmWidth))
+ return false;
Imm = SignExtend64<16>(ImmMO.getImm() + BaseImm);
if (Imm % III.ImmMustBeMultipleOf)
return false;
if (III.TruncateImmTo)
Imm &= ((1 << III.TruncateImmTo) - 1);
- if (III.SignedImm) {
- APInt ActualValue(64, Imm, true);
- if (!ActualValue.isSignedIntN(III.ImmWidth))
- return false;
- } else {
- uint64_t UnsignedMax = (1 << III.ImmWidth) - 1;
- if ((uint64_t)Imm > UnsignedMax)
- return false;
- }
}
else
return false;
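The reordered hunk above checks the combined immediate against the field width before any sign-extension or truncation. A minimal standalone version of that check, using the same APInt predicates (example values made up):

    #include "llvm/ADT/APInt.h"
    #include <cstdio>
    using llvm::APInt;

    // True if Value fits in an ImmWidth-bit immediate field, interpreted as
    // signed or unsigned -- mirroring the test on ImmMO.getImm() + BaseImm.
    static bool fitsImmediate(int64_t Value, unsigned ImmWidth, bool SignedImm) {
      APInt Actual(64, Value, /*isSigned=*/true);
      return SignedImm ? Actual.isSignedIntN(ImmWidth) : Actual.isIntN(ImmWidth);
    }

    int main() {
      std::printf("%d %d %d\n",
                  fitsImmediate(-5, 16, /*SignedImm=*/true),     // 1: fits si16
                  fitsImmediate(-5, 16, /*SignedImm=*/false),    // 0: negative, not ui16
                  fitsImmediate(70000, 16, /*SignedImm=*/true)); // 0: too wide
    }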
@@ -4743,7 +4763,12 @@
LLVM_DEBUG(DefMI.dump());
MI.getOperand(III.OpNoForForwarding).setReg(RegMO->getReg());
- MI.getOperand(III.OpNoForForwarding).setIsKill(RegMO->isKill());
+ if (RegMO->isKill()) {
+ MI.getOperand(III.OpNoForForwarding).setIsKill(true);
+ // Clear the killed flag in RegMO. Doing this here handles cases where
+ // DefMI and MI are not in the same basic block.
+ RegMO->setIsKill(false);
+ }
MI.getOperand(III.ImmOpNo).setImm(Imm);
// FIXME: fix kill/dead flag if MI and DefMI are not in same basic block.
@@ -5156,7 +5181,8 @@
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
unsigned StackOffset = MI.getOperand(1).getImm();
Register StackReg = MI.getOperand(2).getReg();
- if (StackReg == PPC::X1 && StackOffset == TOCSaveOffset)
+ Register SPReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
+ if (StackReg == SPReg && StackOffset == TOCSaveOffset)
return true;
return false;
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index c6ef174..404156d 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -127,6 +127,7 @@
SOK_AccumulatorSpill,
SOK_UAccumulatorSpill,
SOK_SPESpill,
+ SOK_PairedG8Spill,
SOK_LastOpcodeSpill // This must be last on the enum.
};
@@ -136,14 +137,16 @@
{ \
PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \
PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, \
- PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, PPC::EVLDD \
+ PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, PPC::EVLDD, \
+ PPC::RESTORE_QUADWORD \
}
#define Pwr9LoadOpcodes \
{ \
PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \
PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \
- PPC::DFLOADf32, PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, NoInstr \
+ PPC::DFLOADf32, PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, \
+ NoInstr, PPC::RESTORE_QUADWORD \
}
#define Pwr10LoadOpcodes \
@@ -151,21 +154,23 @@
PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \
PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \
PPC::DFLOADf32, PPC::SPILLTOVSR_LD, PPC::LXVP, PPC::RESTORE_ACC, \
- PPC::RESTORE_UACC, NoInstr \
+ PPC::RESTORE_UACC, NoInstr, PPC::RESTORE_QUADWORD \
}
#define Pwr8StoreOpcodes \
{ \
PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \
PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, \
- PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, PPC::EVSTDD \
+ PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, PPC::EVSTDD, \
+ PPC::SPILL_QUADWORD \
}
#define Pwr9StoreOpcodes \
{ \
PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \
PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \
- PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, NoInstr \
+ PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, NoInstr, \
+ PPC::SPILL_QUADWORD \
}
#define Pwr10StoreOpcodes \
@@ -173,7 +178,7 @@
PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \
PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \
PPC::SPILLTOVSR_ST, PPC::STXVP, PPC::SPILL_ACC, PPC::SPILL_UACC, \
- NoInstr \
+ NoInstr, PPC::SPILL_QUADWORD \
}
// Initialize arrays for load and store spill opcodes on supported subtargets.
@@ -559,7 +564,7 @@
///
unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
- void getNoop(MCInst &NopInst) const override;
+ MCInst getNop() const override;
std::pair<unsigned, unsigned>
decomposeMachineOperandsTargetFlags(unsigned TF) const override;
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 724af23..f53e1b8 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -218,6 +218,7 @@
SDTypeProfile<1, 3, [
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>;
+def PPCTlsgdAIX : SDNode<"PPCISD::TLSGD_AIX", SDTIntBinOp>;
def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>;
def PPCaddiTlsldL : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>;
def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>;
@@ -398,6 +399,44 @@
SDLoc(N), MVT::i32);
}]>;
+// Check if the value can be converted to a single-precision immediate, which
+// could be exploited by XXSPLTIDP. Ensure that it cannot be converted to
+// single precision before exploiting it with XXSPLTI32DX.
+def nzFPImmAsi64 : PatLeaf<(fpimm), [{
+ APFloat APFloatOfN = N->getValueAPF();
+ return !N->isExactlyValue(+0.0) && !checkConvertToNonDenormSingle(APFloatOfN);
+}]>;
+
+// Get the Hi bits of a 64 bit immediate.
+def getFPAs64BitIntHi : SDNodeXForm<fpimm, [{
+ APFloat APFloatOfN = N->getValueAPF();
+ bool Unused;
+ APFloatOfN.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
+ &Unused);
+ uint32_t Hi = (uint32_t)((APFloatOfN.bitcastToAPInt().getZExtValue() &
+ 0xFFFFFFFF00000000LL) >> 32);
+ return CurDAG->getTargetConstant(Hi, SDLoc(N), MVT::i32);
+}]>;
+
+// Get the Lo bits of a 64 bit immediate.
+def getFPAs64BitIntLo : SDNodeXForm<fpimm, [{
+ APFloat APFloatOfN = N->getValueAPF();
+ bool Unused;
+ APFloatOfN.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
+ &Unused);
+ uint32_t Lo = (uint32_t)(APFloatOfN.bitcastToAPInt().getZExtValue() &
+ 0xFFFFFFFF);
+ return CurDAG->getTargetConstant(Lo, SDLoc(N), MVT::i32);
+}]>;
+
+def imm34 : PatLeaf<(imm), [{
+ return isInt<34>(N->getSExtValue());
+}]>;
+
+def getImmAs64BitInt : SDNodeXForm<imm, [{
+ return getI64Imm(N->getSExtValue(), SDLoc(N));
+}]>;
+
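A standalone sketch of what getFPAs64BitIntHi/Lo above compute once the constant has been converted to IEEE double (the TableGen transforms do that conversion with APFloat::convert; this sketch assumes the value is already a double):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Split a double's bit pattern into the high and low 32-bit halves that
    // feed the two XXSPLTI32DX immediates.
    static void splitDoubleBits(double V, uint32_t &Hi, uint32_t &Lo) {
      uint64_t Bits;
      std::memcpy(&Bits, &V, sizeof(Bits));   // bitcast, like bitcastToAPInt()
      Hi = static_cast<uint32_t>(Bits >> 32);
      Lo = static_cast<uint32_t>(Bits & 0xFFFFFFFFu);
    }

    int main() {
      uint32_t Hi, Lo;
      splitDoubleBits(1.0, Hi, Lo);
      std::printf("hi=0x%08X lo=0x%08X\n", Hi, Lo);   // 1.0 -> 0x3FF00000 / 0x00000000
    }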
def SHL32 : SDNodeXForm<imm, [{
// Transformation function: 31 - imm
return getI32Imm(31 - N->getZExtValue(), SDLoc(N));
@@ -602,6 +641,12 @@
def g8rc : RegisterOperand<G8RC> {
let ParserMatchClass = PPCRegG8RCAsmOperand;
}
+def PPCRegG8pRCAsmOperand : AsmOperandClass {
+ let Name = "RegG8pRC"; let PredicateMethod = "isEvenRegNumber";
+}
+def g8prc : RegisterOperand<G8pRC> {
+ let ParserMatchClass = PPCRegG8pRCAsmOperand;
+}
def PPCRegGPRCNoR0AsmOperand : AsmOperandClass {
let Name = "RegGPRCNoR0"; let PredicateMethod = "isRegNumber";
}
@@ -958,6 +1003,13 @@
def dispRIX : Operand<iPTR> {
let ParserMatchClass = PPCDispRIXOperand;
}
+def PPCDispRIHashOperand : AsmOperandClass {
+ let Name = "DispRIHash"; let PredicateMethod = "isHashImmX8";
+ let RenderMethod = "addImmOperands";
+}
+def dispRIHash : Operand<iPTR> {
+ let ParserMatchClass = PPCDispRIHashOperand;
+}
def PPCDispRIX16Operand : AsmOperandClass {
let Name = "DispRIX16"; let PredicateMethod = "isS16ImmX16";
let RenderMethod = "addImmOperands";
@@ -1006,6 +1058,14 @@
let DecoderMethod = "decodeMemRIXOperands";
let OperandType = "OPERAND_MEMORY";
}
+def memrihash : Operand<iPTR> {
+ // memrihash 8-aligned for ROP Protection Instructions.
+ let PrintMethod = "printMemRegImmHash";
+ let MIOperandInfo = (ops dispRIHash:$imm, ptr_rc_nor0:$reg);
+ let EncoderMethod = "getMemRIHashEncoding";
+ let DecoderMethod = "decodeMemRIHashOperands";
+ let OperandType = "OPERAND_MEMORY";
+}
def memrix16 : Operand<iPTR> { // memri, imm is 16-aligned, 12-bit, Inst{16:27}
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispRIX16:$imm, ptr_rc_nor0:$reg);
@@ -1095,6 +1155,13 @@
// PC Relative Address
def pcreladdr : ComplexPattern<iPTR, 1, "SelectAddrPCRel", [], []>;
+// Load and Store Instruction Selection addressing modes.
+def DForm : ComplexPattern<iPTR, 2, "SelectDForm", [], [SDNPWantParent]>;
+def DSForm : ComplexPattern<iPTR, 2, "SelectDSForm", [], [SDNPWantParent]>;
+def DQForm : ComplexPattern<iPTR, 2, "SelectDQForm", [], [SDNPWantParent]>;
+def XForm : ComplexPattern<iPTR, 2, "SelectXForm", [], [SDNPWantParent]>;
+def ForceXForm : ComplexPattern<iPTR, 2, "SelectForceXForm", [], [SDNPWantParent]>;
+
//===----------------------------------------------------------------------===//
// PowerPC Instruction Predicate Definitions.
def In32BitMode : Predicate<"!Subtarget->isPPC64()">;
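The ComplexPatterns above defer the choice of addressing form to C++ selectors (SelectDForm and friends; SDNPWantParent makes the matcher pass the parent node as well). As a rough standalone illustration of the constraint they enforce, not the actual selector code: an immediate form is usable only when the offset fits the signed 16-bit field and the form's required multiple.

    #include <cstdint>
    #include <cstdio>

    // RequiredMultiple is 1 for D-Form, 4 for DS-Form, 16 for DQ-Form (the
    // dispRIX16 operand above encodes the 16-byte-multiple constraint, for
    // instance).
    static bool canUseImmediateForm(int64_t Off, int64_t RequiredMultiple) {
      return Off >= -32768 && Off <= 32767 && (Off % RequiredMultiple) == 0;
    }
    // When this returns false the access falls back to a register + register
    // X-Form; ForceXForm users always take the X-Form path.

    int main() {
      std::printf("%d %d %d\n",
                  canUseImmediateForm(32, 16),    // 1: fine for lq/stq (DQ-Form)
                  canUseImmediateForm(6, 4),      // 0: ld/std need a multiple of 4
                  canUseImmediateForm(40000, 1)); // 0: does not fit si16
    }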
@@ -1109,12 +1176,14 @@
def HasSPE : Predicate<"Subtarget->hasSPE()">;
def HasICBT : Predicate<"Subtarget->hasICBT()">;
def HasPartwordAtomics : Predicate<"Subtarget->hasPartwordAtomics()">;
+def HasQuadwordAtomics : Predicate<"Subtarget->hasQuadwordAtomics()">;
def NoNaNsFPMath
: Predicate<"Subtarget->getTargetMachine().Options.NoNaNsFPMath">;
def NaNsFPMath
: Predicate<"!Subtarget->getTargetMachine().Options.NoNaNsFPMath">;
def HasBPERMD : Predicate<"Subtarget->hasBPERMD()">;
def HasExtDiv : Predicate<"Subtarget->hasExtDiv()">;
+def IsISA2_07 : Predicate<"Subtarget->isISA2_07()">;
def IsISA3_0 : Predicate<"Subtarget->isISA3_0()">;
def HasFPU : Predicate<"Subtarget->hasFPU()">;
def PCRelativeMemops : Predicate<"Subtarget->hasPCRelativeMemops()">;
@@ -1123,6 +1192,8 @@
// AIX assembler may not be modern enough to support some extended mne.
def ModernAs: Predicate<"!Subtarget->isAIXABI() || Subtarget->HasModernAIXAs">,
AssemblerPredicate<(any_of (not AIXOS), FeatureModernAIXAs)>;
+def IsAIX : Predicate<"Subtarget->isAIXABI()">;
+def NotAIX : Predicate<"!Subtarget->isAIXABI()">;
//===----------------------------------------------------------------------===//
// PowerPC Multiclass Definitions.
@@ -1964,6 +2035,8 @@
(DCBTST 0, xoaddr:$dst)>;
def : Pat<(int_ppc_dcbf xoaddr:$dst),
(DCBF 0, xoaddr:$dst)>;
+def : Pat<(int_ppc_icbt xoaddr:$dst),
+ (ICBT 0, xoaddr:$dst)>;
def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)),
(DCBT 0, xoaddr:$dst)>; // data prefetch for loads
@@ -1986,120 +2059,120 @@
let Defs = [CR0] in {
def ATOMIC_LOAD_ADD_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8",
- [(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_add_8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_SUB_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8",
- [(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_sub_8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_AND_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8",
- [(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_and_8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_OR_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8",
- [(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_or_8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_XOR_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "ATOMIC_LOAD_XOR_I8",
- [(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_xor_8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_NAND_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8",
- [(set i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_nand_8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_MIN_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I8",
- [(set i32:$dst, (atomic_load_min_8 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_min_8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_MAX_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I8",
- [(set i32:$dst, (atomic_load_max_8 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_max_8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_UMIN_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I8",
- [(set i32:$dst, (atomic_load_umin_8 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_umin_8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_UMAX_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I8",
- [(set i32:$dst, (atomic_load_umax_8 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_umax_8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_ADD_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16",
- [(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_add_16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_SUB_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16",
- [(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_sub_16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_AND_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16",
- [(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_and_16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_OR_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16",
- [(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_or_16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_XOR_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16",
- [(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_xor_16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_NAND_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16",
- [(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_nand_16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_MIN_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I16",
- [(set i32:$dst, (atomic_load_min_16 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_min_16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_MAX_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I16",
- [(set i32:$dst, (atomic_load_max_16 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_max_16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_UMIN_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I16",
- [(set i32:$dst, (atomic_load_umin_16 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_umin_16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_UMAX_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I16",
- [(set i32:$dst, (atomic_load_umax_16 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_umax_16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_ADD_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32",
- [(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_add_32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_SUB_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32",
- [(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_sub_32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_AND_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32",
- [(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_and_32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_OR_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32",
- [(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_or_32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_XOR_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32",
- [(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_xor_32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_NAND_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32",
- [(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_nand_32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_MIN_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I32",
- [(set i32:$dst, (atomic_load_min_32 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_min_32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_MAX_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I32",
- [(set i32:$dst, (atomic_load_max_32 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_max_32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_UMIN_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I32",
- [(set i32:$dst, (atomic_load_umin_32 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_umin_32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_UMAX_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I32",
- [(set i32:$dst, (atomic_load_umax_32 xoaddr:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_umax_32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_CMP_SWAP_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8",
- [(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>;
+ [(set i32:$dst, (atomic_cmp_swap_8 ForceXForm:$ptr, i32:$old, i32:$new))]>;
def ATOMIC_CMP_SWAP_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new",
- [(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>;
+ [(set i32:$dst, (atomic_cmp_swap_16 ForceXForm:$ptr, i32:$old, i32:$new))]>;
def ATOMIC_CMP_SWAP_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new",
- [(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>;
+ [(set i32:$dst, (atomic_cmp_swap_32 ForceXForm:$ptr, i32:$old, i32:$new))]>;
def ATOMIC_SWAP_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_i8",
- [(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>;
+ [(set i32:$dst, (atomic_swap_8 ForceXForm:$ptr, i32:$new))]>;
def ATOMIC_SWAP_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16",
- [(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>;
+ [(set i32:$dst, (atomic_swap_16 ForceXForm:$ptr, i32:$new))]>;
def ATOMIC_SWAP_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32",
- [(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>;
+ [(set i32:$dst, (atomic_swap_32 ForceXForm:$ptr, i32:$new))]>;
}
-def : Pat<(PPCatomicCmpSwap_8 xoaddr:$ptr, i32:$old, i32:$new),
- (ATOMIC_CMP_SWAP_I8 xoaddr:$ptr, i32:$old, i32:$new)>;
-def : Pat<(PPCatomicCmpSwap_16 xoaddr:$ptr, i32:$old, i32:$new),
- (ATOMIC_CMP_SWAP_I16 xoaddr:$ptr, i32:$old, i32:$new)>;
+def : Pat<(PPCatomicCmpSwap_8 ForceXForm:$ptr, i32:$old, i32:$new),
+ (ATOMIC_CMP_SWAP_I8 ForceXForm:$ptr, i32:$old, i32:$new)>;
+def : Pat<(PPCatomicCmpSwap_16 ForceXForm:$ptr, i32:$old, i32:$new),
+ (ATOMIC_CMP_SWAP_I16 ForceXForm:$ptr, i32:$old, i32:$new)>;
// Instructions to support atomic operations
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
@@ -2165,6 +2238,10 @@
def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB),
"td $to, $rA, $rB", IIC_IntTrapD, []>;
+def POPCNTB : XForm_11<31, 122, (outs gprc:$rA), (ins gprc:$rS),
+ "popcntb $rA, $rS", IIC_IntGeneral,
+ [(set i32:$rA, (int_ppc_popcntb i32:$rS))]>;
+
//===----------------------------------------------------------------------===//
// PPC32 Load Instructions.
//
@@ -2173,25 +2250,25 @@
let PPC970_Unit = 2 in {
def LBZ : DForm_1<34, (outs gprc:$rD), (ins memri:$src),
"lbz $rD, $src", IIC_LdStLoad,
- [(set i32:$rD, (zextloadi8 iaddr:$src))]>;
+ [(set i32:$rD, (zextloadi8 DForm:$src))]>;
def LHA : DForm_1<42, (outs gprc:$rD), (ins memri:$src),
"lha $rD, $src", IIC_LdStLHA,
- [(set i32:$rD, (sextloadi16 iaddr:$src))]>,
+ [(set i32:$rD, (sextloadi16 DForm:$src))]>,
PPC970_DGroup_Cracked;
def LHZ : DForm_1<40, (outs gprc:$rD), (ins memri:$src),
"lhz $rD, $src", IIC_LdStLoad,
- [(set i32:$rD, (zextloadi16 iaddr:$src))]>;
+ [(set i32:$rD, (zextloadi16 DForm:$src))]>;
def LWZ : DForm_1<32, (outs gprc:$rD), (ins memri:$src),
"lwz $rD, $src", IIC_LdStLoad,
- [(set i32:$rD, (load iaddr:$src))]>;
+ [(set i32:$rD, (load DForm:$src))]>;
let Predicates = [HasFPU] in {
def LFS : DForm_1<48, (outs f4rc:$rD), (ins memri:$src),
"lfs $rD, $src", IIC_LdStLFD,
- [(set f32:$rD, (load iaddr:$src))]>;
+ [(set f32:$rD, (load DForm:$src))]>;
def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src),
"lfd $rD, $src", IIC_LdStLFD,
- [(set f64:$rD, (load iaddr:$src))]>;
+ [(set f64:$rD, (load DForm:$src))]>;
}
@@ -2276,38 +2353,38 @@
let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in {
def LBZX : XForm_1_memOp<31, 87, (outs gprc:$rD), (ins memrr:$src),
"lbzx $rD, $src", IIC_LdStLoad,
- [(set i32:$rD, (zextloadi8 xaddr:$src))]>;
+ [(set i32:$rD, (zextloadi8 XForm:$src))]>;
def LHAX : XForm_1_memOp<31, 343, (outs gprc:$rD), (ins memrr:$src),
"lhax $rD, $src", IIC_LdStLHA,
- [(set i32:$rD, (sextloadi16 xaddr:$src))]>,
+ [(set i32:$rD, (sextloadi16 XForm:$src))]>,
PPC970_DGroup_Cracked;
def LHZX : XForm_1_memOp<31, 279, (outs gprc:$rD), (ins memrr:$src),
"lhzx $rD, $src", IIC_LdStLoad,
- [(set i32:$rD, (zextloadi16 xaddr:$src))]>;
+ [(set i32:$rD, (zextloadi16 XForm:$src))]>;
def LWZX : XForm_1_memOp<31, 23, (outs gprc:$rD), (ins memrr:$src),
"lwzx $rD, $src", IIC_LdStLoad,
- [(set i32:$rD, (load xaddr:$src))]>;
+ [(set i32:$rD, (load XForm:$src))]>;
def LHBRX : XForm_1_memOp<31, 790, (outs gprc:$rD), (ins memrr:$src),
"lhbrx $rD, $src", IIC_LdStLoad,
- [(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>;
+ [(set i32:$rD, (PPClbrx ForceXForm:$src, i16))]>;
def LWBRX : XForm_1_memOp<31, 534, (outs gprc:$rD), (ins memrr:$src),
"lwbrx $rD, $src", IIC_LdStLoad,
- [(set i32:$rD, (PPClbrx xoaddr:$src, i32))]>;
+ [(set i32:$rD, (PPClbrx ForceXForm:$src, i32))]>;
let Predicates = [HasFPU] in {
def LFSX : XForm_25_memOp<31, 535, (outs f4rc:$frD), (ins memrr:$src),
"lfsx $frD, $src", IIC_LdStLFD,
- [(set f32:$frD, (load xaddr:$src))]>;
+ [(set f32:$frD, (load XForm:$src))]>;
def LFDX : XForm_25_memOp<31, 599, (outs f8rc:$frD), (ins memrr:$src),
"lfdx $frD, $src", IIC_LdStLFD,
- [(set f64:$frD, (load xaddr:$src))]>;
+ [(set f64:$frD, (load XForm:$src))]>;
def LFIWAX : XForm_25_memOp<31, 855, (outs f8rc:$frD), (ins memrr:$src),
"lfiwax $frD, $src", IIC_LdStLFD,
- [(set f64:$frD, (PPClfiwax xoaddr:$src))]>;
+ [(set f64:$frD, (PPClfiwax ForceXForm:$src))]>;
def LFIWZX : XForm_25_memOp<31, 887, (outs f8rc:$frD), (ins memrr:$src),
"lfiwzx $frD, $src", IIC_LdStLFD,
- [(set f64:$frD, (PPClfiwzx xoaddr:$src))]>;
+ [(set f64:$frD, (PPClfiwzx ForceXForm:$src))]>;
}
}
@@ -2324,20 +2401,20 @@
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
def STB : DForm_1<38, (outs), (ins gprc:$rS, memri:$dst),
"stb $rS, $dst", IIC_LdStStore,
- [(truncstorei8 i32:$rS, iaddr:$dst)]>;
+ [(truncstorei8 i32:$rS, DForm:$dst)]>;
def STH : DForm_1<44, (outs), (ins gprc:$rS, memri:$dst),
"sth $rS, $dst", IIC_LdStStore,
- [(truncstorei16 i32:$rS, iaddr:$dst)]>;
+ [(truncstorei16 i32:$rS, DForm:$dst)]>;
def STW : DForm_1<36, (outs), (ins gprc:$rS, memri:$dst),
"stw $rS, $dst", IIC_LdStStore,
- [(store i32:$rS, iaddr:$dst)]>;
+ [(store i32:$rS, DForm:$dst)]>;
let Predicates = [HasFPU] in {
def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst),
"stfs $rS, $dst", IIC_LdStSTFD,
- [(store f32:$rS, iaddr:$dst)]>;
+ [(store f32:$rS, DForm:$dst)]>;
def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst),
"stfd $rS, $dst", IIC_LdStSTFD,
- [(store f64:$rS, iaddr:$dst)]>;
+ [(store f64:$rS, DForm:$dst)]>;
}
}
@@ -2380,37 +2457,37 @@
let PPC970_Unit = 2 in {
def STBX : XForm_8_memOp<31, 215, (outs), (ins gprc:$rS, memrr:$dst),
"stbx $rS, $dst", IIC_LdStStore,
- [(truncstorei8 i32:$rS, xaddr:$dst)]>,
+ [(truncstorei8 i32:$rS, XForm:$dst)]>,
PPC970_DGroup_Cracked;
def STHX : XForm_8_memOp<31, 407, (outs), (ins gprc:$rS, memrr:$dst),
"sthx $rS, $dst", IIC_LdStStore,
- [(truncstorei16 i32:$rS, xaddr:$dst)]>,
+ [(truncstorei16 i32:$rS, XForm:$dst)]>,
PPC970_DGroup_Cracked;
def STWX : XForm_8_memOp<31, 151, (outs), (ins gprc:$rS, memrr:$dst),
"stwx $rS, $dst", IIC_LdStStore,
- [(store i32:$rS, xaddr:$dst)]>,
+ [(store i32:$rS, XForm:$dst)]>,
PPC970_DGroup_Cracked;
def STHBRX: XForm_8_memOp<31, 918, (outs), (ins gprc:$rS, memrr:$dst),
"sthbrx $rS, $dst", IIC_LdStStore,
- [(PPCstbrx i32:$rS, xoaddr:$dst, i16)]>,
+ [(PPCstbrx i32:$rS, ForceXForm:$dst, i16)]>,
PPC970_DGroup_Cracked;
def STWBRX: XForm_8_memOp<31, 662, (outs), (ins gprc:$rS, memrr:$dst),
"stwbrx $rS, $dst", IIC_LdStStore,
- [(PPCstbrx i32:$rS, xoaddr:$dst, i32)]>,
+ [(PPCstbrx i32:$rS, ForceXForm:$dst, i32)]>,
PPC970_DGroup_Cracked;
let Predicates = [HasFPU] in {
def STFIWX: XForm_28_memOp<31, 983, (outs), (ins f8rc:$frS, memrr:$dst),
"stfiwx $frS, $dst", IIC_LdStSTFD,
- [(PPCstfiwx f64:$frS, xoaddr:$dst)]>;
+ [(PPCstfiwx f64:$frS, ForceXForm:$dst)]>;
def STFSX : XForm_28_memOp<31, 663, (outs), (ins f4rc:$frS, memrr:$dst),
"stfsx $frS, $dst", IIC_LdStSTFD,
- [(store f32:$frS, xaddr:$dst)]>;
+ [(store f32:$frS, XForm:$dst)]>;
def STFDX : XForm_28_memOp<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
"stfdx $frS, $dst", IIC_LdStSTFD,
- [(store f64:$frS, xaddr:$dst)]>;
+ [(store f64:$frS, XForm:$dst)]>;
}
}
@@ -2485,11 +2562,19 @@
def EnforceIEIO : XForm_24_eieio<31, 854, (outs), (ins),
"eieio", IIC_LdStLoad, []>;
+def PseudoEIEIO : PPCEmitTimePseudo<(outs), (ins), "#PPCEIEIO",
+ [(int_ppc_eieio)]>;
+
def : Pat<(int_ppc_sync), (SYNC 0)>, Requires<[HasSYNC]>;
+def : Pat<(int_ppc_iospace_sync), (SYNC 0)>, Requires<[HasSYNC]>;
def : Pat<(int_ppc_lwsync), (SYNC 1)>, Requires<[HasSYNC]>;
+def : Pat<(int_ppc_iospace_lwsync), (SYNC 1)>, Requires<[HasSYNC]>;
def : Pat<(int_ppc_sync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
+def : Pat<(int_ppc_iospace_sync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
def : Pat<(int_ppc_lwsync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
-def : Pat<(int_ppc_eieio), (EnforceIEIO)>;
+def : Pat<(int_ppc_iospace_lwsync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
+def : Pat<(int_ppc_eieio), (PseudoEIEIO)>;
+def : Pat<(int_ppc_iospace_eieio), (PseudoEIEIO)>;
//===----------------------------------------------------------------------===//
// PPC32 Arithmetic Instructions.
@@ -2578,8 +2663,8 @@
"cmpwi $crD, $rA, $imm", IIC_IntCompare>;
def CMPLWI : DForm_6_ext<10, (outs crrc:$dst), (ins gprc:$src1, u16imm:$src2),
"cmplwi $dst, $src1, $src2", IIC_IntCompare>;
- def CMPRB : X_BF3_L1_RS5_RS5<31, 192, (outs crbitrc:$BF),
- (ins u1imm:$L, g8rc:$rA, g8rc:$rB),
+ def CMPRB : X_BF3_L1_RS5_RS5<31, 192, (outs crrc:$BF),
+ (ins u1imm:$L, gprc:$rA, gprc:$rB),
"cmprb $BF, $L, $rA, $rB", IIC_IntCompare, []>,
Requires<[IsISA3_0]>;
}
@@ -3010,17 +3095,22 @@
// When FM is 30/31, we are setting the 62/63 bit of FPSCR, the implicit-def
// RM should be set.
+let hasSideEffects = 1, Defs = [RM] in {
def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
- "mtfsb0 $FM", IIC_IntMTFSB0, []>,
+ "mtfsb0 $FM", IIC_IntMTFSB0,
+ [(int_ppc_mtfsb0 timm:$FM)]>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
- "mtfsb1 $FM", IIC_IntMTFSB0, []>,
+ "mtfsb1 $FM", IIC_IntMTFSB0,
+ [(int_ppc_mtfsb1 timm:$FM)]>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
+}
let Defs = [RM] in {
let isCodeGenOnly = 1 in
def MTFSFb : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT),
- "mtfsf $FM, $rT", IIC_IntMTFSB0, []>,
+ "mtfsf $FM, $rT", IIC_IntMTFSB0,
+ [(int_ppc_mtfsf timm:$FM, f64:$rT)]>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
}
let Uses = [RM] in {
@@ -3071,6 +3161,10 @@
def MODUW : XForm_8<31, 267, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
"moduw $rT, $rA, $rB", IIC_IntDivW,
[(set i32:$rT, (urem i32:$rA, i32:$rB))]>;
+let hasSideEffects = 1 in
+def ADDEX : Z23Form_RTAB5_CY2<31, 170, (outs gprc:$rT),
+ (ins gprc:$rA, gprc:$rB, u2imm:$CY),
+ "addex $rT, $rA, $rB, $CY", IIC_IntGeneral, []>;
}
let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations.
@@ -3411,6 +3505,15 @@
"GETtlsADDR32",
[(set i32:$rD,
(PPCgetTlsAddr i32:$reg, tglobaltlsaddr:$sym))]>;
+// R3 is explicitly defined when this op is created, so not mentioned here.
+// The rest of the Defs are the exact set of registers that will be clobbered by
+// the call.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+ Defs = [R0,R4,R5,R11,LR,CR0] in
+def GETtlsADDR32AIX : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$offset, gprc:$handle),
+ "GETtlsADDR32AIX",
+ [(set i32:$rD,
+ (PPCgetTlsAddr i32:$offset, i32:$handle))]>;
// Combined op for ADDItlsgdL32 and GETtlsADDR32, late expanded. R3 and LR
// are true defines while the rest of the Defs are clobbers.
let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
@@ -3426,6 +3529,12 @@
"#ADDItlsldL32",
[(set i32:$rD,
(PPCaddiTlsldL i32:$reg, tglobaltlsaddr:$disp))]>;
+// This pseudo is expanded to two copies that put the variable offset in R4 and
+// the region handle in R3, followed by GETtlsADDR32AIX.
+def TLSGDAIX : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$offset, gprc:$handle),
+ "#TLSGDAIX",
+ [(set i32:$rD,
+ (PPCTlsgdAIX i32:$offset, i32:$handle))]>;
// LR is a true define, while the rest of the Defs are clobbers. R3 is
// explicitly defined when this op is created, so not mentioned here.
let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
@@ -3469,6 +3578,11 @@
"#ADDIStocHA",
[(set i32:$rD,
(PPCtoc_entry i32:$reg, tglobaladdr:$disp))]>;
+// Local Data Transform
+def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
+ "#ADDItoc",
+ [(set i32:$rD,
+ (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
// Get Global (GOT) Base Register offset, from the word immediately preceding
// the function label.
@@ -3490,27 +3604,27 @@
def : Pat<(shl i32:$rS, i32:$rB),
(SLW $rS, $rB)>;
-def : Pat<(i32 (zextloadi1 iaddr:$src)),
- (LBZ iaddr:$src)>;
-def : Pat<(i32 (zextloadi1 xaddr:$src)),
- (LBZX xaddr:$src)>;
-def : Pat<(i32 (extloadi1 iaddr:$src)),
- (LBZ iaddr:$src)>;
-def : Pat<(i32 (extloadi1 xaddr:$src)),
- (LBZX xaddr:$src)>;
-def : Pat<(i32 (extloadi8 iaddr:$src)),
- (LBZ iaddr:$src)>;
-def : Pat<(i32 (extloadi8 xaddr:$src)),
- (LBZX xaddr:$src)>;
-def : Pat<(i32 (extloadi16 iaddr:$src)),
- (LHZ iaddr:$src)>;
-def : Pat<(i32 (extloadi16 xaddr:$src)),
- (LHZX xaddr:$src)>;
+def : Pat<(i32 (zextloadi1 DForm:$src)),
+ (LBZ DForm:$src)>;
+def : Pat<(i32 (zextloadi1 XForm:$src)),
+ (LBZX XForm:$src)>;
+def : Pat<(i32 (extloadi1 DForm:$src)),
+ (LBZ DForm:$src)>;
+def : Pat<(i32 (extloadi1 XForm:$src)),
+ (LBZX XForm:$src)>;
+def : Pat<(i32 (extloadi8 DForm:$src)),
+ (LBZ DForm:$src)>;
+def : Pat<(i32 (extloadi8 XForm:$src)),
+ (LBZX XForm:$src)>;
+def : Pat<(i32 (extloadi16 DForm:$src)),
+ (LHZ DForm:$src)>;
+def : Pat<(i32 (extloadi16 XForm:$src)),
+ (LHZX XForm:$src)>;
let Predicates = [HasFPU] in {
-def : Pat<(f64 (extloadf32 iaddr:$src)),
- (COPY_TO_REGCLASS (LFS iaddr:$src), F8RC)>;
-def : Pat<(f64 (extloadf32 xaddr:$src)),
- (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>;
+def : Pat<(f64 (extloadf32 DForm:$src)),
+ (COPY_TO_REGCLASS (LFS DForm:$src), F8RC)>;
+def : Pat<(f64 (extloadf32 XForm:$src)),
+ (COPY_TO_REGCLASS (LFSX XForm:$src), F8RC)>;
def : Pat<(f64 (any_fpextend f32:$src)),
(COPY_TO_REGCLASS $src, F8RC)>;
@@ -3548,6 +3662,16 @@
(FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>;
}
+// XL Compat intrinsics.
+def : Pat<(int_ppc_fmsub f64:$A, f64:$B, f64:$C), (FMSUB $A, $B, $C)>;
+def : Pat<(int_ppc_fmsubs f32:$A, f32:$B, f32:$C), (FMSUBS $A, $B, $C)>;
+def : Pat<(int_ppc_fnmsub f64:$A, f64:$B, f64:$C), (FNMSUB $A, $B, $C)>;
+def : Pat<(int_ppc_fnmsubs f32:$A, f32:$B, f32:$C), (FNMSUBS $A, $B, $C)>;
+def : Pat<(int_ppc_fnmadd f64:$A, f64:$B, f64:$C), (FNMADD $A, $B, $C)>;
+def : Pat<(int_ppc_fnmadds f32:$A, f32:$B, f32:$C), (FNMADDS $A, $B, $C)>;
+def : Pat<(int_ppc_fre f64:$A), (FRE $A)>;
+def : Pat<(int_ppc_fres f32:$A), (FRES $A)>;
+
include "PPCInstrAltivec.td"
include "PPCInstrSPE.td"
include "PPCInstr64Bit.td"
@@ -3995,7 +4119,7 @@
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
}
-multiclass FSetCCPat<SDNode SetCC, ValueType Ty, PatLeaf FCmp> {
+multiclass FSetCCPat<SDPatternOperator SetCC, ValueType Ty, I FCmp> {
defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)),
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)),
@@ -4055,57 +4179,57 @@
// after the inclusion of the instruction sets.
let Predicates = [HasSPE] in {
// SETCC for f32.
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
+def : Pat<(i1 (any_fsetccs f32:$s1, f32:$s2, SETOLT)),
(EXTRACT_SUBREG (EFSCMPLT $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)),
+def : Pat<(i1 (any_fsetccs f32:$s1, f32:$s2, SETLT)),
(EXTRACT_SUBREG (EFSCMPLT $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)),
+def : Pat<(i1 (any_fsetccs f32:$s1, f32:$s2, SETOGT)),
(EXTRACT_SUBREG (EFSCMPGT $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)),
+def : Pat<(i1 (any_fsetccs f32:$s1, f32:$s2, SETGT)),
(EXTRACT_SUBREG (EFSCMPGT $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)),
+def : Pat<(i1 (any_fsetccs f32:$s1, f32:$s2, SETOEQ)),
(EXTRACT_SUBREG (EFSCMPEQ $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)),
+def : Pat<(i1 (any_fsetccs f32:$s1, f32:$s2, SETEQ)),
(EXTRACT_SUBREG (EFSCMPEQ $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)),
+defm : CRNotPat<(i1 (any_fsetccs f32:$s1, f32:$s2, SETUGE)),
(EXTRACT_SUBREG (EFSCMPLT $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)),
+defm : CRNotPat<(i1 (any_fsetccs f32:$s1, f32:$s2, SETGE)),
(EXTRACT_SUBREG (EFSCMPLT $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)),
+defm : CRNotPat<(i1 (any_fsetccs f32:$s1, f32:$s2, SETULE)),
(EXTRACT_SUBREG (EFSCMPGT $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)),
+defm : CRNotPat<(i1 (any_fsetccs f32:$s1, f32:$s2, SETLE)),
(EXTRACT_SUBREG (EFSCMPGT $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)),
+defm : CRNotPat<(i1 (any_fsetccs f32:$s1, f32:$s2, SETUNE)),
(EXTRACT_SUBREG (EFSCMPEQ $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)),
+defm : CRNotPat<(i1 (any_fsetccs f32:$s1, f32:$s2, SETNE)),
(EXTRACT_SUBREG (EFSCMPEQ $s1, $s2), sub_gt)>;
// SETCC for f64.
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)),
+def : Pat<(i1 (any_fsetccs f64:$s1, f64:$s2, SETOLT)),
(EXTRACT_SUBREG (EFDCMPLT $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)),
+def : Pat<(i1 (any_fsetccs f64:$s1, f64:$s2, SETLT)),
(EXTRACT_SUBREG (EFDCMPLT $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)),
+def : Pat<(i1 (any_fsetccs f64:$s1, f64:$s2, SETOGT)),
(EXTRACT_SUBREG (EFDCMPGT $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)),
+def : Pat<(i1 (any_fsetccs f64:$s1, f64:$s2, SETGT)),
(EXTRACT_SUBREG (EFDCMPGT $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)),
+def : Pat<(i1 (any_fsetccs f64:$s1, f64:$s2, SETOEQ)),
(EXTRACT_SUBREG (EFDCMPEQ $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)),
+def : Pat<(i1 (any_fsetccs f64:$s1, f64:$s2, SETEQ)),
(EXTRACT_SUBREG (EFDCMPEQ $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)),
+defm : CRNotPat<(i1 (any_fsetccs f64:$s1, f64:$s2, SETUGE)),
(EXTRACT_SUBREG (EFDCMPLT $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)),
+defm : CRNotPat<(i1 (any_fsetccs f64:$s1, f64:$s2, SETGE)),
(EXTRACT_SUBREG (EFDCMPLT $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)),
+defm : CRNotPat<(i1 (any_fsetccs f64:$s1, f64:$s2, SETULE)),
(EXTRACT_SUBREG (EFDCMPGT $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)),
+defm : CRNotPat<(i1 (any_fsetccs f64:$s1, f64:$s2, SETLE)),
(EXTRACT_SUBREG (EFDCMPGT $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)),
+defm : CRNotPat<(i1 (any_fsetccs f64:$s1, f64:$s2, SETUNE)),
(EXTRACT_SUBREG (EFDCMPEQ $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
+defm : CRNotPat<(i1 (any_fsetccs f64:$s1, f64:$s2, SETNE)),
(EXTRACT_SUBREG (EFDCMPEQ $s1, $s2), sub_gt)>;
}
// match select on i1 variables:
@@ -4295,6 +4419,10 @@
def : Pat<(i1 (not (trunc i64:$in))),
(ANDI_rec_1_EQ_BIT8 $in)>;
+def : Pat<(int_ppc_fsel f8rc:$FRA, f8rc:$FRC, f8rc:$FRB), (FSELD $FRA, $FRC, $FRB)>;
+def : Pat<(int_ppc_frsqrte f8rc:$frB), (FRSQRTE $frB)>;
+def : Pat<(int_ppc_frsqrtes f4rc:$frB), (FRSQRTES $frB)>;
+
//===----------------------------------------------------------------------===//
// PowerPC Instructions used for assembler/disassembler only
//
@@ -4370,19 +4498,20 @@
def MCRFS : XLForm_3<63, 64, (outs crrc:$BF), (ins crrc:$BFA),
"mcrfs $BF, $BFA", IIC_BrMCR>;
-// If W is 0 and BF is 7, the 60:63 bits will be set, we should set the
-// implicit-def RM.
-def MTFSFI : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
- "mtfsfi $BF, $U, $W", IIC_IntMFFS>;
-let Defs = [CR1] in
-def MTFSFI_rec : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
- "mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isRecordForm;
-
-def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>;
-def : InstAlias<"mtfsfi. $BF, $U", (MTFSFI_rec crrc:$BF, i32imm:$U, 0)>;
-
+// All MTFSF variants may change the rounding mode, so conservatively set it
+// as an implicit def for all of them.
let Predicates = [HasFPU] in {
let Defs = [RM] in {
+let isCodeGenOnly = 1,
+ Pattern = [(int_ppc_mtfsfi timm:$BF, timm:$U)], W = 0 in
+def MTFSFIb : XLForm_4<63, 134, (outs), (ins u3imm:$BF, u4imm:$U),
+ "mtfsfi $BF, $U", IIC_IntMFFS>;
+def MTFSFI : XLForm_4<63, 134, (outs), (ins u3imm:$BF, u4imm:$U, i32imm:$W),
+ "mtfsfi $BF, $U, $W", IIC_IntMFFS>;
+let Defs = [CR1] in
+def MTFSFI_rec : XLForm_4<63, 134, (outs), (ins u3imm:$BF, u4imm:$U, u1imm:$W),
+ "mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isRecordForm;
+
def MTFSF : XFLForm_1<63, 711, (outs),
(ins i32imm:$FLM, f8rc:$FRB, u1imm:$L, i32imm:$W),
"mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>;
@@ -4392,6 +4521,8 @@
"mtfsf. $FLM, $FRB, $L, $W", IIC_IntMFFS, []>, isRecordForm;
}
+def : InstAlias<"mtfsfi $BF, $U", (MTFSFI u3imm:$BF, u4imm:$U, 0)>;
+def : InstAlias<"mtfsfi. $BF, $U", (MTFSFI_rec u3imm:$BF, u4imm:$U, 0)>;
def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>;
def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSF_rec i32imm:$FLM, f8rc:$FRB, 0, 0)>;
}
@@ -4638,6 +4769,12 @@
def : InstAlias<"xnop", (XORI R0, R0, 0)>;
+def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>;
+def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>;
+
+// Disable these aliases on AIX for now because the assembler does not support them.
+let Predicates = [ModernAs] in {
+
foreach BR = 0-7 in {
def : InstAlias<"mfbr"#BR#" $Rx",
(MFDCR gprc:$Rx, !add(BR, 0x80))>,
@@ -4649,15 +4786,8 @@
def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>;
def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>;
-
-def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>;
-def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>;
-
-//Disable this alias on AIX for now because as does not support them.
-let Predicates = [ModernAs] in {
def : InstAlias<"mtudscr $Rx", (MTSPR 3, gprc:$Rx)>;
def : InstAlias<"mfudscr $Rx", (MFSPR gprc:$Rx, 3)>;
-}
def : InstAlias<"mfrtcu $Rx", (MFSPR gprc:$Rx, 4)>;
def : InstAlias<"mfrtcl $Rx", (MFSPR gprc:$Rx, 5)>;
@@ -4787,6 +4917,7 @@
def : InstAlias<"mticcr $Rx", (MTSPR 1019, gprc:$Rx)>, Requires<[IsPPC4xx]>;
def : InstAlias<"mficcr $Rx", (MFSPR gprc:$Rx, 1019)>, Requires<[IsPPC4xx]>;
+}
def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>;
@@ -5127,20 +5258,20 @@
defm : TrapExtendedMnemonic<"u", 31>;
// Atomic loads
-def : Pat<(atomic_load_8 iaddr:$src), (LBZ memri:$src)>;
-def : Pat<(atomic_load_16 iaddr:$src), (LHZ memri:$src)>;
-def : Pat<(atomic_load_32 iaddr:$src), (LWZ memri:$src)>;
-def : Pat<(atomic_load_8 xaddr:$src), (LBZX memrr:$src)>;
-def : Pat<(atomic_load_16 xaddr:$src), (LHZX memrr:$src)>;
-def : Pat<(atomic_load_32 xaddr:$src), (LWZX memrr:$src)>;
+def : Pat<(atomic_load_8 DForm:$src), (LBZ memri:$src)>;
+def : Pat<(atomic_load_16 DForm:$src), (LHZ memri:$src)>;
+def : Pat<(atomic_load_32 DForm:$src), (LWZ memri:$src)>;
+def : Pat<(atomic_load_8 XForm:$src), (LBZX memrr:$src)>;
+def : Pat<(atomic_load_16 XForm:$src), (LHZX memrr:$src)>;
+def : Pat<(atomic_load_32 XForm:$src), (LWZX memrr:$src)>;
// Atomic stores
-def : Pat<(atomic_store_8 iaddr:$ptr, i32:$val), (STB gprc:$val, memri:$ptr)>;
-def : Pat<(atomic_store_16 iaddr:$ptr, i32:$val), (STH gprc:$val, memri:$ptr)>;
-def : Pat<(atomic_store_32 iaddr:$ptr, i32:$val), (STW gprc:$val, memri:$ptr)>;
-def : Pat<(atomic_store_8 xaddr:$ptr, i32:$val), (STBX gprc:$val, memrr:$ptr)>;
-def : Pat<(atomic_store_16 xaddr:$ptr, i32:$val), (STHX gprc:$val, memrr:$ptr)>;
-def : Pat<(atomic_store_32 xaddr:$ptr, i32:$val), (STWX gprc:$val, memrr:$ptr)>;
+def : Pat<(atomic_store_8 DForm:$ptr, i32:$val), (STB gprc:$val, memri:$ptr)>;
+def : Pat<(atomic_store_16 DForm:$ptr, i32:$val), (STH gprc:$val, memri:$ptr)>;
+def : Pat<(atomic_store_32 DForm:$ptr, i32:$val), (STW gprc:$val, memri:$ptr)>;
+def : Pat<(atomic_store_8 XForm:$ptr, i32:$val), (STBX gprc:$val, memrr:$ptr)>;
+def : Pat<(atomic_store_16 XForm:$ptr, i32:$val), (STHX gprc:$val, memrr:$ptr)>;
+def : Pat<(atomic_store_32 XForm:$ptr, i32:$val), (STWX gprc:$val, memrr:$ptr)>;
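The six patterns above map word-and-smaller atomic loads and stores directly onto the ordinary D-form and X-form memory instructions; naturally aligned accesses of these widths are single-copy atomic on PowerPC, and any fences needed for stronger orderings are added separately. As a rough illustration (not part of the patch; the function names are mine), source like the following is the kind of code that can be selected through these patterns when no ordering fence is required:

```cpp
#include <atomic>

// Relaxed 32-bit atomic accesses carry no ordering requirement, so they can
// lower to the plain lwz/stw (or lwzx/stwx) selected by the patterns above.
int loadRelaxed(const std::atomic<int> &A) {
  return A.load(std::memory_order_relaxed);
}

void storeRelaxed(std::atomic<int> &A, int V) {
  A.store(V, std::memory_order_relaxed);
}
```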
let Predicates = [IsISA3_0] in {
@@ -5148,21 +5279,13 @@
// We prefix 'CP' to COPY due to name conflict in Target.td. We also prefix to
// PASTE for naming consistency.
let mayLoad = 1 in
-def CP_COPY : X_L1_RA5_RB5<31, 774, "copy" , gprc, IIC_LdStCOPY, []>;
-
-let mayStore = 1 in
-def CP_PASTE : X_L1_RA5_RB5<31, 902, "paste" , gprc, IIC_LdStPASTE, []>;
+def CP_COPY : X_RA5_RB5<31, 774, "copy" , gprc, IIC_LdStCOPY, []>;
let mayStore = 1, Defs = [CR0] in
def CP_PASTE_rec : X_L1_RA5_RB5<31, 902, "paste.", gprc, IIC_LdStPASTE, []>, isRecordForm;
-def CP_COPYx : PPCAsmPseudo<"copy $rA, $rB" , (ins gprc:$rA, gprc:$rB)>;
-def CP_PASTEx : PPCAsmPseudo<"paste $rA, $rB", (ins gprc:$rA, gprc:$rB)>;
-def CP_COPY_FIRST : PPCAsmPseudo<"copy_first $rA, $rB",
- (ins gprc:$rA, gprc:$rB)>;
-def CP_PASTE_LAST : PPCAsmPseudo<"paste_last $rA, $rB",
- (ins gprc:$rA, gprc:$rB)>;
-def CP_ABORT : XForm_0<31, 838, (outs), (ins), "cp_abort", IIC_SprABORT, []>;
+def : InstAlias<"paste. $RA, $RB", (CP_PASTE_rec gprc:$RA, gprc:$RB, 1)>;
+def CP_ABORT : XForm_0<31, 838, (outs), (ins), "cpabort", IIC_SprABORT, []>;
// Message Synchronize
def MSGSYNC : XForm_0<31, 886, (outs), (ins), "msgsync", IIC_SprMSGSYNC, []>;
@@ -5170,8 +5293,31 @@
// Power-Saving Mode Instruction:
def STOP : XForm_0<19, 370, (outs), (ins), "stop", IIC_SprSTOP, []>;
+def SETB : XForm_44<31, 128, (outs gprc:$RT), (ins crrc:$BFA),
+ "setb $RT, $BFA", IIC_IntGeneral>;
} // IsISA3_0
+let Predicates = [IsISA3_0] in {
+def : Pat<(i32 (int_ppc_cmprb i32:$a, gprc:$b, gprc:$c)),
+ (i32 (SETB (CMPRB u1imm:$a, $b, $c)))>;
+}
+def : Pat<(i32 (int_ppc_mulhw gprc:$a, gprc:$b)),
+ (i32 (MULHW $a, $b))>;
+def : Pat<(i32 (int_ppc_mulhwu gprc:$a, gprc:$b)),
+ (i32 (MULHWU $a, $b))>;
+def : Pat<(i32 (int_ppc_cmpb gprc:$a, gprc:$b)),
+ (i32 (CMPB $a, $b))>;
+
+def : Pat<(int_ppc_load2r ForceXForm:$ptr),
+ (LHBRX ForceXForm:$ptr)>;
+def : Pat<(int_ppc_load4r ForceXForm:$ptr),
+ (LWBRX ForceXForm:$ptr)>;
+def : Pat<(int_ppc_store2r gprc:$a, ForceXForm:$ptr),
+ (STHBRX gprc:$a, ForceXForm:$ptr)>;
+def : Pat<(int_ppc_store4r gprc:$a, ForceXForm:$ptr),
+ (STWBRX gprc:$a, ForceXForm:$ptr)>;
+
+
// Fast 32-bit reverse bits algorithm:
// Step 1: 1-bit swap (swap odd 1-bit and even 1-bit):
// n = ((n >> 1) & 0x55555555) | ((n << 1) & 0xAAAAAAAA);
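The comment above gives step 1 of the classic swap-based bit-reversal sequence that the subsequent defs implement with rotate-and-mask instructions. A scalar C++ rendering of the full algorithm, for reference only (the function name is mine, not part of the patch):

```cpp
#include <cstdint>
#include <cstdio>

// Swap-based 32-bit bit reversal: odd/even bits, then 2-bit groups, nibbles,
// bytes, and finally halfwords.
static uint32_t reverseBits32(uint32_t n) {
  n = ((n >> 1) & 0x55555555u) | ((n << 1) & 0xAAAAAAAAu);  // 1-bit swap
  n = ((n >> 2) & 0x33333333u) | ((n << 2) & 0xCCCCCCCCu);  // 2-bit swap
  n = ((n >> 4) & 0x0F0F0F0Fu) | ((n << 4) & 0xF0F0F0F0u);  // nibble swap
  n = ((n >> 8) & 0x00FF00FFu) | ((n << 8) & 0xFF00FF00u);  // byte swap
  return (n >> 16) | (n << 16);                             // halfword swap
}

int main() {
  std::printf("%08x\n", reverseBits32(0x00000001u));  // 80000000
  std::printf("%08x\n", reverseBits32(0x12345678u));  // 1e6a2c48
  return 0;
}
```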
@@ -5324,3 +5470,50 @@
// swap the high word and low word.
def : Pat<(i64 (bitreverse i64:$A)),
(OR8 (RLDICR DWBytes7654.DWord, 32, 31), DWBytes3210.DWord)>;
+
+def : Pat<(int_ppc_stwcx ForceXForm:$dst, gprc:$A),
+ (STWCX gprc:$A, ForceXForm:$dst)>;
+def : Pat<(int_ppc_stbcx ForceXForm:$dst, gprc:$A),
+ (STBCX gprc:$A, ForceXForm:$dst)>;
+def : Pat<(int_ppc_tw gprc:$A, gprc:$B, i32:$IMM),
+ (TW $IMM, $A, $B)>;
+def : Pat<(int_ppc_trap gprc:$A),
+ (TWI 24, $A, 0)>;
+
+def : Pat<(int_ppc_fcfid f64:$A),
+ (XSCVSXDDP $A)>;
+def : Pat<(int_ppc_fcfud f64:$A),
+ (XSCVUXDDP $A)>;
+def : Pat<(int_ppc_fctid f64:$A),
+ (FCTID $A)>;
+def : Pat<(int_ppc_fctidz f64:$A),
+ (XSCVDPSXDS $A)>;
+def : Pat<(int_ppc_fctiw f64:$A),
+ (FCTIW $A)>;
+def : Pat<(int_ppc_fctiwz f64:$A),
+ (XSCVDPSXWS $A)>;
+def : Pat<(int_ppc_fctudz f64:$A),
+ (XSCVDPUXDS $A)>;
+def : Pat<(int_ppc_fctuwz f64:$A),
+ (XSCVDPUXWS $A)>;
+
+def : Pat<(int_ppc_mfmsr), (MFMSR)>;
+def : Pat<(int_ppc_mftbu), (MFTB 269)>;
+def : Pat<(i32 (int_ppc_mfspr timm:$SPR)),
+ (MFSPR $SPR)>;
+def : Pat<(int_ppc_mtspr timm:$SPR, gprc:$RT),
+ (MTSPR $SPR, $RT)>;
+def : Pat<(int_ppc_mtmsr gprc:$RS),
+ (MTMSR $RS, 0)>;
+
+let Predicates = [IsISA2_07] in {
+ def : Pat<(int_ppc_sthcx ForceXForm:$dst, gprc:$A),
+ (STHCX gprc:$A, ForceXForm:$dst)>;
+}
+def : Pat<(int_ppc_dcbtstt ForceXForm:$dst),
+ (DCBTST 16, ForceXForm:$dst)>;
+def : Pat<(int_ppc_dcbtt ForceXForm:$dst),
+ (DCBT 16, ForceXForm:$dst)>;
+
+def : Pat<(int_ppc_stfiw ForceXForm:$dst, f64:$XT),
+ (STFIWX f64:$XT, ForceXForm:$dst)>;
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
index b9eb3b3..b183dbd 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -1,3 +1,6 @@
+//-------------------------- Predicate definitions ---------------------------//
+def IsPPC32 : Predicate<"!Subtarget->isPPC64()">;
+
// Mask immediates for MMA instructions (2, 4 and 8 bits).
def Msk2Imm : ImmLeaf<i32, [{ return isUInt<2>(Imm); }]>;
def Msk4Imm : ImmLeaf<i32, [{ return isUInt<4>(Imm); }]>;
@@ -18,14 +21,17 @@
SDTCisVT<0, v256i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>
]>;
def SDT_PPCAccExtractVsx : SDTypeProfile<1, 2, [
- SDTCisVT<0, v4i32>, SDTCisVT<1, v512i1>, SDTCisInt<2>
+ SDTCisVT<0, v4i32>, SDTCisVT<1, v512i1>, SDTCisPtrTy<2>
]>;
def SDT_PPCPairExtractVsx : SDTypeProfile<1, 2, [
- SDTCisVT<0, v4i32>, SDTCisVT<1, v256i1>, SDTCisInt<2>
+ SDTCisVT<0, v4i32>, SDTCisVT<1, v256i1>, SDTCisPtrTy<2>
]>;
def SDT_PPCxxmfacc : SDTypeProfile<1, 1, [
SDTCisVT<0, v512i1>, SDTCisVT<1, v512i1>
]>;
+def SDT_PPCVecInsertElt : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<3>
+]>;
//===----------------------------------------------------------------------===//
// ISA 3.1 specific PPCISD nodes.
@@ -39,6 +45,7 @@
def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx,
[]>;
def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>;
+def PPCvecinsertelt : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsertElt, []>;
//===----------------------------------------------------------------------===//
@@ -1604,13 +1611,13 @@
v16i8:$vs3, v16i8:$vs2)),
(XXMTACC Concats.VecsToVecQuad)>;
def : Pat<(v512i1 (PPCxxmfacc v512i1:$AS)), (XXMFACC acc:$AS)>;
- def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 0))),
+ def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 0)),
Extracts.Vec0>;
- def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 1))),
+ def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 1)),
Extracts.Vec1>;
- def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 2))),
+ def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 2)),
Extracts.Vec2>;
- def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 3))),
+ def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 3)),
Extracts.Vec3>;
}
@@ -1619,9 +1626,9 @@
Concats.VecsToVecPair0>;
def : Pat<(v256i1 (int_ppc_vsx_assemble_pair v16i8:$vs1, v16i8:$vs0)),
Concats.VecsToVecPair0>;
- def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 0))),
+ def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, 0)),
(v4i32 (EXTRACT_SUBREG $v, sub_vsx0))>;
- def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 1))),
+ def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, 1)),
(v4i32 (EXTRACT_SUBREG $v, sub_vsx1))>;
}
@@ -1687,6 +1694,10 @@
// be removed.
let Predicates = [PCRelativeMemops], AddedComplexity = 500 in {
// Load i32
+ def : Pat<(i32 (zextloadi1 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLBZpc $ga, 0)>;
+ def : Pat<(i32 (extloadi1 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLBZpc $ga, 0)>;
def : Pat<(i32 (zextloadi8 (PPCmatpcreladdr pcreladdr:$ga))),
(PLBZpc $ga, 0)>;
def : Pat<(i32 (extloadi8 (PPCmatpcreladdr pcreladdr:$ga))),
@@ -1708,6 +1719,10 @@
(PSTWpc $RS, $ga, 0)>;
// Load i64
+ def : Pat<(i64 (zextloadi1 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLBZ8pc $ga, 0)>;
+ def : Pat<(i64 (extloadi1 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLBZ8pc $ga, 0)>;
def : Pat<(i64 (zextloadi8 (PPCmatpcreladdr pcreladdr:$ga))),
(PLBZ8pc $ga, 0)>;
def : Pat<(i64 (extloadi8 (PPCmatpcreladdr pcreladdr:$ga))),
@@ -1855,14 +1870,6 @@
"xxspltidp $XT, $IMM32", IIC_VecGeneral,
[(set v2f64:$XT,
(PPCxxspltidp i32:$IMM32))]>;
- def XXSPLTI32DX :
- 8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT),
- (ins vsrc:$XTi, u1imm:$IX, i32imm:$IMM32),
- "xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral,
- [(set v2i64:$XT,
- (PPCxxsplti32dx v2i64:$XTi, i32:$IX,
- i32:$IMM32))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
def XXPERMX :
8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
vsrc:$XC, u3imm:$UIM),
@@ -1886,6 +1893,19 @@
IIC_VecGeneral, []>;
}
+// XXSPLTI32DX needs extra flags to make sure the compiler does not attempt
+// to spill part of the instruction when the values are similar.
+let isReMaterializable = 1, isMoveImm = 1, Predicates = [PrefixInstrs] in {
+ def XXSPLTI32DX :
+ 8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT),
+ (ins vsrc:$XTi, u1imm:$IX, i32imm:$IMM32),
+ "xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral,
+ [(set v2i64:$XT,
+ (PPCxxsplti32dx v2i64:$XTi, i32:$IX,
+ i32:$IMM32))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+}
+
let Predicates = [IsISA3_1] in {
def SETBC : XForm_XT5_BI5<31, 384, (outs gprc:$RT), (ins crbitrc:$BI),
"setbc $RT, $BI", IIC_IntCompare, []>;
@@ -2417,9 +2437,9 @@
(SETBCR8 result)>;
}
-multiclass IntSetP10RevSetBool<SDNode SetCC, ValueType Ty, ImmLeaf ZExtTy,
- ImmLeaf SExtTy, PatLeaf Cmpi, PatLeaf Cmpli,
- PatLeaf Cmp, PatLeaf Cmpl> {
+multiclass IntSetP10RevSetBool<SDNode SetCC, ValueType Ty, PatLeaf ZExtTy,
+ ImmLeaf SExtTy, I Cmpi, I Cmpli,
+ I Cmp, I Cmpl> {
defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)),
(EXTRACT_SUBREG (Cmpl $s1, $s2), sub_lt)>;
defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)),
@@ -2445,7 +2465,7 @@
(EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_eq)>;
}
-multiclass FSetP10RevSetBool<SDNode SetCC, ValueType Ty, PatLeaf FCmp> {
+multiclass FSetP10RevSetBool<SDNode SetCC, ValueType Ty, I FCmp> {
defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)),
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)),
@@ -2522,14 +2542,14 @@
def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, 0)),
(EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_eq)>;
- def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 8)),
- (v1i128 (COPY_TO_REGCLASS (LXVRBX xoaddr:$src), VRRC))>;
- def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 16)),
- (v1i128 (COPY_TO_REGCLASS (LXVRHX xoaddr:$src), VRRC))>;
- def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 32)),
- (v1i128 (COPY_TO_REGCLASS (LXVRWX xoaddr:$src), VRRC))>;
- def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 64)),
- (v1i128 (COPY_TO_REGCLASS (LXVRDX xoaddr:$src), VRRC))>;
+ def : Pat <(v1i128 (PPClxvrzx ForceXForm:$src, 8)),
+ (v1i128 (COPY_TO_REGCLASS (LXVRBX ForceXForm:$src), VRRC))>;
+ def : Pat <(v1i128 (PPClxvrzx ForceXForm:$src, 16)),
+ (v1i128 (COPY_TO_REGCLASS (LXVRHX ForceXForm:$src), VRRC))>;
+ def : Pat <(v1i128 (PPClxvrzx ForceXForm:$src, 32)),
+ (v1i128 (COPY_TO_REGCLASS (LXVRWX ForceXForm:$src), VRRC))>;
+ def : Pat <(v1i128 (PPClxvrzx ForceXForm:$src, 64)),
+ (v1i128 (COPY_TO_REGCLASS (LXVRDX ForceXForm:$src), VRRC))>;
def : Pat<(v1i128 (rotl v1i128:$vA, v1i128:$vB)),
(v1i128 (VRLQ v1i128:$vA, v1i128:$vB))>;
@@ -2547,18 +2567,23 @@
let AddedComplexity = 400, Predicates = [IsISA3_1, IsLittleEndian] in {
// Store element 0 of a VSX register to memory
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$src, 0)), xoaddr:$dst),
- (STXVRBX (COPY_TO_REGCLASS v16i8:$src, VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$src, 0)), xoaddr:$dst),
- (STXVRHX (COPY_TO_REGCLASS v8i16:$src, VSRC), xoaddr:$dst)>;
- def : Pat<(store (i32 (extractelt v4i32:$src, 0)), xoaddr:$dst),
- (STXVRWX $src, xoaddr:$dst)>;
- def : Pat<(store (f32 (extractelt v4f32:$src, 0)), xoaddr:$dst),
- (STXVRWX $src, xoaddr:$dst)>;
- def : Pat<(store (i64 (extractelt v2i64:$src, 0)), xoaddr:$dst),
- (STXVRDX $src, xoaddr:$dst)>;
- def : Pat<(store (f64 (extractelt v2f64:$src, 0)), xoaddr:$dst),
- (STXVRDX $src, xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$src, 0)), ForceXForm:$dst),
+ (STXVRBX (COPY_TO_REGCLASS v16i8:$src, VSRC), ForceXForm:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$src, 0)), ForceXForm:$dst),
+ (STXVRHX (COPY_TO_REGCLASS v8i16:$src, VSRC), ForceXForm:$dst)>;
+ def : Pat<(store (i32 (extractelt v4i32:$src, 0)), ForceXForm:$dst),
+ (STXVRWX $src, ForceXForm:$dst)>;
+ def : Pat<(store (f32 (extractelt v4f32:$src, 0)), ForceXForm:$dst),
+ (STXVRWX $src, ForceXForm:$dst)>;
+ def : Pat<(store (i64 (extractelt v2i64:$src, 0)), ForceXForm:$dst),
+ (STXVRDX $src, ForceXForm:$dst)>;
+ def : Pat<(store (f64 (extractelt v2f64:$src, 0)), ForceXForm:$dst),
+ (STXVRDX $src, ForceXForm:$dst)>;
+ // Load element 0 of a VSX register from memory
+ def : Pat<(v8i16 (scalar_to_vector (i32 (extloadi16 ForceXForm:$src)))),
+ (v8i16 (COPY_TO_REGCLASS (LXVRHX ForceXForm:$src), VSRC))>;
+ def : Pat<(v16i8 (scalar_to_vector (i32 (extloadi8 ForceXForm:$src)))),
+ (v16i8 (COPY_TO_REGCLASS (LXVRBX ForceXForm:$src), VSRC))>;
}
// FIXME: The swap is overkill when the shift amount is a constant.
@@ -2606,6 +2631,19 @@
(COPY_TO_REGCLASS (XXSPLTIDP (getFPAs32BitInt fpimm:$A)),
VSFRC)>;
+// Replace constant-pool loads with XXSPLTI32DX for scalar FP constants.
+def : Pat<(f32 nzFPImmAsi64:$A),
+ (COPY_TO_REGCLASS (XXSPLTI32DX (XXSPLTI32DX(IMPLICIT_DEF), 0,
+ (getFPAs64BitIntHi $A)),
+ 1, (getFPAs64BitIntLo $A)),
+ VSSRC)>;
+
+def : Pat<(f64 nzFPImmAsi64:$A),
+ (COPY_TO_REGCLASS (XXSPLTI32DX (XXSPLTI32DX (IMPLICIT_DEF), 0,
+ (getFPAs64BitIntHi $A)),
+ 1, (getFPAs64BitIntLo $A)),
+ VSFRC)>;
+
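The two patterns just above materialize a non-zero scalar FP constant with a pair of XXSPLTI32DX instructions instead of a constant-pool load: one instruction supplies the high 32 bits of the IEEE-754 bit pattern, the other the low 32 bits. A minimal C++ sketch of the split that, as I read them, getFPAs64BitIntHi/getFPAs64BitIntLo perform for the f64 case (the helper name is mine):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Split the bit pattern of a double into the two 32-bit immediates written by
// the XXSPLTI32DX pair (high half first, then low half).
static void splitDoubleBits(double D, uint32_t &Hi, uint32_t &Lo) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));  // bit-exact reinterpretation
  Hi = static_cast<uint32_t>(Bits >> 32);
  Lo = static_cast<uint32_t>(Bits);
}

int main() {
  uint32_t Hi, Lo;
  splitDoubleBits(1.0, Hi, Lo);
  std::printf("hi=0x%08x lo=0x%08x\n", Hi, Lo);  // hi=0x3ff00000 lo=0x00000000
  return 0;
}
```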
// Anonymous patterns for XXEVAL
// AND
// and(A, B, C)
@@ -2615,37 +2653,36 @@
// and(A, or(B, C))
def : xxevalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>;
// and(A, nor(B, C))
- def : xxevalPattern<(and v4i32:$vA, (vnot_ppc (or v4i32:$vB, v4i32:$vC))),
- 8>;
+ def : xxevalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>;
// and(A, eqv(B, C))
- def : xxevalPattern<(and v4i32:$vA, (vnot_ppc (xor v4i32:$vB, v4i32:$vC))),
- 9>;
+ def : xxevalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>;
// and(A, nand(B, C))
- def : xxevalPattern<(and v4i32:$vA, (vnot_ppc (and v4i32:$vB, v4i32:$vC))),
- 14>;
+ def : xxevalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>;
// NAND
// nand(A, B, C)
- def : xxevalPattern<(vnot_ppc (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))),
+ def : xxevalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))),
!sub(255, 1)>;
// nand(A, xor(B, C))
- def : xxevalPattern<(vnot_ppc (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))),
+ def : xxevalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))),
!sub(255, 6)>;
// nand(A, or(B, C))
- def : xxevalPattern<(vnot_ppc (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))),
+ def : xxevalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))),
!sub(255, 7)>;
// nand(A, nor(B, C))
- def : xxevalPattern<(or (vnot_ppc v4i32:$vA), (or v4i32:$vB, v4i32:$vC)),
+ def : xxevalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)),
!sub(255, 8)>;
// nand(A, eqv(B, C))
- def : xxevalPattern<(or (vnot_ppc v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)),
+ def : xxevalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)),
!sub(255, 9)>;
// nand(A, nand(B, C))
- def : xxevalPattern<(or (vnot_ppc v4i32:$vA), (and v4i32:$vB, v4i32:$vC)),
+ def : xxevalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)),
!sub(255, 14)>;
}
let Predicates = [PrefixInstrs] in {
+ def : Pat<(i32 imm34:$imm), (PLI (getImmAs64BitInt imm:$imm))>;
+ def : Pat<(i64 imm34:$imm), (PLI8 (getImmAs64BitInt imm:$imm))>;
def : Pat<(v16i8 (int_ppc_vsx_xxpermx v16i8:$A, v16i8:$B, v16i8:$C, timm:$D)),
(COPY_TO_REGCLASS (XXPERMX (COPY_TO_REGCLASS $A, VSRC),
(COPY_TO_REGCLASS $B, VSRC),
@@ -2666,3 +2703,136 @@
(XXBLENDVD $A, $B, $C)>;
}
+def InsertEltShift {
+ dag Sub32Left0 = (EXTRACT_SUBREG $rB, sub_32);
+ dag Sub32Left1 = (RLWINM (EXTRACT_SUBREG $rB, sub_32), 1, 0, 30);
+ dag Sub32Left2 = (RLWINM (EXTRACT_SUBREG $rB, sub_32), 2, 0, 29);
+ dag Left3 = (RLWINM8 $rB, 3, 0, 28);
+}
+
+let Predicates = [IsISA3_1, HasVSX, IsLittleEndian] in {
+ // Indexed vector insert element
+ def : Pat<(v16i8 (PPCvecinsertelt v16i8:$vDi, i32:$rA, i64:$rB)),
+ (VINSBRX $vDi, InsertEltShift.Sub32Left0, $rA)>;
+ def : Pat<(v8i16 (PPCvecinsertelt v8i16:$vDi, i32:$rA, i64:$rB)),
+ (VINSHRX $vDi, InsertEltShift.Sub32Left1, $rA)>;
+ def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, i64:$rB)),
+ (VINSWRX $vDi, InsertEltShift.Sub32Left2, $rA)>;
+ def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, i64:$rB)),
+ (VINSDRX $vDi, InsertEltShift.Left3, $rA)>;
+
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, f32:$A, i64:$rB)),
+ (VINSWRX $vDi, InsertEltShift.Sub32Left2, Bitcast.FltToInt)>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)),
+ (VINSWRX $vDi, InsertEltShift.Sub32Left2, (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)),
+ (VINSWRX $vDi, InsertEltShift.Sub32Left2, (PLWZ memri34:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)),
+ (VINSWRX $vDi, InsertEltShift.Sub32Left2, (LWZX memrr:$rA))>;
+
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, f64:$A, i64:$rB)),
+ (VINSDRX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>;
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)),
+ (VINSDRX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>;
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)),
+ (VINSDRX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>;
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)),
+ (VINSDRX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>;
+
+ // Immediate vector insert element
+ foreach i = [0, 1, 2, 3] in {
+ def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, (i64 i))),
+ (VINSW $vDi, !mul(!sub(3, i), 4), $rA)>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), (i64 i))),
+ (VINSW $vDi, !mul(!sub(3, i), 4), (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), (i64 i))),
+ (VINSW $vDi, !mul(!sub(3, i), 4), (PLWZ memri34:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), (i64 i))),
+ (VINSW $vDi, !mul(!sub(3, i), 4), (LWZX memrr:$rA))>;
+ }
+ foreach i = [0, 1] in
+ def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, (i64 i))),
+ (VINSD $vDi, !mul(!sub(1, i), 8), $rA)>;
+}
+
+let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC32] in {
+ // Indexed vector insert element
+ def : Pat<(v16i8 (PPCvecinsertelt v16i8:$vDi, i32:$rA, i32:$rB)),
+ (VINSBLX $vDi, $rB, $rA)>;
+ def : Pat<(v8i16 (PPCvecinsertelt v8i16:$vDi, i32:$rA, i32:$rB)),
+ (VINSHLX $vDi, $rB, $rA)>;
+ def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, i32:$rB)),
+ (VINSWLX $vDi, $rB, $rA)>;
+
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, f32:$A, i32:$rB)),
+ (VINSWLX $vDi, $rB, Bitcast.FltToInt)>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)),
+ i32:$rB)),
+ (VINSWLX $vDi, $rB, (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)),
+ i32:$rB)),
+ (VINSWLX $vDi, $rB, (PLWZ memri34:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)),
+ i32:$rB)),
+ (VINSWLX $vDi, $rB, (LWZX memrr:$rA))>;
+
+ // Immediate vector insert element
+ foreach i = [0, 1, 2, 3] in {
+ def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, (i32 i))),
+ (VINSW $vDi, !mul(i, 4), $rA)>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)),
+ (i32 i))),
+ (VINSW $vDi, !mul(i, 4), (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)),
+ (i32 i))),
+ (VINSW $vDi, !mul(i, 4), (PLWZ memri34:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)),
+ (i32 i))),
+ (VINSW $vDi, !mul(i, 4), (LWZX memrr:$rA))>;
+ }
+}
+
+let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC64] in {
+ // Indexed vector insert element
+ def : Pat<(v16i8 (PPCvecinsertelt v16i8:$vDi, i32:$rA, i64:$rB)),
+ (VINSBLX $vDi, InsertEltShift.Sub32Left0, $rA)>;
+ def : Pat<(v8i16 (PPCvecinsertelt v8i16:$vDi, i32:$rA, i64:$rB)),
+ (VINSHLX $vDi, InsertEltShift.Sub32Left1, $rA)>;
+ def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, i64:$rB)),
+ (VINSWLX $vDi, InsertEltShift.Sub32Left2, $rA)>;
+ def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, i64:$rB)),
+ (VINSDLX $vDi, InsertEltShift.Left3, $rA)>;
+
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, f32:$A, i64:$rB)),
+ (VINSWLX $vDi, InsertEltShift.Sub32Left2, Bitcast.FltToInt)>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)),
+ (VINSWLX $vDi, InsertEltShift.Sub32Left2, (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)),
+ (VINSWLX $vDi, InsertEltShift.Sub32Left2, (PLWZ memri34:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)),
+ (VINSWLX $vDi, InsertEltShift.Sub32Left2, (LWZX memrr:$rA))>;
+
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, f64:$A, i64:$rB)),
+ (VINSDLX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>;
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)),
+ (VINSDLX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>;
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)),
+ (VINSDLX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>;
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)),
+ (VINSDLX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>;
+
+ // Immediate vector insert element
+ foreach i = [0, 1, 2, 3] in {
+ def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, (i64 i))),
+ (VINSW $vDi, !mul(i, 4), $rA)>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), (i64 i))),
+ (VINSW $vDi, !mul(i, 4), (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), (i64 i))),
+ (VINSW $vDi, !mul(i, 4), (PLWZ memri34:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), (i64 i))),
+ (VINSW $vDi, !mul(i, 4), (LWZX memrr:$rA))>;
+ }
+ foreach i = [0, 1] in
+ def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, (i64 i))),
+ (VINSD $vDi, !mul(i, 8), $rA)>;
+}
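Across the insert-element blocks above, the immediate-index patterns convert a vector element index into the byte offset expected by VINSW/VINSD: !mul(i, 4) and !mul(i, 8) on big endian, and the mirrored !mul(!sub(3, i), 4) and !mul(!sub(1, i), 8) on little endian, since the instruction's offset is counted from the big-endian element 0 end of the register. A small scalar sketch of that mapping, under my reading of the patterns (the helper names are mine):

```cpp
#include <cassert>

// Element index -> byte offset for the VINSW/VINSD immediates. The offset is
// taken from the big-endian element 0 end, so little-endian indices are
// mirrored before scaling by the element size.
static unsigned vinswByteOffset(unsigned ElemIdx, bool IsLittleEndian) {
  return 4 * (IsLittleEndian ? 3 - ElemIdx : ElemIdx);  // v4i32, 4-byte elements
}

static unsigned vinsdByteOffset(unsigned ElemIdx, bool IsLittleEndian) {
  return 8 * (IsLittleEndian ? 1 - ElemIdx : ElemIdx);  // v2i64, 8-byte elements
}

int main() {
  assert(vinswByteOffset(0, /*IsLittleEndian=*/true) == 12);
  assert(vinswByteOffset(0, /*IsLittleEndian=*/false) == 0);
  assert(vinsdByteOffset(1, /*IsLittleEndian=*/true) == 0);
  return 0;
}
```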
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrSPE.td
index 299b34c..1e0cc7f 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrSPE.td
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrSPE.td
@@ -147,11 +147,11 @@
def EFDADD : EFXForm_1<736, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
"efdadd $RT, $RA, $RB", IIC_FPAddSub,
- [(set f64:$RT, (fadd f64:$RA, f64:$RB))]>;
+ [(set f64:$RT, (any_fadd f64:$RA, f64:$RB))]>;
def EFDCFS : EFXForm_2a<751, (outs sperc:$RT), (ins spe4rc:$RB),
"efdcfs $RT, $RB", IIC_FPDGeneral,
- [(set f64:$RT, (fpextend f32:$RB))]>;
+ [(set f64:$RT, (any_fpextend f32:$RB))]>;
def EFDCFSF : EFXForm_2a<755, (outs sperc:$RT), (ins spe4rc:$RB),
"efdcfsf $RT, $RB", IIC_FPDGeneral, []>;
@@ -216,11 +216,11 @@
def EFDDIV : EFXForm_1<745, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
"efddiv $RT, $RA, $RB", IIC_FPDivD,
- [(set f64:$RT, (fdiv f64:$RA, f64:$RB))]>;
+ [(set f64:$RT, (any_fdiv f64:$RA, f64:$RB))]>;
def EFDMUL : EFXForm_1<744, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
"efdmul $RT, $RA, $RB", IIC_FPDGeneral,
- [(set f64:$RT, (fmul f64:$RA, f64:$RB))]>;
+ [(set f64:$RT, (any_fmul f64:$RA, f64:$RB))]>;
def EFDNABS : EFXForm_2<741, (outs sperc:$RT), (ins sperc:$RA),
"efdnabs $RT, $RA", IIC_FPDGeneral,
@@ -232,7 +232,7 @@
def EFDSUB : EFXForm_1<737, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
"efdsub $RT, $RA, $RB", IIC_FPDGeneral,
- [(set f64:$RT, (fsub f64:$RA, f64:$RB))]>;
+ [(set f64:$RT, (any_fsub f64:$RA, f64:$RB))]>;
let isCompare = 1 in {
def EFDTSTEQ : EFXForm_3<766, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
@@ -250,11 +250,11 @@
def EFSADD : EFXForm_1<704, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
"efsadd $RT, $RA, $RB", IIC_FPAddSub,
- [(set f32:$RT, (fadd f32:$RA, f32:$RB))]>;
+ [(set f32:$RT, (any_fadd f32:$RA, f32:$RB))]>;
def EFSCFD : EFXForm_2a<719, (outs spe4rc:$RT), (ins sperc:$RB),
"efscfd $RT, $RB", IIC_FPSGeneral,
- [(set f32:$RT, (fpround f64:$RB))]>;
+ [(set f32:$RT, (any_fpround f64:$RB))]>;
def EFSCFSF : EFXForm_2a<723, (outs spe4rc:$RT), (ins spe4rc:$RB),
"efscfsf $RT, $RB", IIC_FPSGeneral, []>;
@@ -303,11 +303,11 @@
def EFSDIV : EFXForm_1<713, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
"efsdiv $RT, $RA, $RB", IIC_FPDivD,
- [(set f32:$RT, (fdiv f32:$RA, f32:$RB))]>;
+ [(set f32:$RT, (any_fdiv f32:$RA, f32:$RB))]>;
def EFSMUL : EFXForm_1<712, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
"efsmul $RT, $RA, $RB", IIC_FPGeneral,
- [(set f32:$RT, (fmul f32:$RA, f32:$RB))]>;
+ [(set f32:$RT, (any_fmul f32:$RA, f32:$RB))]>;
def EFSNABS : EFXForm_2<709, (outs spe4rc:$RT), (ins spe4rc:$RA),
"efsnabs $RT, $RA", IIC_FPGeneral,
@@ -319,7 +319,7 @@
def EFSSUB : EFXForm_1<705, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
"efssub $RT, $RA, $RB", IIC_FPSGeneral,
- [(set f32:$RT, (fsub f32:$RA, f32:$RB))]>;
+ [(set f32:$RT, (any_fsub f32:$RA, f32:$RB))]>;
let isCompare = 1 in {
def EFSTSTEQ : EFXForm_3<734, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index db6e00c..c0f2aed 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -152,6 +152,7 @@
def NoP9Vector : Predicate<"!Subtarget->hasP9Vector()">;
def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">;
def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">;
+def NoP10Vector : Predicate<"!Subtarget->hasP10Vector()">;
//--------------------- VSX-specific instruction formats ---------------------//
// By default, all VSX instructions are to be selected over their Altivec
@@ -315,13 +316,13 @@
let CodeSize = 3 in
def XFLOADf64 : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
"#XFLOADf64",
- [(set f64:$XT, (load xoaddr:$src))]>;
+ [(set f64:$XT, (load XForm:$src))]>;
let Predicates = [HasVSX, HasOnlySwappingMemOps] in
def LXVD2X : XX1Form_memOp<31, 844,
(outs vsrc:$XT), (ins memrr:$src),
"lxvd2x $XT, $src", IIC_LdStLFD,
- [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
+ []>;
def LXVDSX : XX1Form_memOp<31, 332,
(outs vsrc:$XT), (ins memrr:$src),
@@ -346,7 +347,7 @@
let CodeSize = 3 in
def XFSTOREf64 : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst),
"#XFSTOREf64",
- [(store f64:$XT, xoaddr:$dst)]>;
+ [(store f64:$XT, XForm:$dst)]>;
let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
// The behaviour of this instruction is endianness-specific so we provide no
@@ -848,14 +849,16 @@
[(set v2f64:$XT, (int_ppc_vsx_xvcvspdp v4f32:$XB))]>;
def XVCVSPSXDS : XX2Form<60, 408,
(outs vsrc:$XT), (ins vsrc:$XB),
- "xvcvspsxds $XT, $XB", IIC_VecFP, []>;
+ "xvcvspsxds $XT, $XB", IIC_VecFP,
+ [(set v2i64:$XT, (int_ppc_vsx_xvcvspsxds v4f32:$XB))]>;
def XVCVSPSXWS : XX2Form<60, 152,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvspsxws $XT, $XB", IIC_VecFP,
[(set v4i32:$XT, (any_fp_to_sint v4f32:$XB))]>;
def XVCVSPUXDS : XX2Form<60, 392,
(outs vsrc:$XT), (ins vsrc:$XB),
- "xvcvspuxds $XT, $XB", IIC_VecFP, []>;
+ "xvcvspuxds $XT, $XB", IIC_VecFP,
+ [(set v2i64:$XT, (int_ppc_vsx_xvcvspuxds v4f32:$XB))]>;
def XVCVSPUXWS : XX2Form<60, 136,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvspuxws $XT, $XB", IIC_VecFP,
@@ -1009,13 +1012,13 @@
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlandc $XT, $XA, $XB", IIC_VecGeneral,
[(set v4i32:$XT, (and v4i32:$XA,
- (vnot_ppc v4i32:$XB)))]>;
+ (vnot v4i32:$XB)))]>;
let isCommutable = 1 in {
def XXLNOR : XX3Form<60, 162,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlnor $XT, $XA, $XB", IIC_VecGeneral,
- [(set v4i32:$XT, (vnot_ppc (or v4i32:$XA,
- v4i32:$XB)))]>;
+ [(set v4i32:$XT, (vnot (or v4i32:$XA,
+ v4i32:$XB)))]>;
def XXLOR : XX3Form<60, 146,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlor $XT, $XA, $XB", IIC_VecGeneral,
@@ -1092,12 +1095,11 @@
def XXLEQV : XX3Form<60, 186,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxleqv $XT, $XA, $XB", IIC_VecGeneral,
- [(set v4i32:$XT, (vnot_ppc (xor v4i32:$XA, v4i32:$XB)))]>;
+ [(set v4i32:$XT, (vnot (xor v4i32:$XA, v4i32:$XB)))]>;
def XXLNAND : XX3Form<60, 178,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlnand $XT, $XA, $XB", IIC_VecGeneral,
- [(set v4i32:$XT, (vnot_ppc (and v4i32:$XA,
- v4i32:$XB)))]>;
+ [(set v4i32:$XT, (vnot (and v4i32:$XA, v4i32:$XB)))]>;
} // isCommutable
let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
@@ -1110,7 +1112,7 @@
def XXLORC : XX3Form<60, 170,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlorc $XT, $XA, $XB", IIC_VecGeneral,
- [(set v4i32:$XT, (or v4i32:$XA, (vnot_ppc v4i32:$XB)))]>;
+ [(set v4i32:$XT, (or v4i32:$XA, (vnot v4i32:$XB)))]>;
// VSX scalar loads introduced in ISA 2.07
let mayLoad = 1, mayStore = 0 in {
@@ -1126,15 +1128,15 @@
let CodeSize = 3 in
def XFLOADf32 : PseudoXFormMemOp<(outs vssrc:$XT), (ins memrr:$src),
"#XFLOADf32",
- [(set f32:$XT, (load xoaddr:$src))]>;
+ [(set f32:$XT, (load XForm:$src))]>;
// Pseudo instruction LIWAX will be expanded to LXSIWAX or LFIWAX later
def LIWAX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
"#LIWAX",
- [(set f64:$XT, (PPClfiwax xoaddr:$src))]>;
+ [(set f64:$XT, (PPClfiwax ForceXForm:$src))]>;
// Pseudo instruction LIWZX will be expanded to LXSIWZX or LFIWZX later
def LIWZX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
"#LIWZX",
- [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
+ [(set f64:$XT, (PPClfiwzx ForceXForm:$src))]>;
} // mayLoad
// VSX scalar stores introduced in ISA 2.07
@@ -1149,11 +1151,11 @@
let CodeSize = 3 in
def XFSTOREf32 : PseudoXFormMemOp<(outs), (ins vssrc:$XT, memrr:$dst),
"#XFSTOREf32",
- [(store f32:$XT, xoaddr:$dst)]>;
+ [(store f32:$XT, XForm:$dst)]>;
// Pseudo instruction STIWX will be expanded to STXSIWX or STFIWX later
def STIWX : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst),
"#STIWX",
- [(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
+ [(PPCstfiwx f64:$XT, ForceXForm:$dst)]>;
} // mayStore
// VSX Elementary Scalar FP arithmetic (SP)
@@ -1537,10 +1539,10 @@
// Insert Exponent DP/QP
// XT NOTE: XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU
+ def XSIEXPDP : XX1Form <60, 918, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
+ "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>;
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
let hasSideEffects = 1 in {
- def XSIEXPDP : XX1Form <60, 918, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
- "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>;
// vB NOTE: only vB.dword[0] is used, that's why we don't use
// X_VT5_VA5_VB5 form
def XSIEXPQP : XForm_18<63, 868, (outs vrrc:$vT), (ins vrrc:$vA, vsfrc:$vB),
@@ -1548,11 +1550,11 @@
}
// Extract Exponent/Significand DP/QP
+ def XSXEXPDP : XX2_RT5_XO5_XB6<60, 0, 347, "xsxexpdp", []>;
+ def XSXSIGDP : XX2_RT5_XO5_XB6<60, 1, 347, "xsxsigdp", []>;
+
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
let hasSideEffects = 1 in {
- def XSXEXPDP : XX2_RT5_XO5_XB6<60, 0, 347, "xsxexpdp", []>;
- def XSXSIGDP : XX2_RT5_XO5_XB6<60, 1, 347, "xsxsigdp", []>;
-
def XSXEXPQP : X_VT5_XO5_VB5 <63, 2, 804, "xsxexpqp", []>;
def XSXSIGQP : X_VT5_XO5_VB5 <63, 18, 804, "xsxsigqp", []>;
}
@@ -1680,9 +1682,9 @@
// Load as Integer Byte/Halfword & Zero Indexed
def LXSIBZX : X_XT6_RA5_RB5<31, 781, "lxsibzx", vsfrc,
- [(set f64:$XT, (PPClxsizx xoaddr:$src, 1))]>;
+ [(set f64:$XT, (PPClxsizx ForceXForm:$src, 1))]>;
def LXSIHZX : X_XT6_RA5_RB5<31, 813, "lxsihzx", vsfrc,
- [(set f64:$XT, (PPClxsizx xoaddr:$src, 2))]>;
+ [(set f64:$XT, (PPClxsizx ForceXForm:$src, 2))]>;
// Load Vector Halfword*8/Byte*16 Indexed
def LXVH8X : X_XT6_RA5_RB5<31, 812, "lxvh8x" , vsrc, []>;
@@ -1690,7 +1692,7 @@
// Load Vector Indexed
def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc,
- [(set v2f64:$XT, (load xaddrX16:$src))]>;
+ [(set v2f64:$XT, (load XForm:$src))]>;
// Load Vector (Left-justified) with Length
def LXVL : XX1Form_memOp<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
"lxvl $XT, $src, $rB", IIC_LdStLoad,
@@ -1718,9 +1720,9 @@
// Store as Integer Byte/Halfword Indexed
def STXSIBX : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsfrc,
- [(PPCstxsix f64:$XT, xoaddr:$dst, 1)]>;
+ [(PPCstxsix f64:$XT, ForceXForm:$dst, 1)]>;
def STXSIHX : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsfrc,
- [(PPCstxsix f64:$XT, xoaddr:$dst, 2)]>;
+ [(PPCstxsix f64:$XT, ForceXForm:$dst, 2)]>;
let isCodeGenOnly = 1 in {
def STXSIBXv : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsrc, []>;
def STXSIHXv : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsrc, []>;
@@ -1732,7 +1734,7 @@
// Store Vector Indexed
def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc,
- [(store v2f64:$XT, xaddrX16:$dst)]>;
+ [(store v2f64:$XT, XForm:$dst)]>;
// Store Vector (Left-justified) with Length
def STXVL : XX1Form_memOp<31, 397, (outs),
@@ -1749,16 +1751,16 @@
def DFLOADf32 : PPCPostRAExpPseudo<(outs vssrc:$XT), (ins memrix:$src),
"#DFLOADf32",
- [(set f32:$XT, (load iaddrX4:$src))]>;
+ [(set f32:$XT, (load DSForm:$src))]>;
def DFLOADf64 : PPCPostRAExpPseudo<(outs vsfrc:$XT), (ins memrix:$src),
"#DFLOADf64",
- [(set f64:$XT, (load iaddrX4:$src))]>;
+ [(set f64:$XT, (load DSForm:$src))]>;
def DFSTOREf32 : PPCPostRAExpPseudo<(outs), (ins vssrc:$XT, memrix:$dst),
"#DFSTOREf32",
- [(store f32:$XT, iaddrX4:$dst)]>;
+ [(store f32:$XT, DSForm:$dst)]>;
def DFSTOREf64 : PPCPostRAExpPseudo<(outs), (ins vsfrc:$XT, memrix:$dst),
"#DFSTOREf64",
- [(store f64:$XT, iaddrX4:$dst)]>;
+ [(store f64:$XT, DSForm:$dst)]>;
let mayStore = 1 in {
def SPILLTOVSR_STX : PseudoXFormMemOp<(outs),
@@ -1811,6 +1813,13 @@
}
//----------------------------- DAG Definitions ------------------------------//
+
+// Output dags used to bitcast f32 to i32 and f64 to i64
+def Bitcast {
+ dag FltToInt = (i32 (MFVSRWZ (EXTRACT_SUBREG (XSCVDPSPN $A), sub_64)));
+ dag DblToLong = (i64 (MFVSRD $A));
+}
+
def FpMinMax {
dag F32Min = (COPY_TO_REGCLASS (XSMINDP (COPY_TO_REGCLASS $A, VSFRC),
(COPY_TO_REGCLASS $B, VSFRC)),
@@ -1821,19 +1830,19 @@
}
def ScalarLoads {
- dag Li8 = (i32 (extloadi8 xoaddr:$src));
- dag ZELi8 = (i32 (zextloadi8 xoaddr:$src));
- dag ZELi8i64 = (i64 (zextloadi8 xoaddr:$src));
- dag SELi8 = (i32 (sext_inreg (extloadi8 xoaddr:$src), i8));
- dag SELi8i64 = (i64 (sext_inreg (extloadi8 xoaddr:$src), i8));
+ dag Li8 = (i32 (extloadi8 ForceXForm:$src));
+ dag ZELi8 = (i32 (zextloadi8 ForceXForm:$src));
+ dag ZELi8i64 = (i64 (zextloadi8 ForceXForm:$src));
+ dag SELi8 = (i32 (sext_inreg (extloadi8 ForceXForm:$src), i8));
+ dag SELi8i64 = (i64 (sext_inreg (extloadi8 ForceXForm:$src), i8));
- dag Li16 = (i32 (extloadi16 xoaddr:$src));
- dag ZELi16 = (i32 (zextloadi16 xoaddr:$src));
- dag ZELi16i64 = (i64 (zextloadi16 xoaddr:$src));
- dag SELi16 = (i32 (sextloadi16 xoaddr:$src));
- dag SELi16i64 = (i64 (sextloadi16 xoaddr:$src));
+ dag Li16 = (i32 (extloadi16 ForceXForm:$src));
+ dag ZELi16 = (i32 (zextloadi16 ForceXForm:$src));
+ dag ZELi16i64 = (i64 (zextloadi16 ForceXForm:$src));
+ dag SELi16 = (i32 (sextloadi16 ForceXForm:$src));
+ dag SELi16i64 = (i64 (sextloadi16 ForceXForm:$src));
- dag Li32 = (i32 (load xoaddr:$src));
+ dag Li32 = (i32 (load ForceXForm:$src));
}
def DWToSPExtractConv {
@@ -2179,6 +2188,11 @@
dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC);
dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE);
+ // BE variable float 32-bit version
+ dag BE_32B_VFLOAT_PERM_VEC = (v16i8 (LVSL (i32 ZERO), (RLWINM $Idx, 2, 0, 29)));
+ dag BE_32B_VFLOAT_PERMUTE = (VPERM $S, $S, BE_32B_VFLOAT_PERM_VEC);
+ dag BE_32B_VARIABLE_FLOAT = (XSCVSPDPN BE_32B_VFLOAT_PERMUTE);
+
/* BE variable double
Same as the BE doubleword except there is no move.
*/
@@ -2186,11 +2200,19 @@
(v16i8 (COPY_TO_REGCLASS $S, VRRC)),
BE_VDWORD_PERM_VEC));
dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC);
+
+ // BE variable double 32-bit version
+ dag BE_32B_VDWORD_PERM_VEC = (v16i8 (LVSL (i32 ZERO),
+ (RLWINM (ANDI_rec $Idx, 1), 3, 0, 28)));
+ dag BE_32B_VDOUBLE_PERMUTE = (v16i8 (VPERM (v16i8 (COPY_TO_REGCLASS $S, VRRC)),
+ (v16i8 (COPY_TO_REGCLASS $S, VRRC)),
+ BE_32B_VDWORD_PERM_VEC));
+ dag BE_32B_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_32B_VDOUBLE_PERMUTE, VSRC);
}
def AlignValues {
- dag F32_TO_BE_WORD1 = (v4f32 (XXSLDWI (XSCVDPSPN $B), (XSCVDPSPN $B), 3));
- dag I32_TO_BE_WORD1 = (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC);
+ dag F32_TO_BE_WORD1 = (v4f32 (XSCVDPSPN $B));
+ dag I32_TO_BE_WORD1 = (SUBREG_TO_REG (i64 1), (MTVSRWZ $B), sub_64);
}
// Integer extend helper dags 32 -> 64
@@ -2271,22 +2293,22 @@
}
def FltToIntLoad {
- dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (extloadf32 xoaddr:$A)))));
+ dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (extloadf32 ForceXForm:$A)))));
}
def FltToUIntLoad {
- dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (extloadf32 xoaddr:$A)))));
+ dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (extloadf32 ForceXForm:$A)))));
}
def FltToLongLoad {
- dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A)))));
+ dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 ForceXForm:$A)))));
}
def FltToLongLoadP9 {
- dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddrX4:$A)))));
+ dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 DSForm:$A)))));
}
def FltToULongLoad {
- dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A)))));
+ dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ForceXForm:$A)))));
}
def FltToULongLoadP9 {
- dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddrX4:$A)))));
+ dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 DSForm:$A)))));
}
def FltToLong {
dag A = (i64 (PPCmfvsr (f64 (PPCfctidz (fpextend f32:$A)))));
@@ -2313,42 +2335,42 @@
dag A = (i64 (PPCmfvsr (f64 (PPCfctiduz f64:$A))));
}
def DblToIntLoad {
- dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A)))));
+ dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load ForceXForm:$A)))));
}
def DblToIntLoadP9 {
- dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddrX4:$A)))));
+ dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load DSForm:$A)))));
}
def DblToUIntLoad {
- dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A)))));
+ dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load ForceXForm:$A)))));
}
def DblToUIntLoadP9 {
- dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddrX4:$A)))));
+ dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load DSForm:$A)))));
}
def DblToLongLoad {
- dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A)))));
+ dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load ForceXForm:$A)))));
}
def DblToULongLoad {
- dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (load xoaddr:$A)))));
+ dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (load ForceXForm:$A)))));
}
// FP load dags (for f32 -> v4f32)
def LoadFP {
- dag A = (f32 (load xoaddr:$A));
- dag B = (f32 (load xoaddr:$B));
- dag C = (f32 (load xoaddr:$C));
- dag D = (f32 (load xoaddr:$D));
+ dag A = (f32 (load ForceXForm:$A));
+ dag B = (f32 (load ForceXForm:$B));
+ dag C = (f32 (load ForceXForm:$C));
+ dag D = (f32 (load ForceXForm:$D));
}
// FP merge dags (for f32 -> v4f32)
def MrgFP {
- dag LD32A = (COPY_TO_REGCLASS (LIWZX xoaddr:$A), VSRC);
- dag LD32B = (COPY_TO_REGCLASS (LIWZX xoaddr:$B), VSRC);
- dag LD32C = (COPY_TO_REGCLASS (LIWZX xoaddr:$C), VSRC);
- dag LD32D = (COPY_TO_REGCLASS (LIWZX xoaddr:$D), VSRC);
- dag AC = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $A, VSRC),
- (COPY_TO_REGCLASS $C, VSRC), 0));
- dag BD = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $B, VSRC),
- (COPY_TO_REGCLASS $D, VSRC), 0));
+ dag LD32A = (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$A), sub_64);
+ dag LD32B = (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$B), sub_64);
+ dag LD32C = (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$C), sub_64);
+ dag LD32D = (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$D), sub_64);
+ dag AC = (XVCVDPSP (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64),
+ (SUBREG_TO_REG (i64 1), $C, sub_64), 0));
+ dag BD = (XVCVDPSP (XXPERMDI (SUBREG_TO_REG (i64 1), $B, sub_64),
+ (SUBREG_TO_REG (i64 1), $D, sub_64), 0));
dag ABhToFlt = (XVCVDPSP (XXPERMDI $A, $B, 0));
dag ABlToFlt = (XVCVDPSP (XXPERMDI $A, $B, 3));
dag BAhToFlt = (XVCVDPSP (XXPERMDI $B, $A, 0));
@@ -2375,10 +2397,10 @@
// For big endian, we merge hi doublewords of (A, C) and (B, D), convert
// then merge.
- dag AC = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$A, VSRC),
- (COPY_TO_REGCLASS f64:$C, VSRC), 0));
- dag BD = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$B, VSRC),
- (COPY_TO_REGCLASS f64:$D, VSRC), 0));
+ dag AC = (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), f64:$A, sub_64),
+ (SUBREG_TO_REG (i64 1), f64:$C, sub_64), 0));
+ dag BD = (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), f64:$B, sub_64),
+ (SUBREG_TO_REG (i64 1), f64:$D, sub_64), 0));
dag CVACS = (v4i32 (XVCVDPSXWS AC));
dag CVBDS = (v4i32 (XVCVDPSXWS BD));
dag CVACU = (v4i32 (XVCVDPUXWS AC));
@@ -2386,16 +2408,32 @@
// For little endian, we merge hi doublewords of (D, B) and (C, A), convert
// then merge.
- dag DB = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$D, VSRC),
- (COPY_TO_REGCLASS f64:$B, VSRC), 0));
- dag CA = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$C, VSRC),
- (COPY_TO_REGCLASS f64:$A, VSRC), 0));
+ dag DB = (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), f64:$D, sub_64),
+ (SUBREG_TO_REG (i64 1), f64:$B, sub_64), 0));
+ dag CA = (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), f64:$C, sub_64),
+ (SUBREG_TO_REG (i64 1), f64:$A, sub_64), 0));
dag CVDBS = (v4i32 (XVCVDPSXWS DB));
dag CVCAS = (v4i32 (XVCVDPSXWS CA));
dag CVDBU = (v4i32 (XVCVDPUXWS DB));
dag CVCAU = (v4i32 (XVCVDPUXWS CA));
}
+def DblwdCmp {
+ dag SGTW = (v2i64 (v2i64 (VCMPGTSW v2i64:$vA, v2i64:$vB)));
+ dag UGTW = (v2i64 (v2i64 (VCMPGTUW v2i64:$vA, v2i64:$vB)));
+ dag EQW = (v2i64 (v2i64 (VCMPEQUW v2i64:$vA, v2i64:$vB)));
+ dag UGTWSHAND = (v2i64 (XXLAND (v2i64 (XXSLDWI UGTW, UGTW, 1)), EQW));
+ dag EQWSHAND = (v2i64 (XXLAND (v2i64 (XXSLDWI EQW, EQW, 1)), EQW));
+ dag SGTWOR = (v2i64 (XXLOR SGTW, UGTWSHAND));
+ dag UGTWOR = (v2i64 (XXLOR UGTW, UGTWSHAND));
+ dag MRGSGT = (v2i64 (XXPERMDI (v2i64 (XXSPLTW SGTWOR, 0)),
+ (v2i64 (XXSPLTW SGTWOR, 2)), 0));
+ dag MRGUGT = (v2i64 (XXPERMDI (v2i64 (XXSPLTW UGTWOR, 0)),
+ (v2i64 (XXSPLTW UGTWOR, 2)), 0));
+ dag MRGEQ = (v2i64 (XXPERMDI (v2i64 (XXSPLTW EQWSHAND, 0)),
+ (v2i64 (XXSPLTW EQWSHAND, 2)), 0));
+}
+
//---------------------------- Anonymous Patterns ----------------------------//
// Predicate combinations are kept in roughly chronological order in terms of
// instruction availability in the architecture. For example, VSX came in with
@@ -2415,9 +2453,11 @@
// [HasVSX, IsLittleEndian]
// [HasVSX, NoP9Vector]
// [HasVSX, NoP9Vector, IsLittleEndian]
+// [HasVSX, NoP9Vector, IsBigEndian]
// [HasVSX, HasOnlySwappingMemOps]
// [HasVSX, HasOnlySwappingMemOps, IsBigEndian]
// [HasVSX, HasP8Vector]
+// [HasVSX, HasP8Vector, IsBigEndian]
// [HasVSX, HasP8Vector, IsBigEndian, IsPPC64]
// [HasVSX, HasP8Vector, IsLittleEndian]
// [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian, IsPPC64]
@@ -2430,6 +2470,8 @@
// [HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian]
// [HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian]
// [HasVSX, HasP9Vector]
+// [HasVSX, HasP9Vector, NoP10Vector]
+// [HasVSX, HasP9Vector, IsBigEndian]
// [HasVSX, HasP9Vector, IsBigEndian, IsPPC64]
// [HasVSX, HasP9Vector, IsLittleEndian]
// [HasVSX, HasP9Altivec]
@@ -2453,9 +2495,9 @@
let AddedComplexity = 400 in {
// Valid for any VSX subtarget, regardless of endianness.
let Predicates = [HasVSX] in {
-def : Pat<(v4i32 (vnot_ppc v4i32:$A)),
+def : Pat<(v4i32 (vnot v4i32:$A)),
(v4i32 (XXLNOR $A, $A))>;
-def : Pat<(v4i32 (or (and (vnot_ppc v4i32:$C), v4i32:$A),
+def : Pat<(v4i32 (or (and (vnot v4i32:$C), v4i32:$A),
(and v4i32:$B, v4i32:$C))),
(v4i32 (XXSEL $A, $B, $C))>;
@@ -2655,6 +2697,11 @@
(XXSEL $vC, $vB, $vA)>;
def : Pat<(vselect v2i64:$vA, v2f64:$vB, v2f64:$vC),
(XXSEL $vC, $vB, $vA)>;
+def : Pat<(v1i128 (vselect v1i128:$vA, v1i128:$vB, v1i128:$vC)),
+ (COPY_TO_REGCLASS
+ (XXSEL (COPY_TO_REGCLASS $vC, VSRC),
+ (COPY_TO_REGCLASS $vB, VSRC),
+ (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>;
def : Pat<(v4f32 (any_fmaxnum v4f32:$src1, v4f32:$src2)),
(v4f32 (XVMAXSP $src1, $src2))>;
@@ -2713,12 +2760,12 @@
def : Pat<(f64 (fmaxnum_ieee (fcanonicalize f64:$A), (fcanonicalize f64:$B))),
(f64 (XSMAXDP $A, $B))>;
-def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst),
- (STXVD2X $rS, xoaddr:$dst)>;
-def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst),
- (STXVW4X $rS, xoaddr:$dst)>;
-def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
-def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, ForceXForm:$dst),
+ (STXVD2X $rS, ForceXForm:$dst)>;
+def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, ForceXForm:$dst),
+ (STXVW4X $rS, ForceXForm:$dst)>;
+def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be ForceXForm:$src)), (LXVW4X ForceXForm:$src)>;
+def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be ForceXForm:$src)), (LXVD2X ForceXForm:$src)>;
// Rounding for single precision.
def : Pat<(f32 (any_fround f32:$S)),
@@ -2752,30 +2799,34 @@
// Build vectors of floating point converted to i32.
def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.A,
DblToInt.A, DblToInt.A)),
- (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS $A), VSRC), 1))>;
+ (v4i32 (XXSPLTW (SUBREG_TO_REG (i64 1), (XSCVDPSXWS $A), sub_64), 1))>;
def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.A,
DblToUInt.A, DblToUInt.A)),
- (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS $A), VSRC), 1))>;
+ (v4i32 (XXSPLTW (SUBREG_TO_REG (i64 1), (XSCVDPUXWS $A), sub_64), 1))>;
def : Pat<(v2i64 (build_vector DblToLong.A, DblToLong.A)),
- (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC),
- (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC), 0))>;
+ (v2i64 (XXPERMDI (SUBREG_TO_REG (i64 1), (XSCVDPSXDS $A), sub_64),
+ (SUBREG_TO_REG (i64 1), (XSCVDPSXDS $A), sub_64), 0))>;
def : Pat<(v2i64 (build_vector DblToULong.A, DblToULong.A)),
- (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC),
- (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), 0))>;
+ (v2i64 (XXPERMDI (SUBREG_TO_REG (i64 1), (XSCVDPUXDS $A), sub_64),
+ (SUBREG_TO_REG (i64 1), (XSCVDPUXDS $A), sub_64), 0))>;
defm : ScalToVecWPermute<
v4i32, FltToIntLoad.A,
- (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1),
- (COPY_TO_REGCLASS (XSCVDPSXWSs (XFLOADf32 xoaddr:$A)), VSRC)>;
+ (XXSPLTW (SUBREG_TO_REG (i64 1), (XSCVDPSXWSs (XFLOADf32 ForceXForm:$A)), sub_64), 1),
+ (SUBREG_TO_REG (i64 1), (XSCVDPSXWSs (XFLOADf32 ForceXForm:$A)), sub_64)>;
defm : ScalToVecWPermute<
v4i32, FltToUIntLoad.A,
- (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1),
- (COPY_TO_REGCLASS (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC)>;
+ (XXSPLTW (SUBREG_TO_REG (i64 1), (XSCVDPUXWSs (XFLOADf32 ForceXForm:$A)), sub_64), 1),
+ (SUBREG_TO_REG (i64 1), (XSCVDPUXWSs (XFLOADf32 ForceXForm:$A)), sub_64)>;
+def : Pat<(v4f32 (build_vector (f32 (fpround f64:$A)), (f32 (fpround f64:$A)),
+ (f32 (fpround f64:$A)), (f32 (fpround f64:$A)))),
+ (v4f32 (XXSPLTW (SUBREG_TO_REG (i64 1), (XSCVDPSP f64:$A), sub_64), 0))>;
+
def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
(v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
-def : Pat<(v2f64 (PPCldsplat xoaddr:$A)),
- (v2f64 (LXVDSX xoaddr:$A))>;
-def : Pat<(v2i64 (PPCldsplat xoaddr:$A)),
- (v2i64 (LXVDSX xoaddr:$A))>;
+def : Pat<(v2f64 (PPCldsplat ForceXForm:$A)),
+ (v2f64 (LXVDSX ForceXForm:$A))>;
+def : Pat<(v2i64 (PPCldsplat ForceXForm:$A)),
+ (v2i64 (LXVDSX ForceXForm:$A))>;
// Build vectors of floating point converted to i64.
def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
@@ -2786,10 +2837,27 @@
(COPY_TO_REGCLASS (XSCVDPUXDSs $A), VSFRC), 0))>;
defm : ScalToVecWPermute<
v2i64, DblToLongLoad.A,
- (XVCVDPSXDS (LXVDSX xoaddr:$A)), (XVCVDPSXDS (LXVDSX xoaddr:$A))>;
+ (XVCVDPSXDS (LXVDSX ForceXForm:$A)), (XVCVDPSXDS (LXVDSX ForceXForm:$A))>;
defm : ScalToVecWPermute<
v2i64, DblToULongLoad.A,
- (XVCVDPUXDS (LXVDSX xoaddr:$A)), (XVCVDPUXDS (LXVDSX xoaddr:$A))>;
+ (XVCVDPUXDS (LXVDSX ForceXForm:$A)), (XVCVDPUXDS (LXVDSX ForceXForm:$A))>;
+
+// Doubleword vector predicate comparisons without Power8.
+let AddedComplexity = 0 in {
+def : Pat<(v2i64 (PPCvcmp_rec v2i64:$vA, v2i64:$vB, 967)),
+ (VCMPGTUB_rec DblwdCmp.MRGSGT, (v2i64 (XXLXORz)))>;
+def : Pat<(v2i64 (PPCvcmp_rec v2i64:$vA, v2i64:$vB, 711)),
+ (VCMPGTUB_rec DblwdCmp.MRGUGT, (v2i64 (XXLXORz)))>;
+def : Pat<(v2i64 (PPCvcmp_rec v2i64:$vA, v2i64:$vB, 199)),
+ (VCMPGTUB_rec DblwdCmp.MRGEQ, (v2i64 (XXLXORz)))>;
+} // AddedComplexity = 0
+
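The three PPCvcmp_rec patterns above synthesize recording-form doubleword compares on VSX subtargets that lack the Power8 doubleword compare instructions, using the word-level compares assembled in DblwdCmp. The identity they rely on, as I read those dags, is the usual high/low word split; a scalar C++ check of that identity (function names are mine):

```cpp
#include <cassert>
#include <cstdint>

// 64-bit greater-than built from 32-bit word compares:
// high words greater, or high words equal and low words unsigned-greater.
static bool sgt64FromWords(int64_t A, int64_t B) {
  int32_t AHi = int32_t(uint64_t(A) >> 32), BHi = int32_t(uint64_t(B) >> 32);
  uint32_t ALo = uint32_t(A), BLo = uint32_t(B);
  return AHi > BHi || (AHi == BHi && ALo > BLo);
}

static bool ugt64FromWords(uint64_t A, uint64_t B) {
  uint32_t AHi = uint32_t(A >> 32), BHi = uint32_t(B >> 32);
  uint32_t ALo = uint32_t(A), BLo = uint32_t(B);
  return AHi > BHi || (AHi == BHi && ALo > BLo);
}

int main() {
  assert(sgt64FromWords(-1, -2) && !sgt64FromWords(-2, -1));
  assert(ugt64FromWords(0x100000000ULL, 0xFFFFFFFFULL));
  assert(sgt64FromWords(5, 3) && !sgt64FromWords(3, 5));
  return 0;
}
```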
+// XL Compat builtins.
+def : Pat<(int_ppc_fmsub f64:$A, f64:$B, f64:$C), (XSMSUBMDP $A, $B, $C)>;
+def : Pat<(int_ppc_fnmsub f64:$A, f64:$B, f64:$C), (XSNMSUBMDP $A, $B, $C)>;
+def : Pat<(int_ppc_fnmadd f64:$A, f64:$B, f64:$C), (XSNMADDMDP $A, $B, $C)>;
+def : Pat<(int_ppc_fre f64:$A), (XSREDP $A)>;
+def : Pat<(int_ppc_frsqrte vsfrc:$XB), (XSRSQRTEDP $XB)>;
} // HasVSX
// Any big endian VSX subtarget.
@@ -2815,8 +2883,8 @@
def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
(v2f64 (XXPERMDI
- (COPY_TO_REGCLASS $A, VSRC),
- (COPY_TO_REGCLASS $B, VSRC), 0))>;
+ (SUBREG_TO_REG (i64 1), $A, sub_64),
+ (SUBREG_TO_REG (i64 1), $B, sub_64), 0))>;
// Using VMRGEW to assemble the final vector would be a lower latency
// solution. However, we choose to go with the slightly higher latency
// XXPERMDI for 2 reasons:
@@ -2875,6 +2943,22 @@
(f64 (fpextend (extractelt v4f32:$B, 3))))),
(v2f64 (XVCVSPDP (XXSLDWI (XXPERMDI $A, $B, 3),
(XXPERMDI $A, $B, 3), 1)))>;
+def : Pat<(v2i64 (fp_to_sint
+ (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+ (f64 (fpextend (extractelt v4f32:$A, 2)))))),
+ (v2i64 (XVCVSPSXDS $A))>;
+def : Pat<(v2i64 (fp_to_uint
+ (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+ (f64 (fpextend (extractelt v4f32:$A, 2)))))),
+ (v2i64 (XVCVSPUXDS $A))>;
+def : Pat<(v2i64 (fp_to_sint
+ (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))),
+ (f64 (fpextend (extractelt v4f32:$A, 3)))))),
+ (v2i64 (XVCVSPSXDS (XXSLDWI $A, $A, 1)))>;
+def : Pat<(v2i64 (fp_to_uint
+ (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))),
+ (f64 (fpextend (extractelt v4f32:$A, 3)))))),
+ (v2i64 (XVCVSPUXDS (XXSLDWI $A, $A, 1)))>;
def : Pat<WToDPExtractConv.BV02S,
(v2f64 (XVCVSXWDP $A))>;
def : Pat<WToDPExtractConv.BV13S,
@@ -2883,6 +2967,10 @@
(v2f64 (XVCVUXWDP $A))>;
def : Pat<WToDPExtractConv.BV13U,
(v2f64 (XVCVUXWDP (XXSLDWI $A, $A, 3)))>;
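+// Insert a scalar f64 into either doubleword of a v2f64 (big endian element order).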
+def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 0)),
+ (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $B, sub_64), $A, 1))>;
+def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 1)),
+ (v2f64 (XXPERMDI $A, (SUBREG_TO_REG (i64 1), $B, sub_64), 0))>;
} // HasVSX, IsBigEndian
// Any little endian VSX subtarget.
@@ -2892,19 +2980,22 @@
(SUBREG_TO_REG (i64 1), $A, sub_64), 0),
(SUBREG_TO_REG (i64 1), $A, sub_64)>;
+def : Pat<(f64 (extractelt (v2f64 (bitconvert (v16i8
+ (PPCvperm v16i8:$A, v16i8:$B, v16i8:$C)))), 0)),
+ (f64 (EXTRACT_SUBREG (VPERM $B, $A, $C), sub_64))>;
def : Pat<(f64 (extractelt v2f64:$S, 0)),
(f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
def : Pat<(f64 (extractelt v2f64:$S, 1)),
(f64 (EXTRACT_SUBREG $S, sub_64))>;
-def : Pat<(v2f64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
-def : Pat<(PPCst_vec_be v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
-def : Pat<(v4f32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
-def : Pat<(PPCst_vec_be v4f32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
-def : Pat<(v2i64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
-def : Pat<(PPCst_vec_be v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
-def : Pat<(v4i32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
-def : Pat<(PPCst_vec_be v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(v2f64 (PPCld_vec_be ForceXForm:$src)), (LXVD2X ForceXForm:$src)>;
+def : Pat<(PPCst_vec_be v2f64:$rS, ForceXForm:$dst), (STXVD2X $rS, ForceXForm:$dst)>;
+def : Pat<(v4f32 (PPCld_vec_be ForceXForm:$src)), (LXVW4X ForceXForm:$src)>;
+def : Pat<(PPCst_vec_be v4f32:$rS, ForceXForm:$dst), (STXVW4X $rS, ForceXForm:$dst)>;
+def : Pat<(v2i64 (PPCld_vec_be ForceXForm:$src)), (LXVD2X ForceXForm:$src)>;
+def : Pat<(PPCst_vec_be v2i64:$rS, ForceXForm:$dst), (STXVD2X $rS, ForceXForm:$dst)>;
+def : Pat<(v4i32 (PPCld_vec_be ForceXForm:$src)), (LXVW4X ForceXForm:$src)>;
+def : Pat<(PPCst_vec_be v4i32:$rS, ForceXForm:$dst), (STXVW4X $rS, ForceXForm:$dst)>;
def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
(f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
@@ -2920,8 +3011,8 @@
// Little endian, available on all targets with VSX
def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
(v2f64 (XXPERMDI
- (COPY_TO_REGCLASS $B, VSRC),
- (COPY_TO_REGCLASS $A, VSRC), 0))>;
+ (SUBREG_TO_REG (i64 1), $B, sub_64),
+ (SUBREG_TO_REG (i64 1), $A, sub_64), 0))>;
// Using VMRGEW to assemble the final vector would be a lower latency
// solution. However, we choose to go with the slightly higher latency
// XXPERMDI for 2 reasons:
@@ -2980,6 +3071,22 @@
def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))),
(f64 (fpextend (extractelt v4f32:$B, 3))))),
(v2f64 (XVCVSPDP (XXPERMDI $B, $A, 0)))>;
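+// The same v4f32 -> v2i64 conversions with little endian word numbering.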
+def : Pat<(v2i64 (fp_to_sint
+ (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))),
+ (f64 (fpextend (extractelt v4f32:$A, 3)))))),
+ (v2i64 (XVCVSPSXDS $A))>;
+def : Pat<(v2i64 (fp_to_uint
+ (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))),
+ (f64 (fpextend (extractelt v4f32:$A, 3)))))),
+ (v2i64 (XVCVSPUXDS $A))>;
+def : Pat<(v2i64 (fp_to_sint
+ (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+ (f64 (fpextend (extractelt v4f32:$A, 2)))))),
+ (v2i64 (XVCVSPSXDS (XXSLDWI $A, $A, 1)))>;
+def : Pat<(v2i64 (fp_to_uint
+ (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+ (f64 (fpextend (extractelt v4f32:$A, 2)))))),
+ (v2i64 (XVCVSPUXDS (XXSLDWI $A, $A, 1)))>;
def : Pat<WToDPExtractConv.BV02S,
(v2f64 (XVCVSXWDP (XXSLDWI $A, $A, 1)))>;
def : Pat<WToDPExtractConv.BV13S,
@@ -2988,35 +3095,39 @@
(v2f64 (XVCVUXWDP (XXSLDWI $A, $A, 1)))>;
def : Pat<WToDPExtractConv.BV13U,
(v2f64 (XVCVUXWDP $A))>;
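+// Insert a scalar f64 into either doubleword of a v2f64 (little endian element order).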
+def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 0)),
+ (v2f64 (XXPERMDI $A, (SUBREG_TO_REG (i64 1), $B, sub_64), 0))>;
+def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 1)),
+ (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $B, sub_64), $A, 1))>;
} // HasVSX, IsLittleEndian
// Any pre-Power9 VSX subtarget.
let Predicates = [HasVSX, NoP9Vector] in {
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 8),
- (STXSDX (XSCVDPSXDS f64:$src), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), ForceXForm:$dst, 8),
+ (STXSDX (XSCVDPSXDS f64:$src), ForceXForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 8),
- (STXSDX (XSCVDPUXDS f64:$src), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), ForceXForm:$dst, 8),
+ (STXSDX (XSCVDPUXDS f64:$src), ForceXForm:$dst)>;
// Load-and-splat with fp-to-int conversion (using X-Form VSX/FP loads).
defm : ScalToVecWPermute<
v4i32, DblToIntLoad.A,
- (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS (XFLOADf64 xoaddr:$A)), VSRC), 1),
- (COPY_TO_REGCLASS (XSCVDPSXWS (XFLOADf64 xoaddr:$A)), VSRC)>;
+ (XXSPLTW (SUBREG_TO_REG (i64 1), (XSCVDPSXWS (XFLOADf64 ForceXForm:$A)), sub_64), 1),
+ (SUBREG_TO_REG (i64 1), (XSCVDPSXWS (XFLOADf64 ForceXForm:$A)), sub_64)>;
defm : ScalToVecWPermute<
v4i32, DblToUIntLoad.A,
- (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS (XFLOADf64 xoaddr:$A)), VSRC), 1),
- (COPY_TO_REGCLASS (XSCVDPUXWS (XFLOADf64 xoaddr:$A)), VSRC)>;
+ (XXSPLTW (SUBREG_TO_REG (i64 1), (XSCVDPUXWS (XFLOADf64 ForceXForm:$A)), sub_64), 1),
+ (SUBREG_TO_REG (i64 1), (XSCVDPUXWS (XFLOADf64 ForceXForm:$A)), sub_64)>;
defm : ScalToVecWPermute<
v2i64, FltToLongLoad.A,
- (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$A), VSFRC)), 0),
- (SUBREG_TO_REG (i64 1), (XSCVDPSXDS (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$A),
+ (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS (XFLOADf32 ForceXForm:$A), VSFRC)), 0),
+ (SUBREG_TO_REG (i64 1), (XSCVDPSXDS (COPY_TO_REGCLASS (XFLOADf32 ForceXForm:$A),
VSFRC)), sub_64)>;
defm : ScalToVecWPermute<
v2i64, FltToULongLoad.A,
- (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$A), VSFRC)), 0),
- (SUBREG_TO_REG (i64 1), (XSCVDPUXDS (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$A),
+ (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (XFLOADf32 ForceXForm:$A), VSFRC)), 0),
+ (SUBREG_TO_REG (i64 1), (XSCVDPUXDS (COPY_TO_REGCLASS (XFLOADf32 ForceXForm:$A),
VSFRC)), sub_64)>;
} // HasVSX, NoP9Vector
@@ -3024,48 +3135,55 @@
let Predicates = [HasVSX, NoP9Vector, IsLittleEndian] in {
// Load-and-splat using only X-Form VSX loads.
defm : ScalToVecWPermute<
- v2i64, (i64 (load xoaddr:$src)),
- (XXPERMDIs (XFLOADf64 xoaddr:$src), 2),
- (SUBREG_TO_REG (i64 1), (XFLOADf64 xoaddr:$src), sub_64)>;
+ v2i64, (i64 (load ForceXForm:$src)),
+ (XXPERMDIs (XFLOADf64 ForceXForm:$src), 2),
+ (SUBREG_TO_REG (i64 1), (XFLOADf64 ForceXForm:$src), sub_64)>;
defm : ScalToVecWPermute<
- v2f64, (f64 (load xoaddr:$src)),
- (XXPERMDIs (XFLOADf64 xoaddr:$src), 2),
- (SUBREG_TO_REG (i64 1), (XFLOADf64 xoaddr:$src), sub_64)>;
+ v2f64, (f64 (load ForceXForm:$src)),
+ (XXPERMDIs (XFLOADf64 ForceXForm:$src), 2),
+ (SUBREG_TO_REG (i64 1), (XFLOADf64 ForceXForm:$src), sub_64)>;
} // HasVSX, NoP9Vector, IsLittleEndian
+let Predicates = [HasVSX, NoP9Vector, IsBigEndian] in {
+ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x ForceXForm:$src)),
+ (LXVD2X ForceXForm:$src)>;
+ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, ForceXForm:$dst),
+ (STXVD2X $rS, ForceXForm:$dst)>;
+} // HasVSX, NoP9Vector, IsBigEndian
+
// Any VSX subtarget that only has loads and stores that load in big endian
// order regardless of endianness. These are really pre-Power9 subtargets.
let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
- def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+ def : Pat<(v2f64 (PPClxvd2x ForceXForm:$src)), (LXVD2X ForceXForm:$src)>;
// Stores.
- def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
- (STXVD2X $rS, xoaddr:$dst)>;
- def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+ def : Pat<(PPCstxvd2x v2f64:$rS, ForceXForm:$dst), (STXVD2X $rS, ForceXForm:$dst)>;
} // HasVSX, HasOnlySwappingMemOps
// Big endian VSX subtarget that only has loads and stores that always
// load in big endian order. In practice, these are big endian pre-Power9 subtargets.
let Predicates = [HasVSX, HasOnlySwappingMemOps, IsBigEndian] in {
- def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
- def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
- def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
- def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVW4X xoaddr:$src)>;
- def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
- def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
- def : Pat<(store v4i32:$XT, xoaddr:$dst), (STXVW4X $XT, xoaddr:$dst)>;
- def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
- (STXVW4X $rS, xoaddr:$dst)>;
+ def : Pat<(v2f64 (load ForceXForm:$src)), (LXVD2X ForceXForm:$src)>;
+ def : Pat<(v2i64 (load ForceXForm:$src)), (LXVD2X ForceXForm:$src)>;
+ def : Pat<(v4i32 (load ForceXForm:$src)), (LXVW4X ForceXForm:$src)>;
+ def : Pat<(v4i32 (int_ppc_vsx_lxvw4x ForceXForm:$src)), (LXVW4X ForceXForm:$src)>;
+ def : Pat<(store v2f64:$rS, ForceXForm:$dst), (STXVD2X $rS, ForceXForm:$dst)>;
+ def : Pat<(store v2i64:$rS, ForceXForm:$dst), (STXVD2X $rS, ForceXForm:$dst)>;
+ def : Pat<(store v4i32:$XT, ForceXForm:$dst), (STXVW4X $XT, ForceXForm:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, ForceXForm:$dst),
+ (STXVW4X $rS, ForceXForm:$dst)>;
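+ // A scalar i64 load is placed into doubleword 0 of the v2i64 (big endian).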
+ def : Pat<(v2i64 (scalar_to_vector (i64 (load ForceXForm:$src)))),
+ (SUBREG_TO_REG (i64 1), (XFLOADf64 ForceXForm:$src), sub_64)>;
} // HasVSX, HasOnlySwappingMemOps, IsBigEndian
// Any Power8 VSX subtarget.
let Predicates = [HasVSX, HasP8Vector] in {
def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B),
(XXLEQV $A, $B)>;
-def : Pat<(f64 (extloadf32 xoaddr:$src)),
- (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$src), VSFRC)>;
-def : Pat<(f32 (fpround (f64 (extloadf32 xoaddr:$src)))),
- (f32 (XFLOADf32 xoaddr:$src))>;
+def : Pat<(f64 (extloadf32 XForm:$src)),
+ (COPY_TO_REGCLASS (XFLOADf32 XForm:$src), VSFRC)>;
+def : Pat<(f32 (fpround (f64 (extloadf32 ForceXForm:$src)))),
+ (f32 (XFLOADf32 ForceXForm:$src))>;
def : Pat<(f64 (any_fpextend f32:$src)),
(COPY_TO_REGCLASS $src, VSFRC)>;
@@ -3108,11 +3226,11 @@
// Instructions for converting float to i32 feeding a store.
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 4),
- (STIWX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), ForceXForm:$dst, 4),
+ (STIWX (XSCVDPSXWS f64:$src), ForceXForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 4),
- (STIWX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), ForceXForm:$dst, 4),
+ (STIWX (XSCVDPUXWS f64:$src), ForceXForm:$dst)>;
def : Pat<(v2i64 (smax v2i64:$src1, v2i64:$src2)),
(v2i64 (VMAXSD (COPY_TO_REGCLASS $src1, VRRC),
@@ -3135,10 +3253,26 @@
(v8i16 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
def : Pat<(v16i8 (bitconvert (v16i8 immAllOnesV))),
(v16i8 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
+
+// XL Compat builtins.
+def : Pat<(int_ppc_fmsubs f32:$A, f32:$B, f32:$C), (XSMSUBMSP $A, $B, $C)>;
+def : Pat<(int_ppc_fnmsubs f32:$A, f32:$B, f32:$C), (XSNMSUBMSP $A, $B, $C)>;
+def : Pat<(int_ppc_fnmadds f32:$A, f32:$B, f32:$C), (XSNMADDMSP $A, $B, $C)>;
+def : Pat<(int_ppc_fres f32:$A), (XSRESP $A)>;
+def : Pat<(i32 (int_ppc_extract_exp f64:$A)),
+ (EXTRACT_SUBREG (XSXEXPDP (COPY_TO_REGCLASS $A, VSFRC)), sub_32)>;
+def : Pat<(int_ppc_extract_sig f64:$A),
+ (XSXSIGDP (COPY_TO_REGCLASS $A, VSFRC))>;
+def : Pat<(f64 (int_ppc_insert_exp f64:$A, i64:$B)),
+ (COPY_TO_REGCLASS (XSIEXPDP (COPY_TO_REGCLASS $A, G8RC), $B), F8RC)>;
+
+def : Pat<(int_ppc_stfiw ForceXForm:$dst, f64:$XT),
+ (STXSIWX f64:$XT, ForceXForm:$dst)>;
+def : Pat<(int_ppc_frsqrtes vssrc:$XB), (XSRSQRTESP $XB)>;
} // HasVSX, HasP8Vector
-// Big endian Power8 VSX subtarget.
-let Predicates = [HasVSX, HasP8Vector, IsBigEndian, IsPPC64] in {
+// Any big endian Power8 VSX subtarget.
+let Predicates = [HasVSX, HasP8Vector, IsBigEndian] in {
def : Pat<DWToSPExtractConv.El0SS1,
(f32 (XSCVSXDSP (COPY_TO_REGCLASS $S1, VSFRC)))>;
def : Pat<DWToSPExtractConv.El1SS1,
@@ -3149,8 +3283,7 @@
(f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
// v4f32 scalar <-> vector conversions (BE)
-def : Pat<(v4f32 (scalar_to_vector f32:$A)),
- (v4f32 (XSCVDPSPN $A))>;
+defm : ScalToVecWPermute<v4f32, (f32 f32:$A), (XSCVDPSPN $A), (XSCVDPSPN $A)>;
def : Pat<(f32 (vector_extract v4f32:$S, 0)),
(f32 (XSCVSPDPN $S))>;
def : Pat<(f32 (vector_extract v4f32:$S, 1)),
@@ -3159,8 +3292,6 @@
(f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 3)),
(f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
-def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
- (f32 VectorExtractions.BE_VARIABLE_FLOAT)>;
def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
@@ -3179,19 +3310,33 @@
def : Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
+def : Pat<(f32 (vector_extract v4f32:$S, i32:$Idx)),
+ (f32 VectorExtractions.BE_32B_VARIABLE_FLOAT)>;
+
+def : Pat<(f64 (vector_extract v2f64:$S, i32:$Idx)),
+ (f64 VectorExtractions.BE_32B_VARIABLE_DOUBLE)>;
+} // HasVSX, HasP8Vector, IsBigEndian
+
+// Big endian Power8 64Bit VSX subtarget.
+let Predicates = [HasVSX, HasP8Vector, IsBigEndian, IsPPC64] in {
+def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
+ (f32 VectorExtractions.BE_VARIABLE_FLOAT)>;
+
// LIWAX - This instruction is used for sign extending i32 -> i64.
// LIWZX - This instruction will be emitted for i32, f32, and when
// zero-extending i32 to i64 (zext i32 -> i64).
-def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))),
- (v2i64 (COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSRC))>;
-def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))),
- (v2i64 (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC))>;
-def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
- (v4i32 (XXSLDWIs
- (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>;
-def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
- (v4f32 (XXSLDWIs
- (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 ForceXForm:$src)))),
+ (v2i64 (SUBREG_TO_REG (i64 1), (LIWAX ForceXForm:$src), sub_64))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 ForceXForm:$src)))),
+ (v2i64 (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$src), sub_64))>;
+defm : ScalToVecWPermute<
+ v4i32, (i32 (load ForceXForm:$src)),
+ (XXSLDWIs (LIWZX ForceXForm:$src), 1),
+ (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$src), sub_64)>;
+defm : ScalToVecWPermute<
+ v4f32, (f32 (load ForceXForm:$src)),
+ (XXSLDWIs (LIWZX ForceXForm:$src), 1),
+ (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$src), sub_64)>;
def : Pat<DWToSPExtractConv.BVU,
(v4f32 (VPKUDUM (XXSLDWI (XVCVUXDSP $S1), (XVCVUXDSP $S1), 3),
@@ -3199,22 +3344,22 @@
def : Pat<DWToSPExtractConv.BVS,
(v4f32 (VPKUDUM (XXSLDWI (XVCVSXDSP $S1), (XVCVSXDSP $S1), 3),
(XXSLDWI (XVCVSXDSP $S2), (XVCVSXDSP $S2), 3)))>;
-def : Pat<(store (i32 (extractelt v4i32:$A, 1)), xoaddr:$src),
- (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
-def : Pat<(store (f32 (extractelt v4f32:$A, 1)), xoaddr:$src),
- (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+def : Pat<(store (i32 (extractelt v4i32:$A, 1)), ForceXForm:$src),
+ (STIWX (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
+def : Pat<(store (f32 (extractelt v4f32:$A, 1)), ForceXForm:$src),
+ (STIWX (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
// Elements in a register on a BE system are in order <0, 1, 2, 3>.
// The store instructions store the second word from the left.
// So to align element zero, we need to modulo-left-shift by 3 words.
// Similar logic applies for elements 2 and 3.
foreach Idx = [ [0,3], [2,1], [3,2] ] in {
- def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), xoaddr:$src),
+ def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), ForceXForm:$src),
(STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
- sub_64), xoaddr:$src)>;
- def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), xoaddr:$src),
+ sub_64), ForceXForm:$src)>;
+ def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), ForceXForm:$src),
(STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
- sub_64), xoaddr:$src)>;
+ sub_64), ForceXForm:$src)>;
}
} // HasVSX, HasP8Vector, IsBigEndian, IsPPC64
@@ -3232,12 +3377,9 @@
(f64 (COPY_TO_REGCLASS $S1, VSRC)), VSFRC)))>;
// v4f32 scalar <-> vector conversions (LE)
- // The permuted version is no better than the version that puts the value
- // into the right element because XSCVDPSPN is different from all the other
- // instructions used for PPCSToV.
defm : ScalToVecWPermute<v4f32, (f32 f32:$A),
(XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1),
- (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 3)>;
+ (XSCVDPSPN $A)>;
def : Pat<(f32 (vector_extract v4f32:$S, 0)),
(f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 1)),
@@ -3270,24 +3412,24 @@
// LIWZX - This instruction will be emitted for i32, f32, and when
// zero-extending i32 to i64 (zext i32 -> i64).
defm : ScalToVecWPermute<
- v2i64, (i64 (sextloadi32 xoaddr:$src)),
- (XXPERMDIs (COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSFRC), 2),
- (SUBREG_TO_REG (i64 1), (LIWAX xoaddr:$src), sub_64)>;
+ v2i64, (i64 (sextloadi32 ForceXForm:$src)),
+ (XXPERMDIs (LIWAX ForceXForm:$src), 2),
+ (SUBREG_TO_REG (i64 1), (LIWAX ForceXForm:$src), sub_64)>;
defm : ScalToVecWPermute<
- v2i64, (i64 (zextloadi32 xoaddr:$src)),
- (XXPERMDIs (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSFRC), 2),
- (SUBREG_TO_REG (i64 1), (LIWZX xoaddr:$src), sub_64)>;
+ v2i64, (i64 (zextloadi32 ForceXForm:$src)),
+ (XXPERMDIs (LIWZX ForceXForm:$src), 2),
+ (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$src), sub_64)>;
defm : ScalToVecWPermute<
- v4i32, (i32 (load xoaddr:$src)),
- (XXPERMDIs (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSFRC), 2),
- (SUBREG_TO_REG (i64 1), (LIWZX xoaddr:$src), sub_64)>;
+ v4i32, (i32 (load ForceXForm:$src)),
+ (XXPERMDIs (LIWZX ForceXForm:$src), 2),
+ (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$src), sub_64)>;
defm : ScalToVecWPermute<
- v4f32, (f32 (load xoaddr:$src)),
- (XXPERMDIs (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSFRC), 2),
- (SUBREG_TO_REG (i64 1), (LIWZX xoaddr:$src), sub_64)>;
+ v4f32, (f32 (load ForceXForm:$src)),
+ (XXPERMDIs (LIWZX ForceXForm:$src), 2),
+ (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$src), sub_64)>;
def : Pat<DWToSPExtractConv.BVU,
(v4f32 (VPKUDUM (XXSLDWI (XVCVUXDSP $S2), (XVCVUXDSP $S2), 3),
@@ -3295,61 +3437,59 @@
def : Pat<DWToSPExtractConv.BVS,
(v4f32 (VPKUDUM (XXSLDWI (XVCVSXDSP $S2), (XVCVSXDSP $S2), 3),
(XXSLDWI (XVCVSXDSP $S1), (XVCVSXDSP $S1), 3)))>;
-def : Pat<(store (i32 (extractelt v4i32:$A, 2)), xoaddr:$src),
- (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
-def : Pat<(store (f32 (extractelt v4f32:$A, 2)), xoaddr:$src),
- (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+def : Pat<(store (i32 (extractelt v4i32:$A, 2)), ForceXForm:$src),
+ (STIWX (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
+def : Pat<(store (f32 (extractelt v4f32:$A, 2)), ForceXForm:$src),
+ (STIWX (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
// Elements in a register on a LE system are in order <3, 2, 1, 0>.
// The store instructions store the second word from the left.
// So to align element 3, we need to modulo-left-shift by 3 words.
// Similar logic applies for elements 0 and 1.
foreach Idx = [ [0,2], [1,1], [3,3] ] in {
- def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), xoaddr:$src),
+ def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), ForceXForm:$src),
(STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
- sub_64), xoaddr:$src)>;
- def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), xoaddr:$src),
+ sub_64), ForceXForm:$src)>;
+ def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), ForceXForm:$src),
(STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
- sub_64), xoaddr:$src)>;
+ sub_64), ForceXForm:$src)>;
}
} // HasVSX, HasP8Vector, IsLittleEndian
// Big endian pre-Power9 VSX subtarget.
let Predicates = [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian, IsPPC64] in {
-def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
-def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
-def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src),
+def : Pat<(store (i64 (extractelt v2i64:$A, 0)), ForceXForm:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 0)), ForceXForm:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 1)), ForceXForm:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
- xoaddr:$src)>;
-def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src),
+ ForceXForm:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 1)), ForceXForm:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
- xoaddr:$src)>;
+ ForceXForm:$src)>;
} // HasVSX, HasP8Vector, NoP9Vector, IsBigEndian, IsPPC64
// Little endian pre-Power9 VSX subtarget.
let Predicates = [HasVSX, HasP8Vector, NoP9Vector, IsLittleEndian] in {
-def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src),
+def : Pat<(store (i64 (extractelt v2i64:$A, 0)), ForceXForm:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
- xoaddr:$src)>;
-def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src),
+ ForceXForm:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 0)), ForceXForm:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
- xoaddr:$src)>;
-def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
-def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+ ForceXForm:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 1)), ForceXForm:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 1)), ForceXForm:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
} // HasVSX, HasP8Vector, NoP9Vector, IsLittleEndian
// Any VSX target with direct moves.
let Predicates = [HasVSX, HasDirectMove] in {
// bitconvert f32 -> i32
// (convert to 32-bit fp single, shift right 1 word, move to GPR)
-def : Pat<(i32 (bitconvert f32:$S)),
- (i32 (MFVSRWZ (EXTRACT_SUBREG
- (XXSLDWI (XSCVDPSPN $S), (XSCVDPSPN $S), 3),
- sub_64)))>;
+def : Pat<(i32 (bitconvert f32:$A)), Bitcast.FltToInt>;
+
// bitconvert i32 -> f32
// (move to FPR, shift left 1 word, convert to 64-bit fp single)
def : Pat<(f32 (bitconvert i32:$A)),
@@ -3358,8 +3498,7 @@
// bitconvert f64 -> i64
// (move to GPR, nothing else needed)
-def : Pat<(i64 (bitconvert f64:$S)),
- (i64 (MFVSRD $S))>;
+def : Pat<(i64 (bitconvert f64:$A)), Bitcast.DblToLong>;
// bitconvert i64 -> f64
// (move to FPR, nothing else needed)
@@ -3402,12 +3541,18 @@
// Big endian VSX subtarget with direct moves.
let Predicates = [HasVSX, HasDirectMove, IsBigEndian] in {
// v16i8 scalar <-> vector conversions (BE)
-def : Pat<(v16i8 (scalar_to_vector i32:$A)),
- (v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>;
-def : Pat<(v8i16 (scalar_to_vector i32:$A)),
- (v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>;
-def : Pat<(v4i32 (scalar_to_vector i32:$A)),
- (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>;
+defm : ScalToVecWPermute<
+ v16i8, (i32 i32:$A),
+ (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64),
+ (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64)>;
+defm : ScalToVecWPermute<
+ v8i16, (i32 i32:$A),
+ (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64),
+ (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64)>;
+defm : ScalToVecWPermute<
+ v4i32, (i32 i32:$A),
+ (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64),
+ (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64)>;
def : Pat<(v2i64 (scalar_to_vector i64:$A)),
(v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>;
@@ -3588,16 +3733,16 @@
// Big endian integer vectors using direct moves.
def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
(v2i64 (XXPERMDI
- (COPY_TO_REGCLASS (MTVSRD $A), VSRC),
- (COPY_TO_REGCLASS (MTVSRD $B), VSRC), 0))>;
+ (SUBREG_TO_REG (i64 1), (MTVSRD $A), sub_64),
+ (SUBREG_TO_REG (i64 1), (MTVSRD $B), sub_64), 0))>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
(XXPERMDI
- (COPY_TO_REGCLASS
- (MTVSRD (RLDIMI AnyExts.B, AnyExts.A, 32, 0)), VSRC),
- (COPY_TO_REGCLASS
- (MTVSRD (RLDIMI AnyExts.D, AnyExts.C, 32, 0)), VSRC), 0)>;
+ (SUBREG_TO_REG (i64 1),
+ (MTVSRD (RLDIMI AnyExts.B, AnyExts.A, 32, 0)), sub_64),
+ (SUBREG_TO_REG (i64 1),
+ (MTVSRD (RLDIMI AnyExts.D, AnyExts.C, 32, 0)), sub_64), 0)>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
- (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
+ (XXSPLTW (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64), 1)>;
} // HasVSX, HasDirectMove, NoP9Vector, IsBigEndian, IsPPC64
// Little endian pre-Power9 VSX subtarget that has direct moves.
@@ -3605,16 +3750,16 @@
// Little endian integer vectors using direct moves.
def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
(v2i64 (XXPERMDI
- (COPY_TO_REGCLASS (MTVSRD $B), VSRC),
- (COPY_TO_REGCLASS (MTVSRD $A), VSRC), 0))>;
+ (SUBREG_TO_REG (i64 1), (MTVSRD $B), sub_64),
+ (SUBREG_TO_REG (i64 1), (MTVSRD $A), sub_64), 0))>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
(XXPERMDI
- (COPY_TO_REGCLASS
- (MTVSRD (RLDIMI AnyExts.C, AnyExts.D, 32, 0)), VSRC),
- (COPY_TO_REGCLASS
- (MTVSRD (RLDIMI AnyExts.A, AnyExts.B, 32, 0)), VSRC), 0)>;
+ (SUBREG_TO_REG (i64 1),
+ (MTVSRD (RLDIMI AnyExts.C, AnyExts.D, 32, 0)), sub_64),
+ (SUBREG_TO_REG (i64 1),
+ (MTVSRD (RLDIMI AnyExts.A, AnyExts.B, 32, 0)), sub_64), 0)>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
- (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
+ (XXSPLTW (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64), 1)>;
}
// Any Power9 VSX subtarget.
@@ -3641,12 +3786,12 @@
// Convert (Un)Signed Word -> QP.
def : Pat<(f128 (any_sint_to_fp i32:$src)),
(f128 (XSCVSDQP (MTVSRWA $src)))>;
-def : Pat<(f128 (any_sint_to_fp (i32 (load xoaddr:$src)))),
- (f128 (XSCVSDQP (LIWAX xoaddr:$src)))>;
+def : Pat<(f128 (any_sint_to_fp (i32 (load ForceXForm:$src)))),
+ (f128 (XSCVSDQP (LIWAX ForceXForm:$src)))>;
def : Pat<(f128 (any_uint_to_fp i32:$src)),
(f128 (XSCVUDQP (MTVSRWZ $src)))>;
-def : Pat<(f128 (any_uint_to_fp (i32 (load xoaddr:$src)))),
- (f128 (XSCVUDQP (LIWZX xoaddr:$src)))>;
+def : Pat<(f128 (any_uint_to_fp (i32 (load ForceXForm:$src)))),
+ (f128 (XSCVUDQP (LIWZX ForceXForm:$src)))>;
// Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a
// separate pattern so that it can convert the input register class from
@@ -3687,95 +3832,81 @@
(v1i128 (COPY_TO_REGCLASS (XXBRQ (COPY_TO_REGCLASS $A, VSRC)), VRRC))>;
// D-Form Load/Store
-def : Pat<(v4i32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
-def : Pat<(v4f32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
-def : Pat<(v2i64 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
-def : Pat<(v2f64 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
-def : Pat<(f128 (quadwOffsetLoad iaddrX16:$src)),
+foreach Ty = [v4i32, v4f32, v2i64, v2f64] in {
+ def : Pat<(Ty (load DQForm:$src)), (LXV memrix16:$src)>;
+ def : Pat<(Ty (load XForm:$src)), (LXVX XForm:$src)>;
+ def : Pat<(store Ty:$rS, DQForm:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(store Ty:$rS, XForm:$dst), (STXVX $rS, XForm:$dst)>;
+}
+
+def : Pat<(f128 (load DQForm:$src)),
(COPY_TO_REGCLASS (LXV memrix16:$src), VRRC)>;
-def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddrX16:$src)), (LXV memrix16:$src)>;
-def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddrX16:$src)), (LXV memrix16:$src)>;
+def : Pat<(f128 (load XForm:$src)),
+ (COPY_TO_REGCLASS (LXVX XForm:$src), VRRC)>;
+def : Pat<(v4i32 (int_ppc_vsx_lxvw4x DQForm:$src)), (LXV memrix16:$src)>;
+def : Pat<(v2f64 (int_ppc_vsx_lxvd2x DQForm:$src)), (LXV memrix16:$src)>;
+def : Pat<(v4i32 (int_ppc_vsx_lxvw4x XForm:$src)), (LXVX XForm:$src)>;
+def : Pat<(v2f64 (int_ppc_vsx_lxvd2x XForm:$src)), (LXVX XForm:$src)>;
-def : Pat<(quadwOffsetStore v4f32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
-def : Pat<(quadwOffsetStore v4i32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
-def : Pat<(quadwOffsetStore v2f64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
-def : Pat<(quadwOffsetStore f128:$rS, iaddrX16:$dst),
+def : Pat<(store f128:$rS, DQForm:$dst),
(STXV (COPY_TO_REGCLASS $rS, VSRC), memrix16:$dst)>;
-def : Pat<(quadwOffsetStore v2i64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
-def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddrX16:$dst),
+def : Pat<(store f128:$rS, XForm:$dst),
+ (STXVX (COPY_TO_REGCLASS $rS, VSRC), XForm:$dst)>;
+def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, DQForm:$dst),
(STXV $rS, memrix16:$dst)>;
-def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddrX16:$dst),
+def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, DQForm:$dst),
(STXV $rS, memrix16:$dst)>;
-
-def : Pat<(v2f64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
-def : Pat<(v2i64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
-def : Pat<(v4f32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
-def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
-def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
-def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
-def : Pat<(f128 (nonQuadwOffsetLoad xoaddr:$src)),
- (COPY_TO_REGCLASS (LXVX xoaddr:$src), VRRC)>;
-def : Pat<(nonQuadwOffsetStore f128:$rS, xoaddr:$dst),
- (STXVX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
-def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
-def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
-def : Pat<(nonQuadwOffsetStore v4f32:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
-def : Pat<(nonQuadwOffsetStore v4i32:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
-def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
-def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
+def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, XForm:$dst),
+ (STXVX $rS, XForm:$dst)>;
+def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, XForm:$dst),
+ (STXVX $rS, XForm:$dst)>;
// Build vectors from i8 loads
-defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
- (VSPLTBs 7, (LXSIBZX xoaddr:$src)),
- (VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
defm : ScalToVecWPermute<v8i16, ScalarLoads.ZELi8,
- (VSPLTHs 3, (LXSIBZX xoaddr:$src)),
- (VSPLTHs 3, (LXSIBZX xoaddr:$src))>;
+ (VSPLTHs 3, (LXSIBZX ForceXForm:$src)),
+ (SUBREG_TO_REG (i64 1), (LXSIBZX ForceXForm:$src), sub_64)>;
defm : ScalToVecWPermute<v4i32, ScalarLoads.ZELi8,
- (XXSPLTWs (LXSIBZX xoaddr:$src), 1),
- (XXSPLTWs (LXSIBZX xoaddr:$src), 1)>;
+ (XXSPLTWs (LXSIBZX ForceXForm:$src), 1),
+ (SUBREG_TO_REG (i64 1), (LXSIBZX ForceXForm:$src), sub_64)>;
defm : ScalToVecWPermute<v2i64, ScalarLoads.ZELi8i64,
- (XXPERMDIs (LXSIBZX xoaddr:$src), 0),
- (XXPERMDIs (LXSIBZX xoaddr:$src), 0)>;
-defm : ScalToVecWPermute<v4i32, ScalarLoads.SELi8,
- (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1),
- (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1)>;
-defm : ScalToVecWPermute<v2i64, ScalarLoads.SELi8i64,
- (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0),
- (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0)>;
+ (XXPERMDIs (LXSIBZX ForceXForm:$src), 0),
+ (SUBREG_TO_REG (i64 1), (LXSIBZX ForceXForm:$src), sub_64)>;
+defm : ScalToVecWPermute<
+ v4i32, ScalarLoads.SELi8,
+ (XXSPLTWs (VEXTSB2Ws (LXSIBZX ForceXForm:$src)), 1),
+ (SUBREG_TO_REG (i64 1), (VEXTSB2Ws (LXSIBZX ForceXForm:$src)), sub_64)>;
+defm : ScalToVecWPermute<
+ v2i64, ScalarLoads.SELi8i64,
+ (XXPERMDIs (VEXTSB2Ds (LXSIBZX ForceXForm:$src)), 0),
+ (SUBREG_TO_REG (i64 1), (VEXTSB2Ds (LXSIBZX ForceXForm:$src)), sub_64)>;
// Build vectors from i16 loads
-defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
- (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
- (VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
-defm : ScalToVecWPermute<v4i32, ScalarLoads.ZELi16,
- (XXSPLTWs (LXSIHZX xoaddr:$src), 1),
- (XXSPLTWs (LXSIHZX xoaddr:$src), 1)>;
-defm : ScalToVecWPermute<v2i64, ScalarLoads.ZELi16i64,
- (XXPERMDIs (LXSIHZX xoaddr:$src), 0),
- (XXPERMDIs (LXSIHZX xoaddr:$src), 0)>;
-defm : ScalToVecWPermute<v4i32, ScalarLoads.SELi16,
- (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1),
- (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1)>;
-defm : ScalToVecWPermute<v2i64, ScalarLoads.SELi16i64,
- (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0),
- (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0)>;
+defm : ScalToVecWPermute<
+ v4i32, ScalarLoads.ZELi16,
+ (XXSPLTWs (LXSIHZX ForceXForm:$src), 1),
+ (SUBREG_TO_REG (i64 1), (LXSIHZX ForceXForm:$src), sub_64)>;
+defm : ScalToVecWPermute<
+ v2i64, ScalarLoads.ZELi16i64,
+ (XXPERMDIs (LXSIHZX ForceXForm:$src), 0),
+ (SUBREG_TO_REG (i64 1), (LXSIHZX ForceXForm:$src), sub_64)>;
+defm : ScalToVecWPermute<
+ v4i32, ScalarLoads.SELi16,
+ (XXSPLTWs (VEXTSH2Ws (LXSIHZX ForceXForm:$src)), 1),
+ (SUBREG_TO_REG (i64 1), (VEXTSH2Ws (LXSIHZX ForceXForm:$src)), sub_64)>;
+defm : ScalToVecWPermute<
+ v2i64, ScalarLoads.SELi16i64,
+ (XXPERMDIs (VEXTSH2Ds (LXSIHZX ForceXForm:$src)), 0),
+ (SUBREG_TO_REG (i64 1), (VEXTSH2Ds (LXSIHZX ForceXForm:$src)), sub_64)>;
// Load/convert and convert/store patterns for f16.
-def : Pat<(f64 (extloadf16 xoaddr:$src)),
- (f64 (XSCVHPDP (LXSIHZX xoaddr:$src)))>;
-def : Pat<(truncstoref16 f64:$src, xoaddr:$dst),
- (STXSIHX (XSCVDPHP $src), xoaddr:$dst)>;
-def : Pat<(f32 (extloadf16 xoaddr:$src)),
- (f32 (COPY_TO_REGCLASS (XSCVHPDP (LXSIHZX xoaddr:$src)), VSSRC))>;
-def : Pat<(truncstoref16 f32:$src, xoaddr:$dst),
- (STXSIHX (XSCVDPHP (COPY_TO_REGCLASS $src, VSFRC)), xoaddr:$dst)>;
+def : Pat<(f64 (extloadf16 ForceXForm:$src)),
+ (f64 (XSCVHPDP (LXSIHZX ForceXForm:$src)))>;
+def : Pat<(truncstoref16 f64:$src, ForceXForm:$dst),
+ (STXSIHX (XSCVDPHP $src), ForceXForm:$dst)>;
+def : Pat<(f32 (extloadf16 ForceXForm:$src)),
+ (f32 (COPY_TO_REGCLASS (XSCVHPDP (LXSIHZX ForceXForm:$src)), VSSRC))>;
+def : Pat<(truncstoref16 f32:$src, ForceXForm:$dst),
+ (STXSIHX (XSCVDPHP (COPY_TO_REGCLASS $src, VSFRC)), ForceXForm:$dst)>;
def : Pat<(f64 (f16_to_fp i32:$A)),
(f64 (XSCVHPDP (MTVSRWZ $A)))>;
def : Pat<(f32 (f16_to_fp i32:$A)),
@@ -3790,33 +3921,33 @@
def : Pat<(f64 (PPCVexts f64:$A, 2)),
(f64 (COPY_TO_REGCLASS (VEXTSH2Ds $A), VSFRC))>;
-def : Pat<(f64 (extloadf32 iaddrX4:$src)),
- (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$src), VSFRC)>;
-def : Pat<(f32 (fpround (f64 (extloadf32 iaddrX4:$src)))),
- (f32 (DFLOADf32 iaddrX4:$src))>;
+def : Pat<(f64 (extloadf32 DSForm:$src)),
+ (COPY_TO_REGCLASS (DFLOADf32 DSForm:$src), VSFRC)>;
+def : Pat<(f32 (fpround (f64 (extloadf32 DSForm:$src)))),
+ (f32 (DFLOADf32 DSForm:$src))>;
-def : Pat<(v4f32 (PPCldvsxlh xaddr:$src)),
- (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC)>;
-def : Pat<(v4f32 (PPCldvsxlh iaddrX4:$src)),
- (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC)>;
+def : Pat<(v4f32 (PPCldvsxlh XForm:$src)),
+ (SUBREG_TO_REG (i64 1), (XFLOADf64 XForm:$src), sub_64)>;
+def : Pat<(v4f32 (PPCldvsxlh DSForm:$src)),
+ (SUBREG_TO_REG (i64 1), (DFLOADf64 DSForm:$src), sub_64)>;
// Convert (Un)Signed DWord in memory -> QP
-def : Pat<(f128 (sint_to_fp (i64 (load xaddrX4:$src)))),
- (f128 (XSCVSDQP (LXSDX xaddrX4:$src)))>;
-def : Pat<(f128 (sint_to_fp (i64 (load iaddrX4:$src)))),
- (f128 (XSCVSDQP (LXSD iaddrX4:$src)))>;
-def : Pat<(f128 (uint_to_fp (i64 (load xaddrX4:$src)))),
- (f128 (XSCVUDQP (LXSDX xaddrX4:$src)))>;
-def : Pat<(f128 (uint_to_fp (i64 (load iaddrX4:$src)))),
- (f128 (XSCVUDQP (LXSD iaddrX4:$src)))>;
+def : Pat<(f128 (sint_to_fp (i64 (load XForm:$src)))),
+ (f128 (XSCVSDQP (LXSDX XForm:$src)))>;
+def : Pat<(f128 (sint_to_fp (i64 (load DSForm:$src)))),
+ (f128 (XSCVSDQP (LXSD DSForm:$src)))>;
+def : Pat<(f128 (uint_to_fp (i64 (load XForm:$src)))),
+ (f128 (XSCVUDQP (LXSDX XForm:$src)))>;
+def : Pat<(f128 (uint_to_fp (i64 (load DSForm:$src)))),
+ (f128 (XSCVUDQP (LXSD DSForm:$src)))>;
// Convert Unsigned HWord in memory -> QP
def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi16)),
- (f128 (XSCVUDQP (LXSIHZX xaddr:$src)))>;
+ (f128 (XSCVUDQP (LXSIHZX XForm:$src)))>;
// Convert Unsigned Byte in memory -> QP
def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi8)),
- (f128 (XSCVUDQP (LXSIBZX xoaddr:$src)))>;
+ (f128 (XSCVUDQP (LXSIBZX ForceXForm:$src)))>;
// Truncate & Convert QP -> (Un)Signed (D)Word.
def : Pat<(i64 (any_fp_to_sint f128:$src)), (i64 (MFVRD (XSCVQPSDZ $src)))>;
@@ -3829,65 +3960,65 @@
// Instructions for store(fptosi).
// The 8-byte version is repeated here due to availability of D-Form STXSD.
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xaddrX4:$dst, 8),
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), XForm:$dst, 8),
(STXSDX (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
- xaddrX4:$dst)>;
+ XForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), iaddrX4:$dst, 8),
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), DSForm:$dst, 8),
(STXSD (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
- iaddrX4:$dst)>;
+ DSForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 4),
- (STXSIWX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), ForceXForm:$dst, 4),
+ (STXSIWX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), ForceXForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 2),
- (STXSIHX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), ForceXForm:$dst, 2),
+ (STXSIHX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), ForceXForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 1),
- (STXSIBX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), ForceXForm:$dst, 1),
+ (STXSIBX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), ForceXForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xaddrX4:$dst, 8),
- (STXSDX (XSCVDPSXDS f64:$src), xaddrX4:$dst)>;
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), XForm:$dst, 8),
+ (STXSDX (XSCVDPSXDS f64:$src), XForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), iaddrX4:$dst, 8),
- (STXSD (XSCVDPSXDS f64:$src), iaddrX4:$dst)>;
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), DSForm:$dst, 8),
+ (STXSD (XSCVDPSXDS f64:$src), DSForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 2),
- (STXSIHX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), ForceXForm:$dst, 2),
+ (STXSIHX (XSCVDPSXWS f64:$src), ForceXForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 1),
- (STXSIBX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), ForceXForm:$dst, 1),
+ (STXSIBX (XSCVDPSXWS f64:$src), ForceXForm:$dst)>;
// Instructions for store(fptoui).
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xaddrX4:$dst, 8),
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), XForm:$dst, 8),
(STXSDX (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
- xaddrX4:$dst)>;
+ XForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), iaddrX4:$dst, 8),
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), DSForm:$dst, 8),
(STXSD (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
- iaddrX4:$dst)>;
+ DSForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 4),
- (STXSIWX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), ForceXForm:$dst, 4),
+ (STXSIWX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), ForceXForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 2),
- (STXSIHX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), ForceXForm:$dst, 2),
+ (STXSIHX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), ForceXForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 1),
- (STXSIBX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), ForceXForm:$dst, 1),
+ (STXSIBX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), ForceXForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xaddrX4:$dst, 8),
- (STXSDX (XSCVDPUXDS f64:$src), xaddrX4:$dst)>;
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), XForm:$dst, 8),
+ (STXSDX (XSCVDPUXDS f64:$src), XForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), iaddrX4:$dst, 8),
- (STXSD (XSCVDPUXDS f64:$src), iaddrX4:$dst)>;
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), DSForm:$dst, 8),
+ (STXSD (XSCVDPUXDS f64:$src), DSForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 2),
- (STXSIHX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), ForceXForm:$dst, 2),
+ (STXSIHX (XSCVDPUXWS f64:$src), ForceXForm:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 1),
- (STXSIBX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), ForceXForm:$dst, 1),
+ (STXSIBX (XSCVDPUXWS f64:$src), ForceXForm:$dst)>;
// Round & Convert QP -> DP/SP
def : Pat<(f64 (any_fpround f128:$src)), (f64 (XSCVQPDP $src))>;
@@ -3907,7 +4038,8 @@
VSSRC))>;
// Endianness-neutral patterns for const splats with ISA 3.0 instructions.
-defm : ScalToVecWPermute<v4i32, (i32 i32:$A), (MTVSRWS $A), (MTVSRWS $A)>;
+defm : ScalToVecWPermute<v4i32, (i32 i32:$A), (MTVSRWS $A),
+ (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64)>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
(v4i32 (MTVSRWS $A))>;
def : Pat<(v16i8 (build_vector immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
@@ -3919,40 +4051,75 @@
immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A)),
(v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>;
-defm : ScalToVecWPermute<v4i32, FltToIntLoad.A,
- (XVCVSPSXWS (LXVWSX xoaddr:$A)),
- (XVCVSPSXWS (LXVWSX xoaddr:$A))>;
-defm : ScalToVecWPermute<v4i32, FltToUIntLoad.A,
- (XVCVSPUXWS (LXVWSX xoaddr:$A)),
- (XVCVSPUXWS (LXVWSX xoaddr:$A))>;
+defm : ScalToVecWPermute<
+ v4i32, FltToIntLoad.A,
+ (XVCVSPSXWS (LXVWSX ForceXForm:$A)),
+ (XVCVSPSXWS (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$A), sub_64))>;
+defm : ScalToVecWPermute<
+ v4i32, FltToUIntLoad.A,
+ (XVCVSPUXWS (LXVWSX ForceXForm:$A)),
+ (XVCVSPUXWS (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$A), sub_64))>;
defm : ScalToVecWPermute<
v4i32, DblToIntLoadP9.A,
- (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS (DFLOADf64 iaddrX4:$A)), VSRC), 1),
- (SUBREG_TO_REG (i64 1), (XSCVDPSXWS (DFLOADf64 iaddrX4:$A)), sub_64)>;
+ (XXSPLTW (SUBREG_TO_REG (i64 1), (XSCVDPSXWS (DFLOADf64 DSForm:$A)), sub_64), 1),
+ (SUBREG_TO_REG (i64 1), (XSCVDPSXWS (DFLOADf64 DSForm:$A)), sub_64)>;
defm : ScalToVecWPermute<
v4i32, DblToUIntLoadP9.A,
- (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS (DFLOADf64 iaddrX4:$A)), VSRC), 1),
- (SUBREG_TO_REG (i64 1), (XSCVDPUXWS (DFLOADf64 iaddrX4:$A)), sub_64)>;
+ (XXSPLTW (SUBREG_TO_REG (i64 1), (XSCVDPUXWS (DFLOADf64 DSForm:$A)), sub_64), 1),
+ (SUBREG_TO_REG (i64 1), (XSCVDPUXWS (DFLOADf64 DSForm:$A)), sub_64)>;
defm : ScalToVecWPermute<
v2i64, FltToLongLoadP9.A,
- (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$A), VSFRC)), 0),
+ (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS (DFLOADf32 DSForm:$A), VSFRC)), 0),
(SUBREG_TO_REG
(i64 1),
- (XSCVDPSXDS (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$A), VSFRC)), sub_64)>;
+ (XSCVDPSXDS (COPY_TO_REGCLASS (DFLOADf32 DSForm:$A), VSFRC)), sub_64)>;
defm : ScalToVecWPermute<
v2i64, FltToULongLoadP9.A,
- (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$A), VSFRC)), 0),
+ (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 DSForm:$A), VSFRC)), 0),
(SUBREG_TO_REG
(i64 1),
- (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$A), VSFRC)), sub_64)>;
-def : Pat<(v4f32 (PPCldsplat xoaddr:$A)),
- (v4f32 (LXVWSX xoaddr:$A))>;
-def : Pat<(v4i32 (PPCldsplat xoaddr:$A)),
- (v4i32 (LXVWSX xoaddr:$A))>;
+ (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 DSForm:$A), VSFRC)), sub_64)>;
+def : Pat<(v4f32 (PPCldsplat ForceXForm:$A)),
+ (v4f32 (LXVWSX ForceXForm:$A))>;
+def : Pat<(v4i32 (PPCldsplat ForceXForm:$A)),
+ (v4i32 (LXVWSX ForceXForm:$A))>;
} // HasVSX, HasP9Vector
-// Big endian 64Bit Power9 subtarget.
-let Predicates = [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] in {
+// Any Power9 VSX subtarget for which Power10 VSX provides patterns of
+// equivalent length but better quality.
+// Two identical blocks are required due to the slightly different predicates:
+// one without P10 instructions, the other big endian only with P10 instructions.
+let Predicates = [HasVSX, HasP9Vector, NoP10Vector] in {
+// Little endian Power10 subtargets produce a shorter pattern but require a
+// COPY_TO_REGCLASS. The COPY_TO_REGCLASS makes it appear to need two instructions
+// to perform the operation, when only one instruction is produced in practice.
+// The NoP10Vector predicate excludes these patterns from Power10 VSX subtargets.
+defm : ScalToVecWPermute<
+ v16i8, ScalarLoads.Li8,
+ (VSPLTBs 7, (LXSIBZX ForceXForm:$src)),
+ (SUBREG_TO_REG (i64 1), (LXSIBZX ForceXForm:$src), sub_64)>;
+// Build vectors from i16 loads
+defm : ScalToVecWPermute<
+ v8i16, ScalarLoads.Li16,
+ (VSPLTHs 3, (LXSIHZX ForceXForm:$src)),
+ (SUBREG_TO_REG (i64 1), (LXSIHZX ForceXForm:$src), sub_64)>;
+} // HasVSX, HasP9Vector, NoP10Vector
+
+// Any big endian Power9 VSX subtarget
+let Predicates = [HasVSX, HasP9Vector, IsBigEndian] in {
+// Power10 VSX subtargets produce a shorter pattern for little endian targets,
+// but this is still the best pattern for Power9 and Power10 VSX big endian subtargets.
+// Build vectors from i8 loads
+defm : ScalToVecWPermute<
+ v16i8, ScalarLoads.Li8,
+ (VSPLTBs 7, (LXSIBZX ForceXForm:$src)),
+ (SUBREG_TO_REG (i64 1), (LXSIBZX ForceXForm:$src), sub_64)>;
+// Build vectors from i16 loads
+defm : ScalToVecWPermute<
+ v8i16, ScalarLoads.Li16,
+ (VSPLTHs 3, (LXSIHZX ForceXForm:$src)),
+ (SUBREG_TO_REG (i64 1), (LXSIHZX ForceXForm:$src), sub_64)>;
+
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
@@ -3986,87 +4153,103 @@
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
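+// Insert an f64 rounded to f32 into a v4f32 element without a separate round instruction.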
+def : Pat<(v4f32 (insertelt v4f32:$A, (f32 (fpround f64:$B)), 0)),
+ (v4f32 (XXINSERTW v4f32:$A,
+ (SUBREG_TO_REG (i64 1), (XSCVDPSP f64:$B), sub_64), 0))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, (f32 (fpround f64:$B)), 1)),
+ (v4f32 (XXINSERTW v4f32:$A,
+ (SUBREG_TO_REG (i64 1), (XSCVDPSP f64:$B), sub_64), 4))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, (f32 (fpround f64:$B)), 2)),
+ (v4f32 (XXINSERTW v4f32:$A,
+ (SUBREG_TO_REG (i64 1), (XSCVDPSP f64:$B), sub_64), 8))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, (f32 (fpround f64:$B)), 3)),
+ (v4f32 (XXINSERTW v4f32:$A,
+ (SUBREG_TO_REG (i64 1), (XSCVDPSP f64:$B), sub_64), 12))>;
+
// Scalar stores of i8
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS $S, VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), ForceXForm:$dst)>;
// Scalar stores of i16
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS $S, VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), ForceXForm:$dst)>;
+} // HasVSX, HasP9Vector, IsBigEndian
-def : Pat<(v2i64 (scalar_to_vector (i64 (load iaddrX4:$src)))),
- (v2i64 (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC))>;
-def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddrX4:$src)))),
- (v2i64 (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC))>;
+// Big endian 64Bit Power9 subtarget.
+let Predicates = [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] in {
+def : Pat<(v2i64 (scalar_to_vector (i64 (load DSForm:$src)))),
+ (v2i64 (SUBREG_TO_REG (i64 1), (DFLOADf64 DSForm:$src), sub_64))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (load XForm:$src)))),
+ (v2i64 (SUBREG_TO_REG (i64 1), (XFLOADf64 XForm:$src), sub_64))>;
-def : Pat<(v2f64 (scalar_to_vector (f64 (load iaddrX4:$src)))),
- (v2f64 (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC))>;
-def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddrX4:$src)))),
- (v2f64 (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC))>;
-def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xaddrX4:$src),
+def : Pat<(v2f64 (scalar_to_vector (f64 (load DSForm:$src)))),
+ (v2f64 (SUBREG_TO_REG (i64 1), (DFLOADf64 DSForm:$src), sub_64))>;
+def : Pat<(v2f64 (scalar_to_vector (f64 (load XForm:$src)))),
+ (v2f64 (SUBREG_TO_REG (i64 1), (XFLOADf64 XForm:$src), sub_64))>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 1)), XForm:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), xaddrX4:$src)>;
-def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xaddrX4:$src),
+ sub_64), XForm:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 1)), XForm:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), xaddrX4:$src)>;
-def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xaddrX4:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
-def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xaddrX4:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
-def : Pat<(store (i64 (extractelt v2i64:$A, 1)), iaddrX4:$src),
+ sub_64), XForm:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 0)), XForm:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), XForm:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 0)), XForm:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), XForm:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 1)), DSForm:$src),
(DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), iaddrX4:$src)>;
-def : Pat<(store (f64 (extractelt v2f64:$A, 1)), iaddrX4:$src),
+ sub_64), DSForm:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 1)), DSForm:$src),
(DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), iaddrX4:$src)>;
-def : Pat<(store (i64 (extractelt v2i64:$A, 0)), iaddrX4:$src),
- (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
-def : Pat<(store (f64 (extractelt v2f64:$A, 0)), iaddrX4:$src),
- (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
+ sub_64), DSForm:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 0)), DSForm:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), DSForm:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 0)), DSForm:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), DSForm:$src)>;
// (Un)Signed DWord vector extract -> QP
def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))),
@@ -4093,7 +4276,7 @@
(f128 (XSCVUDQP (XXEXTRACTUW $src, !shl(Idx, 2))))>;
}
-// (Un)Signed HWord vector extract -> QP
+// (Un)Signed HWord vector extract -> QP/DP/SP
foreach Idx = 0-7 in {
def : Pat<(f128 (sint_to_fp
(i32 (sext_inreg
@@ -4106,6 +4289,31 @@
(and (i32 (vector_extract v8i16:$src, Idx)), 65535))),
(f128 (XSCVUDQP (EXTRACT_SUBREG
(VEXTRACTUH !add(Idx, Idx), $src), sub_64)))>;
+ def : Pat<(f32 (PPCfcfidus
+ (f64 (PPCmtvsrz (and (i32 (vector_extract v8i16:$src, Idx)),
+ 65535))))),
+ (f32 (XSCVUXDSP (EXTRACT_SUBREG
+ (VEXTRACTUH !add(Idx, Idx), $src), sub_64)))>;
+ def : Pat<(f32 (PPCfcfids
+ (f64 (PPCmtvsra
+ (i32 (sext_inreg (vector_extract v8i16:$src, Idx),
+ i16)))))),
+ (f32 (XSCVSXDSP (EXTRACT_SUBREG
+ (VEXTSH2D (VEXTRACTUH !add(Idx, Idx), $src)),
+ sub_64)))>;
+ def : Pat<(f64 (PPCfcfidu
+ (f64 (PPCmtvsrz
+ (and (i32 (vector_extract v8i16:$src, Idx)),
+ 65535))))),
+ (f64 (XSCVUXDDP (EXTRACT_SUBREG
+ (VEXTRACTUH !add(Idx, Idx), $src), sub_64)))>;
+ def : Pat<(f64 (PPCfcfid
+ (f64 (PPCmtvsra
+ (i32 (sext_inreg (vector_extract v8i16:$src, Idx),
+ i16)))))),
+ (f64 (XSCVSXDDP (EXTRACT_SUBREG
+ (VEXTSH2D (VEXTRACTUH !add(Idx, Idx), $src)),
+ sub_64)))>;
}
// (Un)Signed Byte vector extract -> QP
@@ -4119,6 +4327,33 @@
(and (i32 (vector_extract v16i8:$src, Idx)), 255))),
(f128 (XSCVUDQP
(EXTRACT_SUBREG (VEXTRACTUB Idx, $src), sub_64)))>;
+
+ def : Pat<(f32 (PPCfcfidus
+ (f64 (PPCmtvsrz
+ (and (i32 (vector_extract v16i8:$src, Idx)),
+ 255))))),
+ (f32 (XSCVUXDSP (EXTRACT_SUBREG
+ (VEXTRACTUB !add(Idx, Idx), $src), sub_64)))>;
+ def : Pat<(f32 (PPCfcfids
+ (f64 (PPCmtvsra
+ (i32 (sext_inreg (vector_extract v16i8:$src, Idx),
+ i8)))))),
+ (f32 (XSCVSXDSP (EXTRACT_SUBREG
+ (VEXTSH2D (VEXTRACTUB !add(Idx, Idx), $src)),
+ sub_64)))>;
+ def : Pat<(f64 (PPCfcfidu
+ (f64 (PPCmtvsrz
+ (and (i32 (vector_extract v16i8:$src, Idx)),
+ 255))))),
+ (f64 (XSCVUXDDP (EXTRACT_SUBREG
+ (VEXTRACTUB !add(Idx, Idx), $src), sub_64)))>;
+ def : Pat<(f64 (PPCfcfid
+ (f64 (PPCmtvsra
+ (i32 (sext_inreg (vector_extract v16i8:$src, Idx),
+ i8)))))),
+ (f64 (XSCVSXDDP (EXTRACT_SUBREG
+ (VEXTSH2D (VEXTRACTUB !add(Idx, Idx), $src)),
+ sub_64)))>;
}
// Unsigned int in vsx register -> QP
@@ -4162,105 +4397,118 @@
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
-def : Pat<(v8i16 (PPCld_vec_be xoaddr:$src)),
- (COPY_TO_REGCLASS (LXVH8X xoaddr:$src), VRRC)>;
-def : Pat<(PPCst_vec_be v8i16:$rS, xoaddr:$dst),
- (STXVH8X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
+def : Pat<(v4f32 (insertelt v4f32:$A, (f32 (fpround f64:$B)), 0)),
+ (v4f32 (XXINSERTW v4f32:$A,
+ (SUBREG_TO_REG (i64 1), (XSCVDPSP f64:$B), sub_64), 12))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, (f32 (fpround f64:$B)), 1)),
+ (v4f32 (XXINSERTW v4f32:$A,
+ (SUBREG_TO_REG (i64 1), (XSCVDPSP f64:$B), sub_64), 8))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, (f32 (fpround f64:$B)), 2)),
+ (v4f32 (XXINSERTW v4f32:$A,
+ (SUBREG_TO_REG (i64 1), (XSCVDPSP f64:$B), sub_64), 4))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, (f32 (fpround f64:$B)), 3)),
+ (v4f32 (XXINSERTW v4f32:$A,
+ (SUBREG_TO_REG (i64 1), (XSCVDPSP f64:$B), sub_64), 0))>;
-def : Pat<(v16i8 (PPCld_vec_be xoaddr:$src)),
- (COPY_TO_REGCLASS (LXVB16X xoaddr:$src), VRRC)>;
-def : Pat<(PPCst_vec_be v16i8:$rS, xoaddr:$dst),
- (STXVB16X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
+def : Pat<(v8i16 (PPCld_vec_be ForceXForm:$src)),
+ (COPY_TO_REGCLASS (LXVH8X ForceXForm:$src), VRRC)>;
+def : Pat<(PPCst_vec_be v8i16:$rS, ForceXForm:$dst),
+ (STXVH8X (COPY_TO_REGCLASS $rS, VSRC), ForceXForm:$dst)>;
+
+def : Pat<(v16i8 (PPCld_vec_be ForceXForm:$src)),
+ (COPY_TO_REGCLASS (LXVB16X ForceXForm:$src), VRRC)>;
+def : Pat<(PPCst_vec_be v16i8:$rS, ForceXForm:$dst),
+ (STXVB16X (COPY_TO_REGCLASS $rS, VSRC), ForceXForm:$dst)>;
// Scalar stores of i8
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS $S, VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), ForceXForm:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), ForceXForm:$dst)>;
// Scalar stores of i16
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
-def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS $S, VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), ForceXForm:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), ForceXForm:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), ForceXForm:$dst)>;
defm : ScalToVecWPermute<
- v2i64, (i64 (load iaddrX4:$src)),
- (XXPERMDIs (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSFRC), 2),
- (SUBREG_TO_REG (i64 1), (DFLOADf64 iaddrX4:$src), sub_64)>;
+ v2i64, (i64 (load DSForm:$src)),
+ (XXPERMDIs (DFLOADf64 DSForm:$src), 2),
+ (SUBREG_TO_REG (i64 1), (DFLOADf64 DSForm:$src), sub_64)>;
defm : ScalToVecWPermute<
- v2i64, (i64 (load xaddrX4:$src)),
- (XXPERMDIs (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSFRC), 2),
- (SUBREG_TO_REG (i64 1), (XFLOADf64 xaddrX4:$src), sub_64)>;
+ v2i64, (i64 (load XForm:$src)),
+ (XXPERMDIs (XFLOADf64 XForm:$src), 2),
+ (SUBREG_TO_REG (i64 1), (XFLOADf64 XForm:$src), sub_64)>;
defm : ScalToVecWPermute<
- v2f64, (f64 (load iaddrX4:$src)),
- (XXPERMDIs (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSFRC), 2),
- (SUBREG_TO_REG (i64 1), (DFLOADf64 iaddrX4:$src), sub_64)>;
+ v2f64, (f64 (load DSForm:$src)),
+ (XXPERMDIs (DFLOADf64 DSForm:$src), 2),
+ (SUBREG_TO_REG (i64 1), (DFLOADf64 DSForm:$src), sub_64)>;
defm : ScalToVecWPermute<
- v2f64, (f64 (load xaddrX4:$src)),
- (XXPERMDIs (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSFRC), 2),
- (SUBREG_TO_REG (i64 1), (XFLOADf64 xaddrX4:$src), sub_64)>;
+ v2f64, (f64 (load XForm:$src)),
+ (XXPERMDIs (XFLOADf64 XForm:$src), 2),
+ (SUBREG_TO_REG (i64 1), (XFLOADf64 XForm:$src), sub_64)>;
-def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xaddrX4:$src),
+def : Pat<(store (i64 (extractelt v2i64:$A, 0)), XForm:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), xaddrX4:$src)>;
-def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xaddrX4:$src),
+ sub_64), XForm:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 0)), XForm:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), xaddrX4:$src)>;
-def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xaddrX4:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
-def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xaddrX4:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
-def : Pat<(store (i64 (extractelt v2i64:$A, 0)), iaddrX4:$src),
+ sub_64), XForm:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 1)), XForm:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), XForm:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 1)), XForm:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), XForm:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 0)), DSForm:$src),
(DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), iaddrX4:$src)>;
-def : Pat<(store (f64 (extractelt v2f64:$A, 0)), iaddrX4:$src),
+ sub_64), DSForm:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 0)), DSForm:$src),
(DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
- iaddrX4:$src)>;
-def : Pat<(store (i64 (extractelt v2i64:$A, 1)), iaddrX4:$src),
- (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
-def : Pat<(store (f64 (extractelt v2f64:$A, 1)), iaddrX4:$src),
- (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
+ DSForm:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 1)), DSForm:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), DSForm:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 1)), DSForm:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), DSForm:$src)>;
// (Un)Signed DWord vector extract -> QP
def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))),
@@ -4289,7 +4537,7 @@
(f128 (XSCVUDQP (XXEXTRACTUW $src, !head(!tail(Idx)))))>;
}
-// (Un)Signed HWord vector extract -> QP
+// (Un)Signed HWord vector extract -> QP/DP/SP
// The Nested foreach lists identifies the vector element and corresponding
// register byte location.
foreach Idx = [[0,14],[1,12],[2,10],[3,8],[4,6],[5,4],[6,2],[7,0]] in {
@@ -4305,9 +4553,37 @@
65535))),
(f128 (XSCVUDQP (EXTRACT_SUBREG
(VEXTRACTUH !head(!tail(Idx)), $src), sub_64)))>;
+ def : Pat<(f32 (PPCfcfidus
+ (f64 (PPCmtvsrz
+ (and (i32 (vector_extract v8i16:$src, !head(Idx))),
+ 65535))))),
+ (f32 (XSCVUXDSP (EXTRACT_SUBREG
+ (VEXTRACTUH !head(!tail(Idx)), $src), sub_64)))>;
+ def : Pat<(f32 (PPCfcfids
+ (f64 (PPCmtvsra
+ (i32 (sext_inreg (vector_extract v8i16:$src,
+ !head(Idx)), i16)))))),
+ (f32 (XSCVSXDSP
+ (EXTRACT_SUBREG
+ (VEXTSH2D (VEXTRACTUH !head(!tail(Idx)), $src)),
+ sub_64)))>;
+ def : Pat<(f64 (PPCfcfidu
+ (f64 (PPCmtvsrz
+ (and (i32 (vector_extract v8i16:$src, !head(Idx))),
+ 65535))))),
+ (f64 (XSCVUXDDP (EXTRACT_SUBREG
+ (VEXTRACTUH !head(!tail(Idx)), $src), sub_64)))>;
+ def : Pat<(f64 (PPCfcfid
+ (f64 (PPCmtvsra
+ (i32 (sext_inreg
+ (vector_extract v8i16:$src, !head(Idx)), i16)))))),
+ (f64 (XSCVSXDDP
+ (EXTRACT_SUBREG (VEXTSH2D
+ (VEXTRACTUH !head(!tail(Idx)), $src)),
+ sub_64)))>;
}
-// (Un)Signed Byte vector extract -> QP
+// (Un)Signed Byte vector extract -> QP/DP/SP
foreach Idx = [[0,15],[1,14],[2,13],[3,12],[4,11],[5,10],[6,9],[7,8],[8,7],
[9,6],[10,5],[11,4],[12,3],[13,2],[14,1],[15,0]] in {
def : Pat<(f128 (sint_to_fp
@@ -4323,6 +4599,44 @@
(f128 (XSCVUDQP
(EXTRACT_SUBREG
(VEXTRACTUB !head(!tail(Idx)), $src), sub_64)))>;
+
+ def : Pat<(f32 (PPCfcfidus
+ (f64 (PPCmtvsrz
+ (and (i32 (vector_extract v16i8:$src, !head(Idx))),
+ 255))))),
+ (f32 (XSCVUXDSP (EXTRACT_SUBREG
+ (VEXTRACTUB !head(!tail(Idx)), $src), sub_64)))>;
+ def : Pat<(f32 (PPCfcfids
+ (f64 (PPCmtvsra
+ (i32 (sext_inreg
+ (vector_extract v16i8:$src, !head(Idx)), i8)))))),
+ (f32 (XSCVSXDSP
+ (EXTRACT_SUBREG (VEXTSH2D
+ (VEXTRACTUB !head(!tail(Idx)), $src)),
+ sub_64)))>;
+ def : Pat<(f64 (PPCfcfidu
+ (f64 (PPCmtvsrz
+ (and (i32
+ (vector_extract v16i8:$src, !head(Idx))), 255))))),
+ (f64 (XSCVUXDDP (EXTRACT_SUBREG
+ (VEXTRACTUB !head(!tail(Idx)), $src), sub_64)))>;
+ def : Pat<(f64 (PPCfcfidu
+ (f64 (PPCmtvsra
+ (i32 (sext_inreg
+ (vector_extract v16i8:$src, !head(Idx)), i8)))))),
+ (f64 (XSCVSXDDP
+ (EXTRACT_SUBREG (VEXTSH2D
+ (VEXTRACTUB !head(!tail(Idx)), $src)),
+ sub_64)))>;
+
+ def : Pat<(f64 (PPCfcfid
+ (f64 (PPCmtvsra
+ (i32 (sext_inreg
+ (vector_extract v16i8:$src, !head(Idx)), i8)))))),
+ (f64 (XSCVSXDDP
+ (EXTRACT_SUBREG (VEXTSH2D
+ (VEXTRACTUH !head(!tail(Idx)), $src)),
+ sub_64)))>;
}
// Unsigned int in vsx register -> QP
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index c242409..1d2b1ed 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -41,8 +41,6 @@
// *++p = c;
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "ppc-loop-instr-form-prep"
-
#include "PPC.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
@@ -79,6 +77,8 @@
#include <iterator>
#include <utility>
+#define DEBUG_TYPE "ppc-loop-instr-form-prep"
+
using namespace llvm;
static cl::opt<unsigned> MaxVarsPrep("ppc-formprep-max-vars",
@@ -191,11 +191,11 @@
/// Collect condition matched(\p isValidCandidate() returns true)
/// candidates in Loop \p L.
- SmallVector<Bucket, 16>
- collectCandidates(Loop *L,
- std::function<bool(const Instruction *, const Value *)>
- isValidCandidate,
- unsigned MaxCandidateNum);
+ SmallVector<Bucket, 16> collectCandidates(
+ Loop *L,
+ std::function<bool(const Instruction *, const Value *, const Type *)>
+ isValidCandidate,
+ unsigned MaxCandidateNum);
/// Add a candidate to candidates \p Buckets.
void addOneCandidate(Instruction *MemI, const SCEV *LSCEV,
@@ -331,27 +331,27 @@
SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates(
Loop *L,
- std::function<bool(const Instruction *, const Value *)> isValidCandidate,
+ std::function<bool(const Instruction *, const Value *, const Type *)>
+ isValidCandidate,
unsigned MaxCandidateNum) {
SmallVector<Bucket, 16> Buckets;
for (const auto &BB : L->blocks())
for (auto &J : *BB) {
Value *PtrValue;
- Instruction *MemI;
+ Type *PointerElementType;
if (LoadInst *LMemI = dyn_cast<LoadInst>(&J)) {
- MemI = LMemI;
PtrValue = LMemI->getPointerOperand();
+ PointerElementType = LMemI->getType();
} else if (StoreInst *SMemI = dyn_cast<StoreInst>(&J)) {
- MemI = SMemI;
PtrValue = SMemI->getPointerOperand();
+ PointerElementType = SMemI->getValueOperand()->getType();
} else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(&J)) {
+ PointerElementType = Type::getInt8Ty(J.getContext());
if (IMemI->getIntrinsicID() == Intrinsic::prefetch ||
IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) {
- MemI = IMemI;
PtrValue = IMemI->getArgOperand(0);
} else if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) {
- MemI = IMemI;
PtrValue = IMemI->getArgOperand(1);
} else continue;
} else continue;
@@ -368,8 +368,8 @@
if (!LARSCEV || LARSCEV->getLoop() != L)
continue;
- if (isValidCandidate(&J, PtrValue))
- addOneCandidate(MemI, LSCEV, Buckets, MaxCandidateNum);
+ if (isValidCandidate(&J, PtrValue, PointerElementType))
+ addOneCandidate(&J, LSCEV, Buckets, MaxCandidateNum);
}
return Buckets;
}
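The reworked collectCandidates resolves the pointee type itself (the load's result type, the stored value's type, or i8 for the prefetch/lxvp/stxvp intrinsics) and passes it to the predicate, so the lambdas no longer reach through the pointer operand with getPointerElementType(). A minimal standalone sketch of that callback shape, using hypothetical stand-in types rather than the real LLVM classes:

#include <functional>
#include <utility>
#include <vector>

// Hypothetical stand-ins for llvm::Instruction / llvm::Value / llvm::Type;
// this is an illustrative sketch of the callback shape, not the real pass.
struct Inst { bool IsIntrinsic = false; };
struct Val  {};
struct Ty   { bool IsInt64 = false; };

using CandidatePred = std::function<bool(const Inst *, const Val *, const Ty *)>;

// The caller works out the element type once and hands it to every predicate,
// so no predicate has to peek through the pointer operand.
std::vector<const Inst *>
collect(const std::vector<std::pair<Inst, Ty>> &MemOps,
        const CandidatePred &IsValid) {
  std::vector<const Inst *> Picked;
  Val Ptr; // placeholder for the pointer operand
  for (const auto &Op : MemOps)
    if (IsValid(&Op.first, &Ptr, &Op.second))
      Picked.push_back(&Op.first);
  return Picked;
}

int main() {
  // Roughly the DS-form rule from later in this file: skip intrinsics and
  // accept 64-bit integer element types.
  CandidatePred IsDSForm = [](const Inst *I, const Val *, const Ty *EltTy) {
    return !I->IsIntrinsic && EltTy->IsInt64;
  };
  std::vector<std::pair<Inst, Ty>> Ops = {{Inst{false}, Ty{true}},
                                          {Inst{true}, Ty{true}}};
  return collect(Ops, IsDSForm).size() == 1 ? 0 : 1;
}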
@@ -825,12 +825,11 @@
}
// Check if a load/store has update form. This lambda is used by function
// collectCandidates which can collect candidates for types defined by lambda.
- auto isUpdateFormCandidate = [&] (const Instruction *I,
- const Value *PtrValue) {
+ auto isUpdateFormCandidate = [&](const Instruction *I, const Value *PtrValue,
+ const Type *PointerElementType) {
assert((PtrValue && I) && "Invalid parameter!");
// There are no update forms for Altivec vector load/stores.
- if (ST && ST->hasAltivec() &&
- PtrValue->getType()->getPointerElementType()->isVectorTy())
+ if (ST && ST->hasAltivec() && PointerElementType->isVectorTy())
return false;
// There are no update forms for P10 lxvp/stxvp intrinsic.
auto *II = dyn_cast<IntrinsicInst>(I);
@@ -842,7 +841,7 @@
// fits in a 16-bit signed field but isn't a multiple of 4, it will be
// useless and possible to break some original well-form addressing mode
// to make this pre-inc prep for it.
- if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) {
+ if (PointerElementType->isIntegerTy(64)) {
const SCEV *LSCEV = SE->getSCEVAtScope(const_cast<Value *>(PtrValue), L);
const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV);
if (!LARSCEV || LARSCEV->getLoop() != L)
@@ -856,13 +855,13 @@
}
return true;
};
-
+
// Check if a load/store has DS form.
- auto isDSFormCandidate = [] (const Instruction *I, const Value *PtrValue) {
+ auto isDSFormCandidate = [](const Instruction *I, const Value *PtrValue,
+ const Type *PointerElementType) {
assert((PtrValue && I) && "Invalid parameter!");
if (isa<IntrinsicInst>(I))
return false;
- Type *PointerElementType = PtrValue->getType()->getPointerElementType();
return (PointerElementType->isIntegerTy(64)) ||
(PointerElementType->isFloatTy()) ||
(PointerElementType->isDoubleTy()) ||
@@ -872,7 +871,8 @@
};
// Check if a load/store has DQ form.
- auto isDQFormCandidate = [&] (const Instruction *I, const Value *PtrValue) {
+ auto isDQFormCandidate = [&](const Instruction *I, const Value *PtrValue,
+ const Type *PointerElementType) {
assert((PtrValue && I) && "Invalid parameter!");
// Check if it is a P10 lxvp/stxvp intrinsic.
auto *II = dyn_cast<IntrinsicInst>(I);
@@ -880,8 +880,7 @@
return II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp ||
II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp;
// Check if it is a P9 vector load/store.
- return ST && ST->hasP9Vector() &&
- (PtrValue->getType()->getPointerElementType()->isVectorTy());
+ return ST && ST->hasP9Vector() && (PointerElementType->isVectorTy());
};
// intrinsic for update form.
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
index 27b2c9a..4c9f5ff 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
@@ -28,9 +28,6 @@
namespace {
-// Length of the suffix "massv", which is specific to IBM MASSV library entries.
-const unsigned MASSVSuffixLength = 5;
-
static StringRef MASSVFuncs[] = {
#define TLI_DEFINE_MASSV_VECFUNCS_NAMES
#include "llvm/Analysis/VecFuncs.def"
@@ -70,20 +67,26 @@
// FIXME:
/// Returns a string corresponding to the specified PowerPC subtarget. e.g.:
-/// "P8" for Power8, "P9" for Power9. The string is used as a suffix while
+/// "_P8" for Power8, "_P9" for Power9. The string is used as a suffix while
/// generating subtarget-specific MASSV library functions. Current support
-/// includes Power8 and Power9 subtargets.
+/// includes minimum subtarget Power8 for Linux and Power7 for AIX.
StringRef PPCLowerMASSVEntries::getCPUSuffix(const PPCSubtarget *Subtarget) {
- // Assume Power8 when Subtarget is unavailable.
+ // Assume generic when Subtarget is unavailable.
if (!Subtarget)
- return "P8";
+ return "";
+ // TODO: add _P10 entries to Linux MASS lib and remove the check for AIX
+ if (Subtarget->isAIXABI() && Subtarget->hasP10Vector())
+ return "_P10";
if (Subtarget->hasP9Vector())
- return "P9";
+ return "_P9";
if (Subtarget->hasP8Vector())
- return "P8";
+ return "_P8";
+ if (Subtarget->isAIXABI())
+ return "_P7";
- report_fatal_error("Unsupported Subtarget: MASSV is supported only on "
- "Power8 and Power9 subtargets.");
+ report_fatal_error(
+ "Mininum subtarget for -vector-library=MASSV option is Power8 on Linux "
+ "and Power7 on AIX when vectorization is not disabled.");
}
/// Creates PowerPC subtarget-specific name corresponding to the specified
@@ -92,7 +95,7 @@
PPCLowerMASSVEntries::createMASSVFuncName(Function &Func,
const PPCSubtarget *Subtarget) {
StringRef Suffix = getCPUSuffix(Subtarget);
- auto GenericName = Func.getName().drop_back(MASSVSuffixLength).str();
+ auto GenericName = Func.getName().str();
std::string MASSVEntryName = GenericName + Suffix.str();
return MASSVEntryName;
}
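With the "_massv" suffix gone from the generic names, createMASSVFuncName reduces to appending the subtarget suffix chosen by getCPUSuffix. A standalone sketch of that mapping (the helper names and boolean parameters here are illustrative, not the pass's API):

#include <cassert>
#include <string>

// Mirrors getCPUSuffix()'s priority order from this hunk.
std::string cpuSuffix(bool isAIX, bool p10Vec, bool p9Vec, bool p8Vec) {
  if (isAIX && p10Vec) return "_P10";
  if (p9Vec)           return "_P9";
  if (p8Vec)           return "_P8";
  if (isAIX)           return "_P7";
  return ""; // the real pass reports a fatal error here when a subtarget is known
}

std::string massvEntryName(const std::string &Generic, const std::string &Suffix) {
  return Generic + Suffix; // e.g. "__powf4" + "_P9"
}

int main() {
  assert(massvEntryName("__powf4", cpuSuffix(false, false, true, true)) ==
         "__powf4_P9");
  assert(massvEntryName("__powd2", cpuSuffix(true, false, false, false)) ==
         "__powd2_P7");
  return 0;
}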
@@ -101,7 +104,7 @@
/// intrinsics when the exponent is 0.25 or 0.75.
bool PPCLowerMASSVEntries::handlePowSpecialCases(CallInst *CI, Function &Func,
Module &M) {
- if (Func.getName() != "__powf4_massv" && Func.getName() != "__powd2_massv")
+ if (Func.getName() != "__powf4" && Func.getName() != "__powd2")
return false;
if (Constant *Exp = dyn_cast<Constant>(CI->getArgOperand(1)))
@@ -167,9 +170,7 @@
// Call to lowerMASSVCall() invalidates the iterator over users upon
// replacing the users. Precomputing the current list of users allows us to
// replace all the call sites.
- SmallVector<User *, 4> MASSVUsers;
- for (auto *User: Func.users())
- MASSVUsers.push_back(User);
+ SmallVector<User *, 4> MASSVUsers(Func.users());
for (auto *User : MASSVUsers) {
auto *CI = dyn_cast<CallInst>(User);
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index c8b01aa..4bbb6ed 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -226,28 +226,30 @@
void PPCMIPeephole::UpdateTOCSaves(
std::map<MachineInstr *, bool> &TOCSaves, MachineInstr *MI) {
assert(TII->isTOCSaveMI(*MI) && "Expecting a TOC save instruction here");
- assert(MF->getSubtarget<PPCSubtarget>().isELFv2ABI() &&
- "TOC-save removal only supported on ELFv2");
- PPCFunctionInfo *FI = MF->getInfo<PPCFunctionInfo>();
+ // FIXME: Saving the TOC in the prologue has not been properly implemented for
+ // the AIX ABI yet, so only support it under ELFv2 here.
+ if (MF->getSubtarget<PPCSubtarget>().isELFv2ABI()) {
+ PPCFunctionInfo *FI = MF->getInfo<PPCFunctionInfo>();
- MachineBasicBlock *Entry = &MF->front();
- uint64_t CurrBlockFreq = MBFI->getBlockFreq(MI->getParent()).getFrequency();
+ MachineBasicBlock *Entry = &MF->front();
+ uint64_t CurrBlockFreq = MBFI->getBlockFreq(MI->getParent()).getFrequency();
- // If the block in which the TOC save resides is in a block that
- // post-dominates Entry, or a block that is hotter than entry (keep in mind
- // that early MachineLICM has already run so the TOC save won't be hoisted)
- // we can just do the save in the prologue.
- if (CurrBlockFreq > EntryFreq || MPDT->dominates(MI->getParent(), Entry))
- FI->setMustSaveTOC(true);
+ // If the block in which the TOC save resides is in a block that
+ // post-dominates Entry, or a block that is hotter than entry (keep in mind
+ // that early MachineLICM has already run so the TOC save won't be hoisted)
+ // we can just do the save in the prologue.
+ if (CurrBlockFreq > EntryFreq || MPDT->dominates(MI->getParent(), Entry))
+ FI->setMustSaveTOC(true);
- // If we are saving the TOC in the prologue, all the TOC saves can be removed
- // from the code.
- if (FI->mustSaveTOC()) {
- for (auto &TOCSave : TOCSaves)
- TOCSave.second = false;
- // Add new instruction to map.
- TOCSaves[MI] = false;
- return;
+ // If we are saving the TOC in the prologue, all the TOC saves can be
+ // removed from the code.
+ if (FI->mustSaveTOC()) {
+ for (auto &TOCSave : TOCSaves)
+ TOCSave.second = false;
+ // Add new instruction to map.
+ TOCSaves[MI] = false;
+ return;
+ }
}
bool Keep = true;
@@ -476,10 +478,12 @@
}
break;
}
+ case PPC::STW:
case PPC::STD: {
MachineFrameInfo &MFI = MF->getFrameInfo();
if (MFI.hasVarSizedObjects() ||
- !MF->getSubtarget<PPCSubtarget>().isELFv2ABI())
+ (!MF->getSubtarget<PPCSubtarget>().isELFv2ABI() &&
+ !MF->getSubtarget<PPCSubtarget>().isAIXABI()))
break;
// When encountering a TOC save instruction, call UpdateTOCSaves
// to add it to the TOCSaves map and mark any existing TOC saves
@@ -660,7 +664,8 @@
Register ShiftOp1 = DefMI->getOperand(1).getReg();
Register ShiftOp2 = DefMI->getOperand(2).getReg();
unsigned ShiftImm = DefMI->getOperand(3).getImm();
- unsigned SplatImm = MI.getOperand(2).getImm();
+ unsigned SplatImm =
+ MI.getOperand(MyOpcode == PPC::XXSPLTW ? 2 : 1).getImm();
if (ShiftOp1 == ShiftOp2) {
unsigned NewElem = (SplatImm + ShiftImm) & 0x3;
if (MRI->hasOneNonDBGUse(ShiftRes)) {
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index c976a9c..782d41f 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -66,34 +66,122 @@
}
void PPCFunctionInfo::appendParameterType(ParamType Type) {
- uint32_t CopyParamType = ParameterType;
- int Bits = 0;
- // If it is fixed type, we only need to increase the FixedParamNum, for
- // the bit encode of fixed type is bit of zero, we do not need to change the
- // ParamType.
- if (Type == FixedType) {
- ++FixedParamNum;
+ ParamtersType.push_back(Type);
+ switch (Type) {
+ case FixedType:
+ ++FixedParmsNum;
+ return;
+ case ShortFloatingPoint:
+ case LongFloatingPoint:
+ ++FloatingParmsNum;
+ return;
+ case VectorChar:
+ case VectorShort:
+ case VectorInt:
+ case VectorFloat:
+ ++VectorParmsNum;
return;
}
+ llvm_unreachable("Error ParamType type.");
+}
- ++FloatingPointParamNum;
+uint32_t PPCFunctionInfo::getVecExtParmsType() const {
- for (int I = 0;
- I < static_cast<int>(FloatingPointParamNum + FixedParamNum - 1); ++I) {
- if (CopyParamType & XCOFF::TracebackTable::ParmTypeIsFloatingBit) {
+ uint32_t VectExtParamInfo = 0;
+ unsigned ShiftBits = 32 - XCOFF::TracebackTable::WidthOfParamType;
+ int Bits = 0;
+
+ if (!hasVectorParms())
+ return 0;
+
+ for (const auto &Elt : ParamtersType) {
+ switch (Elt) {
+ case VectorChar:
+ VectExtParamInfo <<= XCOFF::TracebackTable::WidthOfParamType;
+ VectExtParamInfo |=
+ XCOFF::TracebackTable::ParmTypeIsVectorCharBit >> ShiftBits;
+ Bits += XCOFF::TracebackTable::WidthOfParamType;
+ break;
+ case VectorShort:
+ VectExtParamInfo <<= XCOFF::TracebackTable::WidthOfParamType;
+ VectExtParamInfo |=
+ XCOFF::TracebackTable::ParmTypeIsVectorShortBit >> ShiftBits;
+ Bits += XCOFF::TracebackTable::WidthOfParamType;
+ break;
+ case VectorInt:
+ VectExtParamInfo <<= XCOFF::TracebackTable::WidthOfParamType;
+ VectExtParamInfo |=
+ XCOFF::TracebackTable::ParmTypeIsVectorIntBit >> ShiftBits;
+ Bits += XCOFF::TracebackTable::WidthOfParamType;
+ break;
+ case VectorFloat:
+ VectExtParamInfo <<= XCOFF::TracebackTable::WidthOfParamType;
+ VectExtParamInfo |=
+ XCOFF::TracebackTable::ParmTypeIsVectorFloatBit >> ShiftBits;
+ Bits += XCOFF::TracebackTable::WidthOfParamType;
+ break;
+ default:
+ break;
+ }
+
+ // There are only 32 bits in the VectExtParamInfo.
+ if (Bits >= 32)
+ break;
+ }
+ return Bits < 32 ? VectExtParamInfo << (32 - Bits) : VectExtParamInfo;
+}
+
+uint32_t PPCFunctionInfo::getParmsType() const {
+ uint32_t ParamsTypeInfo = 0;
+ unsigned ShiftBits = 32 - XCOFF::TracebackTable::WidthOfParamType;
+
+ int Bits = 0;
+ for (const auto &Elt : ParamtersType) {
+
+ if (Bits > 31 || (Bits > 30 && (Elt != FixedType || hasVectorParms())))
+ break;
+
+ switch (Elt) {
+ case FixedType:
+ if (hasVectorParms()) {
+ //'00' ==> fixed parameter if HasVectorParms is true.
+ ParamsTypeInfo <<= XCOFF::TracebackTable::WidthOfParamType;
+ ParamsTypeInfo |=
+ XCOFF::TracebackTable::ParmTypeIsFixedBits >> ShiftBits;
+ Bits += XCOFF::TracebackTable::WidthOfParamType;
+ } else {
+ //'0' ==> fixed parameter if HasVectorParms is false.
+ ParamsTypeInfo <<= 1;
+ ++Bits;
+ }
+ break;
+ case ShortFloatingPoint:
// '10'b => floating point short parameter.
+ ParamsTypeInfo <<= XCOFF::TracebackTable::WidthOfParamType;
+ ParamsTypeInfo |=
+ XCOFF::TracebackTable::ParmTypeIsFloatingBits >> ShiftBits;
+ Bits += XCOFF::TracebackTable::WidthOfParamType;
+ break;
+ case LongFloatingPoint:
// '11'b => floating point long parameter.
- CopyParamType <<= 2;
- Bits += 2;
- } else {
- // '0'b => fixed parameter.
- CopyParamType <<= 1;
- ++Bits;
+ ParamsTypeInfo <<= XCOFF::TracebackTable::WidthOfParamType;
+ ParamsTypeInfo |=
+ XCOFF::TracebackTable::ParmTypeIsDoubleBits >> ShiftBits;
+ Bits += XCOFF::TracebackTable::WidthOfParamType;
+ break;
+ case VectorChar:
+ case VectorShort:
+ case VectorInt:
+ case VectorFloat:
+ // '01' ==> vector parameter
+ ParamsTypeInfo <<= XCOFF::TracebackTable::WidthOfParamType;
+ ParamsTypeInfo |=
+ XCOFF::TracebackTable::ParmTypeIsVectorBits >> ShiftBits;
+ Bits += XCOFF::TracebackTable::WidthOfParamType;
+ break;
}
}
- assert(Type != FixedType && "FixedType should already be handled.");
- if (Bits < 31)
- ParameterType |= Type << (30 - Bits);
+ return Bits < 32 ? ParamsTypeInfo << (32 - Bits) : ParamsTypeInfo;
}
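getParmsType packs one small code per register-passed parameter into a left-justified 32-bit field: a single '0' bit for a fixed parameter when no vector parameters are present, and two-bit codes ('00' fixed, '10' short float, '11' long float, '01' vector, per the comments in this hunk) once vectors are involved. A standalone sketch of that packing, with the bit codes taken from those comments rather than the XCOFF headers:

#include <cassert>
#include <cstdint>
#include <vector>

enum class Parm { Fixed, FloatShort, FloatLong, Vector };

// Illustrative re-implementation of the packing loop: append each code,
// then left-justify the accumulated bits in the 32-bit traceback field.
uint32_t packParmsType(const std::vector<Parm> &Parms, bool HasVectorParms) {
  uint32_t Info = 0;
  int Bits = 0;
  for (Parm P : Parms) {
    if (Bits > 31 || (Bits > 30 && (P != Parm::Fixed || HasVectorParms)))
      break;
    if (P == Parm::Fixed && !HasVectorParms) {
      Info <<= 1; // '0'
      Bits += 1;
      continue;
    }
    uint32_t Code = P == Parm::Fixed      ? 0b00u
                  : P == Parm::FloatShort ? 0b10u
                  : P == Parm::FloatLong  ? 0b11u
                                          : 0b01u;
    Info = (Info << 2) | Code;
    Bits += 2;
  }
  return Bits < 32 ? Info << (32 - Bits) : Info;
}

int main() {
  // (fixed, long float, vector) with vectors present: '00' '11' '01' => 0x34000000.
  assert(packParmsType({Parm::Fixed, Parm::FloatLong, Parm::Vector}, true) ==
         0x34000000u);
  // (fixed, short float) without vectors: '0' '10' => 0x40000000.
  assert(packParmsType({Parm::Fixed, Parm::FloatShort}, false) == 0x40000000u);
  return 0;
}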
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 4b73b36..07c503d 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -23,12 +23,14 @@
/// PowerPC target-specific information for each MachineFunction.
class PPCFunctionInfo : public MachineFunctionInfo {
public:
- // The value in the ParamType are used to indicate the bitstrings used in the
- // encoding format.
enum ParamType {
- FixedType = 0x0,
- ShortFloatPoint = 0x2,
- LongFloatPoint = 0x3
+ FixedType,
+ ShortFloatingPoint,
+ LongFloatingPoint,
+ VectorChar,
+ VectorShort,
+ VectorInt,
+ VectorFloat
};
private:
@@ -49,6 +51,9 @@
/// Frame index where the old PIC base pointer is stored.
int PICBasePointerSaveIndex = 0;
+ /// Frame index where the ROP Protection Hash is stored.
+ int ROPProtectionHashSaveIndex = 0;
+
/// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current
/// function. This is only valid after the initial scan of the function by
/// PEI.
@@ -117,19 +122,18 @@
/// register for parameter passing.
unsigned VarArgsNumFPR = 0;
- /// FixedParamNum - Number of fixed parameter.
- unsigned FixedParamNum = 0;
+ /// FixedParmsNum - The number of fixed parameters.
+ unsigned FixedParmsNum = 0;
- /// FloatingParamNum - Number of floating point parameter.
- unsigned FloatingPointParamNum = 0;
+ /// FloatingParmsNum - The number of floating parameters.
+ unsigned FloatingParmsNum = 0;
- /// ParamType - Encode type for every parameter
- /// in the order of parameters passing in.
- /// Bitstring starts from the most significant (leftmost) bit.
- /// '0'b => fixed parameter.
- /// '10'b => floating point short parameter.
- /// '11'b => floating point long parameter.
- uint32_t ParameterType = 0;
+ /// VectorParmsNum - The number of vector parameters.
+ unsigned VectorParmsNum = 0;
+
+ /// ParamtersType - Stores the type of every parameter that is passed in
+ /// registers.
+ SmallVector<ParamType, 32> ParamtersType;
/// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4.
int CRSpillFrameIndex = 0;
@@ -161,6 +165,13 @@
int getPICBasePointerSaveIndex() const { return PICBasePointerSaveIndex; }
void setPICBasePointerSaveIndex(int Idx) { PICBasePointerSaveIndex = Idx; }
+ int getROPProtectionHashSaveIndex() const {
+ return ROPProtectionHashSaveIndex;
+ }
+ void setROPProtectionHashSaveIndex(int Idx) {
+ ROPProtectionHashSaveIndex = Idx;
+ }
+
unsigned getMinReservedArea() const { return MinReservedArea; }
void setMinReservedArea(unsigned size) { MinReservedArea = size; }
@@ -214,11 +225,15 @@
unsigned getVarArgsNumGPR() const { return VarArgsNumGPR; }
void setVarArgsNumGPR(unsigned Num) { VarArgsNumGPR = Num; }
- unsigned getFixedParamNum() const { return FixedParamNum; }
+ unsigned getFixedParmsNum() const { return FixedParmsNum; }
+ unsigned getFloatingPointParmsNum() const { return FloatingParmsNum; }
+ unsigned getVectorParmsNum() const { return VectorParmsNum; }
+ bool hasVectorParms() const { return VectorParmsNum != 0; }
- unsigned getFloatingPointParamNum() const { return FloatingPointParamNum; }
+ uint32_t getParmsType() const;
- uint32_t getParameterType() const { return ParameterType; }
+ uint32_t getVecExtParmsType() const;
+
void appendParameterType(ParamType Type);
unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; }
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
index ce615e5..0371287 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
@@ -46,7 +46,7 @@
return false;
}
-void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
+bool PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary *Zone) const {
// From GenericScheduler::tryCandidate
@@ -54,25 +54,25 @@
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
- return;
+ return true;
}
// Bias PhysReg Defs and copies to their uses and defined respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
- return;
+ return TryCand.Reason != NoCand;
// Avoid exceeding the target's limit.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
- return;
+ return TryCand.Reason != NoCand;
// Avoid increasing the max critical pressure in the scheduled region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
TryCand, Cand, RegCritical, TRI, DAG->MF))
- return;
+ return TryCand.Reason != NoCand;
// We only compare a subset of features when comparing nodes between
// Top and Bottom boundary. Some properties are simply incomparable, in many
@@ -86,12 +86,12 @@
// heuristics to take precedence.
if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
tryLatency(TryCand, Cand, *Zone))
- return;
+ return TryCand.Reason != NoCand;
// Prioritize instructions that read unbuffered resources by stall cycles.
if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
- return;
+ return TryCand.Reason != NoCand;
}
// Keep clustered nodes together to encourage downstream peephole
@@ -106,37 +106,37 @@
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
- return;
+ return TryCand.Reason != NoCand;
if (SameBoundary) {
// Weak edges are for clustering and other constraints.
if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
- return;
+ return TryCand.Reason != NoCand;
}
// Avoid increasing the max pressure of the entire region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
Cand, RegMax, TRI, DAG->MF))
- return;
+ return TryCand.Reason != NoCand;
if (SameBoundary) {
// Avoid critical resource consumption and balance the schedule.
TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
- return;
+ return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
- return;
+ return TryCand.Reason != NoCand;
// Avoid serializing long latency dependence chains.
// For acyclic path limited loops, latency was already checked above.
if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
!Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
- return;
+ return TryCand.Reason != NoCand;
// Fall through to original instruction order.
if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
@@ -150,14 +150,16 @@
// Add powerpc specific heuristic only when TryCand isn't selected or
// selected as node order.
if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand)
- return;
+ return true;
// There are some benefits to schedule the ADDI before the load to hide the
// latency, as RA may create a true dependency between the load and addi.
if (SameBoundary) {
if (biasAddiLoadCandidate(Cand, TryCand, *Zone))
- return;
+ return TryCand.Reason != NoCand;
}
+
+ return TryCand.Reason != NoCand;
}
bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand,
@@ -172,38 +174,38 @@
return false;
}
-void PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
+bool PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand) {
// From PostGenericScheduler::tryCandidate
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
- return;
+ return true;
}
// Prioritize instructions that read unbuffered resources by stall cycles.
if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
- return;
+ return TryCand.Reason != NoCand;
// Keep clustered nodes together.
if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
Cand.SU == DAG->getNextClusterSucc(), TryCand, Cand, Cluster))
- return;
+ return TryCand.Reason != NoCand;
// Avoid critical resource consumption and balance the schedule.
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
- return;
+ return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
- return;
+ return TryCand.Reason != NoCand;
// Avoid serializing long latency dependence chains.
if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
- return;
+ return TryCand.Reason != NoCand;
}
// Fall through to original instruction order.
@@ -215,14 +217,16 @@
// Add powerpc post ra specific heuristic only when TryCand isn't selected or
// selected as node order.
if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand)
- return;
+ return true;
// There are some benefits to schedule the ADDI as early as possible post ra
// to avoid being stalled by vector instructions which take up all the hw units.
// And ADDI is usually used to post-increment the loop indvar, which matters for
// performance.
if (biasAddiCandidate(Cand, TryCand))
- return;
+ return TryCand.Reason != NoCand;
+
+ return TryCand.Reason != NoCand;
}
void PPCPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) {
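Both tryCandidate overrides now report whether TryCand ended up preferred: true when it is initialized as NodeOrder, and `TryCand.Reason != NoCand` after any heuristic fires. A minimal sketch of that control pattern with made-up fields and heuristics (not the scheduler's real API):

#include <cassert>

enum Reason { NoCand, NodeOrder, Stall };

struct Cand {
  bool Valid = false;
  Reason R = NoCand;
  int StallCycles = 0;
};

// Returns true iff TryCand should replace Cand, mirroring the new contract.
bool tryCandidate(Cand &C, Cand &TryCand) {
  if (!C.Valid) {
    TryCand.R = NodeOrder;
    return true;
  }
  // Each heuristic may give one of the candidates a reason; once a heuristic
  // decides, the answer is simply whether TryCand picked up a real reason.
  if (TryCand.StallCycles != C.StallCycles) {
    if (TryCand.StallCycles < C.StallCycles)
      TryCand.R = Stall;
    return TryCand.R != NoCand;
  }
  // ... further heuristics would follow the same shape ...
  return TryCand.R != NoCand;
}

int main() {
  Cand Current{true, NodeOrder, 2};
  Cand Challenger{true, NoCand, 1};
  assert(tryCandidate(Current, Challenger)); // fewer stall cycles wins
  Cand Worse{true, NoCand, 5};
  assert(!tryCandidate(Current, Worse));     // no heuristic prefers it
  return 0;
}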
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.h
index a9734ca..27e80c7 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.h
@@ -23,8 +23,9 @@
PPCPreRASchedStrategy(const MachineSchedContext *C) :
GenericScheduler(C) {}
protected:
- void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone) const override;
+
private:
bool biasAddiLoadCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
@@ -43,7 +44,7 @@
void enterMBB(MachineBasicBlock *MBB) override;
void leaveMBB() override;
- void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override;
+ bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override;
bool biasAddiCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) const;
};
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.h
index 91cbedf..cbf49ee 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.h
@@ -11,6 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCMACROFUSION_H
+#define LLVM_LIB_TARGET_POWERPC_PPCMACROFUSION_H
+
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
@@ -20,3 +23,5 @@
/// to PPCPassConfig::createMachineScheduler() to have an effect.
std::unique_ptr<ScheduleDAGMutation> createPowerPCMacroFusionDAGMutation();
} // llvm
+
+#endif // LLVM_LIB_TARGET_POWERPC_PPCMACROFUSION_H
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 178a134..4f16c7f 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -133,6 +133,12 @@
ImmToIdxMap[PPC::EVSTDD] = PPC::EVSTDDX;
ImmToIdxMap[PPC::SPESTW] = PPC::SPESTWX;
ImmToIdxMap[PPC::SPELWZ] = PPC::SPELWZX;
+
+ // Power10
+ ImmToIdxMap[PPC::LXVP] = PPC::LXVPX;
+ ImmToIdxMap[PPC::STXVP] = PPC::STXVPX;
+ ImmToIdxMap[PPC::PLXVP] = PPC::LXVPX;
+ ImmToIdxMap[PPC::PSTXVP] = PPC::STXVPX;
}
/// getPointerRegClass - Return the register class to use to hold pointers.
@@ -156,17 +162,19 @@
const MCPhysReg*
PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
- if (Subtarget.isAIXABI() &&
- (Subtarget.hasAltivec() && !TM.getAIXExtendedAltivecABI()))
- report_fatal_error("the default AIX Altivec ABI is not yet "
- "supported.");
if (MF->getFunction().getCallingConv() == CallingConv::AnyReg) {
if (!TM.isPPC64() && Subtarget.isAIXABI())
report_fatal_error("AnyReg unimplemented on 32-bit AIX.");
- if (Subtarget.hasVSX())
+ if (Subtarget.hasVSX()) {
+ if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI())
+ return CSR_64_AllRegs_AIX_Dflt_VSX_SaveList;
return CSR_64_AllRegs_VSX_SaveList;
- if (Subtarget.hasAltivec())
+ }
+ if (Subtarget.hasAltivec()) {
+ if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI())
+ return CSR_64_AllRegs_AIX_Dflt_Altivec_SaveList;
return CSR_64_AllRegs_Altivec_SaveList;
+ }
return CSR_64_AllRegs_SaveList;
}
@@ -200,15 +208,18 @@
}
// Standard calling convention CSRs.
if (TM.isPPC64()) {
- if (Subtarget.hasAltivec())
+ if (Subtarget.hasAltivec() &&
+ (!Subtarget.isAIXABI() || TM.getAIXExtendedAltivecABI())) {
return SaveR2 ? CSR_PPC64_R2_Altivec_SaveList
: CSR_PPC64_Altivec_SaveList;
+ }
return SaveR2 ? CSR_PPC64_R2_SaveList : CSR_PPC64_SaveList;
}
// 32-bit targets.
if (Subtarget.isAIXABI()) {
if (Subtarget.hasAltivec())
- return CSR_AIX32_Altivec_SaveList;
+ return TM.getAIXExtendedAltivecABI() ? CSR_AIX32_Altivec_SaveList
+ : CSR_AIX32_SaveList;
return CSR_AIX32_SaveList;
}
if (Subtarget.hasAltivec())
@@ -223,18 +234,27 @@
CallingConv::ID CC) const {
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
if (CC == CallingConv::AnyReg) {
- if (Subtarget.hasVSX())
+ if (Subtarget.hasVSX()) {
+ if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI())
+ return CSR_64_AllRegs_AIX_Dflt_VSX_RegMask;
return CSR_64_AllRegs_VSX_RegMask;
- if (Subtarget.hasAltivec())
+ }
+ if (Subtarget.hasAltivec()) {
+ if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI())
+ return CSR_64_AllRegs_AIX_Dflt_Altivec_RegMask;
return CSR_64_AllRegs_Altivec_RegMask;
+ }
return CSR_64_AllRegs_RegMask;
}
if (Subtarget.isAIXABI()) {
- return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_PPC64_Altivec_RegMask
- : CSR_PPC64_RegMask)
- : (Subtarget.hasAltivec() ? CSR_AIX32_Altivec_RegMask
- : CSR_AIX32_RegMask);
+ return TM.isPPC64()
+ ? ((Subtarget.hasAltivec() && TM.getAIXExtendedAltivecABI())
+ ? CSR_PPC64_Altivec_RegMask
+ : CSR_PPC64_RegMask)
+ : ((Subtarget.hasAltivec() && TM.getAIXExtendedAltivecABI())
+ ? CSR_AIX32_Altivec_RegMask
+ : CSR_AIX32_RegMask);
}
if (CC == CallingConv::Cold) {
@@ -335,6 +355,20 @@
IE = PPC::VRRCRegClass.end(); I != IE; ++I)
markSuperRegs(Reserved, *I);
+ if (Subtarget.isAIXABI() && Subtarget.hasAltivec() &&
+ !TM.getAIXExtendedAltivecABI()) {
+ // In the AIX default Altivec ABI, vector registers VR20-VR31 are reserved
+ // and cannot be used.
+ for (auto Reg : CSR_Altivec_SaveList) {
+ if (Reg == 0)
+ break;
+ markSuperRegs(Reserved, Reg);
+ for (MCRegAliasIterator AS(Reg, this, true); AS.isValid(); ++AS) {
+ Reserved.set(*AS);
+ }
+ }
+ }
+
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
@@ -345,22 +379,32 @@
const MachineFrameInfo &MFI = MF.getFrameInfo();
const std::vector<CalleeSavedInfo> &Info = MFI.getCalleeSavedInfo();
+ LLVM_DEBUG(dbgs() << "requiresFrameIndexScavenging for " << MF.getName()
+ << ".\n");
// If the callee saved info is invalid we have to default to true for safety.
- if (!MFI.isCalleeSavedInfoValid())
+ if (!MFI.isCalleeSavedInfoValid()) {
+ LLVM_DEBUG(dbgs() << "TRUE - Invalid callee saved info.\n");
return true;
+ }
// We will require the use of X-Forms because the frame is larger than what
// can be represented in signed 16 bits that fit in the immediate of a D-Form.
// If we need an X-Form then we need a register to store the address offset.
unsigned FrameSize = MFI.getStackSize();
// Signed 16 bits means that the FrameSize cannot be more than 15 bits.
- if (FrameSize & ~0x7FFF)
+ if (FrameSize & ~0x7FFF) {
+ LLVM_DEBUG(dbgs() << "TRUE - Frame size is too large for D-Form.\n");
return true;
+ }
// The callee saved info is valid so it can be traversed.
// Checking for registers that need saving that do not have load or store
// forms where the address offset is an immediate.
for (unsigned i = 0; i < Info.size(); i++) {
+ // If the spill is to a register no scavenging is required.
+ if (Info[i].isSpilledToReg())
+ continue;
+
int FrIdx = Info[i].getFrameIdx();
unsigned Reg = Info[i].getReg();
@@ -369,8 +413,13 @@
if (!MFI.isFixedObjectIndex(FrIdx)) {
// This is not a fixed object. If it requires alignment then we may still
// need to use the XForm.
- if (offsetMinAlignForOpcode(Opcode) > 1)
+ if (offsetMinAlignForOpcode(Opcode) > 1) {
+ LLVM_DEBUG(dbgs() << "Memory Operand: " << InstrInfo->getName(Opcode)
+ << " for register " << printReg(Reg, this) << ".\n");
+ LLVM_DEBUG(dbgs() << "TRUE - Not fixed frame object that requires "
+ << "alignment.\n");
return true;
+ }
}
// This is eiher:
@@ -379,37 +428,106 @@
// need to consider the alignment here.
// 2) A not fixed object but in that case we now know that the min required
// alignment is no more than 1 based on the previous check.
- if (InstrInfo->isXFormMemOp(Opcode))
+ if (InstrInfo->isXFormMemOp(Opcode)) {
+ LLVM_DEBUG(dbgs() << "Memory Operand: " << InstrInfo->getName(Opcode)
+ << " for register " << printReg(Reg, this) << ".\n");
+ LLVM_DEBUG(dbgs() << "TRUE - Memory operand is X-Form.\n");
return true;
+ }
}
+ LLVM_DEBUG(dbgs() << "FALSE - Scavenging is not required.\n");
return false;
}
+bool PPCRegisterInfo::requiresVirtualBaseRegisters(
+ const MachineFunction &MF) const {
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ // Do not use virtual base registers when ROP protection is turned on.
+ // Virtual base registers break the layout of the local variable space and may
+ // push the ROP Hash location past the 512 byte range of the ROP store
+ // instruction.
+ return !Subtarget.hasROPProtect();
+}
+
bool PPCRegisterInfo::isCallerPreservedPhysReg(MCRegister PhysReg,
const MachineFunction &MF) const {
assert(Register::isPhysicalRegister(PhysReg));
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (!TM.isPPC64())
- return false;
- if (!Subtarget.isSVR4ABI())
+ if (!Subtarget.is64BitELFABI() && !Subtarget.isAIXABI())
return false;
- if (PhysReg == PPC::X2)
- // X2 is guaranteed to be preserved within a function if it is reserved.
+ if (PhysReg == Subtarget.getTOCPointerRegister())
+ // X2/R2 is guaranteed to be preserved within a function if it is reserved.
// The reason it's reserved is that it's the TOC pointer (and the function
// uses the TOC). In functions where it isn't reserved (i.e. leaf functions
// with no TOC access), we can't claim that it is preserved.
- return (getReservedRegs(MF).test(PPC::X2));
- if (StackPtrConst && (PhysReg == PPC::X1) && !MFI.hasVarSizedObjects()
- && !MFI.hasOpaqueSPAdjustment())
+ return (getReservedRegs(MF).test(PhysReg));
+ if (StackPtrConst && PhysReg == Subtarget.getStackPointerRegister() &&
+ !MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment())
// The value of the stack pointer does not change within a function after
// the prologue and before the epilogue if there are no dynamic allocations
- // and no inline asm which clobbers X1.
+ // and no inline asm which clobbers X1/R1.
return true;
return false;
}
+bool PPCRegisterInfo::getRegAllocationHints(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+ // Call the base implementation first to set any hints based on the usual
+ // heuristics and decide what the return value should be. We want to return
+ // the same value returned by the base implementation. If the base
+ // implementation decides to return true and force the allocation then we
+ // will leave it as such. On the other hand if the base implementation
+ // decides to return false the following code will not force the allocation
+ // as we are just looking to provide a hint.
+ bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
+ VirtReg, Order, Hints, MF, VRM, Matrix);
+ // We are interested in instructions that copy values to ACC/UACC.
+ // The copy into UACC will be simply a COPY to a subreg so we
+ // want to allocate the corresponding physical subreg for the source.
+ // The copy into ACC will be a BUILD_UACC so we want to allocate
+ // the same number UACC for the source.
+ for (MachineInstr &Use : MRI->reg_nodbg_instructions(VirtReg)) {
+ const MachineOperand *ResultOp = nullptr;
+ Register ResultReg;
+ switch (Use.getOpcode()) {
+ case TargetOpcode::COPY: {
+ ResultOp = &Use.getOperand(0);
+ ResultReg = ResultOp->getReg();
+ if (Register::isVirtualRegister(ResultReg) &&
+ MRI->getRegClass(ResultReg)->contains(PPC::UACC0) &&
+ VRM->hasPhys(ResultReg)) {
+ Register UACCPhys = VRM->getPhys(ResultReg);
+ Register HintReg = getSubReg(UACCPhys, ResultOp->getSubReg());
+ Hints.push_back(HintReg);
+ }
+ break;
+ }
+ case PPC::BUILD_UACC: {
+ ResultOp = &Use.getOperand(0);
+ ResultReg = ResultOp->getReg();
+ if (MRI->getRegClass(ResultReg)->contains(PPC::ACC0) &&
+ VRM->hasPhys(ResultReg)) {
+ Register ACCPhys = VRM->getPhys(ResultReg);
+ assert((ACCPhys >= PPC::ACC0 && ACCPhys <= PPC::ACC7) &&
+ "Expecting an ACC register for BUILD_UACC.");
+ Register HintReg = PPC::UACC0 + (ACCPhys - PPC::ACC0);
+ Hints.push_back(HintReg);
+ }
+ break;
+ }
+ }
+ }
+ return BaseImplRetVal;
+}
+
unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
const PPCFrameLowering *TFI = getFrameLowering(MF);
@@ -426,15 +544,28 @@
unsigned FP = TFI->hasFP(MF) ? 1 : 0;
return 32 - FP - DefaultSafety;
}
- case PPC::F8RCRegClassID:
case PPC::F4RCRegClassID:
- case PPC::VRRCRegClassID:
- case PPC::VFRCRegClassID:
+ case PPC::F8RCRegClassID:
case PPC::VSLRCRegClassID:
return 32 - DefaultSafety;
- case PPC::VSRCRegClassID:
+ case PPC::VFRCRegClassID:
+ case PPC::VRRCRegClassID: {
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ // Vector registers VR20-VR31 are reserved and cannot be used in the default
+ // Altivec ABI on AIX.
+ if (!TM.getAIXExtendedAltivecABI() && Subtarget.isAIXABI())
+ return 20 - DefaultSafety;
+ }
+ return 32 - DefaultSafety;
case PPC::VSFRCRegClassID:
case PPC::VSSRCRegClassID:
+ case PPC::VSRCRegClassID: {
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ if (!TM.getAIXExtendedAltivecABI() && Subtarget.isAIXABI())
+ // Vector registers VR20-VR31 are reserved and cannot be used in the
+ // default Altivec ABI on AIX.
+ return 52 - DefaultSafety;
+ }
return 64 - DefaultSafety;
case PPC::CRRCRegClassID:
return 8 - DefaultSafety;
@@ -445,6 +576,8 @@
PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
const MachineFunction &MF) const {
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const auto *DefaultSuperclass =
+ TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
if (Subtarget.hasVSX()) {
// With VSX, we can inflate various sub-register classes to the full VSX
// register set.
@@ -452,7 +585,7 @@
// For Power9 we allow the user to enable GPR to vector spills.
// FIXME: Currently limited to spilling GP8RC. A follow on patch will add
// support to spill GPRC.
- if (TM.isELFv2ABI()) {
+ if (TM.isELFv2ABI() || Subtarget.isAIXABI()) {
if (Subtarget.hasP9Vector() && EnableGPRToVecSpills &&
RC == &PPC::G8RCRegClass) {
InflateGP8RC++;
@@ -461,15 +594,27 @@
if (RC == &PPC::GPRCRegClass && EnableGPRToVecSpills)
InflateGPRC++;
}
- if (RC == &PPC::F8RCRegClass)
- return &PPC::VSFRCRegClass;
- else if (RC == &PPC::VRRCRegClass)
- return &PPC::VSRCRegClass;
- else if (RC == &PPC::F4RCRegClass && Subtarget.hasP8Vector())
- return &PPC::VSSRCRegClass;
+
+ for (const auto *I = RC->getSuperClasses(); *I; ++I) {
+ if (getRegSizeInBits(**I) != getRegSizeInBits(*RC))
+ continue;
+
+ switch ((*I)->getID()) {
+ case PPC::VSSRCRegClassID:
+ return Subtarget.hasP8Vector() ? *I : DefaultSuperclass;
+ case PPC::VSFRCRegClassID:
+ case PPC::VSRCRegClassID:
+ return *I;
+ case PPC::VSRpRCRegClassID:
+ return Subtarget.pairedVectorMemops() ? *I : DefaultSuperclass;
+ case PPC::ACCRCRegClassID:
+ case PPC::UACCRCRegClassID:
+ return Subtarget.hasMMA() ? *I : DefaultSuperclass;
+ }
+ }
}
- return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
+ return DefaultSuperclass;
}
//===----------------------------------------------------------------------===//
@@ -1063,6 +1208,59 @@
MBB.erase(II);
}
+/// lowerQuadwordSpilling - Generate code to spill a paired general register.
+void PPCRegisterInfo::lowerQuadwordSpilling(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const {
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
+ Register SrcReg = MI.getOperand(0).getReg();
+ bool IsKilled = MI.getOperand(0).isKill();
+
+ Register Reg = PPC::X0 + (SrcReg - PPC::G8p0) * 2;
+ bool IsLittleEndian = Subtarget.isLittleEndian();
+
+ addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STD))
+ .addReg(Reg, getKillRegState(IsKilled)),
+ FrameIndex, IsLittleEndian ? 8 : 0);
+ addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STD))
+ .addReg(Reg + 1, getKillRegState(IsKilled)),
+ FrameIndex, IsLittleEndian ? 0 : 8);
+
+ // Discard the pseudo instruction.
+ MBB.erase(II);
+}
+
+/// lowerQuadwordRestore - Generate code to restore a paired general register.
+void PPCRegisterInfo::lowerQuadwordRestore(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const {
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
+ Register DestReg = MI.getOperand(0).getReg();
+ assert(MI.definesRegister(DestReg) &&
+ "RESTORE_QUADWORD does not define its destination");
+
+ Register Reg = PPC::X0 + (DestReg - PPC::G8p0) * 2;
+ bool IsLittleEndian = Subtarget.isLittleEndian();
+
+ addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LD), Reg), FrameIndex,
+ IsLittleEndian ? 8 : 0);
+ addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LD), Reg + 1), FrameIndex,
+ IsLittleEndian ? 0 : 8);
+
+ // Discard the pseudo instruction.
+ MBB.erase(II);
+}
+
bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
Register Reg, int &FrameIdx) const {
// For the nonvolatile condition registers (CR2, CR3, CR4) return true to
@@ -1099,12 +1297,16 @@
case PPC::LXSSP:
case PPC::STXSD:
case PPC::STXSSP:
+ case PPC::STQ:
return 4;
case PPC::EVLDD:
case PPC::EVSTDD:
return 8;
case PPC::LXV:
case PPC::STXV:
+ case PPC::LQ:
+ case PPC::LXVP:
+ case PPC::STXVP:
return 16;
}
}
@@ -1200,6 +1402,12 @@
} else if (OpC == PPC::RESTORE_ACC || OpC == PPC::RESTORE_UACC) {
lowerACCRestore(II, FrameIndex);
return;
+ } else if (OpC == PPC::SPILL_QUADWORD) {
+ lowerQuadwordSpilling(II, FrameIndex);
+ return;
+ } else if (OpC == PPC::RESTORE_QUADWORD) {
+ lowerQuadwordRestore(II, FrameIndex);
+ return;
}
// Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP).
@@ -1226,6 +1434,16 @@
Offset += MFI.getStackSize();
}
+ // If we encounter an LXVP/STXVP with an offset that doesn't fit, we can
+ // transform it to the prefixed version so we don't have to use the XForm.
+ if ((OpC == PPC::LXVP || OpC == PPC::STXVP) &&
+ (!isInt<16>(Offset) || (Offset % offsetMinAlign(MI)) != 0) &&
+ Subtarget.hasPrefixInstrs()) {
+ unsigned NewOpc = OpC == PPC::LXVP ? PPC::PLXVP : PPC::PSTXVP;
+ MI.setDesc(TII.get(NewOpc));
+ OpC = NewOpc;
+ }
+
// If we can, encode the offset directly into the instruction. If this is a
// normal PPC "ri" instruction, any 16-bit value can be safely encoded. If
// this is a PPC64 "ix" instruction, only a 16-bit value with the low two bits
@@ -1234,9 +1452,13 @@
// happen in invalid code.
assert(OpC != PPC::DBG_VALUE &&
"This should be handled in a target-independent way");
+ // FIXME: This should be factored out to a separate function as prefixed
+ // instructions add a number of opcodes for which we can use 34-bit imm.
bool OffsetFitsMnemonic = (OpC == PPC::EVSTDD || OpC == PPC::EVLDD) ?
isUInt<8>(Offset) :
isInt<16>(Offset);
+ if (OpC == PPC::PLXVP || OpC == PPC::PSTXVP)
+ OffsetFitsMnemonic = isInt<34>(Offset);
if (!noImmForm && ((OffsetFitsMnemonic &&
((Offset % offsetMinAlign(MI)) == 0)) ||
OpC == TargetOpcode::STACKMAP ||
@@ -1323,7 +1545,7 @@
// If we need to realign the stack, then the stack pointer can no longer
// serve as an offset into the caller's stack space. As a result, we need a
// base pointer.
- return needsStackRealignment(MF);
+ return hasStackRealignment(MF);
}
/// Returns true if the instruction's frame index
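
The SPILL_QUADWORD / RESTORE_QUADWORD lowering added above splits a G8p pair register into its two underlying X registers and stores them as two doublewords whose slot offsets depend on endianness. A minimal standalone sketch of that arithmetic, using plain integers as hypothetical stand-ins for the TableGen-generated PPC::X0 and PPC::G8p0 values (only the index formula and the 8/0 offset rule are taken from the code above):

    #include <cassert>
    #include <cstdio>

    // Illustrative stand-ins for the PPC::G8p* and PPC::X* register numbers;
    // the real values come from TableGen and are assumed consecutive, as the
    // lowering code relies on.
    constexpr unsigned G8p0 = 0;
    constexpr unsigned X0 = 0;

    // Mirrors: Reg = PPC::X0 + (SrcReg - PPC::G8p0) * 2 and the
    // endianness-dependent frame offsets used for the two STD/LD instructions.
    struct QuadwordSlots {
      unsigned EvenReg;        // X(2n)
      unsigned OddReg;         // X(2n + 1)
      unsigned EvenRegOffset;  // byte offset of X(2n) within the 16-byte slot
      unsigned OddRegOffset;   // byte offset of X(2n + 1)
    };

    QuadwordSlots splitPair(unsigned PairReg, bool IsLittleEndian) {
      unsigned Even = X0 + (PairReg - G8p0) * 2;
      return {Even, Even + 1,
              IsLittleEndian ? 8u : 0u,
              IsLittleEndian ? 0u : 8u};
    }

    int main() {
      // G8p3 covers X6/X7; on little-endian, X6 lands at slot offset 8.
      QuadwordSlots S = splitPair(G8p0 + 3, /*IsLittleEndian=*/true);
      assert(S.EvenReg == 6 && S.OddReg == 7);
      std::printf("X%u at +%u, X%u at +%u\n", S.EvenReg, S.EvenRegOffset,
                  S.OddReg, S.OddRegOffset);
      return 0;
    }
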
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index 93f330a..c22a582 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -94,6 +94,16 @@
bool isCallerPreservedPhysReg(MCRegister PhysReg,
const MachineFunction &MF) const override;
+ // Provide hints to the register allocator for allocating subregisters
+ // of primed and unprimed accumulators. For example, if accumulator
+ // ACC5 is assigned, we also want to assign UACC5 to the input.
+ // Similarly if UACC5 is assigned, we want to assign VSRp10, VSRp11
+ // to its inputs.
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
+
/// We require the register scavenger.
bool requiresRegisterScavenging(const MachineFunction &MF) const override {
return true;
@@ -101,9 +111,7 @@
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
- bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override {
- return true;
- }
+ bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
void lowerDynamicAlloc(MachineBasicBlock::iterator II) const;
void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const;
@@ -125,6 +133,11 @@
void lowerACCRestore(MachineBasicBlock::iterator II,
unsigned FrameIndex) const;
+ void lowerQuadwordSpilling(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const;
+ void lowerQuadwordRestore(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const;
+
static void emitAccCopyInfo(MachineBasicBlock &MBB, MCRegister DestReg,
MCRegister SrcReg);
@@ -134,6 +147,8 @@
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
+ bool addAllocPriorityToGlobalRanges() const override { return true; }
+
// Support for virtual base registers.
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx,
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/src/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index 551735c..044035e 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -20,6 +20,8 @@
def sub_vsx1 : SubRegIndex<128, 128>;
def sub_pair0 : SubRegIndex<256>;
def sub_pair1 : SubRegIndex<256, 256>;
+def sub_gp8_x0 : SubRegIndex<64>;
+def sub_gp8_x1 : SubRegIndex<64, 64>;
}
@@ -119,6 +121,15 @@
let SubRegs = subregs;
}
+// GP8Pair - Consecutive even-odd paired GP8.
+class GP8Pair<string n, bits<5> EvenIndex> : PPCReg<n> {
+ assert !eq(EvenIndex{0}, 0), "Index should be even.";
+ let HWEncoding{4-0} = EvenIndex;
+ let SubRegs = [!cast<GP8>("X"#EvenIndex), !cast<GP8>("X"#!add(EvenIndex, 1))];
+ let DwarfNumbers = [-1, -1];
+ let SubRegIndices = [sub_gp8_x0, sub_gp8_x1];
+}
+
// General-purpose registers
foreach Index = 0-31 in {
def R#Index : GPR<Index, "r"#Index>, DwarfRegNum<[-2, Index]>;
@@ -185,6 +196,11 @@
}
}
+// 16 paired even-odd consecutive GP8s.
+foreach Index = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 } in {
+ def G8p#!srl(Index, 1) : GP8Pair<"r"#Index, Index>;
+}
+
// The representation of r0 when treated as the constant 0.
def ZERO : GPR<0, "0">, DwarfRegAlias<R0>;
def ZERO8 : GP8<ZERO, "0">, DwarfRegAlias<X0>;
@@ -287,9 +303,14 @@
R31, R0, R1, FP, BP)> {
// On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
// put it at the end of the list.
- let AltOrders = [(add (sub GPRC, R2), R2)];
+ // On AIX, CSRs are allocated starting from R31 according to:
+ // https://www.ibm.com/docs/en/ssw_aix_72/assembler/assembler_pdf.pdf.
+  // This also helps set the correct `NumOfGPRsSaved' in the traceback table.
+ let AltOrders = [(add (sub GPRC, R2), R2),
+ (add (sequence "R%u", 2, 12),
+ (sequence "R%u", 31, 13), R0, R1, FP, BP)];
let AltOrderSelect = [{
- return MF.getSubtarget<PPCSubtarget>().is64BitELFABI();
+ return MF.getSubtarget<PPCSubtarget>().getGPRAllocationOrderIdx();
}];
}
@@ -298,9 +319,11 @@
X31, X13, X0, X1, FP8, BP8)> {
// On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
// put it at the end of the list.
- let AltOrders = [(add (sub G8RC, X2), X2)];
+ let AltOrders = [(add (sub G8RC, X2), X2),
+ (add (sequence "X%u", 2, 12),
+ (sequence "X%u", 31, 13), X0, X1, FP8, BP8)];
let AltOrderSelect = [{
- return MF.getSubtarget<PPCSubtarget>().is64BitELFABI();
+ return MF.getSubtarget<PPCSubtarget>().getGPRAllocationOrderIdx();
}];
}
@@ -310,18 +333,22 @@
def GPRC_NOR0 : RegisterClass<"PPC", [i32,f32], 32, (add (sub GPRC, R0), ZERO)> {
// On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
// put it at the end of the list.
- let AltOrders = [(add (sub GPRC_NOR0, R2), R2)];
+ let AltOrders = [(add (sub GPRC_NOR0, R2), R2),
+ (add (sequence "R%u", 2, 12),
+ (sequence "R%u", 31, 13), R1, FP, BP, ZERO)];
let AltOrderSelect = [{
- return MF.getSubtarget<PPCSubtarget>().is64BitELFABI();
+ return MF.getSubtarget<PPCSubtarget>().getGPRAllocationOrderIdx();
}];
}
def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)> {
// On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
// put it at the end of the list.
- let AltOrders = [(add (sub G8RC_NOX0, X2), X2)];
+ let AltOrders = [(add (sub G8RC_NOX0, X2), X2),
+ (add (sequence "X%u", 2, 12),
+ (sequence "X%u", 31, 13), X1, FP8, BP8, ZERO8)];
let AltOrderSelect = [{
- return MF.getSubtarget<PPCSubtarget>().is64BitELFABI();
+ return MF.getSubtarget<PPCSubtarget>().getGPRAllocationOrderIdx();
}];
}
@@ -433,6 +460,13 @@
}
def ACCRC : RegisterClass<"PPC", [v512i1], 128, (add ACC0, ACC1, ACC2, ACC3,
ACC4, ACC5, ACC6, ACC7)> {
+  // The AllocationPriority is in the range [0, 63]. Assign the ACC registers
+  // the highest possible priority in this range to force the register allocator
+  // to assign these registers first. This is done because the ACC registers
+  // must represent 4 adjacent vector registers. For example, ACC1 must be
+  // VS4 - VS7. The value here must be at least 32 as we want to allocate
+  // these registers even before we allocate global ranges.
+ let AllocationPriority = 63;
let Size = 512;
}
@@ -449,16 +483,48 @@
def UACCRC : RegisterClass<"PPC", [v512i1], 128,
(add UACC0, UACC1, UACC2, UACC3,
UACC4, UACC5, UACC6, UACC7)> {
+ // The AllocationPriority for the UACC registers is still high and must be at
+ // least 32 as we want to allocate these registers before we allocate other
+ // global ranges. The value must be less than the AllocationPriority of the
+ // ACC registers.
+ let AllocationPriority = 36;
let Size = 512;
}
-// Allocate in the same order as the underlying VSX registers.
+// FIXME: This allocation order may increase stack frame size when allocating
+// non-volatile registers.
+//
+// Place the Altivec registers first and allocate the rest as underlying VSX
+// ones, to reduce interference with the accumulator registers (the lower 32
+// VSRs). This reduces copies when loading for accumulators, which is a common
+// use for paired VSX registers.
def VSRpRC :
RegisterClass<"PPC", [v256i1], 128,
- (add (sequence "VSRp%u", 0, 6),
- (sequence "VSRp%u", 15, 7), VSRp17, VSRp18,
- VSRp16, VSRp19, VSRp20, VSRp21, VSRp22, VSRp23,
- VSRp24, VSRp25, VSRp31, VSRp30, VSRp29, VSRp28,
- VSRp27, VSRp26)> {
+ (add VSRp17, VSRp18, VSRp16, VSRp19, VSRp20, VSRp21,
+ VSRp22, VSRp23, VSRp24, VSRp25, VSRp31, VSRp30,
+ VSRp29, VSRp28, VSRp27, VSRp26,
+ (sequence "VSRp%u", 0, 6),
+ (sequence "VSRp%u", 15, 7))> {
+  // Give the VSRp registers a non-zero AllocationPriority. The value is less
+  // than 32, as these registers should not always be allocated before global
+  // ranges, and it is also less than the AllocationPriority of the UACC
+  // registers minus 32, so that even global VSRp ranges are allocated only
+  // after the UACC registers have been chosen.
+ let AllocationPriority = 2;
let Size = 256;
}
+
+// Make the allocation order as similar as possible to G8RC's to avoid
+// potential spilling. Similarly, we have an AltOrder for the 64-bit ELF ABI
+// in which r2 is allocated last.
+def G8pRC :
+ RegisterClass<"PPC", [i128], 128,
+ (add (sequence "G8p%u", 1, 5),
+ (sequence "G8p%u", 14, 7),
+ G8p15, G8p6, G8p0)> {
+ let AltOrders = [(add (sub G8pRC, G8p1), G8p1)];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<PPCSubtarget>().is64BitELFABI();
+ }];
+ let Size = 128;
+}
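
The priorities above only work because of a fixed index correspondence between the accumulator classes: ACCn is the primed form of UACCn, and UACCn is assembled from VSRp(2n) and VSRp(2n+1), which is what the getRegAllocationHints change and the header comment rely on. A small standalone sketch of that index arithmetic, with plain integers as hypothetical stand-ins for the generated register enums:

    #include <cassert>

    // Index of the unprimed accumulator that corresponds to primed
    // accumulator ACC<AccIdx>; mirrors UACC0 + (ACCPhys - ACC0).
    unsigned uaccForAcc(unsigned AccIdx) { return AccIdx; }

    // The pair of VSRp registers that make up UACC<UAccIdx>, per the header
    // comment ("if UACC5 is assigned, we want to assign VSRp10, VSRp11").
    struct VSRpPair { unsigned Lo, Hi; };
    VSRpPair vsrpForUacc(unsigned UAccIdx) {
      return {2 * UAccIdx, 2 * UAccIdx + 1};
    }

    int main() {
      assert(uaccForAcc(5) == 5);
      VSRpPair P = vsrpForUacc(5);
      assert(P.Lo == 10 && P.Hi == 11);
      return 0;
    }
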
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCSchedule.td b/src/llvm-project/llvm/lib/Target/PowerPC/PPCSchedule.td
index 4fa29d9..e378d57 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCSchedule.td
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCSchedule.td
@@ -54,6 +54,8 @@
def IIC_LdStLHAU : InstrItinClass;
def IIC_LdStLHAUX : InstrItinClass;
def IIC_LdStLMW : InstrItinClass;
+def IIC_LdStLQ : InstrItinClass;
+def IIC_LdStLQARX : InstrItinClass;
def IIC_LdStLVecX : InstrItinClass;
def IIC_LdStLWA : InstrItinClass;
def IIC_LdStLWARX : InstrItinClass;
@@ -61,6 +63,8 @@
def IIC_LdStSLBIE : InstrItinClass;
def IIC_LdStSTD : InstrItinClass;
def IIC_LdStSTDCX : InstrItinClass;
+def IIC_LdStSTQ : InstrItinClass;
+def IIC_LdStSTQCX : InstrItinClass;
def IIC_LdStSTU : InstrItinClass;
def IIC_LdStSTUX : InstrItinClass;
def IIC_LdStSTFD : InstrItinClass;
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index d31195f..87ce32f 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -87,6 +87,8 @@
HasP9Vector = false;
HasP9Altivec = false;
HasMMA = false;
+ HasROPProtect = false;
+ HasPrivileged = false;
HasP10Vector = false;
HasPrefixInstrs = false;
HasPCRelativeMemops = false;
@@ -117,6 +119,7 @@
HasICBT = false;
HasInvariantFunctionDescriptors = false;
HasPartwordAtomics = false;
+ HasQuadwordAtomics = false;
HasDirectMove = false;
HasHTM = false;
HasFloat128 = false;
@@ -124,6 +127,7 @@
HasStoreFusion = false;
HasAddiLoadFusion = false;
HasAddisLoadFusion = false;
+ IsISA2_07 = false;
IsISA3_0 = false;
IsISA3_1 = false;
UseLongCalls = false;
@@ -181,9 +185,7 @@
StackAlignment = getPlatformStackAlignment();
// Determine endianness.
- // FIXME: Part of the TargetMachine.
- IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le ||
- TargetTriple.getArch() == Triple::ppcle);
+ IsLittleEndian = TM.isLittleEndian();
}
bool PPCSubtarget::enableMachineScheduler() const { return true; }
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h
index 50d8939..e916b0c 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -112,6 +112,8 @@
bool HasPrefixInstrs;
bool HasPCRelativeMemops;
bool HasMMA;
+ bool HasROPProtect;
+ bool HasPrivileged;
bool HasFCPSGN;
bool HasFSQRT;
bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
@@ -137,6 +139,7 @@
bool HasICBT;
bool HasInvariantFunctionDescriptors;
bool HasPartwordAtomics;
+ bool HasQuadwordAtomics;
bool HasDirectMove;
bool HasHTM;
bool HasFloat128;
@@ -144,6 +147,7 @@
bool HasStoreFusion;
bool HasAddiLoadFusion;
bool HasAddisLoadFusion;
+ bool IsISA2_07;
bool IsISA3_0;
bool IsISA3_1;
bool UseLongCalls;
@@ -273,6 +277,8 @@
bool hasPrefixInstrs() const { return HasPrefixInstrs; }
bool hasPCRelativeMemops() const { return HasPCRelativeMemops; }
bool hasMMA() const { return HasMMA; }
+ bool hasROPProtect() const { return HasROPProtect; }
+ bool hasPrivileged() const { return HasPrivileged; }
bool pairedVectorMemops() const { return PairedVectorMemops; }
bool hasMFOCRF() const { return HasMFOCRF; }
bool hasISEL() const { return HasISEL; }
@@ -297,6 +303,7 @@
bool usePPCPreRASchedStrategy() const { return UsePPCPreRASchedStrategy; }
bool usePPCPostRASchedStrategy() const { return UsePPCPostRASchedStrategy; }
bool hasPartwordAtomics() const { return HasPartwordAtomics; }
+ bool hasQuadwordAtomics() const { return HasQuadwordAtomics; }
bool hasDirectMove() const { return HasDirectMove; }
Align getPlatformStackAlignment() const {
@@ -315,6 +322,7 @@
bool hasHTM() const { return HasHTM; }
bool hasFloat128() const { return HasFloat128; }
+ bool isISA2_07() const { return IsISA2_07; }
bool isISA3_0() const { return IsISA3_0; }
bool isISA3_1() const { return IsISA3_1; }
bool useLongCalls() const { return UseLongCalls; }
@@ -407,6 +415,16 @@
return PredictableSelectIsExpensive;
}
+  // Select the allocation order for GPRC and G8RC. It must be kept strictly
+  // consistent with the corresponding AltOrders in PPCRegisterInfo.td.
+ unsigned getGPRAllocationOrderIdx() const {
+ if (is64BitELFABI())
+ return 1;
+ if (isAIXABI())
+ return 2;
+ return 0;
+ }
+
// GlobalISEL
const CallLowering *getCallLowering() const override;
const RegisterBankInfo *getRegBankInfo() const override;
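
getGPRAllocationOrderIdx feeds the AltOrderSelect bodies in PPCRegisterInfo.td: 0 keeps the default order, 1 selects the first AltOrder (64-bit ELF, r2 moved to the end), and 2 selects the second (AIX, callee-saved GPRs from R31 downward). A minimal standalone sketch of the selection, with the ABI queries reduced to hypothetical booleans:

    #include <cassert>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for the subtarget ABI queries.
    struct FakeSubtarget {
      bool Is64BitELF = false;
      bool IsAIX = false;
    };

    // Mirrors PPCSubtarget::getGPRAllocationOrderIdx(): 0 selects the default
    // order, 1 and 2 select the first and second AltOrders declared for
    // GPRC/G8RC (and their _NOR0/_NOX0 variants) in PPCRegisterInfo.td.
    unsigned getGPRAllocationOrderIdx(const FakeSubtarget &ST) {
      if (ST.Is64BitELF)
        return 1;
      if (ST.IsAIX)
        return 2;
      return 0;
    }

    int main() {
      const std::vector<std::string> Orders = {"default", "elf64-r2-last",
                                               "aix-csr-descending"};
      FakeSubtarget AIX;
      AIX.IsAIX = true;
      assert(Orders[getGPRAllocationOrderIdx(AIX)] == "aix-csr-descending");
      return 0;
    }
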
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 43dcc58..3186d19 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -50,6 +50,7 @@
bool Changed = false;
bool NeedFence = true;
bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64();
+ bool IsAIX = MBB.getParent()->getSubtarget<PPCSubtarget>().isAIXABI();
bool IsPCREL = false;
for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
@@ -60,7 +61,9 @@
if (MI.getOpcode() != PPC::ADDItlsgdLADDR &&
MI.getOpcode() != PPC::ADDItlsldLADDR &&
MI.getOpcode() != PPC::ADDItlsgdLADDR32 &&
- MI.getOpcode() != PPC::ADDItlsldLADDR32 && !IsPCREL) {
+ MI.getOpcode() != PPC::ADDItlsldLADDR32 &&
+ MI.getOpcode() != PPC::TLSGDAIX &&
+ MI.getOpcode() != PPC::TLSGDAIX8 && !IsPCREL) {
// Although we create ADJCALLSTACKDOWN and ADJCALLSTACKUP
// as scheduling fences, we skip creating fences if we already
// have existing ADJCALLSTACKDOWN/UP to avoid nesting,
@@ -79,6 +82,7 @@
Register OutReg = MI.getOperand(0).getReg();
Register InReg = PPC::NoRegister;
Register GPR3 = Is64Bit ? PPC::X3 : PPC::R3;
+ Register GPR4 = Is64Bit ? PPC::X4 : PPC::R4;
SmallVector<Register, 3> OrigRegs = {OutReg, GPR3};
if (!IsPCREL) {
InReg = MI.getOperand(1).getReg();
@@ -106,6 +110,16 @@
Opc1 = PPC::ADDItlsldL32;
Opc2 = PPC::GETtlsldADDR32;
break;
+ case PPC::TLSGDAIX8:
+ // TLSGDAIX8 is expanded to two copies and GET_TLS_ADDR, so we only
+ // set Opc2 here.
+ Opc2 = PPC::GETtlsADDR64AIX;
+ break;
+ case PPC::TLSGDAIX:
+ // TLSGDAIX is expanded to two copies and GET_TLS_ADDR, so we only
+ // set Opc2 here.
+ Opc2 = PPC::GETtlsADDR32AIX;
+ break;
case PPC::PADDI8pc:
assert(IsPCREL && "Expecting General/Local Dynamic PCRel");
Opc1 = PPC::PADDI8pc;
@@ -125,29 +139,38 @@
BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0)
.addImm(0);
- MachineInstr *Addi;
- if (IsPCREL) {
- Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3).addImm(0);
- } else {
- // Expand into two ops built prior to the existing instruction.
- assert(InReg != PPC::NoRegister && "Operand must be a register");
- Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3).addReg(InReg);
- }
-
- Addi->addOperand(MI.getOperand(2));
-
// The ADDItls* instruction is the first instruction in the
// repair range.
MachineBasicBlock::iterator First = I;
--First;
- MachineInstr *Call = (BuildMI(MBB, I, DL, TII->get(Opc2), GPR3)
- .addReg(GPR3));
- if (IsPCREL)
- Call->addOperand(MI.getOperand(2));
- else
- Call->addOperand(MI.getOperand(3));
+ if (IsAIX) {
+      // The variable offset and the region handle are copied into r4 and r3.
+      // The copies are followed by GETtlsADDR32AIX/GETtlsADDR64AIX.
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), GPR4)
+ .addReg(MI.getOperand(1).getReg());
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), GPR3)
+ .addReg(MI.getOperand(2).getReg());
+ BuildMI(MBB, I, DL, TII->get(Opc2), GPR3).addReg(GPR3).addReg(GPR4);
+ } else {
+ MachineInstr *Addi;
+ if (IsPCREL) {
+ Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3).addImm(0);
+ } else {
+ // Expand into two ops built prior to the existing instruction.
+ assert(InReg != PPC::NoRegister && "Operand must be a register");
+ Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3).addReg(InReg);
+ }
+ Addi->addOperand(MI.getOperand(2));
+
+ MachineInstr *Call =
+ (BuildMI(MBB, I, DL, TII->get(Opc2), GPR3).addReg(GPR3));
+ if (IsPCREL)
+ Call->addOperand(MI.getOperand(2));
+ else
+ Call->addOperand(MI.getOperand(3));
+ }
if (NeedFence)
BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0);
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 0634833..82717300 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -123,16 +123,21 @@
initializePPCTLSDynamicCallPass(PR);
initializePPCMIPeepholePass(PR);
initializePPCLowerMASSVEntriesPass(PR);
+ initializePPCExpandAtomicPseudoPass(PR);
initializeGlobalISel(PR);
}
+static bool isLittleEndianTriple(const Triple &T) {
+ return T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle;
+}
+
/// Return the datalayout string of a subtarget.
static std::string getDataLayoutString(const Triple &T) {
bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
std::string Ret;
// Most PPC* platforms are big endian, PPC(64)LE is little endian.
- if (T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle)
+ if (isLittleEndianTriple(T))
Ret = "e";
else
Ret = "E";
@@ -157,9 +162,8 @@
// Specify the vector alignment explicitly. For v256i1 and v512i1, the
// calculated alignment would be 256*alignment(i1) and 512*alignment(i1),
// which is 256 and 512 bytes - way over aligned.
- if ((T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppc64) &&
- (T.isOSAIX() || T.isOSLinux()))
- Ret += "-v256:256:256-v512:512:512";
+ if (is64Bit && (T.isOSAIX() || T.isOSLinux()))
+ Ret += "-S128-v256:256:256-v512:512:512";
return Ret;
}
@@ -318,7 +322,8 @@
getEffectiveRelocModel(TT, RM),
getEffectivePPCCodeModel(TT, CM, JIT), OL),
TLOF(createTLOF(getTargetTriple())),
- TargetABI(computeTargetABI(TT, Options)) {
+ TargetABI(computeTargetABI(TT, Options)),
+ Endianness(isLittleEndianTriple(TT) ? Endian::LITTLE : Endian::BIG) {
initAsmInfo();
}
@@ -339,8 +344,7 @@
// function before we can generate a subtarget. We also need to use
// it as a key for the subtarget since that can be the only difference
// between two functions.
- bool SoftFloat =
- F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ bool SoftFloat = F.getFnAttribute("use-soft-float").getValueAsBool();
// If the soft float attribute is set on the function turn on the soft float
// subtarget feature.
if (SoftFloat)
@@ -394,6 +398,7 @@
void addPreRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
+ void addPreEmitPass2() override;
// GlobalISEL
bool addIRTranslator() override;
bool addLegalizeMachineIR() override;
@@ -532,6 +537,13 @@
if (getOptLevel() != CodeGenOpt::None)
addPass(createPPCEarlyReturnPass());
+}
+
+void PPCPassConfig::addPreEmitPass2() {
+ // Schedule the expansion of AMOs at the last possible moment, avoiding the
+ // possibility for other passes to break the requirements for forward
+ // progress in the LL/SC block.
+ addPass(createPPCExpandAtomicPseudoPass());
// Must run branch selection immediately preceding the asm printer.
addPass(createPPCBranchSelectionPass());
}
@@ -541,6 +553,12 @@
return TargetTransformInfo(PPCTTIImpl(this, F));
}
+bool PPCTargetMachine::isLittleEndian() const {
+ assert(Endianness != Endian::NOT_DETECTED &&
+ "Unable to determine endianness");
+ return Endianness == Endian::LITTLE;
+}
+
static MachineSchedRegistry
PPCPreRASchedRegistry("ppc-prera",
"Run PowerPC PreRA specific scheduler",
@@ -568,6 +586,6 @@
}
bool PPCPassConfig::addGlobalInstructionSelect() {
- addPass(new InstructionSelect());
+ addPass(new InstructionSelect(getOptLevel()));
return false;
}
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.h
index 21faa4e..ed9e74b 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.h
@@ -25,9 +25,12 @@
class PPCTargetMachine final : public LLVMTargetMachine {
public:
enum PPCABI { PPC_ABI_UNKNOWN, PPC_ABI_ELFv1, PPC_ABI_ELFv2 };
+ enum Endian { NOT_DETECTED, LITTLE, BIG };
+
private:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
PPCABI TargetABI;
+ Endian Endianness = Endian::NOT_DETECTED;
mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap;
@@ -63,6 +66,8 @@
// Addrspacecasts are always noops.
return true;
}
+
+ bool isLittleEndian() const;
};
} // end namespace llvm
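
The endianness plumbing added above classifies the target triple once in the PPCTargetMachine constructor and asserts that the classification happened before it is queried. A standalone sketch of that flow, with a plain string standing in for llvm::Triple (an illustrative simplification, not the real API):

    #include <cassert>
    #include <string>

    enum class Endian { NOT_DETECTED, LITTLE, BIG };

    // Mirrors isLittleEndianTriple(): only ppc64le and ppcle are
    // little-endian; every other PowerPC arch is big-endian.
    static bool isLittleEndianArch(const std::string &Arch) {
      return Arch == "ppc64le" || Arch == "ppcle";
    }

    struct FakeTargetMachine {
      Endian Endianness = Endian::NOT_DETECTED;
      explicit FakeTargetMachine(const std::string &Arch)
          : Endianness(isLittleEndianArch(Arch) ? Endian::LITTLE : Endian::BIG) {}
      bool isLittleEndian() const {
        // Same guard as PPCTargetMachine::isLittleEndian().
        assert(Endianness != Endian::NOT_DETECTED &&
               "Unable to determine endianness");
        return Endianness == Endian::LITTLE;
      }
    };

    int main() {
      assert(FakeTargetMachine("ppc64le").isLittleEndian());
      assert(!FakeTargetMachine("ppc64").isLittleEndian());
      return 0;
    }
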
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
index d52c9f9..736150f 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -40,7 +40,8 @@
if (Kind.isReadOnly()) {
const auto *GVar = dyn_cast<GlobalVariable>(GO);
- if (GVar && GVar->isConstant() && GVar->getInitializer()->needsRelocation())
+ if (GVar && GVar->isConstant() &&
+ GVar->getInitializer()->needsDynamicRelocation())
Kind = SectionKind::getReadOnlyWithRel();
}
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetStreamer.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
index e17361d..82fcd9e 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -23,7 +23,8 @@
PPCTargetStreamer(MCStreamer &S);
~PPCTargetStreamer() override;
- virtual void emitTCEntry(const MCSymbol &S) = 0;
+ virtual void emitTCEntry(const MCSymbol &S,
+ MCSymbolRefExpr::VariantKind Kind) = 0;
virtual void emitMachine(StringRef CPU) = 0;
virtual void emitAbiVersion(int AbiVersion) = 0;
virtual void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) = 0;
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index c90ff8b..d5a7873 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -167,8 +167,8 @@
return None;
}
-int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
if (DisablePPCConstHoist)
return BaseT::getIntImmCost(Imm, Ty, CostKind);
@@ -197,9 +197,9 @@
return 4 * TTI::TCC_Basic;
}
-int PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
if (DisablePPCConstHoist)
return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);
@@ -232,10 +232,10 @@
return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-int PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind,
- Instruction *Inst) {
+InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
if (DisablePPCConstHoist)
return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst);
@@ -318,9 +318,9 @@
return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-unsigned
-PPCTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands,
- TTI::TargetCostKind CostKind) {
+InstructionCost PPCTTIImpl::getUserCost(const User *U,
+ ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind) {
// We already implement getCastInstrCost and getMemoryOpCost where we perform
// the vector adjustment there.
if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
@@ -328,7 +328,8 @@
if (U->getType()->isVectorTy()) {
// Instructions that need to be split should cost more.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, U->getType());
+ std::pair<InstructionCost, MVT> LT =
+ TLI->getTypeLegalizationCost(DL, U->getType());
return LT.first * BaseT::getUserCost(U, Operands, CostKind);
}
@@ -370,7 +371,7 @@
InlineAsm::ConstraintInfo &C = CIV[i];
if (C.Type != InlineAsm::isInput)
for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
- if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+ if (StringRef(C.Codes[j]).equals_insensitive("{ctr}"))
return true;
}
return false;
@@ -871,16 +872,18 @@
}
}
-unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
- if (Vector) {
- if (ST->hasAltivec()) return 128;
- return 0;
+TypeSize
+PPCTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(ST->isPPC64() ? 64 : 32);
+ case TargetTransformInfo::RGK_FixedWidthVector:
+ return TypeSize::getFixed(ST->hasAltivec() ? 128 : 0);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(0);
}
- if (ST->isPPC64())
- return 64;
- return 32;
-
+ llvm_unreachable("Unsupported register kind");
}
unsigned PPCTTIImpl::getCacheLineSize() const {
@@ -938,12 +941,13 @@
// Adjust the cost of vector instructions on targets which there is overlap
// between the vector and scalar units, thereby reducing the overall throughput
// of vector code wrt. scalar code.
-int PPCTTIImpl::vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1,
- Type *Ty2) {
+InstructionCost PPCTTIImpl::vectorCostAdjustment(InstructionCost Cost,
+ unsigned Opcode, Type *Ty1,
+ Type *Ty2) {
if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
return Cost;
- std::pair<int, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1);
+ std::pair<InstructionCost, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1);
// If type legalization involves splitting the vector, we don't want to
// double the cost at every step - only the last step.
if (LT1.first != 1 || !LT1.second.isVector())
@@ -954,7 +958,7 @@
return Cost;
if (Ty2) {
- std::pair<int, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2);
+ std::pair<InstructionCost, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2);
if (LT2.first != 1 || !LT2.second.isVector())
return Cost;
}
@@ -962,14 +966,12 @@
return Cost * 2;
}
-int PPCTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind,
- TTI::OperandValueKind Op1Info,
- TTI::OperandValueKind Op2Info,
- TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo,
- ArrayRef<const Value *> Args,
- const Instruction *CxtI) {
+InstructionCost PPCTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
// TODO: Handle more cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
@@ -978,16 +980,16 @@
Opd2PropInfo, Args, CxtI);
// Fallback to the default implementation.
- int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
- Op2Info,
- Opd1PropInfo, Opd2PropInfo);
+ InstructionCost Cost = BaseT::getArithmeticInstrCost(
+ Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
return vectorCostAdjustment(Cost, Opcode, Ty, nullptr);
}
-int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
+InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
+ ArrayRef<int> Mask, int Index,
+ Type *SubTp) {
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
// PPC, for both Altivec/VSX, support cheap arbitrary permutations
// (at least in the sense that there need only be one non-loop-invariant
@@ -998,20 +1000,24 @@
nullptr);
}
-int PPCTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+InstructionCost PPCTTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (CostKind != TTI::TCK_RecipThroughput)
return Opcode == Instruction::PHI ? 0 : 1;
// Branches are assumed to be predicted.
- return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
+ return 0;
}
-int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
- int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
+ InstructionCost Cost =
+ BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
Cost = vectorCostAdjustment(Cost, Opcode, Dst, Src);
// TODO: Allow non-throughput costs that aren't binary.
if (CostKind != TTI::TCK_RecipThroughput)
@@ -1019,11 +1025,12 @@
return Cost;
}
-int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
- int Cost =
+InstructionCost PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ InstructionCost Cost =
BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
@@ -1031,13 +1038,14 @@
return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr);
}
-int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) {
assert(Val->isVectorTy() && "This must be a vector type");
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- int Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
+ InstructionCost Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
Cost = vectorCostAdjustment(Cost, Opcode, Val, nullptr);
if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
@@ -1097,20 +1105,21 @@
return Cost;
}
-int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
- MaybeAlign Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (TLI->getValueType(DL, Src, true) == MVT::Other)
return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
CostKind);
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
"Invalid Opcode");
- int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
- CostKind);
+ InstructionCost Cost =
+ BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
return Cost;
@@ -1177,7 +1186,7 @@
return Cost;
}
-int PPCTTIImpl::getInterleavedMemoryOpCost(
+InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) {
@@ -1190,12 +1199,11 @@
"Expect a vector type for interleaved memory op");
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
// Firstly, the cost of load/store operation.
- int Cost =
- getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace,
- CostKind);
+ InstructionCost Cost = getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment),
+ AddressSpace, CostKind);
// PPC, for both Altivec/VSX, support cheap arbitrary permutations
// (at least in the sense that there need only be one non-loop-invariant
@@ -1207,8 +1215,9 @@
return Cost;
}
-unsigned PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
@@ -1273,6 +1282,14 @@
return false;
}
+bool PPCTTIImpl::shouldBuildRelLookupTables() const {
+ const PPCTargetMachine &TM = ST->getTargetMachine();
+  // XCOFF hasn't implemented lowerRelativeReference; disable non-ELF for now.
+ if (!TM.isELFv2ABI())
+ return false;
+ return BaseT::shouldBuildRelLookupTables();
+}
+
bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) {
switch (Inst->getIntrinsicID()) {
diff --git a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index c38ae90..8ac3038 100644
--- a/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -48,17 +48,19 @@
/// @{
using BaseT::getIntImmCost;
- int getIntImmCost(const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind);
+ InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind,
- Instruction *Inst = nullptr);
- int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind);
+ InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
+ InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
- unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands,
- TTI::TargetCostKind CostKind);
+ InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
@@ -76,7 +78,7 @@
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
bool isNumRegsMajorCostOfLSR();
-
+ bool shouldBuildRelLookupTables() const;
/// @}
/// \name Vector TTI Implementations
@@ -93,12 +95,13 @@
unsigned getNumberOfRegisters(unsigned ClassID) const;
unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const;
const char* getRegisterClassName(unsigned ClassID) const;
- unsigned getRegisterBitWidth(bool Vector) const;
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
unsigned getCacheLineSize() const override;
unsigned getPrefetchDistance() const override;
unsigned getMaxInterleaveFactor(unsigned VF);
- int vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, Type *Ty2);
- int getArithmeticInstrCost(
+ InstructionCost vectorCostAdjustment(InstructionCost Cost, unsigned Opcode,
+ Type *Ty1, Type *Ty2);
+ InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
@@ -107,28 +110,31 @@
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
- int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
- int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
- int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
- int getInterleavedMemoryOpCost(
+ InstructionCost getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
+ ArrayRef<int> Mask, int Index, Type *SubTp);
+ InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index);
+ InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
- unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind);
-
+ InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
bool areFunctionArgsABICompatible(const Function *Caller,
const Function *Callee,
SmallPtrSetImpl<Argument *> &Args) const;
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/src/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index dcf7525..87496e0 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -528,8 +528,7 @@
RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
int64_t Imm;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
- return IsConstantImm && isInt<6>(Imm) &&
- VK == RISCVMCExpr::VK_RISCV_None;
+ return IsConstantImm && isInt<6>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
}
bool isSImm6NonZero() const {
@@ -841,10 +840,18 @@
switch (c) {
default:
llvm_unreachable("FenceArg must contain only [iorw]");
- case 'i': Imm |= RISCVFenceField::I; break;
- case 'o': Imm |= RISCVFenceField::O; break;
- case 'r': Imm |= RISCVFenceField::R; break;
- case 'w': Imm |= RISCVFenceField::W; break;
+ case 'i':
+ Imm |= RISCVFenceField::I;
+ break;
+ case 'o':
+ Imm |= RISCVFenceField::O;
+ break;
+ case 'r':
+ Imm |= RISCVFenceField::R;
+ break;
+ case 'w':
+ Imm |= RISCVFenceField::W;
+ break;
}
}
Inst.addOperand(MCOperand::createImm(Imm));
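
The reformatted switch above ORs one bit per letter of the fence argument into a 4-bit pred/succ field. A standalone sketch of the same encoding, assuming the conventional i/o/r/w bit weights of 8/4/2/1 for the RISCVFenceField values:

    #include <cassert>
    #include <stdexcept>
    #include <string>

    // Assumed bit weights for the fence pred/succ field (i, o, r, w from the
    // most- to the least-significant bit of the 4-bit field).
    enum FenceField : unsigned { I = 8, O = 4, R = 2, W = 1 };

    // Mirrors the character-by-character OR loop in the parser above.
    unsigned encodeFenceArg(const std::string &Arg) {
      unsigned Imm = 0;
      for (char c : Arg) {
        switch (c) {
        case 'i': Imm |= I; break;
        case 'o': Imm |= O; break;
        case 'r': Imm |= R; break;
        case 'w': Imm |= W; break;
        default:  throw std::invalid_argument("FenceArg must contain only [iorw]");
        }
      }
      return Imm;
    }

    int main() {
      assert(encodeFenceArg("iorw") == 0xF); // full fence
      assert(encodeFenceArg("rw") == 0x3);   // memory loads and stores only
      return 0;
    }
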
@@ -894,6 +901,21 @@
return Reg - RISCV::F0_D + RISCV::F0_F;
}
+static MCRegister convertVRToVRMx(const MCRegisterInfo &RI, MCRegister Reg,
+ unsigned Kind) {
+ unsigned RegClassID;
+ if (Kind == MCK_VRM2)
+ RegClassID = RISCV::VRM2RegClassID;
+ else if (Kind == MCK_VRM4)
+ RegClassID = RISCV::VRM4RegClassID;
+ else if (Kind == MCK_VRM8)
+ RegClassID = RISCV::VRM8RegClassID;
+ else
+ return 0;
+ return RI.getMatchingSuperReg(Reg, RISCV::sub_vrm1_0,
+ &RISCVMCRegisterClasses[RegClassID]);
+}
+
unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
unsigned Kind) {
RISCVOperand &Op = static_cast<RISCVOperand &>(AsmOp);
@@ -905,6 +927,7 @@
RISCVMCRegisterClasses[RISCV::FPR64RegClassID].contains(Reg);
bool IsRegFPR64C =
RISCVMCRegisterClasses[RISCV::FPR64CRegClassID].contains(Reg);
+ bool IsRegVR = RISCVMCRegisterClasses[RISCV::VRRegClassID].contains(Reg);
// As the parser couldn't differentiate an FPR32 from an FPR64, coerce the
// register from FPR64 to FPR32 or FPR64C to FPR32C if necessary.
@@ -919,6 +942,14 @@
Op.Reg.RegNum = convertFPR64ToFPR16(Reg);
return Match_Success;
}
+  // As the parser couldn't differentiate a VRM2/VRM4/VRM8 from a VR, coerce
+  // the register from VR to VRM2/VRM4/VRM8 if necessary.
+ if (IsRegVR && (Kind == MCK_VRM2 || Kind == MCK_VRM4 || Kind == MCK_VRM8)) {
+ Op.Reg.RegNum = convertVRToVRMx(*getContext().getRegisterInfo(), Reg, Kind);
+ if (Op.Reg.RegNum == 0)
+ return Match_InvalidOperand;
+ return Match_Success;
+ }
return Match_InvalidOperand;
}
@@ -930,8 +961,8 @@
}
static std::string RISCVMnemonicSpellCheck(StringRef S,
- const FeatureBitset &FBS,
- unsigned VariantID = 0);
+ const FeatureBitset &FBS,
+ unsigned VariantID = 0);
bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
@@ -941,9 +972,8 @@
MCInst Inst;
FeatureBitset MissingFeatures;
- auto Result =
- MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,
- MatchingInlineAsm);
+ auto Result = MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,
+ MatchingInlineAsm);
switch (Result) {
default:
break;
@@ -966,8 +996,8 @@
}
case Match_MnemonicFail: {
FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
- std::string Suggestion = RISCVMnemonicSpellCheck(
- ((RISCVOperand &)*Operands[0]).getToken(), FBS);
+ std::string Suggestion =
+ RISCVMnemonicSpellCheck(((RISCVOperand &)*Operands[0]).getToken(), FBS);
return Error(IDLoc, "unrecognized instruction mnemonic" + Suggestion);
}
case Match_InvalidOperand: {
@@ -990,10 +1020,10 @@
if (Result > FIRST_TARGET_MATCH_RESULT_TY) {
SMLoc ErrorLoc = IDLoc;
if (ErrorInfo != ~0U && ErrorInfo >= Operands.size())
- return Error(ErrorLoc, "too few operands for instruction");
+ return Error(ErrorLoc, "too few operands for instruction");
}
- switch(Result) {
+ switch (Result) {
default:
break;
case Match_InvalidImmXLenLI:
@@ -1282,6 +1312,11 @@
auto SysReg = RISCVSysReg::lookupSysRegByName(Identifier);
if (!SysReg)
SysReg = RISCVSysReg::lookupSysRegByAltName(Identifier);
+ if (!SysReg)
+ if ((SysReg = RISCVSysReg::lookupSysRegByDeprecatedName(Identifier)))
+ Warning(S, "'" + Identifier + "' is a deprecated alias for '" +
+ SysReg->Name + "'");
+
// Accept a named Sys Reg if the required features are present.
if (SysReg) {
if (!SysReg->haveRequiredFeatures(getSTI().getFeatureBits())) {
@@ -1496,80 +1531,82 @@
OperandMatchResultTy RISCVAsmParser::parseVTypeI(OperandVector &Operands) {
SMLoc S = getLoc();
- if (getLexer().getKind() != AsmToken::Identifier)
+ if (getLexer().isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
- // Parse "e8,m1,t[a|u],m[a|u]"
- StringRef Name = getLexer().getTok().getIdentifier();
- if (!Name.consume_front("e"))
- return MatchOperand_NoMatch;
- unsigned Sew;
- if (Name.getAsInteger(10, Sew))
- return MatchOperand_NoMatch;
- if (!RISCVVType::isValidSEW(Sew))
- return MatchOperand_NoMatch;
- getLexer().Lex();
+ SmallVector<AsmToken, 7> VTypeIElements;
+ // Put all the tokens for vtypei operand into VTypeIElements vector.
+ while (getLexer().isNot(AsmToken::EndOfStatement)) {
+ VTypeIElements.push_back(getLexer().getTok());
+ getLexer().Lex();
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+ if (getLexer().isNot(AsmToken::Comma))
+ goto MatchFail;
+ AsmToken Comma = getLexer().getTok();
+ VTypeIElements.push_back(Comma);
+ getLexer().Lex();
+ }
- if (!getLexer().is(AsmToken::Comma))
- return MatchOperand_NoMatch;
- getLexer().Lex();
+ if (VTypeIElements.size() == 7) {
+ // The VTypeIElements layout is:
+ // SEW comma LMUL comma TA comma MA
+ // 0 1 2 3 4 5 6
+ StringRef Name = VTypeIElements[0].getIdentifier();
+ if (!Name.consume_front("e"))
+ goto MatchFail;
+ unsigned Sew;
+ if (Name.getAsInteger(10, Sew))
+ goto MatchFail;
+ if (!RISCVVType::isValidSEW(Sew))
+ goto MatchFail;
- Name = getLexer().getTok().getIdentifier();
- if (!Name.consume_front("m"))
- return MatchOperand_NoMatch;
- // "m" or "mf"
- bool Fractional = Name.consume_front("f");
- unsigned Lmul;
- if (Name.getAsInteger(10, Lmul))
- return MatchOperand_NoMatch;
- if (!RISCVVType::isValidLMUL(Lmul, Fractional))
- return MatchOperand_NoMatch;
- getLexer().Lex();
+ Name = VTypeIElements[2].getIdentifier();
+ if (!Name.consume_front("m"))
+ goto MatchFail;
+ // "m" or "mf"
+ bool Fractional = Name.consume_front("f");
+ unsigned Lmul;
+ if (Name.getAsInteger(10, Lmul))
+ goto MatchFail;
+ if (!RISCVVType::isValidLMUL(Lmul, Fractional))
+ goto MatchFail;
- if (!getLexer().is(AsmToken::Comma))
- return MatchOperand_NoMatch;
- getLexer().Lex();
+ // ta or tu
+ Name = VTypeIElements[4].getIdentifier();
+ bool TailAgnostic;
+ if (Name == "ta")
+ TailAgnostic = true;
+ else if (Name == "tu")
+ TailAgnostic = false;
+ else
+ goto MatchFail;
- Name = getLexer().getTok().getIdentifier();
- // ta or tu
- bool TailAgnostic;
- if (Name == "ta")
- TailAgnostic = true;
- else if (Name == "tu")
- TailAgnostic = false;
- else
- return MatchOperand_NoMatch;
- getLexer().Lex();
+ // ma or mu
+ Name = VTypeIElements[6].getIdentifier();
+ bool MaskAgnostic;
+ if (Name == "ma")
+ MaskAgnostic = true;
+ else if (Name == "mu")
+ MaskAgnostic = false;
+ else
+ goto MatchFail;
- if (!getLexer().is(AsmToken::Comma))
- return MatchOperand_NoMatch;
- getLexer().Lex();
+ unsigned LmulLog2 = Log2_32(Lmul);
+ RISCVII::VLMUL VLMUL =
+ static_cast<RISCVII::VLMUL>(Fractional ? 8 - LmulLog2 : LmulLog2);
- Name = getLexer().getTok().getIdentifier();
- // ma or mu
- bool MaskAgnostic;
- if (Name == "ma")
- MaskAgnostic = true;
- else if (Name == "mu")
- MaskAgnostic = false;
- else
- return MatchOperand_NoMatch;
- getLexer().Lex();
+ unsigned VTypeI =
+ RISCVVType::encodeVTYPE(VLMUL, Sew, TailAgnostic, MaskAgnostic);
+ Operands.push_back(RISCVOperand::createVType(VTypeI, S, isRV64()));
+ return MatchOperand_Success;
+ }
- if (getLexer().getKind() != AsmToken::EndOfStatement)
- return MatchOperand_NoMatch;
-
- unsigned SewLog2 = Log2_32(Sew / 8);
- unsigned LmulLog2 = Log2_32(Lmul);
- RISCVVSEW VSEW = static_cast<RISCVVSEW>(SewLog2);
- RISCVVLMUL VLMUL =
- static_cast<RISCVVLMUL>(Fractional ? 8 - LmulLog2 : LmulLog2);
-
- unsigned VTypeI =
- RISCVVType::encodeVTYPE(VLMUL, VSEW, TailAgnostic, MaskAgnostic);
- Operands.push_back(RISCVOperand::createVType(VTypeI, S, isRV64()));
-
- return MatchOperand_Success;
+// If NoMatch, unlex all the tokens that comprise a vtypei operand
+MatchFail:
+ while (!VTypeIElements.empty())
+ getLexer().UnLex(VTypeIElements.pop_back_val());
+ return MatchOperand_NoMatch;
}
OperandMatchResultTy RISCVAsmParser::parseMaskReg(OperandVector &Operands) {
@@ -1933,7 +1970,7 @@
if (Parser.getTok().is(AsmToken::Identifier)) {
StringRef Name = Parser.getTok().getIdentifier();
Optional<unsigned> Ret =
- ELFAttrs::attrTypeFromString(Name, RISCVAttrs::RISCVAttributeTags);
+ ELFAttrs::attrTypeFromString(Name, RISCVAttrs::getRISCVAttributeTags());
if (!Ret.hasValue()) {
Error(TagLoc, "attribute name not recognised: " + Name);
return false;
@@ -2173,14 +2210,19 @@
void RISCVAsmParser::emitLoadImm(MCRegister DestReg, int64_t Value,
MCStreamer &Out) {
- RISCVMatInt::InstSeq Seq;
- RISCVMatInt::generateInstSeq(Value, isRV64(), Seq);
+ RISCVMatInt::InstSeq Seq =
+ RISCVMatInt::generateInstSeq(Value, getSTI().getFeatureBits());
MCRegister SrcReg = RISCV::X0;
for (RISCVMatInt::Inst &Inst : Seq) {
if (Inst.Opc == RISCV::LUI) {
emitToStreamer(
Out, MCInstBuilder(RISCV::LUI).addReg(DestReg).addImm(Inst.Imm));
+ } else if (Inst.Opc == RISCV::ADDUW) {
+ emitToStreamer(Out, MCInstBuilder(RISCV::ADDUW)
+ .addReg(DestReg)
+ .addReg(SrcReg)
+ .addReg(RISCV::X0));
} else {
emitToStreamer(
Out, MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addImm(
@@ -2367,7 +2409,8 @@
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(0))
.addReg(RISCV::V0));
- } else if (Inst.getNumOperands() == 5) {
+ } else if (Inst.getNumOperands() == 5 &&
+ Inst.getOperand(0).getReg() == RISCV::V0) {
// masked va >= x, vd == v0
//
// pseudoinstruction: vmsge{u}.vx vd, va, x, v0.t, vt
@@ -2385,6 +2428,31 @@
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(1)));
+ } else if (Inst.getNumOperands() == 5) {
+ // masked va >= x, any vd
+ //
+ // pseudoinstruction: vmsge{u}.vx vd, va, x, v0.t, vt
+ // expansion: vmslt{u}.vx vt, va, x; vmandnot.mm vt, v0, vt; vmandnot.mm vd,
+ // vd, v0; vmor.mm vd, vt, vd
+ assert(Inst.getOperand(1).getReg() != RISCV::V0 &&
+ "The temporary vector register should not be V0.");
+ emitToStreamer(Out, MCInstBuilder(Opcode)
+ .addOperand(Inst.getOperand(1))
+ .addOperand(Inst.getOperand(2))
+ .addOperand(Inst.getOperand(3))
+ .addReg(RISCV::NoRegister));
+ emitToStreamer(Out, MCInstBuilder(RISCV::VMANDNOT_MM)
+ .addOperand(Inst.getOperand(1))
+ .addReg(RISCV::V0)
+ .addOperand(Inst.getOperand(1)));
+ emitToStreamer(Out, MCInstBuilder(RISCV::VMANDNOT_MM)
+ .addOperand(Inst.getOperand(0))
+ .addOperand(Inst.getOperand(0))
+ .addReg(RISCV::V0));
+ emitToStreamer(Out, MCInstBuilder(RISCV::VMOR_MM)
+ .addOperand(Inst.getOperand(0))
+ .addOperand(Inst.getOperand(1))
+ .addOperand(Inst.getOperand(0)));
}
}
@@ -2408,9 +2476,20 @@
bool RISCVAsmParser::validateInstruction(MCInst &Inst,
OperandVector &Operands) {
+ if (Inst.getOpcode() == RISCV::PseudoVMSGEU_VX_M_T ||
+ Inst.getOpcode() == RISCV::PseudoVMSGE_VX_M_T) {
+ unsigned DestReg = Inst.getOperand(0).getReg();
+ unsigned TempReg = Inst.getOperand(1).getReg();
+ if (DestReg == TempReg) {
+ SMLoc Loc = Operands.back()->getStartLoc();
+ return Error(Loc, "The temporary vector register cannot be the same as "
+ "the destination register.");
+ }
+ }
+
const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
- unsigned Constraints =
- (MCID.TSFlags & RISCVII::ConstraintMask) >> RISCVII::ConstraintShift;
+ RISCVII::VConstraintType Constraints =
+ RISCVII::getConstraint(MCID.TSFlags);
if (Constraints == RISCVII::NoConstraint)
return false;
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/RISCV/CMakeLists.txt
index c4f3d31..961781b 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -21,10 +21,10 @@
add_llvm_target(RISCVCodeGen
RISCVAsmPrinter.cpp
RISCVCallLowering.cpp
- RISCVCleanupVSETVLI.cpp
RISCVExpandAtomicPseudoInsts.cpp
RISCVExpandPseudoInsts.cpp
RISCVFrameLowering.cpp
+ RISCVInsertVSETVLI.cpp
RISCVInstrInfo.cpp
RISCVInstructionSelector.cpp
RISCVISelDAGToDAG.cpp
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/Disassembler/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/RISCV/Disassembler/CMakeLists.txt
index b0aae87..682ff19 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/Disassembler/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/RISCV/Disassembler/CMakeLists.txt
@@ -2,7 +2,9 @@
RISCVDisassembler.cpp
LINK_COMPONENTS
+ MC
MCDisassembler
+ RISCVDesc
RISCVInfo
Support
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/src/llvm-project/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 6235523..504a78d 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -172,6 +172,66 @@
return MCDisassembler::Success;
}
+static DecodeStatus DecodeVRM2RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo >= 32)
+ return MCDisassembler::Fail;
+
+ if (RegNo % 2)
+ return MCDisassembler::Fail;
+
+ const RISCVDisassembler *Dis =
+ static_cast<const RISCVDisassembler *>(Decoder);
+ const MCRegisterInfo *RI = Dis->getContext().getRegisterInfo();
+ MCRegister Reg =
+ RI->getMatchingSuperReg(RISCV::V0 + RegNo, RISCV::sub_vrm1_0,
+ &RISCVMCRegisterClasses[RISCV::VRM2RegClassID]);
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeVRM4RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo >= 32)
+ return MCDisassembler::Fail;
+
+ if (RegNo % 4)
+ return MCDisassembler::Fail;
+
+ const RISCVDisassembler *Dis =
+ static_cast<const RISCVDisassembler *>(Decoder);
+ const MCRegisterInfo *RI = Dis->getContext().getRegisterInfo();
+ MCRegister Reg =
+ RI->getMatchingSuperReg(RISCV::V0 + RegNo, RISCV::sub_vrm1_0,
+ &RISCVMCRegisterClasses[RISCV::VRM4RegClassID]);
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeVRM8RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo >= 32)
+ return MCDisassembler::Fail;
+
+ if (RegNo % 8)
+ return MCDisassembler::Fail;
+
+ const RISCVDisassembler *Dis =
+ static_cast<const RISCVDisassembler *>(Decoder);
+ const MCRegisterInfo *RI = Dis->getContext().getRegisterInfo();
+ MCRegister Reg =
+ RI->getMatchingSuperReg(RISCV::V0 + RegNo, RISCV::sub_vrm1_0,
+ &RISCVMCRegisterClasses[RISCV::VRM8RegClassID]);
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus decodeVMaskReg(MCInst &Inst, uint64_t RegNo,
uint64_t Address, const void *Decoder) {
MCRegister Reg = RISCV::NoRegister;
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index bb1f1cc..b93197e 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -9,6 +9,7 @@
#include "RISCVAsmBackend.h"
#include "RISCVMCExpr.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -18,7 +19,10 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -30,6 +34,9 @@
#define ELF_RELOC(X, Y) .Case(#X, Y)
#include "llvm/BinaryFormat/ELFRelocs/RISCV.def"
#undef ELF_RELOC
+ .Case("BFD_RELOC_NONE", ELF::R_RISCV_NONE)
+ .Case("BFD_RELOC_32", ELF::R_RISCV_32)
+ .Case("BFD_RELOC_64", ELF::R_RISCV_64)
.Default(-1u);
if (Type != -1u)
return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type);
@@ -67,7 +74,26 @@
{"fixup_riscv_call", 0, 64, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_riscv_call_plt", 0, 64, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_riscv_relax", 0, 0, 0},
- {"fixup_riscv_align", 0, 0, 0}};
+ {"fixup_riscv_align", 0, 0, 0},
+
+ {"fixup_riscv_set_8", 0, 8, 0},
+ {"fixup_riscv_add_8", 0, 8, 0},
+ {"fixup_riscv_sub_8", 0, 8, 0},
+
+ {"fixup_riscv_set_16", 0, 16, 0},
+ {"fixup_riscv_add_16", 0, 16, 0},
+ {"fixup_riscv_sub_16", 0, 16, 0},
+
+ {"fixup_riscv_set_32", 0, 32, 0},
+ {"fixup_riscv_add_32", 0, 32, 0},
+ {"fixup_riscv_sub_32", 0, 32, 0},
+
+ {"fixup_riscv_add_64", 0, 64, 0},
+ {"fixup_riscv_sub_64", 0, 64, 0},
+
+ {"fixup_riscv_set_6b", 2, 6, 0},
+ {"fixup_riscv_sub_6b", 2, 6, 0},
+ };
static_assert((array_lengthof(Infos)) == RISCV::NumTargetFixupKinds,
"Not all fixup kinds added to Infos array");
@@ -176,6 +202,135 @@
Inst = std::move(Res);
}
+bool RISCVAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF,
+ MCAsmLayout &Layout,
+ bool &WasRelaxed) const {
+ MCContext &C = Layout.getAssembler().getContext();
+
+ int64_t LineDelta = DF.getLineDelta();
+ const MCExpr &AddrDelta = DF.getAddrDelta();
+ SmallVectorImpl<char> &Data = DF.getContents();
+ SmallVectorImpl<MCFixup> &Fixups = DF.getFixups();
+ size_t OldSize = Data.size();
+
+ int64_t Value;
+ bool IsAbsolute = AddrDelta.evaluateKnownAbsolute(Value, Layout);
+ assert(IsAbsolute && "CFA with invalid expression");
+ (void)IsAbsolute;
+
+ Data.clear();
+ Fixups.clear();
+ raw_svector_ostream OS(Data);
+
+ // INT64_MAX is a signal that this is actually a DW_LNE_end_sequence.
+ if (LineDelta != INT64_MAX) {
+ OS << uint8_t(dwarf::DW_LNS_advance_line);
+ encodeSLEB128(LineDelta, OS);
+ }
+
+ unsigned Offset;
+ std::pair<unsigned, unsigned> Fixup;
+
+ // According to the DWARF specification, the `DW_LNS_fixed_advance_pc` opcode
+ // takes a single unsigned half (unencoded) operand. The maximum encodable
+ // value is therefore 65535. Set a conservative upper bound for relaxation.
+ if (Value > 60000) {
+ unsigned PtrSize = C.getAsmInfo()->getCodePointerSize();
+
+ OS << uint8_t(dwarf::DW_LNS_extended_op);
+ encodeULEB128(PtrSize + 1, OS);
+
+ OS << uint8_t(dwarf::DW_LNE_set_address);
+ Offset = OS.tell();
+ Fixup = PtrSize == 4 ? std::make_pair(RISCV::fixup_riscv_add_32,
+ RISCV::fixup_riscv_sub_32)
+ : std::make_pair(RISCV::fixup_riscv_add_64,
+ RISCV::fixup_riscv_sub_64);
+ OS.write_zeros(PtrSize);
+ } else {
+ OS << uint8_t(dwarf::DW_LNS_fixed_advance_pc);
+ Offset = OS.tell();
+ Fixup = {RISCV::fixup_riscv_add_16, RISCV::fixup_riscv_sub_16};
+ support::endian::write<uint16_t>(OS, 0, support::little);
+ }
+
+ const MCBinaryExpr &MBE = cast<MCBinaryExpr>(AddrDelta);
+ Fixups.push_back(MCFixup::create(
+ Offset, MBE.getLHS(), static_cast<MCFixupKind>(std::get<0>(Fixup))));
+ Fixups.push_back(MCFixup::create(
+ Offset, MBE.getRHS(), static_cast<MCFixupKind>(std::get<1>(Fixup))));
+
+ if (LineDelta == INT64_MAX) {
+ OS << uint8_t(dwarf::DW_LNS_extended_op);
+ OS << uint8_t(1);
+ OS << uint8_t(dwarf::DW_LNE_end_sequence);
+ } else {
+ OS << uint8_t(dwarf::DW_LNS_copy);
+ }
+
+ WasRelaxed = OldSize != Data.size();
+ return true;
+}
+
+bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF,
+ MCAsmLayout &Layout,
+ bool &WasRelaxed) const {
+
+ const MCExpr &AddrDelta = DF.getAddrDelta();
+ SmallVectorImpl<char> &Data = DF.getContents();
+ SmallVectorImpl<MCFixup> &Fixups = DF.getFixups();
+ size_t OldSize = Data.size();
+
+ int64_t Value;
+ bool IsAbsolute = AddrDelta.evaluateKnownAbsolute(Value, Layout);
+ assert(IsAbsolute && "CFA with invalid expression");
+ (void)IsAbsolute;
+
+ Data.clear();
+ Fixups.clear();
+ raw_svector_ostream OS(Data);
+
+ assert(
+ Layout.getAssembler().getContext().getAsmInfo()->getMinInstAlignment() ==
+ 1 &&
+ "expected 1-byte alignment");
+ if (Value == 0) {
+ WasRelaxed = OldSize != Data.size();
+ return true;
+ }
+
+ auto AddFixups = [&Fixups, &AddrDelta](unsigned Offset,
+ std::pair<unsigned, unsigned> Fixup) {
+ const MCBinaryExpr &MBE = cast<MCBinaryExpr>(AddrDelta);
+ Fixups.push_back(MCFixup::create(
+ Offset, MBE.getLHS(), static_cast<MCFixupKind>(std::get<0>(Fixup))));
+ Fixups.push_back(MCFixup::create(
+ Offset, MBE.getRHS(), static_cast<MCFixupKind>(std::get<1>(Fixup))));
+ };
+
+ if (isUIntN(6, Value)) {
+ OS << uint8_t(dwarf::DW_CFA_advance_loc);
+ AddFixups(0, {RISCV::fixup_riscv_set_6b, RISCV::fixup_riscv_sub_6b});
+ } else if (isUInt<8>(Value)) {
+ OS << uint8_t(dwarf::DW_CFA_advance_loc1);
+ support::endian::write<uint8_t>(OS, 0, support::little);
+ AddFixups(1, {RISCV::fixup_riscv_set_8, RISCV::fixup_riscv_sub_8});
+ } else if (isUInt<16>(Value)) {
+ OS << uint8_t(dwarf::DW_CFA_advance_loc2);
+ support::endian::write<uint16_t>(OS, 0, support::little);
+ AddFixups(1, {RISCV::fixup_riscv_set_16, RISCV::fixup_riscv_sub_16});
+ } else if (isUInt<32>(Value)) {
+ OS << uint8_t(dwarf::DW_CFA_advance_loc4);
+ support::endian::write<uint32_t>(OS, 0, support::little);
+ AddFixups(1, {RISCV::fixup_riscv_set_32, RISCV::fixup_riscv_sub_32});
+ } else {
+ llvm_unreachable("unsupported CFA encoding");
+ }
+
+ WasRelaxed = OldSize != Data.size();
+ return true;
+}
+
// Given a compressed control flow instruction this function returns
// the expanded instruction.
unsigned RISCVAsmBackend::getRelaxedOpcode(unsigned Op) const {
@@ -224,12 +379,25 @@
case RISCV::fixup_riscv_tls_got_hi20:
case RISCV::fixup_riscv_tls_gd_hi20:
llvm_unreachable("Relocation should be unconditionally forced\n");
+ case RISCV::fixup_riscv_set_8:
+ case RISCV::fixup_riscv_add_8:
+ case RISCV::fixup_riscv_sub_8:
+ case RISCV::fixup_riscv_set_16:
+ case RISCV::fixup_riscv_add_16:
+ case RISCV::fixup_riscv_sub_16:
+ case RISCV::fixup_riscv_set_32:
+ case RISCV::fixup_riscv_add_32:
+ case RISCV::fixup_riscv_sub_32:
+ case RISCV::fixup_riscv_add_64:
+ case RISCV::fixup_riscv_sub_64:
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
case FK_Data_8:
case FK_Data_6b:
return Value;
+ case RISCV::fixup_riscv_set_6b:
+ return Value & 0x03;
case RISCV::fixup_riscv_lo12_i:
case RISCV::fixup_riscv_pcrel_lo12_i:
case RISCV::fixup_riscv_tprel_lo12_i:
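
The new relaxDwarfLineAddr/relaxDwarfCFA hooks above re-emit the line-table and CFA advance opcodes once the worst-case address delta is known, picking the smallest DWARF encoding that can hold it and leaving the operand itself to a set/sub (or add/sub) fixup pair. A minimal standalone sketch of that opcode selection follows; the DWARF opcode values come from the specification, while the helper name and structure are illustrative, not LLVM API.

#include <cstdint>
#include <cstdio>

enum : uint8_t {
  DW_CFA_advance_loc  = 0x40, // delta lives in the low 6 bits of the opcode
  DW_CFA_advance_loc1 = 0x02, // 1-byte operand
  DW_CFA_advance_loc2 = 0x03, // 2-byte operand
  DW_CFA_advance_loc4 = 0x04, // 4-byte operand
};

// Pick the smallest DW_CFA_advance_loc form that can hold Delta, mirroring the
// range checks in relaxDwarfCFA above.
static uint8_t pickAdvanceLocOpcode(uint64_t Delta) {
  if (Delta < (1u << 6))   return DW_CFA_advance_loc;
  if (Delta <= UINT8_MAX)  return DW_CFA_advance_loc1;
  if (Delta <= UINT16_MAX) return DW_CFA_advance_loc2;
  if (Delta <= UINT32_MAX) return DW_CFA_advance_loc4;
  return 0; // unsupported; the real code hits llvm_unreachable here
}

int main() {
  printf("0x%x 0x%x 0x%x\n", (unsigned)pickAdvanceLocOpcode(40),
         (unsigned)pickAdvanceLocOpcode(300),
         (unsigned)pickAdvanceLocOpcode(70000));
  return 0;
}
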
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index 56991cc..e162867 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -42,20 +42,6 @@
void setForceRelocs() { ForceRelocs = true; }
- // Returns true if relocations will be forced for shouldForceRelocation by
- // default. This will be true if relaxation is enabled or had previously
- // been enabled.
- bool willForceRelocations() const {
- return ForceRelocs || STI.getFeatureBits()[RISCV::FeatureRelax];
- }
-
- // Generate diff expression relocations if the relax feature is enabled or had
- // previously been enabled, otherwise it is safe for the assembler to
- // calculate these internally.
- bool requiresDiffExpressionRelocations() const override {
- return willForceRelocations();
- }
-
// Return Size with extra Nop Bytes for alignment directive in code section.
bool shouldInsertExtraNopBytesForCodeAlign(const MCAlignFragment &AF,
unsigned &Size) override;
@@ -108,6 +94,11 @@
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override;
+ bool relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, MCAsmLayout &Layout,
+ bool &WasRelaxed) const override;
+ bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF, MCAsmLayout &Layout,
+ bool &WasRelaxed) const override;
+
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
const MCTargetOptions &getTargetOptions() const { return TargetOptions; }
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
index fa36234..60e8609 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
@@ -88,55 +88,78 @@
namespace RISCVFeatures {
void validate(const Triple &TT, const FeatureBitset &FeatureBits) {
+ if (TT.isArch64Bit() && !FeatureBits[RISCV::Feature64Bit])
+ report_fatal_error("RV64 target requires an RV64 CPU");
+ if (!TT.isArch64Bit() && FeatureBits[RISCV::Feature64Bit])
+ report_fatal_error("RV32 target requires an RV32 CPU");
if (TT.isArch64Bit() && FeatureBits[RISCV::FeatureRV32E])
report_fatal_error("RV32E can't be enabled for an RV64 target");
}
} // namespace RISCVFeatures
-namespace RISCVVPseudosTable {
+// Encode VTYPE into the binary format used by the VSETVLI instruction, which
+// is used by our MC layer representation.
+//
+// Bits | Name | Description
+// -----+------------+------------------------------------------------
+// 7 | vma | Vector mask agnostic
+// 6 | vta | Vector tail agnostic
+// 5:3 | vsew[2:0] | Standard element width (SEW) setting
+// 2:0 | vlmul[2:0] | Vector register group multiplier (LMUL) setting
+unsigned RISCVVType::encodeVTYPE(RISCVII::VLMUL VLMUL, unsigned SEW,
+ bool TailAgnostic, bool MaskAgnostic) {
+ assert(isValidSEW(SEW) && "Invalid SEW");
+ unsigned VLMULBits = static_cast<unsigned>(VLMUL);
+ unsigned VSEWBits = Log2_32(SEW) - 3;
+ unsigned VTypeI = (VSEWBits << 3) | (VLMULBits & 0x7);
+ if (TailAgnostic)
+ VTypeI |= 0x40;
+ if (MaskAgnostic)
+ VTypeI |= 0x80;
-#define GET_RISCVVPseudosTable_IMPL
-#include "RISCVGenSearchableTables.inc"
+ return VTypeI;
+}
-} // namespace RISCVVPseudosTable
+std::pair<unsigned, bool> RISCVVType::decodeVLMUL(RISCVII::VLMUL VLMUL) {
+ switch (VLMUL) {
+ default:
+ llvm_unreachable("Unexpected LMUL value!");
+ case RISCVII::VLMUL::LMUL_1:
+ case RISCVII::VLMUL::LMUL_2:
+ case RISCVII::VLMUL::LMUL_4:
+ case RISCVII::VLMUL::LMUL_8:
+ return std::make_pair(1 << static_cast<unsigned>(VLMUL), false);
+ case RISCVII::VLMUL::LMUL_F2:
+ case RISCVII::VLMUL::LMUL_F4:
+ case RISCVII::VLMUL::LMUL_F8:
+ return std::make_pair(1 << (8 - static_cast<unsigned>(VLMUL)), true);
+ }
+}
void RISCVVType::printVType(unsigned VType, raw_ostream &OS) {
- RISCVVSEW VSEW = getVSEW(VType);
- RISCVVLMUL VLMUL = getVLMUL(VType);
-
- unsigned Sew = 1 << (static_cast<unsigned>(VSEW) + 3);
+ unsigned Sew = getSEW(VType);
OS << "e" << Sew;
- switch (VLMUL) {
- case RISCVVLMUL::LMUL_RESERVED:
- llvm_unreachable("Unexpected LMUL value!");
- case RISCVVLMUL::LMUL_1:
- case RISCVVLMUL::LMUL_2:
- case RISCVVLMUL::LMUL_4:
- case RISCVVLMUL::LMUL_8: {
- unsigned LMul = 1 << static_cast<unsigned>(VLMUL);
- OS << ",m" << LMul;
- break;
- }
- case RISCVVLMUL::LMUL_F2:
- case RISCVVLMUL::LMUL_F4:
- case RISCVVLMUL::LMUL_F8: {
- unsigned LMul = 1 << (8 - static_cast<unsigned>(VLMUL));
- OS << ",mf" << LMul;
- break;
- }
- }
+ unsigned LMul;
+ bool Fractional;
+ std::tie(LMul, Fractional) = decodeVLMUL(getVLMUL(VType));
+
+ if (Fractional)
+ OS << ", mf";
+ else
+ OS << ", m";
+ OS << LMul;
if (isTailAgnostic(VType))
- OS << ",ta";
+ OS << ", ta";
else
- OS << ",tu";
+ OS << ", tu";
if (isMaskAgnostic(VType))
- OS << ",ma";
+ OS << ", ma";
else
- OS << ",mu";
+ OS << ", mu";
}
} // namespace llvm
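
For reference, the vtype packing documented above can be exercised in isolation. The sketch below assumes only the bit layout from the encodeVTYPE comment (vlmul in bits 2:0, vsew in bits 5:3, vta in bit 6, vma in bit 7); the enum and helpers are stand-ins for the RISCVII/RISCVVType definitions, not the LLVM API. Under that layout, e32/m1/ta/mu encodes to 0x50.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustrative copy of the LMUL encoding order used above.
enum class VLMul : unsigned { M1 = 0, M2, M4, M8, Reserved, MF8, MF4, MF2 };

static unsigned log2u(unsigned V) {
  unsigned R = 0;
  while (V >>= 1)
    ++R;
  return R;
}

// Pack SEW/LMUL/policy bits exactly as the comment block describes.
static unsigned encodeVType(VLMul LMul, unsigned SEW, bool TA, bool MA) {
  assert(SEW >= 8 && SEW <= 1024 && (SEW & (SEW - 1)) == 0 && "invalid SEW");
  unsigned VType = ((log2u(SEW) - 3) << 3) | (static_cast<unsigned>(LMul) & 0x7);
  if (TA)
    VType |= 0x40;
  if (MA)
    VType |= 0x80;
  return VType;
}

int main() {
  // e32, m1, ta, mu -> 0x50 under the documented layout.
  printf("vtype = 0x%x\n",
         encodeVType(VLMul::M1, 32, /*TA=*/true, /*MA=*/false));
  return 0;
}
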
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 6c9f860..9bdd200 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -18,7 +18,6 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/Support/MachineValueType.h"
namespace llvm {
@@ -46,8 +45,9 @@
InstFormatOther = 17,
InstFormatMask = 31,
+ InstFormatShift = 0,
- ConstraintShift = 5,
+ ConstraintShift = InstFormatShift + 5,
ConstraintMask = 0b111 << ConstraintShift,
VLMulShift = ConstraintShift + 3,
@@ -57,14 +57,14 @@
HasDummyMaskOpShift = VLMulShift + 3,
HasDummyMaskOpMask = 1 << HasDummyMaskOpShift,
- // Does this instruction only update element 0 the destination register.
- WritesElement0Shift = HasDummyMaskOpShift + 1,
- WritesElement0Mask = 1 << WritesElement0Shift,
+ // Force a tail agnostic policy even if this instruction has a tied destination.
+ ForceTailAgnosticShift = HasDummyMaskOpShift + 1,
+ ForceTailAgnosticMask = 1 << ForceTailAgnosticShift,
// Does this instruction have a merge operand that must be removed when
// converting to MCInst. It will be the first explicit use operand. Used by
// RVV Pseudos.
- HasMergeOpShift = WritesElement0Shift + 1,
+ HasMergeOpShift = ForceTailAgnosticShift + 1,
HasMergeOpMask = 1 << HasMergeOpShift,
// Does this instruction have a SEW operand. It will be the last explicit
@@ -79,13 +79,59 @@
};
// Match with the definitions in RISCVInstrFormatsV.td
-enum RVVConstraintType {
+enum VConstraintType {
NoConstraint = 0,
VS2Constraint = 0b001,
VS1Constraint = 0b010,
VMConstraint = 0b100,
};
+enum VLMUL : uint8_t {
+ LMUL_1 = 0,
+ LMUL_2,
+ LMUL_4,
+ LMUL_8,
+ LMUL_RESERVED,
+ LMUL_F8,
+ LMUL_F4,
+ LMUL_F2
+};
+
+// Helper functions to read TSFlags.
+/// \returns the format of the instruction.
+static inline unsigned getFormat(uint64_t TSFlags) {
+ return (TSFlags & InstFormatMask) >> InstFormatShift;
+}
+/// \returns the constraint for the instruction.
+static inline VConstraintType getConstraint(uint64_t TSFlags) {
+ return static_cast<VConstraintType>
+ ((TSFlags & ConstraintMask) >> ConstraintShift);
+}
+/// \returns the LMUL for the instruction.
+static inline VLMUL getLMul(uint64_t TSFlags) {
+ return static_cast<VLMUL>((TSFlags & VLMulMask) >> VLMulShift);
+}
+/// \returns true if there is a dummy mask operand for the instruction.
+static inline bool hasDummyMaskOp(uint64_t TSFlags) {
+ return TSFlags & HasDummyMaskOpMask;
+}
+/// \returns true if tail agnostic is enforced for the instruction.
+static inline bool doesForceTailAgnostic(uint64_t TSFlags) {
+ return TSFlags & ForceTailAgnosticMask;
+}
+/// \returns true if there is a merge operand for the instruction.
+static inline bool hasMergeOp(uint64_t TSFlags) {
+ return TSFlags & HasMergeOpMask;
+}
+/// \returns true if there is a SEW operand for the instruction.
+static inline bool hasSEWOp(uint64_t TSFlags) {
+ return TSFlags & HasSEWOpMask;
+}
+/// \returns true if there is a VL operand for the instruction.
+static inline bool hasVLOp(uint64_t TSFlags) {
+ return TSFlags & HasVLOpMask;
+}
+
// RISC-V Specific Machine Operand Flags
enum {
MO_None = 0,
@@ -118,7 +164,11 @@
OPERAND_SIMM12,
OPERAND_UIMM20,
OPERAND_UIMMLOG2XLEN,
- OPERAND_LAST_RISCV_IMM = OPERAND_UIMMLOG2XLEN
+ OPERAND_LAST_RISCV_IMM = OPERAND_UIMMLOG2XLEN,
+ // Operand is either a register or a uimm5; this is used by V extension pseudo
+ // instructions to represent a value that can be passed as AVL to either
+ // vsetvli or vsetivli.
+ OPERAND_AVL,
};
} // namespace RISCVOp
@@ -192,8 +242,9 @@
namespace RISCVSysReg {
struct SysReg {
const char *Name;
- unsigned Encoding;
const char *AltName;
+ const char *DeprecatedName;
+ unsigned Encoding;
// FIXME: add these additional fields when needed.
// Privilege Access: Read, Write, Read-Only.
// unsigned ReadWrite;
@@ -206,7 +257,7 @@
FeatureBitset FeaturesRequired;
bool isRV32Only;
- bool haveRequiredFeatures(FeatureBitset ActiveFeatures) const {
+ bool haveRequiredFeatures(const FeatureBitset &ActiveFeatures) const {
// Not in 32-bit mode.
if (isRV32Only && ActiveFeatures[RISCV::Feature64Bit])
return false;
@@ -257,84 +308,6 @@
} // namespace RISCVFeatures
-namespace RISCVVMVTs {
-
-constexpr MVT vint8mf8_t = MVT::nxv1i8;
-constexpr MVT vint8mf4_t = MVT::nxv2i8;
-constexpr MVT vint8mf2_t = MVT::nxv4i8;
-constexpr MVT vint8m1_t = MVT::nxv8i8;
-constexpr MVT vint8m2_t = MVT::nxv16i8;
-constexpr MVT vint8m4_t = MVT::nxv32i8;
-constexpr MVT vint8m8_t = MVT::nxv64i8;
-
-constexpr MVT vint16mf4_t = MVT::nxv1i16;
-constexpr MVT vint16mf2_t = MVT::nxv2i16;
-constexpr MVT vint16m1_t = MVT::nxv4i16;
-constexpr MVT vint16m2_t = MVT::nxv8i16;
-constexpr MVT vint16m4_t = MVT::nxv16i16;
-constexpr MVT vint16m8_t = MVT::nxv32i16;
-
-constexpr MVT vint32mf2_t = MVT::nxv1i32;
-constexpr MVT vint32m1_t = MVT::nxv2i32;
-constexpr MVT vint32m2_t = MVT::nxv4i32;
-constexpr MVT vint32m4_t = MVT::nxv8i32;
-constexpr MVT vint32m8_t = MVT::nxv16i32;
-
-constexpr MVT vint64m1_t = MVT::nxv1i64;
-constexpr MVT vint64m2_t = MVT::nxv2i64;
-constexpr MVT vint64m4_t = MVT::nxv4i64;
-constexpr MVT vint64m8_t = MVT::nxv8i64;
-
-constexpr MVT vfloat16mf4_t = MVT::nxv1f16;
-constexpr MVT vfloat16mf2_t = MVT::nxv2f16;
-constexpr MVT vfloat16m1_t = MVT::nxv4f16;
-constexpr MVT vfloat16m2_t = MVT::nxv8f16;
-constexpr MVT vfloat16m4_t = MVT::nxv16f16;
-constexpr MVT vfloat16m8_t = MVT::nxv32f16;
-
-constexpr MVT vfloat32mf2_t = MVT::nxv1f32;
-constexpr MVT vfloat32m1_t = MVT::nxv2f32;
-constexpr MVT vfloat32m2_t = MVT::nxv4f32;
-constexpr MVT vfloat32m4_t = MVT::nxv8f32;
-constexpr MVT vfloat32m8_t = MVT::nxv16f32;
-
-constexpr MVT vfloat64m1_t = MVT::nxv1f64;
-constexpr MVT vfloat64m2_t = MVT::nxv2f64;
-constexpr MVT vfloat64m4_t = MVT::nxv4f64;
-constexpr MVT vfloat64m8_t = MVT::nxv8f64;
-
-constexpr MVT vbool1_t = MVT::nxv64i1;
-constexpr MVT vbool2_t = MVT::nxv32i1;
-constexpr MVT vbool4_t = MVT::nxv16i1;
-constexpr MVT vbool8_t = MVT::nxv8i1;
-constexpr MVT vbool16_t = MVT::nxv4i1;
-constexpr MVT vbool32_t = MVT::nxv2i1;
-constexpr MVT vbool64_t = MVT::nxv1i1;
-
-} // namespace RISCVVMVTs
-
-enum class RISCVVSEW {
- SEW_8 = 0,
- SEW_16,
- SEW_32,
- SEW_64,
- SEW_128,
- SEW_256,
- SEW_512,
- SEW_1024,
-};
-
-enum class RISCVVLMUL {
- LMUL_1 = 0,
- LMUL_2,
- LMUL_4,
- LMUL_8,
- LMUL_RESERVED,
- LMUL_F8,
- LMUL_F4,
- LMUL_F2
-};
-
namespace RISCVVType {
// Is this a SEW value that can be encoded into the VTYPE format.
inline static bool isValidSEW(unsigned SEW) {
@@ -346,36 +319,25 @@
return isPowerOf2_32(LMUL) && LMUL <= 8 && (!Fractional || LMUL != 1);
}
-// Encode VTYPE into the binary format used by the the VSETVLI instruction which
-// is used by our MC layer representation.
-//
-// Bits | Name | Description
-// -----+------------+------------------------------------------------
-// 7 | vma | Vector mask agnostic
-// 6 | vta | Vector tail agnostic
-// 5:3 | vsew[2:0] | Standard element width (SEW) setting
-// 2:0 | vlmul[2:0] | Vector register group multiplier (LMUL) setting
-inline static unsigned encodeVTYPE(RISCVVLMUL VLMUL, RISCVVSEW VSEW,
- bool TailAgnostic, bool MaskAgnostic) {
- unsigned VLMULBits = static_cast<unsigned>(VLMUL);
- unsigned VSEWBits = static_cast<unsigned>(VSEW);
- unsigned VTypeI = (VSEWBits << 3) | (VLMULBits & 0x7);
- if (TailAgnostic)
- VTypeI |= 0x40;
- if (MaskAgnostic)
- VTypeI |= 0x80;
+unsigned encodeVTYPE(RISCVII::VLMUL VLMUL, unsigned SEW, bool TailAgnostic,
+ bool MaskAgnostic);
- return VTypeI;
-}
-
-inline static RISCVVLMUL getVLMUL(unsigned VType) {
+inline static RISCVII::VLMUL getVLMUL(unsigned VType) {
unsigned VLMUL = VType & 0x7;
- return static_cast<RISCVVLMUL>(VLMUL);
+ return static_cast<RISCVII::VLMUL>(VLMUL);
}
-inline static RISCVVSEW getVSEW(unsigned VType) {
+// Decode VLMUL into 1,2,4,8 and fractional indicator.
+std::pair<unsigned, bool> decodeVLMUL(RISCVII::VLMUL VLMUL);
+
+inline static unsigned decodeVSEW(unsigned VSEW) {
+ assert(VSEW < 8 && "Unexpected VSEW value");
+ return 1 << (VSEW + 3);
+}
+
+inline static unsigned getSEW(unsigned VType) {
unsigned VSEW = (VType >> 3) & 0x7;
- return static_cast<RISCVVSEW>(VSEW);
+ return decodeVSEW(VSEW);
}
inline static bool isTailAgnostic(unsigned VType) { return VType & 0x40; }
@@ -386,21 +348,6 @@
} // namespace RISCVVType
-namespace RISCVVPseudosTable {
-
-struct PseudoInfo {
-#include "MCTargetDesc/RISCVBaseInfo.h"
- uint16_t Pseudo;
- uint16_t BaseInstr;
-};
-
-using namespace RISCV;
-
-#define GET_RISCVVPseudosTable_DECL
-#include "RISCVGenSearchableTables.inc"
-
-} // end namespace RISCVVPseudosTable
-
} // namespace llvm
#endif
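
The RISCVII accessors introduced above all follow the same mask-and-shift pattern over a 64-bit TSFlags word, with the layout fixed by the shift constants at the top of the enum (format in bits 4:0, constraint in bits 7:5, LMUL in bits 10:8). A self-contained illustration of that pattern; the constants are copied from this header for the sketch and everything else is illustrative rather than the real TableGen-driven definitions.

#include <cstdint>
#include <cstdio>

static constexpr uint64_t InstFormatShift = 0;
static constexpr uint64_t InstFormatMask = 31ull << InstFormatShift;
static constexpr uint64_t ConstraintShift = InstFormatShift + 5;
static constexpr uint64_t ConstraintMask = 0b111ull << ConstraintShift;
static constexpr uint64_t VLMulShift = ConstraintShift + 3;
static constexpr uint64_t VLMulMask = 0b111ull << VLMulShift;

// Each accessor isolates its field with the mask, then shifts it down.
static unsigned getFormat(uint64_t TSFlags) {
  return (TSFlags & InstFormatMask) >> InstFormatShift;
}
static unsigned getConstraint(uint64_t TSFlags) {
  return (TSFlags & ConstraintMask) >> ConstraintShift;
}
static unsigned getLMul(uint64_t TSFlags) {
  return (TSFlags & VLMulMask) >> VLMulShift;
}

int main() {
  // Format 17 (InstFormatOther), constraint 0b001 (VS2Constraint), LMUL field 3.
  uint64_t TSFlags = (3ull << VLMulShift) | (0b001ull << ConstraintShift) | 17;
  printf("format=%u constraint=%u lmul=%u\n", getFormat(TSFlags),
         getConstraint(TSFlags), getLMul(TSFlags));
  return 0;
}
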
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index b38ba2b..fb1ce19 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -86,6 +86,22 @@
return ELF::R_RISCV_CALL;
case RISCV::fixup_riscv_call_plt:
return ELF::R_RISCV_CALL_PLT;
+ case RISCV::fixup_riscv_add_8:
+ return ELF::R_RISCV_ADD8;
+ case RISCV::fixup_riscv_sub_8:
+ return ELF::R_RISCV_SUB8;
+ case RISCV::fixup_riscv_add_16:
+ return ELF::R_RISCV_ADD16;
+ case RISCV::fixup_riscv_sub_16:
+ return ELF::R_RISCV_SUB16;
+ case RISCV::fixup_riscv_add_32:
+ return ELF::R_RISCV_ADD32;
+ case RISCV::fixup_riscv_sub_32:
+ return ELF::R_RISCV_SUB32;
+ case RISCV::fixup_riscv_add_64:
+ return ELF::R_RISCV_ADD64;
+ case RISCV::fixup_riscv_sub_64:
+ return ELF::R_RISCV_SUB64;
}
}
@@ -106,26 +122,6 @@
return ELF::R_RISCV_32;
case FK_Data_8:
return ELF::R_RISCV_64;
- case FK_Data_Add_1:
- return ELF::R_RISCV_ADD8;
- case FK_Data_Add_2:
- return ELF::R_RISCV_ADD16;
- case FK_Data_Add_4:
- return ELF::R_RISCV_ADD32;
- case FK_Data_Add_8:
- return ELF::R_RISCV_ADD64;
- case FK_Data_Add_6b:
- return ELF::R_RISCV_SET6;
- case FK_Data_Sub_1:
- return ELF::R_RISCV_SUB8;
- case FK_Data_Sub_2:
- return ELF::R_RISCV_SUB16;
- case FK_Data_Sub_4:
- return ELF::R_RISCV_SUB32;
- case FK_Data_Sub_8:
- return ELF::R_RISCV_SUB64;
- case FK_Data_Sub_6b:
- return ELF::R_RISCV_SUB6;
case RISCV::fixup_riscv_hi20:
return ELF::R_RISCV_HI20;
case RISCV::fixup_riscv_lo12_i:
@@ -144,6 +140,32 @@
return ELF::R_RISCV_RELAX;
case RISCV::fixup_riscv_align:
return ELF::R_RISCV_ALIGN;
+ case RISCV::fixup_riscv_set_6b:
+ return ELF::R_RISCV_SET6;
+ case RISCV::fixup_riscv_sub_6b:
+ return ELF::R_RISCV_SUB6;
+ case RISCV::fixup_riscv_add_8:
+ return ELF::R_RISCV_ADD8;
+ case RISCV::fixup_riscv_set_8:
+ return ELF::R_RISCV_SET8;
+ case RISCV::fixup_riscv_sub_8:
+ return ELF::R_RISCV_SUB8;
+ case RISCV::fixup_riscv_set_16:
+ return ELF::R_RISCV_SET16;
+ case RISCV::fixup_riscv_add_16:
+ return ELF::R_RISCV_ADD16;
+ case RISCV::fixup_riscv_sub_16:
+ return ELF::R_RISCV_SUB16;
+ case RISCV::fixup_riscv_set_32:
+ return ELF::R_RISCV_SET32;
+ case RISCV::fixup_riscv_add_32:
+ return ELF::R_RISCV_ADD32;
+ case RISCV::fixup_riscv_sub_32:
+ return ELF::R_RISCV_SUB32;
+ case RISCV::fixup_riscv_add_64:
+ return ELF::R_RISCV_ADD64;
+ case RISCV::fixup_riscv_sub_64:
+ return ELF::R_RISCV_SUB64;
}
}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index 7df454b..d88ba9e 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -15,9 +15,13 @@
#include "RISCVBaseInfo.h"
#include "RISCVMCTargetDesc.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/RISCVAttributes.h"
@@ -167,3 +171,92 @@
}
return Result;
}
+
+namespace {
+class RISCVELFStreamer : public MCELFStreamer {
+ static std::pair<unsigned, unsigned> getRelocPairForSize(unsigned Size) {
+ switch (Size) {
+ default:
+ llvm_unreachable("unsupported fixup size");
+ case 1:
+ return std::make_pair(RISCV::fixup_riscv_add_8, RISCV::fixup_riscv_sub_8);
+ case 2:
+ return std::make_pair(RISCV::fixup_riscv_add_16,
+ RISCV::fixup_riscv_sub_16);
+ case 4:
+ return std::make_pair(RISCV::fixup_riscv_add_32,
+ RISCV::fixup_riscv_sub_32);
+ case 8:
+ return std::make_pair(RISCV::fixup_riscv_add_64,
+ RISCV::fixup_riscv_sub_64);
+ }
+ }
+
+ static bool requiresFixups(MCContext &C, const MCExpr *Value,
+ const MCExpr *&LHS, const MCExpr *&RHS) {
+ const auto *MBE = dyn_cast<MCBinaryExpr>(Value);
+ if (MBE == nullptr)
+ return false;
+
+ MCValue E;
+ if (!Value->evaluateAsRelocatable(E, nullptr, nullptr))
+ return false;
+ if (E.getSymA() == nullptr || E.getSymB() == nullptr)
+ return false;
+
+ const auto &A = E.getSymA()->getSymbol();
+ const auto &B = E.getSymB()->getSymbol();
+
+ LHS =
+ MCBinaryExpr::create(MCBinaryExpr::Add, MCSymbolRefExpr::create(&A, C),
+ MCConstantExpr::create(E.getConstant(), C), C);
+ RHS = E.getSymB();
+
+ return (A.isInSection() ? A.getSection().hasInstructions()
+ : !A.getName().empty()) ||
+ (B.isInSection() ? B.getSection().hasInstructions()
+ : !B.getName().empty());
+ }
+
+public:
+ RISCVELFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> MOW,
+ std::unique_ptr<MCCodeEmitter> MCE)
+ : MCELFStreamer(C, std::move(MAB), std::move(MOW), std::move(MCE)) {}
+
+ void emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
+ const MCExpr *A, *B;
+ if (!requiresFixups(getContext(), Value, A, B))
+ return MCELFStreamer::emitValueImpl(Value, Size, Loc);
+
+ MCStreamer::emitValueImpl(Value, Size, Loc);
+
+ MCDataFragment *DF = getOrCreateDataFragment();
+ flushPendingLabels(DF, DF->getContents().size());
+ MCDwarfLineEntry::make(this, getCurrentSectionOnly());
+
+ unsigned Add, Sub;
+ std::tie(Add, Sub) = getRelocPairForSize(Size);
+
+ DF->getFixups().push_back(MCFixup::create(
+ DF->getContents().size(), A, static_cast<MCFixupKind>(Add), Loc));
+ DF->getFixups().push_back(MCFixup::create(
+ DF->getContents().size(), B, static_cast<MCFixupKind>(Sub), Loc));
+
+ DF->getContents().resize(DF->getContents().size() + Size, 0);
+ }
+};
+} // namespace
+
+namespace llvm {
+MCELFStreamer *createRISCVELFStreamer(MCContext &C,
+ std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> MOW,
+ std::unique_ptr<MCCodeEmitter> MCE,
+ bool RelaxAll) {
+ RISCVELFStreamer *S =
+ new RISCVELFStreamer(C, std::move(MAB), std::move(MOW), std::move(MCE));
+ S->getAssembler().setRelaxAll(RelaxAll);
+ return S;
+}
+} // namespace llvm
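
The RISCVELFStreamer override above intercepts data directives whose value is a symbol difference and, instead of folding the difference at assembly time, emits a zero placeholder plus the add/sub fixup pair from getRelocPairForSize, so the linker can recompute the delta after relaxation has deleted bytes. The sketch below shows how such a pair resolves over a 4-byte slot at link time; the apply functions and addresses are illustrative, not the linker's actual relocation code.

#include <cstdint>
#include <cstdio>
#include <cstring>

// R_RISCV_ADD32-style application: add the final symbol address into the slot.
static void applyAdd32(uint8_t *Loc, uint64_t S) {
  uint32_t V;
  memcpy(&V, Loc, 4);
  V += (uint32_t)S;
  memcpy(Loc, &V, 4);
}

// R_RISCV_SUB32-style application: subtract the final symbol address.
static void applySub32(uint8_t *Loc, uint64_t S) {
  uint32_t V;
  memcpy(&V, Loc, 4);
  V -= (uint32_t)S;
  memcpy(Loc, &V, 4);
}

int main() {
  uint8_t Word[4] = {0, 0, 0, 0};          // placeholder emitted by the assembler
  uint64_t AddrA = 0x1040, AddrB = 0x1000; // final (post-relaxation) addresses
  applyAdd32(Word, AddrA);                 // pair member against symbol A
  applySub32(Word, AddrB);                 // pair member against symbol B
  uint32_t Delta;
  memcpy(&Delta, Word, 4);
  printf("A - B = %u\n", Delta);           // prints 64 (0x40)
  return 0;
}
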
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index 392c870..59d8bb0 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -104,5 +104,11 @@
void emitDirectiveOptionRelax() override;
void emitDirectiveOptionNoRelax() override;
};
+
+MCELFStreamer *createRISCVELFStreamer(MCContext &C,
+ std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> MOW,
+ std::unique_ptr<MCCodeEmitter> MCE,
+ bool RelaxAll);
}
#endif
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
index 6c79333..7953b60 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
@@ -16,73 +16,96 @@
namespace llvm {
namespace RISCV {
enum Fixups {
- // fixup_riscv_hi20 - 20-bit fixup corresponding to hi(foo) for
- // instructions like lui
+ // 20-bit fixup corresponding to %hi(foo) for instructions like lui
fixup_riscv_hi20 = FirstTargetFixupKind,
- // fixup_riscv_lo12_i - 12-bit fixup corresponding to lo(foo) for
- // instructions like addi
+ // 12-bit fixup corresponding to %lo(foo) for instructions like addi
fixup_riscv_lo12_i,
- // fixup_riscv_lo12_s - 12-bit fixup corresponding to lo(foo) for
- // the S-type store instructions
+ // 12-bit fixup corresponding to %lo(foo) for the S-type store instructions
fixup_riscv_lo12_s,
- // fixup_riscv_pcrel_hi20 - 20-bit fixup corresponding to pcrel_hi(foo) for
- // instructions like auipc
+ // 20-bit fixup corresponding to %pcrel_hi(foo) for instructions like auipc
fixup_riscv_pcrel_hi20,
- // fixup_riscv_pcrel_lo12_i - 12-bit fixup corresponding to pcrel_lo(foo) for
- // instructions like addi
+ // 12-bit fixup corresponding to %pcrel_lo(foo) for instructions like addi
fixup_riscv_pcrel_lo12_i,
- // fixup_riscv_pcrel_lo12_s - 12-bit fixup corresponding to pcrel_lo(foo) for
- // the S-type store instructions
- fixup_riscv_pcrel_lo12_s,
- // fixup_riscv_got_hi20 - 20-bit fixup corresponding to got_pcrel_hi(foo) for
- // instructions like auipc
- fixup_riscv_got_hi20,
- // fixup_riscv_tprel_hi20 - 20-bit fixup corresponding to tprel_hi(foo) for
- // instructions like lui
- fixup_riscv_tprel_hi20,
- // fixup_riscv_tprel_lo12_i - 12-bit fixup corresponding to tprel_lo(foo) for
- // instructions like addi
- fixup_riscv_tprel_lo12_i,
- // fixup_riscv_tprel_lo12_s - 12-bit fixup corresponding to tprel_lo(foo) for
- // the S-type store instructions
- fixup_riscv_tprel_lo12_s,
- // fixup_riscv_tprel_add - A fixup corresponding to %tprel_add(foo) for the
- // add_tls instruction. Used to provide a hint to the linker.
- fixup_riscv_tprel_add,
- // fixup_riscv_tls_got_hi20 - 20-bit fixup corresponding to
- // tls_ie_pcrel_hi(foo) for instructions like auipc
- fixup_riscv_tls_got_hi20,
- // fixup_riscv_tls_gd_hi20 - 20-bit fixup corresponding to
- // tls_gd_pcrel_hi(foo) for instructions like auipc
- fixup_riscv_tls_gd_hi20,
- // fixup_riscv_jal - 20-bit fixup for symbol references in the jal
- // instruction
- fixup_riscv_jal,
- // fixup_riscv_branch - 12-bit fixup for symbol references in the branch
+ // 12-bit fixup corresponding to %pcrel_lo(foo) for the S-type store
// instructions
+ fixup_riscv_pcrel_lo12_s,
+ // 20-bit fixup corresponding to %got_pcrel_hi(foo) for instructions like
+ // auipc
+ fixup_riscv_got_hi20,
+ // 20-bit fixup corresponding to %tprel_hi(foo) for instructions like lui
+ fixup_riscv_tprel_hi20,
+ // 12-bit fixup corresponding to %tprel_lo(foo) for instructions like addi
+ fixup_riscv_tprel_lo12_i,
+ // 12-bit fixup corresponding to %tprel_lo(foo) for the S-type store
+ // instructions
+ fixup_riscv_tprel_lo12_s,
+ // Fixup corresponding to %tprel_add(foo) for PseudoAddTPRel, used as a linker
+ // hint
+ fixup_riscv_tprel_add,
+ // 20-bit fixup corresponding to %tls_ie_pcrel_hi(foo) for instructions like
+ // auipc
+ fixup_riscv_tls_got_hi20,
+ // 20-bit fixup corresponding to %tls_gd_pcrel_hi(foo) for instructions like
+ // auipc
+ fixup_riscv_tls_gd_hi20,
+ // 20-bit fixup for symbol references in the jal instruction
+ fixup_riscv_jal,
+ // 12-bit fixup for symbol references in the branch instructions
fixup_riscv_branch,
- // fixup_riscv_rvc_jump - 11-bit fixup for symbol references in the
- // compressed jump instruction
+ // 11-bit fixup for symbol references in the compressed jump instruction
fixup_riscv_rvc_jump,
- // fixup_riscv_rvc_branch - 8-bit fixup for symbol references in the
- // compressed branch instruction
+ // 8-bit fixup for symbol references in the compressed branch instruction
fixup_riscv_rvc_branch,
- // fixup_riscv_call - A fixup representing a call attached to the auipc
+ // Fixup representing a legacy no-pic function call attached to the auipc
// instruction in a pair composed of adjacent auipc+jalr instructions.
fixup_riscv_call,
- // fixup_riscv_call_plt - A fixup representing a procedure linkage table call
- // attached to the auipc instruction in a pair composed of adjacent auipc+jalr
- // instructions.
+ // Fixup representing a function call attached to the auipc instruction in a
+ // pair composed of adjacent auipc+jalr instructions.
fixup_riscv_call_plt,
- // fixup_riscv_relax - Used to generate an R_RISCV_RELAX relocation type,
- // which indicates the linker may relax the instruction pair.
+ // Used to generate an R_RISCV_RELAX relocation, which indicates the linker
+ // may relax the instruction pair.
fixup_riscv_relax,
- // fixup_riscv_align - Used to generate an R_RISCV_ALIGN relocation type,
- // which indicates the linker should fixup the alignment after linker
- // relaxation.
+ // Used to generate an R_RISCV_ALIGN relocation, which indicates the linker
+ // should fixup the alignment after linker relaxation.
fixup_riscv_align,
+ // 8-bit fixup corresponding to R_RISCV_SET8 for local label assignment.
+ fixup_riscv_set_8,
+ // 8-bit fixup corresponding to R_RISCV_ADD8 for 8-bit symbolic difference
+ // paired relocations.
+ fixup_riscv_add_8,
+ // 8-bit fixup corresponding to R_RISCV_SUB8 for 8-bit symbolic difference
+ // paired relocations.
+ fixup_riscv_sub_8,
+ // 16-bit fixup corresponding to R_RISCV_SET16 for local label assignment.
+ fixup_riscv_set_16,
+ // 16-bit fixup corresponding to R_RISCV_ADD16 for 16-bit symbolic difference
+ // paired relocations.
+ fixup_riscv_add_16,
+ // 16-bit fixup corresponding to R_RISCV_SUB16 for 16-bit symbolic difference
+ // paired relocations.
+ fixup_riscv_sub_16,
+ // 32-bit fixup corresponding to R_RISCV_SET32 for local label assignment.
+ fixup_riscv_set_32,
+ // 32-bit fixup corresponding to R_RISCV_ADD32 for 32-bit symbolic difference
+ // paired relocations.
+ fixup_riscv_add_32,
+ // 32-bit fixup corresponding to R_RISCV_SUB32 for 32-bit symbolic difference
+ // paired relocations.
+ fixup_riscv_sub_32,
+ // 64-bit fixup corresponding to R_RISCV_ADD64 for 64-bit symbolic difference
+ // paired relocations.
+ fixup_riscv_add_64,
+ // 64-bit fixup corresponding to R_RISCV_SUB64 for 64-bit symbolic difference
+ // paired relocations.
+ fixup_riscv_sub_64,
+ // 6-bit fixup corresponding to R_RISCV_SET6 for local label assignment in
+ // DWARF CFA.
+ fixup_riscv_set_6b,
+ // 6-bit fixup corresponding to R_RISCV_SUB6 for local label assignment in
+ // DWARF CFA.
+ fixup_riscv_sub_6b,
- // fixup_riscv_invalid - used as a sentinel and a marker, must be last fixup
+ // Used as a sentinel; must be the last fixup.
fixup_riscv_invalid,
NumTargetFixupKinds = fixup_riscv_invalid - FirstTargetFixupKind
};
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index 5f8d6e1..d1979b5 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -39,11 +39,11 @@
cl::desc("Disable the emission of assembler pseudo instructions"),
cl::init(false), cl::Hidden);
-static cl::opt<bool>
- ArchRegNames("riscv-arch-reg-names",
- cl::desc("Print architectural register names rather than the "
- "ABI names (such as x2 instead of sp)"),
- cl::init(false), cl::Hidden);
+// Print architectural register names rather than the ABI names (such as x2
+// instead of sp).
+// TODO: Make RISCVInstPrinter::getRegisterName non-static so that this can be a
+// member.
+static bool ArchRegNames;
// The command-line flags above are used by llvm-mc and llc. They can be used by
// `llvm-objdump`, but we override their values here to handle options passed to
@@ -52,7 +52,7 @@
// this way.
bool RISCVInstPrinter::applyTargetSpecificCLOption(StringRef Opt) {
if (Opt == "no-aliases") {
- NoAliases = true;
+ PrintAliases = false;
return true;
}
if (Opt == "numeric") {
@@ -69,11 +69,11 @@
bool Res = false;
const MCInst *NewMI = MI;
MCInst UncompressedMI;
- if (!NoAliases)
+ if (PrintAliases && !NoAliases)
Res = uncompressInst(UncompressedMI, *MI, MRI, STI);
if (Res)
NewMI = const_cast<MCInst *>(&UncompressedMI);
- if (NoAliases || !printAliasInstr(NewMI, Address, STI, O))
+ if (!PrintAliases || NoAliases || !printAliasInstr(NewMI, Address, STI, O))
printInstruction(NewMI, Address, STI, O);
printAnnotation(O, Annot);
}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index b299541..1ef276b 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -270,7 +270,7 @@
const MCOperand &MO = MI.getOperand(OpNo);
MCInstrDesc const &Desc = MCII.get(MI.getOpcode());
- unsigned MIFrm = Desc.TSFlags & RISCVII::InstFormatMask;
+ unsigned MIFrm = RISCVII::getFormat(Desc.TSFlags);
// If the destination is an immediate, there is nothing to do.
if (MO.isImm())
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index 8ce2184..65714b9 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -92,29 +92,16 @@
bool RISCVMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout,
const MCFixup *Fixup) const {
- if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
+ // Explicitly drop the layout and assembler to prevent any symbolic folding in
+ // the expression handling. This is required to preserve symbolic difference
+ // expressions to emit the paired relocations.
+ if (!getSubExpr()->evaluateAsRelocatable(Res, nullptr, nullptr))
return false;
- // Some custom fixup types are not valid with symbol difference expressions
- if (Res.getSymA() && Res.getSymB()) {
- switch (getKind()) {
- default:
- return true;
- case VK_RISCV_LO:
- case VK_RISCV_HI:
- case VK_RISCV_PCREL_LO:
- case VK_RISCV_PCREL_HI:
- case VK_RISCV_GOT_HI:
- case VK_RISCV_TPREL_LO:
- case VK_RISCV_TPREL_HI:
- case VK_RISCV_TPREL_ADD:
- case VK_RISCV_TLS_GOT_HI:
- case VK_RISCV_TLS_GD_HI:
- return false;
- }
- }
-
- return true;
+ Res =
+ MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+ // Custom fixup types are not valid with symbol difference expressions.
+ return Res.getSymB() ? getKind() == VK_RISCV_None : true;
}
void RISCVMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
@@ -138,7 +125,8 @@
StringRef RISCVMCExpr::getVariantKindName(VariantKind Kind) {
switch (Kind) {
- default:
+ case VK_RISCV_Invalid:
+ case VK_RISCV_None:
llvm_unreachable("Invalid ELF symbol kind");
case VK_RISCV_LO:
return "lo";
@@ -160,7 +148,14 @@
return "tls_ie_pcrel_hi";
case VK_RISCV_TLS_GD_HI:
return "tls_gd_pcrel_hi";
+ case VK_RISCV_CALL:
+ return "call";
+ case VK_RISCV_CALL_PLT:
+ return "call_plt";
+ case VK_RISCV_32_PCREL:
+ return "32_pcrel";
}
+ llvm_unreachable("Invalid ELF symbol kind");
}
static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
index 77038ce..2e752c1 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
@@ -37,7 +37,7 @@
VK_RISCV_CALL,
VK_RISCV_CALL_PLT,
VK_RISCV_32_PCREL,
- VK_RISCV_Invalid
+ VK_RISCV_Invalid // Must be the last item
};
private:
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 0931185..38c3253 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -18,9 +18,12 @@
#include "RISCVTargetStreamer.h"
#include "TargetInfo/RISCVTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -64,10 +67,12 @@
static MCSubtargetInfo *createRISCVMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
- std::string CPUName = std::string(CPU);
- if (CPUName.empty())
- CPUName = TT.isArch64Bit() ? "generic-rv64" : "generic-rv32";
- return createRISCVMCSubtargetInfoImpl(TT, CPUName, /*TuneCPU*/ CPUName, FS);
+ if (CPU.empty())
+ CPU = TT.isArch64Bit() ? "generic-rv64" : "generic-rv32";
+ if (CPU == "generic")
+ report_fatal_error(Twine("CPU 'generic' is not supported. Use ") +
+ (TT.isArch64Bit() ? "generic-rv64" : "generic-rv32"));
+ return createRISCVMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
static MCInstPrinter *createRISCVMCInstPrinter(const Triple &T,
@@ -136,6 +141,17 @@
return new RISCVMCInstrAnalysis(Info);
}
+namespace {
+MCStreamer *createRISCVELFStreamer(const Triple &T, MCContext &Context,
+ std::unique_ptr<MCAsmBackend> &&MAB,
+ std::unique_ptr<MCObjectWriter> &&MOW,
+ std::unique_ptr<MCCodeEmitter> &&MCE,
+ bool RelaxAll) {
+ return createRISCVELFStreamer(Context, std::move(MAB), std::move(MOW),
+ std::move(MCE), RelaxAll);
+}
+} // end anonymous namespace
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTargetMC() {
for (Target *T : {&getTheRISCV32Target(), &getTheRISCV64Target()}) {
TargetRegistry::RegisterMCAsmInfo(*T, createRISCVMCAsmInfo);
@@ -145,6 +161,7 @@
TargetRegistry::RegisterMCCodeEmitter(*T, createRISCVMCCodeEmitter);
TargetRegistry::RegisterMCInstPrinter(*T, createRISCVMCInstPrinter);
TargetRegistry::RegisterMCSubtargetInfo(*T, createRISCVMCSubtargetInfo);
+ TargetRegistry::RegisterELFStreamer(*T, createRISCVELFStreamer);
TargetRegistry::RegisterObjectTargetStreamer(
*T, createRISCVObjectTargetStreamer);
TargetRegistry::RegisterMCInstrAnalysis(*T, createRISCVInstrAnalysis);
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
index 1f3dead..2ca5eeb 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
@@ -10,11 +10,49 @@
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "llvm/ADT/APInt.h"
#include "llvm/Support/MathExtras.h"
+using namespace llvm;
-namespace llvm {
+static int getInstSeqCost(RISCVMatInt::InstSeq &Res, bool HasRVC) {
+ if (!HasRVC)
+ return Res.size();
-namespace RISCVMatInt {
-void generateInstSeq(int64_t Val, bool IsRV64, InstSeq &Res) {
+ int Cost = 0;
+ for (auto Instr : Res) {
+ bool Compressed;
+ switch (Instr.Opc) {
+ default: llvm_unreachable("Unexpected opcode");
+ case RISCV::SLLI:
+ case RISCV::SRLI:
+ Compressed = true;
+ break;
+ case RISCV::ADDI:
+ case RISCV::ADDIW:
+ case RISCV::LUI:
+ Compressed = isInt<6>(Instr.Imm);
+ break;
+ case RISCV::ADDUW:
+ Compressed = false;
+ break;
+ }
+ // Two RVC instructions take the same space as one RVI instruction, but
+ // can take longer to execute than the single RVI instruction. Thus, we
+ // consider that two RVC instructions are slightly more costly than one
+ // RVI instruction. For longer sequences of RVC instructions the space
+ // savings can be worth it, though. The costs below try to model that.
+ if (!Compressed)
+ Cost += 100; // Baseline cost of one RVI instruction: 100%.
+ else
+ Cost += 70; // 70% cost of baseline.
+ }
+ return Cost;
+}
+
+// Recursively generate a sequence for materializing an integer.
+static void generateInstSeqImpl(int64_t Val,
+ const FeatureBitset &ActiveFeatures,
+ RISCVMatInt::InstSeq &Res) {
+ bool IsRV64 = ActiveFeatures[RISCV::Feature64Bit];
+
if (isInt<32>(Val)) {
// Depending on the active bits in the immediate Value v, the following
// instruction sequences are emitted:
@@ -27,11 +65,11 @@
int64_t Lo12 = SignExtend64<12>(Val);
if (Hi20)
- Res.push_back(Inst(RISCV::LUI, Hi20));
+ Res.push_back(RISCVMatInt::Inst(RISCV::LUI, Hi20));
if (Lo12 || Hi20 == 0) {
unsigned AddiOpc = (IsRV64 && Hi20) ? RISCV::ADDIW : RISCV::ADDI;
- Res.push_back(Inst(AddiOpc, Lo12));
+ Res.push_back(RISCVMatInt::Inst(AddiOpc, Lo12));
}
return;
}
@@ -66,14 +104,92 @@
int ShiftAmount = 12 + findFirstSet((uint64_t)Hi52);
Hi52 = SignExtend64(Hi52 >> (ShiftAmount - 12), 64 - ShiftAmount);
- generateInstSeq(Hi52, IsRV64, Res);
+ // If the remaining bits don't fit in 12 bits, we might be able to reduce the
+ // shift amount in order to use LUI which will zero the lower 12 bits.
+ if (ShiftAmount > 12 && !isInt<12>(Hi52) && isInt<32>((uint64_t)Hi52 << 12)) {
+ // Reduce the shift amount and add zeros to the LSBs so it will match LUI.
+ ShiftAmount -= 12;
+ Hi52 = (uint64_t)Hi52 << 12;
+ }
- Res.push_back(Inst(RISCV::SLLI, ShiftAmount));
+ generateInstSeqImpl(Hi52, ActiveFeatures, Res);
+
+ Res.push_back(RISCVMatInt::Inst(RISCV::SLLI, ShiftAmount));
if (Lo12)
- Res.push_back(Inst(RISCV::ADDI, Lo12));
+ Res.push_back(RISCVMatInt::Inst(RISCV::ADDI, Lo12));
}
-int getIntMatCost(const APInt &Val, unsigned Size, bool IsRV64) {
+namespace llvm {
+namespace RISCVMatInt {
+InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures) {
+ RISCVMatInt::InstSeq Res;
+ generateInstSeqImpl(Val, ActiveFeatures, Res);
+
+ // If the constant is positive we might be able to generate a shifted constant
+ // with no leading zeros and use a final SRLI to restore them.
+ if (Val > 0 && Res.size() > 2) {
+ assert(ActiveFeatures[RISCV::Feature64Bit] &&
+ "Expected RV32 to only need 2 instructions");
+ unsigned LeadingZeros = countLeadingZeros((uint64_t)Val);
+ uint64_t ShiftedVal = (uint64_t)Val << LeadingZeros;
+ // Fill in the bits that will be shifted out with 1s. An example where this
+ // helps is trailing one masks with 32 or more ones. This will generate
+ // ADDI -1 and an SRLI.
+ ShiftedVal |= maskTrailingOnes<uint64_t>(LeadingZeros);
+
+ RISCVMatInt::InstSeq TmpSeq;
+ generateInstSeqImpl(ShiftedVal, ActiveFeatures, TmpSeq);
+ TmpSeq.push_back(RISCVMatInt::Inst(RISCV::SRLI, LeadingZeros));
+
+ // Keep the new sequence if it is an improvement.
+ if (TmpSeq.size() < Res.size()) {
+ Res = TmpSeq;
+ // A 2 instruction sequence is the best we can do.
+ if (Res.size() <= 2)
+ return Res;
+ }
+
+ // Some cases can benefit from filling the lower bits with zeros instead.
+ ShiftedVal &= maskTrailingZeros<uint64_t>(LeadingZeros);
+ TmpSeq.clear();
+ generateInstSeqImpl(ShiftedVal, ActiveFeatures, TmpSeq);
+ TmpSeq.push_back(RISCVMatInt::Inst(RISCV::SRLI, LeadingZeros));
+
+ // Keep the new sequence if it is an improvement.
+ if (TmpSeq.size() < Res.size()) {
+ Res = TmpSeq;
+ // A 2 instruction sequence is the best we can do.
+ if (Res.size() <= 2)
+ return Res;
+ }
+
+ // If we have exactly 32 leading zeros and Zba, we can try using zext.w at
+ // the end of the sequence.
+ if (LeadingZeros == 32 && ActiveFeatures[RISCV::FeatureExtZba]) {
+ // Try replacing upper bits with 1.
+ uint64_t LeadingOnesVal = Val | maskLeadingOnes<uint64_t>(LeadingZeros);
+ TmpSeq.clear();
+ generateInstSeqImpl(LeadingOnesVal, ActiveFeatures, TmpSeq);
+ TmpSeq.push_back(RISCVMatInt::Inst(RISCV::ADDUW, 0));
+
+ // Keep the new sequence if it is an improvement.
+ if (TmpSeq.size() < Res.size()) {
+ Res = TmpSeq;
+ // A 2 instruction sequence is the best we can do.
+ if (Res.size() <= 2)
+ return Res;
+ }
+ }
+ }
+
+ return Res;
+}
+
+int getIntMatCost(const APInt &Val, unsigned Size,
+ const FeatureBitset &ActiveFeatures,
+ bool CompressionCost) {
+ bool IsRV64 = ActiveFeatures[RISCV::Feature64Bit];
+ bool HasRVC = CompressionCost && ActiveFeatures[RISCV::FeatureStdExtC];
int PlatRegSize = IsRV64 ? 64 : 32;
// Split the constant into platform register sized chunks, and calculate cost
@@ -81,9 +197,8 @@
int Cost = 0;
for (unsigned ShiftVal = 0; ShiftVal < Size; ShiftVal += PlatRegSize) {
APInt Chunk = Val.ashr(ShiftVal).sextOrTrunc(PlatRegSize);
- InstSeq MatSeq;
- generateInstSeq(Chunk.getSExtValue(), IsRV64, MatSeq);
- Cost += MatSeq.size();
+ InstSeq MatSeq = generateInstSeq(Chunk.getSExtValue(), ActiveFeatures);
+ Cost += getInstSeqCost(MatSeq, HasRVC);
}
return std::max(1, Cost);
}
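
generateInstSeqImpl's 32-bit base case splits the constant into a rounded-up Hi20 for LUI and a sign-extended Lo12 for ADDI/ADDIW, and getInstSeqCost then weights compressible instructions at 70% of a full RVI instruction when RVC is available. The split itself can be checked standalone; the sketch below is illustrative arithmetic only (a positive 32-bit value, so LUI sign-extension does not come into play), not the backend code.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Sign-extend the low 12 bits, mirroring SignExtend64<12>.
static int64_t signExtend12(uint64_t V) {
  V &= 0xFFF;
  return (V & 0x800) ? (int64_t)(V | ~0xFFFull) : (int64_t)V;
}

int main() {
  int64_t Val = 0x12345678;                       // fits in 32 bits
  int64_t Lo12 = signExtend12((uint64_t)Val);     // ADDI immediate
  int64_t Hi20 = ((Val + 0x800) >> 12) & 0xFFFFF; // LUI immediate, rounded up so
                                                  // adding Lo12 lands on Val
  int64_t Materialized = (Hi20 << 12) + Lo12;     // LUI(Hi20); ADDI(Lo12)
  assert(Materialized == Val);
  printf("hi20=0x%llx lo12=%lld -> 0x%llx\n", (unsigned long long)Hi20,
         (long long)Lo12, (unsigned long long)Materialized);
  return 0;
}
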
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
index 17ca574..02b4b18 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
@@ -10,10 +10,12 @@
#define LLVM_LIB_TARGET_RISCV_MATINT_H
#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/SubtargetFeature.h"
#include <cstdint>
namespace llvm {
class APInt;
+class MCSubtargetInfo;
namespace RISCVMatInt {
struct Inst {
@@ -25,19 +27,25 @@
using InstSeq = SmallVector<Inst, 8>;
// Helper to generate an instruction sequence that will materialise the given
-// immediate value into a register. A sequence of instructions represented by
-// a simple struct produced rather than directly emitting the instructions in
+// immediate value into a register. A sequence of instructions represented by a
+// simple struct is produced rather than directly emitting the instructions in
// order to allow this helper to be used from both the MC layer and during
// instruction selection.
-void generateInstSeq(int64_t Val, bool IsRV64, InstSeq &Res);
+InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures);
// Helper to estimate the number of instructions required to materialise the
// given immediate value into a register. This estimate does not account for
// `Val` possibly fitting into an immediate, and so may over-estimate.
//
// This will attempt to produce instructions to materialise `Val` as an
-// `Size`-bit immediate. `IsRV64` should match the target architecture.
-int getIntMatCost(const APInt &Val, unsigned Size, bool IsRV64);
+// `Size`-bit immediate.
+//
+// If CompressionCost is true, a different cost calculation is used when RVC is
+// enabled. This should be used to compare two different sequences to determine
+// which is more compressible.
+int getIntMatCost(const APInt &Val, unsigned Size,
+ const FeatureBitset &ActiveFeatures,
+ bool CompressionCost = false);
} // namespace RISCVMatInt
} // namespace llvm
#endif
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
index 32fa20f..0bda3de 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
@@ -14,6 +14,8 @@
namespace llvm {
+class formatted_raw_ostream;
+
class RISCVTargetStreamer : public MCTargetStreamer {
public:
RISCVTargetStreamer(MCStreamer &S);
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCV.h b/src/llvm-project/llvm/lib/Target/RISCV/RISCV.h
index 2538d99..ef1f970 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCV.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCV.h
@@ -30,8 +30,8 @@
class MachineOperand;
class PassRegistry;
-void LowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
- const AsmPrinter &AP);
+bool lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+ AsmPrinter &AP);
bool LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
MCOperand &MCOp, const AsmPrinter &AP);
@@ -46,8 +46,8 @@
FunctionPass *createRISCVExpandAtomicPseudoPass();
void initializeRISCVExpandAtomicPseudoPass(PassRegistry &);
-FunctionPass *createRISCVCleanupVSETVLIPass();
-void initializeRISCVCleanupVSETVLIPass(PassRegistry &);
+FunctionPass *createRISCVInsertVSETVLIPass();
+void initializeRISCVInsertVSETVLIPass(PassRegistry &);
InstructionSelector *createRISCVInstructionSelector(const RISCVTargetMachine &,
RISCVSubtarget &,
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCV.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCV.td
index 83811da..52e8d8c 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCV.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCV.td
@@ -178,15 +178,16 @@
"'Zvlsseg' (Vector segment load/store instructions)",
[FeatureStdExtV]>;
def HasStdExtZvlsseg : Predicate<"Subtarget->hasStdExtZvlsseg()">,
- AssemblerPredicate<(all_of FeatureStdExtZvlsseg),
- "'Zvlsseg' (Vector segment load/store instructions)">;
+ AssemblerPredicate<(all_of FeatureStdExtZvlsseg),
+ "'Zvlsseg' (Vector segment load/store instructions)">;
+
def FeatureExtZvamo
: SubtargetFeature<"experimental-zvamo", "HasStdExtZvamo", "true",
- "'Zvamo'(Vector AMO Operations)",
+ "'Zvamo' (Vector AMO Operations)",
[FeatureStdExtV]>;
def HasStdExtZvamo : Predicate<"Subtarget->hasStdExtZvamo()">,
- AssemblerPredicate<(all_of FeatureExtZvamo),
- "'Zvamo'(Vector AMO Operations)">;
+ AssemblerPredicate<(all_of FeatureExtZvamo),
+ "'Zvamo' (Vector AMO Operations)">;
def Feature64Bit
: SubtargetFeature<"64bit", "HasRV64", "true", "Implements RV64">;
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 0a915cb..bdf30f8 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -93,8 +93,8 @@
return;
MCInst TmpInst;
- LowerRISCVMachineInstrToMCInst(MI, TmpInst, *this);
- EmitToStreamer(*OutStreamer, TmpInst);
+ if (!lowerRISCVMachineInstrToMCInst(MI, TmpInst, *this))
+ EmitToStreamer(*OutStreamer, TmpInst);
}
bool RISCVAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -193,16 +193,7 @@
void RISCVAsmPrinter::emitAttributes() {
RISCVTargetStreamer &RTS =
static_cast<RISCVTargetStreamer &>(*OutStreamer->getTargetStreamer());
-
- const Triple &TT = TM.getTargetTriple();
- StringRef CPU = TM.getTargetCPU();
- StringRef FS = TM.getTargetFeatureString();
- const RISCVTargetMachine &RTM = static_cast<const RISCVTargetMachine &>(TM);
- /* TuneCPU doesn't impact emission of ELF attributes, ELF attributes only
- care about arch related features, so we can set TuneCPU as CPU. */
- const RISCVSubtarget STI(TT, CPU, /*TuneCPU=*/CPU, FS, /*ABIName=*/"", RTM);
-
- RTS.emitTargetAttributes(STI);
+ RTS.emitTargetAttributes(*STI);
}
// Force static initialization.
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp
deleted file mode 100644
index ae32cbd..0000000
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp
+++ /dev/null
@@ -1,154 +0,0 @@
-//===- RISCVCleanupVSETVLI.cpp - Cleanup unneeded VSETVLI instructions ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a function pass that removes duplicate vsetvli
-// instructions within a basic block.
-//
-//===----------------------------------------------------------------------===//
-
-#include "RISCV.h"
-#include "RISCVSubtarget.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "riscv-cleanup-vsetvli"
-#define RISCV_CLEANUP_VSETVLI_NAME "RISCV Cleanup VSETVLI pass"
-
-namespace {
-
-class RISCVCleanupVSETVLI : public MachineFunctionPass {
-public:
- static char ID;
-
- RISCVCleanupVSETVLI() : MachineFunctionPass(ID) {
- initializeRISCVCleanupVSETVLIPass(*PassRegistry::getPassRegistry());
- }
- bool runOnMachineFunction(MachineFunction &MF) override;
- bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::IsSSA);
- }
-
- // This pass modifies the program, but does not modify the CFG
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- StringRef getPassName() const override { return RISCV_CLEANUP_VSETVLI_NAME; }
-};
-
-} // end anonymous namespace
-
-char RISCVCleanupVSETVLI::ID = 0;
-
-INITIALIZE_PASS(RISCVCleanupVSETVLI, DEBUG_TYPE,
- RISCV_CLEANUP_VSETVLI_NAME, false, false)
-
-bool RISCVCleanupVSETVLI::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
- bool Changed = false;
- MachineInstr *PrevVSETVLI = nullptr;
-
- for (auto MII = MBB.begin(), MIE = MBB.end(); MII != MIE;) {
- MachineInstr &MI = *MII++;
-
- if (MI.getOpcode() != RISCV::PseudoVSETVLI &&
- MI.getOpcode() != RISCV::PseudoVSETIVLI) {
- if (PrevVSETVLI &&
- (MI.isCall() || MI.modifiesRegister(RISCV::VL) ||
- MI.modifiesRegister(RISCV::VTYPE))) {
- // Old VL/VTYPE is overwritten.
- PrevVSETVLI = nullptr;
- }
- continue;
- }
-
- // If we don't have a previous VSET{I}VLI or the VL output isn't dead, we
- // can't remove this VSETVLI.
- if (!PrevVSETVLI || !MI.getOperand(0).isDead()) {
- PrevVSETVLI = &MI;
- continue;
- }
-
- // If a previous "set vl" instruction opcode is different from this one, we
- // can't differentiate the AVL values.
- if (PrevVSETVLI->getOpcode() != MI.getOpcode()) {
- PrevVSETVLI = &MI;
- continue;
- }
-
- // The remaining two cases are
- // 1. PrevVSETVLI = PseudoVSETVLI
- // MI = PseudoVSETVLI
- //
- // 2. PrevVSETVLI = PseudoVSETIVLI
- // MI = PseudoVSETIVLI
- Register AVLReg;
- bool SameAVL = false;
- if (MI.getOpcode() == RISCV::PseudoVSETVLI) {
- AVLReg = MI.getOperand(1).getReg();
- SameAVL = PrevVSETVLI->getOperand(1).getReg() == AVLReg;
- } else { // RISCV::PseudoVSETIVLI
- SameAVL =
- PrevVSETVLI->getOperand(1).getImm() == MI.getOperand(1).getImm();
- }
- int64_t PrevVTYPEImm = PrevVSETVLI->getOperand(2).getImm();
- int64_t VTYPEImm = MI.getOperand(2).getImm();
-
- // Does this VSET{I}VLI use the same AVL register/value and VTYPE immediate?
- if (!SameAVL || PrevVTYPEImm != VTYPEImm) {
- PrevVSETVLI = &MI;
- continue;
- }
-
- // If the AVLReg is X0 we need to look at the output VL of both VSETVLIs.
- if ((MI.getOpcode() == RISCV::PseudoVSETVLI) && (AVLReg == RISCV::X0)) {
- assert((PrevVSETVLI->getOpcode() == RISCV::PseudoVSETVLI) &&
- "Unexpected vsetvli opcode.");
- Register PrevOutVL = PrevVSETVLI->getOperand(0).getReg();
- Register OutVL = MI.getOperand(0).getReg();
- // We can't remove if the previous VSETVLI left VL unchanged and the
- // current instruction is setting it to VLMAX. Without knowing the VL
- // before the previous instruction we don't know if this is a change.
- if (PrevOutVL == RISCV::X0 && OutVL != RISCV::X0) {
- PrevVSETVLI = &MI;
- continue;
- }
- }
-
- // This VSETVLI is redundant, remove it.
- MI.eraseFromParent();
- Changed = true;
- }
-
- return Changed;
-}
-
-bool RISCVCleanupVSETVLI::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(MF.getFunction()))
- return false;
-
- // Skip if the vector extension is not enabled.
- const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
- if (!ST.hasStdExtV())
- return false;
-
- bool Changed = false;
-
- for (MachineBasicBlock &MBB : MF)
- Changed |= runOnMachineBasicBlock(MBB);
-
- return Changed;
-}
-
-/// Returns an instance of the Cleanup VSETVLI pass.
-FunctionPass *llvm::createRISCVCleanupVSETVLIPass() {
- return new RISCVCleanupVSETVLI();
-}
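
For reference, a standalone model (not LLVM code) of the redundancy test the deleted pass applied within a single basic block; the X0/VLMAX corner case handled above is deliberately omitted.

#include <cstdint>
#include <optional>

struct VSetVL {
  bool UsesImmAVL;  // PseudoVSETIVLI (immediate AVL) vs. PseudoVSETVLI (register AVL)
  int64_t AVL;      // AVL register number or immediate value
  int64_t VType;    // encoded VTYPE immediate
  bool VLDead;      // the VL result register is unused
};

// A later vset{i}vli can be erased when its VL output is dead and it repeats
// the previous one's flavour, AVL and VTYPE within the same basic block.
static bool isRedundant(const std::optional<VSetVL> &Prev, const VSetVL &MI) {
  return Prev && MI.VLDead && Prev->UsesImmAVL == MI.UsesImmAVL &&
         Prev->AVL == MI.AVL && Prev->VType == MI.VType;
}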
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index ec9a395..31ef752 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -62,6 +62,8 @@
bool expandVSetVL(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
bool expandVMSET_VMCLR(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, unsigned Opcode);
+ bool expandVSPILL(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+ bool expandVRELOAD(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
};
char RISCVExpandPseudo::ID = 0;
@@ -123,6 +125,30 @@
case RISCV::PseudoVMSET_M_B64:
// vmset.m vd => vmxnor.mm vd, vd, vd
return expandVMSET_VMCLR(MBB, MBBI, RISCV::VMXNOR_MM);
+ case RISCV::PseudoVSPILL2_M1:
+ case RISCV::PseudoVSPILL2_M2:
+ case RISCV::PseudoVSPILL2_M4:
+ case RISCV::PseudoVSPILL3_M1:
+ case RISCV::PseudoVSPILL3_M2:
+ case RISCV::PseudoVSPILL4_M1:
+ case RISCV::PseudoVSPILL4_M2:
+ case RISCV::PseudoVSPILL5_M1:
+ case RISCV::PseudoVSPILL6_M1:
+ case RISCV::PseudoVSPILL7_M1:
+ case RISCV::PseudoVSPILL8_M1:
+ return expandVSPILL(MBB, MBBI);
+ case RISCV::PseudoVRELOAD2_M1:
+ case RISCV::PseudoVRELOAD2_M2:
+ case RISCV::PseudoVRELOAD2_M4:
+ case RISCV::PseudoVRELOAD3_M1:
+ case RISCV::PseudoVRELOAD3_M2:
+ case RISCV::PseudoVRELOAD4_M1:
+ case RISCV::PseudoVRELOAD4_M2:
+ case RISCV::PseudoVRELOAD5_M1:
+ case RISCV::PseudoVRELOAD6_M1:
+ case RISCV::PseudoVRELOAD7_M1:
+ case RISCV::PseudoVRELOAD8_M1:
+ return expandVRELOAD(MBB, MBBI);
}
return false;
@@ -214,7 +240,8 @@
bool RISCVExpandPseudo::expandVSetVL(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) {
- assert(MBBI->getNumOperands() == 5 && "Unexpected instruction format");
+ assert(MBBI->getNumExplicitOperands() == 3 && MBBI->getNumOperands() >= 5 &&
+ "Unexpected instruction format");
DebugLoc DL = MBBI->getDebugLoc();
@@ -253,6 +280,96 @@
return true;
}
+bool RISCVExpandPseudo::expandVSPILL(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ const TargetRegisterInfo *TRI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ DebugLoc DL = MBBI->getDebugLoc();
+ Register SrcReg = MBBI->getOperand(0).getReg();
+ Register Base = MBBI->getOperand(1).getReg();
+ Register VL = MBBI->getOperand(2).getReg();
+ auto ZvlssegInfo = TII->isRVVSpillForZvlsseg(MBBI->getOpcode());
+ if (!ZvlssegInfo)
+ return false;
+ unsigned NF = ZvlssegInfo->first;
+ unsigned LMUL = ZvlssegInfo->second;
+ assert(NF * LMUL <= 8 && "Invalid NF/LMUL combinations.");
+ unsigned Opcode = RISCV::VS1R_V;
+ unsigned SubRegIdx = RISCV::sub_vrm1_0;
+ static_assert(RISCV::sub_vrm1_7 == RISCV::sub_vrm1_0 + 7,
+ "Unexpected subreg numbering");
+ if (LMUL == 2) {
+ Opcode = RISCV::VS2R_V;
+ SubRegIdx = RISCV::sub_vrm2_0;
+ static_assert(RISCV::sub_vrm2_3 == RISCV::sub_vrm2_0 + 3,
+ "Unexpected subreg numbering");
+ } else if (LMUL == 4) {
+ Opcode = RISCV::VS4R_V;
+ SubRegIdx = RISCV::sub_vrm4_0;
+ static_assert(RISCV::sub_vrm4_1 == RISCV::sub_vrm4_0 + 1,
+ "Unexpected subreg numbering");
+ } else
+ assert(LMUL == 1 && "LMUL must be 1, 2, or 4.");
+
+ for (unsigned I = 0; I < NF; ++I) {
+ BuildMI(MBB, MBBI, DL, TII->get(Opcode))
+ .addReg(TRI->getSubReg(SrcReg, SubRegIdx + I))
+ .addReg(Base)
+ .addMemOperand(*(MBBI->memoperands_begin()));
+ if (I != NF - 1)
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADD), Base)
+ .addReg(Base)
+ .addReg(VL);
+ }
+ MBBI->eraseFromParent();
+ return true;
+}
+
+bool RISCVExpandPseudo::expandVRELOAD(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ const TargetRegisterInfo *TRI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ DebugLoc DL = MBBI->getDebugLoc();
+ Register DestReg = MBBI->getOperand(0).getReg();
+ Register Base = MBBI->getOperand(1).getReg();
+ Register VL = MBBI->getOperand(2).getReg();
+ auto ZvlssegInfo = TII->isRVVSpillForZvlsseg(MBBI->getOpcode());
+ if (!ZvlssegInfo)
+ return false;
+ unsigned NF = ZvlssegInfo->first;
+ unsigned LMUL = ZvlssegInfo->second;
+ assert(NF * LMUL <= 8 && "Invalid NF/LMUL combinations.");
+ unsigned Opcode = RISCV::VL1RE8_V;
+ unsigned SubRegIdx = RISCV::sub_vrm1_0;
+ static_assert(RISCV::sub_vrm1_7 == RISCV::sub_vrm1_0 + 7,
+ "Unexpected subreg numbering");
+ if (LMUL == 2) {
+ Opcode = RISCV::VL2RE8_V;
+ SubRegIdx = RISCV::sub_vrm2_0;
+ static_assert(RISCV::sub_vrm2_3 == RISCV::sub_vrm2_0 + 3,
+ "Unexpected subreg numbering");
+ } else if (LMUL == 4) {
+ Opcode = RISCV::VL4RE8_V;
+ SubRegIdx = RISCV::sub_vrm4_0;
+ static_assert(RISCV::sub_vrm4_1 == RISCV::sub_vrm4_0 + 1,
+ "Unexpected subreg numbering");
+ } else
+ assert(LMUL == 1 && "LMUL must be 1, 2, or 4.");
+
+ for (unsigned I = 0; I < NF; ++I) {
+ BuildMI(MBB, MBBI, DL, TII->get(Opcode),
+ TRI->getSubReg(DestReg, SubRegIdx + I))
+ .addReg(Base)
+ .addMemOperand(*(MBBI->memoperands_begin()));
+ if (I != NF - 1)
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADD), Base)
+ .addReg(Base)
+ .addReg(VL);
+ }
+ MBBI->eraseFromParent();
+ return true;
+}
+
} // end of anonymous namespace
INITIALIZE_PASS(RISCVExpandPseudo, "riscv-expand-pseudo",
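
A simplified, standalone model of the expansion loops added above: an NF-field Zvlsseg spill (or reload) becomes NF whole-register accesses, with the base pointer advanced between fields by a step the caller precomputes; the printed `step` stands in for the register the pseudo carries in operand 2.

#include <cstdio>

// Print the shape of the expansion for an NF-field spill with a given LMUL.
static void expandSegmentSpill(unsigned NF, unsigned LMUL) {
  for (unsigned I = 0; I < NF; ++I) {
    std::printf("  vs%ur.v v_field%u, (base)\n", LMUL, I); // VS1R_V/VS2R_V/VS4R_V
    if (I != NF - 1)
      std::printf("  add    base, base, step\n");          // advance to next field
  }
}

int main() { expandSegmentSpill(/*NF=*/3, /*LMUL=*/2); }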
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 564d97f..188bd49 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -67,11 +67,13 @@
BuildMI(MBB, MI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
.addReg(RAReg)
.addReg(SCSPReg)
- .addImm(0);
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MI, DL, TII->get(RISCV::ADDI))
.addReg(SCSPReg, RegState::Define)
.addReg(SCSPReg)
- .addImm(SlotSize);
+ .addImm(SlotSize)
+ .setMIFlag(MachineInstr::FrameSetup);
}
static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -115,11 +117,13 @@
BuildMI(MBB, MI, DL, TII->get(IsRV64 ? RISCV::LD : RISCV::LW))
.addReg(RAReg, RegState::Define)
.addReg(SCSPReg)
- .addImm(-SlotSize);
+ .addImm(-SlotSize)
+ .setMIFlag(MachineInstr::FrameDestroy);
BuildMI(MBB, MI, DL, TII->get(RISCV::ADDI))
.addReg(SCSPReg, RegState::Define)
.addReg(SCSPReg)
- .addImm(-SlotSize);
+ .addImm(-SlotSize)
+ .setMIFlag(MachineInstr::FrameDestroy);
}
// Get the ID of the libcall used for spilling and restoring callee saved
@@ -221,7 +225,7 @@
const MachineFrameInfo &MFI = MF.getFrameInfo();
return MF.getTarget().Options.DisableFramePointerElim(MF) ||
- RegInfo->needsStackRealignment(MF) || MFI.hasVarSizedObjects() ||
+ RegInfo->hasStackRealignment(MF) || MFI.hasVarSizedObjects() ||
MFI.isFrameAddressTaken();
}
@@ -229,7 +233,7 @@
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *TRI = STI.getRegisterInfo();
- return MFI.hasVarSizedObjects() && TRI->needsStackRealignment(MF);
+ return MFI.hasVarSizedObjects() && TRI->hasStackRealignment(MF);
}
// Determines the size of the frame and maximum call frame size.
@@ -242,10 +246,6 @@
// Get the alignment.
Align StackAlign = getStackAlign();
- // Set Max Call Frame Size
- uint64_t MaxCallSize = alignTo(MFI.getMaxCallFrameSize(), StackAlign);
- MFI.setMaxCallFrameSize(MaxCallSize);
-
// Make sure the frame is aligned.
FrameSize = alignTo(FrameSize, StackAlign);
@@ -293,16 +293,44 @@
static Register getSPReg(const RISCVSubtarget &STI) { return RISCV::X2; }
static SmallVector<CalleeSavedInfo, 8>
-getNonLibcallCSI(const std::vector<CalleeSavedInfo> &CSI) {
+getNonLibcallCSI(const MachineFunction &MF,
+ const std::vector<CalleeSavedInfo> &CSI) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
SmallVector<CalleeSavedInfo, 8> NonLibcallCSI;
- for (auto &CS : CSI)
- if (CS.getFrameIdx() >= 0)
+ for (auto &CS : CSI) {
+ int FI = CS.getFrameIdx();
+ if (FI >= 0 && MFI.getStackID(FI) == TargetStackID::Default)
NonLibcallCSI.push_back(CS);
+ }
return NonLibcallCSI;
}
+void RISCVFrameLowering::adjustStackForRVV(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, int64_t Amount,
+ MachineInstr::MIFlag Flag) const {
+ assert(Amount != 0 && "Did not need to adjust stack pointer for RVV.");
+
+ const RISCVInstrInfo *TII = STI.getInstrInfo();
+ Register SPReg = getSPReg(STI);
+ unsigned Opc = RISCV::ADD;
+ if (Amount < 0) {
+ Amount = -Amount;
+ Opc = RISCV::SUB;
+ }
+ // 1. Multiply the number of v-slots by the length of a vector register.
+ Register FactorRegister =
+ TII->getVLENFactoredAmount(MF, MBB, MBBI, DL, Amount, Flag);
+ // 2. SP = SP - RVV stack size
+ BuildMI(MBB, MBBI, DL, TII->get(Opc), SPReg)
+ .addReg(SPReg)
+ .addReg(FactorRegister, RegState::Kill)
+ .setMIFlag(Flag);
+}
+
void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -362,11 +390,12 @@
// FIXME (note copied from Lanai): This appears to be overallocating. Needs
// investigation. Get the number of bytes to allocate from the FrameInfo.
- uint64_t StackSize = MFI.getStackSize();
+ uint64_t StackSize = MFI.getStackSize() + RVFI->getRVVPadding();
uint64_t RealStackSize = StackSize + RVFI->getLibCallStackSize();
+ uint64_t RVVStackSize = RVFI->getRVVStackSize();
// Early exit if there is no need to allocate on the stack
- if (RealStackSize == 0 && !MFI.adjustsStack())
+ if (RealStackSize == 0 && !MFI.adjustsStack() && RVVStackSize == 0)
return;
// If the stack pointer has been marked as reserved, then produce an error if
@@ -389,7 +418,8 @@
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlag(MachineInstr::FrameSetup);
const auto &CSI = MFI.getCalleeSavedInfo();
@@ -399,7 +429,7 @@
// to the stack, not before.
// FIXME: assumes exactly one instruction is used to save each callee-saved
// register.
- std::advance(MBBI, getNonLibcallCSI(CSI).size());
+ std::advance(MBBI, getNonLibcallCSI(MF, CSI).size());
// Iterate over list of callee-saved registers and emit .cfi_offset
// directives.
@@ -417,7 +447,8 @@
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
nullptr, RI->getDwarfRegNum(Reg, true), Offset));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlag(MachineInstr::FrameSetup);
}
// Generate new FP.
@@ -434,7 +465,8 @@
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
nullptr, RI->getDwarfRegNum(FPReg, true), RVFI->getVarArgsSaveSize()));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlag(MachineInstr::FrameSetup);
}
// Emit the second SP adjustment after saving callee saved registers.
@@ -452,31 +484,39 @@
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlag(MachineInstr::FrameSetup);
}
}
+ if (RVVStackSize)
+ adjustStackForRVV(MF, MBB, MBBI, DL, -RVVStackSize,
+ MachineInstr::FrameSetup);
+
if (hasFP(MF)) {
// Realign Stack
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
- if (RI->needsStackRealignment(MF)) {
+ if (RI->hasStackRealignment(MF)) {
Align MaxAlignment = MFI.getMaxAlign();
const RISCVInstrInfo *TII = STI.getInstrInfo();
if (isInt<12>(-(int)MaxAlignment.value())) {
BuildMI(MBB, MBBI, DL, TII->get(RISCV::ANDI), SPReg)
.addReg(SPReg)
- .addImm(-(int)MaxAlignment.value());
+ .addImm(-(int)MaxAlignment.value())
+ .setMIFlag(MachineInstr::FrameSetup);
} else {
unsigned ShiftAmount = Log2(MaxAlignment);
Register VR =
MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass);
BuildMI(MBB, MBBI, DL, TII->get(RISCV::SRLI), VR)
.addReg(SPReg)
- .addImm(ShiftAmount);
+ .addImm(ShiftAmount)
+ .setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, DL, TII->get(RISCV::SLLI), SPReg)
.addReg(VR)
- .addImm(ShiftAmount);
+ .addImm(ShiftAmount)
+ .setMIFlag(MachineInstr::FrameSetup);
}
// FP will be used to restore the frame in the epilogue, so we need
// another base register BP to record SP after re-alignment. SP will
@@ -485,7 +525,8 @@
// move BP, SP
BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI), BPReg)
.addReg(SPReg)
- .addImm(0);
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
}
}
}
@@ -526,7 +567,7 @@
--MBBI;
}
- const auto &CSI = getNonLibcallCSI(MFI.getCalleeSavedInfo());
+ const auto &CSI = getNonLibcallCSI(MF, MFI.getCalleeSavedInfo());
// Skip to before the restores of callee-saved registers
// FIXME: assumes exactly one instruction is used to restore each
@@ -535,17 +576,22 @@
if (!CSI.empty())
LastFrameDestroy = std::prev(MBBI, CSI.size());
- uint64_t StackSize = MFI.getStackSize();
+ uint64_t StackSize = MFI.getStackSize() + RVFI->getRVVPadding();
uint64_t RealStackSize = StackSize + RVFI->getLibCallStackSize();
uint64_t FPOffset = RealStackSize - RVFI->getVarArgsSaveSize();
+ uint64_t RVVStackSize = RVFI->getRVVStackSize();
// Restore the stack pointer using the value of the frame pointer. Only
// necessary if the stack pointer was modified, meaning the stack size is
// unknown.
- if (RI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) {
+ if (RI->hasStackRealignment(MF) || MFI.hasVarSizedObjects()) {
assert(hasFP(MF) && "frame pointer should not have been eliminated");
adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg, -FPOffset,
MachineInstr::FrameDestroy);
+ } else {
+ if (RVVStackSize)
+ adjustStackForRVV(MF, MBB, LastFrameDestroy, DL, RVVStackSize,
+ MachineInstr::FrameDestroy);
}
uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
@@ -578,12 +624,22 @@
// Callee-saved registers should be referenced relative to the stack
// pointer (positive offset), otherwise use the frame pointer (negative
// offset).
- const auto &CSI = getNonLibcallCSI(MFI.getCalleeSavedInfo());
+ const auto &CSI = getNonLibcallCSI(MF, MFI.getCalleeSavedInfo());
int MinCSFI = 0;
int MaxCSFI = -1;
+ StackOffset Offset;
+ auto StackID = MFI.getStackID(FI);
- int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea() +
- MFI.getOffsetAdjustment();
+ assert((StackID == TargetStackID::Default ||
+ StackID == TargetStackID::ScalableVector) &&
+ "Unexpected stack ID for the frame object.");
+ if (StackID == TargetStackID::Default) {
+ Offset =
+ StackOffset::getFixed(MFI.getObjectOffset(FI) - getOffsetOfLocalArea() +
+ MFI.getOffsetAdjustment());
+ } else if (StackID == TargetStackID::ScalableVector) {
+ Offset = StackOffset::getScalable(MFI.getObjectOffset(FI));
+ }
uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
@@ -596,33 +652,135 @@
FrameReg = RISCV::X2;
if (FirstSPAdjustAmount)
- Offset += FirstSPAdjustAmount;
+ Offset += StackOffset::getFixed(FirstSPAdjustAmount);
else
- Offset += MFI.getStackSize();
- } else if (RI->needsStackRealignment(MF) && !MFI.isFixedObjectIndex(FI)) {
+ Offset +=
+ StackOffset::getFixed(MFI.getStackSize() + RVFI->getRVVPadding());
+ } else if (RI->hasStackRealignment(MF) && !MFI.isFixedObjectIndex(FI)) {
// If the stack was realigned, the frame pointer is set in order to allow
// SP to be restored, so we need another base register to record the stack
// after realignment.
- if (hasBP(MF))
+ if (hasBP(MF)) {
FrameReg = RISCVABI::getBPReg();
- else
+ // |--------------------------| -- <-- FP
+ // | callee-saved registers | | <----.
+ // |--------------------------| -- |
+ // | realignment (the size of | | |
+ // | this area is not counted | | |
+ // | in MFI.getStackSize()) | | |
+ // |--------------------------| -- |
+ // | Padding after RVV | | |
+ // | (not counted in | | |
+ // | MFI.getStackSize() | | |
+ // |--------------------------| -- |-- MFI.getStackSize()
+ // | RVV objects | | |
+ // | (not counted in | | |
+ // | MFI.getStackSize() | | |
+ // |--------------------------| -- |
+ // | Padding before RVV | | |
+ // | (not counted in | | |
+ // | MFI.getStackSize() | | |
+ // |--------------------------| -- |
+ // | scalar local variables | | <----'
+ // |--------------------------| -- <-- BP
+ // | VarSize objects | |
+ // |--------------------------| -- <-- SP
+ } else {
FrameReg = RISCV::X2;
- Offset += MFI.getStackSize();
- if (FI < 0)
- Offset += RVFI->getLibCallStackSize();
+ // |--------------------------| -- <-- FP
+ // | callee-saved registers | | <----.
+ // |--------------------------| -- |
+ // | realignment (the size of | | |
+ // | this area is not counted | | |
+ // | in MFI.getStackSize()) | | |
+ // |--------------------------| -- |
+ // | Padding after RVV | | |
+ // | (not counted in | | |
+ // | MFI.getStackSize() | | |
+ // |--------------------------| -- |-- MFI.getStackSize()
+ // | RVV objects | | |
+ // | (not counted in | | |
+ // | MFI.getStackSize() | | |
+ // |--------------------------| -- |
+ // | Padding before RVV | | |
+ // | (not counted in | | |
+ // | MFI.getStackSize() | | |
+ // |--------------------------| -- |
+ // | scalar local variables | | <----'
+ // |--------------------------| -- <-- SP
+ }
+ // The total amount of padding surrounding RVV objects is described by
+ // RVFI->getRVVPadding() and it can be zero. It allows us to align the RVV
+ // objects to 8 bytes.
+ if (MFI.getStackID(FI) == TargetStackID::Default) {
+ Offset += StackOffset::getFixed(MFI.getStackSize());
+ if (FI < 0)
+ Offset += StackOffset::getFixed(RVFI->getLibCallStackSize());
+ } else if (MFI.getStackID(FI) == TargetStackID::ScalableVector) {
+ Offset += StackOffset::get(
+ alignTo(MFI.getStackSize() - RVFI->getCalleeSavedStackSize(), 8),
+ RVFI->getRVVStackSize());
+ }
} else {
FrameReg = RI->getFrameRegister(MF);
if (hasFP(MF)) {
- Offset += RVFI->getVarArgsSaveSize();
+ Offset += StackOffset::getFixed(RVFI->getVarArgsSaveSize());
if (FI >= 0)
- Offset -= RVFI->getLibCallStackSize();
+ Offset -= StackOffset::getFixed(RVFI->getLibCallStackSize());
+ // When using FP to access scalable vector objects, we need to subtract
+ // the frame size.
+ //
+ // |--------------------------| -- <-- FP
+ // | callee-saved registers | |
+ // |--------------------------| | MFI.getStackSize()
+ // | scalar local variables | |
+ // |--------------------------| -- (Offset of RVV objects is from here.)
+ // | RVV objects |
+ // |--------------------------|
+ // | VarSize objects |
+ // |--------------------------| <-- SP
+ if (MFI.getStackID(FI) == TargetStackID::ScalableVector)
+ Offset -= StackOffset::getFixed(MFI.getStackSize());
} else {
- Offset += MFI.getStackSize();
- if (FI < 0)
- Offset += RVFI->getLibCallStackSize();
+ // When using SP to access frame objects, we need to add RVV stack size.
+ //
+ // |--------------------------| -- <-- FP
+ // | callee-saved registers | | <----.
+ // |--------------------------| -- |
+ // | Padding after RVV | | |
+ // | (not counted in | | |
+ // | MFI.getStackSize() | | |
+ // |--------------------------| -- |
+ // | RVV objects | | |-- MFI.getStackSize()
+ // | (not counted in | | |
+ // | MFI.getStackSize() | | |
+ // |--------------------------| -- |
+ // | Padding before RVV | | |
+ // | (not counted in | | |
+ // | MFI.getStackSize() | | |
+ // |--------------------------| -- |
+ // | scalar local variables | | <----'
+ // |--------------------------| -- <-- SP
+ //
+ // The total amount of padding surrounding RVV objects is described by
+ // RVFI->getRVVPadding() and it can be zero. It allows us to align the RVV
+ // objects to 8 bytes.
+ if (MFI.getStackID(FI) == TargetStackID::Default) {
+ if (MFI.isFixedObjectIndex(FI)) {
+ Offset += StackOffset::get(MFI.getStackSize() + RVFI->getRVVPadding()
+ + RVFI->getLibCallStackSize(), RVFI->getRVVStackSize());
+ } else {
+ Offset += StackOffset::getFixed(MFI.getStackSize());
+ }
+ } else if (MFI.getStackID(FI) == TargetStackID::ScalableVector) {
+ Offset += StackOffset::get(
+ alignTo(MFI.getStackSize() - RVFI->getCalleeSavedStackSize(), 8),
+ RVFI->getRVVStackSize());
+ }
}
}
- return StackOffset::getFixed(Offset);
+
+ return Offset;
}
void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
@@ -670,29 +828,120 @@
}
}
+int64_t
+RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const {
+ int64_t Offset = 0;
+ // Create a buffer of RVV objects to allocate.
+ SmallVector<int, 8> ObjectsToAllocate;
+ for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
+ unsigned StackID = MFI.getStackID(I);
+ if (StackID != TargetStackID::ScalableVector)
+ continue;
+ if (MFI.isDeadObjectIndex(I))
+ continue;
+
+ ObjectsToAllocate.push_back(I);
+ }
+
+ // Allocate all RVV locals and spills
+ for (int FI : ObjectsToAllocate) {
+ // ObjectSize in bytes.
+ int64_t ObjectSize = MFI.getObjectSize(FI);
+ // If the data type is a fractional vector type, reserve one vector
+ // register for it.
+ if (ObjectSize < 8)
+ ObjectSize = 8;
+ // Currently, all scalable vector types are aligned to 8 bytes.
+ Offset = alignTo(Offset + ObjectSize, 8);
+ MFI.setObjectOffset(FI, -Offset);
+ }
+
+ return Offset;
+}
+
+static bool hasRVVSpillWithFIs(MachineFunction &MF, const RISCVInstrInfo &TII) {
+ if (!MF.getSubtarget<RISCVSubtarget>().hasStdExtV())
+ return false;
+ return any_of(MF, [&TII](const MachineBasicBlock &MBB) {
+ return any_of(MBB, [&TII](const MachineInstr &MI) {
+ return TII.isRVVSpill(MI, /*CheckFIs*/ true);
+ });
+ });
+}
+
void RISCVFrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF, RegScavenger *RS) const {
- const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ const RISCVRegisterInfo *RegInfo =
+ MF.getSubtarget<RISCVSubtarget>().getRegisterInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterClass *RC = &RISCV::GPRRegClass;
+ auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
+
+ int64_t RVVStackSize = assignRVVStackObjectOffsets(MFI);
+ RVFI->setRVVStackSize(RVVStackSize);
+ const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo();
+
// estimateStackSize has been observed to under-estimate the final stack
// size, so give ourselves wiggle-room by checking for stack size
// representable an 11-bit signed field rather than 12-bits.
// FIXME: It may be possible to craft a function with a small stack that
// still needs an emergency spill slot for branch relaxation. This case
// would currently be missed.
- if (!isInt<11>(MFI.estimateStackSize(MF))) {
+ // RVV loads & stores have no capacity to hold the immediate address offsets
+ // so we must always reserve an emergency spill slot if the MachineFunction
+ // contains any RVV spills.
+ if (!isInt<11>(MFI.estimateStackSize(MF)) || hasRVVSpillWithFIs(MF, TII)) {
int RegScavFI = MFI.CreateStackObject(RegInfo->getSpillSize(*RC),
RegInfo->getSpillAlign(*RC), false);
RS->addScavengingFrameIndex(RegScavFI);
+ // For RVV, scalable stack offsets require up to two scratch registers to
+ // compute the final offset. Reserve an additional emergency spill slot.
+ if (RVVStackSize != 0) {
+ int RVVRegScavFI = MFI.CreateStackObject(
+ RegInfo->getSpillSize(*RC), RegInfo->getSpillAlign(*RC), false);
+ RS->addScavengingFrameIndex(RVVRegScavFI);
+ }
+ }
+
+ if (MFI.getCalleeSavedInfo().empty() || RVFI->useSaveRestoreLibCalls(MF)) {
+ RVFI->setCalleeSavedStackSize(0);
+ return;
+ }
+
+ unsigned Size = 0;
+ for (const auto &Info : MFI.getCalleeSavedInfo()) {
+ int FrameIdx = Info.getFrameIdx();
+ if (MFI.getStackID(FrameIdx) != TargetStackID::Default)
+ continue;
+
+ Size += MFI.getObjectSize(FrameIdx);
+ }
+ RVFI->setCalleeSavedStackSize(Size);
+
+ // Padding required to keep the RVV stack aligned to 8 bytes
+ // within the main stack. We only need this when not using FP.
+ if (RVVStackSize && !hasFP(MF) && Size % 8 != 0) {
+ // Because we add the padding to the size of the stack, adding
+ // getStackAlign() will keep it aligned.
+ RVFI->setRVVPadding(getStackAlign().value());
}
}
+static bool hasRVVFrameObject(const MachineFunction &MF) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I)
+ if (MFI.getStackID(I) == TargetStackID::ScalableVector)
+ return true;
+ return false;
+}
+
// Not preserve stack space within prologue for outgoing variables when the
-// function contains variable size objects and let eliminateCallFramePseudoInstr
-// preserve stack space for it.
+// function contains variable size objects or there are vector objects accessed
+// by the frame pointer.
+// Let eliminateCallFramePseudoInstr preserve stack space for it.
bool RISCVFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
- return !MF.getFrameInfo().hasVarSizedObjects();
+ return !MF.getFrameInfo().hasVarSizedObjects() &&
+ !(hasFP(MF) && hasRVVFrameObject(MF));
}
// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions.
@@ -786,7 +1035,7 @@
}
// Manually spill values not spilled by libcall.
- const auto &NonLibcallCSI = getNonLibcallCSI(CSI);
+ const auto &NonLibcallCSI = getNonLibcallCSI(*MF, CSI);
for (auto &CS : NonLibcallCSI) {
// Insert the spill to the stack frame.
Register Reg = CS.getReg();
@@ -811,7 +1060,7 @@
// Manually restore values not restored by libcall. Insert in reverse order.
// loadRegFromStackSlot can insert multiple instructions.
- const auto &NonLibcallCSI = getNonLibcallCSI(CSI);
+ const auto &NonLibcallCSI = getNonLibcallCSI(*MF, CSI);
for (auto &CS : reverse(NonLibcallCSI)) {
Register Reg = CS.getReg();
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
@@ -883,3 +1132,20 @@
// replacing the successor with our own tail return at the end of our block.
return SuccMBB->isReturnBlock() && SuccMBB->size() == 1;
}
+
+bool RISCVFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
+ switch (ID) {
+ case TargetStackID::Default:
+ case TargetStackID::ScalableVector:
+ return true;
+ case TargetStackID::NoAlloc:
+ case TargetStackID::SGPRSpill:
+ case TargetStackID::WasmLocal:
+ return false;
+ }
+ llvm_unreachable("Invalid TargetStackID::Value");
+}
+
+TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const {
+ return TargetStackID::ScalableVector;
+}
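
A standalone sketch (not the LLVM code) of the layout rule implemented by assignRVVStackObjectOffsets above: scalable-vector stack objects are laid out top-down, each rounded up to at least 8 bytes and kept 8-byte aligned, and they receive negative, vscale-scaled offsets.

#include <cstdint>
#include <cstdio>
#include <vector>

static int64_t alignTo8(int64_t X) { return (X + 7) & ~int64_t(7); }

// Returns the total scalable stack size; Offsets[i] is the (negative) offset
// assigned to the i-th RVV object of Sizes[i] bytes-per-vscale.
static int64_t assignRVVOffsets(const std::vector<int64_t> &Sizes,
                                std::vector<int64_t> &Offsets) {
  int64_t Offset = 0;
  for (int64_t Size : Sizes) {
    if (Size < 8)
      Size = 8;                  // fractional LMUL types still take a full slot
    Offset = alignTo8(Offset + Size);
    Offsets.push_back(-Offset);  // offsets grow downwards from the RVV base
  }
  return Offset;
}

int main() {
  std::vector<int64_t> Offs;
  std::printf("total = %lld\n",
              (long long)assignRVVOffsets({16, 4, 8}, Offs));
}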
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/src/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 889b9ce..bc3ace7 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -65,6 +65,9 @@
bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+ bool isSupportedStackID(TargetStackID::Value ID) const override;
+ TargetStackID::Value getStackIDForScalableVectors() const override;
+
protected:
const RISCVSubtarget &STI;
@@ -73,6 +76,10 @@
void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, Register DestReg, Register SrcReg,
int64_t Val, MachineInstr::MIFlag Flag) const;
+ void adjustStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ int64_t Amount, MachineInstr::MIFlag Flag) const;
+ int64_t assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const;
};
}
#endif
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 43bf16c..9866567 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -13,10 +13,13 @@
#include "RISCVISelDAGToDAG.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "MCTargetDesc/RISCVMatInt.h"
+#include "RISCVISelLowering.h"
+#include "RISCVMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -24,14 +27,95 @@
#define DEBUG_TYPE "riscv-isel"
+namespace llvm {
+namespace RISCV {
+#define GET_RISCVVSSEGTable_IMPL
+#define GET_RISCVVLSEGTable_IMPL
+#define GET_RISCVVLXSEGTable_IMPL
+#define GET_RISCVVSXSEGTable_IMPL
+#define GET_RISCVVLETable_IMPL
+#define GET_RISCVVSETable_IMPL
+#define GET_RISCVVLXTable_IMPL
+#define GET_RISCVVSXTable_IMPL
+#include "RISCVGenSearchableTables.inc"
+} // namespace RISCV
+} // namespace llvm
+
+void RISCVDAGToDAGISel::PreprocessISelDAG() {
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end();
+ I != E;) {
+ SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
+
+ // Lower SPLAT_VECTOR_SPLIT_I64 to two scalar stores and a stride 0 vector
+ // load. Done after lowering and combining so that we have a chance to
+ // optimize this to VMV_V_X_VL when the upper bits aren't needed.
+ if (N->getOpcode() != RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL)
+ continue;
+
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands");
+ MVT VT = N->getSimpleValueType(0);
+ SDValue Lo = N->getOperand(0);
+ SDValue Hi = N->getOperand(1);
+ SDValue VL = N->getOperand(2);
+ assert(VT.getVectorElementType() == MVT::i64 && VT.isScalableVector() &&
+ Lo.getValueType() == MVT::i32 && Hi.getValueType() == MVT::i32 &&
+ "Unexpected VTs!");
+ MachineFunction &MF = CurDAG->getMachineFunction();
+ RISCVMachineFunctionInfo *FuncInfo = MF.getInfo<RISCVMachineFunctionInfo>();
+ SDLoc DL(N);
+
+ // We use the same frame index we use for moving two i32s into 64-bit FPR.
+ // This is an analogous operation.
+ int FI = FuncInfo->getMoveF64FrameIndex(MF);
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
+ const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
+ SDValue StackSlot =
+ CurDAG->getFrameIndex(FI, TLI.getPointerTy(CurDAG->getDataLayout()));
+
+ SDValue Chain = CurDAG->getEntryNode();
+ Lo = CurDAG->getStore(Chain, DL, Lo, StackSlot, MPI, Align(8));
+
+ SDValue OffsetSlot =
+ CurDAG->getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), DL);
+ Hi = CurDAG->getStore(Chain, DL, Hi, OffsetSlot, MPI.getWithOffset(4),
+ Align(8));
+
+ Chain = CurDAG->getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+
+ SDVTList VTs = CurDAG->getVTList({VT, MVT::Other});
+ SDValue IntID =
+ CurDAG->getTargetConstant(Intrinsic::riscv_vlse, DL, MVT::i64);
+ SDValue Ops[] = {Chain, IntID, StackSlot,
+ CurDAG->getRegister(RISCV::X0, MVT::i64), VL};
+
+ SDValue Result = CurDAG->getMemIntrinsicNode(
+ ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MVT::i64, MPI, Align(8),
+ MachineMemOperand::MOLoad);
+
+ // We're about to replace all uses of the SPLAT_VECTOR_SPLIT_I64 with the
+ // vlse we created. This will cause general havoc on the DAG because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+
+ // Now that we did that, the node is dead. Increment the iterator to the
+ // next node to process, then delete N.
+ ++I;
+ CurDAG->DeleteNode(N);
+ }
+}
+
void RISCVDAGToDAGISel::PostprocessISelDAG() {
doPeepholeLoadStoreADDI();
}
static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm,
- MVT XLenVT) {
- RISCVMatInt::InstSeq Seq;
- RISCVMatInt::generateInstSeq(Imm, XLenVT == MVT::i64, Seq);
+ const RISCVSubtarget &Subtarget) {
+ MVT XLenVT = Subtarget.getXLenVT();
+ RISCVMatInt::InstSeq Seq =
+ RISCVMatInt::generateInstSeq(Imm, Subtarget.getFeatureBits());
SDNode *Result = nullptr;
SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT);
@@ -39,6 +123,9 @@
SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT);
if (Inst.Opc == RISCV::LUI)
Result = CurDAG->getMachineNode(RISCV::LUI, DL, XLenVT, SDImm);
+ else if (Inst.Opc == RISCV::ADDUW)
+ Result = CurDAG->getMachineNode(RISCV::ADDUW, DL, XLenVT, SrcReg,
+ CurDAG->getRegister(RISCV::X0, XLenVT));
else
Result = CurDAG->getMachineNode(Inst.Opc, DL, XLenVT, SrcReg, SDImm);
@@ -49,46 +136,6 @@
return Result;
}
-static RISCVVLMUL getLMUL(EVT VT) {
- switch (VT.getSizeInBits().getKnownMinValue() / 8) {
- default:
- llvm_unreachable("Invalid LMUL.");
- case 1:
- return RISCVVLMUL::LMUL_F8;
- case 2:
- return RISCVVLMUL::LMUL_F4;
- case 4:
- return RISCVVLMUL::LMUL_F2;
- case 8:
- return RISCVVLMUL::LMUL_1;
- case 16:
- return RISCVVLMUL::LMUL_2;
- case 32:
- return RISCVVLMUL::LMUL_4;
- case 64:
- return RISCVVLMUL::LMUL_8;
- }
-}
-
-static unsigned getSubregIndexByEVT(EVT VT, unsigned Index) {
- RISCVVLMUL LMUL = getLMUL(VT);
- if (LMUL == RISCVVLMUL::LMUL_F8 || LMUL == RISCVVLMUL::LMUL_F4 ||
- LMUL == RISCVVLMUL::LMUL_F2 || LMUL == RISCVVLMUL::LMUL_1) {
- static_assert(RISCV::sub_vrm1_7 == RISCV::sub_vrm1_0 + 7,
- "Unexpected subreg numbering");
- return RISCV::sub_vrm1_0 + Index;
- } else if (LMUL == RISCVVLMUL::LMUL_2) {
- static_assert(RISCV::sub_vrm2_3 == RISCV::sub_vrm2_0 + 3,
- "Unexpected subreg numbering");
- return RISCV::sub_vrm2_0 + Index;
- } else if (LMUL == RISCVVLMUL::LMUL_4) {
- static_assert(RISCV::sub_vrm4_1 == RISCV::sub_vrm4_0 + 1,
- "Unexpected subreg numbering");
- return RISCV::sub_vrm4_0 + Index;
- }
- llvm_unreachable("Invalid vector type.");
-}
-
static SDValue createTupleImpl(SelectionDAG &CurDAG, ArrayRef<SDValue> Regs,
unsigned RegClassID, unsigned SubReg0) {
assert(Regs.size() >= 2 && Regs.size() <= 8);
@@ -133,356 +180,260 @@
}
static SDValue createTuple(SelectionDAG &CurDAG, ArrayRef<SDValue> Regs,
- unsigned NF, RISCVVLMUL LMUL) {
+ unsigned NF, RISCVII::VLMUL LMUL) {
switch (LMUL) {
default:
llvm_unreachable("Invalid LMUL.");
- case RISCVVLMUL::LMUL_F8:
- case RISCVVLMUL::LMUL_F4:
- case RISCVVLMUL::LMUL_F2:
- case RISCVVLMUL::LMUL_1:
+ case RISCVII::VLMUL::LMUL_F8:
+ case RISCVII::VLMUL::LMUL_F4:
+ case RISCVII::VLMUL::LMUL_F2:
+ case RISCVII::VLMUL::LMUL_1:
return createM1Tuple(CurDAG, Regs, NF);
- case RISCVVLMUL::LMUL_2:
+ case RISCVII::VLMUL::LMUL_2:
return createM2Tuple(CurDAG, Regs, NF);
- case RISCVVLMUL::LMUL_4:
+ case RISCVII::VLMUL::LMUL_4:
return createM4Tuple(CurDAG, Regs, NF);
}
}
-void RISCVDAGToDAGISel::selectVLSEG(SDNode *Node, unsigned IntNo,
+void RISCVDAGToDAGISel::addVectorLoadStoreOperands(
+ SDNode *Node, unsigned Log2SEW, const SDLoc &DL, unsigned CurOp,
+ bool IsMasked, bool IsStridedOrIndexed, SmallVectorImpl<SDValue> &Operands,
+ MVT *IndexVT) {
+ SDValue Chain = Node->getOperand(0);
+ SDValue Glue;
+
+ SDValue Base;
+ SelectBaseAddr(Node->getOperand(CurOp++), Base);
+ Operands.push_back(Base); // Base pointer.
+
+ if (IsStridedOrIndexed) {
+ Operands.push_back(Node->getOperand(CurOp++)); // Index.
+ if (IndexVT)
+ *IndexVT = Operands.back()->getSimpleValueType(0);
+ }
+
+ if (IsMasked) {
+ // Mask needs to be copied to V0.
+ SDValue Mask = Node->getOperand(CurOp++);
+ Chain = CurDAG->getCopyToReg(Chain, DL, RISCV::V0, Mask, SDValue());
+ Glue = Chain.getValue(1);
+ Operands.push_back(CurDAG->getRegister(RISCV::V0, Mask.getValueType()));
+ }
+ SDValue VL;
+ selectVLOp(Node->getOperand(CurOp++), VL);
+ Operands.push_back(VL);
+
+ MVT XLenVT = Subtarget->getXLenVT();
+ SDValue SEWOp = CurDAG->getTargetConstant(Log2SEW, DL, XLenVT);
+ Operands.push_back(SEWOp);
+
+ Operands.push_back(Chain); // Chain.
+ if (Glue)
+ Operands.push_back(Glue);
+}
+
+void RISCVDAGToDAGISel::selectVLSEG(SDNode *Node, bool IsMasked,
bool IsStrided) {
SDLoc DL(Node);
unsigned NF = Node->getNumValues() - 1;
- EVT VT = Node->getValueType(0);
- unsigned ScalarSize = VT.getScalarSizeInBits();
- MVT XLenVT = Subtarget->getXLenVT();
- RISCVVLMUL LMUL = getLMUL(VT);
- SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
- SmallVector<SDValue, 5> Operands;
- Operands.push_back(Node->getOperand(2)); // Base pointer.
- if (IsStrided) {
- Operands.push_back(Node->getOperand(3)); // Stride.
- Operands.push_back(Node->getOperand(4)); // VL.
- } else {
- Operands.push_back(Node->getOperand(3)); // VL.
+ MVT VT = Node->getSimpleValueType(0);
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+ RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
+
+ unsigned CurOp = 2;
+ SmallVector<SDValue, 8> Operands;
+ if (IsMasked) {
+ SmallVector<SDValue, 8> Regs(Node->op_begin() + CurOp,
+ Node->op_begin() + CurOp + NF);
+ SDValue MaskedOff = createTuple(*CurDAG, Regs, NF, LMUL);
+ Operands.push_back(MaskedOff);
+ CurOp += NF;
}
- Operands.push_back(SEW);
- Operands.push_back(Node->getOperand(0)); // Chain.
- const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
- IntNo, ScalarSize, static_cast<unsigned>(LMUL),
- static_cast<unsigned>(RISCVVLMUL::LMUL_1));
- SDNode *Load =
+
+ addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked, IsStrided,
+ Operands);
+
+ const RISCV::VLSEGPseudo *P =
+ RISCV::getVLSEGPseudo(NF, IsMasked, IsStrided, /*FF*/ false, Log2SEW,
+ static_cast<unsigned>(LMUL));
+ MachineSDNode *Load =
CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other, Operands);
+
+ if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+ CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()});
+
SDValue SuperReg = SDValue(Load, 0);
- for (unsigned I = 0; I < NF; ++I)
+ for (unsigned I = 0; I < NF; ++I) {
+ unsigned SubRegIdx = RISCVTargetLowering::getSubregIndexByMVT(VT, I);
ReplaceUses(SDValue(Node, I),
- CurDAG->getTargetExtractSubreg(getSubregIndexByEVT(VT, I), DL,
- VT, SuperReg));
-
- ReplaceUses(SDValue(Node, NF), SDValue(Load, 1));
- CurDAG->RemoveDeadNode(Node);
-}
-
-void RISCVDAGToDAGISel::selectVLSEGMask(SDNode *Node, unsigned IntNo,
- bool IsStrided) {
- SDLoc DL(Node);
- unsigned NF = Node->getNumValues() - 1;
- EVT VT = Node->getValueType(0);
- unsigned ScalarSize = VT.getScalarSizeInBits();
- MVT XLenVT = Subtarget->getXLenVT();
- RISCVVLMUL LMUL = getLMUL(VT);
- SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
- SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
- SDValue MaskedOff = createTuple(*CurDAG, Regs, NF, LMUL);
- SmallVector<SDValue, 7> Operands;
- Operands.push_back(MaskedOff);
- Operands.push_back(Node->getOperand(NF + 2)); // Base pointer.
- if (IsStrided) {
- Operands.push_back(Node->getOperand(NF + 3)); // Stride.
- Operands.push_back(Node->getOperand(NF + 4)); // Mask.
- Operands.push_back(Node->getOperand(NF + 5)); // VL.
- } else {
- Operands.push_back(Node->getOperand(NF + 3)); // Mask.
- Operands.push_back(Node->getOperand(NF + 4)); // VL.
+ CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, SuperReg));
}
- Operands.push_back(SEW);
- Operands.push_back(Node->getOperand(0)); /// Chain.
- const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
- IntNo, ScalarSize, static_cast<unsigned>(LMUL),
- static_cast<unsigned>(RISCVVLMUL::LMUL_1));
- SDNode *Load =
- CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other, Operands);
- SDValue SuperReg = SDValue(Load, 0);
- for (unsigned I = 0; I < NF; ++I)
- ReplaceUses(SDValue(Node, I),
- CurDAG->getTargetExtractSubreg(getSubregIndexByEVT(VT, I), DL,
- VT, SuperReg));
ReplaceUses(SDValue(Node, NF), SDValue(Load, 1));
CurDAG->RemoveDeadNode(Node);
}
-void RISCVDAGToDAGISel::selectVLSEGFF(SDNode *Node) {
+void RISCVDAGToDAGISel::selectVLSEGFF(SDNode *Node, bool IsMasked) {
SDLoc DL(Node);
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
- unsigned NF = Node->getNumValues() - 2; // Do not count Chain and Glue.
- EVT VT = Node->getValueType(0);
- unsigned ScalarSize = VT.getScalarSizeInBits();
+ unsigned NF = Node->getNumValues() - 2; // Do not count VL and Chain.
+ MVT VT = Node->getSimpleValueType(0);
MVT XLenVT = Subtarget->getXLenVT();
- RISCVVLMUL LMUL = getLMUL(VT);
- SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
- SmallVector<SDValue, 5> Operands;
- Operands.push_back(Node->getOperand(2)); // Base pointer.
- Operands.push_back(Node->getOperand(3)); // VL.
- Operands.push_back(SEW);
- Operands.push_back(Node->getOperand(0)); // Chain.
- const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
- IntNo, ScalarSize, static_cast<unsigned>(LMUL),
- static_cast<unsigned>(RISCVVLMUL::LMUL_1));
- SDNode *Load = CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other,
- MVT::Glue, Operands);
- SDValue SuperReg = SDValue(Load, 0);
- for (unsigned I = 0; I < NF; ++I)
- ReplaceUses(SDValue(Node, I),
- CurDAG->getTargetExtractSubreg(getSubregIndexByEVT(VT, I), DL,
- VT, SuperReg));
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+ RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
- ReplaceUses(SDValue(Node, NF), SDValue(Load, 1)); // Chain.
- ReplaceUses(SDValue(Node, NF + 1), SDValue(Load, 2)); // Glue.
- CurDAG->RemoveDeadNode(Node);
-}
-
-void RISCVDAGToDAGISel::selectVLSEGFFMask(SDNode *Node) {
- SDLoc DL(Node);
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
- unsigned NF = Node->getNumValues() - 2; // Do not count Chain and Glue.
- EVT VT = Node->getValueType(0);
- unsigned ScalarSize = VT.getScalarSizeInBits();
- MVT XLenVT = Subtarget->getXLenVT();
- RISCVVLMUL LMUL = getLMUL(VT);
- SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
- SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
- SDValue MaskedOff = createTuple(*CurDAG, Regs, NF, LMUL);
+ unsigned CurOp = 2;
SmallVector<SDValue, 7> Operands;
- Operands.push_back(MaskedOff);
- Operands.push_back(Node->getOperand(NF + 2)); // Base pointer.
- Operands.push_back(Node->getOperand(NF + 3)); // Mask.
- Operands.push_back(Node->getOperand(NF + 4)); // VL.
- Operands.push_back(SEW);
- Operands.push_back(Node->getOperand(0)); /// Chain.
- const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
- IntNo, ScalarSize, static_cast<unsigned>(LMUL),
- static_cast<unsigned>(RISCVVLMUL::LMUL_1));
- SDNode *Load = CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other,
- MVT::Glue, Operands);
- SDValue SuperReg = SDValue(Load, 0);
- for (unsigned I = 0; I < NF; ++I)
- ReplaceUses(SDValue(Node, I),
- CurDAG->getTargetExtractSubreg(getSubregIndexByEVT(VT, I), DL,
- VT, SuperReg));
+ if (IsMasked) {
+ SmallVector<SDValue, 8> Regs(Node->op_begin() + CurOp,
+ Node->op_begin() + CurOp + NF);
+ SDValue MaskedOff = createTuple(*CurDAG, Regs, NF, LMUL);
+ Operands.push_back(MaskedOff);
+ CurOp += NF;
+ }
- ReplaceUses(SDValue(Node, NF), SDValue(Load, 1)); // Chain.
- ReplaceUses(SDValue(Node, NF + 1), SDValue(Load, 2)); // Glue.
+ addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
+ /*IsStridedOrIndexed*/ false, Operands);
+
+ const RISCV::VLSEGPseudo *P =
+ RISCV::getVLSEGPseudo(NF, IsMasked, /*Strided*/ false, /*FF*/ true,
+ Log2SEW, static_cast<unsigned>(LMUL));
+ MachineSDNode *Load = CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped,
+ MVT::Other, MVT::Glue, Operands);
+ SDNode *ReadVL = CurDAG->getMachineNode(RISCV::PseudoReadVL, DL, XLenVT,
+ /*Glue*/ SDValue(Load, 2));
+
+ if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+ CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()});
+
+ SDValue SuperReg = SDValue(Load, 0);
+ for (unsigned I = 0; I < NF; ++I) {
+ unsigned SubRegIdx = RISCVTargetLowering::getSubregIndexByMVT(VT, I);
+ ReplaceUses(SDValue(Node, I),
+ CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, SuperReg));
+ }
+
+ ReplaceUses(SDValue(Node, NF), SDValue(ReadVL, 0)); // VL
+ ReplaceUses(SDValue(Node, NF + 1), SDValue(Load, 1)); // Chain
CurDAG->RemoveDeadNode(Node);
}
-void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, unsigned IntNo) {
+void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, bool IsMasked,
+ bool IsOrdered) {
SDLoc DL(Node);
unsigned NF = Node->getNumValues() - 1;
- EVT VT = Node->getValueType(0);
- unsigned ScalarSize = VT.getScalarSizeInBits();
- MVT XLenVT = Subtarget->getXLenVT();
- RISCVVLMUL LMUL = getLMUL(VT);
- SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
- SDValue Operands[] = {
- Node->getOperand(2), // Base pointer.
- Node->getOperand(3), // Index.
- Node->getOperand(4), // VL.
- SEW, Node->getOperand(0) // Chain.
- };
+ MVT VT = Node->getSimpleValueType(0);
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+ RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
- EVT IndexVT = Node->getOperand(3)->getValueType(0);
- RISCVVLMUL IndexLMUL = getLMUL(IndexVT);
- unsigned IndexScalarSize = IndexVT.getScalarSizeInBits();
- const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
- IntNo, IndexScalarSize, static_cast<unsigned>(LMUL),
+ unsigned CurOp = 2;
+ SmallVector<SDValue, 8> Operands;
+ if (IsMasked) {
+ SmallVector<SDValue, 8> Regs(Node->op_begin() + CurOp,
+ Node->op_begin() + CurOp + NF);
+ SDValue MaskedOff = createTuple(*CurDAG, Regs, NF, LMUL);
+ Operands.push_back(MaskedOff);
+ CurOp += NF;
+ }
+
+ MVT IndexVT;
+ addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
+ /*IsStridedOrIndexed*/ true, Operands, &IndexVT);
+
+ assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
+ "Element count mismatch");
+
+ RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT);
+ unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits());
+ const RISCV::VLXSEGPseudo *P = RISCV::getVLXSEGPseudo(
+ NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL),
static_cast<unsigned>(IndexLMUL));
- SDNode *Load =
+ MachineSDNode *Load =
CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other, Operands);
+
+ if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+ CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()});
+
SDValue SuperReg = SDValue(Load, 0);
- for (unsigned I = 0; I < NF; ++I)
+ for (unsigned I = 0; I < NF; ++I) {
+ unsigned SubRegIdx = RISCVTargetLowering::getSubregIndexByMVT(VT, I);
ReplaceUses(SDValue(Node, I),
- CurDAG->getTargetExtractSubreg(getSubregIndexByEVT(VT, I), DL,
- VT, SuperReg));
+ CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, SuperReg));
+ }
ReplaceUses(SDValue(Node, NF), SDValue(Load, 1));
CurDAG->RemoveDeadNode(Node);
}
-void RISCVDAGToDAGISel::selectVLXSEGMask(SDNode *Node, unsigned IntNo) {
- SDLoc DL(Node);
- unsigned NF = Node->getNumValues() - 1;
- EVT VT = Node->getValueType(0);
- unsigned ScalarSize = VT.getScalarSizeInBits();
- MVT XLenVT = Subtarget->getXLenVT();
- RISCVVLMUL LMUL = getLMUL(VT);
- SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
- SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
- SDValue MaskedOff = createTuple(*CurDAG, Regs, NF, LMUL);
- SDValue Operands[] = {
- MaskedOff,
- Node->getOperand(NF + 2), // Base pointer.
- Node->getOperand(NF + 3), // Index.
- Node->getOperand(NF + 4), // Mask.
- Node->getOperand(NF + 5), // VL.
- SEW,
- Node->getOperand(0) // Chain.
- };
-
- EVT IndexVT = Node->getOperand(NF + 3)->getValueType(0);
- RISCVVLMUL IndexLMUL = getLMUL(IndexVT);
- unsigned IndexScalarSize = IndexVT.getScalarSizeInBits();
- const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
- IntNo, IndexScalarSize, static_cast<unsigned>(LMUL),
- static_cast<unsigned>(IndexLMUL));
- SDNode *Load =
- CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other, Operands);
- SDValue SuperReg = SDValue(Load, 0);
- for (unsigned I = 0; I < NF; ++I)
- ReplaceUses(SDValue(Node, I),
- CurDAG->getTargetExtractSubreg(getSubregIndexByEVT(VT, I), DL,
- VT, SuperReg));
-
- ReplaceUses(SDValue(Node, NF), SDValue(Load, 1));
- CurDAG->RemoveDeadNode(Node);
-}
-
-void RISCVDAGToDAGISel::selectVSSEG(SDNode *Node, unsigned IntNo,
+void RISCVDAGToDAGISel::selectVSSEG(SDNode *Node, bool IsMasked,
bool IsStrided) {
SDLoc DL(Node);
unsigned NF = Node->getNumOperands() - 4;
if (IsStrided)
NF--;
- EVT VT = Node->getOperand(2)->getValueType(0);
- unsigned ScalarSize = VT.getScalarSizeInBits();
- MVT XLenVT = Subtarget->getXLenVT();
- RISCVVLMUL LMUL = getLMUL(VT);
- SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
- SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
- SDValue StoreVal = createTuple(*CurDAG, Regs, NF, LMUL);
- SmallVector<SDValue, 6> Operands;
- Operands.push_back(StoreVal);
- Operands.push_back(Node->getOperand(2 + NF)); // Base pointer.
- if (IsStrided) {
- Operands.push_back(Node->getOperand(3 + NF)); // Stride.
- Operands.push_back(Node->getOperand(4 + NF)); // VL.
- } else {
- Operands.push_back(Node->getOperand(3 + NF)); // VL.
- }
- Operands.push_back(SEW);
- Operands.push_back(Node->getOperand(0)); // Chain.
- const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
- IntNo, ScalarSize, static_cast<unsigned>(LMUL),
- static_cast<unsigned>(RISCVVLMUL::LMUL_1));
- SDNode *Store =
- CurDAG->getMachineNode(P->Pseudo, DL, Node->getValueType(0), Operands);
- ReplaceNode(Node, Store);
-}
-
-void RISCVDAGToDAGISel::selectVSSEGMask(SDNode *Node, unsigned IntNo,
- bool IsStrided) {
- SDLoc DL(Node);
- unsigned NF = Node->getNumOperands() - 5;
- if (IsStrided)
+ if (IsMasked)
NF--;
- EVT VT = Node->getOperand(2)->getValueType(0);
- unsigned ScalarSize = VT.getScalarSizeInBits();
- MVT XLenVT = Subtarget->getXLenVT();
- RISCVVLMUL LMUL = getLMUL(VT);
- SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+ MVT VT = Node->getOperand(2)->getSimpleValueType(0);
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+ RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
SDValue StoreVal = createTuple(*CurDAG, Regs, NF, LMUL);
- SmallVector<SDValue, 7> Operands;
+
+ SmallVector<SDValue, 8> Operands;
Operands.push_back(StoreVal);
- Operands.push_back(Node->getOperand(2 + NF)); // Base pointer.
- if (IsStrided) {
- Operands.push_back(Node->getOperand(3 + NF)); // Stride.
- Operands.push_back(Node->getOperand(4 + NF)); // Mask.
- Operands.push_back(Node->getOperand(5 + NF)); // VL.
- } else {
- Operands.push_back(Node->getOperand(3 + NF)); // Mask.
- Operands.push_back(Node->getOperand(4 + NF)); // VL.
- }
- Operands.push_back(SEW);
- Operands.push_back(Node->getOperand(0)); // Chain.
- const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
- IntNo, ScalarSize, static_cast<unsigned>(LMUL),
- static_cast<unsigned>(RISCVVLMUL::LMUL_1));
- SDNode *Store =
+ unsigned CurOp = 2 + NF;
+
+ addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked, IsStrided,
+ Operands);
+
+ const RISCV::VSSEGPseudo *P = RISCV::getVSSEGPseudo(
+ NF, IsMasked, IsStrided, Log2SEW, static_cast<unsigned>(LMUL));
+ MachineSDNode *Store =
CurDAG->getMachineNode(P->Pseudo, DL, Node->getValueType(0), Operands);
+
+ if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+ CurDAG->setNodeMemRefs(Store, {MemOp->getMemOperand()});
+
ReplaceNode(Node, Store);
}
-void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned IntNo) {
+void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, bool IsMasked,
+ bool IsOrdered) {
SDLoc DL(Node);
unsigned NF = Node->getNumOperands() - 5;
- EVT VT = Node->getOperand(2)->getValueType(0);
- unsigned ScalarSize = VT.getScalarSizeInBits();
- MVT XLenVT = Subtarget->getXLenVT();
- RISCVVLMUL LMUL = getLMUL(VT);
- SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+ if (IsMasked)
+ --NF;
+ MVT VT = Node->getOperand(2)->getSimpleValueType(0);
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+ RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
SDValue StoreVal = createTuple(*CurDAG, Regs, NF, LMUL);
- SDValue Operands[] = {
- StoreVal,
- Node->getOperand(2 + NF), // Base pointer.
- Node->getOperand(3 + NF), // Index.
- Node->getOperand(4 + NF), // VL.
- SEW,
- Node->getOperand(0) // Chain.
- };
- EVT IndexVT = Node->getOperand(3 + NF)->getValueType(0);
- RISCVVLMUL IndexLMUL = getLMUL(IndexVT);
- unsigned IndexScalarSize = IndexVT.getScalarSizeInBits();
- const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
- IntNo, IndexScalarSize, static_cast<unsigned>(LMUL),
+ SmallVector<SDValue, 8> Operands;
+ Operands.push_back(StoreVal);
+ unsigned CurOp = 2 + NF;
+
+ MVT IndexVT;
+ addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
+ /*IsStridedOrIndexed*/ true, Operands, &IndexVT);
+
+ assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
+ "Element count mismatch");
+
+ RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT);
+ unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits());
+ const RISCV::VSXSEGPseudo *P = RISCV::getVSXSEGPseudo(
+ NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL),
static_cast<unsigned>(IndexLMUL));
- SDNode *Store =
+ MachineSDNode *Store =
CurDAG->getMachineNode(P->Pseudo, DL, Node->getValueType(0), Operands);
+
+ if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+ CurDAG->setNodeMemRefs(Store, {MemOp->getMemOperand()});
+
ReplaceNode(Node, Store);
}
-void RISCVDAGToDAGISel::selectVSXSEGMask(SDNode *Node, unsigned IntNo) {
- SDLoc DL(Node);
- unsigned NF = Node->getNumOperands() - 6;
- EVT VT = Node->getOperand(2)->getValueType(0);
- unsigned ScalarSize = VT.getScalarSizeInBits();
- MVT XLenVT = Subtarget->getXLenVT();
- RISCVVLMUL LMUL = getLMUL(VT);
- SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
- SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
- SDValue StoreVal = createTuple(*CurDAG, Regs, NF, LMUL);
- SDValue Operands[] = {
- StoreVal,
- Node->getOperand(2 + NF), // Base pointer.
- Node->getOperand(3 + NF), // Index.
- Node->getOperand(4 + NF), // Mask.
- Node->getOperand(5 + NF), // VL.
- SEW,
- Node->getOperand(0) // Chain.
- };
-
- EVT IndexVT = Node->getOperand(3 + NF)->getValueType(0);
- RISCVVLMUL IndexLMUL = getLMUL(IndexVT);
- unsigned IndexScalarSize = IndexVT.getScalarSizeInBits();
- const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
- IntNo, IndexScalarSize, static_cast<unsigned>(LMUL),
- static_cast<unsigned>(IndexLMUL));
- SDNode *Store =
- CurDAG->getMachineNode(P->Pseudo, DL, Node->getValueType(0), Operands);
- ReplaceNode(Node, Store);
-}
void RISCVDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we have already selected.
@@ -497,46 +448,20 @@
unsigned Opcode = Node->getOpcode();
MVT XLenVT = Subtarget->getXLenVT();
SDLoc DL(Node);
- EVT VT = Node->getValueType(0);
+ MVT VT = Node->getSimpleValueType(0);
switch (Opcode) {
- case ISD::ADD: {
- // Optimize (add r, imm) to (addi (addi r, imm0) imm1) if applicable. The
- // immediate must be in specific ranges and have a single use.
- if (auto *ConstOp = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
- if (!(ConstOp->hasOneUse()))
- break;
- // The imm must be in range [-4096,-2049] or [2048,4094].
- int64_t Imm = ConstOp->getSExtValue();
- if (!(-4096 <= Imm && Imm <= -2049) && !(2048 <= Imm && Imm <= 4094))
- break;
- // Break the imm to imm0+imm1.
- EVT VT = Node->getValueType(0);
- const SDValue ImmOp0 = CurDAG->getTargetConstant(Imm - Imm / 2, DL, VT);
- const SDValue ImmOp1 = CurDAG->getTargetConstant(Imm / 2, DL, VT);
- auto *NodeAddi0 = CurDAG->getMachineNode(RISCV::ADDI, DL, VT,
- Node->getOperand(0), ImmOp0);
- auto *NodeAddi1 = CurDAG->getMachineNode(RISCV::ADDI, DL, VT,
- SDValue(NodeAddi0, 0), ImmOp1);
- ReplaceNode(Node, NodeAddi1);
- return;
- }
- break;
- }
case ISD::Constant: {
- auto ConstNode = cast<ConstantSDNode>(Node);
+ auto *ConstNode = cast<ConstantSDNode>(Node);
if (VT == XLenVT && ConstNode->isNullValue()) {
SDValue New =
CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, RISCV::X0, XLenVT);
ReplaceNode(Node, New.getNode());
return;
}
- int64_t Imm = ConstNode->getSExtValue();
- if (XLenVT == MVT::i64) {
- ReplaceNode(Node, selectImm(CurDAG, DL, Imm, XLenVT));
- return;
- }
- break;
+ ReplaceNode(Node,
+ selectImm(CurDAG, DL, ConstNode->getSExtValue(), *Subtarget));
+ return;
}
case ISD::FrameIndex: {
SDValue Imm = CurDAG->getTargetConstant(0, DL, XLenVT);
@@ -545,6 +470,367 @@
ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm));
return;
}
+ case ISD::SRL: {
+ // We don't need this transform if zext.h is supported.
+ if (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp())
+ break;
+ // Optimize (srl (and X, 0xffff), C) ->
+ // (srli (slli X, (XLen-16)), (XLen-16) + C)
+ // Taking into account that the 0xffff may have had lower bits unset by
+ // SimplifyDemandedBits. This avoids materializing the 0xffff immediate.
+ // This pattern occurs when type legalizing i16 right shifts.
+ // FIXME: This could be extended to other AND masks.
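+ // For example, on RV64 (srl (and X, 0xffff), 4) becomes
+ // (srli (slli X, 48), 52), i.e. bits 15:4 of X zero-extended.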
+ auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+ if (N1C) {
+ uint64_t ShAmt = N1C->getZExtValue();
+ SDValue N0 = Node->getOperand(0);
+ if (ShAmt < 16 && N0.getOpcode() == ISD::AND && N0.hasOneUse() &&
+ isa<ConstantSDNode>(N0.getOperand(1))) {
+ uint64_t Mask = N0.getConstantOperandVal(1);
+ Mask |= maskTrailingOnes<uint64_t>(ShAmt);
+ if (Mask == 0xffff) {
+ unsigned LShAmt = Subtarget->getXLen() - 16;
+ SDNode *SLLI =
+ CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0),
+ CurDAG->getTargetConstant(LShAmt, DL, VT));
+ SDNode *SRLI = CurDAG->getMachineNode(
+ RISCV::SRLI, DL, VT, SDValue(SLLI, 0),
+ CurDAG->getTargetConstant(LShAmt + ShAmt, DL, VT));
+ ReplaceNode(Node, SRLI);
+ return;
+ }
+ }
+ }
+
+ break;
+ }
+ case ISD::AND: {
+ auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+ if (!N1C)
+ break;
+
+ SDValue N0 = Node->getOperand(0);
+
+ bool LeftShift = N0.getOpcode() == ISD::SHL;
+ if (!LeftShift && N0.getOpcode() != ISD::SRL)
+ break;
+
+ auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!C)
+ break;
+ uint64_t C2 = C->getZExtValue();
+ unsigned XLen = Subtarget->getXLen();
+ if (!C2 || C2 >= XLen)
+ break;
+
+ uint64_t C1 = N1C->getZExtValue();
+
+ // Keep track of whether this is an andi, zext.h, or zext.w.
+ bool ZExtOrANDI = isInt<12>(N1C->getSExtValue());
+ if (C1 == UINT64_C(0xFFFF) &&
+ (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp()))
+ ZExtOrANDI = true;
+ if (C1 == UINT64_C(0xFFFFFFFF) && Subtarget->hasStdExtZba())
+ ZExtOrANDI = true;
+
+ // Clear irrelevant bits in the mask.
+ if (LeftShift)
+ C1 &= maskTrailingZeros<uint64_t>(C2);
+ else
+ C1 &= maskTrailingOnes<uint64_t>(XLen - C2);
+
+ // Some transforms should only be done if the shift has a single use or
+ // the AND would become (srli (slli X, 32), 32).
+ bool OneUseOrZExtW = N0.hasOneUse() || C1 == UINT64_C(0xFFFFFFFF);
+
+ SDValue X = N0.getOperand(0);
+
+ // Turn (and (srl x, c2) c1) -> (srli (slli x, c3-c2), c3) if c1 is a mask
+ // with c3 leading zeros.
+ if (!LeftShift && isMask_64(C1)) {
+ uint64_t C3 = XLen - (64 - countLeadingZeros(C1));
+ if (C2 < C3) {
+ // If the number of leading zeros is C2+32 this can be SRLIW.
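+ // For example, (and (srl x, 2), 0x3fffffff) has c2 = 2 and c3 = 34, so it
+ // selects to (srliw x, 2).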
+ if (C2 + 32 == C3) {
+ SDNode *SRLIW =
+ CurDAG->getMachineNode(RISCV::SRLIW, DL, XLenVT, X,
+ CurDAG->getTargetConstant(C2, DL, XLenVT));
+ ReplaceNode(Node, SRLIW);
+ return;
+ }
+
+ // (and (srl (sexti32 Y), c2), c1) -> (srliw (sraiw Y, 31), c3 - 32) if
+ // c1 is a mask with c3 leading zeros and c2 >= 32 and c3-c2==1.
+ //
+ // This pattern occurs when (i32 (srl (sra Y, 31), c3 - 32)) is type
+ // legalized and goes through DAG combine.
+ SDValue Y;
+ if (C2 >= 32 && (C3 - C2) == 1 && N0.hasOneUse() &&
+ selectSExti32(X, Y)) {
+ SDNode *SRAIW =
+ CurDAG->getMachineNode(RISCV::SRAIW, DL, XLenVT, Y,
+ CurDAG->getTargetConstant(31, DL, XLenVT));
+ SDNode *SRLIW = CurDAG->getMachineNode(
+ RISCV::SRLIW, DL, XLenVT, SDValue(SRAIW, 0),
+ CurDAG->getTargetConstant(C3 - 32, DL, XLenVT));
+ ReplaceNode(Node, SRLIW);
+ return;
+ }
+
+ // (srli (slli x, c3-c2), c3).
+ if (OneUseOrZExtW && !ZExtOrANDI) {
+ SDNode *SLLI = CurDAG->getMachineNode(
+ RISCV::SLLI, DL, XLenVT, X,
+ CurDAG->getTargetConstant(C3 - C2, DL, XLenVT));
+ SDNode *SRLI =
+ CurDAG->getMachineNode(RISCV::SRLI, DL, XLenVT, SDValue(SLLI, 0),
+ CurDAG->getTargetConstant(C3, DL, XLenVT));
+ ReplaceNode(Node, SRLI);
+ return;
+ }
+ }
+ }
+
+ // Turn (and (shl x, c2) c1) -> (srli (slli x, c2+c3), c3) if c1 is a mask
+ // shifted by c2 bits with c3 leading zeros.
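+ // For example, (and (shl x, 4), 0xff0) has c2 = 4 and c3 = 52, giving
+ // (srli (slli x, 56), 52).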
+ if (LeftShift && isShiftedMask_64(C1)) {
+ uint64_t C3 = XLen - (64 - countLeadingZeros(C1));
+
+ if (C2 + C3 < XLen &&
+ C1 == (maskTrailingOnes<uint64_t>(XLen - (C2 + C3)) << C2)) {
+ // Use slli.uw when possible.
+ if ((XLen - (C2 + C3)) == 32 && Subtarget->hasStdExtZba()) {
+ SDNode *SLLIUW =
+ CurDAG->getMachineNode(RISCV::SLLIUW, DL, XLenVT, X,
+ CurDAG->getTargetConstant(C2, DL, XLenVT));
+ ReplaceNode(Node, SLLIUW);
+ return;
+ }
+
+ // (srli (slli x, c2+c3), c3)
+ if (OneUseOrZExtW && !ZExtOrANDI) {
+ SDNode *SLLI = CurDAG->getMachineNode(
+ RISCV::SLLI, DL, XLenVT, X,
+ CurDAG->getTargetConstant(C2 + C3, DL, XLenVT));
+ SDNode *SRLI =
+ CurDAG->getMachineNode(RISCV::SRLI, DL, XLenVT, SDValue(SLLI, 0),
+ CurDAG->getTargetConstant(C3, DL, XLenVT));
+ ReplaceNode(Node, SRLI);
+ return;
+ }
+ }
+ }
+
+ break;
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntNo = Node->getConstantOperandVal(0);
+ switch (IntNo) {
+ // By default we do not custom select any intrinsic.
+ default:
+ break;
+ case Intrinsic::riscv_vmsgeu:
+ case Intrinsic::riscv_vmsge: {
+ SDValue Src1 = Node->getOperand(1);
+ SDValue Src2 = Node->getOperand(2);
+ // Only custom select scalar second operand.
+ if (Src2.getValueType() != XLenVT)
+ break;
+ // Small constants are handled with patterns.
+ if (auto *C = dyn_cast<ConstantSDNode>(Src2)) {
+ int64_t CVal = C->getSExtValue();
+ if (CVal >= -15 && CVal <= 16)
+ break;
+ }
+ bool IsUnsigned = IntNo == Intrinsic::riscv_vmsgeu;
+ MVT Src1VT = Src1.getSimpleValueType();
+ unsigned VMSLTOpcode, VMNANDOpcode;
+ switch (RISCVTargetLowering::getLMUL(Src1VT)) {
+ default:
+ llvm_unreachable("Unexpected LMUL!");
+ case RISCVII::VLMUL::LMUL_F8:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF8 : RISCV::PseudoVMSLT_VX_MF8;
+ VMNANDOpcode = RISCV::PseudoVMNAND_MM_MF8;
+ break;
+ case RISCVII::VLMUL::LMUL_F4:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF4 : RISCV::PseudoVMSLT_VX_MF4;
+ VMNANDOpcode = RISCV::PseudoVMNAND_MM_MF4;
+ break;
+ case RISCVII::VLMUL::LMUL_F2:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF2 : RISCV::PseudoVMSLT_VX_MF2;
+ VMNANDOpcode = RISCV::PseudoVMNAND_MM_MF2;
+ break;
+ case RISCVII::VLMUL::LMUL_1:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_M1 : RISCV::PseudoVMSLT_VX_M1;
+ VMNANDOpcode = RISCV::PseudoVMNAND_MM_M1;
+ break;
+ case RISCVII::VLMUL::LMUL_2:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_M2 : RISCV::PseudoVMSLT_VX_M2;
+ VMNANDOpcode = RISCV::PseudoVMNAND_MM_M2;
+ break;
+ case RISCVII::VLMUL::LMUL_4:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_M4 : RISCV::PseudoVMSLT_VX_M4;
+ VMNANDOpcode = RISCV::PseudoVMNAND_MM_M4;
+ break;
+ case RISCVII::VLMUL::LMUL_8:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_M8 : RISCV::PseudoVMSLT_VX_M8;
+ VMNANDOpcode = RISCV::PseudoVMNAND_MM_M8;
+ break;
+ }
+ SDValue SEW = CurDAG->getTargetConstant(
+ Log2_32(Src1VT.getScalarSizeInBits()), DL, XLenVT);
+ SDValue VL;
+ selectVLOp(Node->getOperand(3), VL);
+
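+ // There is no vmsge{u}.vx instruction, so compute the inverted comparison
+ // with vmslt{u}.vx and negate it; vmnand.mm of a mask with itself gives its
+ // bitwise complement.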
+ // Expand to
+ // vmslt{u}.vx vd, va, x; vmnand.mm vd, vd, vd
+ SDValue Cmp = SDValue(
+ CurDAG->getMachineNode(VMSLTOpcode, DL, VT, {Src1, Src2, VL, SEW}),
+ 0);
+ ReplaceNode(Node, CurDAG->getMachineNode(VMNANDOpcode, DL, VT,
+ {Cmp, Cmp, VL, SEW}));
+ return;
+ }
+ case Intrinsic::riscv_vmsgeu_mask:
+ case Intrinsic::riscv_vmsge_mask: {
+ SDValue Src1 = Node->getOperand(2);
+ SDValue Src2 = Node->getOperand(3);
+ // Only custom select scalar second operand.
+ if (Src2.getValueType() != XLenVT)
+ break;
+ // Small constants are handled with patterns.
+ if (auto *C = dyn_cast<ConstantSDNode>(Src2)) {
+ int64_t CVal = C->getSExtValue();
+ if (CVal >= -15 && CVal <= 16)
+ break;
+ }
+ bool IsUnsigned = IntNo == Intrinsic::riscv_vmsgeu_mask;
+ MVT Src1VT = Src1.getSimpleValueType();
+ unsigned VMSLTOpcode, VMSLTMaskOpcode, VMXOROpcode, VMANDNOTOpcode;
+ switch (RISCVTargetLowering::getLMUL(Src1VT)) {
+ default:
+ llvm_unreachable("Unexpected LMUL!");
+ case RISCVII::VLMUL::LMUL_F8:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF8 : RISCV::PseudoVMSLT_VX_MF8;
+ VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF8_MASK
+ : RISCV::PseudoVMSLT_VX_MF8_MASK;
+ break;
+ case RISCVII::VLMUL::LMUL_F4:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF4 : RISCV::PseudoVMSLT_VX_MF4;
+ VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF4_MASK
+ : RISCV::PseudoVMSLT_VX_MF4_MASK;
+ break;
+ case RISCVII::VLMUL::LMUL_F2:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF2 : RISCV::PseudoVMSLT_VX_MF2;
+ VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF2_MASK
+ : RISCV::PseudoVMSLT_VX_MF2_MASK;
+ break;
+ case RISCVII::VLMUL::LMUL_1:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_M1 : RISCV::PseudoVMSLT_VX_M1;
+ VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_M1_MASK
+ : RISCV::PseudoVMSLT_VX_M1_MASK;
+ break;
+ case RISCVII::VLMUL::LMUL_2:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_M2 : RISCV::PseudoVMSLT_VX_M2;
+ VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_M2_MASK
+ : RISCV::PseudoVMSLT_VX_M2_MASK;
+ break;
+ case RISCVII::VLMUL::LMUL_4:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_M4 : RISCV::PseudoVMSLT_VX_M4;
+ VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_M4_MASK
+ : RISCV::PseudoVMSLT_VX_M4_MASK;
+ break;
+ case RISCVII::VLMUL::LMUL_8:
+ VMSLTOpcode =
+ IsUnsigned ? RISCV::PseudoVMSLTU_VX_M8 : RISCV::PseudoVMSLT_VX_M8;
+ VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_M8_MASK
+ : RISCV::PseudoVMSLT_VX_M8_MASK;
+ break;
+ }
+ // Mask operations use the LMUL from the mask type.
+ switch (RISCVTargetLowering::getLMUL(VT)) {
+ default:
+ llvm_unreachable("Unexpected LMUL!");
+ case RISCVII::VLMUL::LMUL_F8:
+ VMXOROpcode = RISCV::PseudoVMXOR_MM_MF8;
+ VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_MF8;
+ break;
+ case RISCVII::VLMUL::LMUL_F4:
+ VMXOROpcode = RISCV::PseudoVMXOR_MM_MF4;
+ VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_MF4;
+ break;
+ case RISCVII::VLMUL::LMUL_F2:
+ VMXOROpcode = RISCV::PseudoVMXOR_MM_MF2;
+ VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_MF2;
+ break;
+ case RISCVII::VLMUL::LMUL_1:
+ VMXOROpcode = RISCV::PseudoVMXOR_MM_M1;
+ VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_M1;
+ break;
+ case RISCVII::VLMUL::LMUL_2:
+ VMXOROpcode = RISCV::PseudoVMXOR_MM_M2;
+ VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_M2;
+ break;
+ case RISCVII::VLMUL::LMUL_4:
+ VMXOROpcode = RISCV::PseudoVMXOR_MM_M4;
+ VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_M4;
+ break;
+ case RISCVII::VLMUL::LMUL_8:
+ VMXOROpcode = RISCV::PseudoVMXOR_MM_M8;
+ VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_M8;
+ break;
+ }
+ SDValue SEW = CurDAG->getTargetConstant(
+ Log2_32(Src1VT.getScalarSizeInBits()), DL, XLenVT);
+ SDValue MaskSEW = CurDAG->getTargetConstant(0, DL, XLenVT);
+ SDValue VL;
+ selectVLOp(Node->getOperand(5), VL);
+ SDValue MaskedOff = Node->getOperand(1);
+ SDValue Mask = Node->getOperand(4);
+ // If the MaskedOff value and the Mask are the same value, use
+ // vmslt{u}.vx vt, va, x; vmandnot.mm vd, vd, vt
+ // This avoids needing to copy v0 to vd before starting the next sequence.
+ if (Mask == MaskedOff) {
+ SDValue Cmp = SDValue(
+ CurDAG->getMachineNode(VMSLTOpcode, DL, VT, {Src1, Src2, VL, SEW}),
+ 0);
+ ReplaceNode(Node, CurDAG->getMachineNode(VMANDNOTOpcode, DL, VT,
+ {Mask, Cmp, VL, MaskSEW}));
+ return;
+ }
+
+ // Mask needs to be copied to V0.
+ SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL,
+ RISCV::V0, Mask, SDValue());
+ SDValue Glue = Chain.getValue(1);
+ SDValue V0 = CurDAG->getRegister(RISCV::V0, VT);
+
+ // Otherwise use
+ // vmslt{u}.vx vd, va, x, v0.t; vmxor.mm vd, vd, v0
+ SDValue Cmp = SDValue(
+ CurDAG->getMachineNode(VMSLTMaskOpcode, DL, VT,
+ {MaskedOff, Src1, Src2, V0, VL, SEW, Glue}),
+ 0);
+ ReplaceNode(Node, CurDAG->getMachineNode(VMXOROpcode, DL, VT,
+ {Cmp, Mask, VL, MaskSEW}));
+ return;
+ }
+ }
+ break;
+ }
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
switch (IntNo) {
@@ -552,56 +838,45 @@
default:
break;
- case Intrinsic::riscv_vsetvli: {
- if (!Subtarget->hasStdExtV())
- break;
-
- assert(Node->getNumOperands() == 5);
-
- RISCVVSEW VSEW =
- static_cast<RISCVVSEW>(Node->getConstantOperandVal(3) & 0x7);
- RISCVVLMUL VLMul =
- static_cast<RISCVVLMUL>(Node->getConstantOperandVal(4) & 0x7);
-
- unsigned VTypeI = RISCVVType::encodeVTYPE(
- VLMul, VSEW, /*TailAgnostic*/ true, /*MaskAgnostic*/ false);
- SDValue VTypeIOp = CurDAG->getTargetConstant(VTypeI, DL, XLenVT);
-
- SDValue VLOperand = Node->getOperand(2);
- if (auto *C = dyn_cast<ConstantSDNode>(VLOperand)) {
- uint64_t AVL = C->getZExtValue();
- if (isUInt<5>(AVL)) {
- SDValue VLImm = CurDAG->getTargetConstant(AVL, DL, XLenVT);
- ReplaceNode(Node,
- CurDAG->getMachineNode(RISCV::PseudoVSETIVLI, DL, XLenVT,
- MVT::Other, VLImm, VTypeIOp,
- /* Chain */ Node->getOperand(0)));
- return;
- }
- }
-
- ReplaceNode(Node,
- CurDAG->getMachineNode(RISCV::PseudoVSETVLI, DL, XLenVT,
- MVT::Other, VLOperand, VTypeIOp,
- /* Chain */ Node->getOperand(0)));
- return;
- }
+ case Intrinsic::riscv_vsetvli:
case Intrinsic::riscv_vsetvlimax: {
if (!Subtarget->hasStdExtV())
break;
- assert(Node->getNumOperands() == 4);
+ bool VLMax = IntNo == Intrinsic::riscv_vsetvlimax;
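+ // riscv.vsetvlimax has no AVL operand, so its SEW/VLMUL operands start one
+ // position earlier than those of riscv.vsetvli.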
+ unsigned Offset = VLMax ? 2 : 3;
- RISCVVSEW VSEW =
- static_cast<RISCVVSEW>(Node->getConstantOperandVal(2) & 0x7);
- RISCVVLMUL VLMul =
- static_cast<RISCVVLMUL>(Node->getConstantOperandVal(3) & 0x7);
+ assert(Node->getNumOperands() == Offset + 2 &&
+ "Unexpected number of operands");
+
+ unsigned SEW =
+ RISCVVType::decodeVSEW(Node->getConstantOperandVal(Offset) & 0x7);
+ RISCVII::VLMUL VLMul = static_cast<RISCVII::VLMUL>(
+ Node->getConstantOperandVal(Offset + 1) & 0x7);
unsigned VTypeI = RISCVVType::encodeVTYPE(
- VLMul, VSEW, /*TailAgnostic*/ true, /*MaskAgnostic*/ false);
+ VLMul, SEW, /*TailAgnostic*/ true, /*MaskAgnostic*/ false);
SDValue VTypeIOp = CurDAG->getTargetConstant(VTypeI, DL, XLenVT);
- SDValue VLOperand = CurDAG->getRegister(RISCV::X0, XLenVT);
+ SDValue VLOperand;
+ if (VLMax) {
+ VLOperand = CurDAG->getRegister(RISCV::X0, XLenVT);
+ } else {
+ VLOperand = Node->getOperand(2);
+
+ if (auto *C = dyn_cast<ConstantSDNode>(VLOperand)) {
+ uint64_t AVL = C->getZExtValue();
+ if (isUInt<5>(AVL)) {
+ SDValue VLImm = CurDAG->getTargetConstant(AVL, DL, XLenVT);
+ ReplaceNode(
+ Node, CurDAG->getMachineNode(RISCV::PseudoVSETIVLI, DL, XLenVT,
+ MVT::Other, VLImm, VTypeIOp,
+ /* Chain */ Node->getOperand(0)));
+ return;
+ }
+ }
+ }
+
ReplaceNode(Node,
CurDAG->getMachineNode(RISCV::PseudoVSETVLI, DL, XLenVT,
MVT::Other, VLOperand, VTypeIOp,
@@ -615,7 +890,7 @@
case Intrinsic::riscv_vlseg6:
case Intrinsic::riscv_vlseg7:
case Intrinsic::riscv_vlseg8: {
- selectVLSEG(Node, IntNo, /*IsStrided=*/false);
+ selectVLSEG(Node, /*IsMasked*/ false, /*IsStrided*/ false);
return;
}
case Intrinsic::riscv_vlseg2_mask:
@@ -625,7 +900,7 @@
case Intrinsic::riscv_vlseg6_mask:
case Intrinsic::riscv_vlseg7_mask:
case Intrinsic::riscv_vlseg8_mask: {
- selectVLSEGMask(Node, IntNo, /*IsStrided=*/false);
+ selectVLSEG(Node, /*IsMasked*/ true, /*IsStrided*/ false);
return;
}
case Intrinsic::riscv_vlsseg2:
@@ -635,7 +910,7 @@
case Intrinsic::riscv_vlsseg6:
case Intrinsic::riscv_vlsseg7:
case Intrinsic::riscv_vlsseg8: {
- selectVLSEG(Node, IntNo, /*IsStrided=*/true);
+ selectVLSEG(Node, /*IsMasked*/ false, /*IsStrided*/ true);
return;
}
case Intrinsic::riscv_vlsseg2_mask:
@@ -645,7 +920,7 @@
case Intrinsic::riscv_vlsseg6_mask:
case Intrinsic::riscv_vlsseg7_mask:
case Intrinsic::riscv_vlsseg8_mask: {
- selectVLSEGMask(Node, IntNo, /*IsStrided=*/true);
+ selectVLSEG(Node, /*IsMasked*/ true, /*IsStrided*/ true);
return;
}
case Intrinsic::riscv_vloxseg2:
@@ -655,16 +930,17 @@
case Intrinsic::riscv_vloxseg6:
case Intrinsic::riscv_vloxseg7:
case Intrinsic::riscv_vloxseg8:
+ selectVLXSEG(Node, /*IsMasked*/ false, /*IsOrdered*/ true);
+ return;
case Intrinsic::riscv_vluxseg2:
case Intrinsic::riscv_vluxseg3:
case Intrinsic::riscv_vluxseg4:
case Intrinsic::riscv_vluxseg5:
case Intrinsic::riscv_vluxseg6:
case Intrinsic::riscv_vluxseg7:
- case Intrinsic::riscv_vluxseg8: {
- selectVLXSEG(Node, IntNo);
+ case Intrinsic::riscv_vluxseg8:
+ selectVLXSEG(Node, /*IsMasked*/ false, /*IsOrdered*/ false);
return;
- }
case Intrinsic::riscv_vloxseg2_mask:
case Intrinsic::riscv_vloxseg3_mask:
case Intrinsic::riscv_vloxseg4_mask:
@@ -672,14 +948,143 @@
case Intrinsic::riscv_vloxseg6_mask:
case Intrinsic::riscv_vloxseg7_mask:
case Intrinsic::riscv_vloxseg8_mask:
+ selectVLXSEG(Node, /*IsMasked*/ true, /*IsOrdered*/ true);
+ return;
case Intrinsic::riscv_vluxseg2_mask:
case Intrinsic::riscv_vluxseg3_mask:
case Intrinsic::riscv_vluxseg4_mask:
case Intrinsic::riscv_vluxseg5_mask:
case Intrinsic::riscv_vluxseg6_mask:
case Intrinsic::riscv_vluxseg7_mask:
- case Intrinsic::riscv_vluxseg8_mask: {
- selectVLXSEGMask(Node, IntNo);
+ case Intrinsic::riscv_vluxseg8_mask:
+ selectVLXSEG(Node, /*IsMasked*/ true, /*IsOrdered*/ false);
+ return;
+ case Intrinsic::riscv_vlseg8ff:
+ case Intrinsic::riscv_vlseg7ff:
+ case Intrinsic::riscv_vlseg6ff:
+ case Intrinsic::riscv_vlseg5ff:
+ case Intrinsic::riscv_vlseg4ff:
+ case Intrinsic::riscv_vlseg3ff:
+ case Intrinsic::riscv_vlseg2ff: {
+ selectVLSEGFF(Node, /*IsMasked*/ false);
+ return;
+ }
+ case Intrinsic::riscv_vlseg8ff_mask:
+ case Intrinsic::riscv_vlseg7ff_mask:
+ case Intrinsic::riscv_vlseg6ff_mask:
+ case Intrinsic::riscv_vlseg5ff_mask:
+ case Intrinsic::riscv_vlseg4ff_mask:
+ case Intrinsic::riscv_vlseg3ff_mask:
+ case Intrinsic::riscv_vlseg2ff_mask: {
+ selectVLSEGFF(Node, /*IsMasked*/ true);
+ return;
+ }
+ case Intrinsic::riscv_vloxei:
+ case Intrinsic::riscv_vloxei_mask:
+ case Intrinsic::riscv_vluxei:
+ case Intrinsic::riscv_vluxei_mask: {
+ bool IsMasked = IntNo == Intrinsic::riscv_vloxei_mask ||
+ IntNo == Intrinsic::riscv_vluxei_mask;
+ bool IsOrdered = IntNo == Intrinsic::riscv_vloxei ||
+ IntNo == Intrinsic::riscv_vloxei_mask;
+
+ MVT VT = Node->getSimpleValueType(0);
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+
+ unsigned CurOp = 2;
+ SmallVector<SDValue, 8> Operands;
+ if (IsMasked)
+ Operands.push_back(Node->getOperand(CurOp++));
+
+ MVT IndexVT;
+ addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
+ /*IsStridedOrIndexed*/ true, Operands,
+ &IndexVT);
+
+ assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
+ "Element count mismatch");
+
+ RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
+ RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT);
+ unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits());
+ const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo(
+ IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL),
+ static_cast<unsigned>(IndexLMUL));
+ MachineSDNode *Load =
+ CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
+
+ if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+ CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()});
+
+ ReplaceNode(Node, Load);
+ return;
+ }
+ case Intrinsic::riscv_vle1:
+ case Intrinsic::riscv_vle:
+ case Intrinsic::riscv_vle_mask:
+ case Intrinsic::riscv_vlse:
+ case Intrinsic::riscv_vlse_mask: {
+ bool IsMasked = IntNo == Intrinsic::riscv_vle_mask ||
+ IntNo == Intrinsic::riscv_vlse_mask;
+ bool IsStrided =
+ IntNo == Intrinsic::riscv_vlse || IntNo == Intrinsic::riscv_vlse_mask;
+
+ MVT VT = Node->getSimpleValueType(0);
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+
+ unsigned CurOp = 2;
+ SmallVector<SDValue, 8> Operands;
+ if (IsMasked)
+ Operands.push_back(Node->getOperand(CurOp++));
+
+ addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked, IsStrided,
+ Operands);
+
+ RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
+ const RISCV::VLEPseudo *P =
+ RISCV::getVLEPseudo(IsMasked, IsStrided, /*FF*/ false, Log2SEW,
+ static_cast<unsigned>(LMUL));
+ MachineSDNode *Load =
+ CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
+
+ if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+ CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()});
+
+ ReplaceNode(Node, Load);
+ return;
+ }
+ case Intrinsic::riscv_vleff:
+ case Intrinsic::riscv_vleff_mask: {
+ bool IsMasked = IntNo == Intrinsic::riscv_vleff_mask;
+
+ MVT VT = Node->getSimpleValueType(0);
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+
+ unsigned CurOp = 2;
+ SmallVector<SDValue, 7> Operands;
+ if (IsMasked)
+ Operands.push_back(Node->getOperand(CurOp++));
+
+ addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
+ /*IsStridedOrIndexed*/ false, Operands);
+
+ RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
+ const RISCV::VLEPseudo *P =
+ RISCV::getVLEPseudo(IsMasked, /*Strided*/ false, /*FF*/ true, Log2SEW,
+ static_cast<unsigned>(LMUL));
+ MachineSDNode *Load =
+ CurDAG->getMachineNode(P->Pseudo, DL, Node->getValueType(0),
+ MVT::Other, MVT::Glue, Operands);
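+ // A fault-only-first load may write a smaller value to vl if an element
+ // after the first faults; PseudoReadVL, glued to the load, reads that
+ // updated vl for the intrinsic's VL result.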
+ SDNode *ReadVL = CurDAG->getMachineNode(RISCV::PseudoReadVL, DL, XLenVT,
+ /*Glue*/ SDValue(Load, 2));
+
+ if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+ CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()});
+
+ ReplaceUses(SDValue(Node, 0), SDValue(Load, 0));
+ ReplaceUses(SDValue(Node, 1), SDValue(ReadVL, 0)); // VL
+ ReplaceUses(SDValue(Node, 2), SDValue(Load, 1)); // Chain
+ CurDAG->RemoveDeadNode(Node);
return;
}
}
@@ -695,7 +1100,7 @@
case Intrinsic::riscv_vsseg6:
case Intrinsic::riscv_vsseg7:
case Intrinsic::riscv_vsseg8: {
- selectVSSEG(Node, IntNo, /*IsStrided=*/false);
+ selectVSSEG(Node, /*IsMasked*/ false, /*IsStrided*/ false);
return;
}
case Intrinsic::riscv_vsseg2_mask:
@@ -705,7 +1110,7 @@
case Intrinsic::riscv_vsseg6_mask:
case Intrinsic::riscv_vsseg7_mask:
case Intrinsic::riscv_vsseg8_mask: {
- selectVSSEGMask(Node, IntNo, /*IsStrided=*/false);
+ selectVSSEG(Node, /*IsMasked*/ true, /*IsStrided*/ false);
return;
}
case Intrinsic::riscv_vssseg2:
@@ -715,7 +1120,7 @@
case Intrinsic::riscv_vssseg6:
case Intrinsic::riscv_vssseg7:
case Intrinsic::riscv_vssseg8: {
- selectVSSEG(Node, IntNo, /*IsStrided=*/true);
+ selectVSSEG(Node, /*IsMasked*/ false, /*IsStrided*/ true);
return;
}
case Intrinsic::riscv_vssseg2_mask:
@@ -725,7 +1130,7 @@
case Intrinsic::riscv_vssseg6_mask:
case Intrinsic::riscv_vssseg7_mask:
case Intrinsic::riscv_vssseg8_mask: {
- selectVSSEGMask(Node, IntNo, /*IsStrided=*/true);
+ selectVSSEG(Node, /*IsMasked*/ true, /*IsStrided*/ true);
return;
}
case Intrinsic::riscv_vsoxseg2:
@@ -735,16 +1140,17 @@
case Intrinsic::riscv_vsoxseg6:
case Intrinsic::riscv_vsoxseg7:
case Intrinsic::riscv_vsoxseg8:
+ selectVSXSEG(Node, /*IsMasked*/ false, /*IsOrdered*/ true);
+ return;
case Intrinsic::riscv_vsuxseg2:
case Intrinsic::riscv_vsuxseg3:
case Intrinsic::riscv_vsuxseg4:
case Intrinsic::riscv_vsuxseg5:
case Intrinsic::riscv_vsuxseg6:
case Intrinsic::riscv_vsuxseg7:
- case Intrinsic::riscv_vsuxseg8: {
- selectVSXSEG(Node, IntNo);
+ case Intrinsic::riscv_vsuxseg8:
+ selectVSXSEG(Node, /*IsMasked*/ false, /*IsOrdered*/ false);
return;
- }
case Intrinsic::riscv_vsoxseg2_mask:
case Intrinsic::riscv_vsoxseg3_mask:
case Intrinsic::riscv_vsoxseg4_mask:
@@ -752,25 +1158,236 @@
case Intrinsic::riscv_vsoxseg6_mask:
case Intrinsic::riscv_vsoxseg7_mask:
case Intrinsic::riscv_vsoxseg8_mask:
+ selectVSXSEG(Node, /*IsMasked*/ true, /*IsOrdered*/ true);
+ return;
case Intrinsic::riscv_vsuxseg2_mask:
case Intrinsic::riscv_vsuxseg3_mask:
case Intrinsic::riscv_vsuxseg4_mask:
case Intrinsic::riscv_vsuxseg5_mask:
case Intrinsic::riscv_vsuxseg6_mask:
case Intrinsic::riscv_vsuxseg7_mask:
- case Intrinsic::riscv_vsuxseg8_mask: {
- selectVSXSEGMask(Node, IntNo);
+ case Intrinsic::riscv_vsuxseg8_mask:
+ selectVSXSEG(Node, /*IsMasked*/ true, /*IsOrdered*/ false);
+ return;
+ case Intrinsic::riscv_vsoxei:
+ case Intrinsic::riscv_vsoxei_mask:
+ case Intrinsic::riscv_vsuxei:
+ case Intrinsic::riscv_vsuxei_mask: {
+ bool IsMasked = IntNo == Intrinsic::riscv_vsoxei_mask ||
+ IntNo == Intrinsic::riscv_vsuxei_mask;
+ bool IsOrdered = IntNo == Intrinsic::riscv_vsoxei ||
+ IntNo == Intrinsic::riscv_vsoxei_mask;
+
+ MVT VT = Node->getOperand(2)->getSimpleValueType(0);
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+
+ unsigned CurOp = 2;
+ SmallVector<SDValue, 8> Operands;
+ Operands.push_back(Node->getOperand(CurOp++)); // Store value.
+
+ MVT IndexVT;
+ addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
+ /*IsStridedOrIndexed*/ true, Operands,
+ &IndexVT);
+
+ assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
+ "Element count mismatch");
+
+ RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
+ RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT);
+ unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits());
+ const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo(
+ IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL),
+ static_cast<unsigned>(IndexLMUL));
+ MachineSDNode *Store =
+ CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
+
+ if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+ CurDAG->setNodeMemRefs(Store, {MemOp->getMemOperand()});
+
+ ReplaceNode(Node, Store);
+ return;
+ }
+ case Intrinsic::riscv_vse1:
+ case Intrinsic::riscv_vse:
+ case Intrinsic::riscv_vse_mask:
+ case Intrinsic::riscv_vsse:
+ case Intrinsic::riscv_vsse_mask: {
+ bool IsMasked = IntNo == Intrinsic::riscv_vse_mask ||
+ IntNo == Intrinsic::riscv_vsse_mask;
+ bool IsStrided =
+ IntNo == Intrinsic::riscv_vsse || IntNo == Intrinsic::riscv_vsse_mask;
+
+ MVT VT = Node->getOperand(2)->getSimpleValueType(0);
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+
+ unsigned CurOp = 2;
+ SmallVector<SDValue, 8> Operands;
+ Operands.push_back(Node->getOperand(CurOp++)); // Store value.
+
+ addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked, IsStrided,
+ Operands);
+
+ RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
+ const RISCV::VSEPseudo *P = RISCV::getVSEPseudo(
+ IsMasked, IsStrided, Log2SEW, static_cast<unsigned>(LMUL));
+ MachineSDNode *Store =
+ CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
+ if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+ CurDAG->setNodeMemRefs(Store, {MemOp->getMemOperand()});
+
+ ReplaceNode(Node, Store);
return;
}
}
break;
}
- case RISCVISD::VLSEGFF: {
- selectVLSEGFF(Node);
+ case ISD::BITCAST: {
+ MVT SrcVT = Node->getOperand(0).getSimpleValueType();
+ // Just drop bitcasts between vectors if both are fixed or both are
+ // scalable.
+ if ((VT.isScalableVector() && SrcVT.isScalableVector()) ||
+ (VT.isFixedLengthVector() && SrcVT.isFixedLengthVector())) {
+ ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ break;
+ }
+ case ISD::INSERT_SUBVECTOR: {
+ SDValue V = Node->getOperand(0);
+ SDValue SubV = Node->getOperand(1);
+ SDLoc DL(SubV);
+ auto Idx = Node->getConstantOperandVal(2);
+ MVT SubVecVT = SubV.getSimpleValueType();
+
+ const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering();
+ MVT SubVecContainerVT = SubVecVT;
+ // Establish the correct scalable-vector types for any fixed-length type.
+ if (SubVecVT.isFixedLengthVector())
+ SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
+ if (VT.isFixedLengthVector())
+ VT = TLI.getContainerForFixedLengthVector(VT);
+
+ const auto *TRI = Subtarget->getRegisterInfo();
+ unsigned SubRegIdx;
+ std::tie(SubRegIdx, Idx) =
+ RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+ VT, SubVecContainerVT, Idx, TRI);
+
+ // If the Idx hasn't been completely eliminated then this is a subvector
+ // insert which doesn't naturally align to a vector register. These must
+ // be handled using instructions to manipulate the vector registers.
+ if (Idx != 0)
+ break;
+
+ RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecContainerVT);
+ bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 ||
+ SubVecLMUL == RISCVII::VLMUL::LMUL_F4 ||
+ SubVecLMUL == RISCVII::VLMUL::LMUL_F8;
+ (void)IsSubVecPartReg; // Silence unused variable warning without asserts.
+ assert((!IsSubVecPartReg || V.isUndef()) &&
+ "Expecting lowering to have created legal INSERT_SUBVECTORs when "
+ "the subvector is smaller than a full-sized register");
+
+ // If we haven't set a SubRegIdx, then we must be going between
+ // equally-sized LMUL groups (e.g. VR -> VR). This can be done as a copy.
+ if (SubRegIdx == RISCV::NoSubRegister) {
+ unsigned InRegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
+ assert(RISCVTargetLowering::getRegClassIDForVecVT(SubVecContainerVT) ==
+ InRegClassID &&
+ "Unexpected subvector extraction");
+ SDValue RC = CurDAG->getTargetConstant(InRegClassID, DL, XLenVT);
+ SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ DL, VT, SubV, RC);
+ ReplaceNode(Node, NewNode);
+ return;
+ }
+
+ SDValue Insert = CurDAG->getTargetInsertSubreg(SubRegIdx, DL, VT, V, SubV);
+ ReplaceNode(Node, Insert.getNode());
return;
}
- case RISCVISD::VLSEGFF_MASK: {
- selectVLSEGFFMask(Node);
+ case ISD::EXTRACT_SUBVECTOR: {
+ SDValue V = Node->getOperand(0);
+ auto Idx = Node->getConstantOperandVal(1);
+ MVT InVT = V.getSimpleValueType();
+ SDLoc DL(V);
+
+ const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering();
+ MVT SubVecContainerVT = VT;
+ // Establish the correct scalable-vector types for any fixed-length type.
+ if (VT.isFixedLengthVector())
+ SubVecContainerVT = TLI.getContainerForFixedLengthVector(VT);
+ if (InVT.isFixedLengthVector())
+ InVT = TLI.getContainerForFixedLengthVector(InVT);
+
+ const auto *TRI = Subtarget->getRegisterInfo();
+ unsigned SubRegIdx;
+ std::tie(SubRegIdx, Idx) =
+ RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+ InVT, SubVecContainerVT, Idx, TRI);
+
+ // If the Idx hasn't been completely eliminated then this is a subvector
+ // extract which doesn't naturally align to a vector register. These must
+ // be handled using instructions to manipulate the vector registers.
+ if (Idx != 0)
+ break;
+
+ // If we haven't set a SubRegIdx, then we must be going between
+ // equally-sized LMUL types (e.g. VR -> VR). This can be done as a copy.
+ if (SubRegIdx == RISCV::NoSubRegister) {
+ unsigned InRegClassID = RISCVTargetLowering::getRegClassIDForVecVT(InVT);
+ assert(RISCVTargetLowering::getRegClassIDForVecVT(SubVecContainerVT) ==
+ InRegClassID &&
+ "Unexpected subvector extraction");
+ SDValue RC = CurDAG->getTargetConstant(InRegClassID, DL, XLenVT);
+ SDNode *NewNode =
+ CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
+ ReplaceNode(Node, NewNode);
+ return;
+ }
+
+ SDValue Extract = CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, V);
+ ReplaceNode(Node, Extract.getNode());
+ return;
+ }
+ case RISCVISD::VMV_V_X_VL:
+ case RISCVISD::VFMV_V_F_VL: {
+ // Try to match splat of a scalar load to a strided load with stride of x0.
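+ // A strided load with a stride of x0 (zero) reads the same address for
+ // every element, which yields the splatted scalar value.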
+ SDValue Src = Node->getOperand(0);
+ auto *Ld = dyn_cast<LoadSDNode>(Src);
+ if (!Ld)
+ break;
+ EVT MemVT = Ld->getMemoryVT();
+ // The memory VT should be the same size as the element type.
+ if (MemVT.getStoreSize() != VT.getVectorElementType().getStoreSize())
+ break;
+ if (!IsProfitableToFold(Src, Node, Node) ||
+ !IsLegalToFold(Src, Node, Node, TM.getOptLevel()))
+ break;
+
+ SDValue VL;
+ selectVLOp(Node->getOperand(1), VL);
+
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+ SDValue SEW = CurDAG->getTargetConstant(Log2SEW, DL, XLenVT);
+
+ SDValue Operands[] = {Ld->getBasePtr(),
+ CurDAG->getRegister(RISCV::X0, XLenVT), VL, SEW,
+ Ld->getChain()};
+
+ RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
+ const RISCV::VLEPseudo *P = RISCV::getVLEPseudo(
+ /*IsMasked*/ false, /*IsStrided*/ true, /*FF*/ false, Log2SEW,
+ static_cast<unsigned>(LMUL));
+ MachineSDNode *Load =
+ CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
+
+ if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+ CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()});
+
+ ReplaceNode(Node, Load);
return;
}
}
@@ -798,80 +1415,94 @@
}
bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
- if (auto FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ if (auto *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT());
return true;
}
return false;
}
-// Match (srl (and val, mask), imm) where the result would be a
-// zero-extended 32-bit integer. i.e. the mask is 0xffffffff or the result
-// is equivalent to this (SimplifyDemandedBits may have removed lower bits
-// from the mask that aren't necessary due to the right-shifting).
-bool RISCVDAGToDAGISel::MatchSRLIW(SDNode *N) const {
- assert(N->getOpcode() == ISD::SRL);
- assert(N->getOperand(0).getOpcode() == ISD::AND);
- assert(isa<ConstantSDNode>(N->getOperand(1)));
- assert(isa<ConstantSDNode>(N->getOperand(0).getOperand(1)));
-
- // The IsRV64 predicate is checked after PatFrag predicates so we can get
- // here even on RV32.
- if (!Subtarget->is64Bit())
- return false;
-
- SDValue And = N->getOperand(0);
- uint64_t ShAmt = N->getConstantOperandVal(1);
- uint64_t Mask = And.getConstantOperandVal(1);
- return (Mask | maskTrailingOnes<uint64_t>(ShAmt)) == 0xffffffff;
+bool RISCVDAGToDAGISel::SelectBaseAddr(SDValue Addr, SDValue &Base) {
+ // If this is a FrameIndex, select it directly. Otherwise just let it get
+ // selected to a register independently.
+ if (auto *FIN = dyn_cast<FrameIndexSDNode>(Addr))
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT());
+ else
+ Base = Addr;
+ return true;
}
-// Check that it is a SLLIUW (Shift Logical Left Immediate Unsigned i32
-// on RV64).
-// SLLIUW is the same as SLLI except for the fact that it clears the bits
-// XLEN-1:32 of the input RS1 before shifting.
-// A PatFrag has already checked that it has the right structure:
-//
-// (AND (SHL RS1, VC2), VC1)
-//
-// We check that VC2, the shamt is less than 32, otherwise the pattern is
-// exactly the same as SLLI and we give priority to that.
-// Eventually we check that VC1, the mask used to clear the upper 32 bits
-// of RS1, is correct:
-//
-// VC1 == (0xFFFFFFFF << VC2)
-//
-bool RISCVDAGToDAGISel::MatchSLLIUW(SDNode *N) const {
- assert(N->getOpcode() == ISD::AND);
- assert(N->getOperand(0).getOpcode() == ISD::SHL);
- assert(isa<ConstantSDNode>(N->getOperand(1)));
- assert(isa<ConstantSDNode>(N->getOperand(0).getOperand(1)));
+bool RISCVDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth,
+ SDValue &ShAmt) {
+ // Shift instructions on RISCV only read the lower 5 or 6 bits of the shift
+ // amount. If there is an AND on the shift amount, we can bypass it if it
+ // doesn't affect any of those bits.
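+ // For example, (srl X, (and Y, 31)) with a 32-bit shift width only needs
+ // the low 5 bits of the shift amount, so the AND can be skipped and Y used
+ // directly.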
+ if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) {
+ const APInt &AndMask = N->getConstantOperandAPInt(1);
- // The IsRV64 predicate is checked after PatFrag predicates so we can get
- // here even on RV32.
- if (!Subtarget->is64Bit())
- return false;
+ // Since the max shift amount is a power of 2 we can subtract 1 to make a
+ // mask that covers the bits needed to represent all shift amounts.
+ assert(isPowerOf2_32(ShiftWidth) && "Unexpected max shift amount!");
+ APInt ShMask(AndMask.getBitWidth(), ShiftWidth - 1);
- SDValue Shl = N->getOperand(0);
- uint64_t VC1 = N->getConstantOperandVal(1);
- uint64_t VC2 = Shl.getConstantOperandVal(1);
+ if (ShMask.isSubsetOf(AndMask)) {
+ ShAmt = N.getOperand(0);
+ return true;
+ }
- // Immediate range should be enforced by uimm5 predicate.
- assert(VC2 < 32 && "Unexpected immediate");
- return (VC1 >> VC2) == UINT64_C(0xFFFFFFFF);
+ // SimplifyDemandedBits may have optimized the mask so try restoring any
+ // bits that are known zero.
+ KnownBits Known = CurDAG->computeKnownBits(N->getOperand(0));
+ if (ShMask.isSubsetOf(AndMask | Known.Zero)) {
+ ShAmt = N.getOperand(0);
+ return true;
+ }
+ }
+
+ ShAmt = N;
+ return true;
}
-// X0 has special meaning for vsetvl/vsetvli.
-// rd | rs1 | AVL value | Effect on vl
-//--------------------------------------------------------------
-// !X0 | X0 | VLMAX | Set vl to VLMAX
-// X0 | X0 | Value in vl | Keep current vl, just change vtype.
+bool RISCVDAGToDAGISel::selectSExti32(SDValue N, SDValue &Val) {
+ if (N.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) {
+ Val = N.getOperand(0);
+ return true;
+ }
+ MVT VT = N.getSimpleValueType();
+ if (CurDAG->ComputeNumSignBits(N) > (VT.getSizeInBits() - 32)) {
+ Val = N;
+ return true;
+ }
+
+ return false;
+}
+
+bool RISCVDAGToDAGISel::selectZExti32(SDValue N, SDValue &Val) {
+ if (N.getOpcode() == ISD::AND) {
+ auto *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (C && C->getZExtValue() == UINT64_C(0xFFFFFFFF)) {
+ Val = N.getOperand(0);
+ return true;
+ }
+ }
+ MVT VT = N.getSimpleValueType();
+ APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), 32);
+ if (CurDAG->MaskedValueIsZero(N, Mask)) {
+ Val = N;
+ return true;
+ }
+
+ return false;
+}
+
+// Select VL as a 5-bit immediate or a value that will become a register. This
+// allows us to choose between VSETIVLI and VSETVLI later.
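+// For example, an AVL of 4 can later use VSETIVLI's 5-bit immediate, while
+// larger or non-constant AVLs are selected to a register for VSETVLI.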
bool RISCVDAGToDAGISel::selectVLOp(SDValue N, SDValue &VL) {
- // If the VL value is a constant 0, manually select it to an ADDI with 0
- // immediate to prevent the default selection path from matching it to X0.
auto *C = dyn_cast<ConstantSDNode>(N);
- if (C && C->isNullValue())
- VL = SDValue(selectImm(CurDAG, SDLoc(N), 0, Subtarget->getXLenVT()), 0);
+ if (C && isUInt<5>(C->getZExtValue()))
+ VL = CurDAG->getTargetConstant(C->getZExtValue(), SDLoc(N),
+ N->getValueType(0));
else
VL = N;
@@ -880,45 +1511,71 @@
bool RISCVDAGToDAGISel::selectVSplat(SDValue N, SDValue &SplatVal) {
if (N.getOpcode() != ISD::SPLAT_VECTOR &&
- N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64)
+ N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64 &&
+ N.getOpcode() != RISCVISD::VMV_V_X_VL)
return false;
SplatVal = N.getOperand(0);
return true;
}
-bool RISCVDAGToDAGISel::selectVSplatSimm5(SDValue N, SDValue &SplatVal) {
+using ValidateFn = bool (*)(int64_t);
+
+static bool selectVSplatSimmHelper(SDValue N, SDValue &SplatVal,
+ SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget,
+ ValidateFn ValidateImm) {
if ((N.getOpcode() != ISD::SPLAT_VECTOR &&
- N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64) ||
+ N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64 &&
+ N.getOpcode() != RISCVISD::VMV_V_X_VL) ||
!isa<ConstantSDNode>(N.getOperand(0)))
return false;
int64_t SplatImm = cast<ConstantSDNode>(N.getOperand(0))->getSExtValue();
- // Both ISD::SPLAT_VECTOR and RISCVISD::SPLAT_VECTOR_I64 share semantics when
- // the operand type is wider than the resulting vector element type: an
- // implicit truncation first takes place. Therefore, perform a manual
- // truncation/sign-extension in order to ignore any truncated bits and catch
- // any zero-extended immediate.
+ // ISD::SPLAT_VECTOR, RISCVISD::SPLAT_VECTOR_I64 and RISCVISD::VMV_V_X_VL
+ // share semantics when the operand type is wider than the resulting vector
+ // element type: an implicit truncation first takes place. Therefore, perform
+ // a manual truncation/sign-extension in order to ignore any truncated bits
+ // and catch any zero-extended immediate.
// For example, we wish to match (i8 -1) -> (XLenVT 255) as a simm5 by first
// sign-extending to (XLenVT -1).
- auto XLenVT = Subtarget->getXLenVT();
+ MVT XLenVT = Subtarget.getXLenVT();
assert(XLenVT == N.getOperand(0).getSimpleValueType() &&
"Unexpected splat operand type");
- auto EltVT = N.getValueType().getVectorElementType();
- if (EltVT.bitsLT(XLenVT)) {
+ MVT EltVT = N.getSimpleValueType().getVectorElementType();
+ if (EltVT.bitsLT(XLenVT))
SplatImm = SignExtend64(SplatImm, EltVT.getSizeInBits());
- }
- if (!isInt<5>(SplatImm))
+ if (!ValidateImm(SplatImm))
return false;
- SplatVal = CurDAG->getTargetConstant(SplatImm, SDLoc(N), XLenVT);
+ SplatVal = DAG.getTargetConstant(SplatImm, SDLoc(N), XLenVT);
return true;
}
+bool RISCVDAGToDAGISel::selectVSplatSimm5(SDValue N, SDValue &SplatVal) {
+ return selectVSplatSimmHelper(N, SplatVal, *CurDAG, *Subtarget,
+ [](int64_t Imm) { return isInt<5>(Imm); });
+}
+
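+// Accept splat immediates in [-15, 16], i.e. values that are one more than a
+// 5-bit signed immediate.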
+bool RISCVDAGToDAGISel::selectVSplatSimm5Plus1(SDValue N, SDValue &SplatVal) {
+ return selectVSplatSimmHelper(
+ N, SplatVal, *CurDAG, *Subtarget,
+ [](int64_t Imm) { return (isInt<5>(Imm) && Imm != -16) || Imm == 16; });
+}
+
+bool RISCVDAGToDAGISel::selectVSplatSimm5Plus1NonZero(SDValue N,
+ SDValue &SplatVal) {
+ return selectVSplatSimmHelper(
+ N, SplatVal, *CurDAG, *Subtarget, [](int64_t Imm) {
+ return Imm != 0 && ((isInt<5>(Imm) && Imm != -16) || Imm == 16);
+ });
+}
+
bool RISCVDAGToDAGISel::selectVSplatUimm5(SDValue N, SDValue &SplatVal) {
if ((N.getOpcode() != ISD::SPLAT_VECTOR &&
- N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64) ||
+ N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64 &&
+ N.getOpcode() != RISCVISD::VMV_V_X_VL) ||
!isa<ConstantSDNode>(N.getOperand(0)))
return false;
@@ -933,6 +1590,21 @@
return true;
}
+bool RISCVDAGToDAGISel::selectRVVSimm5(SDValue N, unsigned Width,
+ SDValue &Imm) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N)) {
+ int64_t ImmVal = SignExtend64(C->getSExtValue(), Width);
+
+ if (!isInt<5>(ImmVal))
+ return false;
+
+ Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), Subtarget->getXLenVT());
+ return true;
+ }
+
+ return false;
+}
+
// Merge an ADDI into the offset of a load/store instruction where possible.
// (load (addi base, off1), off2) -> (load base, off1+off2)
// (store val, (addi base, off1), off2) -> (store val, base, off1+off2)
@@ -991,14 +1663,14 @@
SDValue ImmOperand = Base.getOperand(1);
uint64_t Offset2 = N->getConstantOperandVal(OffsetOpIdx);
- if (auto Const = dyn_cast<ConstantSDNode>(ImmOperand)) {
+ if (auto *Const = dyn_cast<ConstantSDNode>(ImmOperand)) {
int64_t Offset1 = Const->getSExtValue();
int64_t CombinedOffset = Offset1 + Offset2;
if (!isInt<12>(CombinedOffset))
continue;
ImmOperand = CurDAG->getTargetConstant(CombinedOffset, SDLoc(ImmOperand),
ImmOperand.getValueType());
- } else if (auto GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) {
+ } else if (auto *GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) {
// If the off1 in (addi base, off1) is a global variable's address (its
// low part, really), then we can rely on the alignment of that variable
// to provide a margin of safety before off1 can overflow the 12 bits.
@@ -1012,7 +1684,7 @@
ImmOperand = CurDAG->getTargetGlobalAddress(
GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(),
CombinedOffset, GA->getTargetFlags());
- } else if (auto CP = dyn_cast<ConstantPoolSDNode>(ImmOperand)) {
+ } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(ImmOperand)) {
// Ditto.
Align Alignment = CP->getAlign();
if (Offset2 != 0 && Alignment <= Offset2)
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index 6099586..56d0722 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -36,6 +36,7 @@
return SelectionDAGISel::runOnMachineFunction(MF);
}
+ void PreprocessISelDAG() override;
void PostprocessISelDAG() override;
void Select(SDNode *Node) override;
@@ -44,26 +45,43 @@
std::vector<SDValue> &OutOps) override;
bool SelectAddrFI(SDValue Addr, SDValue &Base);
+ bool SelectBaseAddr(SDValue Addr, SDValue &Base);
- bool MatchSRLIW(SDNode *N) const;
- bool MatchSLLIUW(SDNode *N) const;
+ bool selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt);
+ bool selectShiftMaskXLen(SDValue N, SDValue &ShAmt) {
+ return selectShiftMask(N, Subtarget->getXLen(), ShAmt);
+ }
+ bool selectShiftMask32(SDValue N, SDValue &ShAmt) {
+ return selectShiftMask(N, 32, ShAmt);
+ }
+
+ bool selectSExti32(SDValue N, SDValue &Val);
+ bool selectZExti32(SDValue N, SDValue &Val);
bool selectVLOp(SDValue N, SDValue &VL);
bool selectVSplat(SDValue N, SDValue &SplatVal);
bool selectVSplatSimm5(SDValue N, SDValue &SplatVal);
bool selectVSplatUimm5(SDValue N, SDValue &SplatVal);
+ bool selectVSplatSimm5Plus1(SDValue N, SDValue &SplatVal);
+ bool selectVSplatSimm5Plus1NonZero(SDValue N, SDValue &SplatVal);
- void selectVLSEG(SDNode *Node, unsigned IntNo, bool IsStrided);
- void selectVLSEGMask(SDNode *Node, unsigned IntNo, bool IsStrided);
- void selectVLSEGFF(SDNode *Node);
- void selectVLSEGFFMask(SDNode *Node);
- void selectVLXSEG(SDNode *Node, unsigned IntNo);
- void selectVLXSEGMask(SDNode *Node, unsigned IntNo);
- void selectVSSEG(SDNode *Node, unsigned IntNo, bool IsStrided);
- void selectVSSEGMask(SDNode *Node, unsigned IntNo, bool IsStrided);
- void selectVSXSEG(SDNode *Node, unsigned IntNo);
- void selectVSXSEGMask(SDNode *Node, unsigned IntNo);
+ bool selectRVVSimm5(SDValue N, unsigned Width, SDValue &Imm);
+ template <unsigned Width> bool selectRVVSimm5(SDValue N, SDValue &Imm) {
+ return selectRVVSimm5(N, Width, Imm);
+ }
+
+ void addVectorLoadStoreOperands(SDNode *Node, unsigned SEWImm,
+ const SDLoc &DL, unsigned CurOp,
+ bool IsMasked, bool IsStridedOrIndexed,
+ SmallVectorImpl<SDValue> &Operands,
+ MVT *IndexVT = nullptr);
+
+ void selectVLSEG(SDNode *Node, bool IsMasked, bool IsStrided);
+ void selectVLSEGFF(SDNode *Node, bool IsMasked);
+ void selectVLXSEG(SDNode *Node, bool IsMasked, bool IsOrdered);
+ void selectVSSEG(SDNode *Node, bool IsMasked, bool IsStrided);
+ void selectVSXSEG(SDNode *Node, bool IsMasked, bool IsOrdered);
// Include the pieces autogenerated from the target description.
#include "RISCVGenDAGISel.inc"
@@ -71,6 +89,84 @@
private:
void doPeepholeLoadStoreADDI();
};
-}
+
+namespace RISCV {
+struct VLSEGPseudo {
+ uint16_t NF : 4;
+ uint16_t Masked : 1;
+ uint16_t Strided : 1;
+ uint16_t FF : 1;
+ uint16_t Log2SEW : 3;
+ uint16_t LMUL : 3;
+ uint16_t Pseudo;
+};
+
+struct VLXSEGPseudo {
+ uint16_t NF : 4;
+ uint16_t Masked : 1;
+ uint16_t Ordered : 1;
+ uint16_t Log2SEW : 3;
+ uint16_t LMUL : 3;
+ uint16_t IndexLMUL : 3;
+ uint16_t Pseudo;
+};
+
+struct VSSEGPseudo {
+ uint16_t NF : 4;
+ uint16_t Masked : 1;
+ uint16_t Strided : 1;
+ uint16_t Log2SEW : 3;
+ uint16_t LMUL : 3;
+ uint16_t Pseudo;
+};
+
+struct VSXSEGPseudo {
+ uint16_t NF : 4;
+ uint16_t Masked : 1;
+ uint16_t Ordered : 1;
+ uint16_t Log2SEW : 3;
+ uint16_t LMUL : 3;
+ uint16_t IndexLMUL : 3;
+ uint16_t Pseudo;
+};
+
+struct VLEPseudo {
+ uint16_t Masked : 1;
+ uint16_t Strided : 1;
+ uint16_t FF : 1;
+ uint16_t Log2SEW : 3;
+ uint16_t LMUL : 3;
+ uint16_t Pseudo;
+};
+
+struct VSEPseudo {
+ uint16_t Masked : 1;
+ uint16_t Strided : 1;
+ uint16_t Log2SEW : 3;
+ uint16_t LMUL : 3;
+ uint16_t Pseudo;
+};
+
+struct VLX_VSXPseudo {
+ uint16_t Masked : 1;
+ uint16_t Ordered : 1;
+ uint16_t Log2SEW : 3;
+ uint16_t LMUL : 3;
+ uint16_t IndexLMUL : 3;
+ uint16_t Pseudo;
+};
+
+#define GET_RISCVVSSEGTable_DECL
+#define GET_RISCVVLSEGTable_DECL
+#define GET_RISCVVLXSEGTable_DECL
+#define GET_RISCVVSXSEGTable_DECL
+#define GET_RISCVVLETable_DECL
+#define GET_RISCVVSETable_DECL
+#define GET_RISCVVLXTable_DECL
+#define GET_RISCVVSXTable_DECL
+#include "RISCVGenSearchableTables.inc"
+} // namespace RISCV
+
+} // namespace llvm
#endif
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 97f46d9..2945320 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -20,7 +20,6 @@
#include "RISCVTargetMachine.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -30,6 +29,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/IntrinsicsRISCV.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
@@ -90,63 +90,71 @@
if (Subtarget.hasStdExtD())
addRegisterClass(MVT::f64, &RISCV::FPR64RegClass);
+ static const MVT::SimpleValueType BoolVecVTs[] = {
+ MVT::nxv1i1, MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1,
+ MVT::nxv16i1, MVT::nxv32i1, MVT::nxv64i1};
+ static const MVT::SimpleValueType IntVecVTs[] = {
+ MVT::nxv1i8, MVT::nxv2i8, MVT::nxv4i8, MVT::nxv8i8, MVT::nxv16i8,
+ MVT::nxv32i8, MVT::nxv64i8, MVT::nxv1i16, MVT::nxv2i16, MVT::nxv4i16,
+ MVT::nxv8i16, MVT::nxv16i16, MVT::nxv32i16, MVT::nxv1i32, MVT::nxv2i32,
+ MVT::nxv4i32, MVT::nxv8i32, MVT::nxv16i32, MVT::nxv1i64, MVT::nxv2i64,
+ MVT::nxv4i64, MVT::nxv8i64};
+ static const MVT::SimpleValueType F16VecVTs[] = {
+ MVT::nxv1f16, MVT::nxv2f16, MVT::nxv4f16,
+ MVT::nxv8f16, MVT::nxv16f16, MVT::nxv32f16};
+ static const MVT::SimpleValueType F32VecVTs[] = {
+ MVT::nxv1f32, MVT::nxv2f32, MVT::nxv4f32, MVT::nxv8f32, MVT::nxv16f32};
+ static const MVT::SimpleValueType F64VecVTs[] = {
+ MVT::nxv1f64, MVT::nxv2f64, MVT::nxv4f64, MVT::nxv8f64};
+
if (Subtarget.hasStdExtV()) {
- addRegisterClass(RISCVVMVTs::vbool64_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vbool32_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vbool16_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vbool8_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vbool4_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vbool2_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vbool1_t, &RISCV::VRRegClass);
+ auto addRegClassForRVV = [this](MVT VT) {
+ unsigned Size = VT.getSizeInBits().getKnownMinValue();
+ assert(Size <= 512 && isPowerOf2_32(Size));
+ const TargetRegisterClass *RC;
+ if (Size <= 64)
+ RC = &RISCV::VRRegClass;
+ else if (Size == 128)
+ RC = &RISCV::VRM2RegClass;
+ else if (Size == 256)
+ RC = &RISCV::VRM4RegClass;
+ else
+ RC = &RISCV::VRM8RegClass;
- addRegisterClass(RISCVVMVTs::vint8mf8_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vint8mf4_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vint8mf2_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vint8m1_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vint8m2_t, &RISCV::VRM2RegClass);
- addRegisterClass(RISCVVMVTs::vint8m4_t, &RISCV::VRM4RegClass);
- addRegisterClass(RISCVVMVTs::vint8m8_t, &RISCV::VRM8RegClass);
+ addRegisterClass(VT, RC);
+ };
- addRegisterClass(RISCVVMVTs::vint16mf4_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vint16mf2_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vint16m1_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vint16m2_t, &RISCV::VRM2RegClass);
- addRegisterClass(RISCVVMVTs::vint16m4_t, &RISCV::VRM4RegClass);
- addRegisterClass(RISCVVMVTs::vint16m8_t, &RISCV::VRM8RegClass);
+ for (MVT VT : BoolVecVTs)
+ addRegClassForRVV(VT);
+ for (MVT VT : IntVecVTs)
+ addRegClassForRVV(VT);
- addRegisterClass(RISCVVMVTs::vint32mf2_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vint32m1_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vint32m2_t, &RISCV::VRM2RegClass);
- addRegisterClass(RISCVVMVTs::vint32m4_t, &RISCV::VRM4RegClass);
- addRegisterClass(RISCVVMVTs::vint32m8_t, &RISCV::VRM8RegClass);
+ if (Subtarget.hasStdExtZfh())
+ for (MVT VT : F16VecVTs)
+ addRegClassForRVV(VT);
- addRegisterClass(RISCVVMVTs::vint64m1_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vint64m2_t, &RISCV::VRM2RegClass);
- addRegisterClass(RISCVVMVTs::vint64m4_t, &RISCV::VRM4RegClass);
- addRegisterClass(RISCVVMVTs::vint64m8_t, &RISCV::VRM8RegClass);
+ if (Subtarget.hasStdExtF())
+ for (MVT VT : F32VecVTs)
+ addRegClassForRVV(VT);
- if (Subtarget.hasStdExtZfh()) {
- addRegisterClass(RISCVVMVTs::vfloat16mf4_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vfloat16mf2_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vfloat16m1_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vfloat16m2_t, &RISCV::VRM2RegClass);
- addRegisterClass(RISCVVMVTs::vfloat16m4_t, &RISCV::VRM4RegClass);
- addRegisterClass(RISCVVMVTs::vfloat16m8_t, &RISCV::VRM8RegClass);
- }
+ if (Subtarget.hasStdExtD())
+ for (MVT VT : F64VecVTs)
+ addRegClassForRVV(VT);
- if (Subtarget.hasStdExtF()) {
- addRegisterClass(RISCVVMVTs::vfloat32mf2_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vfloat32m1_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vfloat32m2_t, &RISCV::VRM2RegClass);
- addRegisterClass(RISCVVMVTs::vfloat32m4_t, &RISCV::VRM4RegClass);
- addRegisterClass(RISCVVMVTs::vfloat32m8_t, &RISCV::VRM8RegClass);
- }
+ if (Subtarget.useRVVForFixedLengthVectors()) {
+ auto addRegClassForFixedVectors = [this](MVT VT) {
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+ unsigned RCID = getRegClassIDForVecVT(ContainerVT);
+ const RISCVRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ addRegisterClass(VT, TRI.getRegClass(RCID));
+ };
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+ if (useRVVForFixedLengthVectorVT(VT))
+ addRegClassForFixedVectors(VT);
- if (Subtarget.hasStdExtD()) {
- addRegisterClass(RISCVVMVTs::vfloat64m1_t, &RISCV::VRRegClass);
- addRegisterClass(RISCVVMVTs::vfloat64m2_t, &RISCV::VRM2RegClass);
- addRegisterClass(RISCVVMVTs::vfloat64m4_t, &RISCV::VRM4RegClass);
- addRegisterClass(RISCVVMVTs::vfloat64m8_t, &RISCV::VRM8RegClass);
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+ if (useRVVForFixedLengthVectorVT(VT))
+ addRegClassForFixedVectors(VT);
}
}
@@ -163,6 +171,7 @@
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, XLenVT, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::SELECT_CC, XLenVT, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
@@ -185,6 +194,11 @@
setOperationAction(ISD::SHL, MVT::i32, Custom);
setOperationAction(ISD::SRA, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i32, Custom);
+
+ setOperationAction(ISD::UADDO, MVT::i32, Custom);
+ setOperationAction(ISD::USUBO, MVT::i32, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::i32, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::i32, Custom);
}
if (!Subtarget.hasStdExtM()) {
@@ -195,20 +209,23 @@
setOperationAction(ISD::UDIV, XLenVT, Expand);
setOperationAction(ISD::SREM, XLenVT, Expand);
setOperationAction(ISD::UREM, XLenVT, Expand);
- }
+ } else {
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::MUL, MVT::i32, Custom);
+ setOperationAction(ISD::MUL, MVT::i128, Custom);
- if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) {
- setOperationAction(ISD::MUL, MVT::i32, Custom);
-
- setOperationAction(ISD::SDIV, MVT::i8, Custom);
- setOperationAction(ISD::UDIV, MVT::i8, Custom);
- setOperationAction(ISD::UREM, MVT::i8, Custom);
- setOperationAction(ISD::SDIV, MVT::i16, Custom);
- setOperationAction(ISD::UDIV, MVT::i16, Custom);
- setOperationAction(ISD::UREM, MVT::i16, Custom);
- setOperationAction(ISD::SDIV, MVT::i32, Custom);
- setOperationAction(ISD::UDIV, MVT::i32, Custom);
- setOperationAction(ISD::UREM, MVT::i32, Custom);
+ setOperationAction(ISD::SDIV, MVT::i8, Custom);
+ setOperationAction(ISD::UDIV, MVT::i8, Custom);
+ setOperationAction(ISD::UREM, MVT::i8, Custom);
+ setOperationAction(ISD::SDIV, MVT::i16, Custom);
+ setOperationAction(ISD::UDIV, MVT::i16, Custom);
+ setOperationAction(ISD::UREM, MVT::i16, Custom);
+ setOperationAction(ISD::SDIV, MVT::i32, Custom);
+ setOperationAction(ISD::UDIV, MVT::i32, Custom);
+ setOperationAction(ISD::UREM, MVT::i32, Custom);
+ } else {
+ setOperationAction(ISD::MUL, MVT::i64, Custom);
+ }
}
setOperationAction(ISD::SDIVREM, XLenVT, Expand);
@@ -233,12 +250,16 @@
if (Subtarget.hasStdExtZbp()) {
// Custom lower bswap/bitreverse so we can convert them to GREVI to enable
// more combining.
- setOperationAction(ISD::BITREVERSE, XLenVT, Custom);
- setOperationAction(ISD::BSWAP, XLenVT, Custom);
+ setOperationAction(ISD::BITREVERSE, XLenVT, Custom);
+ setOperationAction(ISD::BSWAP, XLenVT, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::i8, Custom);
+ // BSWAP i8 doesn't exist.
+ setOperationAction(ISD::BITREVERSE, MVT::i16, Custom);
+ setOperationAction(ISD::BSWAP, MVT::i16, Custom);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::BITREVERSE, MVT::i32, Custom);
- setOperationAction(ISD::BSWAP, MVT::i32, Custom);
+ setOperationAction(ISD::BSWAP, MVT::i32, Custom);
}
} else {
// With Zbb we have an XLen rev8 instruction, but not GREVI. So we'll
@@ -252,6 +273,13 @@
setOperationAction(ISD::SMAX, XLenVT, Legal);
setOperationAction(ISD::UMIN, XLenVT, Legal);
setOperationAction(ISD::UMAX, XLenVT, Legal);
+
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::CTTZ, MVT::i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+ setOperationAction(ISD::CTLZ, MVT::i32, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+ }
} else {
setOperationAction(ISD::CTTZ, XLenVT, Expand);
setOperationAction(ISD::CTLZ, XLenVT, Expand);
@@ -259,8 +287,8 @@
}
if (Subtarget.hasStdExtZbt()) {
- setOperationAction(ISD::FSHL, XLenVT, Legal);
- setOperationAction(ISD::FSHR, XLenVT, Legal);
+ setOperationAction(ISD::FSHL, XLenVT, Custom);
+ setOperationAction(ISD::FSHR, XLenVT, Custom);
setOperationAction(ISD::SELECT, XLenVT, Legal);
if (Subtarget.is64Bit()) {
@@ -286,6 +314,10 @@
if (Subtarget.hasStdExtZfh()) {
setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+ setOperationAction(ISD::LRINT, MVT::f16, Legal);
+ setOperationAction(ISD::LLRINT, MVT::f16, Legal);
+ setOperationAction(ISD::LROUND, MVT::f16, Legal);
+ setOperationAction(ISD::LLROUND, MVT::f16, Legal);
for (auto CC : FPCCToExpand)
setCondCodeAction(CC, MVT::f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
@@ -298,6 +330,10 @@
if (Subtarget.hasStdExtF()) {
setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ setOperationAction(ISD::LRINT, MVT::f32, Legal);
+ setOperationAction(ISD::LLRINT, MVT::f32, Legal);
+ setOperationAction(ISD::LROUND, MVT::f32, Legal);
+ setOperationAction(ISD::LLROUND, MVT::f32, Legal);
for (auto CC : FPCCToExpand)
setCondCodeAction(CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
@@ -315,6 +351,10 @@
if (Subtarget.hasStdExtD()) {
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+ setOperationAction(ISD::LRINT, MVT::f64, Legal);
+ setOperationAction(ISD::LLRINT, MVT::f64, Legal);
+ setOperationAction(ISD::LROUND, MVT::f64, Legal);
+ setOperationAction(ISD::LLROUND, MVT::f64, Legal);
for (auto CC : FPCCToExpand)
setCondCodeAction(CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
@@ -335,6 +375,11 @@
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
}
+ if (Subtarget.hasStdExtF()) {
+ setOperationAction(ISD::FLT_ROUNDS_, XLenVT, Custom);
+ setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
+ }
+
setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
setOperationAction(ISD::BlockAddress, XLenVT, Custom);
setOperationAction(ISD::ConstantPool, XLenVT, Custom);
@@ -350,6 +395,8 @@
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i32, Custom);
if (Subtarget.hasStdExtA()) {
setMaxAtomicSizeInBitsSupported(Subtarget.getXLen());
@@ -371,18 +418,81 @@
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i32, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i32, Custom);
-
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
-
if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i32, Custom);
+ } else {
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
}
- for (auto VT : MVT::integer_scalable_vector_valuetypes()) {
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+ static unsigned IntegerVPOps[] = {
+ ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL, ISD::VP_SDIV, ISD::VP_UDIV,
+ ISD::VP_SREM, ISD::VP_UREM, ISD::VP_AND, ISD::VP_OR, ISD::VP_XOR,
+ ISD::VP_ASHR, ISD::VP_LSHR, ISD::VP_SHL};
+
+ static unsigned FloatingPointVPOps[] = {ISD::VP_FADD, ISD::VP_FSUB,
+ ISD::VP_FMUL, ISD::VP_FDIV};
+
+ if (!Subtarget.is64Bit()) {
+ // We must custom-lower certain vXi64 operations on RV32 due to the vector
+ // element type being illegal.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::i64, Custom);
+
+ setOperationAction(ISD::VECREDUCE_ADD, MVT::i64, Custom);
+ setOperationAction(ISD::VECREDUCE_AND, MVT::i64, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, MVT::i64, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, MVT::i64, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, MVT::i64, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, MVT::i64, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, MVT::i64, Custom);
+ setOperationAction(ISD::VECREDUCE_UMIN, MVT::i64, Custom);
+ }
+
+ for (MVT VT : BoolVecVTs) {
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+
+ // Mask VTs are custom-expanded into a series of standard nodes
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+
+ setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+
+ // RVV has native int->float & float->int conversions where the
+ // element type sizes are within one power-of-two of each other. Any
+ // wider distances between type sizes have to be lowered as sequences
+ // which progressively narrow the gap in stages.
+ setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+ setOperationAction(ISD::UINT_TO_FP, VT, Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+
+ // Expand all extending loads to types larger than this, and truncating
+ // stores from types larger than this.
+ for (MVT OtherVT : MVT::integer_scalable_vector_valuetypes()) {
+ setTruncStoreAction(OtherVT, VT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, OtherVT, VT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, OtherVT, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, OtherVT, VT, Expand);
+ }
+ }
+
+ for (MVT VT : IntVecVTs) {
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
+ setOperationAction(ISD::SPLAT_VECTOR_PARTS, VT, Custom);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::SMAX, VT, Legal);
@@ -392,32 +502,73 @@
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
- if (isTypeLegal(VT)) {
- // Custom-lower extensions and truncations from/to mask types.
- setOperationAction(ISD::ANY_EXTEND, VT, Custom);
- setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
- setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+ // Custom-lower extensions and truncations from/to mask types.
+ setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
- // We custom-lower all legally-typed vector truncates:
- // 1. Mask VTs are custom-expanded into a series of standard nodes
- // 2. Integer VTs are lowered as a series of "RISCVISD::TRUNCATE_VECTOR"
- // nodes which truncate by one power of two at a time.
- setOperationAction(ISD::TRUNCATE, VT, Custom);
+ // RVV has native int->float & float->int conversions where the
+ // element type sizes are within one power-of-two of each other. Any
+ // wider distances between type sizes have to be lowered as sequences
+ // which progressively narrow the gap in stages.
+ setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+ setOperationAction(ISD::UINT_TO_FP, VT, Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
- // Custom-lower insert/extract operations to simplify patterns.
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SADDSAT, VT, Legal);
+ setOperationAction(ISD::UADDSAT, VT, Legal);
+ setOperationAction(ISD::SSUBSAT, VT, Legal);
+ setOperationAction(ISD::USUBSAT, VT, Legal);
+
+ // Integer VTs are lowered as a series of "RISCVISD::TRUNCATE_VECTOR_VL"
+ // nodes which truncate by one power of two at a time.
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+
+ // Custom-lower insert/extract operations to simplify patterns.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+
+ // Custom-lower reduction operations to set up the corresponding custom
+ // nodes' operands.
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+
+ for (unsigned VPOpc : IntegerVPOps)
+ setOperationAction(VPOpc, VT, Custom);
+
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Custom);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+
+ setOperationAction(ISD::STEP_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_REVERSE, VT, Custom);
+
+ for (MVT OtherVT : MVT::integer_scalable_vector_valuetypes()) {
+ setTruncStoreAction(VT, OtherVT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, OtherVT, VT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, OtherVT, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, OtherVT, VT, Expand);
}
}
- // We must custom-lower certain vXi64 operations on RV32 due to the vector
- // element type being illegal.
- if (!Subtarget.is64Bit()) {
- setOperationAction(ISD::SPLAT_VECTOR, MVT::i64, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::i64, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::i64, Custom);
- }
-
// Expand various CCs to best match the RVV ISA, which natively supports UNE
// but no other unordered comparisons, and supports all ordered comparisons
// except ONE. Additionally, we expand GT,OGT,GE,OGE for optimization
@@ -434,31 +585,260 @@
// Sets common operation actions on RVV floating-point vector types.
const auto SetCommonVFPActions = [&](MVT VT) {
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
+ // RVV has native FP_ROUND & FP_EXTEND conversions where the element type
+ // sizes are within one power-of-two of each other. Therefore conversions
+ // between vXf16 and vXf64 must be lowered as sequences which convert via
+ // vXf32.
+ setOperationAction(ISD::FP_ROUND, VT, Custom);
+ setOperationAction(ISD::FP_EXTEND, VT, Custom);
// Custom-lower insert/extract operations to simplify patterns.
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ // Expand various condition codes (explained above).
for (auto CC : VFPCCToExpand)
setCondCodeAction(CC, VT, Expand);
+
+ setOperationAction(ISD::FMINNUM, VT, Legal);
+ setOperationAction(ISD::FMAXNUM, VT, Legal);
+
+ setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Legal);
+
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Custom);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ setOperationAction(ISD::VECTOR_REVERSE, VT, Custom);
+
+ for (unsigned VPOpc : FloatingPointVPOps)
+ setOperationAction(VPOpc, VT, Custom);
};
- if (Subtarget.hasStdExtZfh()) {
- for (auto VT : {RISCVVMVTs::vfloat16mf4_t, RISCVVMVTs::vfloat16mf2_t,
- RISCVVMVTs::vfloat16m1_t, RISCVVMVTs::vfloat16m2_t,
- RISCVVMVTs::vfloat16m4_t, RISCVVMVTs::vfloat16m8_t})
+ // Sets common extload/truncstore actions on RVV floating-point vector
+ // types.
+ const auto SetCommonVFPExtLoadTruncStoreActions =
+ [&](MVT VT, ArrayRef<MVT::SimpleValueType> SmallerVTs) {
+ for (auto SmallVT : SmallerVTs) {
+ setTruncStoreAction(VT, SmallVT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, SmallVT, Expand);
+ }
+ };
+
+ if (Subtarget.hasStdExtZfh())
+ for (MVT VT : F16VecVTs)
SetCommonVFPActions(VT);
+
+ for (MVT VT : F32VecVTs) {
+ if (Subtarget.hasStdExtF())
+ SetCommonVFPActions(VT);
+ SetCommonVFPExtLoadTruncStoreActions(VT, F16VecVTs);
}
- if (Subtarget.hasStdExtF()) {
- for (auto VT : {RISCVVMVTs::vfloat32mf2_t, RISCVVMVTs::vfloat32m1_t,
- RISCVVMVTs::vfloat32m2_t, RISCVVMVTs::vfloat32m4_t,
- RISCVVMVTs::vfloat32m8_t})
+ for (MVT VT : F64VecVTs) {
+ if (Subtarget.hasStdExtD())
SetCommonVFPActions(VT);
+ SetCommonVFPExtLoadTruncStoreActions(VT, F16VecVTs);
+ SetCommonVFPExtLoadTruncStoreActions(VT, F32VecVTs);
}
- if (Subtarget.hasStdExtD()) {
- for (auto VT : {RISCVVMVTs::vfloat64m1_t, RISCVVMVTs::vfloat64m2_t,
- RISCVVMVTs::vfloat64m4_t, RISCVVMVTs::vfloat64m8_t})
- SetCommonVFPActions(VT);
+ if (Subtarget.useRVVForFixedLengthVectors()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
+ if (!useRVVForFixedLengthVectorVT(VT))
+ continue;
+
+ // By default everything must be expanded.
+ for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
+ setOperationAction(Op, VT, Expand);
+ for (MVT OtherVT : MVT::integer_fixedlen_vector_valuetypes()) {
+ setTruncStoreAction(VT, OtherVT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, OtherVT, VT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, OtherVT, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, OtherVT, VT, Expand);
+ }
+
+ // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed.
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+
+ setOperationAction(ISD::SETCC, VT, Custom);
+
+ setOperationAction(ISD::SELECT, VT, Custom);
+
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+
+ setOperationAction(ISD::BITCAST, VT, Custom);
+
+ setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+
+ setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+ setOperationAction(ISD::UINT_TO_FP, VT, Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+
+ // Operations below are different between masks and other vectors.
+ if (VT.getVectorElementType() == MVT::i1) {
+ setOperationAction(ISD::AND, VT, Custom);
+ setOperationAction(ISD::OR, VT, Custom);
+ setOperationAction(ISD::XOR, VT, Custom);
+ continue;
+ }
+
+ // Use SPLAT_VECTOR to prevent type legalization from destroying the
+ // splats when type legalizing i64 scalar on RV32.
+ // FIXME: Use SPLAT_VECTOR for all types? DAGCombine probably needs
+ // improvements first.
+ if (!Subtarget.is64Bit() && VT.getVectorElementType() == MVT::i64) {
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR_PARTS, VT, Custom);
+ }
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Custom);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::AND, VT, Custom);
+ setOperationAction(ISD::OR, VT, Custom);
+ setOperationAction(ISD::XOR, VT, Custom);
+ setOperationAction(ISD::SDIV, VT, Custom);
+ setOperationAction(ISD::SREM, VT, Custom);
+ setOperationAction(ISD::UDIV, VT, Custom);
+ setOperationAction(ISD::UREM, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+
+ setOperationAction(ISD::SMIN, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Custom);
+ setOperationAction(ISD::UMIN, VT, Custom);
+ setOperationAction(ISD::UMAX, VT, Custom);
+ setOperationAction(ISD::ABS, VT, Custom);
+
+ setOperationAction(ISD::MULHS, VT, Custom);
+ setOperationAction(ISD::MULHU, VT, Custom);
+
+ setOperationAction(ISD::SADDSAT, VT, Custom);
+ setOperationAction(ISD::UADDSAT, VT, Custom);
+ setOperationAction(ISD::SSUBSAT, VT, Custom);
+ setOperationAction(ISD::USUBSAT, VT, Custom);
+
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+
+ setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+
+ // Custom-lower reduction operations to set up the corresponding custom
+ // nodes' operands.
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+
+ for (unsigned VPOpc : IntegerVPOps)
+ setOperationAction(VPOpc, VT, Custom);
+ }
+
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
+ if (!useRVVForFixedLengthVectorVT(VT))
+ continue;
+
+ // By default everything must be expanded.
+ for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
+ setOperationAction(Op, VT, Expand);
+ for (MVT OtherVT : MVT::fp_fixedlen_vector_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, OtherVT, VT, Expand);
+ setTruncStoreAction(VT, OtherVT, Expand);
+ }
+
+ // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed.
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Custom);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::FSUB, VT, Custom);
+ setOperationAction(ISD::FMUL, VT, Custom);
+ setOperationAction(ISD::FDIV, VT, Custom);
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ setOperationAction(ISD::FSQRT, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Custom);
+ setOperationAction(ISD::FMINNUM, VT, Custom);
+ setOperationAction(ISD::FMAXNUM, VT, Custom);
+
+ setOperationAction(ISD::FP_ROUND, VT, Custom);
+ setOperationAction(ISD::FP_EXTEND, VT, Custom);
+
+ for (auto CC : VFPCCToExpand)
+ setCondCodeAction(CC, VT, Expand);
+
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+
+ setOperationAction(ISD::BITCAST, VT, Custom);
+
+ setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
+
+ for (unsigned VPOpc : FloatingPointVPOps)
+ setOperationAction(VPOpc, VT, Custom);
+ }
+
+ // Custom-legalize bitcasts from fixed-length vectors to scalar types.
+ setOperationAction(ISD::BITCAST, MVT::i8, Custom);
+ setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::i64, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f64, Custom);
}
}
@@ -475,21 +855,36 @@
// We can use any register for comparisons
setHasMultipleConditionRegisters();
- setTargetDAGCombine(ISD::SETCC);
- if (Subtarget.hasStdExtZbp()) {
- setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ if (Subtarget.hasStdExtV()) {
+ setTargetDAGCombine(ISD::FCOPYSIGN);
+ setTargetDAGCombine(ISD::MGATHER);
+ setTargetDAGCombine(ISD::MSCATTER);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::SHL);
}
}
-EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL,
+ LLVMContext &Context,
EVT VT) const {
if (!VT.isVector())
return getPointerTy(DL);
- if (Subtarget.hasStdExtV())
- return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
+ if (Subtarget.hasStdExtV() &&
+ (VT.isScalableVector() || Subtarget.useRVVForFixedLengthVectors()))
+ return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
return VT.changeVectorElementTypeToInteger();
}
+MVT RISCVTargetLowering::getVPExplicitVectorLengthTy() const {
+ return Subtarget.getXLenVT();
+}
+
bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
@@ -505,7 +900,7 @@
case Intrinsic::riscv_masked_atomicrmw_min_i32:
case Intrinsic::riscv_masked_atomicrmw_umax_i32:
case Intrinsic::riscv_masked_atomicrmw_umin_i32:
- case Intrinsic::riscv_masked_cmpxchg_i32:
+ case Intrinsic::riscv_masked_cmpxchg_i32: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(PtrTy->getElementType());
@@ -516,6 +911,7 @@
MachineMemOperand::MOVolatile;
return true;
}
+ }
}
bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL,
@@ -617,10 +1013,48 @@
(VT == MVT::f64 && Subtarget.hasStdExtD());
}
+MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ // Use f32 to pass f16 if it is legal and Zfh is not enabled. We might still
+ // end up using a GPR but that will be decided based on ABI.
+ if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfh())
+ return MVT::f32;
+
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
+unsigned RISCVTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ // Use f32 to pass f16 if it is legal and Zfh is not enabled. We might still
+ // end up using a GPR but that will be decided based on ABI.
+ if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfh())
+ return 1;
+
+ return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+}
+
// Changes the condition code and swaps operands if necessary, so the SetCC
-// operation matches one of the comparisons supported directly in the RISC-V
-// ISA.
-static void normaliseSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC) {
+// operation matches one of the comparisons supported directly by branches
+// in the RISC-V ISA. May adjust compares to favor compare with 0 over compare
+// with 1/-1.
+static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS,
+ ISD::CondCode &CC, SelectionDAG &DAG) {
+ // Convert X > -1 to X >= 0.
+ if (CC == ISD::SETGT && isAllOnesConstant(RHS)) {
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
+ CC = ISD::SETGE;
+ return;
+ }
+ // Convert X < 1 to 0 >= X.
+ if (CC == ISD::SETLT && isOneConstant(RHS)) {
+ RHS = LHS;
+ LHS = DAG.getConstant(0, DL, RHS.getValueType());
+ CC = ISD::SETGE;
+ return;
+ }
+
switch (CC) {
default:
break;
@@ -636,7 +1070,7 @@
// Return the RISC-V branch opcode that matches the given DAG integer
// condition code. The CondCode must be one of those supported by the RISC-V
-// ISA (see normaliseSetCC).
+// ISA (see translateSetCCForBranch).
static unsigned getBranchOpcodeForIntCondCode(ISD::CondCode CC) {
switch (CC) {
default:
@@ -656,6 +1090,1002 @@
}
}
+RISCVII::VLMUL RISCVTargetLowering::getLMUL(MVT VT) {
+ assert(VT.isScalableVector() && "Expecting a scalable vector type");
+ unsigned KnownSize = VT.getSizeInBits().getKnownMinValue();
+ if (VT.getVectorElementType() == MVT::i1)
+ KnownSize *= 8;
+
+ switch (KnownSize) {
+ default:
+ llvm_unreachable("Invalid LMUL.");
+ case 8:
+ return RISCVII::VLMUL::LMUL_F8;
+ case 16:
+ return RISCVII::VLMUL::LMUL_F4;
+ case 32:
+ return RISCVII::VLMUL::LMUL_F2;
+ case 64:
+ return RISCVII::VLMUL::LMUL_1;
+ case 128:
+ return RISCVII::VLMUL::LMUL_2;
+ case 256:
+ return RISCVII::VLMUL::LMUL_4;
+ case 512:
+ return RISCVII::VLMUL::LMUL_8;
+ }
+}
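For intuition, here is a tiny standalone sketch, separate from the patch, that reproduces the known-min-size to LMUL mapping above; the enum and helper names are invented for illustration.

#include <cassert>
#include <cstdio>

enum class LMUL { F8, F4, F2, M1, M2, M4, M8 };

// Mirrors getLMUL: i1 vectors are scaled up by 8 so masks reuse the integer
// mapping, then the known-minimum size in bits picks the register grouping.
LMUL lmulForKnownMinBits(unsigned KnownBits, bool IsMask) {
  if (IsMask)
    KnownBits *= 8; // e.g. nxv8i1 (8 bits) behaves like a 64-bit LMUL=1 type
  switch (KnownBits) {
  case 8:   return LMUL::F8;
  case 16:  return LMUL::F4;
  case 32:  return LMUL::F2;
  case 64:  return LMUL::M1;
  case 128: return LMUL::M2;
  case 256: return LMUL::M4;
  case 512: return LMUL::M8;
  default:  assert(false && "invalid known-min size"); return LMUL::M1;
  }
}

int main() {
  // nxv2i32 has 2 * 32 = 64 known-min bits -> LMUL=1 (a single VR register).
  assert(lmulForKnownMinBits(64, false) == LMUL::M1);
  // nxv8i32 has 256 known-min bits -> LMUL=4 (a VRM4 register group).
  assert(lmulForKnownMinBits(256, false) == LMUL::M4);
  std::puts("ok");
}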
+
+unsigned RISCVTargetLowering::getRegClassIDForLMUL(RISCVII::VLMUL LMul) {
+ switch (LMul) {
+ default:
+ llvm_unreachable("Invalid LMUL.");
+ case RISCVII::VLMUL::LMUL_F8:
+ case RISCVII::VLMUL::LMUL_F4:
+ case RISCVII::VLMUL::LMUL_F2:
+ case RISCVII::VLMUL::LMUL_1:
+ return RISCV::VRRegClassID;
+ case RISCVII::VLMUL::LMUL_2:
+ return RISCV::VRM2RegClassID;
+ case RISCVII::VLMUL::LMUL_4:
+ return RISCV::VRM4RegClassID;
+ case RISCVII::VLMUL::LMUL_8:
+ return RISCV::VRM8RegClassID;
+ }
+}
+
+unsigned RISCVTargetLowering::getSubregIndexByMVT(MVT VT, unsigned Index) {
+ RISCVII::VLMUL LMUL = getLMUL(VT);
+ if (LMUL == RISCVII::VLMUL::LMUL_F8 ||
+ LMUL == RISCVII::VLMUL::LMUL_F4 ||
+ LMUL == RISCVII::VLMUL::LMUL_F2 ||
+ LMUL == RISCVII::VLMUL::LMUL_1) {
+ static_assert(RISCV::sub_vrm1_7 == RISCV::sub_vrm1_0 + 7,
+ "Unexpected subreg numbering");
+ return RISCV::sub_vrm1_0 + Index;
+ }
+ if (LMUL == RISCVII::VLMUL::LMUL_2) {
+ static_assert(RISCV::sub_vrm2_3 == RISCV::sub_vrm2_0 + 3,
+ "Unexpected subreg numbering");
+ return RISCV::sub_vrm2_0 + Index;
+ }
+ if (LMUL == RISCVII::VLMUL::LMUL_4) {
+ static_assert(RISCV::sub_vrm4_1 == RISCV::sub_vrm4_0 + 1,
+ "Unexpected subreg numbering");
+ return RISCV::sub_vrm4_0 + Index;
+ }
+ llvm_unreachable("Invalid vector type.");
+}
+
+unsigned RISCVTargetLowering::getRegClassIDForVecVT(MVT VT) {
+ if (VT.getVectorElementType() == MVT::i1)
+ return RISCV::VRRegClassID;
+ return getRegClassIDForLMUL(getLMUL(VT));
+}
+
+// Attempt to decompose a subvector insert/extract between VecVT and
+// SubVecVT via subregister indices. Returns the subregister index that
+// can perform the subvector insert/extract with the given element index, as
+// well as the index corresponding to any leftover subvectors that must be
+// further inserted/extracted within the register class for SubVecVT.
+std::pair<unsigned, unsigned>
+RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+ MVT VecVT, MVT SubVecVT, unsigned InsertExtractIdx,
+ const RISCVRegisterInfo *TRI) {
+ static_assert((RISCV::VRM8RegClassID > RISCV::VRM4RegClassID &&
+ RISCV::VRM4RegClassID > RISCV::VRM2RegClassID &&
+ RISCV::VRM2RegClassID > RISCV::VRRegClassID),
+ "Register classes not ordered");
+ unsigned VecRegClassID = getRegClassIDForVecVT(VecVT);
+ unsigned SubRegClassID = getRegClassIDForVecVT(SubVecVT);
+ // Try to compose a subregister index that takes us from the incoming
+ // LMUL>1 register class down to the outgoing one. At each step we halve
+ // the LMUL:
+ // nxv16i32@12 -> nxv2i32: sub_vrm4_1_then_sub_vrm2_1_then_sub_vrm1_0
+ // Note that this is not guaranteed to find a subregister index, such as
+ // when we are extracting from one VR type to another.
+ unsigned SubRegIdx = RISCV::NoSubRegister;
+ for (const unsigned RCID :
+ {RISCV::VRM4RegClassID, RISCV::VRM2RegClassID, RISCV::VRRegClassID})
+ if (VecRegClassID > RCID && SubRegClassID <= RCID) {
+ VecVT = VecVT.getHalfNumVectorElementsVT();
+ bool IsHi =
+ InsertExtractIdx >= VecVT.getVectorElementCount().getKnownMinValue();
+ SubRegIdx = TRI->composeSubRegIndices(SubRegIdx,
+ getSubregIndexByMVT(VecVT, IsHi));
+ if (IsHi)
+ InsertExtractIdx -= VecVT.getVectorElementCount().getKnownMinValue();
+ }
+ return {SubRegIdx, InsertExtractIdx};
+}
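The subregister walk above is easiest to follow with plain numbers. The sketch below is not LLVM API and uses invented names; it models a register group only by its known-minimum element count and replays the comment's nxv16i32@12 -> nxv2i32 example.

#include <cstdio>
#include <utility>
#include <vector>

// Repeatedly halve the enclosing group, record whether the wanted position
// falls in the high or low half, and rebase the index when descending into
// the high half. Returns the hi/lo picks (outermost first) plus the element
// index left over inside the smallest register class.
std::pair<std::vector<bool>, unsigned>
decomposeIndex(unsigned BigMinElts, unsigned SmallMinElts, unsigned Idx) {
  std::vector<bool> HiPicks;
  while (BigMinElts > SmallMinElts) {
    BigMinElts /= 2;               // descend one LMUL level
    bool IsHi = Idx >= BigMinElts; // which half holds the wanted position?
    HiPicks.push_back(IsHi);
    if (IsHi)
      Idx -= BigMinElts;           // rebase into that half
  }
  return {HiPicks, Idx};
}

int main() {
  // nxv16i32 @ index 12 down to nxv2i32:
  // 16 -> 8 (hi, idx 4) -> 4 (hi, idx 0) -> 2 (lo, idx 0), matching
  // sub_vrm4_1 then sub_vrm2_1 then sub_vrm1_0 with no leftover index.
  auto [Picks, Leftover] = decomposeIndex(16, 2, 12);
  std::printf("picks:");
  for (bool Hi : Picks)
    std::printf(" %s", Hi ? "hi" : "lo");
  std::printf("  leftover index: %u\n", Leftover); // picks: hi hi lo, leftover 0
}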
+
+// Permit combining of mask vectors as BUILD_VECTOR never expands to scalar
+// stores for those types.
+bool RISCVTargetLowering::mergeStoresAfterLegalization(EVT VT) const {
+ return !Subtarget.useRVVForFixedLengthVectors() ||
+ (VT.isFixedLengthVector() && VT.getVectorElementType() == MVT::i1);
+}
+
+static bool useRVVForFixedLengthVectorVT(MVT VT,
+ const RISCVSubtarget &Subtarget) {
+ assert(VT.isFixedLengthVector() && "Expected a fixed length vector type!");
+ if (!Subtarget.useRVVForFixedLengthVectors())
+ return false;
+
+ // We only support a set of vector types with a consistent maximum fixed size
+ // across all supported vector element types to avoid legalization issues.
+ // Therefore -- since the largest is v1024i8/v512i16/etc -- the largest
+ // fixed-length vector type we support is 1024 bytes.
+ if (VT.getFixedSizeInBits() > 1024 * 8)
+ return false;
+
+ unsigned MinVLen = Subtarget.getMinRVVVectorSizeInBits();
+
+ // Don't use RVV for vectors we cannot scalarize if required.
+ switch (VT.getVectorElementType().SimpleTy) {
+ // i1 is supported but has different rules.
+ default:
+ return false;
+ case MVT::i1:
+ // Masks can only use a single register.
+ if (VT.getVectorNumElements() > MinVLen)
+ return false;
+ MinVLen /= 8;
+ break;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ break;
+ case MVT::f16:
+ if (!Subtarget.hasStdExtZfh())
+ return false;
+ break;
+ case MVT::f32:
+ if (!Subtarget.hasStdExtF())
+ return false;
+ break;
+ case MVT::f64:
+ if (!Subtarget.hasStdExtD())
+ return false;
+ break;
+ }
+
+ unsigned LMul = divideCeil(VT.getSizeInBits(), MinVLen);
+ // Don't use RVV for types that don't fit.
+ if (LMul > Subtarget.getMaxLMULForFixedLengthVectors())
+ return false;
+
+ // TODO: Perhaps an artificial restriction, but worth having whilst getting
+ // the base fixed length RVV support in place.
+ if (!VT.isPow2VectorType())
+ return false;
+
+ return true;
+}
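The sizing check reduces to simple arithmetic: how many LMUL=1 registers would this fixed vector need under the guaranteed minimum VLEN, and is that within the configured maximum LMUL. A standalone sketch with invented names (masks, for which MinVLen is divided by 8, are left out):

#include <cassert>

unsigned divideCeil(unsigned A, unsigned B) { return (A + B - 1) / B; }

// A fixed vector "fits" if the LMUL it would need does not exceed the limit.
bool fixedVectorFits(unsigned VectorBits, unsigned MinVLen, unsigned MaxLMUL) {
  unsigned LMul = divideCeil(VectorBits, MinVLen);
  return LMul <= MaxLMUL;
}

int main() {
  // With a guaranteed VLEN of 128 bits and a maximum LMUL of 8:
  assert(fixedVectorFits(256, 128, 8));   // v8i32 needs LMUL 2 -> usable
  assert(fixedVectorFits(1024, 128, 8));  // v32i32 needs LMUL 8 -> usable
  assert(!fixedVectorFits(2048, 128, 8)); // v64i32 would need LMUL 16 -> not
}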
+
+bool RISCVTargetLowering::useRVVForFixedLengthVectorVT(MVT VT) const {
+ return ::useRVVForFixedLengthVectorVT(VT, Subtarget);
+}
+
+// Return the largest legal scalable vector type that matches VT's element type.
+static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
+ const RISCVSubtarget &Subtarget) {
+ // This may be called before legal types are setup.
+ assert(((VT.isFixedLengthVector() && TLI.isTypeLegal(VT)) ||
+ useRVVForFixedLengthVectorVT(VT, Subtarget)) &&
+ "Expected legal fixed length vector!");
+
+ unsigned MinVLen = Subtarget.getMinRVVVectorSizeInBits();
+
+ MVT EltVT = VT.getVectorElementType();
+ switch (EltVT.SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for RVV container");
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f16:
+ case MVT::f32:
+ case MVT::f64: {
+ // We prefer to use LMUL=1 for VLEN-sized types. Use fractional LMULs for
+ // narrower types, but we can't have a fractional LMUL with a denominator less
+ // than 64/SEW.
+ unsigned NumElts =
+ divideCeil(VT.getVectorNumElements(), MinVLen / RISCV::RVVBitsPerBlock);
+ return MVT::getScalableVectorVT(EltVT, NumElts);
+ }
+ }
+}
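A rough standalone illustration of the divideCeil formula above, assuming RVVBitsPerBlock is 64 as in the RISC-V backend; the helper name is invented.

#include <cstdio>

// MinVLen/64 is the guaranteed minimum vscale, so dividing the fixed element
// count by it (rounding up) gives the element count of the scalable container.
unsigned containerMinElts(unsigned FixedNumElts, unsigned MinVLen) {
  unsigned MinVScale = MinVLen / 64;
  return (FixedNumElts + MinVScale - 1) / MinVScale; // divideCeil
}

int main() {
  // With a guaranteed VLEN of 128 bits, a fixed v8i32 maps to nxv4i32, which
  // provides at least (128/64) * 4 = 8 lanes, enough to hold v8i32.
  std::printf("v8i32 -> nxv%ui32\n", containerMinElts(8, 128)); // nxv4i32
}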
+
+static MVT getContainerForFixedLengthVector(SelectionDAG &DAG, MVT VT,
+ const RISCVSubtarget &Subtarget) {
+ return getContainerForFixedLengthVector(DAG.getTargetLoweringInfo(), VT,
+ Subtarget);
+}
+
+MVT RISCVTargetLowering::getContainerForFixedLengthVector(MVT VT) const {
+ return ::getContainerForFixedLengthVector(*this, VT, getSubtarget());
+}
+
+// Grow V to consume an entire RVV register.
+static SDValue convertToScalableVector(EVT VT, SDValue V, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ assert(VT.isScalableVector() &&
+ "Expected to convert into a scalable vector!");
+ assert(V.getValueType().isFixedLengthVector() &&
+ "Expected a fixed length vector operand!");
+ SDLoc DL(V);
+ SDValue Zero = DAG.getConstant(0, DL, Subtarget.getXLenVT());
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
+}
+
+// Shrink V so it's just big enough to maintain a VT's worth of data.
+static SDValue convertFromScalableVector(EVT VT, SDValue V, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ assert(VT.isFixedLengthVector() &&
+ "Expected to convert into a fixed length vector!");
+ assert(V.getValueType().isScalableVector() &&
+ "Expected a scalable vector operand!");
+ SDLoc DL(V);
+ SDValue Zero = DAG.getConstant(0, DL, Subtarget.getXLenVT());
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
+}
+
+// Gets the two common "VL" operands: an all-ones mask and the vector length.
+// VecVT is a vector type, either fixed-length or scalable, and ContainerVT is
+// the vector type that it is contained in.
+static std::pair<SDValue, SDValue>
+getDefaultVLOps(MVT VecVT, MVT ContainerVT, SDLoc DL, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ assert(ContainerVT.isScalableVector() && "Expecting scalable container type");
+ MVT XLenVT = Subtarget.getXLenVT();
+ SDValue VL = VecVT.isFixedLengthVector()
+ ? DAG.getConstant(VecVT.getVectorNumElements(), DL, XLenVT)
+ : DAG.getRegister(RISCV::X0, XLenVT);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+ SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+ return {Mask, VL};
+}
+
+// As above but assuming the given type is a scalable vector type.
+static std::pair<SDValue, SDValue>
+getDefaultScalableVLOps(MVT VecVT, SDLoc DL, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ assert(VecVT.isScalableVector() && "Expecting a scalable vector");
+ return getDefaultVLOps(VecVT, VecVT, DL, DAG, Subtarget);
+}
+
+// The state of RVV BUILD_VECTOR and VECTOR_SHUFFLE lowering is that very few
+// of either are (currently) supported. This can get us into an infinite loop
+// where we try to lower a BUILD_VECTOR as a VECTOR_SHUFFLE as a BUILD_VECTOR
+// as a ..., etc.
+// Until either (or both) of these can reliably lower any node, reporting that
+// we don't want to expand BUILD_VECTORs via VECTOR_SHUFFLEs at least breaks
+// the infinite loop. Note that this lowers BUILD_VECTOR through the stack,
+// which is not desirable.
+bool RISCVTargetLowering::shouldExpandBuildVectorWithShuffles(
+ EVT VT, unsigned DefinedValues) const {
+ return false;
+}
+
+bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
+ // Only splats are currently supported.
+ if (ShuffleVectorSDNode::isSplatMask(M.data(), VT))
+ return true;
+
+ return false;
+}
+
+static SDValue lowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ assert(VT.isFixedLengthVector() && "Unexpected vector!");
+
+ MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+
+ SDLoc DL(Op);
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+ unsigned Opc =
+ VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL;
+ SDValue Splat = DAG.getNode(Opc, DL, ContainerVT, Op.getOperand(0), VL);
+ return convertFromScalableVector(VT, Splat, DAG, Subtarget);
+}
+
+struct VIDSequence {
+ int64_t Step;
+ int64_t Addend;
+};
+
+// Try to match an arithmetic-sequence BUILD_VECTOR [X,X+S,X+2*S,...,X+(N-1)*S]
+// to the (non-zero) step S and start value X. This can be then lowered as the
+// RVV sequence (VID * S) + X, for example.
+// Note that this method will also match potentially unappealing index
+// sequences, like <i32 0, i32 50939494>, however it is left to the caller to
+// determine whether this is worth generating code for.
+static Optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
+ unsigned NumElts = Op.getNumOperands();
+ assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR");
+ if (!Op.getValueType().isInteger())
+ return None;
+
+ Optional<int64_t> SeqStep, SeqAddend;
+ Optional<std::pair<uint64_t, unsigned>> PrevElt;
+ unsigned EltSizeInBits = Op.getValueType().getScalarSizeInBits();
+ for (unsigned Idx = 0; Idx < NumElts; Idx++) {
+ // Assume undef elements match the sequence; we just have to be careful
+ // when interpolating across them.
+ if (Op.getOperand(Idx).isUndef())
+ continue;
+ // The BUILD_VECTOR must be all constants.
+ if (!isa<ConstantSDNode>(Op.getOperand(Idx)))
+ return None;
+
+ uint64_t Val = Op.getConstantOperandVal(Idx) &
+ maskTrailingOnes<uint64_t>(EltSizeInBits);
+
+ if (PrevElt) {
+ // Calculate the step since the last non-undef element, and ensure
+ // it's consistent across the entire sequence.
+ int64_t Diff = SignExtend64(Val - PrevElt->first, EltSizeInBits);
+ // The difference must cleanly divide the element span.
+ if (Diff % (Idx - PrevElt->second) != 0)
+ return None;
+ int64_t Step = Diff / (Idx - PrevElt->second);
+ // A zero step indicates we're either not an index sequence, or we
+ // have a fractional step. This must be handled by more complex
+ // pattern recognition (undefs complicate things here).
+ if (Step == 0)
+ return None;
+ if (!SeqStep)
+ SeqStep = Step;
+ else if (Step != SeqStep)
+ return None;
+ }
+
+ // Record and/or check any addend.
+ if (SeqStep) {
+ int64_t Addend =
+ SignExtend64(Val - (Idx * (uint64_t)*SeqStep), EltSizeInBits);
+ if (!SeqAddend)
+ SeqAddend = Addend;
+ else if (SeqAddend != Addend)
+ return None;
+ }
+
+ // Record this non-undef element for later.
+ PrevElt = std::make_pair(Val, Idx);
+ }
+ // We need to have logged both a step and an addend for this to count as
+ // a legal index sequence.
+ if (!SeqStep || !SeqAddend)
+ return None;
+
+ return VIDSequence{*SeqStep, *SeqAddend};
+}
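The relation being matched is simply Elt[i] == Addend + i * Step. A minimal standalone sketch with invented names, ignoring undef elements and the element-width masking the real code performs:

#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

struct VIDSeq {
  int64_t Step;
  int64_t Addend;
};

// Recover (Step, Addend) if every element equals Addend + Index * Step.
std::optional<VIDSeq> matchVIDSequence(const std::vector<int64_t> &Elts) {
  if (Elts.size() < 2)
    return std::nullopt;
  int64_t Step = Elts[1] - Elts[0];
  if (Step == 0)
    return std::nullopt; // not a strided index sequence
  for (size_t I = 0; I < Elts.size(); ++I)
    if (Elts[I] != Elts[0] + (int64_t)I * Step)
      return std::nullopt;
  return VIDSeq{Step, Elts[0]};
}

int main() {
  // <3, 5, 7, 9> is 3 + i*2, so it can be rebuilt as (VID * 2) + 3.
  if (auto Seq = matchVIDSequence({3, 5, 7, 9}))
    std::printf("step=%lld addend=%lld\n", (long long)Seq->Step,
                (long long)Seq->Addend); // step=2 addend=3
}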
+
+static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ assert(VT.isFixedLengthVector() && "Unexpected vector!");
+
+ MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+
+ SDLoc DL(Op);
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+ MVT XLenVT = Subtarget.getXLenVT();
+ unsigned NumElts = Op.getNumOperands();
+
+ if (VT.getVectorElementType() == MVT::i1) {
+ if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+ SDValue VMClr = DAG.getNode(RISCVISD::VMCLR_VL, DL, ContainerVT, VL);
+ return convertFromScalableVector(VT, VMClr, DAG, Subtarget);
+ }
+
+ if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+ SDValue VMSet = DAG.getNode(RISCVISD::VMSET_VL, DL, ContainerVT, VL);
+ return convertFromScalableVector(VT, VMSet, DAG, Subtarget);
+ }
+
+ // Lower constant mask BUILD_VECTORs via an integer vector type, in
+ // scalar integer chunks whose bit-width depends on the number of mask
+ // bits and XLEN.
+ // First, determine the most appropriate scalar integer type to use. This
+ // is at most XLenVT, but may be shrunk to a smaller vector element type
+ // according to the size of the final vector - use i8 chunks rather than
+ // XLenVT if we're producing a v8i1. This results in more consistent
+ // codegen across RV32 and RV64.
+ unsigned NumViaIntegerBits =
+ std::min(std::max(NumElts, 8u), Subtarget.getXLen());
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+ // If we have to use more than one INSERT_VECTOR_ELT then this
+ // optimization is likely to increase code size; avoid performing it in
+ // such a case. We can use a load from a constant pool in this case.
+ if (DAG.shouldOptForSize() && NumElts > NumViaIntegerBits)
+ return SDValue();
+ // Now we can create our integer vector type. Note that it may be larger
+ // than the resulting mask type: v4i1 would use v1i8 as its integer type.
+ MVT IntegerViaVecVT =
+ MVT::getVectorVT(MVT::getIntegerVT(NumViaIntegerBits),
+ divideCeil(NumElts, NumViaIntegerBits));
+
+ uint64_t Bits = 0;
+ unsigned BitPos = 0, IntegerEltIdx = 0;
+ SDValue Vec = DAG.getUNDEF(IntegerViaVecVT);
+
+ for (unsigned I = 0; I < NumElts; I++, BitPos++) {
+ // Once we accumulate enough bits to fill our scalar type, insert into
+ // our vector and clear our accumulated data.
+ if (I != 0 && I % NumViaIntegerBits == 0) {
+ if (NumViaIntegerBits <= 32)
+ Bits = SignExtend64(Bits, 32);
+ SDValue Elt = DAG.getConstant(Bits, DL, XLenVT);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntegerViaVecVT, Vec,
+ Elt, DAG.getConstant(IntegerEltIdx, DL, XLenVT));
+ Bits = 0;
+ BitPos = 0;
+ IntegerEltIdx++;
+ }
+ SDValue V = Op.getOperand(I);
+ bool BitValue = !V.isUndef() && cast<ConstantSDNode>(V)->getZExtValue();
+ Bits |= ((uint64_t)BitValue << BitPos);
+ }
+
+ // Insert the (remaining) scalar value into position in our integer
+ // vector type.
+ if (NumViaIntegerBits <= 32)
+ Bits = SignExtend64(Bits, 32);
+ SDValue Elt = DAG.getConstant(Bits, DL, XLenVT);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntegerViaVecVT, Vec, Elt,
+ DAG.getConstant(IntegerEltIdx, DL, XLenVT));
+
+ if (NumElts < NumViaIntegerBits) {
+ // If we're producing a smaller vector than our minimum legal integer
+ // type, bitcast to the equivalent (known-legal) mask type, and extract
+ // our final mask.
+ assert(IntegerViaVecVT == MVT::v1i8 && "Unexpected mask vector type");
+ Vec = DAG.getBitcast(MVT::v8i1, Vec);
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Vec,
+ DAG.getConstant(0, DL, XLenVT));
+ } else {
+ // Else we must have produced an integer type with the same size as the
+ // mask type; bitcast for the final result.
+ assert(VT.getSizeInBits() == IntegerViaVecVT.getSizeInBits());
+ Vec = DAG.getBitcast(VT, Vec);
+ }
+
+ return Vec;
+ }
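The core of the constant-mask path is packing i1 values LSB-first into wider integer chunks. A standalone sketch with invented names, leaving out the sign extension and the INSERT_VECTOR_ELT plumbing:

#include <cstdint>
#include <cstdio>
#include <vector>

// Pack mask bits into ChunkBits-wide accumulators, LSB first.
std::vector<uint64_t> packMaskBits(const std::vector<bool> &MaskBits,
                                   unsigned ChunkBits) {
  std::vector<uint64_t> Chunks;
  uint64_t Acc = 0;
  unsigned BitPos = 0;
  for (bool Bit : MaskBits) {
    Acc |= (uint64_t)Bit << BitPos;
    if (++BitPos == ChunkBits) { // chunk full: emit it and start a new one
      Chunks.push_back(Acc);
      Acc = 0;
      BitPos = 0;
    }
  }
  if (BitPos != 0)
    Chunks.push_back(Acc); // leftover partial chunk
  return Chunks;
}

int main() {
  // A v8i1 constant <1,0,1,1,0,0,0,1> packed into 8-bit chunks becomes the
  // single byte 0b10001101 = 0x8D, i.e. a v1i8 vector.
  auto Chunks = packMaskBits({1, 0, 1, 1, 0, 0, 0, 1}, 8);
  std::printf("0x%02llX\n", (unsigned long long)Chunks[0]); // 0x8D
}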
+
+ // A BUILD_VECTOR can be lowered as a SETCC. For each fixed-length mask
+ // vector type, we have a legal equivalently-sized i8 type, so we can use
+ // that.
+ MVT WideVecVT = VT.changeVectorElementType(MVT::i8);
+ SDValue VecZero = DAG.getConstant(0, DL, WideVecVT);
+
+ SDValue WideVec;
+ if (SDValue Splat = cast<BuildVectorSDNode>(Op)->getSplatValue()) {
+ // For a splat, perform a scalar truncate before creating the wider
+ // vector.
+ assert(Splat.getValueType() == XLenVT &&
+ "Unexpected type for i1 splat value");
+ Splat = DAG.getNode(ISD::AND, DL, XLenVT, Splat,
+ DAG.getConstant(1, DL, XLenVT));
+ WideVec = DAG.getSplatBuildVector(WideVecVT, DL, Splat);
+ } else {
+ SmallVector<SDValue, 8> Ops(Op->op_values());
+ WideVec = DAG.getBuildVector(WideVecVT, DL, Ops);
+ SDValue VecOne = DAG.getConstant(1, DL, WideVecVT);
+ WideVec = DAG.getNode(ISD::AND, DL, WideVecVT, WideVec, VecOne);
+ }
+
+ return DAG.getSetCC(DL, VT, WideVec, VecZero, ISD::SETNE);
+ }
+
+ if (SDValue Splat = cast<BuildVectorSDNode>(Op)->getSplatValue()) {
+ unsigned Opc = VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL
+ : RISCVISD::VMV_V_X_VL;
+ Splat = DAG.getNode(Opc, DL, ContainerVT, Splat, VL);
+ return convertFromScalableVector(VT, Splat, DAG, Subtarget);
+ }
+
+ // Try and match index sequences, which we can lower to the vid instruction
+ // with optional modifications. An all-undef vector is matched by
+ // getSplatValue, above.
+ if (auto SimpleVID = isSimpleVIDSequence(Op)) {
+ int64_t Step = SimpleVID->Step;
+ int64_t Addend = SimpleVID->Addend;
+ // Only emit VIDs with suitably-small steps/addends. We use imm5 as a
+ // threshold since it's the immediate value many RVV instructions accept.
+ if (isInt<5>(Step) && isInt<5>(Addend)) {
+ SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, ContainerVT, Mask, VL);
+ // Convert right out of the scalable type so we can use standard ISD
+ // nodes for the rest of the computation. If we used scalable types with
+ // these, we'd lose the fixed-length vector info and generate worse
+ // vsetvli code.
+ VID = convertFromScalableVector(VT, VID, DAG, Subtarget);
+ assert(Step != 0 && "Invalid step");
+ bool Negate = false;
+ if (Step != 1) {
+ int64_t SplatStepVal = Step;
+ unsigned Opcode = ISD::MUL;
+ if (isPowerOf2_64(std::abs(Step))) {
+ Negate = Step < 0;
+ Opcode = ISD::SHL;
+ SplatStepVal = Log2_64(std::abs(Step));
+ }
+ SDValue SplatStep = DAG.getSplatVector(
+ VT, DL, DAG.getConstant(SplatStepVal, DL, XLenVT));
+ VID = DAG.getNode(Opcode, DL, VT, VID, SplatStep);
+ }
+ if (Addend != 0 || Negate) {
+ SDValue SplatAddend =
+ DAG.getSplatVector(VT, DL, DAG.getConstant(Addend, DL, XLenVT));
+ VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VT, SplatAddend, VID);
+ }
+ return VID;
+ }
+ }
+
+ // Attempt to detect "hidden" splats, which only reveal themselves as splats
+ // when re-interpreted as a vector with a larger element type. For example,
+ // v4i16 = build_vector i16 0, i16 1, i16 0, i16 1
+ // could be instead splat as
+ // v2i32 = build_vector i32 0x00010000, i32 0x00010000
+ // TODO: This optimization could also work on non-constant splats, but it
+ // would require bit-manipulation instructions to construct the splat value.
+ SmallVector<SDValue> Sequence;
+ unsigned EltBitSize = VT.getScalarSizeInBits();
+ const auto *BV = cast<BuildVectorSDNode>(Op);
+ if (VT.isInteger() && EltBitSize < 64 &&
+ ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
+ BV->getRepeatedSequence(Sequence) &&
+ (Sequence.size() * EltBitSize) <= 64) {
+ unsigned SeqLen = Sequence.size();
+ MVT ViaIntVT = MVT::getIntegerVT(EltBitSize * SeqLen);
+ MVT ViaVecVT = MVT::getVectorVT(ViaIntVT, NumElts / SeqLen);
+ assert((ViaIntVT == MVT::i16 || ViaIntVT == MVT::i32 ||
+ ViaIntVT == MVT::i64) &&
+ "Unexpected sequence type");
+
+ unsigned EltIdx = 0;
+ uint64_t EltMask = maskTrailingOnes<uint64_t>(EltBitSize);
+ uint64_t SplatValue = 0;
+ // Construct the amalgamated value which can be splatted as this larger
+ // vector type.
+ for (const auto &SeqV : Sequence) {
+ if (!SeqV.isUndef())
+ SplatValue |= ((cast<ConstantSDNode>(SeqV)->getZExtValue() & EltMask)
+ << (EltIdx * EltBitSize));
+ EltIdx++;
+ }
+
+ // On RV64, sign-extend from 32 to 64 bits where possible in order to
+ // achieve better constant materialization.
+ if (Subtarget.is64Bit() && ViaIntVT == MVT::i32)
+ SplatValue = SignExtend64(SplatValue, 32);
+
+ // Since we can't introduce illegal i64 types at this stage, we can only
+ // perform an i64 splat on RV32 if it is its own sign-extended value. That
+ // way we can use RVV instructions to splat.
+ assert((ViaIntVT.bitsLE(XLenVT) ||
+ (!Subtarget.is64Bit() && ViaIntVT == MVT::i64)) &&
+ "Unexpected bitcast sequence");
+ if (ViaIntVT.bitsLE(XLenVT) || isInt<32>(SplatValue)) {
+ SDValue ViaVL =
+ DAG.getConstant(ViaVecVT.getVectorNumElements(), DL, XLenVT);
+ MVT ViaContainerVT =
+ getContainerForFixedLengthVector(DAG, ViaVecVT, Subtarget);
+ SDValue Splat =
+ DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ViaContainerVT,
+ DAG.getConstant(SplatValue, DL, XLenVT), ViaVL);
+ Splat = convertFromScalableVector(ViaVecVT, Splat, DAG, Subtarget);
+ return DAG.getBitcast(VT, Splat);
+ }
+ }
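A standalone sketch of the repacking step, using the comment's own v4i16 <0,1,0,1> example; the helper name is invented:

#include <cstdint>
#include <cstdio>
#include <vector>

// Fold one repetition of the sequence into a single wider element, placing
// element I at bit offset I * EltBits (assumes EltBits < 64, as above).
uint64_t foldSequenceToWideElt(const std::vector<uint64_t> &Sequence,
                               unsigned EltBits) {
  uint64_t Wide = 0;
  for (unsigned I = 0; I < Sequence.size(); ++I)
    Wide |= (Sequence[I] & ((1ULL << EltBits) - 1)) << (I * EltBits);
  return Wide;
}

int main() {
  // v4i16 <0,1,0,1> has the repeated sequence <0,1>; folded into an i32 it is
  // 0x00010000, so the whole vector is a v2i32 splat of that constant.
  std::printf("0x%08llX\n",
              (unsigned long long)foldSequenceToWideElt({0, 1}, 16));
}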
+
+ // Try and optimize BUILD_VECTORs with "dominant values" - these are values
+ // which constitute a large proportion of the elements. In such cases we can
+ // splat a vector with the dominant element and make up the shortfall with
+ // INSERT_VECTOR_ELTs.
+ // Note that this includes vectors of 2 elements by association. The
+ // upper-most element is the "dominant" one, allowing us to use a splat to
+ // "insert" the upper element, and an insert of the lower element at position
+ // 0, which improves codegen.
+ SDValue DominantValue;
+ unsigned MostCommonCount = 0;
+ DenseMap<SDValue, unsigned> ValueCounts;
+ unsigned NumUndefElts =
+ count_if(Op->op_values(), [](const SDValue &V) { return V.isUndef(); });
+
+ for (SDValue V : Op->op_values()) {
+ if (V.isUndef())
+ continue;
+
+ ValueCounts.insert(std::make_pair(V, 0));
+ unsigned &Count = ValueCounts[V];
+
+ // Is this value dominant? In case of a tie, prefer the highest element as
+ // it's cheaper to insert near the beginning of a vector than it is at the
+ // end.
+ if (++Count >= MostCommonCount) {
+ DominantValue = V;
+ MostCommonCount = Count;
+ }
+ }
+
+ assert(DominantValue && "Not expecting an all-undef BUILD_VECTOR");
+ unsigned NumDefElts = NumElts - NumUndefElts;
+ unsigned DominantValueCountThreshold = NumDefElts <= 2 ? 0 : NumDefElts - 2;
+
+ // Don't perform this optimization when optimizing for size, since
+ // materializing elements and inserting them tends to cause code bloat.
+ if (!DAG.shouldOptForSize() &&
+ ((MostCommonCount > DominantValueCountThreshold) ||
+ (ValueCounts.size() <= Log2_32(NumDefElts)))) {
+ // Start by splatting the most common element.
+ SDValue Vec = DAG.getSplatBuildVector(VT, DL, DominantValue);
+
+ DenseSet<SDValue> Processed{DominantValue};
+ MVT SelMaskTy = VT.changeVectorElementType(MVT::i1);
+ for (const auto &OpIdx : enumerate(Op->ops())) {
+ const SDValue &V = OpIdx.value();
+ if (V.isUndef() || !Processed.insert(V).second)
+ continue;
+ if (ValueCounts[V] == 1) {
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V,
+ DAG.getConstant(OpIdx.index(), DL, XLenVT));
+ } else {
+ // Blend in all instances of this value using a VSELECT, using a
+ // mask where each bit signals whether that element is the one
+ // we're after.
+ SmallVector<SDValue> Ops;
+ transform(Op->op_values(), std::back_inserter(Ops), [&](SDValue V1) {
+ return DAG.getConstant(V == V1, DL, XLenVT);
+ });
+ Vec = DAG.getNode(ISD::VSELECT, DL, VT,
+ DAG.getBuildVector(SelMaskTy, DL, Ops),
+ DAG.getSplatBuildVector(VT, DL, V), Vec);
+ }
+ }
+
+ return Vec;
+ }
+
+ return SDValue();
+}
+
+static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Lo,
+ SDValue Hi, SDValue VL, SelectionDAG &DAG) {
+ if (isa<ConstantSDNode>(Lo) && isa<ConstantSDNode>(Hi)) {
+ int32_t LoC = cast<ConstantSDNode>(Lo)->getSExtValue();
+ int32_t HiC = cast<ConstantSDNode>(Hi)->getSExtValue();
+ // If Hi constant is all the same sign bit as Lo, lower this as a custom
+ // node in order to try and match RVV vector/scalar instructions.
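+ // For example, Lo == Hi == 0xffffffff encodes the i64 value -1: the high
+ // half is just the sign-extension of the low half, so a single vmv.v.x of
+ // Lo reproduces the full 64-bit splat value.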
+ if ((LoC >> 31) == HiC)
+ return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Lo, VL);
+ }
+
+ // Fall back to a stack store and stride x0 vector load.
+ return DAG.getNode(RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL, DL, VT, Lo, Hi, VL);
+}
+
+// Called by type legalization to handle splat of i64 on RV32.
+// FIXME: We can optimize this when the type has sign or zero bits in one
+// of the halves.
+static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Scalar,
+ SDValue VL, SelectionDAG &DAG) {
+ assert(Scalar.getValueType() == MVT::i64 && "Unexpected VT!");
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
+ DAG.getConstant(0, DL, MVT::i32));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
+ DAG.getConstant(1, DL, MVT::i32));
+ return splatPartsI64WithVL(DL, VT, Lo, Hi, VL, DAG);
+}
+
+// This function lowers a splat of the scalar operand Scalar with the vector
+// length VL. It ensures the final sequence is type legal, which is useful when
+// lowering a splat after type legalization.
+static SDValue lowerScalarSplat(SDValue Scalar, SDValue VL, MVT VT, SDLoc DL,
+ SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ if (VT.isFloatingPoint())
+ return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, VT, Scalar, VL);
+
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ // Simplest case is that the operand needs to be promoted to XLenVT.
+ if (Scalar.getValueType().bitsLE(XLenVT)) {
+ // If the operand is a constant, sign extend to increase our chances
+ // of being able to use a .vi instruction. ANY_EXTEND would become a
+ // zero extend and the simm5 check in isel would fail.
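+ // For example, splatting the i8 constant -1 sign-extends to -1 (within the
+ // simm5 range), whereas a zero-extended 255 would not fit.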
+ // FIXME: Should we ignore the upper bits in isel instead?
+ unsigned ExtOpc =
+ isa<ConstantSDNode>(Scalar) ? ISD::SIGN_EXTEND : ISD::ANY_EXTEND;
+ Scalar = DAG.getNode(ExtOpc, DL, XLenVT, Scalar);
+ return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Scalar, VL);
+ }
+
+ assert(XLenVT == MVT::i32 && Scalar.getValueType() == MVT::i64 &&
+ "Unexpected scalar for splat lowering!");
+
+ // Otherwise use the more complicated splatting algorithm.
+ return splatSplitI64WithVL(DL, VT, Scalar, VL, DAG);
+}
+
+static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ SDLoc DL(Op);
+ MVT XLenVT = Subtarget.getXLenVT();
+ MVT VT = Op.getSimpleValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+ MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+
+ SDValue TrueMask, VL;
+ std::tie(TrueMask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+ if (SVN->isSplat()) {
+ const int Lane = SVN->getSplatIndex();
+ if (Lane >= 0) {
+ MVT SVT = VT.getVectorElementType();
+
+ // Turn splatted vector load into a strided load with an X0 stride.
+ SDValue V = V1;
+ // Peek through CONCAT_VECTORS as VectorCombine can concat a vector
+ // with undef.
+ // FIXME: Peek through INSERT_SUBVECTOR, EXTRACT_SUBVECTOR, bitcasts?
+ int Offset = Lane;
+ if (V.getOpcode() == ISD::CONCAT_VECTORS) {
+ int OpElements =
+ V.getOperand(0).getSimpleValueType().getVectorNumElements();
+ V = V.getOperand(Offset / OpElements);
+ Offset %= OpElements;
+ }
+
+ // We need to ensure the load isn't atomic or volatile.
+ if (ISD::isNormalLoad(V.getNode()) && cast<LoadSDNode>(V)->isSimple()) {
+ auto *Ld = cast<LoadSDNode>(V);
+ Offset *= SVT.getStoreSize();
+ SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
+ TypeSize::Fixed(Offset), DL);
+
+ // If this is SEW=64 on RV32, use a strided load with a stride of x0.
+ if (SVT.isInteger() && SVT.bitsGT(XLenVT)) {
+ SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+ SDValue IntID =
+ DAG.getTargetConstant(Intrinsic::riscv_vlse, DL, XLenVT);
+ SDValue Ops[] = {Ld->getChain(), IntID, NewAddr,
+ DAG.getRegister(RISCV::X0, XLenVT), VL};
+ SDValue NewLoad = DAG.getMemIntrinsicNode(
+ ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, SVT,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Offset, SVT.getStoreSize()));
+ DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
+ return convertFromScalableVector(VT, NewLoad, DAG, Subtarget);
+ }
+
+ // Otherwise use a scalar load and splat. This will give the best
+ // opportunity to fold a splat into the operation. ISel can turn it into
+ // the x0 strided load if we aren't able to fold away the select.
+ if (SVT.isFloatingPoint())
+ V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+ Ld->getPointerInfo().getWithOffset(Offset),
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ else
+ V = DAG.getExtLoad(ISD::SEXTLOAD, DL, XLenVT, Ld->getChain(), NewAddr,
+ Ld->getPointerInfo().getWithOffset(Offset), SVT,
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ DAG.makeEquivalentMemoryOrdering(Ld, V);
+
+ unsigned Opc =
+ VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL;
+ SDValue Splat = DAG.getNode(Opc, DL, ContainerVT, V, VL);
+ return convertFromScalableVector(VT, Splat, DAG, Subtarget);
+ }
+
+ V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+ assert(Lane < (int)NumElts && "Unexpected lane!");
+ SDValue Gather =
+ DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, V1,
+ DAG.getConstant(Lane, DL, XLenVT), TrueMask, VL);
+ return convertFromScalableVector(VT, Gather, DAG, Subtarget);
+ }
+ }
+
+ // Detect shuffles which can be re-expressed as vector selects; these are
+ // shuffles in which each element in the destination is taken from an element
+ // at the corresponding index in either source vector.
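+ // For example, with 4 elements per source, the mask <0, 5, 2, 7> takes each
+ // destination element from lane i of either V1 or V2, so it can be lowered
+ // as a vselect between the two sources.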
+ bool IsSelect = all_of(enumerate(SVN->getMask()), [&](const auto &MaskIdx) {
+ int MaskIndex = MaskIdx.value();
+ return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts;
+ });
+
+ assert(!V1.isUndef() && "Unexpected shuffle canonicalization");
+
+ SmallVector<SDValue> MaskVals;
+ // As a backup, shuffles can be lowered via a vrgather instruction, possibly
+ // merged with a second vrgather.
+ SmallVector<SDValue> GatherIndicesLHS, GatherIndicesRHS;
+
+ // By default we preserve the original operand order, and use a mask to
+ // select LHS as true and RHS as false. However, since RVV vector selects may
+ // feature splats but only on the LHS, we may choose to invert our mask and
+ // instead select between RHS and LHS.
+ bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
+ bool InvertMask = IsSelect == SwapOps;
+
+ // Now construct the mask that will be used by the vselect or blended
+ // vrgather operation. For vrgathers, construct the appropriate indices into
+ // each vector.
+ for (int MaskIndex : SVN->getMask()) {
+ bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ InvertMask;
+ MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
+ if (!IsSelect) {
+ bool IsLHSOrUndefIndex = MaskIndex < (int)NumElts;
+ GatherIndicesLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0
+ ? DAG.getConstant(MaskIndex, DL, XLenVT)
+ : DAG.getUNDEF(XLenVT));
+ GatherIndicesRHS.push_back(
+ IsLHSOrUndefIndex ? DAG.getUNDEF(XLenVT)
+ : DAG.getConstant(MaskIndex - NumElts, DL, XLenVT));
+ }
+ }
+
+ if (SwapOps) {
+ std::swap(V1, V2);
+ std::swap(GatherIndicesLHS, GatherIndicesRHS);
+ }
+
+ assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle");
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
+
+ if (IsSelect)
+ return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2);
+
+ if (VT.getScalarSizeInBits() == 8 && VT.getVectorNumElements() > 256) {
+ // On such a large vector we're unable to use i8 as the index type.
+ // FIXME: We could promote the index to i16 and use vrgatherei16, but that
+ // may involve vector splitting if we're already at LMUL=8, or our
+ // user-supplied maximum fixed-length LMUL.
+ return SDValue();
+ }
+
+ unsigned GatherOpc = RISCVISD::VRGATHER_VV_VL;
+ MVT IndexVT = VT.changeTypeToInteger();
+ // Since we can't introduce illegal index types at this stage, use i16 and
+ // vrgatherei16 if the corresponding index type for plain vrgather is greater
+ // than XLenVT.
+ if (IndexVT.getScalarType().bitsGT(XLenVT)) {
+ GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
+ IndexVT = IndexVT.changeVectorElementType(MVT::i16);
+ }
+
+ MVT IndexContainerVT =
+ ContainerVT.changeVectorElementType(IndexVT.getScalarType());
+
+ SDValue Gather;
+ // TODO: This doesn't trigger for i64 vectors on RV32, since there we
+ // encounter a bitcasted BUILD_VECTOR with low/high i32 values.
+ if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) {
+ Gather = lowerScalarSplat(SplatValue, VL, ContainerVT, DL, DAG, Subtarget);
+ } else {
+ SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
+ LHSIndices =
+ convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
+
+ V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+ Gather =
+ DAG.getNode(GatherOpc, DL, ContainerVT, V1, LHSIndices, TrueMask, VL);
+ }
+
+ // If a second vector operand is used by this shuffle, blend it in with an
+ // additional vrgather.
+ if (!V2.isUndef()) {
+ MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
+ SelectMask =
+ convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
+
+ SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
+ RHSIndices =
+ convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
+
+ V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+ V2 = DAG.getNode(GatherOpc, DL, ContainerVT, V2, RHSIndices, TrueMask, VL);
+ Gather = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, SelectMask, V2,
+ Gather, VL);
+ }
+
+ return convertFromScalableVector(VT, Gather, DAG, Subtarget);
+}
+
+static SDValue getRVVFPExtendOrRound(SDValue Op, MVT VT, MVT ContainerVT,
+ SDLoc DL, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ if (VT.isScalableVector())
+ return DAG.getFPExtendOrRound(Op, DL, VT);
+ assert(VT.isFixedLengthVector() &&
+ "Unexpected value type for RVV FP extend/round lowering");
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+ unsigned RVVOpc = ContainerVT.bitsGT(Op.getSimpleValueType())
+ ? RISCVISD::FP_EXTEND_VL
+ : RISCVISD::FP_ROUND_VL;
+ return DAG.getNode(RVVOpc, DL, ContainerVT, Op, Mask, VL);
+}
+
+// While RVV has alignment restrictions, we should always be able to load as a
+// legal equivalently-sized byte-typed vector instead. This method is
+// responsible for re-expressing an ISD::LOAD via a correctly-aligned type. If
+// the load is already correctly-aligned, it returns SDValue().
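+// For example, an under-aligned v8i16 load is re-expressed as a v16i8 load of
+// the same total size, with the result bitcast back to v8i16.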
+SDValue RISCVTargetLowering::expandUnalignedRVVLoad(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto *Load = cast<LoadSDNode>(Op);
+ assert(Load && Load->getMemoryVT().isVector() && "Expected vector load");
+
+ if (allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ Load->getMemoryVT(),
+ *Load->getMemOperand()))
+ return SDValue();
+
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ unsigned EltSizeBits = VT.getScalarSizeInBits();
+ assert((EltSizeBits == 16 || EltSizeBits == 32 || EltSizeBits == 64) &&
+ "Unexpected unaligned RVV load type");
+ MVT NewVT =
+ MVT::getVectorVT(MVT::i8, VT.getVectorElementCount() * (EltSizeBits / 8));
+ assert(NewVT.isValid() &&
+ "Expecting equally-sized RVV vector types to be legal");
+ SDValue L = DAG.getLoad(NewVT, DL, Load->getChain(), Load->getBasePtr(),
+ Load->getPointerInfo(), Load->getOriginalAlign(),
+ Load->getMemOperand()->getFlags());
+ return DAG.getMergeValues({DAG.getBitcast(VT, L), L.getValue(1)}, DL);
+}
+
+// While RVV has alignment restrictions, we should always be able to store as a
+// legal equivalently-sized byte-typed vector instead. This method is
+// responsible for re-expressing an ISD::STORE via a correctly-aligned type. It
+// returns SDValue() if the store is already correctly aligned.
+SDValue RISCVTargetLowering::expandUnalignedRVVStore(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto *Store = cast<StoreSDNode>(Op);
+ assert(Store && Store->getValue().getValueType().isVector() &&
+ "Expected vector store");
+
+ if (allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ Store->getMemoryVT(),
+ *Store->getMemOperand()))
+ return SDValue();
+
+ SDLoc DL(Op);
+ SDValue StoredVal = Store->getValue();
+ MVT VT = StoredVal.getSimpleValueType();
+ unsigned EltSizeBits = VT.getScalarSizeInBits();
+ assert((EltSizeBits == 16 || EltSizeBits == 32 || EltSizeBits == 64) &&
+ "Unexpected unaligned RVV store type");
+ MVT NewVT =
+ MVT::getVectorVT(MVT::i8, VT.getVectorElementCount() * (EltSizeBits / 8));
+ assert(NewVT.isValid() &&
+ "Expecting equally-sized RVV vector types to be legal");
+ StoredVal = DAG.getBitcast(NewVT, StoredVal);
+ return DAG.getStore(Store->getChain(), DL, StoredVal, Store->getBasePtr(),
+ Store->getPointerInfo(), Store->getOriginalAlign(),
+ Store->getMemOperand()->getFlags());
+}
+
SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -673,6 +2103,8 @@
return lowerGlobalTLSAddress(Op, DAG);
case ISD::SELECT:
return lowerSELECT(Op, DAG);
+ case ISD::BRCOND:
+ return lowerBRCOND(Op, DAG);
case ISD::VASTART:
return lowerVASTART(Op, DAG);
case ISD::FRAMEADDR:
@@ -686,22 +2118,43 @@
case ISD::SRL_PARTS:
return lowerShiftRightParts(Op, DAG, false);
case ISD::BITCAST: {
- assert(((Subtarget.is64Bit() && Subtarget.hasStdExtF()) ||
- Subtarget.hasStdExtZfh()) &&
- "Unexpected custom legalisation");
SDLoc DL(Op);
+ EVT VT = Op.getValueType();
SDValue Op0 = Op.getOperand(0);
- if (Op.getValueType() == MVT::f16 && Subtarget.hasStdExtZfh()) {
- if (Op0.getValueType() != MVT::i16)
- return SDValue();
- SDValue NewOp0 =
- DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), Op0);
+ EVT Op0VT = Op0.getValueType();
+ MVT XLenVT = Subtarget.getXLenVT();
+ if (VT.isFixedLengthVector()) {
+ // We can handle fixed length vector bitcasts with a simple replacement
+ // in isel.
+ if (Op0VT.isFixedLengthVector())
+ return Op;
+ // When bitcasting from scalar to fixed-length vector, insert the scalar
+ // into a one-element vector of the source scalar type, then bitcast that
+ // vector to the result type.
+ if (!Op0VT.isVector()) {
+ auto BVT = EVT::getVectorVT(*DAG.getContext(), Op0VT, 1);
+ return DAG.getBitcast(VT, DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, BVT,
+ DAG.getUNDEF(BVT), Op0,
+ DAG.getConstant(0, DL, XLenVT)));
+ }
+ return SDValue();
+ }
+ // Custom-legalize bitcasts from fixed-length vector types to scalar types
+ // thus: bitcast the vector to a one-element vector type whose element type
+ // is the same as the result type, and extract the first element.
+ if (!VT.isVector() && Op0VT.isFixedLengthVector()) {
+ LLVMContext &Context = *DAG.getContext();
+ SDValue BVec = DAG.getBitcast(EVT::getVectorVT(Context, VT, 1), Op0);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, BVec,
+ DAG.getConstant(0, DL, XLenVT));
+ }
+ if (VT == MVT::f16 && Op0VT == MVT::i16 && Subtarget.hasStdExtZfh()) {
+ SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, XLenVT, Op0);
SDValue FPConv = DAG.getNode(RISCVISD::FMV_H_X, DL, MVT::f16, NewOp0);
return FPConv;
- } else if (Op.getValueType() == MVT::f32 && Subtarget.is64Bit() &&
- Subtarget.hasStdExtF()) {
- if (Op0.getValueType() != MVT::i32)
- return SDValue();
+ }
+ if (VT == MVT::f32 && Op0VT == MVT::i32 && Subtarget.is64Bit() &&
+ Subtarget.hasStdExtF()) {
SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
SDValue FPConv =
DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
@@ -724,12 +2177,27 @@
// If this is BSWAP rather than BITREVERSE, clear the lower 3 bits.
if (Op.getOpcode() == ISD::BSWAP)
Imm &= ~0x7U;
- return DAG.getNode(RISCVISD::GREVI, DL, VT, Op.getOperand(0),
- DAG.getTargetConstant(Imm, DL, Subtarget.getXLenVT()));
+ return DAG.getNode(RISCVISD::GREV, DL, VT, Op.getOperand(0),
+ DAG.getConstant(Imm, DL, VT));
+ }
+ case ISD::FSHL:
+ case ISD::FSHR: {
+ MVT VT = Op.getSimpleValueType();
+ assert(VT == Subtarget.getXLenVT() && "Unexpected custom legalization");
+ SDLoc DL(Op);
+ if (Op.getOperand(2).getOpcode() == ISD::Constant)
+ return Op;
+ // FSL/FSR take a log2(XLen)+1 bit shift amount but XLenVT FSHL/FSHR only
+ // use log2(XLen) bits. Mask the shift amount accordingly.
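+ // For example, on RV64 an FSHL shift amount of 67 is equivalent to 3
+ // (modulo 64); ANDing with 63 passes that same value to FSL/FSR.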
+ unsigned ShAmtWidth = Subtarget.getXLen() - 1;
+ SDValue ShAmt = DAG.getNode(ISD::AND, DL, VT, Op.getOperand(2),
+ DAG.getConstant(ShAmtWidth, DL, VT));
+ unsigned Opc = Op.getOpcode() == ISD::FSHL ? RISCVISD::FSL : RISCVISD::FSR;
+ return DAG.getNode(Opc, DL, VT, Op.getOperand(0), Op.getOperand(1), ShAmt);
}
case ISD::TRUNCATE: {
SDLoc DL(Op);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
// Only custom-lower vector truncates
if (!VT.isVector())
return Op;
@@ -739,37 +2207,56 @@
return lowerVectorMaskTrunc(Op, DAG);
// RVV only has truncates which operate from SEW*2->SEW, so lower arbitrary
- // truncates as a series of "RISCVISD::TRUNCATE_VECTOR" nodes which
+ // truncates as a series of "RISCVISD::TRUNCATE_VECTOR_VL" nodes which
// truncate by one power of two at a time.
- EVT DstEltVT = VT.getVectorElementType();
+ MVT DstEltVT = VT.getVectorElementType();
SDValue Src = Op.getOperand(0);
- EVT SrcVT = Src.getValueType();
- EVT SrcEltVT = SrcVT.getVectorElementType();
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT SrcEltVT = SrcVT.getVectorElementType();
assert(DstEltVT.bitsLT(SrcEltVT) &&
isPowerOf2_64(DstEltVT.getSizeInBits()) &&
isPowerOf2_64(SrcEltVT.getSizeInBits()) &&
"Unexpected vector truncate lowering");
+ MVT ContainerVT = SrcVT;
+ if (SrcVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(SrcVT);
+ Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget);
+ }
+
SDValue Result = Src;
+ SDValue Mask, VL;
+ std::tie(Mask, VL) =
+ getDefaultVLOps(SrcVT, ContainerVT, DL, DAG, Subtarget);
LLVMContext &Context = *DAG.getContext();
- const ElementCount Count = SrcVT.getVectorElementCount();
+ const ElementCount Count = ContainerVT.getVectorElementCount();
do {
- SrcEltVT = EVT::getIntegerVT(Context, SrcEltVT.getSizeInBits() / 2);
+ SrcEltVT = MVT::getIntegerVT(SrcEltVT.getSizeInBits() / 2);
EVT ResultVT = EVT::getVectorVT(Context, SrcEltVT, Count);
- Result = DAG.getNode(RISCVISD::TRUNCATE_VECTOR, DL, ResultVT, Result);
+ Result = DAG.getNode(RISCVISD::TRUNCATE_VECTOR_VL, DL, ResultVT, Result,
+ Mask, VL);
} while (SrcEltVT != DstEltVT);
+ if (SrcVT.isFixedLengthVector())
+ Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
+
return Result;
}
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
- return lowerVectorMaskExt(Op, DAG, /*ExtVal*/ 1);
+ if (Op.getOperand(0).getValueType().isVector() &&
+ Op.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+ return lowerVectorMaskExt(Op, DAG, /*ExtVal*/ 1);
+ return lowerFixedLengthVectorExtendToRVV(Op, DAG, RISCVISD::VZEXT_VL);
case ISD::SIGN_EXTEND:
- return lowerVectorMaskExt(Op, DAG, /*ExtVal*/ -1);
- case ISD::SPLAT_VECTOR:
- return lowerSPLATVECTOR(Op, DAG);
+ if (Op.getOperand(0).getValueType().isVector() &&
+ Op.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+ return lowerVectorMaskExt(Op, DAG, /*ExtVal*/ -1);
+ return lowerFixedLengthVectorExtendToRVV(Op, DAG, RISCVISD::VSEXT_VL);
+ case ISD::SPLAT_VECTOR_PARTS:
+ return lowerSPLAT_VECTOR_PARTS(Op, DAG);
case ISD::INSERT_VECTOR_ELT:
return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
@@ -781,10 +2268,401 @@
// We define our scalable vector types for lmul=1 to use a 64 bit known
// minimum size. e.g. <vscale x 2 x i32>. VLENB is in bytes so we calculate
// vscale as VLENB / 8.
+ assert(RISCV::RVVBitsPerBlock == 64 && "Unexpected bits per block!");
+ if (isa<ConstantSDNode>(Op.getOperand(0))) {
+ // We assume VLENB is a multiple of 8. We manually choose the best shift
+ // here because SimplifyDemandedBits isn't always able to simplify it.
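+ // For example, for Val == 2, vscale * 2 == (VLENB / 8) * 2 == VLENB >> 2,
+ // so a single shift right by 3 - log2(2) == 2 suffices.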
+ uint64_t Val = Op.getConstantOperandVal(0);
+ if (isPowerOf2_64(Val)) {
+ uint64_t Log2 = Log2_64(Val);
+ if (Log2 < 3)
+ return DAG.getNode(ISD::SRL, DL, VT, VLENB,
+ DAG.getConstant(3 - Log2, DL, VT));
+ if (Log2 > 3)
+ return DAG.getNode(ISD::SHL, DL, VT, VLENB,
+ DAG.getConstant(Log2 - 3, DL, VT));
+ return VLENB;
+ }
+ // If the multiplier is a multiple of 8, scale it down to avoid needing
+ // to shift the VLENB value.
+ if ((Val % 8) == 0)
+ return DAG.getNode(ISD::MUL, DL, VT, VLENB,
+ DAG.getConstant(Val / 8, DL, VT));
+ }
+
SDValue VScale = DAG.getNode(ISD::SRL, DL, VT, VLENB,
DAG.getConstant(3, DL, VT));
return DAG.getNode(ISD::MUL, DL, VT, VScale, Op.getOperand(0));
}
+ case ISD::FP_EXTEND: {
+ // RVV can only do fp_extend to types double the size of the source. We
+ // custom-lower f16->f64 extensions to two hops of ISD::FP_EXTEND, going
+ // via f32.
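+ // For example, a v4f16 -> v4f64 extension becomes v4f16 -> v4f32 -> v4f64.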
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // Prepare any fixed-length vector operands.
+ MVT ContainerVT = VT;
+ if (SrcVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VT);
+ MVT SrcContainerVT =
+ ContainerVT.changeVectorElementType(SrcVT.getVectorElementType());
+ Src = convertToScalableVector(SrcContainerVT, Src, DAG, Subtarget);
+ }
+
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::f64 ||
+ SrcVT.getVectorElementType() != MVT::f16) {
+ // For scalable vectors, we only need to close the gap between
+ // vXf16->vXf64.
+ if (!VT.isFixedLengthVector())
+ return Op;
+ // For fixed-length vectors, lower the FP_EXTEND to a custom "VL" version.
+ Src = getRVVFPExtendOrRound(Src, VT, ContainerVT, DL, DAG, Subtarget);
+ return convertFromScalableVector(VT, Src, DAG, Subtarget);
+ }
+
+ MVT InterVT = VT.changeVectorElementType(MVT::f32);
+ MVT InterContainerVT = ContainerVT.changeVectorElementType(MVT::f32);
+ SDValue IntermediateExtend = getRVVFPExtendOrRound(
+ Src, InterVT, InterContainerVT, DL, DAG, Subtarget);
+
+ SDValue Extend = getRVVFPExtendOrRound(IntermediateExtend, VT, ContainerVT,
+ DL, DAG, Subtarget);
+ if (VT.isFixedLengthVector())
+ return convertFromScalableVector(VT, Extend, DAG, Subtarget);
+ return Extend;
+ }
+ case ISD::FP_ROUND: {
+ // RVV can only do fp_round to types half the size of the source. We
+ // custom-lower f64->f16 rounds via RVV's round-to-odd float
+ // conversion instruction.
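+ // For example, a v4f64 -> v4f16 round becomes a round-to-odd narrowing to
+ // v4f32 followed by a regular round to v4f16; rounding to odd on the first
+ // hop avoids double rounding.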
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // Prepare any fixed-length vector operands.
+ MVT ContainerVT = VT;
+ if (VT.isFixedLengthVector()) {
+ MVT SrcContainerVT = getContainerForFixedLengthVector(SrcVT);
+ ContainerVT =
+ SrcContainerVT.changeVectorElementType(VT.getVectorElementType());
+ Src = convertToScalableVector(SrcContainerVT, Src, DAG, Subtarget);
+ }
+
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
+ SrcVT.getVectorElementType() != MVT::f64) {
+ // For scalable vectors, we only need to close the gap between
+ // vXf64<->vXf16.
+ if (!VT.isFixedLengthVector())
+ return Op;
+ // For fixed-length vectors, lower the FP_ROUND to a custom "VL" version.
+ Src = getRVVFPExtendOrRound(Src, VT, ContainerVT, DL, DAG, Subtarget);
+ return convertFromScalableVector(VT, Src, DAG, Subtarget);
+ }
+
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+ MVT InterVT = ContainerVT.changeVectorElementType(MVT::f32);
+ SDValue IntermediateRound =
+ DAG.getNode(RISCVISD::VFNCVT_ROD_VL, DL, InterVT, Src, Mask, VL);
+ SDValue Round = getRVVFPExtendOrRound(IntermediateRound, VT, ContainerVT,
+ DL, DAG, Subtarget);
+
+ if (VT.isFixedLengthVector())
+ return convertFromScalableVector(VT, Round, DAG, Subtarget);
+ return Round;
+ }
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: {
+ // RVV can only do fp<->int conversions to types half/double the size of
+ // the source. We custom-lower any conversions that do two hops into
+ // sequences.
+ MVT VT = Op.getSimpleValueType();
+ if (!VT.isVector())
+ return Op;
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+ MVT EltVT = VT.getVectorElementType();
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT SrcEltVT = SrcVT.getVectorElementType();
+ unsigned EltSize = EltVT.getSizeInBits();
+ unsigned SrcEltSize = SrcEltVT.getSizeInBits();
+ assert(isPowerOf2_32(EltSize) && isPowerOf2_32(SrcEltSize) &&
+ "Unexpected vector element types");
+
+ bool IsInt2FP = SrcEltVT.isInteger();
+ // Widening conversions
+ if (EltSize > SrcEltSize && (EltSize / SrcEltSize >= 4)) {
+ if (IsInt2FP) {
+ // Do a regular integer sign/zero extension then convert to float.
+ MVT IVecVT = MVT::getVectorVT(MVT::getIntegerVT(EltVT.getSizeInBits()),
+ VT.getVectorElementCount());
+ unsigned ExtOpcode = Op.getOpcode() == ISD::UINT_TO_FP
+ ? ISD::ZERO_EXTEND
+ : ISD::SIGN_EXTEND;
+ SDValue Ext = DAG.getNode(ExtOpcode, DL, IVecVT, Src);
+ return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
+ }
+ // FP2Int
+ assert(SrcEltVT == MVT::f16 && "Unexpected FP_TO_[US]INT lowering");
+ // Do one doubling fp_extend then complete the operation by converting
+ // to int.
+ MVT InterimFVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
+ SDValue FExt = DAG.getFPExtendOrRound(Src, DL, InterimFVT);
+ return DAG.getNode(Op.getOpcode(), DL, VT, FExt);
+ }
+
+ // Narrowing conversions
+ if (SrcEltSize > EltSize && (SrcEltSize / EltSize >= 4)) {
+ if (IsInt2FP) {
+ // One narrowing int_to_fp, then an fp_round.
+ assert(EltVT == MVT::f16 && "Unexpected [US]_TO_FP lowering");
+ MVT InterimFVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
+ SDValue Int2FP = DAG.getNode(Op.getOpcode(), DL, InterimFVT, Src);
+ return DAG.getFPExtendOrRound(Int2FP, DL, VT);
+ }
+ // FP2Int
+ // One narrowing fp_to_int, then truncate the integer. If the float isn't
+ // representable by the integer, the result is poison.
+ MVT IVecVT =
+ MVT::getVectorVT(MVT::getIntegerVT(SrcEltVT.getSizeInBits() / 2),
+ VT.getVectorElementCount());
+ SDValue FP2Int = DAG.getNode(Op.getOpcode(), DL, IVecVT, Src);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, FP2Int);
+ }
+
+ // Scalable vectors can exit here. Patterns will handle equally-sized
+ // conversions as well as halving/doubling ones.
+ if (!VT.isFixedLengthVector())
+ return Op;
+
+ // For fixed-length vectors we lower to a custom "VL" node.
+ unsigned RVVOpc = 0;
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Impossible opcode");
+ case ISD::FP_TO_SINT:
+ RVVOpc = RISCVISD::FP_TO_SINT_VL;
+ break;
+ case ISD::FP_TO_UINT:
+ RVVOpc = RISCVISD::FP_TO_UINT_VL;
+ break;
+ case ISD::SINT_TO_FP:
+ RVVOpc = RISCVISD::SINT_TO_FP_VL;
+ break;
+ case ISD::UINT_TO_FP:
+ RVVOpc = RISCVISD::UINT_TO_FP_VL;
+ break;
+ }
+
+ MVT ContainerVT, SrcContainerVT;
+ // Derive the reference container type from the larger vector type.
+ if (SrcEltSize > EltSize) {
+ SrcContainerVT = getContainerForFixedLengthVector(SrcVT);
+ ContainerVT =
+ SrcContainerVT.changeVectorElementType(VT.getVectorElementType());
+ } else {
+ ContainerVT = getContainerForFixedLengthVector(VT);
+ SrcContainerVT = ContainerVT.changeVectorElementType(SrcEltVT);
+ }
+
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+ Src = convertToScalableVector(SrcContainerVT, Src, DAG, Subtarget);
+ Src = DAG.getNode(RVVOpc, DL, ContainerVT, Src, Mask, VL);
+ return convertFromScalableVector(VT, Src, DAG, Subtarget);
+ }
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_SMIN:
+ return lowerVECREDUCE(Op, DAG);
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+ return lowerVectorMaskVECREDUCE(Op, DAG);
+ return lowerVECREDUCE(Op, DAG);
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAX:
+ return lowerFPVECREDUCE(Op, DAG);
+ case ISD::INSERT_SUBVECTOR:
+ return lowerINSERT_SUBVECTOR(Op, DAG);
+ case ISD::EXTRACT_SUBVECTOR:
+ return lowerEXTRACT_SUBVECTOR(Op, DAG);
+ case ISD::STEP_VECTOR:
+ return lowerSTEP_VECTOR(Op, DAG);
+ case ISD::VECTOR_REVERSE:
+ return lowerVECTOR_REVERSE(Op, DAG);
+ case ISD::BUILD_VECTOR:
+ return lowerBUILD_VECTOR(Op, DAG, Subtarget);
+ case ISD::SPLAT_VECTOR:
+ if (Op.getValueType().getVectorElementType() == MVT::i1)
+ return lowerVectorMaskSplat(Op, DAG);
+ return lowerSPLAT_VECTOR(Op, DAG, Subtarget);
+ case ISD::VECTOR_SHUFFLE:
+ return lowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
+ case ISD::CONCAT_VECTORS: {
+ // Split CONCAT_VECTORS into a series of INSERT_SUBVECTOR nodes. This is
+ // better than going through the stack, as the default expansion does.
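+ // For example, (concat_vectors a, b) becomes an INSERT_SUBVECTOR of a at
+ // index 0 into UNDEF, followed by one of b at index NumOpElts.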
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ unsigned NumOpElts =
+ Op.getOperand(0).getSimpleValueType().getVectorMinNumElements();
+ SDValue Vec = DAG.getUNDEF(VT);
+ for (const auto &OpIdx : enumerate(Op->ops()))
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Vec, OpIdx.value(),
+ DAG.getIntPtrConstant(OpIdx.index() * NumOpElts, DL));
+ return Vec;
+ }
+ case ISD::LOAD:
+ if (auto V = expandUnalignedRVVLoad(Op, DAG))
+ return V;
+ if (Op.getValueType().isFixedLengthVector())
+ return lowerFixedLengthVectorLoadToRVV(Op, DAG);
+ return Op;
+ case ISD::STORE:
+ if (auto V = expandUnalignedRVVStore(Op, DAG))
+ return V;
+ if (Op.getOperand(1).getValueType().isFixedLengthVector())
+ return lowerFixedLengthVectorStoreToRVV(Op, DAG);
+ return Op;
+ case ISD::MLOAD:
+ return lowerMLOAD(Op, DAG);
+ case ISD::MSTORE:
+ return lowerMSTORE(Op, DAG);
+ case ISD::SETCC:
+ return lowerFixedLengthVectorSetccToRVV(Op, DAG);
+ case ISD::ADD:
+ return lowerToScalableOp(Op, DAG, RISCVISD::ADD_VL);
+ case ISD::SUB:
+ return lowerToScalableOp(Op, DAG, RISCVISD::SUB_VL);
+ case ISD::MUL:
+ return lowerToScalableOp(Op, DAG, RISCVISD::MUL_VL);
+ case ISD::MULHS:
+ return lowerToScalableOp(Op, DAG, RISCVISD::MULHS_VL);
+ case ISD::MULHU:
+ return lowerToScalableOp(Op, DAG, RISCVISD::MULHU_VL);
+ case ISD::AND:
+ return lowerFixedLengthVectorLogicOpToRVV(Op, DAG, RISCVISD::VMAND_VL,
+ RISCVISD::AND_VL);
+ case ISD::OR:
+ return lowerFixedLengthVectorLogicOpToRVV(Op, DAG, RISCVISD::VMOR_VL,
+ RISCVISD::OR_VL);
+ case ISD::XOR:
+ return lowerFixedLengthVectorLogicOpToRVV(Op, DAG, RISCVISD::VMXOR_VL,
+ RISCVISD::XOR_VL);
+ case ISD::SDIV:
+ return lowerToScalableOp(Op, DAG, RISCVISD::SDIV_VL);
+ case ISD::SREM:
+ return lowerToScalableOp(Op, DAG, RISCVISD::SREM_VL);
+ case ISD::UDIV:
+ return lowerToScalableOp(Op, DAG, RISCVISD::UDIV_VL);
+ case ISD::UREM:
+ return lowerToScalableOp(Op, DAG, RISCVISD::UREM_VL);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ if (Op.getSimpleValueType().isFixedLengthVector())
+ return lowerFixedLengthVectorShiftToRVV(Op, DAG);
+ // This can be called for an i32 shift amount that needs to be promoted.
+ assert(Op.getOperand(1).getValueType() == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ return SDValue();
+ case ISD::SADDSAT:
+ return lowerToScalableOp(Op, DAG, RISCVISD::SADDSAT_VL);
+ case ISD::UADDSAT:
+ return lowerToScalableOp(Op, DAG, RISCVISD::UADDSAT_VL);
+ case ISD::SSUBSAT:
+ return lowerToScalableOp(Op, DAG, RISCVISD::SSUBSAT_VL);
+ case ISD::USUBSAT:
+ return lowerToScalableOp(Op, DAG, RISCVISD::USUBSAT_VL);
+ case ISD::FADD:
+ return lowerToScalableOp(Op, DAG, RISCVISD::FADD_VL);
+ case ISD::FSUB:
+ return lowerToScalableOp(Op, DAG, RISCVISD::FSUB_VL);
+ case ISD::FMUL:
+ return lowerToScalableOp(Op, DAG, RISCVISD::FMUL_VL);
+ case ISD::FDIV:
+ return lowerToScalableOp(Op, DAG, RISCVISD::FDIV_VL);
+ case ISD::FNEG:
+ return lowerToScalableOp(Op, DAG, RISCVISD::FNEG_VL);
+ case ISD::FABS:
+ return lowerToScalableOp(Op, DAG, RISCVISD::FABS_VL);
+ case ISD::FSQRT:
+ return lowerToScalableOp(Op, DAG, RISCVISD::FSQRT_VL);
+ case ISD::FMA:
+ return lowerToScalableOp(Op, DAG, RISCVISD::FMA_VL);
+ case ISD::SMIN:
+ return lowerToScalableOp(Op, DAG, RISCVISD::SMIN_VL);
+ case ISD::SMAX:
+ return lowerToScalableOp(Op, DAG, RISCVISD::SMAX_VL);
+ case ISD::UMIN:
+ return lowerToScalableOp(Op, DAG, RISCVISD::UMIN_VL);
+ case ISD::UMAX:
+ return lowerToScalableOp(Op, DAG, RISCVISD::UMAX_VL);
+ case ISD::FMINNUM:
+ return lowerToScalableOp(Op, DAG, RISCVISD::FMINNUM_VL);
+ case ISD::FMAXNUM:
+ return lowerToScalableOp(Op, DAG, RISCVISD::FMAXNUM_VL);
+ case ISD::ABS:
+ return lowerABS(Op, DAG);
+ case ISD::VSELECT:
+ return lowerFixedLengthVectorSelectToRVV(Op, DAG);
+ case ISD::FCOPYSIGN:
+ return lowerFixedLengthVectorFCOPYSIGNToRVV(Op, DAG);
+ case ISD::MGATHER:
+ return lowerMGATHER(Op, DAG);
+ case ISD::MSCATTER:
+ return lowerMSCATTER(Op, DAG);
+ case ISD::FLT_ROUNDS_:
+ return lowerGET_ROUNDING(Op, DAG);
+ case ISD::SET_ROUNDING:
+ return lowerSET_ROUNDING(Op, DAG);
+ case ISD::VP_ADD:
+ return lowerVPOp(Op, DAG, RISCVISD::ADD_VL);
+ case ISD::VP_SUB:
+ return lowerVPOp(Op, DAG, RISCVISD::SUB_VL);
+ case ISD::VP_MUL:
+ return lowerVPOp(Op, DAG, RISCVISD::MUL_VL);
+ case ISD::VP_SDIV:
+ return lowerVPOp(Op, DAG, RISCVISD::SDIV_VL);
+ case ISD::VP_UDIV:
+ return lowerVPOp(Op, DAG, RISCVISD::UDIV_VL);
+ case ISD::VP_SREM:
+ return lowerVPOp(Op, DAG, RISCVISD::SREM_VL);
+ case ISD::VP_UREM:
+ return lowerVPOp(Op, DAG, RISCVISD::UREM_VL);
+ case ISD::VP_AND:
+ return lowerVPOp(Op, DAG, RISCVISD::AND_VL);
+ case ISD::VP_OR:
+ return lowerVPOp(Op, DAG, RISCVISD::OR_VL);
+ case ISD::VP_XOR:
+ return lowerVPOp(Op, DAG, RISCVISD::XOR_VL);
+ case ISD::VP_ASHR:
+ return lowerVPOp(Op, DAG, RISCVISD::SRA_VL);
+ case ISD::VP_LSHR:
+ return lowerVPOp(Op, DAG, RISCVISD::SRL_VL);
+ case ISD::VP_SHL:
+ return lowerVPOp(Op, DAG, RISCVISD::SHL_VL);
+ case ISD::VP_FADD:
+ return lowerVPOp(Op, DAG, RISCVISD::FADD_VL);
+ case ISD::VP_FSUB:
+ return lowerVPOp(Op, DAG, RISCVISD::FSUB_VL);
+ case ISD::VP_FMUL:
+ return lowerVPOp(Op, DAG, RISCVISD::FMUL_VL);
+ case ISD::VP_FDIV:
+ return lowerVPOp(Op, DAG, RISCVISD::FDIV_VL);
}
}
@@ -1010,24 +2888,51 @@
SDValue TrueV = Op.getOperand(1);
SDValue FalseV = Op.getOperand(2);
SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
MVT XLenVT = Subtarget.getXLenVT();
+ // Lower vector SELECTs to VSELECTs by splatting the condition.
+ if (VT.isVector()) {
+ MVT SplatCondVT = VT.changeVectorElementType(MVT::i1);
+ SDValue CondSplat = VT.isScalableVector()
+ ? DAG.getSplatVector(SplatCondVT, DL, CondV)
+ : DAG.getSplatBuildVector(SplatCondVT, DL, CondV);
+ return DAG.getNode(ISD::VSELECT, DL, VT, CondSplat, TrueV, FalseV);
+ }
+
// If the result type is XLenVT and CondV is the output of a SETCC node
// which also operated on XLenVT inputs, then merge the SETCC node into the
// lowered RISCVISD::SELECT_CC to take advantage of the integer
// compare+branch instructions. i.e.:
// (select (setcc lhs, rhs, cc), truev, falsev)
// -> (riscvisd::select_cc lhs, rhs, cc, truev, falsev)
- if (Op.getSimpleValueType() == XLenVT && CondV.getOpcode() == ISD::SETCC &&
+ if (VT == XLenVT && CondV.getOpcode() == ISD::SETCC &&
CondV.getOperand(0).getSimpleValueType() == XLenVT) {
SDValue LHS = CondV.getOperand(0);
SDValue RHS = CondV.getOperand(1);
- auto CC = cast<CondCodeSDNode>(CondV.getOperand(2));
+ const auto *CC = cast<CondCodeSDNode>(CondV.getOperand(2));
ISD::CondCode CCVal = CC->get();
- normaliseSetCC(LHS, RHS, CCVal);
+ // Special case for a select of 2 constants that have a difference of 1.
+ // Normally this is done by DAGCombine, but if the select is introduced by
+ // type legalization or op legalization, we miss it. Restricting to the
+ // SETLT case for now because that is what signed saturating add/sub need.
+ // FIXME: We don't need the condition to be SETLT or even a SETCC,
+ // but we would probably want to swap the true/false values if the condition
+ // is SETGE/SETLE to avoid an XORI.
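+ // For example, (select (setlt a, b), 1, 0) folds to the setcc result itself,
+ // and (select (setlt a, b), 0, 1) folds to (1 - setcc).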
+ if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV) &&
+ CCVal == ISD::SETLT) {
+ const APInt &TrueVal = cast<ConstantSDNode>(TrueV)->getAPIntValue();
+ const APInt &FalseVal = cast<ConstantSDNode>(FalseV)->getAPIntValue();
+ if (TrueVal - 1 == FalseVal)
+ return DAG.getNode(ISD::ADD, DL, Op.getValueType(), CondV, FalseV);
+ if (TrueVal + 1 == FalseVal)
+ return DAG.getNode(ISD::SUB, DL, Op.getValueType(), FalseV, CondV);
+ }
- SDValue TargetCC = DAG.getConstant(CCVal, DL, XLenVT);
+ translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
+
+ SDValue TargetCC = DAG.getTargetConstant(CCVal, DL, XLenVT);
SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV};
return DAG.getNode(RISCVISD::SELECT_CC, DL, Op.getValueType(), Ops);
}
@@ -1036,13 +2941,36 @@
// (select condv, truev, falsev)
// -> (riscvisd::select_cc condv, zero, setne, truev, falsev)
SDValue Zero = DAG.getConstant(0, DL, XLenVT);
- SDValue SetNE = DAG.getConstant(ISD::SETNE, DL, XLenVT);
+ SDValue SetNE = DAG.getTargetConstant(ISD::SETNE, DL, XLenVT);
SDValue Ops[] = {CondV, Zero, SetNE, TrueV, FalseV};
return DAG.getNode(RISCVISD::SELECT_CC, DL, Op.getValueType(), Ops);
}
+SDValue RISCVTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+ SDValue CondV = Op.getOperand(1);
+ SDLoc DL(Op);
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ if (CondV.getOpcode() == ISD::SETCC &&
+ CondV.getOperand(0).getValueType() == XLenVT) {
+ SDValue LHS = CondV.getOperand(0);
+ SDValue RHS = CondV.getOperand(1);
+ ISD::CondCode CCVal = cast<CondCodeSDNode>(CondV.getOperand(2))->get();
+
+ translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
+
+ SDValue TargetCC = DAG.getCondCode(CCVal);
+ return DAG.getNode(RISCVISD::BR_CC, DL, Op.getValueType(), Op.getOperand(0),
+ LHS, RHS, TargetCC, Op.getOperand(2));
+ }
+
+ return DAG.getNode(RISCVISD::BR_CC, DL, Op.getValueType(), Op.getOperand(0),
+ CondV, DAG.getConstant(0, DL, XLenVT),
+ DAG.getCondCode(ISD::SETNE), Op.getOperand(2));
+}
+
SDValue RISCVTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
RISCVMachineFunctionInfo *FuncInfo = MF.getInfo<RISCVMachineFunctionInfo>();
@@ -1202,57 +3130,77 @@
return DAG.getMergeValues(Parts, DL);
}
-// Custom-lower a SPLAT_VECTOR where XLEN<SEW, as the SEW element type is
+// Lower splats of i1 types to SETCC. For each mask vector type, we have a
+// legal equivalently-sized i8 type, so we can use that as a go-between.
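+// For example, splatting a non-constant i1 value x is lowered as
+// (setne (i8 splat of (x & 1)), (i8 splat of 0)).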
+SDValue RISCVTargetLowering::lowerVectorMaskSplat(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue SplatVal = Op.getOperand(0);
+ // All-zeros or all-ones splats are handled specially.
+ if (ISD::isConstantSplatVectorAllOnes(Op.getNode())) {
+ SDValue VL = getDefaultScalableVLOps(VT, DL, DAG, Subtarget).second;
+ return DAG.getNode(RISCVISD::VMSET_VL, DL, VT, VL);
+ }
+ if (ISD::isConstantSplatVectorAllZeros(Op.getNode())) {
+ SDValue VL = getDefaultScalableVLOps(VT, DL, DAG, Subtarget).second;
+ return DAG.getNode(RISCVISD::VMCLR_VL, DL, VT, VL);
+ }
+ MVT XLenVT = Subtarget.getXLenVT();
+ assert(SplatVal.getValueType() == XLenVT &&
+ "Unexpected type for i1 splat value");
+ MVT InterVT = VT.changeVectorElementType(MVT::i8);
+ SplatVal = DAG.getNode(ISD::AND, DL, XLenVT, SplatVal,
+ DAG.getConstant(1, DL, XLenVT));
+ SDValue LHS = DAG.getSplatVector(InterVT, DL, SplatVal);
+ SDValue Zero = DAG.getConstant(0, DL, InterVT);
+ return DAG.getSetCC(DL, VT, LHS, Zero, ISD::SETNE);
+}
+
+// Custom-lower a SPLAT_VECTOR_PARTS where XLEN<SEW, as the SEW element type is
// illegal (currently only vXi64 RV32).
// FIXME: We could also catch non-constant sign-extended i32 values and lower
// them to SPLAT_VECTOR_I64
-SDValue RISCVTargetLowering::lowerSPLATVECTOR(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue RISCVTargetLowering::lowerSPLAT_VECTOR_PARTS(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
- EVT VecVT = Op.getValueType();
+ MVT VecVT = Op.getSimpleValueType();
assert(!Subtarget.is64Bit() && VecVT.getVectorElementType() == MVT::i64 &&
- "Unexpected SPLAT_VECTOR lowering");
- SDValue SplatVal = Op.getOperand(0);
+ "Unexpected SPLAT_VECTOR_PARTS lowering");
- // If we can prove that the value is a sign-extended 32-bit value, lower this
- // as a custom node in order to try and match RVV vector/scalar instructions.
- if (auto *CVal = dyn_cast<ConstantSDNode>(SplatVal)) {
- if (isInt<32>(CVal->getSExtValue()))
- return DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT,
- DAG.getConstant(CVal->getSExtValue(), DL, MVT::i32));
+ assert(Op.getNumOperands() == 2 && "Unexpected number of operands!");
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+
+ if (VecVT.isFixedLengthVector()) {
+ MVT ContainerVT = getContainerForFixedLengthVector(VecVT);
+ SDLoc DL(Op);
+ SDValue Mask, VL;
+ std::tie(Mask, VL) =
+ getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+
+ SDValue Res = splatPartsI64WithVL(DL, ContainerVT, Lo, Hi, VL, DAG);
+ return convertFromScalableVector(VecVT, Res, DAG, Subtarget);
}
- if (SplatVal.getOpcode() == ISD::SIGN_EXTEND &&
- SplatVal.getOperand(0).getValueType() == MVT::i32) {
- return DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT,
- SplatVal.getOperand(0));
+ if (isa<ConstantSDNode>(Lo) && isa<ConstantSDNode>(Hi)) {
+ int32_t LoC = cast<ConstantSDNode>(Lo)->getSExtValue();
+ int32_t HiC = cast<ConstantSDNode>(Hi)->getSExtValue();
+ // If Hi constant is all the same sign bit as Lo, lower this as a custom
+ // node in order to try and match RVV vector/scalar instructions.
+ if ((LoC >> 31) == HiC)
+ return DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, Lo);
}
- // Else, on RV32 we lower an i64-element SPLAT_VECTOR thus, being careful not
- // to accidentally sign-extend the 32-bit halves to the e64 SEW:
- // vmv.v.x vX, hi
- // vsll.vx vX, vX, /*32*/
- // vmv.v.x vY, lo
- // vsll.vx vY, vY, /*32*/
- // vsrl.vx vY, vY, /*32*/
- // vor.vv vX, vX, vY
- SDValue One = DAG.getConstant(1, DL, MVT::i32);
- SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
- SDValue ThirtyTwoV = DAG.getConstant(32, DL, VecVT);
- SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, SplatVal, Zero);
- SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, SplatVal, One);
+ // Detect cases where Hi is (SRA Lo, 31) which means Hi is Lo sign extended.
+ if (Hi.getOpcode() == ISD::SRA && Hi.getOperand(0) == Lo &&
+ isa<ConstantSDNode>(Hi.getOperand(1)) &&
+ Hi.getConstantOperandVal(1) == 31)
+ return DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, Lo);
- Lo = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, Lo);
- Lo = DAG.getNode(ISD::SHL, DL, VecVT, Lo, ThirtyTwoV);
- Lo = DAG.getNode(ISD::SRL, DL, VecVT, Lo, ThirtyTwoV);
-
- if (isNullConstant(Hi))
- return Lo;
-
- Hi = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, Hi);
- Hi = DAG.getNode(ISD::SHL, DL, VecVT, Hi, ThirtyTwoV);
-
- return DAG.getNode(ISD::OR, DL, VecVT, Lo, Hi);
+ // Fall back to use a stack store and stride x0 vector load. Use X0 as VL.
+ return DAG.getNode(RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL, DL, VecVT, Lo, Hi,
+ DAG.getRegister(RISCV::X0, MVT::i64));
}
// Custom-lower extensions from mask vectors by using a vselect either with 1
@@ -1262,32 +3210,80 @@
SDValue RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG,
int64_t ExtTrueVal) const {
SDLoc DL(Op);
- EVT VecVT = Op.getValueType();
+ MVT VecVT = Op.getSimpleValueType();
SDValue Src = Op.getOperand(0);
// Only custom-lower extensions from mask types
- if (!Src.getValueType().isVector() ||
- Src.getValueType().getVectorElementType() != MVT::i1)
- return Op;
+ assert(Src.getValueType().isVector() &&
+ Src.getValueType().getVectorElementType() == MVT::i1);
- // Be careful not to introduce illegal scalar types at this stage, and be
- // careful also about splatting constants as on RV32, vXi64 SPLAT_VECTOR is
- // illegal and must be expanded. Since we know that the constants are
- // sign-extended 32-bit values, we use SPLAT_VECTOR_I64 directly.
- bool IsRV32E64 =
- !Subtarget.is64Bit() && VecVT.getVectorElementType() == MVT::i64;
- SDValue SplatZero = DAG.getConstant(0, DL, Subtarget.getXLenVT());
- SDValue SplatTrueVal = DAG.getConstant(ExtTrueVal, DL, Subtarget.getXLenVT());
+ MVT XLenVT = Subtarget.getXLenVT();
+ SDValue SplatZero = DAG.getConstant(0, DL, XLenVT);
+ SDValue SplatTrueVal = DAG.getConstant(ExtTrueVal, DL, XLenVT);
- if (!IsRV32E64) {
- SplatZero = DAG.getSplatVector(VecVT, DL, SplatZero);
- SplatTrueVal = DAG.getSplatVector(VecVT, DL, SplatTrueVal);
- } else {
- SplatZero = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, SplatZero);
- SplatTrueVal =
- DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, SplatTrueVal);
+ if (VecVT.isScalableVector()) {
+ // Be careful not to introduce illegal scalar types at this stage, and be
+ // careful also about splatting constants as on RV32, vXi64 SPLAT_VECTOR is
+ // illegal and must be expanded. Since we know that the constants are
+ // sign-extended 32-bit values, we use SPLAT_VECTOR_I64 directly.
+ bool IsRV32E64 =
+ !Subtarget.is64Bit() && VecVT.getVectorElementType() == MVT::i64;
+
+ if (!IsRV32E64) {
+ SplatZero = DAG.getSplatVector(VecVT, DL, SplatZero);
+ SplatTrueVal = DAG.getSplatVector(VecVT, DL, SplatTrueVal);
+ } else {
+ SplatZero = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, SplatZero);
+ SplatTrueVal =
+ DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, SplatTrueVal);
+ }
+
+ return DAG.getNode(ISD::VSELECT, DL, VecVT, Src, SplatTrueVal, SplatZero);
}
- return DAG.getNode(ISD::VSELECT, DL, VecVT, Src, SplatTrueVal, SplatZero);
+ MVT ContainerVT = getContainerForFixedLengthVector(VecVT);
+ MVT I1ContainerVT =
+ MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+
+ SDValue CC = convertToScalableVector(I1ContainerVT, Src, DAG, Subtarget);
+
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+
+ SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, SplatZero, VL);
+ SplatTrueVal =
+ DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, SplatTrueVal, VL);
+ SDValue Select = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, CC,
+ SplatTrueVal, SplatZero, VL);
+
+ return convertFromScalableVector(VecVT, Select, DAG, Subtarget);
+}
+
+SDValue RISCVTargetLowering::lowerFixedLengthVectorExtendToRVV(
+ SDValue Op, SelectionDAG &DAG, unsigned ExtendOpc) const {
+ MVT ExtVT = Op.getSimpleValueType();
+ // Only custom-lower extensions from fixed-length vector types.
+ if (!ExtVT.isFixedLengthVector())
+ return Op;
+ MVT VT = Op.getOperand(0).getSimpleValueType();
+ // Grab the canonical container type for the extended type. Infer the smaller
+ // type from that to ensure the same number of vector elements, as we know
+ // the LMUL will be sufficient to hold the smaller type.
+ MVT ContainerExtVT = getContainerForFixedLengthVector(ExtVT);
+ // Build the narrower source container type manually to ensure the same
+ // number of vector elements between source and dest.
+ MVT ContainerVT = MVT::getVectorVT(VT.getVectorElementType(),
+ ContainerExtVT.getVectorElementCount());
+
+ SDValue Op1 =
+ convertToScalableVector(ContainerVT, Op.getOperand(0), DAG, Subtarget);
+
+ SDLoc DL(Op);
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+ SDValue Ext = DAG.getNode(ExtendOpc, DL, ContainerExtVT, Op1, Mask, VL);
+
+ return convertFromScalableVector(ExtVT, Ext, DAG, Subtarget);
}
// Custom-lower truncations from vectors to mask vectors by using a mask and a
@@ -1301,280 +3297,1517 @@
assert(MaskVT.isVector() && MaskVT.getVectorElementType() == MVT::i1 &&
"Unexpected type for vector mask lowering");
SDValue Src = Op.getOperand(0);
- EVT VecVT = Src.getValueType();
+ MVT VecVT = Src.getSimpleValueType();
- // Be careful not to introduce illegal scalar types at this stage, and be
- // careful also about splatting constants as on RV32, vXi64 SPLAT_VECTOR is
- // illegal and must be expanded. Since we know that the constants are
- // sign-extended 32-bit values, we use SPLAT_VECTOR_I64 directly.
- bool IsRV32E64 =
- !Subtarget.is64Bit() && VecVT.getVectorElementType() == MVT::i64;
+ // If this is a fixed vector, we need to convert it to a scalable vector.
+ MVT ContainerVT = VecVT;
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VecVT);
+ Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget);
+ }
+
SDValue SplatOne = DAG.getConstant(1, DL, Subtarget.getXLenVT());
SDValue SplatZero = DAG.getConstant(0, DL, Subtarget.getXLenVT());
- if (!IsRV32E64) {
- SplatOne = DAG.getSplatVector(VecVT, DL, SplatOne);
- SplatZero = DAG.getSplatVector(VecVT, DL, SplatZero);
- } else {
- SplatOne = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, SplatOne);
- SplatZero = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, SplatZero);
+ SplatOne = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, SplatOne);
+ SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, SplatZero);
+
+ if (VecVT.isScalableVector()) {
+ SDValue Trunc = DAG.getNode(ISD::AND, DL, VecVT, Src, SplatOne);
+ return DAG.getSetCC(DL, MaskVT, Trunc, SplatZero, ISD::SETNE);
}
- SDValue Trunc = DAG.getNode(ISD::AND, DL, VecVT, Src, SplatOne);
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
- return DAG.getSetCC(DL, MaskVT, Trunc, SplatZero, ISD::SETNE);
+ MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
+ SDValue Trunc =
+ DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, Src, SplatOne, Mask, VL);
+ Trunc = DAG.getNode(RISCVISD::SETCC_VL, DL, MaskContainerVT, Trunc, SplatZero,
+ DAG.getCondCode(ISD::SETNE), Mask, VL);
+ return convertFromScalableVector(MaskVT, Trunc, DAG, Subtarget);
}
+// Custom-legalize INSERT_VECTOR_ELT so that the value is inserted into the
+// first position of a vector, and that vector is slid up to the insert index.
+// By limiting the active vector length to index+1 and merging with the
+// original vector (with an undisturbed tail policy for elements >= VL), we
+// achieve the desired result of leaving all elements untouched except the one
+// at VL-1, which is replaced with the desired value.
SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
- EVT VecVT = Op.getValueType();
+ MVT VecVT = Op.getSimpleValueType();
SDValue Vec = Op.getOperand(0);
SDValue Val = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
- // Custom-legalize INSERT_VECTOR_ELT where XLEN>=SEW, so that the vector is
- // first slid down into position, the value is inserted into the first
- // position, and the vector is slid back up. We do this to simplify patterns.
- // (slideup vec, (insertelt (slidedown impdef, vec, idx), val, 0), idx),
- if (Subtarget.is64Bit() || VecVT.getVectorElementType() != MVT::i64) {
- if (isNullConstant(Idx))
- return Op;
- SDValue Slidedown = DAG.getNode(RISCVISD::VSLIDEDOWN, DL, VecVT,
- DAG.getUNDEF(VecVT), Vec, Idx);
- SDValue InsertElt0 =
- DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Slidedown, Val,
- DAG.getConstant(0, DL, Subtarget.getXLenVT()));
-
- return DAG.getNode(RISCVISD::VSLIDEUP, DL, VecVT, Vec, InsertElt0, Idx);
+ if (VecVT.getVectorElementType() == MVT::i1) {
+ // FIXME: For now we just promote to an i8 vector and insert into that,
+ // but this is probably not optimal.
+ MVT WideVT = MVT::getVectorVT(MVT::i8, VecVT.getVectorElementCount());
+ Vec = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Vec);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideVT, Vec, Val, Idx);
+ return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Vec);
}
- // Custom-legalize INSERT_VECTOR_ELT where XLEN<SEW, as the SEW element type
- // is illegal (currently only vXi64 RV32).
- // Since there is no easy way of getting a single element into a vector when
- // XLEN<SEW, we lower the operation to the following sequence:
- // splat vVal, rVal
- // vid.v vVid
- // vmseq.vx mMask, vVid, rIdx
- // vmerge.vvm vDest, vSrc, vVal, mMask
- // This essentially merges the original vector with the inserted element by
- // using a mask whose only set bit is that corresponding to the insert
- // index.
- SDValue SplattedVal = DAG.getSplatVector(VecVT, DL, Val);
- SDValue SplattedIdx = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, Idx);
+ MVT ContainerVT = VecVT;
+ // If the operand is a fixed-length vector, convert to a scalable one.
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VecVT);
+ Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ }
- SDValue VID = DAG.getNode(RISCVISD::VID, DL, VecVT);
- auto SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VecVT);
- SDValue Mask = DAG.getSetCC(DL, SetCCVT, VID, SplattedIdx, ISD::SETEQ);
+ MVT XLenVT = Subtarget.getXLenVT();
- return DAG.getNode(ISD::VSELECT, DL, VecVT, Mask, SplattedVal, Vec);
+ SDValue Zero = DAG.getConstant(0, DL, XLenVT);
+ bool IsLegalInsert = Subtarget.is64Bit() || Val.getValueType() != MVT::i64;
+ // Even i64-element vectors on RV32 can be lowered without scalar
+ // legalization if the most-significant 32 bits of the value are not affected
+ // by the sign-extension of the lower 32 bits.
+ // TODO: We could also catch sign extensions of a 32-bit value.
+ if (!IsLegalInsert && isa<ConstantSDNode>(Val)) {
+ const auto *CVal = cast<ConstantSDNode>(Val);
+ if (isInt<32>(CVal->getSExtValue())) {
+ IsLegalInsert = true;
+ Val = DAG.getConstant(CVal->getSExtValue(), DL, MVT::i32);
+ }
+ }
+
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+
+ SDValue ValInVec;
+
+ if (IsLegalInsert) {
+ unsigned Opc =
+ VecVT.isFloatingPoint() ? RISCVISD::VFMV_S_F_VL : RISCVISD::VMV_S_X_VL;
+ if (isNullConstant(Idx)) {
+ Vec = DAG.getNode(Opc, DL, ContainerVT, Vec, Val, VL);
+ if (!VecVT.isFixedLengthVector())
+ return Vec;
+ return convertFromScalableVector(VecVT, Vec, DAG, Subtarget);
+ }
+ ValInVec =
+ DAG.getNode(Opc, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Val, VL);
+ } else {
+ // On RV32, i64-element vectors must be specially handled to place the
+ // value at element 0, by using two vslide1up instructions in sequence on
+ // the i32 split lo/hi value. Use an equivalently-sized i32 vector for
+ // this.
+ SDValue One = DAG.getConstant(1, DL, XLenVT);
+ SDValue ValLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Val, Zero);
+ SDValue ValHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Val, One);
+ MVT I32ContainerVT =
+ MVT::getVectorVT(MVT::i32, ContainerVT.getVectorElementCount() * 2);
+ SDValue I32Mask =
+ getDefaultScalableVLOps(I32ContainerVT, DL, DAG, Subtarget).first;
+ // Limit the active VL to two.
+ SDValue InsertI64VL = DAG.getConstant(2, DL, XLenVT);
+ // Note: We can't pass an UNDEF to the first VSLIDE1UP_VL since an untied
+ // undef doesn't obey the earlyclobber constraint. Just splat a zero value.
+ ValInVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, I32ContainerVT, Zero,
+ InsertI64VL);
+ // First slide in the hi value, then the lo in underneath it.
+ ValInVec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32ContainerVT, ValInVec,
+ ValHi, I32Mask, InsertI64VL);
+ ValInVec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32ContainerVT, ValInVec,
+ ValLo, I32Mask, InsertI64VL);
+ // Bitcast back to the right container type.
+ ValInVec = DAG.getBitcast(ContainerVT, ValInVec);
+ }
+
+ // Now that the value is in a vector, slide it into position.
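+  // Using VL = Idx + 1 means the slideup writes only element Idx; elements
+  // below it and in the tail are left undisturbed.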
+ SDValue InsertVL =
+ DAG.getNode(ISD::ADD, DL, XLenVT, Idx, DAG.getConstant(1, DL, XLenVT));
+ SDValue Slideup = DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, ContainerVT, Vec,
+ ValInVec, Idx, Mask, InsertVL);
+ if (!VecVT.isFixedLengthVector())
+ return Slideup;
+ return convertFromScalableVector(VecVT, Slideup, DAG, Subtarget);
}
// Custom-lower EXTRACT_VECTOR_ELT operations to slide the vector down, then
-// extract the first element: (extractelt (slidedown vec, idx), 0). This is
-// done to maintain partity with the legalization of RV32 vXi64 legalization.
+// extract the first element: (extractelt (slidedown vec, idx), 0). For integer
+// types this is done using VMV_X_S to allow us to glean information about the
+// sign bits of the result.
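+// For example, an integer extract at a non-zero index is lowered roughly to
+//   vslidedown.vx v9, v8, idx    (with VL limited to 1)
+//   vmv.x.s       a0, v9
+// where the register names are illustrative placeholders only.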
SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Idx = Op.getOperand(1);
- if (isNullConstant(Idx))
- return Op;
-
SDValue Vec = Op.getOperand(0);
EVT EltVT = Op.getValueType();
- EVT VecVT = Vec.getValueType();
- SDValue Slidedown = DAG.getNode(RISCVISD::VSLIDEDOWN, DL, VecVT,
- DAG.getUNDEF(VecVT), Vec, Idx);
+ MVT VecVT = Vec.getSimpleValueType();
+ MVT XLenVT = Subtarget.getXLenVT();
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Slidedown,
- DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+ if (VecVT.getVectorElementType() == MVT::i1) {
+ // FIXME: For now we just promote to an i8 vector and extract from that,
+ // but this is probably not optimal.
+ MVT WideVT = MVT::getVectorVT(MVT::i8, VecVT.getVectorElementCount());
+ Vec = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Vec);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec, Idx);
+ }
+
+ // If this is a fixed vector, we need to convert it to a scalable vector.
+ MVT ContainerVT = VecVT;
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VecVT);
+ Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ }
+
+ // If the index is 0, the vector is already in the right position.
+ if (!isNullConstant(Idx)) {
+ // Use a VL of 1 to avoid processing more elements than we need.
+ SDValue VL = DAG.getConstant(1, DL, XLenVT);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+ SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+ Vec = DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), Vec, Idx, Mask, VL);
+ }
+
+ if (!EltVT.isInteger()) {
+ // Floating-point extracts are handled in TableGen.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
+ DAG.getConstant(0, DL, XLenVT));
+ }
+
+ SDValue Elt0 = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Vec);
+ return DAG.getNode(ISD::TRUNCATE, DL, EltVT, Elt0);
+}
+
+// Some RVV intrinsics may claim that they want an integer operand to be
+// promoted or expanded.
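+// For example, the scalar operand of a vadd.vx intrinsic with an i8 value on
+// RV64 is promoted to i64 (XLenVT), while an i64 scalar on RV32 is either
+// truncated (if it is a sign-extended 32-bit constant) or materialized as a
+// splat vector.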
+static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ assert((Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN) &&
+ "Unexpected opcode");
+
+ if (!Subtarget.hasStdExtV())
+ return SDValue();
+
+ bool HasChain = Op.getOpcode() == ISD::INTRINSIC_W_CHAIN;
+ unsigned IntNo = Op.getConstantOperandVal(HasChain ? 1 : 0);
+ SDLoc DL(Op);
+
+ const RISCVVIntrinsicsTable::RISCVVIntrinsicInfo *II =
+ RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntNo);
+ if (!II || !II->SplatOperand)
+ return SDValue();
+
+ unsigned SplatOp = II->SplatOperand + HasChain;
+ assert(SplatOp < Op.getNumOperands());
+
+ SmallVector<SDValue, 8> Operands(Op->op_begin(), Op->op_end());
+ SDValue &ScalarOp = Operands[SplatOp];
+ MVT OpVT = ScalarOp.getSimpleValueType();
+ MVT XLenVT = Subtarget.getXLenVT();
+
+  // If this isn't a scalar, or its type is XLenVT, we're done.
+ if (!OpVT.isScalarInteger() || OpVT == XLenVT)
+ return SDValue();
+
+ // Simplest case is that the operand needs to be promoted to XLenVT.
+ if (OpVT.bitsLT(XLenVT)) {
+ // If the operand is a constant, sign extend to increase our chances
+  // of being able to use a .vi instruction. ANY_EXTEND would become a
+  // zero extend and the simm5 check in isel would fail.
+ // FIXME: Should we ignore the upper bits in isel instead?
+ unsigned ExtOpc =
+ isa<ConstantSDNode>(ScalarOp) ? ISD::SIGN_EXTEND : ISD::ANY_EXTEND;
+ ScalarOp = DAG.getNode(ExtOpc, DL, XLenVT, ScalarOp);
+ return DAG.getNode(Op->getOpcode(), DL, Op->getVTList(), Operands);
+ }
+
+ // Use the previous operand to get the vXi64 VT. The result might be a mask
+ // VT for compares. Using the previous operand assumes that the previous
+ // operand will never have a smaller element size than a scalar operand and
+ // that a widening operation never uses SEW=64.
+ // NOTE: If this fails the below assert, we can probably just find the
+ // element count from any operand or result and use it to construct the VT.
+ assert(II->SplatOperand > 1 && "Unexpected splat operand!");
+ MVT VT = Op.getOperand(SplatOp - 1).getSimpleValueType();
+
+ // The more complex case is when the scalar is larger than XLenVT.
+ assert(XLenVT == MVT::i32 && OpVT == MVT::i64 &&
+ VT.getVectorElementType() == MVT::i64 && "Unexpected VTs!");
+
+ // If this is a sign-extended 32-bit constant, we can truncate it and rely
+ // on the instruction to sign-extend since SEW>XLEN.
+ if (auto *CVal = dyn_cast<ConstantSDNode>(ScalarOp)) {
+ if (isInt<32>(CVal->getSExtValue())) {
+ ScalarOp = DAG.getConstant(CVal->getSExtValue(), DL, MVT::i32);
+ return DAG.getNode(Op->getOpcode(), DL, Op->getVTList(), Operands);
+ }
+ }
+
+ // We need to convert the scalar to a splat vector.
+ // FIXME: Can we implicitly truncate the scalar if it is known to
+ // be sign extended?
+ // VL should be the last operand.
+ SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
+ assert(VL.getValueType() == XLenVT);
+ ScalarOp = splatSplitI64WithVL(DL, VT, ScalarOp, VL, DAG);
+ return DAG.getNode(Op->getOpcode(), DL, Op->getVTList(), Operands);
}
SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntNo = Op.getConstantOperandVal(0);
SDLoc DL(Op);
-
- if (Subtarget.hasStdExtV()) {
- // Some RVV intrinsics may claim that they want an integer operand to be
- // extended.
- if (const RISCVVIntrinsicsTable::RISCVVIntrinsicInfo *II =
- RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntNo)) {
- if (II->ExtendedOperand) {
- assert(II->ExtendedOperand < Op.getNumOperands());
- SmallVector<SDValue, 8> Operands(Op->op_begin(), Op->op_end());
- SDValue &ScalarOp = Operands[II->ExtendedOperand];
- EVT OpVT = ScalarOp.getValueType();
- if (OpVT == MVT::i8 || OpVT == MVT::i16 ||
- (OpVT == MVT::i32 && Subtarget.is64Bit())) {
- // If the operand is a constant, sign extend to increase our chances
- // of being able to use a .vi instruction. ANY_EXTEND would become a
- // a zero extend and the simm5 check in isel would fail.
- // FIXME: Should we ignore the upper bits in isel instead?
- unsigned ExtOpc = isa<ConstantSDNode>(ScalarOp) ? ISD::SIGN_EXTEND
- : ISD::ANY_EXTEND;
- ScalarOp = DAG.getNode(ExtOpc, DL, Subtarget.getXLenVT(), ScalarOp);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
- Operands);
- }
- }
- }
- }
+ MVT XLenVT = Subtarget.getXLenVT();
switch (IntNo) {
default:
- return SDValue(); // Don't custom lower most intrinsics.
+ break; // Don't custom lower most intrinsics.
case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getRegister(RISCV::X4, PtrVT);
}
+ case Intrinsic::riscv_orc_b:
+ // Lower to the GORCI encoding for orc.b.
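+    // (orc.b is equivalent to gorc with a control value of 7, i.e. an
+    // OR-combine within each byte.)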
+ return DAG.getNode(RISCVISD::GORC, DL, XLenVT, Op.getOperand(1),
+ DAG.getConstant(7, DL, XLenVT));
+ case Intrinsic::riscv_grev:
+ case Intrinsic::riscv_gorc: {
+ unsigned Opc =
+ IntNo == Intrinsic::riscv_grev ? RISCVISD::GREV : RISCVISD::GORC;
+ return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1), Op.getOperand(2));
+ }
+ case Intrinsic::riscv_shfl:
+ case Intrinsic::riscv_unshfl: {
+ unsigned Opc =
+ IntNo == Intrinsic::riscv_shfl ? RISCVISD::SHFL : RISCVISD::UNSHFL;
+ return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1), Op.getOperand(2));
+ }
+ case Intrinsic::riscv_bcompress:
+ case Intrinsic::riscv_bdecompress: {
+ unsigned Opc = IntNo == Intrinsic::riscv_bcompress ? RISCVISD::BCOMPRESS
+ : RISCVISD::BDECOMPRESS;
+ return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1), Op.getOperand(2));
+ }
case Intrinsic::riscv_vmv_x_s:
- assert(Op.getValueType() == Subtarget.getXLenVT() && "Unexpected VT!");
+ assert(Op.getValueType() == XLenVT && "Unexpected VT!");
return DAG.getNode(RISCVISD::VMV_X_S, DL, Op.getValueType(),
Op.getOperand(1));
+ case Intrinsic::riscv_vmv_v_x:
+ return lowerScalarSplat(Op.getOperand(1), Op.getOperand(2),
+ Op.getSimpleValueType(), DL, DAG, Subtarget);
+ case Intrinsic::riscv_vfmv_v_f:
+ return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::riscv_vmv_s_x: {
+ SDValue Scalar = Op.getOperand(2);
+
+ if (Scalar.getValueType().bitsLE(XLenVT)) {
+ Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, XLenVT, Scalar);
+ return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, Op.getValueType(),
+ Op.getOperand(1), Scalar, Op.getOperand(3));
+ }
+
+ assert(Scalar.getValueType() == MVT::i64 && "Unexpected scalar VT!");
+
+ // This is an i64 value that lives in two scalar registers. We have to
+    // insert this in a convoluted way. First we build a vXi64 splat containing
+    // the two values that we assemble using some bit math. Next we'll use
+ // vid.v and vmseq to build a mask with bit 0 set. Then we'll use that mask
+ // to merge element 0 from our splat into the source vector.
+ // FIXME: This is probably not the best way to do this, but it is
+ // consistent with INSERT_VECTOR_ELT lowering so it is a good starting
+ // point.
+ // sw lo, (a0)
+ // sw hi, 4(a0)
+ // vlse vX, (a0)
+ //
+ // vid.v vVid
+ // vmseq.vx mMask, vVid, 0
+ // vmerge.vvm vDest, vSrc, vVal, mMask
+ MVT VT = Op.getSimpleValueType();
+ SDValue Vec = Op.getOperand(1);
+ SDValue VL = Op.getOperand(3);
+
+ SDValue SplattedVal = splatSplitI64WithVL(DL, VT, Scalar, VL, DAG);
+ SDValue SplattedIdx = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT,
+ DAG.getConstant(0, DL, MVT::i32), VL);
+
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
+ SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+ SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, VT, Mask, VL);
+ SDValue SelectCond =
+ DAG.getNode(RISCVISD::SETCC_VL, DL, MaskVT, VID, SplattedIdx,
+ DAG.getCondCode(ISD::SETEQ), Mask, VL);
+ return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, SelectCond, SplattedVal,
+ Vec, VL);
}
+ case Intrinsic::riscv_vslide1up:
+ case Intrinsic::riscv_vslide1down:
+ case Intrinsic::riscv_vslide1up_mask:
+ case Intrinsic::riscv_vslide1down_mask: {
+ // We need to special case these when the scalar is larger than XLen.
+ unsigned NumOps = Op.getNumOperands();
+ bool IsMasked = NumOps == 6;
+ unsigned OpOffset = IsMasked ? 1 : 0;
+ SDValue Scalar = Op.getOperand(2 + OpOffset);
+ if (Scalar.getValueType().bitsLE(XLenVT))
+ break;
+
+ // Splatting a sign extended constant is fine.
+ if (auto *CVal = dyn_cast<ConstantSDNode>(Scalar))
+ if (isInt<32>(CVal->getSExtValue()))
+ break;
+
+ MVT VT = Op.getSimpleValueType();
+ assert(VT.getVectorElementType() == MVT::i64 &&
+ Scalar.getValueType() == MVT::i64 && "Unexpected VTs");
+
+ // Convert the vector source to the equivalent nxvXi32 vector.
+ MVT I32VT = MVT::getVectorVT(MVT::i32, VT.getVectorElementCount() * 2);
+ SDValue Vec = DAG.getBitcast(I32VT, Op.getOperand(1 + OpOffset));
+
+ SDValue ScalarLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
+ DAG.getConstant(0, DL, XLenVT));
+ SDValue ScalarHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
+ DAG.getConstant(1, DL, XLenVT));
+
+ // Double the VL since we halved SEW.
+ SDValue VL = Op.getOperand(NumOps - 1);
+ SDValue I32VL =
+ DAG.getNode(ISD::SHL, DL, XLenVT, VL, DAG.getConstant(1, DL, XLenVT));
+
+ MVT I32MaskVT = MVT::getVectorVT(MVT::i1, I32VT.getVectorElementCount());
+ SDValue I32Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, I32MaskVT, VL);
+
+ // Shift the two scalar parts in using SEW=32 slide1up/slide1down
+ // instructions.
+ if (IntNo == Intrinsic::riscv_vslide1up ||
+ IntNo == Intrinsic::riscv_vslide1up_mask) {
+ Vec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32VT, Vec, ScalarHi,
+ I32Mask, I32VL);
+ Vec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32VT, Vec, ScalarLo,
+ I32Mask, I32VL);
+ } else {
+ Vec = DAG.getNode(RISCVISD::VSLIDE1DOWN_VL, DL, I32VT, Vec, ScalarLo,
+ I32Mask, I32VL);
+ Vec = DAG.getNode(RISCVISD::VSLIDE1DOWN_VL, DL, I32VT, Vec, ScalarHi,
+ I32Mask, I32VL);
+ }
+
+ // Convert back to nxvXi64.
+ Vec = DAG.getBitcast(VT, Vec);
+
+ if (!IsMasked)
+ return Vec;
+
+ // Apply mask after the operation.
+ SDValue Mask = Op.getOperand(NumOps - 2);
+ SDValue MaskedOff = Op.getOperand(1);
+ return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, Mask, Vec, MaskedOff, VL);
+ }
+ }
+
+ return lowerVectorIntrinsicSplats(Op, DAG, Subtarget);
}
SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- SDLoc DL(Op);
+ return lowerVectorIntrinsicSplats(Op, DAG, Subtarget);
+}
- if (Subtarget.hasStdExtV()) {
- // Some RVV intrinsics may claim that they want an integer operand to be
- // extended.
- if (const RISCVVIntrinsicsTable::RISCVVIntrinsicInfo *II =
- RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntNo)) {
- if (II->ExtendedOperand) {
- // The operands start from the second argument in INTRINSIC_W_CHAIN.
- unsigned ExtendOp = II->ExtendedOperand + 1;
- assert(ExtendOp < Op.getNumOperands());
- SmallVector<SDValue, 8> Operands(Op->op_begin(), Op->op_end());
- SDValue &ScalarOp = Operands[ExtendOp];
- EVT OpVT = ScalarOp.getValueType();
- if (OpVT == MVT::i8 || OpVT == MVT::i16 ||
- (OpVT == MVT::i32 && Subtarget.is64Bit())) {
- // If the operand is a constant, sign extend to increase our chances
- // of being able to use a .vi instruction. ANY_EXTEND would become a
- // a zero extend and the simm5 check in isel would fail.
- // FIXME: Should we ignore the upper bits in isel instead?
- unsigned ExtOpc = isa<ConstantSDNode>(ScalarOp) ? ISD::SIGN_EXTEND
- : ISD::ANY_EXTEND;
- ScalarOp = DAG.getNode(ExtOpc, DL, Subtarget.getXLenVT(), ScalarOp);
- return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(),
- Operands);
- }
- }
+static MVT getLMUL1VT(MVT VT) {
+ assert(VT.getVectorElementType().getSizeInBits() <= 64 &&
+ "Unexpected vector MVT");
+ return MVT::getScalableVectorVT(
+ VT.getVectorElementType(),
+ RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
+}
+
+static unsigned getRVVReductionOp(unsigned ISDOpcode) {
+ switch (ISDOpcode) {
+ default:
+ llvm_unreachable("Unhandled reduction");
+ case ISD::VECREDUCE_ADD:
+ return RISCVISD::VECREDUCE_ADD_VL;
+ case ISD::VECREDUCE_UMAX:
+ return RISCVISD::VECREDUCE_UMAX_VL;
+ case ISD::VECREDUCE_SMAX:
+ return RISCVISD::VECREDUCE_SMAX_VL;
+ case ISD::VECREDUCE_UMIN:
+ return RISCVISD::VECREDUCE_UMIN_VL;
+ case ISD::VECREDUCE_SMIN:
+ return RISCVISD::VECREDUCE_SMIN_VL;
+ case ISD::VECREDUCE_AND:
+ return RISCVISD::VECREDUCE_AND_VL;
+ case ISD::VECREDUCE_OR:
+ return RISCVISD::VECREDUCE_OR_VL;
+ case ISD::VECREDUCE_XOR:
+ return RISCVISD::VECREDUCE_XOR_VL;
+ }
+}
+
+SDValue RISCVTargetLowering::lowerVectorMaskVECREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Vec = Op.getOperand(0);
+ MVT VecVT = Vec.getSimpleValueType();
+ assert((Op.getOpcode() == ISD::VECREDUCE_AND ||
+ Op.getOpcode() == ISD::VECREDUCE_OR ||
+ Op.getOpcode() == ISD::VECREDUCE_XOR) &&
+ "Unexpected reduction lowering");
+
+ MVT XLenVT = Subtarget.getXLenVT();
+ assert(Op.getValueType() == XLenVT &&
+ "Expected reduction output to be legalized to XLenVT");
+
+ MVT ContainerVT = VecVT;
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VecVT);
+ Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ }
+
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+ SDValue Zero = DAG.getConstant(0, DL, XLenVT);
+
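+  // Mask reductions are implemented via vpopc, the count of set mask bits:
+  // AND is "no clear bit present" (so vpopc of the complement is 0), OR is
+  // "some bit set", and XOR is the parity of the set bits.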
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unhandled reduction");
+ case ISD::VECREDUCE_AND:
+ // vpopc ~x == 0
+ Vec = DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Vec, Mask, VL);
+ Vec = DAG.getNode(RISCVISD::VPOPC_VL, DL, XLenVT, Vec, Mask, VL);
+ return DAG.getSetCC(DL, XLenVT, Vec, Zero, ISD::SETEQ);
+ case ISD::VECREDUCE_OR:
+ // vpopc x != 0
+ Vec = DAG.getNode(RISCVISD::VPOPC_VL, DL, XLenVT, Vec, Mask, VL);
+ return DAG.getSetCC(DL, XLenVT, Vec, Zero, ISD::SETNE);
+ case ISD::VECREDUCE_XOR: {
+ // ((vpopc x) & 1) != 0
+ SDValue One = DAG.getConstant(1, DL, XLenVT);
+ Vec = DAG.getNode(RISCVISD::VPOPC_VL, DL, XLenVT, Vec, Mask, VL);
+ Vec = DAG.getNode(ISD::AND, DL, XLenVT, Vec, One);
+ return DAG.getSetCC(DL, XLenVT, Vec, Zero, ISD::SETNE);
+ }
+ }
+}
+
+SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Vec = Op.getOperand(0);
+ EVT VecEVT = Vec.getValueType();
+
+ unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
+
+ // Due to ordering in legalize types we may have a vector type that needs to
+ // be split. Do that manually so we can get down to a legal type.
+ while (getTypeAction(*DAG.getContext(), VecEVT) ==
+ TargetLowering::TypeSplitVector) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
+ VecEVT = Lo.getValueType();
+ Vec = DAG.getNode(BaseOpc, DL, VecEVT, Lo, Hi);
+ }
+
+ // TODO: The type may need to be widened rather than split. Or widened before
+ // it can be split.
+ if (!isTypeLegal(VecEVT))
+ return SDValue();
+
+ MVT VecVT = VecEVT.getSimpleVT();
+ MVT VecEltVT = VecVT.getVectorElementType();
+ unsigned RVVOpcode = getRVVReductionOp(Op.getOpcode());
+
+ MVT ContainerVT = VecVT;
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VecVT);
+ Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ }
+
+ MVT M1VT = getLMUL1VT(ContainerVT);
+
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+
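+  // RVV reduction instructions (vredsum.vs and friends) take the start value
+  // in element 0 of a single vector register and produce the result in
+  // element 0 of the destination, hence the LMUL=1 neutral-element splat
+  // below and the final extract of element 0.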
+ // FIXME: This is a VLMAX splat which might be too large and can prevent
+ // vsetvli removal.
+ SDValue NeutralElem =
+ DAG.getNeutralElement(BaseOpc, DL, VecEltVT, SDNodeFlags());
+ SDValue IdentitySplat = DAG.getSplatVector(M1VT, DL, NeutralElem);
+ SDValue Reduction =
+ DAG.getNode(RVVOpcode, DL, M1VT, Vec, IdentitySplat, Mask, VL);
+ SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction,
+ DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+ return DAG.getSExtOrTrunc(Elt0, DL, Op.getValueType());
+}
+
+// Given a reduction op, this function returns the matching reduction opcode,
+// the vector SDValue and the scalar SDValue required to lower this to a
+// RISCVISD node.
+static std::tuple<unsigned, SDValue, SDValue>
+getRVVFPReductionOpAndOperands(SDValue Op, SelectionDAG &DAG, EVT EltVT) {
+ SDLoc DL(Op);
+ auto Flags = Op->getFlags();
+ unsigned Opcode = Op.getOpcode();
+ unsigned BaseOpcode = ISD::getVecReduceBaseOpcode(Opcode);
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unhandled reduction");
+ case ISD::VECREDUCE_FADD:
+ return std::make_tuple(RISCVISD::VECREDUCE_FADD_VL, Op.getOperand(0),
+ DAG.getNeutralElement(BaseOpcode, DL, EltVT, Flags));
+ case ISD::VECREDUCE_SEQ_FADD:
+ return std::make_tuple(RISCVISD::VECREDUCE_SEQ_FADD_VL, Op.getOperand(1),
+ Op.getOperand(0));
+ case ISD::VECREDUCE_FMIN:
+ return std::make_tuple(RISCVISD::VECREDUCE_FMIN_VL, Op.getOperand(0),
+ DAG.getNeutralElement(BaseOpcode, DL, EltVT, Flags));
+ case ISD::VECREDUCE_FMAX:
+ return std::make_tuple(RISCVISD::VECREDUCE_FMAX_VL, Op.getOperand(0),
+ DAG.getNeutralElement(BaseOpcode, DL, EltVT, Flags));
+ }
+}
+
+SDValue RISCVTargetLowering::lowerFPVECREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VecEltVT = Op.getSimpleValueType();
+
+ unsigned RVVOpcode;
+ SDValue VectorVal, ScalarVal;
+ std::tie(RVVOpcode, VectorVal, ScalarVal) =
+ getRVVFPReductionOpAndOperands(Op, DAG, VecEltVT);
+ MVT VecVT = VectorVal.getSimpleValueType();
+
+ MVT ContainerVT = VecVT;
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VecVT);
+ VectorVal = convertToScalableVector(ContainerVT, VectorVal, DAG, Subtarget);
+ }
+
+ MVT M1VT = getLMUL1VT(VectorVal.getSimpleValueType());
+
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+
+ // FIXME: This is a VLMAX splat which might be too large and can prevent
+ // vsetvli removal.
+ SDValue ScalarSplat = DAG.getSplatVector(M1VT, DL, ScalarVal);
+ SDValue Reduction =
+ DAG.getNode(RVVOpcode, DL, M1VT, VectorVal, ScalarSplat, Mask, VL);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction,
+ DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+}
+
+SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0);
+ SDValue SubVec = Op.getOperand(1);
+ MVT VecVT = Vec.getSimpleValueType();
+ MVT SubVecVT = SubVec.getSimpleValueType();
+
+ SDLoc DL(Op);
+ MVT XLenVT = Subtarget.getXLenVT();
+ unsigned OrigIdx = Op.getConstantOperandVal(2);
+ const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ // We don't have the ability to slide mask vectors up indexed by their i1
+ // elements; the smallest we can do is i8. Often we are able to bitcast to
+ // equivalent i8 vectors. Note that when inserting a fixed-length vector
+ // into a scalable one, we might not necessarily have enough scalable
+ // elements to safely divide by 8: nxv1i1 = insert nxv1i1, v4i1 is valid.
+ if (SubVecVT.getVectorElementType() == MVT::i1 &&
+ (OrigIdx != 0 || !Vec.isUndef())) {
+ if (VecVT.getVectorMinNumElements() >= 8 &&
+ SubVecVT.getVectorMinNumElements() >= 8) {
+ assert(OrigIdx % 8 == 0 && "Invalid index");
+ assert(VecVT.getVectorMinNumElements() % 8 == 0 &&
+ SubVecVT.getVectorMinNumElements() % 8 == 0 &&
+ "Unexpected mask vector lowering");
+ OrigIdx /= 8;
+ SubVecVT =
+ MVT::getVectorVT(MVT::i8, SubVecVT.getVectorMinNumElements() / 8,
+ SubVecVT.isScalableVector());
+ VecVT = MVT::getVectorVT(MVT::i8, VecVT.getVectorMinNumElements() / 8,
+ VecVT.isScalableVector());
+ Vec = DAG.getBitcast(VecVT, Vec);
+ SubVec = DAG.getBitcast(SubVecVT, SubVec);
+ } else {
+ // We can't slide this mask vector up indexed by its i1 elements.
+ // This poses a problem when we wish to insert a scalable vector which
+ // can't be re-expressed as a larger type. Just choose the slow path and
+ // extend to a larger type, then truncate back down.
+ MVT ExtVecVT = VecVT.changeVectorElementType(MVT::i8);
+ MVT ExtSubVecVT = SubVecVT.changeVectorElementType(MVT::i8);
+ Vec = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVecVT, Vec);
+ SubVec = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtSubVecVT, SubVec);
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ExtVecVT, Vec, SubVec,
+ Op.getOperand(2));
+ SDValue SplatZero = DAG.getConstant(0, DL, ExtVecVT);
+ return DAG.getSetCC(DL, VecVT, Vec, SplatZero, ISD::SETNE);
}
}
- unsigned NF = 1;
- switch (IntNo) {
- default:
- return SDValue(); // Don't custom lower most intrinsics.
- case Intrinsic::riscv_vleff: {
- SDLoc DL(Op);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other, MVT::Glue);
- SDValue Load = DAG.getNode(RISCVISD::VLEFF, DL, VTs, Op.getOperand(0),
- Op.getOperand(2), Op.getOperand(3));
- VTs = DAG.getVTList(Op->getValueType(1), MVT::Other);
- SDValue ReadVL = DAG.getNode(RISCVISD::READ_VL, DL, VTs, Load.getValue(2));
- return DAG.getMergeValues({Load, ReadVL, Load.getValue(1)}, DL);
+  // If the subvector is a fixed-length type, we cannot use subregister
+ // manipulation to simplify the codegen; we don't know which register of a
+ // LMUL group contains the specific subvector as we only know the minimum
+ // register size. Therefore we must slide the vector group up the full
+ // amount.
+ if (SubVecVT.isFixedLengthVector()) {
+ if (OrigIdx == 0 && Vec.isUndef())
+ return Op;
+ MVT ContainerVT = VecVT;
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VecVT);
+ Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ }
+ SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), SubVec,
+ DAG.getConstant(0, DL, XLenVT));
+ SDValue Mask =
+ getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first;
+ // Set the vector length to only the number of elements we care about. Note
+ // that for slideup this includes the offset.
+ SDValue VL =
+ DAG.getConstant(OrigIdx + SubVecVT.getVectorNumElements(), DL, XLenVT);
+ SDValue SlideupAmt = DAG.getConstant(OrigIdx, DL, XLenVT);
+ SDValue Slideup = DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, ContainerVT, Vec,
+ SubVec, SlideupAmt, Mask, VL);
+ if (VecVT.isFixedLengthVector())
+ Slideup = convertFromScalableVector(VecVT, Slideup, DAG, Subtarget);
+ return DAG.getBitcast(Op.getValueType(), Slideup);
}
- case Intrinsic::riscv_vleff_mask: {
- SDLoc DL(Op);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other, MVT::Glue);
- SDValue Load = DAG.getNode(RISCVISD::VLEFF_MASK, DL, VTs, Op.getOperand(0),
- Op.getOperand(2), Op.getOperand(3),
- Op.getOperand(4), Op.getOperand(5));
- VTs = DAG.getVTList(Op->getValueType(1), MVT::Other);
- SDValue ReadVL = DAG.getNode(RISCVISD::READ_VL, DL, VTs, Load.getValue(2));
- return DAG.getMergeValues({Load, ReadVL, Load.getValue(1)}, DL);
+
+ unsigned SubRegIdx, RemIdx;
+ std::tie(SubRegIdx, RemIdx) =
+ RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+ VecVT, SubVecVT, OrigIdx, TRI);
+
+ RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+ bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 ||
+ SubVecLMUL == RISCVII::VLMUL::LMUL_F4 ||
+ SubVecLMUL == RISCVII::VLMUL::LMUL_F8;
+
+  // 1. If the Idx has been completely eliminated and this subvector's size is
+  // that of a vector register or a multiple thereof, or the surrounding
+  // elements are undef, then this is a subvector insert which naturally
+  // aligns to a vector register. These can easily be handled using
+  // subregister manipulation.
+ // 2. If the subvector is smaller than a vector register, then the insertion
+ // must preserve the undisturbed elements of the register. We do this by
+ // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
+ // (which resolves to a subregister copy), performing a VSLIDEUP to place the
+ // subvector within the vector register, and an INSERT_SUBVECTOR of that
+ // LMUL=1 type back into the larger vector (resolving to another subregister
+ // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
+ // to avoid allocating a large register group to hold our subvector.
+ if (RemIdx == 0 && (!IsSubVecPartReg || Vec.isUndef()))
+ return Op;
+
+  // VSLIDEUP works by leaving elements 0<=i<OFFSET undisturbed, elements
+ // OFFSET<=i<VL set to the "subvector" and vl<=i<VLMAX set to the tail policy
+ // (in our case undisturbed). This means we can set up a subvector insertion
+ // where OFFSET is the insertion offset, and the VL is the OFFSET plus the
+ // size of the subvector.
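+  // For example, with RemIdx == 3 and a subvector of (minimum) 2 elements,
+  // the slideup uses OFFSET = 3*vscale and VL = 5*vscale, so only the
+  // elements holding the subvector are written.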
+ MVT InterSubVT = VecVT;
+ SDValue AlignedExtract = Vec;
+ unsigned AlignedIdx = OrigIdx - RemIdx;
+ if (VecVT.bitsGT(getLMUL1VT(VecVT))) {
+ InterSubVT = getLMUL1VT(VecVT);
+ // Extract a subvector equal to the nearest full vector register type. This
+    // should resolve to an EXTRACT_SUBREG instruction.
+ AlignedExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
+ DAG.getConstant(AlignedIdx, DL, XLenVT));
}
- case Intrinsic::riscv_vlseg8ff:
- NF++;
- LLVM_FALLTHROUGH;
- case Intrinsic::riscv_vlseg7ff:
- NF++;
- LLVM_FALLTHROUGH;
- case Intrinsic::riscv_vlseg6ff:
- NF++;
- LLVM_FALLTHROUGH;
- case Intrinsic::riscv_vlseg5ff:
- NF++;
- LLVM_FALLTHROUGH;
- case Intrinsic::riscv_vlseg4ff:
- NF++;
- LLVM_FALLTHROUGH;
- case Intrinsic::riscv_vlseg3ff:
- NF++;
- LLVM_FALLTHROUGH;
- case Intrinsic::riscv_vlseg2ff: {
- NF++;
- SDLoc DL(Op);
- SmallVector<EVT, 8> EVTs(NF, Op.getValueType());
- EVTs.push_back(MVT::Other);
- EVTs.push_back(MVT::Glue);
- SDVTList VTs = DAG.getVTList(EVTs);
- SDValue Load =
- DAG.getNode(RISCVISD::VLSEGFF, DL, VTs, Op.getOperand(0),
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
- VTs = DAG.getVTList(Op->getValueType(NF), MVT::Other);
- SDValue ReadVL = DAG.getNode(RISCVISD::READ_VL, DL, VTs,
- /*Glue*/ Load.getValue(NF + 1));
- SmallVector<SDValue, 8> Results;
- for (unsigned i = 0; i < NF; ++i)
- Results.push_back(Load.getValue(i));
- Results.push_back(ReadVL);
- Results.push_back(Load.getValue(NF)); // Chain.
- return DAG.getMergeValues(Results, DL);
+
+ SDValue SlideupAmt = DAG.getConstant(RemIdx, DL, XLenVT);
+ // For scalable vectors this must be further multiplied by vscale.
+ SlideupAmt = DAG.getNode(ISD::VSCALE, DL, XLenVT, SlideupAmt);
+
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
+
+ // Construct the vector length corresponding to RemIdx + length(SubVecVT).
+ VL = DAG.getConstant(SubVecVT.getVectorMinNumElements(), DL, XLenVT);
+ VL = DAG.getNode(ISD::VSCALE, DL, XLenVT, VL);
+ VL = DAG.getNode(ISD::ADD, DL, XLenVT, SlideupAmt, VL);
+
+ SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InterSubVT,
+ DAG.getUNDEF(InterSubVT), SubVec,
+ DAG.getConstant(0, DL, XLenVT));
+
+ SDValue Slideup = DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, InterSubVT,
+ AlignedExtract, SubVec, SlideupAmt, Mask, VL);
+
+ // If required, insert this subvector back into the correct vector register.
+ // This should resolve to an INSERT_SUBREG instruction.
+ if (VecVT.bitsGT(InterSubVT))
+ Slideup = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, Vec, Slideup,
+ DAG.getConstant(AlignedIdx, DL, XLenVT));
+
+ // We might have bitcast from a mask type: cast back to the original type if
+ // required.
+ return DAG.getBitcast(Op.getSimpleValueType(), Slideup);
+}
+
+SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0);
+ MVT SubVecVT = Op.getSimpleValueType();
+ MVT VecVT = Vec.getSimpleValueType();
+
+ SDLoc DL(Op);
+ MVT XLenVT = Subtarget.getXLenVT();
+ unsigned OrigIdx = Op.getConstantOperandVal(1);
+ const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ // We don't have the ability to slide mask vectors down indexed by their i1
+ // elements; the smallest we can do is i8. Often we are able to bitcast to
+ // equivalent i8 vectors. Note that when extracting a fixed-length vector
+ // from a scalable one, we might not necessarily have enough scalable
+ // elements to safely divide by 8: v8i1 = extract nxv1i1 is valid.
+ if (SubVecVT.getVectorElementType() == MVT::i1 && OrigIdx != 0) {
+ if (VecVT.getVectorMinNumElements() >= 8 &&
+ SubVecVT.getVectorMinNumElements() >= 8) {
+ assert(OrigIdx % 8 == 0 && "Invalid index");
+ assert(VecVT.getVectorMinNumElements() % 8 == 0 &&
+ SubVecVT.getVectorMinNumElements() % 8 == 0 &&
+ "Unexpected mask vector lowering");
+ OrigIdx /= 8;
+ SubVecVT =
+ MVT::getVectorVT(MVT::i8, SubVecVT.getVectorMinNumElements() / 8,
+ SubVecVT.isScalableVector());
+ VecVT = MVT::getVectorVT(MVT::i8, VecVT.getVectorMinNumElements() / 8,
+ VecVT.isScalableVector());
+ Vec = DAG.getBitcast(VecVT, Vec);
+ } else {
+      // We can't slide this mask vector down indexed by its i1 elements.
+ // This poses a problem when we wish to extract a scalable vector which
+ // can't be re-expressed as a larger type. Just choose the slow path and
+ // extend to a larger type, then truncate back down.
+      // TODO: We could probably improve this when extracting certain
+      // fixed-length vectors from fixed-length vectors, where we can extract
+      // as i8 and shift the correct element right to reach the desired
+      // subvector.
+ MVT ExtVecVT = VecVT.changeVectorElementType(MVT::i8);
+ MVT ExtSubVecVT = SubVecVT.changeVectorElementType(MVT::i8);
+ Vec = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVecVT, Vec);
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtSubVecVT, Vec,
+ Op.getOperand(1));
+ SDValue SplatZero = DAG.getConstant(0, DL, ExtSubVecVT);
+ return DAG.getSetCC(DL, SubVecVT, Vec, SplatZero, ISD::SETNE);
+ }
}
- case Intrinsic::riscv_vlseg8ff_mask:
- NF++;
- LLVM_FALLTHROUGH;
- case Intrinsic::riscv_vlseg7ff_mask:
- NF++;
- LLVM_FALLTHROUGH;
- case Intrinsic::riscv_vlseg6ff_mask:
- NF++;
- LLVM_FALLTHROUGH;
- case Intrinsic::riscv_vlseg5ff_mask:
- NF++;
- LLVM_FALLTHROUGH;
- case Intrinsic::riscv_vlseg4ff_mask:
- NF++;
- LLVM_FALLTHROUGH;
- case Intrinsic::riscv_vlseg3ff_mask:
- NF++;
- LLVM_FALLTHROUGH;
- case Intrinsic::riscv_vlseg2ff_mask: {
- NF++;
- SDLoc DL(Op);
- SmallVector<EVT, 8> EVTs(NF, Op.getValueType());
- EVTs.push_back(MVT::Other);
- EVTs.push_back(MVT::Glue);
- SDVTList VTs = DAG.getVTList(EVTs);
- SmallVector<SDValue, 13> LoadOps;
- LoadOps.push_back(Op.getOperand(0)); // Chain.
- LoadOps.push_back(Op.getOperand(1)); // Intrinsic ID.
- for (unsigned i = 0; i < NF; ++i)
- LoadOps.push_back(Op.getOperand(2 + i)); // MaskedOff.
- LoadOps.push_back(Op.getOperand(2 + NF)); // Base.
- LoadOps.push_back(Op.getOperand(3 + NF)); // Mask.
- LoadOps.push_back(Op.getOperand(4 + NF)); // VL.
- SDValue Load = DAG.getNode(RISCVISD::VLSEGFF_MASK, DL, VTs, LoadOps);
- VTs = DAG.getVTList(Op->getValueType(NF), MVT::Other);
- SDValue ReadVL = DAG.getNode(RISCVISD::READ_VL, DL, VTs,
- /*Glue*/ Load.getValue(NF + 1));
- SmallVector<SDValue, 8> Results;
- for (unsigned i = 0; i < NF; ++i)
- Results.push_back(Load.getValue(i));
- Results.push_back(ReadVL);
- Results.push_back(Load.getValue(NF)); // Chain.
- return DAG.getMergeValues(Results, DL);
+
+  // If the subvector is a fixed-length type, we cannot use subregister
+ // manipulation to simplify the codegen; we don't know which register of a
+ // LMUL group contains the specific subvector as we only know the minimum
+ // register size. Therefore we must slide the vector group down the full
+ // amount.
+ if (SubVecVT.isFixedLengthVector()) {
+ // With an index of 0 this is a cast-like subvector, which can be performed
+ // with subregister operations.
+ if (OrigIdx == 0)
+ return Op;
+ MVT ContainerVT = VecVT;
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VecVT);
+ Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ }
+ SDValue Mask =
+ getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first;
+ // Set the vector length to only the number of elements we care about. This
+ // avoids sliding down elements we're going to discard straight away.
+ SDValue VL = DAG.getConstant(SubVecVT.getVectorNumElements(), DL, XLenVT);
+ SDValue SlidedownAmt = DAG.getConstant(OrigIdx, DL, XLenVT);
+ SDValue Slidedown =
+ DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), Vec, SlidedownAmt, Mask, VL);
+ // Now we can use a cast-like subvector extract to get the result.
+ Slidedown = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Slidedown,
+ DAG.getConstant(0, DL, XLenVT));
+ return DAG.getBitcast(Op.getValueType(), Slidedown);
}
+
+ unsigned SubRegIdx, RemIdx;
+ std::tie(SubRegIdx, RemIdx) =
+ RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+ VecVT, SubVecVT, OrigIdx, TRI);
+
+ // If the Idx has been completely eliminated then this is a subvector extract
+ // which naturally aligns to a vector register. These can easily be handled
+ // using subregister manipulation.
+ if (RemIdx == 0)
+ return Op;
+
+ // Else we must shift our vector register directly to extract the subvector.
+ // Do this using VSLIDEDOWN.
+
+ // If the vector type is an LMUL-group type, extract a subvector equal to the
+  // nearest full vector register type. This should resolve to an EXTRACT_SUBREG
+ // instruction.
+ MVT InterSubVT = VecVT;
+ if (VecVT.bitsGT(getLMUL1VT(VecVT))) {
+ InterSubVT = getLMUL1VT(VecVT);
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
+ DAG.getConstant(OrigIdx - RemIdx, DL, XLenVT));
}
+
+ // Slide this vector register down by the desired number of elements in order
+ // to place the desired subvector starting at element 0.
+ SDValue SlidedownAmt = DAG.getConstant(RemIdx, DL, XLenVT);
+ // For scalable vectors this must be further multiplied by vscale.
+ SlidedownAmt = DAG.getNode(ISD::VSCALE, DL, XLenVT, SlidedownAmt);
+
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultScalableVLOps(InterSubVT, DL, DAG, Subtarget);
+ SDValue Slidedown =
+ DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, InterSubVT,
+ DAG.getUNDEF(InterSubVT), Vec, SlidedownAmt, Mask, VL);
+
+ // Now the vector is in the right position, extract our final subvector. This
+ // should resolve to a COPY.
+ Slidedown = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Slidedown,
+ DAG.getConstant(0, DL, XLenVT));
+
+ // We might have bitcast from a mask type: cast back to the original type if
+ // required.
+ return DAG.getBitcast(Op.getSimpleValueType(), Slidedown);
+}
+
+// Lower step_vector to the vid instruction. Any non-identity step value must
+// be accounted for by manual expansion.
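+// For example, a step of 4 becomes vid.v followed by a vector shift left by
+// log2(4) = 2, while non-power-of-two steps use a vector multiply by a splat
+// of the step value.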
+SDValue RISCVTargetLowering::lowerSTEP_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ MVT XLenVT = Subtarget.getXLenVT();
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultScalableVLOps(VT, DL, DAG, Subtarget);
+ SDValue StepVec = DAG.getNode(RISCVISD::VID_VL, DL, VT, Mask, VL);
+ uint64_t StepValImm = Op.getConstantOperandVal(0);
+ if (StepValImm != 1) {
+ if (isPowerOf2_64(StepValImm)) {
+ SDValue StepVal =
+ DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT,
+ DAG.getConstant(Log2_64(StepValImm), DL, XLenVT));
+ StepVec = DAG.getNode(ISD::SHL, DL, VT, StepVec, StepVal);
+ } else {
+ SDValue StepVal = lowerScalarSplat(
+ DAG.getConstant(StepValImm, DL, VT.getVectorElementType()), VL, VT,
+ DL, DAG, Subtarget);
+ StepVec = DAG.getNode(ISD::MUL, DL, VT, StepVec, StepVal);
+ }
+ }
+ return StepVec;
+}
+
+// Implement vector_reverse using vrgather.vv with indices determined by
+// subtracting the id of each element from (VLMAX-1). This will convert
+// the indices like so:
+// (0, 1,..., VLMAX-2, VLMAX-1) -> (VLMAX-1, VLMAX-2,..., 1, 0).
+// TODO: This code assumes VLMAX <= 65536 for LMUL=8 SEW=16.
+SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VecVT = Op.getSimpleValueType();
+ unsigned EltSize = VecVT.getScalarSizeInBits();
+ unsigned MinSize = VecVT.getSizeInBits().getKnownMinValue();
+
+ unsigned MaxVLMAX = 0;
+ unsigned VectorBitsMax = Subtarget.getMaxRVVVectorSizeInBits();
+ if (VectorBitsMax != 0)
+ MaxVLMAX = ((VectorBitsMax / EltSize) * MinSize) / RISCV::RVVBitsPerBlock;
+
+ unsigned GatherOpc = RISCVISD::VRGATHER_VV_VL;
+ MVT IntVT = VecVT.changeVectorElementTypeToInteger();
+
+ // If this is SEW=8 and VLMAX is unknown or more than 256, we need
+ // to use vrgatherei16.vv.
+ // TODO: It's also possible to use vrgatherei16.vv for other types to
+ // decrease register width for the index calculation.
+ if ((MaxVLMAX == 0 || MaxVLMAX > 256) && EltSize == 8) {
+    // If this is LMUL=8, we have to split before we can use vrgatherei16.vv.
+ // Reverse each half, then reassemble them in reverse order.
+    // NOTE: It's also possible that, after splitting, VLMAX no longer
+    // requires vrgatherei16.vv.
+ if (MinSize == (8 * RISCV::RVVBitsPerBlock)) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
+ Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, LoVT, Lo);
+ Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, HiVT, Hi);
+ // Reassemble the low and high pieces reversed.
+ // FIXME: This is a CONCAT_VECTORS.
+ SDValue Res =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, DAG.getUNDEF(VecVT), Hi,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(
+ ISD::INSERT_SUBVECTOR, DL, VecVT, Res, Lo,
+ DAG.getIntPtrConstant(LoVT.getVectorMinNumElements(), DL));
+ }
+
+ // Just promote the int type to i16 which will double the LMUL.
+ IntVT = MVT::getVectorVT(MVT::i16, VecVT.getVectorElementCount());
+ GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
+ }
+
+ MVT XLenVT = Subtarget.getXLenVT();
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
+
+ // Calculate VLMAX-1 for the desired SEW.
+ unsigned MinElts = VecVT.getVectorMinNumElements();
+ SDValue VLMax = DAG.getNode(ISD::VSCALE, DL, XLenVT,
+ DAG.getConstant(MinElts, DL, XLenVT));
+ SDValue VLMinus1 =
+ DAG.getNode(ISD::SUB, DL, XLenVT, VLMax, DAG.getConstant(1, DL, XLenVT));
+
+ // Splat VLMAX-1 taking care to handle SEW==64 on RV32.
+ bool IsRV32E64 =
+ !Subtarget.is64Bit() && IntVT.getVectorElementType() == MVT::i64;
+ SDValue SplatVL;
+ if (!IsRV32E64)
+ SplatVL = DAG.getSplatVector(IntVT, DL, VLMinus1);
+ else
+ SplatVL = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, IntVT, VLMinus1);
+
+ SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, IntVT, Mask, VL);
+ SDValue Indices =
+ DAG.getNode(RISCVISD::SUB_VL, DL, IntVT, SplatVL, VID, Mask, VL);
+
+ return DAG.getNode(GatherOpc, DL, VecVT, Op.getOperand(0), Indices, Mask, VL);
+}
+
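+// Lower a fixed-length vector load by "casting" the result to its scalable
+// container type and emitting a unit-stride VLE_VL load whose VL is the
+// number of fixed-length elements.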
+SDValue
+RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ auto *Load = cast<LoadSDNode>(Op);
+
+ assert(allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ Load->getMemoryVT(),
+ *Load->getMemOperand()) &&
+ "Expecting a correctly-aligned load");
+
+ MVT VT = Op.getSimpleValueType();
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+
+ SDValue VL =
+ DAG.getConstant(VT.getVectorNumElements(), DL, Subtarget.getXLenVT());
+
+ SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+ SDValue NewLoad = DAG.getMemIntrinsicNode(
+ RISCVISD::VLE_VL, DL, VTs, {Load->getChain(), Load->getBasePtr(), VL},
+ Load->getMemoryVT(), Load->getMemOperand());
+
+ SDValue Result = convertFromScalableVector(VT, NewLoad, DAG, Subtarget);
+ return DAG.getMergeValues({Result, Load->getChain()}, DL);
+}
+
+SDValue
+RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ auto *Store = cast<StoreSDNode>(Op);
+
+ assert(allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ Store->getMemoryVT(),
+ *Store->getMemOperand()) &&
+ "Expecting a correctly-aligned store");
+
+ SDValue StoreVal = Store->getValue();
+ MVT VT = StoreVal.getSimpleValueType();
+
+  // If the size is less than a byte, we need to pad with zeros to make a byte.
+ if (VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() < 8) {
+ VT = MVT::v8i1;
+ StoreVal = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ DAG.getConstant(0, DL, VT), StoreVal,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+
+ SDValue VL =
+ DAG.getConstant(VT.getVectorNumElements(), DL, Subtarget.getXLenVT());
+
+ SDValue NewValue =
+ convertToScalableVector(ContainerVT, StoreVal, DAG, Subtarget);
+ return DAG.getMemIntrinsicNode(
+ RISCVISD::VSE_VL, DL, DAG.getVTList(MVT::Other),
+ {Store->getChain(), NewValue, Store->getBasePtr(), VL},
+ Store->getMemoryVT(), Store->getMemOperand());
+}
+
+SDValue RISCVTargetLowering::lowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
+ auto *Load = cast<MaskedLoadSDNode>(Op);
+
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ SDValue Mask = Load->getMask();
+ SDValue PassThru = Load->getPassThru();
+ SDValue VL;
+
+ MVT ContainerVT = VT;
+ if (VT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VT);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+
+ Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+ PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget);
+ VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+ } else
+ VL = DAG.getRegister(RISCV::X0, XLenVT);
+
+ SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+ SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vle_mask, DL, XLenVT);
+ SDValue Ops[] = {Load->getChain(), IntID, PassThru,
+ Load->getBasePtr(), Mask, VL};
+ SDValue Result =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
+ Load->getMemoryVT(), Load->getMemOperand());
+ SDValue Chain = Result.getValue(1);
+
+ if (VT.isFixedLengthVector())
+ Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
+
+ return DAG.getMergeValues({Result, Chain}, DL);
+}
+
+SDValue RISCVTargetLowering::lowerMSTORE(SDValue Op, SelectionDAG &DAG) const {
+ auto *Store = cast<MaskedStoreSDNode>(Op);
+
+ SDLoc DL(Op);
+ SDValue Val = Store->getValue();
+ SDValue Mask = Store->getMask();
+ MVT VT = Val.getSimpleValueType();
+ MVT XLenVT = Subtarget.getXLenVT();
+ SDValue VL;
+
+ MVT ContainerVT = VT;
+ if (VT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VT);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+
+ Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget);
+ Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+ VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+ } else
+ VL = DAG.getRegister(RISCV::X0, XLenVT);
+
+ SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vse_mask, DL, XLenVT);
+ return DAG.getMemIntrinsicNode(
+ ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other),
+ {Store->getChain(), IntID, Val, Store->getBasePtr(), Mask, VL},
+ Store->getMemoryVT(), Store->getMemOperand());
+}
+
+SDValue
+RISCVTargetLowering::lowerFixedLengthVectorSetccToRVV(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT InVT = Op.getOperand(0).getSimpleValueType();
+ MVT ContainerVT = getContainerForFixedLengthVector(InVT);
+
+ MVT VT = Op.getSimpleValueType();
+
+ SDValue Op1 =
+ convertToScalableVector(ContainerVT, Op.getOperand(0), DAG, Subtarget);
+ SDValue Op2 =
+ convertToScalableVector(ContainerVT, Op.getOperand(1), DAG, Subtarget);
+
+ SDLoc DL(Op);
+ SDValue VL =
+ DAG.getConstant(VT.getVectorNumElements(), DL, Subtarget.getXLenVT());
+
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+ SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+
+ SDValue Cmp = DAG.getNode(RISCVISD::SETCC_VL, DL, MaskVT, Op1, Op2,
+ Op.getOperand(2), Mask, VL);
+
+ return convertFromScalableVector(VT, Cmp, DAG, Subtarget);
+}
+
+SDValue RISCVTargetLowering::lowerFixedLengthVectorLogicOpToRVV(
+ SDValue Op, SelectionDAG &DAG, unsigned MaskOpc, unsigned VecOpc) const {
+ MVT VT = Op.getSimpleValueType();
+
+ if (VT.getVectorElementType() == MVT::i1)
+ return lowerToScalableOp(Op, DAG, MaskOpc, /*HasMask*/ false);
+
+ return lowerToScalableOp(Op, DAG, VecOpc, /*HasMask*/ true);
+}
+
+SDValue
+RISCVTargetLowering::lowerFixedLengthVectorShiftToRVV(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc;
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::SHL: Opc = RISCVISD::SHL_VL; break;
+ case ISD::SRA: Opc = RISCVISD::SRA_VL; break;
+ case ISD::SRL: Opc = RISCVISD::SRL_VL; break;
+ }
+
+ return lowerToScalableOp(Op, DAG, Opc);
+}
+
+// Lower vector ABS to smax(X, sub(0, X)).
+SDValue RISCVTargetLowering::lowerABS(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue X = Op.getOperand(0);
+
+ assert(VT.isFixedLengthVector() && "Unexpected type");
+
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+ X = convertToScalableVector(ContainerVT, X, DAG, Subtarget);
+
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+ SDValue SplatZero =
+ DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,
+ DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+ SDValue NegX =
+ DAG.getNode(RISCVISD::SUB_VL, DL, ContainerVT, SplatZero, X, Mask, VL);
+ SDValue Max =
+ DAG.getNode(RISCVISD::SMAX_VL, DL, ContainerVT, X, NegX, Mask, VL);
+
+ return convertFromScalableVector(VT, Max, DAG, Subtarget);
+}
+
+SDValue RISCVTargetLowering::lowerFixedLengthVectorFCOPYSIGNToRVV(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue Mag = Op.getOperand(0);
+ SDValue Sign = Op.getOperand(1);
+ assert(Mag.getValueType() == Sign.getValueType() &&
+ "Can only handle COPYSIGN with matching types.");
+
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+ Mag = convertToScalableVector(ContainerVT, Mag, DAG, Subtarget);
+ Sign = convertToScalableVector(ContainerVT, Sign, DAG, Subtarget);
+
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+ SDValue CopySign =
+ DAG.getNode(RISCVISD::FCOPYSIGN_VL, DL, ContainerVT, Mag, Sign, Mask, VL);
+
+ return convertFromScalableVector(VT, CopySign, DAG, Subtarget);
+}
+
+SDValue RISCVTargetLowering::lowerFixedLengthVectorSelectToRVV(
+ SDValue Op, SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+
+ MVT I1ContainerVT =
+ MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+
+ SDValue CC =
+ convertToScalableVector(I1ContainerVT, Op.getOperand(0), DAG, Subtarget);
+ SDValue Op1 =
+ convertToScalableVector(ContainerVT, Op.getOperand(1), DAG, Subtarget);
+ SDValue Op2 =
+ convertToScalableVector(ContainerVT, Op.getOperand(2), DAG, Subtarget);
+
+ SDLoc DL(Op);
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+ SDValue Select =
+ DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, CC, Op1, Op2, VL);
+
+ return convertFromScalableVector(VT, Select, DAG, Subtarget);
+}
+
+SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op, SelectionDAG &DAG,
+ unsigned NewOpc,
+ bool HasMask) const {
+ MVT VT = Op.getSimpleValueType();
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+
+ // Create list of operands by converting existing ones to scalable types.
+ SmallVector<SDValue, 6> Ops;
+ for (const SDValue &V : Op->op_values()) {
+ assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
+
+ // Pass through non-vector operands.
+ if (!V.getValueType().isVector()) {
+ Ops.push_back(V);
+ continue;
+ }
+
+ // "cast" fixed length vector to a scalable vector.
+ assert(useRVVForFixedLengthVectorVT(V.getSimpleValueType()) &&
+ "Only fixed length vectors are supported!");
+ Ops.push_back(convertToScalableVector(ContainerVT, V, DAG, Subtarget));
+ }
+
+ SDLoc DL(Op);
+ SDValue Mask, VL;
+ std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+ if (HasMask)
+ Ops.push_back(Mask);
+ Ops.push_back(VL);
+
+ SDValue ScalableRes = DAG.getNode(NewOpc, DL, ContainerVT, Ops);
+ return convertFromScalableVector(VT, ScalableRes, DAG, Subtarget);
+}
+
+// Lower a VP_* ISD node to the corresponding RISCVISD::*_VL node:
+// * Operands of each node are assumed to be in the same order.
+// * The EVL operand is promoted from i32 to i64 on RV64.
+// * Fixed-length vectors are converted to their scalable-vector container
+// types.
+SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG,
+ unsigned RISCVISDOpc) const {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SmallVector<SDValue, 4> Ops;
+
+ for (const auto &OpIdx : enumerate(Op->ops())) {
+ SDValue V = OpIdx.value();
+ assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
+ // Pass through operands which aren't fixed-length vectors.
+ if (!V.getValueType().isFixedLengthVector()) {
+ Ops.push_back(V);
+ continue;
+ }
+ // "cast" fixed length vector to a scalable vector.
+ MVT OpVT = V.getSimpleValueType();
+ MVT ContainerVT = getContainerForFixedLengthVector(OpVT);
+ assert(useRVVForFixedLengthVectorVT(OpVT) &&
+ "Only fixed length vectors are supported!");
+ Ops.push_back(convertToScalableVector(ContainerVT, V, DAG, Subtarget));
+ }
+
+ if (!VT.isFixedLengthVector())
+ return DAG.getNode(RISCVISDOpc, DL, VT, Ops);
+
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+
+ SDValue VPOp = DAG.getNode(RISCVISDOpc, DL, ContainerVT, Ops);
+
+ return convertFromScalableVector(VT, VPOp, DAG, Subtarget);
+}
+
+// Custom lower MGATHER to a legalized form for RVV. It will then be matched to
+// an RVV indexed load. The RVV indexed load instructions only support the
+// "unsigned unscaled" addressing mode; indices are implicitly zero-extended or
+// truncated to XLEN and are treated as byte offsets. Any signed or scaled
+// indexing is extended to the XLEN value type and scaled accordingly.
+SDValue RISCVTargetLowering::lowerMGATHER(SDValue Op, SelectionDAG &DAG) const {
+ auto *MGN = cast<MaskedGatherSDNode>(Op.getNode());
+ SDLoc DL(Op);
+
+ SDValue Index = MGN->getIndex();
+ SDValue Mask = MGN->getMask();
+ SDValue PassThru = MGN->getPassThru();
+
+ MVT VT = Op.getSimpleValueType();
+ MVT IndexVT = Index.getSimpleValueType();
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
+ "Unexpected VTs!");
+ assert(MGN->getBasePtr().getSimpleValueType() == XLenVT &&
+ "Unexpected pointer type");
+ // Targets have to explicitly opt-in for extending vector loads.
+ assert(MGN->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Unexpected extending MGATHER");
+
+ // If the mask is known to be all ones, optimize to an unmasked intrinsic;
+ // the selection of the masked intrinsics doesn't do this for us.
+ bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+
+ SDValue VL;
+ MVT ContainerVT = VT;
+ if (VT.isFixedLengthVector()) {
+ // We need to use the larger of the result and index type to determine the
+ // scalable type to use so we don't increase LMUL for any operand/result.
+ if (VT.bitsGE(IndexVT)) {
+ ContainerVT = getContainerForFixedLengthVector(VT);
+ IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(),
+ ContainerVT.getVectorElementCount());
+ } else {
+ IndexVT = getContainerForFixedLengthVector(IndexVT);
+ ContainerVT = MVT::getVectorVT(ContainerVT.getVectorElementType(),
+ IndexVT.getVectorElementCount());
+ }
+
+ Index = convertToScalableVector(IndexVT, Index, DAG, Subtarget);
+
+ if (!IsUnmasked) {
+ MVT MaskVT =
+ MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+ Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+ PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget);
+ }
+
+ VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+ } else
+ VL = DAG.getRegister(RISCV::X0, XLenVT);
+
+ unsigned IntID =
+ IsUnmasked ? Intrinsic::riscv_vluxei : Intrinsic::riscv_vluxei_mask;
+ SmallVector<SDValue, 8> Ops{MGN->getChain(),
+ DAG.getTargetConstant(IntID, DL, XLenVT)};
+ if (!IsUnmasked)
+ Ops.push_back(PassThru);
+ Ops.push_back(MGN->getBasePtr());
+ Ops.push_back(Index);
+ if (!IsUnmasked)
+ Ops.push_back(Mask);
+ Ops.push_back(VL);
+
+ SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+ SDValue Result =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
+ MGN->getMemoryVT(), MGN->getMemOperand());
+ SDValue Chain = Result.getValue(1);
+
+ if (VT.isFixedLengthVector())
+ Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
+
+ return DAG.getMergeValues({Result, Chain}, DL);
+}
+
+// Custom lower MSCATTER to a legalized form for RVV. It will then be matched to
+// an RVV indexed store. The RVV indexed store instructions only support the
+// "unsigned unscaled" addressing mode; indices are implicitly zero-extended or
+// truncated to XLEN and are treated as byte offsets. Any signed or scaled
+// indexing is extended to the XLEN value type and scaled accordingly.
+SDValue RISCVTargetLowering::lowerMSCATTER(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto *MSN = cast<MaskedScatterSDNode>(Op.getNode());
+ SDLoc DL(Op);
+ SDValue Index = MSN->getIndex();
+ SDValue Mask = MSN->getMask();
+ SDValue Val = MSN->getValue();
+
+ MVT VT = Val.getSimpleValueType();
+ MVT IndexVT = Index.getSimpleValueType();
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
+ "Unexpected VTs!");
+ assert(MSN->getBasePtr().getSimpleValueType() == XLenVT &&
+ "Unexpected pointer type");
+ // Targets have to explicitly opt-in for extending vector loads and
+ // truncating vector stores.
+ assert(!MSN->isTruncatingStore() && "Unexpected extending MSCATTER");
+
+ // If the mask is known to be all ones, optimize to an unmasked intrinsic;
+ // the selection of the masked intrinsics doesn't do this for us.
+ bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+
+ SDValue VL;
+ if (VT.isFixedLengthVector()) {
+ // We need to use the larger of the value and index type to determine the
+ // scalable type to use so we don't increase LMUL for any operand/result.
+ MVT ContainerVT;
+ if (VT.bitsGE(IndexVT)) {
+ ContainerVT = getContainerForFixedLengthVector(VT);
+ IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(),
+ ContainerVT.getVectorElementCount());
+ } else {
+ IndexVT = getContainerForFixedLengthVector(IndexVT);
+ ContainerVT = MVT::getVectorVT(VT.getVectorElementType(),
+ IndexVT.getVectorElementCount());
+ }
+
+ Index = convertToScalableVector(IndexVT, Index, DAG, Subtarget);
+ Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget);
+
+ if (!IsUnmasked) {
+ MVT MaskVT =
+ MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+ Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+ }
+
+ VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+ } else
+ VL = DAG.getRegister(RISCV::X0, XLenVT);
+
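+  // A sketch of the intrinsic operand lists assembled below (mirroring the
+  // code, not a full description of the vsoxei intrinsics):
+  //   unmasked: {chain, id, value, base, index, vl}
+  //   masked:   {chain, id, value, base, index, mask, vl}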
+ unsigned IntID =
+ IsUnmasked ? Intrinsic::riscv_vsoxei : Intrinsic::riscv_vsoxei_mask;
+ SmallVector<SDValue, 8> Ops{MSN->getChain(),
+ DAG.getTargetConstant(IntID, DL, XLenVT)};
+ Ops.push_back(Val);
+ Ops.push_back(MSN->getBasePtr());
+ Ops.push_back(Index);
+ if (!IsUnmasked)
+ Ops.push_back(Mask);
+ Ops.push_back(VL);
+
+ return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, MSN->getVTList(), Ops,
+ MSN->getMemoryVT(), MSN->getMemOperand());
+}
+
+SDValue RISCVTargetLowering::lowerGET_ROUNDING(SDValue Op,
+ SelectionDAG &DAG) const {
+ const MVT XLenVT = Subtarget.getXLenVT();
+ SDLoc DL(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue SysRegNo = DAG.getConstant(
+ RISCVSysReg::lookupSysRegByName("FRM")->Encoding, DL, XLenVT);
+ SDVTList VTs = DAG.getVTList(XLenVT, MVT::Other);
+ SDValue RM = DAG.getNode(RISCVISD::READ_CSR, DL, VTs, Chain, SysRegNo);
+
+ // Encoding used for rounding mode in RISCV differs from that used in
+  // FLT_ROUNDS. To convert it, the RISCV rounding mode is used as an index
+  // into a table, which consists of a sequence of 4-bit fields, each
+  // representing the corresponding FLT_ROUNDS mode.
+ static const int Table =
+ (int(RoundingMode::NearestTiesToEven) << 4 * RISCVFPRndMode::RNE) |
+ (int(RoundingMode::TowardZero) << 4 * RISCVFPRndMode::RTZ) |
+ (int(RoundingMode::TowardNegative) << 4 * RISCVFPRndMode::RDN) |
+ (int(RoundingMode::TowardPositive) << 4 * RISCVFPRndMode::RUP) |
+ (int(RoundingMode::NearestTiesToAway) << 4 * RISCVFPRndMode::RMM);
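+  // A worked example of the lookup performed below: if FRM holds
+  // RISCVFPRndMode::RTZ, then Shift = 4 * RTZ and
+  // (Table >> Shift) & 7 == int(RoundingMode::TowardZero).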
+
+ SDValue Shift =
+ DAG.getNode(ISD::SHL, DL, XLenVT, RM, DAG.getConstant(2, DL, XLenVT));
+ SDValue Shifted = DAG.getNode(ISD::SRL, DL, XLenVT,
+ DAG.getConstant(Table, DL, XLenVT), Shift);
+ SDValue Masked = DAG.getNode(ISD::AND, DL, XLenVT, Shifted,
+ DAG.getConstant(7, DL, XLenVT));
+
+ return DAG.getMergeValues({Masked, Chain}, DL);
+}
+
+SDValue RISCVTargetLowering::lowerSET_ROUNDING(SDValue Op,
+ SelectionDAG &DAG) const {
+ const MVT XLenVT = Subtarget.getXLenVT();
+ SDLoc DL(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue RMValue = Op->getOperand(1);
+ SDValue SysRegNo = DAG.getConstant(
+ RISCVSysReg::lookupSysRegByName("FRM")->Encoding, DL, XLenVT);
+
+ // Encoding used for rounding mode in RISCV differs from that used in
+  // FLT_ROUNDS. To convert it, the C rounding mode is used as an index into
+  // a table, which consists of a sequence of 4-bit fields, each representing
+  // the corresponding RISCV mode.
+ static const unsigned Table =
+ (RISCVFPRndMode::RNE << 4 * int(RoundingMode::NearestTiesToEven)) |
+ (RISCVFPRndMode::RTZ << 4 * int(RoundingMode::TowardZero)) |
+ (RISCVFPRndMode::RDN << 4 * int(RoundingMode::TowardNegative)) |
+ (RISCVFPRndMode::RUP << 4 * int(RoundingMode::TowardPositive)) |
+ (RISCVFPRndMode::RMM << 4 * int(RoundingMode::NearestTiesToAway));
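+  // A worked example, the inverse of lowerGET_ROUNDING above: an incoming
+  // RMValue of int(RoundingMode::TowardPositive) selects the 4-bit field
+  // holding RISCVFPRndMode::RUP, which is then written to FRM.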
+
+ SDValue Shift = DAG.getNode(ISD::SHL, DL, XLenVT, RMValue,
+ DAG.getConstant(2, DL, XLenVT));
+ SDValue Shifted = DAG.getNode(ISD::SRL, DL, XLenVT,
+ DAG.getConstant(Table, DL, XLenVT), Shift);
+ RMValue = DAG.getNode(ISD::AND, DL, XLenVT, Shifted,
+ DAG.getConstant(0x7, DL, XLenVT));
+ return DAG.getNode(RISCVISD::WRITE_CSR, DL, MVT::Other, Chain, SysRegNo,
+ RMValue);
}
// Returns the opcode of the target-specific SDNode that implements the 32-bit
@@ -1599,18 +4832,18 @@
return RISCVISD::ROLW;
case ISD::ROTR:
return RISCVISD::RORW;
- case RISCVISD::GREVI:
- return RISCVISD::GREVIW;
- case RISCVISD::GORCI:
- return RISCVISD::GORCIW;
+ case RISCVISD::GREV:
+ return RISCVISD::GREVW;
+ case RISCVISD::GORC:
+ return RISCVISD::GORCW;
}
}
-// Converts the given 32-bit operation to a target-specific SelectionDAG node.
-// Because i32 isn't a legal type for RV64, these operations would otherwise
-// be promoted to i64, making it difficult to select the SLLW/DIVUW/.../*W
-// later one because the fact the operation was originally of type i32 is
-// lost.
+// Converts the given i8/i16/i32 operation to a target-specific SelectionDAG
+// node. Because i8/i16/i32 isn't a legal type for RV64, these operations would
+// otherwise be promoted to i64, making it difficult to select the
+// SLLW/DIVUW/.../*W later on because the fact that the operation was
+// originally of type i8/i16/i32 is lost.
static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG,
unsigned ExtOpc = ISD::ANY_EXTEND) {
SDLoc DL(N);
@@ -1645,20 +4878,30 @@
case ISD::STRICT_FP_TO_UINT:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
- bool IsStrict = N->isStrictFPOpcode();
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
+ bool IsStrict = N->isStrictFPOpcode();
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
+ N->getOpcode() == ISD::STRICT_FP_TO_SINT;
SDValue Op0 = IsStrict ? N->getOperand(1) : N->getOperand(0);
+ if (getTypeAction(*DAG.getContext(), Op0.getValueType()) !=
+ TargetLowering::TypeSoftenFloat) {
+ // FIXME: Support strict FP.
+ if (IsStrict)
+ return;
+ if (!isTypeLegal(Op0.getValueType()))
+ return;
+ unsigned Opc = IsSigned ? RISCVISD::FCVT_W_RV64 : RISCVISD::FCVT_WU_RV64;
+ SDValue Res = DAG.getNode(Opc, DL, MVT::i64, Op0);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ return;
+ }
// If the FP type needs to be softened, emit a library call using the 'si'
// version. If we left it to default legalization we'd end up with 'di'. If
// the FP type doesn't need to be softened just let generic type
// legalization promote the result type.
- if (getTypeAction(*DAG.getContext(), Op0.getValueType()) !=
- TargetLowering::TypeSoftenFloat)
- return;
RTLIB::Libcall LC;
- if (N->getOpcode() == ISD::FP_TO_SINT ||
- N->getOpcode() == ISD::STRICT_FP_TO_SINT)
+ if (IsSigned)
LC = RTLIB::getFPTOSINT(Op0.getValueType(), N->getValueType(0));
else
LC = RTLIB::getFPTOUINT(Op0.getValueType(), N->getValueType(0));
@@ -1687,9 +4930,47 @@
Results.push_back(RCW.getValue(2));
break;
}
+ case ISD::MUL: {
+ unsigned Size = N->getSimpleValueType(0).getSizeInBits();
+ unsigned XLen = Subtarget.getXLen();
+ // This multiply needs to be expanded, try to use MULHSU+MUL if possible.
+ if (Size > XLen) {
+ assert(Size == (XLen * 2) && "Unexpected custom legalisation");
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ APInt HighMask = APInt::getHighBitsSet(Size, XLen);
+
+ bool LHSIsU = DAG.MaskedValueIsZero(LHS, HighMask);
+ bool RHSIsU = DAG.MaskedValueIsZero(RHS, HighMask);
+ // We need exactly one side to be unsigned.
+ if (LHSIsU == RHSIsU)
+ return;
+
+ auto MakeMULPair = [&](SDValue S, SDValue U) {
+ MVT XLenVT = Subtarget.getXLenVT();
+ S = DAG.getNode(ISD::TRUNCATE, DL, XLenVT, S);
+ U = DAG.getNode(ISD::TRUNCATE, DL, XLenVT, U);
+ SDValue Lo = DAG.getNode(ISD::MUL, DL, XLenVT, S, U);
+ SDValue Hi = DAG.getNode(RISCVISD::MULHSU, DL, XLenVT, S, U);
+ return DAG.getNode(ISD::BUILD_PAIR, DL, N->getValueType(0), Lo, Hi);
+ };
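+      // For example (a sketch, assuming RV64): an i128 multiply where one
+      // operand sign-extends from i64 and the other zero-extends from i64
+      // becomes Lo = MUL and Hi = MULHSU of the truncated halves, combined
+      // with BUILD_PAIR.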
+
+ bool LHSIsS = DAG.ComputeNumSignBits(LHS) > XLen;
+ bool RHSIsS = DAG.ComputeNumSignBits(RHS) > XLen;
+
+ // The other operand should be signed, but still prefer MULH when
+ // possible.
+ if (RHSIsU && LHSIsS && !RHSIsS)
+ Results.push_back(MakeMULPair(LHS, RHS));
+ else if (LHSIsU && RHSIsS && !LHSIsS)
+ Results.push_back(MakeMULPair(RHS, LHS));
+
+ return;
+ }
+ LLVM_FALLTHROUGH;
+ }
case ISD::ADD:
case ISD::SUB:
- case ISD::MUL:
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
if (N->getOperand(1).getOpcode() == ISD::Constant)
@@ -1711,6 +4992,22 @@
"Unexpected custom legalisation");
Results.push_back(customLegalizeToWOp(N, DAG));
break;
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+
+ SDValue NewOp0 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+ bool IsCTZ =
+ N->getOpcode() == ISD::CTTZ || N->getOpcode() == ISD::CTTZ_ZERO_UNDEF;
+ unsigned Opc = IsCTZ ? RISCVISD::CTZW : RISCVISD::CLZW;
+ SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp0);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ return;
+ }
case ISD::SDIV:
case ISD::UDIV:
case ISD::UREM: {
@@ -1718,8 +5015,10 @@
assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
Subtarget.is64Bit() && Subtarget.hasStdExtM() &&
"Unexpected custom legalisation");
- if (N->getOperand(0).getOpcode() == ISD::Constant ||
- N->getOperand(1).getOpcode() == ISD::Constant)
+ // Don't promote division/remainder by constant since we should expand those
+    // to multiply by a magic constant.
+    // FIXME: What if the expansion is disabled for minsize?
+ if (N->getOperand(1).getOpcode() == ISD::Constant)
return;
// If the input is i32, use ANY_EXTEND since the W instructions don't read
@@ -1733,41 +5032,108 @@
Results.push_back(customLegalizeToWOp(N, DAG, ExtOpc));
break;
}
- case ISD::BITCAST: {
- assert(((N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
- Subtarget.hasStdExtF()) ||
- (N->getValueType(0) == MVT::i16 && Subtarget.hasStdExtZfh())) &&
+ case ISD::UADDO:
+ case ISD::USUBO: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
+ bool IsAdd = N->getOpcode() == ISD::UADDO;
+ // Create an ADDW or SUBW.
+ SDValue LHS = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+ SDValue RHS = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue Res =
+ DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, DL, MVT::i64, LHS, RHS);
+ Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Res,
+ DAG.getValueType(MVT::i32));
+
+ // Sign extend the LHS and perform an unsigned compare with the ADDW result.
+ // Since the inputs are sign extended from i32, this is equivalent to
+ // comparing the lower 32 bits.
+ LHS = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, N->getOperand(0));
+ SDValue Overflow = DAG.getSetCC(DL, N->getValueType(1), Res, LHS,
+ IsAdd ? ISD::SETULT : ISD::SETUGT);
+
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ Results.push_back(Overflow);
+ return;
+ }
+ case ISD::UADDSAT:
+ case ISD::USUBSAT: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ if (Subtarget.hasStdExtZbb()) {
+ // With Zbb we can sign extend and let LegalizeDAG use minu/maxu. Using
+      // sign extend allows overflow of the lower 32 bits to be detected in
+      // the promoted type.
+ SDValue LHS =
+ DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, N->getOperand(0));
+ SDValue RHS =
+ DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue Res = DAG.getNode(N->getOpcode(), DL, MVT::i64, LHS, RHS);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ return;
+ }
+
+ // Without Zbb, expand to UADDO/USUBO+select which will trigger our custom
+ // promotion for UADDO/USUBO.
+ Results.push_back(expandAddSubSat(N, DAG));
+ return;
+ }
+ case ISD::BITCAST: {
+ EVT VT = N->getValueType(0);
+ assert(VT.isInteger() && !VT.isVector() && "Unexpected VT!");
SDValue Op0 = N->getOperand(0);
- if (N->getValueType(0) == MVT::i16 && Subtarget.hasStdExtZfh()) {
- if (Op0.getValueType() != MVT::f16)
- return;
- SDValue FPConv =
- DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, Subtarget.getXLenVT(), Op0);
+ EVT Op0VT = Op0.getValueType();
+ MVT XLenVT = Subtarget.getXLenVT();
+ if (VT == MVT::i16 && Op0VT == MVT::f16 && Subtarget.hasStdExtZfh()) {
+ SDValue FPConv = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, XLenVT, Op0);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FPConv));
- } else if (N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ } else if (VT == MVT::i32 && Op0VT == MVT::f32 && Subtarget.is64Bit() &&
Subtarget.hasStdExtF()) {
- if (Op0.getValueType() != MVT::f32)
- return;
SDValue FPConv =
DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Op0);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPConv));
+ } else if (!VT.isVector() && Op0VT.isFixedLengthVector() &&
+ isTypeLegal(Op0VT)) {
+ // Custom-legalize bitcasts from fixed-length vector types to illegal
+ // scalar types in order to improve codegen. Bitcast the vector to a
+ // one-element vector type whose element type is the same as the result
+ // type, and extract the first element.
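+      // For example (a sketch): (i32 (bitcast (v4i8 X))) on RV64 becomes
+      // (extractelt (v1i32 (bitcast X)), 0).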
+ LLVMContext &Context = *DAG.getContext();
+ SDValue BVec = DAG.getBitcast(EVT::getVectorVT(Context, VT, 1), Op0);
+ Results.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, BVec,
+ DAG.getConstant(0, DL, XLenVT)));
}
break;
}
- case RISCVISD::GREVI:
- case RISCVISD::GORCI: {
+ case RISCVISD::GREV:
+ case RISCVISD::GORC: {
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
+ assert(isa<ConstantSDNode>(N->getOperand(1)) && "Expected constant");
// This is similar to customLegalizeToWOp, except that we pass the second
// operand (a TargetConstant) straight through: it is already of type
// XLenVT.
- SDLoc DL(N);
RISCVISD::NodeType WOpcode = getRISCVWOpcode(N->getOpcode());
SDValue NewOp0 =
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
- SDValue NewRes =
- DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, N->getOperand(1));
+ SDValue NewOp1 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1);
+ // ReplaceNodeResults requires we maintain the same type for the return
+ // value.
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes));
+ break;
+ }
+ case RISCVISD::SHFL: {
+ // There is no SHFLIW instruction, but we can just promote the operation.
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ assert(isa<ConstantSDNode>(N->getOperand(1)) && "Expected constant");
+ SDValue NewOp0 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+ SDValue NewOp1 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewRes = DAG.getNode(RISCVISD::SHFL, DL, MVT::i64, NewOp0, NewOp1);
// ReplaceNodeResults requires we maintain the same type for the return
// value.
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes));
@@ -1775,17 +5141,22 @@
}
case ISD::BSWAP:
case ISD::BITREVERSE: {
- assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ MVT VT = N->getSimpleValueType(0);
+ MVT XLenVT = Subtarget.getXLenVT();
+ assert((VT == MVT::i8 || VT == MVT::i16 ||
+ (VT == MVT::i32 && Subtarget.is64Bit())) &&
Subtarget.hasStdExtZbp() && "Unexpected custom legalisation");
- SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
- N->getOperand(0));
- unsigned Imm = N->getOpcode() == ISD::BITREVERSE ? 31 : 24;
- SDValue GREVIW = DAG.getNode(RISCVISD::GREVIW, DL, MVT::i64, NewOp0,
- DAG.getTargetConstant(Imm, DL,
- Subtarget.getXLenVT()));
+ SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, XLenVT, N->getOperand(0));
+ unsigned Imm = VT.getSizeInBits() - 1;
+ // If this is BSWAP rather than BITREVERSE, clear the lower 3 bits.
+ if (N->getOpcode() == ISD::BSWAP)
+ Imm &= ~0x7U;
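+    // For example: i16 BSWAP gives Imm = 15 & ~7 = 8 (swap the two bytes),
+    // while i32 BITREVERSE on RV64 gives Imm = 31 and uses GREVW.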
+ unsigned Opc = Subtarget.is64Bit() ? RISCVISD::GREVW : RISCVISD::GREV;
+ SDValue GREVI =
+ DAG.getNode(Opc, DL, XLenVT, NewOp0, DAG.getConstant(Imm, DL, XLenVT));
// ReplaceNodeResults requires we maintain the same type for the return
// value.
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, GREVIW));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, GREVI));
break;
}
case ISD::FSHL:
@@ -1815,33 +5186,53 @@
// transferred to the destination register. We issue two of these from the
// upper- and lower- halves of the SEW-bit vector element, slid down to the
// first element.
- SDLoc DL(N);
SDValue Vec = N->getOperand(0);
SDValue Idx = N->getOperand(1);
- EVT VecVT = Vec.getValueType();
+
+    // The vector type hasn't been legalized yet, so we can't issue
+    // target-specific nodes if it needs legalization.
+    // FIXME: We could legalize this manually if it turns out to be important.
+ if (!isTypeLegal(Vec.getValueType()))
+ return;
+
+ MVT VecVT = Vec.getSimpleValueType();
+
assert(!Subtarget.is64Bit() && N->getValueType(0) == MVT::i64 &&
VecVT.getVectorElementType() == MVT::i64 &&
"Unexpected EXTRACT_VECTOR_ELT legalization");
- SDValue Slidedown = Vec;
- // Unless the index is known to be 0, we must slide the vector down to get
- // the desired element into index 0.
- if (!isNullConstant(Idx))
- Slidedown = DAG.getNode(RISCVISD::VSLIDEDOWN, DL, VecVT,
- DAG.getUNDEF(VecVT), Vec, Idx);
+ // If this is a fixed vector, we need to convert it to a scalable vector.
+ MVT ContainerVT = VecVT;
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VecVT);
+ Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ }
MVT XLenVT = Subtarget.getXLenVT();
+
+ // Use a VL of 1 to avoid processing more elements than we need.
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VecVT.getVectorElementCount());
+ SDValue VL = DAG.getConstant(1, DL, XLenVT);
+ SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+
+ // Unless the index is known to be 0, we must slide the vector down to get
+ // the desired element into index 0.
+ if (!isNullConstant(Idx)) {
+ Vec = DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), Vec, Idx, Mask, VL);
+ }
+
// Extract the lower XLEN bits of the correct vector element.
- SDValue EltLo = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Slidedown, Idx);
+ SDValue EltLo = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Vec);
// To extract the upper XLEN bits of the vector element, shift the first
// element right by 32 bits and re-extract the lower XLEN bits.
- SDValue ThirtyTwoV =
- DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT,
- DAG.getConstant(32, DL, Subtarget.getXLenVT()));
- SDValue LShr32 = DAG.getNode(ISD::SRL, DL, VecVT, Slidedown, ThirtyTwoV);
+ SDValue ThirtyTwoV = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,
+ DAG.getConstant(32, DL, XLenVT), VL);
+ SDValue LShr32 = DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, Vec,
+ ThirtyTwoV, Mask, VL);
- SDValue EltHi = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, LShr32, Idx);
+ SDValue EltHi = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, LShr32);
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, EltLo, EltHi));
break;
@@ -1852,19 +5243,124 @@
default:
llvm_unreachable(
"Don't know how to custom type legalize this intrinsic!");
+ case Intrinsic::riscv_orc_b: {
+ // Lower to the GORCI encoding for orc.b with the operand extended.
+ SDValue NewOp =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ // If Zbp is enabled, use GORCIW which will sign extend the result.
+ unsigned Opc =
+ Subtarget.hasStdExtZbp() ? RISCVISD::GORCW : RISCVISD::GORC;
+ SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp,
+ DAG.getConstant(7, DL, MVT::i64));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ return;
+ }
+ case Intrinsic::riscv_grev:
+ case Intrinsic::riscv_gorc: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ SDValue NewOp1 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewOp2 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
+ unsigned Opc =
+ IntNo == Intrinsic::riscv_grev ? RISCVISD::GREVW : RISCVISD::GORCW;
+ SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp1, NewOp2);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ break;
+ }
+ case Intrinsic::riscv_shfl:
+ case Intrinsic::riscv_unshfl: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ SDValue NewOp1 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewOp2 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
+ unsigned Opc =
+ IntNo == Intrinsic::riscv_shfl ? RISCVISD::SHFLW : RISCVISD::UNSHFLW;
+ if (isa<ConstantSDNode>(N->getOperand(2))) {
+ NewOp2 = DAG.getNode(ISD::AND, DL, MVT::i64, NewOp2,
+ DAG.getConstant(0xf, DL, MVT::i64));
+ Opc =
+ IntNo == Intrinsic::riscv_shfl ? RISCVISD::SHFL : RISCVISD::UNSHFL;
+ }
+ SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp1, NewOp2);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ break;
+ }
+ case Intrinsic::riscv_bcompress:
+ case Intrinsic::riscv_bdecompress: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ SDValue NewOp1 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewOp2 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
+ unsigned Opc = IntNo == Intrinsic::riscv_bcompress
+ ? RISCVISD::BCOMPRESSW
+ : RISCVISD::BDECOMPRESSW;
+ SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp1, NewOp2);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ break;
+ }
case Intrinsic::riscv_vmv_x_s: {
EVT VT = N->getValueType(0);
- assert((VT == MVT::i8 || VT == MVT::i16 ||
- (Subtarget.is64Bit() && VT == MVT::i32)) &&
- "Unexpected custom legalisation!");
- SDValue Extract = DAG.getNode(RISCVISD::VMV_X_S, DL,
- Subtarget.getXLenVT(), N->getOperand(1));
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Extract));
+ MVT XLenVT = Subtarget.getXLenVT();
+ if (VT.bitsLT(XLenVT)) {
+ // Simple case just extract using vmv.x.s and truncate.
+ SDValue Extract = DAG.getNode(RISCVISD::VMV_X_S, DL,
+ Subtarget.getXLenVT(), N->getOperand(1));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Extract));
+ return;
+ }
+
+ assert(VT == MVT::i64 && !Subtarget.is64Bit() &&
+ "Unexpected custom legalization");
+
+ // We need to do the move in two steps.
+ SDValue Vec = N->getOperand(1);
+ MVT VecVT = Vec.getSimpleValueType();
+
+ // First extract the lower XLEN bits of the element.
+ SDValue EltLo = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Vec);
+
+ // To extract the upper XLEN bits of the vector element, shift the first
+ // element right by 32 bits and re-extract the lower XLEN bits.
+ SDValue VL = DAG.getConstant(1, DL, XLenVT);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VecVT.getVectorElementCount());
+ SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+ SDValue ThirtyTwoV = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT,
+ DAG.getConstant(32, DL, XLenVT), VL);
+ SDValue LShr32 =
+ DAG.getNode(RISCVISD::SRL_VL, DL, VecVT, Vec, ThirtyTwoV, Mask, VL);
+ SDValue EltHi = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, LShr32);
+
+ Results.push_back(
+ DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, EltLo, EltHi));
break;
}
}
break;
}
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMIN:
+ if (SDValue V = lowerVECREDUCE(SDValue(N, 0), DAG))
+ Results.push_back(V);
+ break;
+ case ISD::FLT_ROUNDS_: {
+ SDVTList VTs = DAG.getVTList(Subtarget.getXLenVT(), MVT::Other);
+ SDValue Res = DAG.getNode(ISD::FLT_ROUNDS_, DL, VTs, N->getOperand(0));
+ Results.push_back(Res.getValue(0));
+ Results.push_back(Res.getValue(1));
+ break;
+ }
}
}
@@ -1882,19 +5378,21 @@
}
};
-// Matches any of the following bit-manipulation patterns:
-// (and (shl x, 1), (0x55555555 << 1))
-// (and (srl x, 1), 0x55555555)
-// (shl (and x, 0x55555555), 1)
-// (srl (and x, (0x55555555 << 1)), 1)
-// where the shift amount and mask may vary thus:
-// [1] = 0x55555555 / 0xAAAAAAAA
-// [2] = 0x33333333 / 0xCCCCCCCC
-// [4] = 0x0F0F0F0F / 0xF0F0F0F0
-// [8] = 0x00FF00FF / 0xFF00FF00
-// [16] = 0x0000FFFF / 0xFFFFFFFF
-// [32] = 0x00000000FFFFFFFF / 0xFFFFFFFF00000000 (for RV64)
-static Optional<RISCVBitmanipPat> matchRISCVBitmanipPat(SDValue Op) {
+// Matches patterns of the form
+// (and (shl x, C2), (C1 << C2))
+// (and (srl x, C2), C1)
+// (shl (and x, C1), C2)
+// (srl (and x, (C1 << C2)), C2)
+// Where C2 is a power of 2 and C1 has at least that many leading zeroes.
+// The expected masks for each shift amount are specified in BitmanipMasks,
+// where BitmanipMasks[log2(C2)] specifies the expected C1 value.
+// The max allowed shift amount is either XLen/2 or XLen/4, determined by
+// whether BitmanipMasks contains 6 or 5 entries, assuming that the maximum
+// possible XLen is 64.
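+// For example (a sketch, using the GREVI masks): (and (srl x, 4), 0x0F0F0F0F)
+// on a 32-bit value matches with ShAmt = 4 and IsSHL = false.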
+static Optional<RISCVBitmanipPat>
+matchRISCVBitmanipPat(SDValue Op, ArrayRef<uint64_t> BitmanipMasks) {
+ assert((BitmanipMasks.size() == 5 || BitmanipMasks.size() == 6) &&
+ "Unexpected number of masks");
Optional<uint64_t> Mask;
// Optionally consume a mask around the shift operation.
if (Op.getOpcode() == ISD::AND && isa<ConstantSDNode>(Op.getOperand(1))) {
@@ -1907,26 +5405,17 @@
if (!isa<ConstantSDNode>(Op.getOperand(1)))
return None;
- auto ShAmt = Op.getConstantOperandVal(1);
-
- if (!isPowerOf2_64(ShAmt))
- return None;
-
- // These are the unshifted masks which we use to match bit-manipulation
- // patterns. They may be shifted left in certain circumstances.
- static const uint64_t BitmanipMasks[] = {
- 0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
- 0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL,
- };
-
- unsigned MaskIdx = Log2_64(ShAmt);
- if (MaskIdx >= array_lengthof(BitmanipMasks))
- return None;
-
- auto Src = Op.getOperand(0);
+ uint64_t ShAmt = Op.getConstantOperandVal(1);
unsigned Width = Op.getValueType() == MVT::i64 ? 64 : 32;
- auto ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+ if (ShAmt >= Width || !isPowerOf2_64(ShAmt))
+ return None;
+ // If we don't have enough masks for 64 bit, then we must be trying to
+ // match SHFL so we're only allowed to shift 1/4 of the width.
+ if (BitmanipMasks.size() == 5 && ShAmt >= (Width / 2))
+ return None;
+
+ SDValue Src = Op.getOperand(0);
// The expected mask is shifted left when the AND is found around SHL
// patterns.
@@ -1953,6 +5442,9 @@
}
}
+ unsigned MaskIdx = Log2_32(ShAmt);
+ uint64_t ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+
if (SHLExpMask)
ExpMask <<= ShAmt;
@@ -1962,20 +5454,42 @@
return RISCVBitmanipPat{Src, (unsigned)ShAmt, IsSHL};
}
+// Matches any of the following bit-manipulation patterns:
+// (and (shl x, 1), (0x55555555 << 1))
+// (and (srl x, 1), 0x55555555)
+// (shl (and x, 0x55555555), 1)
+// (srl (and x, (0x55555555 << 1)), 1)
+// where the shift amount and mask may vary thus:
+// [1] = 0x55555555 / 0xAAAAAAAA
+// [2] = 0x33333333 / 0xCCCCCCCC
+// [4] = 0x0F0F0F0F / 0xF0F0F0F0
+// [8] = 0x00FF00FF / 0xFF00FF00
+// [16] = 0x0000FFFF / 0xFFFFFFFF
+// [32] = 0x00000000FFFFFFFF / 0xFFFFFFFF00000000 (for RV64)
+static Optional<RISCVBitmanipPat> matchGREVIPat(SDValue Op) {
+ // These are the unshifted masks which we use to match bit-manipulation
+ // patterns. They may be shifted left in certain circumstances.
+ static const uint64_t BitmanipMasks[] = {
+ 0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
+ 0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL};
+
+ return matchRISCVBitmanipPat(Op, BitmanipMasks);
+}
+
// Match the following pattern as a GREVI(W) operation
// (or (BITMANIP_SHL x), (BITMANIP_SRL x))
static SDValue combineORToGREV(SDValue Op, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
+  assert(Subtarget.hasStdExtZbp() && "Expected Zbp extension");
EVT VT = Op.getValueType();
if (VT == Subtarget.getXLenVT() || (Subtarget.is64Bit() && VT == MVT::i32)) {
- auto LHS = matchRISCVBitmanipPat(Op.getOperand(0));
- auto RHS = matchRISCVBitmanipPat(Op.getOperand(1));
+ auto LHS = matchGREVIPat(Op.getOperand(0));
+ auto RHS = matchGREVIPat(Op.getOperand(1));
if (LHS && RHS && LHS->formsPairWith(*RHS)) {
SDLoc DL(Op);
- return DAG.getNode(
- RISCVISD::GREVI, DL, VT, LHS->Op,
- DAG.getTargetConstant(LHS->ShAmt, DL, Subtarget.getXLenVT()));
+ return DAG.getNode(RISCVISD::GREV, DL, VT, LHS->Op,
+ DAG.getConstant(LHS->ShAmt, DL, VT));
}
}
return SDValue();
@@ -1992,6 +5506,7 @@
// 4. (or (rotl/rotr x, bitwidth/2), x)
static SDValue combineORToGORC(SDValue Op, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
+  assert(Subtarget.hasStdExtZbp() && "Expected Zbp extension");
EVT VT = Op.getValueType();
if (VT == Subtarget.getXLenVT() || (Subtarget.is64Bit() && VT == MVT::i32)) {
@@ -2000,9 +5515,10 @@
SDValue Op1 = Op.getOperand(1);
auto MatchOROfReverse = [&](SDValue Reverse, SDValue X) {
- if (Reverse.getOpcode() == RISCVISD::GREVI && Reverse.getOperand(0) == X &&
+ if (Reverse.getOpcode() == RISCVISD::GREV && Reverse.getOperand(0) == X &&
+ isa<ConstantSDNode>(Reverse.getOperand(1)) &&
isPowerOf2_32(Reverse.getConstantOperandVal(1)))
- return DAG.getNode(RISCVISD::GORCI, DL, VT, X, Reverse.getOperand(1));
+ return DAG.getNode(RISCVISD::GORC, DL, VT, X, Reverse.getOperand(1));
// We can also form GORCI from ROTL/ROTR by half the bitwidth.
if ((Reverse.getOpcode() == ISD::ROTL ||
Reverse.getOpcode() == ISD::ROTR) &&
@@ -2010,9 +5526,8 @@
isa<ConstantSDNode>(Reverse.getOperand(1))) {
uint64_t RotAmt = Reverse.getConstantOperandVal(1);
if (RotAmt == (VT.getSizeInBits() / 2))
- return DAG.getNode(
- RISCVISD::GORCI, DL, VT, X,
- DAG.getTargetConstant(RotAmt, DL, Subtarget.getXLenVT()));
+ return DAG.getNode(RISCVISD::GORC, DL, VT, X,
+ DAG.getConstant(RotAmt, DL, VT));
}
return SDValue();
};
@@ -2030,39 +5545,137 @@
return SDValue();
SDValue OrOp0 = Op0.getOperand(0);
SDValue OrOp1 = Op0.getOperand(1);
- auto LHS = matchRISCVBitmanipPat(OrOp0);
+ auto LHS = matchGREVIPat(OrOp0);
// OR is commutable so swap the operands and try again: x might have been
// on the left
if (!LHS) {
std::swap(OrOp0, OrOp1);
- LHS = matchRISCVBitmanipPat(OrOp0);
+ LHS = matchGREVIPat(OrOp0);
}
- auto RHS = matchRISCVBitmanipPat(Op1);
+ auto RHS = matchGREVIPat(Op1);
if (LHS && RHS && LHS->formsPairWith(*RHS) && LHS->Op == OrOp1) {
- return DAG.getNode(
- RISCVISD::GORCI, DL, VT, LHS->Op,
- DAG.getTargetConstant(LHS->ShAmt, DL, Subtarget.getXLenVT()));
+ return DAG.getNode(RISCVISD::GORC, DL, VT, LHS->Op,
+ DAG.getConstant(LHS->ShAmt, DL, VT));
}
}
return SDValue();
}
+// Matches any of the following bit-manipulation patterns:
+// (and (shl x, 1), (0x22222222 << 1))
+// (and (srl x, 1), 0x22222222)
+// (shl (and x, 0x22222222), 1)
+// (srl (and x, (0x22222222 << 1)), 1)
+// where the shift amount and mask may vary thus:
+// [1] = 0x22222222 / 0x44444444
+// [2] = 0x0C0C0C0C / 0x3C3C3C3C
+// [4] = 0x00F000F0 / 0x0F000F00
+// [8] = 0x0000FF00 / 0x00FF0000
+// [16] = 0x00000000FFFF0000 / 0x0000FFFF00000000 (for RV64)
+static Optional<RISCVBitmanipPat> matchSHFLPat(SDValue Op) {
+ // These are the unshifted masks which we use to match bit-manipulation
+ // patterns. They may be shifted left in certain circumstances.
+ static const uint64_t BitmanipMasks[] = {
+ 0x2222222222222222ULL, 0x0C0C0C0C0C0C0C0CULL, 0x00F000F000F000F0ULL,
+ 0x0000FF000000FF00ULL, 0x00000000FFFF0000ULL};
+
+ return matchRISCVBitmanipPat(Op, BitmanipMasks);
+}
+
+// Match (or (or (SHFL_SHL x), (SHFL_SHR x)), (SHFL_AND x))
+static SDValue combineORToSHFL(SDValue Op, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+  assert(Subtarget.hasStdExtZbp() && "Expected Zbp extension");
+ EVT VT = Op.getValueType();
+
+ if (VT != MVT::i32 && VT != Subtarget.getXLenVT())
+ return SDValue();
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Or is commutable so canonicalize the second OR to the LHS.
+ if (Op0.getOpcode() != ISD::OR)
+ std::swap(Op0, Op1);
+ if (Op0.getOpcode() != ISD::OR)
+ return SDValue();
+
+ // We found an inner OR, so our operands are the operands of the inner OR
+ // and the other operand of the outer OR.
+ SDValue A = Op0.getOperand(0);
+ SDValue B = Op0.getOperand(1);
+ SDValue C = Op1;
+
+ auto Match1 = matchSHFLPat(A);
+ auto Match2 = matchSHFLPat(B);
+
+ // If neither matched, we failed.
+ if (!Match1 && !Match2)
+ return SDValue();
+
+  // We had at least one match. If one failed, try the remaining C operand.
+ if (!Match1) {
+ std::swap(A, C);
+ Match1 = matchSHFLPat(A);
+ if (!Match1)
+ return SDValue();
+ } else if (!Match2) {
+ std::swap(B, C);
+ Match2 = matchSHFLPat(B);
+ if (!Match2)
+ return SDValue();
+ }
+ assert(Match1 && Match2);
+
+ // Make sure our matches pair up.
+ if (!Match1->formsPairWith(*Match2))
+ return SDValue();
+
+  // All that remains is to make sure C is an AND with the same input, which
+  // masks out the bits that are being shuffled.
+ if (C.getOpcode() != ISD::AND || !isa<ConstantSDNode>(C.getOperand(1)) ||
+ C.getOperand(0) != Match1->Op)
+ return SDValue();
+
+ uint64_t Mask = C.getConstantOperandVal(1);
+
+ static const uint64_t BitmanipMasks[] = {
+ 0x9999999999999999ULL, 0xC3C3C3C3C3C3C3C3ULL, 0xF00FF00FF00FF00FULL,
+ 0xFF0000FFFF0000FFULL, 0xFFFF00000000FFFFULL,
+ };
+
+ unsigned Width = Op.getValueType() == MVT::i64 ? 64 : 32;
+ unsigned MaskIdx = Log2_32(Match1->ShAmt);
+ uint64_t ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+
+ if (Mask != ExpMask)
+ return SDValue();
+
+ SDLoc DL(Op);
+ return DAG.getNode(RISCVISD::SHFL, DL, VT, Match1->Op,
+ DAG.getConstant(Match1->ShAmt, DL, VT));
+}
+
// Combine (GREVI (GREVI x, C2), C1) -> (GREVI x, C1^C2) when C1^C2 is
// non-zero, and to x when it is. Any repeated GREVI stage undoes itself.
// Combine (GORCI (GORCI x, C2), C1) -> (GORCI x, C1|C2). Repeated stage does
// not undo itself, but they are redundant.
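+// For example: (GREV (GREV x, 2), 3) folds to (GREV x, 1) since 2 ^ 3 == 1,
+// while (GORC (GORC x, 2), 3) folds to (GORC x, 3) since 2 | 3 == 3.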
static SDValue combineGREVI_GORCI(SDNode *N, SelectionDAG &DAG) {
- unsigned ShAmt1 = N->getConstantOperandVal(1);
SDValue Src = N->getOperand(0);
if (Src.getOpcode() != N->getOpcode())
return SDValue();
+ if (!isa<ConstantSDNode>(N->getOperand(1)) ||
+ !isa<ConstantSDNode>(Src.getOperand(1)))
+ return SDValue();
+
+ unsigned ShAmt1 = N->getConstantOperandVal(1);
unsigned ShAmt2 = Src.getConstantOperandVal(1);
Src = Src.getOperand(0);
unsigned CombinedShAmt;
- if (N->getOpcode() == RISCVISD::GORCI || N->getOpcode() == RISCVISD::GORCIW)
+ if (N->getOpcode() == RISCVISD::GORC || N->getOpcode() == RISCVISD::GORCW)
CombinedShAmt = ShAmt1 | ShAmt2;
else
CombinedShAmt = ShAmt1 ^ ShAmt2;
@@ -2071,9 +5684,186 @@
return Src;
SDLoc DL(N);
- return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), Src,
- DAG.getTargetConstant(CombinedShAmt, DL,
- N->getOperand(1).getValueType()));
+ return DAG.getNode(
+ N->getOpcode(), DL, N->getValueType(0), Src,
+ DAG.getConstant(CombinedShAmt, DL, N->getOperand(1).getValueType()));
+}
+
+// Combine a constant select operand into its use:
+//
+// (and (select_cc lhs, rhs, cc, -1, c), x)
+// -> (select_cc lhs, rhs, cc, x, (and, x, c)) [AllOnes=1]
+// (or (select_cc lhs, rhs, cc, 0, c), x)
+// -> (select_cc lhs, rhs, cc, x, (or, x, c)) [AllOnes=0]
+// (xor (select_cc lhs, rhs, cc, 0, c), x)
+// -> (select_cc lhs, rhs, cc, x, (xor, x, c)) [AllOnes=0]
+static SDValue combineSelectCCAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
+ SelectionDAG &DAG, bool AllOnes) {
+ EVT VT = N->getValueType(0);
+
+ if (Slct.getOpcode() != RISCVISD::SELECT_CC || !Slct.hasOneUse())
+ return SDValue();
+
+ auto isZeroOrAllOnes = [](SDValue N, bool AllOnes) {
+ return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
+ };
+
+ bool SwapSelectOps;
+ SDValue TrueVal = Slct.getOperand(3);
+ SDValue FalseVal = Slct.getOperand(4);
+ SDValue NonConstantVal;
+ if (isZeroOrAllOnes(TrueVal, AllOnes)) {
+ SwapSelectOps = false;
+ NonConstantVal = FalseVal;
+ } else if (isZeroOrAllOnes(FalseVal, AllOnes)) {
+ SwapSelectOps = true;
+ NonConstantVal = TrueVal;
+ } else
+ return SDValue();
+
+  // Slct is now known to be the desired identity constant when CC is true.
+ TrueVal = OtherOp;
+ FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, OtherOp, NonConstantVal);
+ // Unless SwapSelectOps says CC should be false.
+ if (SwapSelectOps)
+ std::swap(TrueVal, FalseVal);
+
+ return DAG.getNode(RISCVISD::SELECT_CC, SDLoc(N), VT,
+ {Slct.getOperand(0), Slct.getOperand(1),
+ Slct.getOperand(2), TrueVal, FalseVal});
+}
+
+// Attempt combineSelectCCAndUse on each operand of a commutative operator N.
+static SDValue combineSelectCCAndUseCommutative(SDNode *N, SelectionDAG &DAG,
+ bool AllOnes) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (SDValue Result = combineSelectCCAndUse(N, N0, N1, DAG, AllOnes))
+ return Result;
+ if (SDValue Result = combineSelectCCAndUse(N, N1, N0, DAG, AllOnes))
+ return Result;
+ return SDValue();
+}
+
+static SDValue performANDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+
+ // fold (and (select_cc lhs, rhs, cc, -1, y), x) ->
+ // (select lhs, rhs, cc, x, (and x, y))
+ return combineSelectCCAndUseCommutative(N, DAG, true);
+}
+
+static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+ if (Subtarget.hasStdExtZbp()) {
+ if (auto GREV = combineORToGREV(SDValue(N, 0), DAG, Subtarget))
+ return GREV;
+ if (auto GORC = combineORToGORC(SDValue(N, 0), DAG, Subtarget))
+ return GORC;
+ if (auto SHFL = combineORToSHFL(SDValue(N, 0), DAG, Subtarget))
+ return SHFL;
+ }
+
+ // fold (or (select_cc lhs, rhs, cc, 0, y), x) ->
+ // (select lhs, rhs, cc, x, (or x, y))
+ return combineSelectCCAndUseCommutative(N, DAG, false);
+}
+
+static SDValue performXORCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+
+ // fold (xor (select_cc lhs, rhs, cc, 0, y), x) ->
+ // (select lhs, rhs, cc, x, (xor x, y))
+ return combineSelectCCAndUseCommutative(N, DAG, false);
+}
+
+// Attempt to turn ANY_EXTEND into SIGN_EXTEND if the input to the ANY_EXTEND
+// has users that require SIGN_EXTEND and the SIGN_EXTEND can be done for free
+// by an instruction like ADDW/SUBW/MULW. Without this the ANY_EXTEND would be
+// removed during type legalization leaving an ADD/SUB/MUL use that won't use
+// ADDW/SUBW/MULW.
+static SDValue performANY_EXTENDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
+ if (!Subtarget.is64Bit())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+
+ SDValue Src = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i64 || Src.getValueType() != MVT::i32)
+ return SDValue();
+
+ // The opcode must be one that can implicitly sign_extend.
+ // FIXME: Additional opcodes.
+ switch (Src.getOpcode()) {
+ default:
+ return SDValue();
+ case ISD::MUL:
+ if (!Subtarget.hasStdExtM())
+ return SDValue();
+ LLVM_FALLTHROUGH;
+ case ISD::ADD:
+ case ISD::SUB:
+ break;
+ }
+
+  // Only handle cases where the result is used by a CopyToReg, which likely
+  // means the value is a live-out of the basic block. This helps prevent
+ // infinite combine loops like PR51206.
+ if (none_of(N->uses(),
+ [](SDNode *User) { return User->getOpcode() == ISD::CopyToReg; }))
+ return SDValue();
+
+ SmallVector<SDNode *, 4> SetCCs;
+ for (SDNode::use_iterator UI = Src.getNode()->use_begin(),
+ UE = Src.getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User == N)
+ continue;
+ if (UI.getUse().getResNo() != Src.getResNo())
+ continue;
+ // All i32 setccs are legalized by sign extending operands.
+ if (User->getOpcode() == ISD::SETCC) {
+ SetCCs.push_back(User);
+ continue;
+ }
+ // We don't know if we can extend this user.
+ break;
+ }
+
+ // If we don't have any SetCCs, this isn't worthwhile.
+ if (SetCCs.empty())
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Src);
+ DCI.CombineTo(N, SExt);
+
+ // Promote all the setccs.
+ for (SDNode *SetCC : SetCCs) {
+ SmallVector<SDValue, 4> Ops;
+
+ for (unsigned j = 0; j != 2; ++j) {
+ SDValue SOp = SetCC->getOperand(j);
+ if (SOp == Src)
+ Ops.push_back(SExt);
+ else
+ Ops.push_back(DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, SOp));
+ }
+
+ Ops.push_back(SetCC->getOperand(2));
+ DCI.CombineTo(SetCC,
+ DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops));
+ }
+ return SDValue(N, 0);
}
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
@@ -2143,6 +5933,32 @@
}
break;
}
+ case RISCVISD::CLZW:
+ case RISCVISD::CTZW: {
+ // Only the lower 32 bits of the first operand are read
+ SDValue Op0 = N->getOperand(0);
+ APInt Mask = APInt::getLowBitsSet(Op0.getValueSizeInBits(), 32);
+ if (SimplifyDemandedBits(Op0, Mask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ break;
+ }
+ case RISCVISD::FSL:
+ case RISCVISD::FSR: {
+    // Only the lower log2(Bitwidth)+1 bits of the shift amount are read.
+ SDValue ShAmt = N->getOperand(2);
+ unsigned BitWidth = ShAmt.getValueSizeInBits();
+ assert(isPowerOf2_32(BitWidth) && "Unexpected bit width");
+ APInt ShAmtMask(BitWidth, (BitWidth * 2) - 1);
+ if (SimplifyDemandedBits(ShAmt, ShAmtMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ break;
+ }
case RISCVISD::FSLW:
case RISCVISD::FSRW: {
// Only the lower 32 bits of Values and lower 6 bits of shift amount are
@@ -2161,12 +5977,14 @@
}
break;
}
- case RISCVISD::GREVIW:
- case RISCVISD::GORCIW: {
- // Only the lower 32 bits of the first operand are read
- SDValue Op0 = N->getOperand(0);
- APInt Mask = APInt::getLowBitsSet(Op0.getValueSizeInBits(), 32);
- if (SimplifyDemandedBits(Op0, Mask, DCI)) {
+ case RISCVISD::GREV:
+ case RISCVISD::GORC: {
+    // Only the lower log2(Bitwidth) bits of the shift amount are read.
+ SDValue ShAmt = N->getOperand(1);
+ unsigned BitWidth = ShAmt.getValueSizeInBits();
+ assert(isPowerOf2_32(BitWidth) && "Unexpected bit width");
+ APInt ShAmtMask(BitWidth, BitWidth - 1);
+ if (SimplifyDemandedBits(ShAmt, ShAmtMask, DCI)) {
if (N->getOpcode() != ISD::DELETED_NODE)
DCI.AddToWorklist(N);
return SDValue(N, 0);
@@ -2174,6 +5992,68 @@
return combineGREVI_GORCI(N, DCI.DAG);
}
+ case RISCVISD::GREVW:
+ case RISCVISD::GORCW: {
+ // Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
+ APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 5);
+ if (SimplifyDemandedBits(LHS, LHSMask, DCI) ||
+ SimplifyDemandedBits(RHS, RHSMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ return combineGREVI_GORCI(N, DCI.DAG);
+ }
+ case RISCVISD::SHFL:
+ case RISCVISD::UNSHFL: {
+    // Only the lower log2(Bitwidth)-1 bits of the shift amount are read.
+ SDValue ShAmt = N->getOperand(1);
+ unsigned BitWidth = ShAmt.getValueSizeInBits();
+ assert(isPowerOf2_32(BitWidth) && "Unexpected bit width");
+ APInt ShAmtMask(BitWidth, (BitWidth / 2) - 1);
+ if (SimplifyDemandedBits(ShAmt, ShAmtMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ break;
+ }
+ case RISCVISD::SHFLW:
+ case RISCVISD::UNSHFLW: {
+ // Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
+ APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 4);
+ if (SimplifyDemandedBits(LHS, LHSMask, DCI) ||
+ SimplifyDemandedBits(RHS, RHSMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ break;
+ }
+ case RISCVISD::BCOMPRESSW:
+ case RISCVISD::BDECOMPRESSW: {
+ // Only the lower 32 bits of LHS and RHS are read.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ APInt Mask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
+ if (SimplifyDemandedBits(LHS, Mask, DCI) ||
+ SimplifyDemandedBits(RHS, Mask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ break;
+ }
case RISCVISD::FMV_X_ANYEXTW_RV64: {
SDLoc DL(N);
SDValue Op0 = N->getOperand(0);
@@ -2204,54 +6084,278 @@
return DAG.getNode(ISD::AND, DL, MVT::i64, NewFMV,
DAG.getConstant(~SignBit, DL, MVT::i64));
}
- case RISCVISD::GREVI:
- case RISCVISD::GORCI:
- return combineGREVI_GORCI(N, DCI.DAG);
+ case ISD::AND:
+ return performANDCombine(N, DCI, Subtarget);
case ISD::OR:
- if (auto GREV = combineORToGREV(SDValue(N, 0), DCI.DAG, Subtarget))
- return GREV;
- if (auto GORC = combineORToGORC(SDValue(N, 0), DCI.DAG, Subtarget))
- return GORC;
- break;
+ return performORCombine(N, DCI, Subtarget);
+ case ISD::XOR:
+ return performXORCombine(N, DCI, Subtarget);
+ case ISD::ANY_EXTEND:
+ return performANY_EXTENDCombine(N, DCI, Subtarget);
+ case ISD::ZERO_EXTEND:
+ // Fold (zero_extend (fp_to_uint X)) to prevent forming fcvt+zexti32 during
+ // type legalization. This is safe because fp_to_uint produces poison if
+ // it overflows.
+ if (N->getValueType(0) == MVT::i64 && Subtarget.is64Bit() &&
+ N->getOperand(0).getOpcode() == ISD::FP_TO_UINT &&
+ isTypeLegal(N->getOperand(0).getOperand(0).getValueType()))
+ return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), MVT::i64,
+ N->getOperand(0).getOperand(0));
+ return SDValue();
case RISCVISD::SELECT_CC: {
// Transform
- // (select_cc (xor X, 1), 0, setne, trueV, falseV) ->
- // (select_cc X, 0, seteq, trueV, falseV) if we can prove X is 0/1.
- // This can occur when legalizing some floating point comparisons.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
auto CCVal = static_cast<ISD::CondCode>(N->getConstantOperandVal(2));
+ if (!ISD::isIntEqualitySetCC(CCVal))
+ break;
+
+ // Fold (select_cc (setlt X, Y), 0, ne, trueV, falseV) ->
+ // (select_cc X, Y, lt, trueV, falseV)
+ // Sometimes the setcc is introduced after select_cc has been formed.
+ if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) &&
+ LHS.getOperand(0).getValueType() == Subtarget.getXLenVT()) {
+ // If we're looking for eq 0 instead of ne 0, we need to invert the
+ // condition.
+ bool Invert = CCVal == ISD::SETEQ;
+ CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+ if (Invert)
+ CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
+
+ SDLoc DL(N);
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
+
+ SDValue TargetCC =
+ DAG.getTargetConstant(CCVal, DL, Subtarget.getXLenVT());
+ return DAG.getNode(
+ RISCVISD::SELECT_CC, DL, N->getValueType(0),
+ {LHS, RHS, TargetCC, N->getOperand(3), N->getOperand(4)});
+ }
+
+ // Fold (select_cc (xor X, Y), 0, eq/ne, trueV, falseV) ->
+ // (select_cc X, Y, eq/ne, trueV, falseV)
+ if (LHS.getOpcode() == ISD::XOR && isNullConstant(RHS))
+ return DAG.getNode(RISCVISD::SELECT_CC, SDLoc(N), N->getValueType(0),
+ {LHS.getOperand(0), LHS.getOperand(1),
+ N->getOperand(2), N->getOperand(3),
+ N->getOperand(4)});
+ // (select_cc X, 1, setne, trueV, falseV) ->
+ // (select_cc X, 0, seteq, trueV, falseV) if we can prove X is 0/1.
+ // This can occur when legalizing some floating point comparisons.
APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
- if (ISD::isIntEqualitySetCC(CCVal) && isNullConstant(RHS) &&
- LHS.getOpcode() == ISD::XOR && isOneConstant(LHS.getOperand(1)) &&
- DAG.MaskedValueIsZero(LHS.getOperand(0), Mask)) {
+ if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) {
SDLoc DL(N);
CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
- SDValue TargetCC = DAG.getConstant(CCVal, DL, Subtarget.getXLenVT());
- return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0),
- {LHS.getOperand(0), RHS, TargetCC, N->getOperand(3),
- N->getOperand(4)});
+ SDValue TargetCC =
+ DAG.getTargetConstant(CCVal, DL, Subtarget.getXLenVT());
+ RHS = DAG.getConstant(0, DL, LHS.getValueType());
+ return DAG.getNode(
+ RISCVISD::SELECT_CC, DL, N->getValueType(0),
+ {LHS, RHS, TargetCC, N->getOperand(3), N->getOperand(4)});
+ }
+
+ break;
+ }
+ case RISCVISD::BR_CC: {
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ ISD::CondCode CCVal = cast<CondCodeSDNode>(N->getOperand(3))->get();
+ if (!ISD::isIntEqualitySetCC(CCVal))
+ break;
+
+ // Fold (br_cc (setlt X, Y), 0, ne, dest) ->
+ // (br_cc X, Y, lt, dest)
+ // Sometimes the setcc is introduced after br_cc has been formed.
+ if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) &&
+ LHS.getOperand(0).getValueType() == Subtarget.getXLenVT()) {
+ // If we're looking for eq 0 instead of ne 0, we need to invert the
+ // condition.
+ bool Invert = CCVal == ISD::SETEQ;
+ CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+ if (Invert)
+ CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
+
+ SDLoc DL(N);
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
+
+ return DAG.getNode(RISCVISD::BR_CC, DL, N->getValueType(0),
+ N->getOperand(0), LHS, RHS, DAG.getCondCode(CCVal),
+ N->getOperand(4));
+ }
+
+ // Fold (br_cc (xor X, Y), 0, eq/ne, dest) ->
+ // (br_cc X, Y, eq/ne, trueV, falseV)
+ if (LHS.getOpcode() == ISD::XOR && isNullConstant(RHS))
+ return DAG.getNode(RISCVISD::BR_CC, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), LHS.getOperand(0), LHS.getOperand(1),
+ N->getOperand(3), N->getOperand(4));
+
+ // (br_cc X, 1, setne, br_cc) ->
+ // (br_cc X, 0, seteq, br_cc) if we can prove X is 0/1.
+ // This can occur when legalizing some floating point comparisons.
+ APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
+ if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) {
+ SDLoc DL(N);
+ CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
+ SDValue TargetCC = DAG.getCondCode(CCVal);
+ RHS = DAG.getConstant(0, DL, LHS.getValueType());
+ return DAG.getNode(RISCVISD::BR_CC, DL, N->getValueType(0),
+ N->getOperand(0), LHS, RHS, TargetCC,
+ N->getOperand(4));
}
break;
}
- case ISD::SETCC: {
- // (setcc X, 1, setne) -> (setcc X, 0, seteq) if we can prove X is 0/1.
- // Comparing with 0 may allow us to fold into bnez/beqz.
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- if (LHS.getValueType().isScalableVector())
+ case ISD::FCOPYSIGN: {
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
break;
- auto CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
- APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
- if (isOneConstant(RHS) && ISD::isIntEqualitySetCC(CC) &&
- DAG.MaskedValueIsZero(LHS, Mask)) {
+ // There is a form of VFSGNJ which injects the negated sign of its second
+    // operand. Try to bubble any FNEG up after the extend/round to produce
+    // this optimized pattern. Avoid modifying cases where the FP_ROUND has
+    // TRUNC=1.
+ SDValue In2 = N->getOperand(1);
+ // Avoid cases where the extend/round has multiple uses, as duplicating
+ // those is typically more expensive than removing a fneg.
+ if (!In2.hasOneUse())
+ break;
+ if (In2.getOpcode() != ISD::FP_EXTEND &&
+ (In2.getOpcode() != ISD::FP_ROUND || In2.getConstantOperandVal(1) != 0))
+ break;
+ In2 = In2.getOperand(0);
+ if (In2.getOpcode() != ISD::FNEG)
+ break;
+ SDLoc DL(N);
+ SDValue NewFPExtRound = DAG.getFPExtendOrRound(In2.getOperand(0), DL, VT);
+ return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N->getOperand(0),
+ DAG.getNode(ISD::FNEG, DL, VT, NewFPExtRound));
+ }
+ case ISD::MGATHER:
+ case ISD::MSCATTER: {
+ if (!DCI.isBeforeLegalize())
+ break;
+ MaskedGatherScatterSDNode *MGSN = cast<MaskedGatherScatterSDNode>(N);
+ SDValue Index = MGSN->getIndex();
+ EVT IndexVT = Index.getValueType();
+ MVT XLenVT = Subtarget.getXLenVT();
+ // RISCV indexed loads only support the "unsigned unscaled" addressing
+ // mode, so anything else must be manually legalized.
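+    // For example (a sketch): a signed i16 index feeding an i32 gather on
+    // RV64 is first sign-extended to i64 and, if scaled, shifted left by
+    // log2(4) == 2 so it becomes an unsigned, unscaled byte offset.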
+ bool NeedsIdxLegalization = MGSN->isIndexScaled() ||
+ (MGSN->isIndexSigned() &&
+ IndexVT.getVectorElementType().bitsLT(XLenVT));
+ if (!NeedsIdxLegalization)
+ break;
+
+ SDLoc DL(N);
+
+ // Any index legalization should first promote to XLenVT, so we don't lose
+ // bits when scaling. This may create an illegal index type so we let
+ // LLVM's legalization take care of the splitting.
+ if (IndexVT.getVectorElementType().bitsLT(XLenVT)) {
+ IndexVT = IndexVT.changeVectorElementType(XLenVT);
+ Index = DAG.getNode(MGSN->isIndexSigned() ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND,
+ DL, IndexVT, Index);
+ }
+
+ unsigned Scale = N->getConstantOperandVal(5);
+ if (MGSN->isIndexScaled() && Scale != 1) {
+ // Manually scale the indices by the element size.
+ // TODO: Sanitize the scale operand here?
+ assert(isPowerOf2_32(Scale) && "Expecting power-of-two types");
+ SDValue SplatScale = DAG.getConstant(Log2_32(Scale), DL, IndexVT);
+ Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, SplatScale);
+ }
+
+ ISD::MemIndexType NewIndexTy = ISD::UNSIGNED_UNSCALED;
+ if (const auto *MGN = dyn_cast<MaskedGatherSDNode>(N)) {
+ return DAG.getMaskedGather(
+ N->getVTList(), MGSN->getMemoryVT(), DL,
+ {MGSN->getChain(), MGN->getPassThru(), MGSN->getMask(),
+ MGSN->getBasePtr(), Index, MGN->getScale()},
+ MGN->getMemOperand(), NewIndexTy, MGN->getExtensionType());
+ }
+ const auto *MSN = cast<MaskedScatterSDNode>(N);
+ return DAG.getMaskedScatter(
+ N->getVTList(), MGSN->getMemoryVT(), DL,
+ {MGSN->getChain(), MSN->getValue(), MGSN->getMask(), MGSN->getBasePtr(),
+ Index, MGSN->getScale()},
+ MGSN->getMemOperand(), NewIndexTy, MSN->isTruncatingStore());
+ }
+ case RISCVISD::SRA_VL:
+ case RISCVISD::SRL_VL:
+ case RISCVISD::SHL_VL: {
+ SDValue ShAmt = N->getOperand(1);
+ if (ShAmt.getOpcode() == RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL) {
+ // We don't need the upper 32 bits of a 64-bit element for a shift amount.
SDLoc DL(N);
- SDValue Zero = DAG.getConstant(0, DL, LHS.getValueType());
- CC = ISD::getSetCCInverse(CC, LHS.getValueType());
- return DAG.getSetCC(DL, N->getValueType(0), LHS, Zero, CC);
+ SDValue VL = N->getOperand(3);
+ EVT VT = N->getValueType(0);
+ ShAmt =
+ DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, ShAmt.getOperand(0), VL);
+ return DAG.getNode(N->getOpcode(), DL, VT, N->getOperand(0), ShAmt,
+ N->getOperand(2), N->getOperand(3));
}
break;
}
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::SHL: {
+ SDValue ShAmt = N->getOperand(1);
+ if (ShAmt.getOpcode() == RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL) {
+ // We don't need the upper 32 bits of a 64-bit element for a shift amount.
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ ShAmt =
+ DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VT, ShAmt.getOperand(0));
+ return DAG.getNode(N->getOpcode(), DL, VT, N->getOperand(0), ShAmt);
+ }
+ break;
+ }
+ case RISCVISD::MUL_VL: {
+ // Try to form VWMUL or VWMULU.
+ // FIXME: Look for splat of extended scalar as well.
+ // FIXME: Support VWMULSU.
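+    // For example (a sketch): (MUL_VL (VSEXT_VL X, M, VL), (VSEXT_VL Y, M, VL))
+    // where X and Y have half-width elements becomes (VWMUL_VL X, Y, M, VL).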
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL;
+ bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL;
+ if ((!IsSignExt && !IsZeroExt) || Op0.getOpcode() != Op1.getOpcode())
+ return SDValue();
+
+ // Make sure the extends have a single use.
+ if (!Op0.hasOneUse() || !Op1.hasOneUse())
+ return SDValue();
+
+ SDValue Mask = N->getOperand(2);
+ SDValue VL = N->getOperand(3);
+ if (Op0.getOperand(1) != Mask || Op1.getOperand(1) != Mask ||
+ Op0.getOperand(2) != VL || Op1.getOperand(2) != VL)
+ return SDValue();
+
+ Op0 = Op0.getOperand(0);
+ Op1 = Op1.getOperand(0);
+
+ MVT VT = N->getSimpleValueType(0);
+ MVT NarrowVT =
+ MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() / 2),
+ VT.getVectorElementCount());
+
+ SDLoc DL(N);
+
+ // Re-introduce narrower extends if needed.
+ unsigned ExtOpc = IsSignExt ? RISCVISD::VSEXT_VL : RISCVISD::VZEXT_VL;
+ if (Op0.getValueType() != NarrowVT)
+ Op0 = DAG.getNode(ExtOpc, DL, NarrowVT, Op0, Mask, VL);
+ if (Op1.getValueType() != NarrowVT)
+ Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL);
+
+ unsigned WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
+ return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL);
+ }
}
return SDValue();
@@ -2290,9 +6394,11 @@
// Neither constant will fit into an immediate, so find materialisation
// costs.
int C1Cost = RISCVMatInt::getIntMatCost(C1Int, Ty.getSizeInBits(),
- Subtarget.is64Bit());
+ Subtarget.getFeatureBits(),
+ /*CompressionCost*/true);
int ShiftedC1Cost = RISCVMatInt::getIntMatCost(
- ShiftedC1Int, Ty.getSizeInBits(), Subtarget.is64Bit());
+ ShiftedC1Int, Ty.getSizeInBits(), Subtarget.getFeatureBits(),
+ /*CompressionCost*/true);
// Materialising `c1` is cheaper than materialising `c1 << c2`, so the
// combine should be prevented.
@@ -2327,16 +6433,43 @@
// Clear all non-demanded bits initially.
APInt ShrunkMask = Mask & DemandedBits;
+ // Try to make a smaller immediate by setting undemanded bits.
+
+ APInt ExpandedMask = Mask | ~DemandedBits;
+
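+ // A candidate mask is usable if it covers every demanded bit of the original
+ // mask and differs from it only in undemanded bits.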
+ auto IsLegalMask = [ShrunkMask, ExpandedMask](const APInt &Mask) -> bool {
+ return ShrunkMask.isSubsetOf(Mask) && Mask.isSubsetOf(ExpandedMask);
+ };
+ auto UseMask = [Mask, Op, VT, &TLO](const APInt &NewMask) -> bool {
+ if (NewMask == Mask)
+ return true;
+ SDLoc DL(Op);
+ SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
+ SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
+ return TLO.CombineTo(Op, NewOp);
+ };
+
// If the shrunk mask fits in sign extended 12 bits, let the target
// independent code apply it.
if (ShrunkMask.isSignedIntN(12))
return false;
- // Try to make a smaller immediate by setting undemanded bits.
+ // Preserve (and X, 0xffff) when zext.h is supported.
+ if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp()) {
+ APInt NewMask = APInt(Mask.getBitWidth(), 0xffff);
+ if (IsLegalMask(NewMask))
+ return UseMask(NewMask);
+ }
- // We need to be able to make a negative number through a combination of mask
- // and undemanded bits.
- APInt ExpandedMask = Mask | ~DemandedBits;
+ // Try to preserve (and X, 0xffffffff), the (zext_inreg X, i32) pattern.
+ if (VT == MVT::i64) {
+ APInt NewMask = APInt(64, 0xffffffff);
+ if (IsLegalMask(NewMask))
+ return UseMask(NewMask);
+ }
+
+ // For the remaining optimizations, we need to be able to make a negative
+ // number through a combination of mask and undemanded bits.
if (!ExpandedMask.isNegative())
return false;
@@ -2354,18 +6487,26 @@
return false;
// Sanity check that our new mask is a subset of the demanded mask.
- assert(NewMask.isSubsetOf(ExpandedMask));
+ assert(IsLegalMask(NewMask));
+ return UseMask(NewMask);
+}
- // If we aren't changing the mask, just return true to keep it and prevent
- // the caller from optimizing.
- if (NewMask == Mask)
- return true;
-
- // Replace the constant with the new mask.
- SDLoc DL(Op);
- SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
- SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
- return TLO.CombineTo(Op, NewOp);
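+// Apply the GREV (generalized reverse) permutation to Src for the given shift
+// amount by conditionally swapping adjacent bit groups of width 1, 2, 4, 8,
+// 16 and 32.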
+static void computeGREV(APInt &Src, unsigned ShAmt) {
+ ShAmt &= Src.getBitWidth() - 1;
+ uint64_t x = Src.getZExtValue();
+ if (ShAmt & 1)
+ x = ((x & 0x5555555555555555LL) << 1) | ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
+ if (ShAmt & 2)
+ x = ((x & 0x3333333333333333LL) << 2) | ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
+ if (ShAmt & 4)
+ x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) | ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
+ if (ShAmt & 8)
+ x = ((x & 0x00FF00FF00FF00FFLL) << 8) | ((x & 0xFF00FF00FF00FF00LL) >> 8);
+ if (ShAmt & 16)
+ x = ((x & 0x0000FFFF0000FFFFLL) << 16) | ((x & 0xFFFF0000FFFF0000LL) >> 16);
+ if (ShAmt & 32)
+ x = ((x & 0x00000000FFFFFFFFLL) << 32) | ((x & 0xFFFFFFFF00000000LL) >> 32);
+ Src = x;
}
void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
@@ -2385,6 +6526,17 @@
Known.resetAll();
switch (Opc) {
default: break;
+ case RISCVISD::SELECT_CC: {
+ Known = DAG.computeKnownBits(Op.getOperand(4), Depth + 1);
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(3), Depth + 1);
+
+ // Only known if known in both the LHS and RHS.
+ Known = KnownBits::commonBits(Known, Known2);
+ break;
+ }
case RISCVISD::REMUW: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
@@ -2405,12 +6557,57 @@
Known = Known.sext(BitWidth);
break;
}
- case RISCVISD::READ_VLENB:
- // We assume VLENB is at least 8 bytes.
- // FIXME: The 1.0 draft spec defines minimum VLEN as 128 bits.
- Known.Zero.setLowBits(3);
+ case RISCVISD::CTZW: {
+ KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
+ unsigned PossibleTZ = Known2.trunc(32).countMaxTrailingZeros();
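+ // The result is in the range [0, PossibleTZ], so all bits above the low
+ // Log2_32(PossibleTZ) + 1 bits are known to be zero.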
+ unsigned LowBits = Log2_32(PossibleTZ) + 1;
+ Known.Zero.setBitsFrom(LowBits);
break;
}
+ case RISCVISD::CLZW: {
+ KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
+ unsigned PossibleLZ = Known2.trunc(32).countMaxLeadingZeros();
+ unsigned LowBits = Log2_32(PossibleLZ) + 1;
+ Known.Zero.setBitsFrom(LowBits);
+ break;
+ }
+ case RISCVISD::GREV:
+ case RISCVISD::GREVW: {
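+ // GREV only permutes bits, so permuting the known bits of the input in the
+ // same way gives the known bits of the result.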
+ if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
+ if (Opc == RISCVISD::GREVW)
+ Known = Known.trunc(32);
+ unsigned ShAmt = C->getZExtValue();
+ computeGREV(Known.Zero, ShAmt);
+ computeGREV(Known.One, ShAmt);
+ if (Opc == RISCVISD::GREVW)
+ Known = Known.sext(BitWidth);
+ }
+ break;
+ }
+ case RISCVISD::READ_VLENB:
+ // We assume VLENB is at least 16 bytes.
+ Known.Zero.setLowBits(4);
+ // We assume VLENB is no more than 65536 / 8 bytes.
+ Known.Zero.setBitsFrom(14);
+ break;
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = Op.getConstantOperandVal(1);
+ switch (IntNo) {
+ default:
+ // We can't do anything for most intrinsics.
+ break;
+ case Intrinsic::riscv_vsetvli:
+ case Intrinsic::riscv_vsetvlimax:
+ // Assume that VL output is positive and would fit in an int32_t.
+ // TODO: VLEN might be capped at 16 bits in a future V spec update.
+ if (BitWidth >= 32)
+ Known.Zero.setBitsFrom(31);
+ break;
+ }
+ break;
+ }
+ }
}
unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
@@ -2427,14 +6624,35 @@
case RISCVISD::REMUW:
case RISCVISD::ROLW:
case RISCVISD::RORW:
- case RISCVISD::GREVIW:
- case RISCVISD::GORCIW:
+ case RISCVISD::GREVW:
+ case RISCVISD::GORCW:
case RISCVISD::FSLW:
case RISCVISD::FSRW:
+ case RISCVISD::SHFLW:
+ case RISCVISD::UNSHFLW:
+ case RISCVISD::BCOMPRESSW:
+ case RISCVISD::BDECOMPRESSW:
+ case RISCVISD::FCVT_W_RV64:
+ case RISCVISD::FCVT_WU_RV64:
// TODO: As the result is sign-extended, this is conservatively correct. A
// more precise answer could be calculated for SRAW depending on known
// bits in the shift amount.
return 33;
+ case RISCVISD::SHFL:
+ case RISCVISD::UNSHFL: {
+ // There is no SHFLIW, but an i64 SHFLI with bit 4 of the control word
+ // cleared doesn't affect bit 31. The upper 32 bits will be shuffled, but
+ // will stay within the upper 32 bits. If there were more than 32 sign bits
+ // before there will be at least 33 sign bits after.
+ if (Op.getValueType() == MVT::i64 &&
+ isa<ConstantSDNode>(Op.getOperand(1)) &&
+ (Op.getConstantOperandVal(1) & 0x10) == 0) {
+ unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ if (Tmp > 32)
+ return 33;
+ }
+ break;
+ }
case RISCVISD::VMV_X_S:
// The number of sign bits of the scalar result is computed by obtaining the
// element type of the input vector operand, subtracting its width from the
@@ -2714,80 +6932,9 @@
return TailMBB;
}
-static MachineBasicBlock *addVSetVL(MachineInstr &MI, MachineBasicBlock *BB,
- int VLIndex, unsigned SEWIndex,
- RISCVVLMUL VLMul, bool WritesElement0) {
- MachineFunction &MF = *BB->getParent();
- DebugLoc DL = MI.getDebugLoc();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-
- unsigned SEW = MI.getOperand(SEWIndex).getImm();
- assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
- RISCVVSEW ElementWidth = static_cast<RISCVVSEW>(Log2_32(SEW / 8));
-
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- // VL and VTYPE are alive here.
- MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII.get(RISCV::PseudoVSETVLI));
-
- if (VLIndex >= 0) {
- // Set VL (rs1 != X0).
- Register DestReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
- MIB.addReg(DestReg, RegState::Define | RegState::Dead)
- .addReg(MI.getOperand(VLIndex).getReg());
- } else
- // With no VL operator in the pseudo, do not modify VL (rd = X0, rs1 = X0).
- MIB.addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addReg(RISCV::X0, RegState::Kill);
-
- // Default to tail agnostic unless the destination is tied to a source. In
- // that case the user would have some control over the tail values. The tail
- // policy is also ignored on instructions that only update element 0 like
- // vmv.s.x or reductions so use agnostic there to match the common case.
- // FIXME: This is conservatively correct, but we might want to detect that
- // the input is undefined.
- bool TailAgnostic = true;
- unsigned UseOpIdx;
- if (MI.isRegTiedToUseOperand(0, &UseOpIdx) && !WritesElement0) {
- TailAgnostic = false;
- // If the tied operand is an IMPLICIT_DEF we can keep TailAgnostic.
- const MachineOperand &UseMO = MI.getOperand(UseOpIdx);
- MachineInstr *UseMI = MRI.getVRegDef(UseMO.getReg());
- if (UseMI && UseMI->isImplicitDef())
- TailAgnostic = true;
- }
-
- // For simplicity we reuse the vtype representation here.
- MIB.addImm(RISCVVType::encodeVTYPE(VLMul, ElementWidth,
- /*TailAgnostic*/ TailAgnostic,
- /*MaskAgnostic*/ false));
-
- // Remove (now) redundant operands from pseudo
- MI.getOperand(SEWIndex).setImm(-1);
- if (VLIndex >= 0) {
- MI.getOperand(VLIndex).setReg(RISCV::NoRegister);
- MI.getOperand(VLIndex).setIsKill(false);
- }
-
- return BB;
-}
-
MachineBasicBlock *
RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
- uint64_t TSFlags = MI.getDesc().TSFlags;
-
- if (TSFlags & RISCVII::HasSEWOpMask) {
- unsigned NumOperands = MI.getNumExplicitOperands();
- int VLIndex = (TSFlags & RISCVII::HasVLOpMask) ? NumOperands - 2 : -1;
- unsigned SEWIndex = NumOperands - 1;
- bool WritesElement0 = TSFlags & RISCVII::WritesElement0Mask;
-
- RISCVVLMUL VLMul = static_cast<RISCVVLMUL>((TSFlags & RISCVII::VLMulMask) >>
- RISCVII::VLMulShift);
- return addVSetVL(MI, BB, VLIndex, SEWIndex, VLMul, WritesElement0);
- }
-
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected instr type to insert");
@@ -2899,6 +7046,27 @@
return false;
}
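+// Allocate a vector register (or register group) of the class matching ValVT,
+// assigning the first mask argument to V0.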
+static unsigned allocateRVVReg(MVT ValVT, unsigned ValNo,
+ Optional<unsigned> FirstMaskArgument,
+ CCState &State, const RISCVTargetLowering &TLI) {
+ const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT);
+ if (RC == &RISCV::VRRegClass) {
+ // Assign the first mask argument to V0.
+ // This is an interim calling convention and it may be changed in the
+ // future.
+ if (FirstMaskArgument.hasValue() && ValNo == FirstMaskArgument.getValue())
+ return State.AllocateReg(RISCV::V0);
+ return State.AllocateReg(ArgVRs);
+ }
+ if (RC == &RISCV::VRM2RegClass)
+ return State.AllocateReg(ArgVRM2s);
+ if (RC == &RISCV::VRM4RegClass)
+ return State.AllocateReg(ArgVRM4s);
+ if (RC == &RISCV::VRM8RegClass)
+ return State.AllocateReg(ArgVRM8s);
+ llvm_unreachable("Unhandled register class for ValueType");
+}
+
// Implements the RISC-V calling convention. Returns true upon failure.
static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
@@ -2910,8 +7078,8 @@
MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64;
// Any return value split into more than two values can't be returned
- // directly.
- if (IsRet && ValNo > 1)
+ // directly. Vectors are returned via the available vector registers.
+ if (!LocVT.isVector() && IsRet && ValNo > 1)
return true;
// UseGPRForF16_F32 if targeting one of the soft-float ABIs, if passing a
@@ -3002,9 +7170,15 @@
return false;
}
+ // Fixed-length vectors are located in the corresponding scalable-vector
+ // container types.
+ if (ValVT.isFixedLengthVector())
+ LocVT = TLI.getContainerForFixedLengthVector(LocVT);
+
// Split arguments might be passed indirectly, so keep track of the pending
- // values.
- if (ArgFlags.isSplit() || !PendingLocs.empty()) {
+ // values. Split vectors are passed via a mix of registers and indirectly, so
+ // treat them as we would any other argument.
+ if (ValVT.isScalarInteger() && (ArgFlags.isSplit() || !PendingLocs.empty())) {
LocVT = XLenVT;
LocInfo = CCValAssign::Indirect;
PendingLocs.push_back(
@@ -3017,7 +7191,8 @@
// If the split argument only had two elements, it should be passed directly
// in registers or on the stack.
- if (ArgFlags.isSplitEnd() && PendingLocs.size() <= 2) {
+ if (ValVT.isScalarInteger() && ArgFlags.isSplitEnd() &&
+ PendingLocs.size() <= 2) {
assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()");
// Apply the normal calling convention rules to the first half of the
// split argument.
@@ -3031,43 +7206,45 @@
// Allocate to a register if possible, or else a stack slot.
Register Reg;
+ unsigned StoreSizeBytes = XLen / 8;
+ Align StackAlign = Align(XLen / 8);
+
if (ValVT == MVT::f16 && !UseGPRForF16_F32)
Reg = State.AllocateReg(ArgFPR16s);
else if (ValVT == MVT::f32 && !UseGPRForF16_F32)
Reg = State.AllocateReg(ArgFPR32s);
else if (ValVT == MVT::f64 && !UseGPRForF64)
Reg = State.AllocateReg(ArgFPR64s);
- else if (ValVT.isScalableVector()) {
- const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT);
- if (RC == &RISCV::VRRegClass) {
- // Assign the first mask argument to V0.
- // This is an interim calling convention and it may be changed in the
- // future.
- if (FirstMaskArgument.hasValue() &&
- ValNo == FirstMaskArgument.getValue()) {
- Reg = State.AllocateReg(RISCV::V0);
- } else {
- Reg = State.AllocateReg(ArgVRs);
- }
- } else if (RC == &RISCV::VRM2RegClass) {
- Reg = State.AllocateReg(ArgVRM2s);
- } else if (RC == &RISCV::VRM4RegClass) {
- Reg = State.AllocateReg(ArgVRM4s);
- } else if (RC == &RISCV::VRM8RegClass) {
- Reg = State.AllocateReg(ArgVRM8s);
- } else {
- llvm_unreachable("Unhandled class register for ValueType");
- }
+ else if (ValVT.isVector()) {
+ Reg = allocateRVVReg(ValVT, ValNo, FirstMaskArgument, State, TLI);
if (!Reg) {
- LocInfo = CCValAssign::Indirect;
+ // For return values, the vector must be passed fully via registers or
+ // via the stack.
+ // FIXME: The proposed vector ABI only mandates v8-v15 for return values,
+ // but we're using all of them.
+ if (IsRet)
+ return true;
// Try using a GPR to pass the address
- Reg = State.AllocateReg(ArgGPRs);
- LocVT = XLenVT;
+ if ((Reg = State.AllocateReg(ArgGPRs))) {
+ LocVT = XLenVT;
+ LocInfo = CCValAssign::Indirect;
+ } else if (ValVT.isScalableVector()) {
+ report_fatal_error("Unable to pass scalable vector types on the stack");
+ } else {
+ // Pass fixed-length vectors on the stack.
+ LocVT = ValVT;
+ StoreSizeBytes = ValVT.getStoreSize();
+ // Align vectors to their element sizes, being careful with vXi1
+ // vectors.
+ StackAlign = MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne();
+ }
}
- } else
+ } else {
Reg = State.AllocateReg(ArgGPRs);
+ }
+
unsigned StackOffset =
- Reg ? 0 : State.AllocateStack(XLen / 8, Align(XLen / 8));
+ Reg ? 0 : State.AllocateStack(StoreSizeBytes, StackAlign);
// If we reach this point and PendingLocs is non-empty, we must be at the
// end of a split argument that must be passed indirectly.
@@ -3088,8 +7265,8 @@
}
assert((!UseGPRForF16_F32 || !UseGPRForF64 || LocVT == XLenVT ||
- (TLI.getSubtarget().hasStdExtV() && ValVT.isScalableVector())) &&
- "Expected an XLenVT or scalable vector types at this stage");
+ (TLI.getSubtarget().hasStdExtV() && ValVT.isVector())) &&
+ "Expected an XLenVT or vector types at this stage");
if (Reg) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
@@ -3110,8 +7287,7 @@
static Optional<unsigned> preAssignMask(const ArgTy &Args) {
for (const auto &ArgIdx : enumerate(Args)) {
MVT ArgVT = ArgIdx.value().VT;
- if (ArgVT.isScalableVector() &&
- ArgVT.getVectorElementType().SimpleTy == MVT::i1)
+ if (ArgVT.isVector() && ArgVT.getVectorElementType() == MVT::i1)
return ArgIdx.index();
}
return None;
@@ -3119,7 +7295,8 @@
void RISCVTargetLowering::analyzeInputArgs(
MachineFunction &MF, CCState &CCInfo,
- const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet) const {
+ const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet,
+ RISCVCCAssignFn Fn) const {
unsigned NumArgs = Ins.size();
FunctionType *FType = MF.getFunction().getFunctionType();
@@ -3138,9 +7315,9 @@
ArgTy = FType->getParamType(Ins[i].getOrigArgIndex());
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
- if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
- ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy, *this,
- FirstMaskArgument)) {
+ if (Fn(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
+ ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy, *this,
+ FirstMaskArgument)) {
LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << '\n');
llvm_unreachable(nullptr);
@@ -3151,7 +7328,7 @@
void RISCVTargetLowering::analyzeOutputArgs(
MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsRet,
- CallLoweringInfo *CLI) const {
+ CallLoweringInfo *CLI, RISCVCCAssignFn Fn) const {
unsigned NumArgs = Outs.size();
Optional<unsigned> FirstMaskArgument;
@@ -3164,9 +7341,9 @@
Type *OrigTy = CLI ? CLI->getArgs()[Outs[i].OrigArgIndex].Ty : nullptr;
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
- if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
- ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy, *this,
- FirstMaskArgument)) {
+ if (Fn(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
+ ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy, *this,
+ FirstMaskArgument)) {
LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << "\n");
llvm_unreachable(nullptr);
@@ -3177,11 +7354,14 @@
// Convert Val to a ValVT. Should not be called for CCValAssign::Indirect
// values.
static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
- const CCValAssign &VA, const SDLoc &DL) {
+ const CCValAssign &VA, const SDLoc &DL,
+ const RISCVSubtarget &Subtarget) {
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
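+ // Fixed-length vectors are passed in scalable containers; unpack the
+ // fixed-length value from its container here.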
+ if (VA.getValVT().isFixedLengthVector() && VA.getLocVT().isScalableVector())
+ Val = convertFromScalableVector(VA.getValVT(), Val, DAG, Subtarget);
break;
case CCValAssign::BCvt:
if (VA.getLocVT().isInteger() && VA.getValVT() == MVT::f16)
@@ -3212,17 +7392,20 @@
if (VA.getLocInfo() == CCValAssign::Indirect)
return Val;
- return convertLocVTToValVT(DAG, Val, VA, DL);
+ return convertLocVTToValVT(DAG, Val, VA, DL, TLI.getSubtarget());
}
static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val,
- const CCValAssign &VA, const SDLoc &DL) {
+ const CCValAssign &VA, const SDLoc &DL,
+ const RISCVSubtarget &Subtarget) {
EVT LocVT = VA.getLocVT();
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
+ if (VA.getValVT().isFixedLengthVector() && LocVT.isScalableVector())
+ Val = convertToScalableVector(LocVT, Val, DAG, Subtarget);
break;
case CCValAssign::BCvt:
if (VA.getLocVT().isInteger() && VA.getValVT() == MVT::f16)
@@ -3245,8 +7428,8 @@
EVT LocVT = VA.getLocVT();
EVT ValVT = VA.getValVT();
EVT PtrVT = MVT::getIntegerVT(DAG.getDataLayout().getPointerSizeInBits(0));
- int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
- VA.getLocMemOffset(), /*Immutable=*/true);
+ int FI = MFI.CreateFixedObject(ValVT.getStoreSize(), VA.getLocMemOffset(),
+ /*Immutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val;
@@ -3305,16 +7488,21 @@
// FastCC has less than 1% performance improvement for some particular
// benchmarks. But theoretically, it may have a benefit in some cases.
-static bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
+static bool CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI,
+ unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ ISD::ArgFlagsTy ArgFlags, CCState &State,
+ bool IsFixed, bool IsRet, Type *OrigTy,
+ const RISCVTargetLowering &TLI,
+ Optional<unsigned> FirstMaskArgument) {
+
+ // X5 and X6 might be used for save-restore libcall.
+ static const MCPhysReg GPRList[] = {
+ RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14,
+ RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X7, RISCV::X28,
+ RISCV::X29, RISCV::X30, RISCV::X31};
if (LocVT == MVT::i32 || LocVT == MVT::i64) {
- // X5 and X6 might be used for save-restore libcall.
- static const MCPhysReg GPRList[] = {
- RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14,
- RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X7, RISCV::X28,
- RISCV::X29, RISCV::X30, RISCV::X31};
if (unsigned Reg = State.AllocateReg(GPRList)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
@@ -3369,6 +7557,36 @@
return false;
}
+ if (LocVT.isVector()) {
+ if (unsigned Reg =
+ allocateRVVReg(ValVT, ValNo, FirstMaskArgument, State, TLI)) {
+ // Fixed-length vectors are located in the corresponding scalable-vector
+ // container types.
+ if (ValVT.isFixedLengthVector())
+ LocVT = TLI.getContainerForFixedLengthVector(LocVT);
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ } else {
+ // Try and pass the address via a "fast" GPR.
+ if (unsigned GPRReg = State.AllocateReg(GPRList)) {
+ LocInfo = CCValAssign::Indirect;
+ LocVT = TLI.getSubtarget().getXLenVT();
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, GPRReg, LocVT, LocInfo));
+ } else if (ValVT.isFixedLengthVector()) {
+ auto StackAlign =
+ MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne();
+ unsigned StackOffset =
+ State.AllocateStack(ValVT.getStoreSize(), StackAlign);
+ State.addLoc(
+ CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
+ } else {
+ // Can't pass scalable vectors on the stack.
+ return true;
+ }
+ }
+
+ return false;
+ }
+
return true; // CC didn't match.
}
@@ -3461,12 +7679,12 @@
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
- if (CallConv == CallingConv::Fast)
- CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_FastCC);
- else if (CallConv == CallingConv::GHC)
+ if (CallConv == CallingConv::GHC)
CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_GHC);
else
- analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false);
+ analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false,
+ CallConv == CallingConv::Fast ? CC_RISCV_FastCC
+ : CC_RISCV);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -3483,16 +7701,21 @@
if (VA.getLocInfo() == CCValAssign::Indirect) {
// If the original argument was split and passed by reference (e.g. i128
// on RV32), we need to load all parts of it here (using the same
- // address).
+ // address). Vectors may be partly split to registers and partly to the
+ // stack, in which case the base address is partly offset and subsequent
+ // loads are relative to that.
InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
MachinePointerInfo()));
unsigned ArgIndex = Ins[i].OrigArgIndex;
- assert(Ins[i].PartOffset == 0);
+ unsigned ArgPartOffset = Ins[i].PartOffset;
+ assert(VA.getValVT().isVector() || ArgPartOffset == 0);
while (i + 1 != e && Ins[i + 1].OrigArgIndex == ArgIndex) {
CCValAssign &PartVA = ArgLocs[i + 1];
- unsigned PartOffset = Ins[i + 1].PartOffset;
- SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
- DAG.getIntPtrConstant(PartOffset, DL));
+ unsigned PartOffset = Ins[i + 1].PartOffset - ArgPartOffset;
+ SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL);
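+ // The in-memory offset of a scalable part scales with the runtime vector
+ // length, so multiply the static offset by VSCALE.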
+ if (PartVA.getValVT().isScalableVector())
+ Offset = DAG.getNode(ISD::VSCALE, DL, XLenVT, Offset);
+ SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue, Offset);
InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
MachinePointerInfo()));
++i;
@@ -3640,6 +7863,11 @@
return true;
}
+static Align getPrefTypeAlign(EVT VT, SelectionDAG &DAG) {
+ return DAG.getDataLayout().getPrefTypeAlign(
+ VT.getTypeForEVT(*DAG.getContext()));
+}
+
// Lower a call to a callseq_start + CALL + callseq_end chain, and add input
// and output parameter nodes.
SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -3663,12 +7891,12 @@
SmallVector<CCValAssign, 16> ArgLocs;
CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
- if (CallConv == CallingConv::Fast)
- ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_FastCC);
- else if (CallConv == CallingConv::GHC)
+ if (CallConv == CallingConv::GHC)
ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_GHC);
else
- analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
+ analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI,
+ CallConv == CallingConv::Fast ? CC_RISCV_FastCC
+ : CC_RISCV);
// Check if it's really possible to do a tail call.
if (IsTailCall)
@@ -3754,28 +7982,51 @@
// For now, only handle fully promoted and indirect arguments.
if (VA.getLocInfo() == CCValAssign::Indirect) {
// Store the argument in a stack slot and pass its address.
- SDValue SpillSlot = DAG.CreateStackTemporary(Outs[i].ArgVT);
+ Align StackAlign =
+ std::max(getPrefTypeAlign(Outs[i].ArgVT, DAG),
+ getPrefTypeAlign(ArgValue.getValueType(), DAG));
+ TypeSize StoredSize = ArgValue.getValueType().getStoreSize();
+ // If the original argument was split (e.g. i128), we need
+ // to store the required parts of it here (and pass just one address).
+ // Vectors may be partly split to registers and partly to the stack, in
+ // which case the base address is partly offset and subsequent stores are
+ // relative to that.
+ unsigned ArgIndex = Outs[i].OrigArgIndex;
+ unsigned ArgPartOffset = Outs[i].PartOffset;
+ assert(VA.getValVT().isVector() || ArgPartOffset == 0);
+ // Calculate the total size to store. We don't know what we're actually
+ // storing until we walk the remaining parts and collect their values,
+ // offsets and alignments.
+ SmallVector<std::pair<SDValue, SDValue>> Parts;
+ while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) {
+ SDValue PartValue = OutVals[i + 1];
+ unsigned PartOffset = Outs[i + 1].PartOffset - ArgPartOffset;
+ SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL);
+ EVT PartVT = PartValue.getValueType();
+ if (PartVT.isScalableVector())
+ Offset = DAG.getNode(ISD::VSCALE, DL, XLenVT, Offset);
+ StoredSize += PartVT.getStoreSize();
+ StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG));
+ Parts.push_back(std::make_pair(PartValue, Offset));
+ ++i;
+ }
+ SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign);
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
MemOpChains.push_back(
DAG.getStore(Chain, DL, ArgValue, SpillSlot,
MachinePointerInfo::getFixedStack(MF, FI)));
- // If the original argument was split (e.g. i128), we need
- // to store all parts of it here (and pass just one address).
- unsigned ArgIndex = Outs[i].OrigArgIndex;
- assert(Outs[i].PartOffset == 0);
- while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) {
- SDValue PartValue = OutVals[i + 1];
- unsigned PartOffset = Outs[i + 1].PartOffset;
- SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
- DAG.getIntPtrConstant(PartOffset, DL));
+ for (const auto &Part : Parts) {
+ SDValue PartValue = Part.first;
+ SDValue PartOffset = Part.second;
+ SDValue Address =
+ DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset);
MemOpChains.push_back(
DAG.getStore(Chain, DL, PartValue, Address,
MachinePointerInfo::getFixedStack(MF, FI)));
- ++i;
}
ArgValue = SpillSlot;
} else {
- ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL);
+ ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL, Subtarget);
}
// Use local copy if it is a byval arg.
@@ -3890,7 +8141,7 @@
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
- analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true);
+ analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true, CC_RISCV);
// Copy all of the result registers out of their specified physreg.
for (auto &VA : RVLocs) {
@@ -3911,7 +8162,7 @@
RetValue2);
}
- RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL);
+ RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL, Subtarget);
InVals.push_back(RetValue);
}
@@ -3958,7 +8209,7 @@
*DAG.getContext());
analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true,
- nullptr);
+ nullptr, CC_RISCV);
if (CallConv == CallingConv::GHC && !RVLocs.empty())
report_fatal_error("GHC functions return void only");
@@ -3997,7 +8248,7 @@
RetOps.push_back(DAG.getRegister(RegHi, MVT::i32));
} else {
// Handle a 'normal' return.
- Val = convertValVTToLocVT(DAG, Val, VA, DL);
+ Val = convertValVTToLocVT(DAG, Val, VA, DL, Subtarget);
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue);
if (STI.isRegisterReservedByUser(VA.getLocReg()))
@@ -4018,6 +8269,7 @@
RetOps.push_back(Glue);
}
+ unsigned RetOpc = RISCVISD::RET_FLAG;
// Interrupt service routines use different return instructions.
const Function &Func = DAG.getMachineFunction().getFunction();
if (Func.hasFnAttribute("interrupt")) {
@@ -4029,18 +8281,15 @@
StringRef Kind =
MF.getFunction().getFnAttribute("interrupt").getValueAsString();
- unsigned RetOpc;
if (Kind == "user")
RetOpc = RISCVISD::URET_FLAG;
else if (Kind == "supervisor")
RetOpc = RISCVISD::SRET_FLAG;
else
RetOpc = RISCVISD::MRET_FLAG;
-
- return DAG.getNode(RetOpc, DL, MVT::Other, RetOps);
}
- return DAG.getNode(RISCVISD::RET_FLAG, DL, MVT::Other, RetOps);
+ return DAG.getNode(RetOpc, DL, MVT::Other, RetOps);
}
void RISCVTargetLowering::validateCCReservedRegs(
@@ -4074,9 +8323,11 @@
NODE_NAME_CASE(MRET_FLAG)
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(SELECT_CC)
+ NODE_NAME_CASE(BR_CC)
NODE_NAME_CASE(BuildPairF64)
NODE_NAME_CASE(SplitF64)
NODE_NAME_CASE(TAIL)
+ NODE_NAME_CASE(MULHSU)
NODE_NAME_CASE(SLLW)
NODE_NAME_CASE(SRAW)
NODE_NAME_CASE(SRLW)
@@ -4085,29 +8336,118 @@
NODE_NAME_CASE(REMUW)
NODE_NAME_CASE(ROLW)
NODE_NAME_CASE(RORW)
+ NODE_NAME_CASE(CLZW)
+ NODE_NAME_CASE(CTZW)
NODE_NAME_CASE(FSLW)
NODE_NAME_CASE(FSRW)
+ NODE_NAME_CASE(FSL)
+ NODE_NAME_CASE(FSR)
NODE_NAME_CASE(FMV_H_X)
NODE_NAME_CASE(FMV_X_ANYEXTH)
NODE_NAME_CASE(FMV_W_X_RV64)
NODE_NAME_CASE(FMV_X_ANYEXTW_RV64)
+ NODE_NAME_CASE(FCVT_W_RV64)
+ NODE_NAME_CASE(FCVT_WU_RV64)
NODE_NAME_CASE(READ_CYCLE_WIDE)
- NODE_NAME_CASE(GREVI)
- NODE_NAME_CASE(GREVIW)
- NODE_NAME_CASE(GORCI)
- NODE_NAME_CASE(GORCIW)
+ NODE_NAME_CASE(GREV)
+ NODE_NAME_CASE(GREVW)
+ NODE_NAME_CASE(GORC)
+ NODE_NAME_CASE(GORCW)
+ NODE_NAME_CASE(SHFL)
+ NODE_NAME_CASE(SHFLW)
+ NODE_NAME_CASE(UNSHFL)
+ NODE_NAME_CASE(UNSHFLW)
+ NODE_NAME_CASE(BCOMPRESS)
+ NODE_NAME_CASE(BCOMPRESSW)
+ NODE_NAME_CASE(BDECOMPRESS)
+ NODE_NAME_CASE(BDECOMPRESSW)
+ NODE_NAME_CASE(VMV_V_X_VL)
+ NODE_NAME_CASE(VFMV_V_F_VL)
NODE_NAME_CASE(VMV_X_S)
+ NODE_NAME_CASE(VMV_S_X_VL)
+ NODE_NAME_CASE(VFMV_S_F_VL)
NODE_NAME_CASE(SPLAT_VECTOR_I64)
+ NODE_NAME_CASE(SPLAT_VECTOR_SPLIT_I64_VL)
NODE_NAME_CASE(READ_VLENB)
- NODE_NAME_CASE(TRUNCATE_VECTOR)
- NODE_NAME_CASE(VLEFF)
- NODE_NAME_CASE(VLEFF_MASK)
- NODE_NAME_CASE(VLSEGFF)
- NODE_NAME_CASE(VLSEGFF_MASK)
- NODE_NAME_CASE(READ_VL)
- NODE_NAME_CASE(VSLIDEUP)
- NODE_NAME_CASE(VSLIDEDOWN)
- NODE_NAME_CASE(VID)
+ NODE_NAME_CASE(TRUNCATE_VECTOR_VL)
+ NODE_NAME_CASE(VSLIDEUP_VL)
+ NODE_NAME_CASE(VSLIDE1UP_VL)
+ NODE_NAME_CASE(VSLIDEDOWN_VL)
+ NODE_NAME_CASE(VSLIDE1DOWN_VL)
+ NODE_NAME_CASE(VID_VL)
+ NODE_NAME_CASE(VFNCVT_ROD_VL)
+ NODE_NAME_CASE(VECREDUCE_ADD_VL)
+ NODE_NAME_CASE(VECREDUCE_UMAX_VL)
+ NODE_NAME_CASE(VECREDUCE_SMAX_VL)
+ NODE_NAME_CASE(VECREDUCE_UMIN_VL)
+ NODE_NAME_CASE(VECREDUCE_SMIN_VL)
+ NODE_NAME_CASE(VECREDUCE_AND_VL)
+ NODE_NAME_CASE(VECREDUCE_OR_VL)
+ NODE_NAME_CASE(VECREDUCE_XOR_VL)
+ NODE_NAME_CASE(VECREDUCE_FADD_VL)
+ NODE_NAME_CASE(VECREDUCE_SEQ_FADD_VL)
+ NODE_NAME_CASE(VECREDUCE_FMIN_VL)
+ NODE_NAME_CASE(VECREDUCE_FMAX_VL)
+ NODE_NAME_CASE(ADD_VL)
+ NODE_NAME_CASE(AND_VL)
+ NODE_NAME_CASE(MUL_VL)
+ NODE_NAME_CASE(OR_VL)
+ NODE_NAME_CASE(SDIV_VL)
+ NODE_NAME_CASE(SHL_VL)
+ NODE_NAME_CASE(SREM_VL)
+ NODE_NAME_CASE(SRA_VL)
+ NODE_NAME_CASE(SRL_VL)
+ NODE_NAME_CASE(SUB_VL)
+ NODE_NAME_CASE(UDIV_VL)
+ NODE_NAME_CASE(UREM_VL)
+ NODE_NAME_CASE(XOR_VL)
+ NODE_NAME_CASE(SADDSAT_VL)
+ NODE_NAME_CASE(UADDSAT_VL)
+ NODE_NAME_CASE(SSUBSAT_VL)
+ NODE_NAME_CASE(USUBSAT_VL)
+ NODE_NAME_CASE(FADD_VL)
+ NODE_NAME_CASE(FSUB_VL)
+ NODE_NAME_CASE(FMUL_VL)
+ NODE_NAME_CASE(FDIV_VL)
+ NODE_NAME_CASE(FNEG_VL)
+ NODE_NAME_CASE(FABS_VL)
+ NODE_NAME_CASE(FSQRT_VL)
+ NODE_NAME_CASE(FMA_VL)
+ NODE_NAME_CASE(FCOPYSIGN_VL)
+ NODE_NAME_CASE(SMIN_VL)
+ NODE_NAME_CASE(SMAX_VL)
+ NODE_NAME_CASE(UMIN_VL)
+ NODE_NAME_CASE(UMAX_VL)
+ NODE_NAME_CASE(FMINNUM_VL)
+ NODE_NAME_CASE(FMAXNUM_VL)
+ NODE_NAME_CASE(MULHS_VL)
+ NODE_NAME_CASE(MULHU_VL)
+ NODE_NAME_CASE(FP_TO_SINT_VL)
+ NODE_NAME_CASE(FP_TO_UINT_VL)
+ NODE_NAME_CASE(SINT_TO_FP_VL)
+ NODE_NAME_CASE(UINT_TO_FP_VL)
+ NODE_NAME_CASE(FP_EXTEND_VL)
+ NODE_NAME_CASE(FP_ROUND_VL)
+ NODE_NAME_CASE(VWMUL_VL)
+ NODE_NAME_CASE(VWMULU_VL)
+ NODE_NAME_CASE(SETCC_VL)
+ NODE_NAME_CASE(VSELECT_VL)
+ NODE_NAME_CASE(VMAND_VL)
+ NODE_NAME_CASE(VMOR_VL)
+ NODE_NAME_CASE(VMXOR_VL)
+ NODE_NAME_CASE(VMCLR_VL)
+ NODE_NAME_CASE(VMSET_VL)
+ NODE_NAME_CASE(VRGATHER_VX_VL)
+ NODE_NAME_CASE(VRGATHER_VV_VL)
+ NODE_NAME_CASE(VRGATHEREI16_VV_VL)
+ NODE_NAME_CASE(VSEXT_VL)
+ NODE_NAME_CASE(VZEXT_VL)
+ NODE_NAME_CASE(VPOPC_VL)
+ NODE_NAME_CASE(VLE_VL)
+ NODE_NAME_CASE(VSE_VL)
+ NODE_NAME_CASE(READ_CSR)
+ NODE_NAME_CASE(WRITE_CSR)
+ NODE_NAME_CASE(SWAP_CSR)
}
// clang-format on
return nullptr;
@@ -4123,6 +8463,7 @@
default:
break;
case 'f':
+ case 'v':
return C_RegisterClass;
case 'I':
case 'J':
@@ -4130,6 +8471,8 @@
return C_Immediate;
case 'A':
return C_Memory;
+ case 'S': // A symbolic address
+ return C_Other;
}
}
return TargetLowering::getConstraintType(Constraint);
@@ -4153,6 +8496,14 @@
if (Subtarget.hasStdExtD() && VT == MVT::f64)
return std::make_pair(0U, &RISCV::FPR64RegClass);
break;
+ case 'v':
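+ // Use the first vector register class, in increasing LMUL order, for which
+ // VT is legal.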
+ for (const auto *RC :
+ {&RISCV::VMRegClass, &RISCV::VRRegClass, &RISCV::VRM2RegClass,
+ &RISCV::VRM4RegClass, &RISCV::VRM8RegClass}) {
+ if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy))
+ return std::make_pair(0U, RC);
+ }
+ break;
default:
break;
}
@@ -4252,6 +8603,56 @@
}
}
+ if (Subtarget.hasStdExtV()) {
+ Register VReg = StringSwitch<Register>(Constraint.lower())
+ .Case("{v0}", RISCV::V0)
+ .Case("{v1}", RISCV::V1)
+ .Case("{v2}", RISCV::V2)
+ .Case("{v3}", RISCV::V3)
+ .Case("{v4}", RISCV::V4)
+ .Case("{v5}", RISCV::V5)
+ .Case("{v6}", RISCV::V6)
+ .Case("{v7}", RISCV::V7)
+ .Case("{v8}", RISCV::V8)
+ .Case("{v9}", RISCV::V9)
+ .Case("{v10}", RISCV::V10)
+ .Case("{v11}", RISCV::V11)
+ .Case("{v12}", RISCV::V12)
+ .Case("{v13}", RISCV::V13)
+ .Case("{v14}", RISCV::V14)
+ .Case("{v15}", RISCV::V15)
+ .Case("{v16}", RISCV::V16)
+ .Case("{v17}", RISCV::V17)
+ .Case("{v18}", RISCV::V18)
+ .Case("{v19}", RISCV::V19)
+ .Case("{v20}", RISCV::V20)
+ .Case("{v21}", RISCV::V21)
+ .Case("{v22}", RISCV::V22)
+ .Case("{v23}", RISCV::V23)
+ .Case("{v24}", RISCV::V24)
+ .Case("{v25}", RISCV::V25)
+ .Case("{v26}", RISCV::V26)
+ .Case("{v27}", RISCV::V27)
+ .Case("{v28}", RISCV::V28)
+ .Case("{v29}", RISCV::V29)
+ .Case("{v30}", RISCV::V30)
+ .Case("{v31}", RISCV::V31)
+ .Default(RISCV::NoRegister);
+ if (VReg != RISCV::NoRegister) {
+ if (TRI->isTypeLegalForClass(RISCV::VMRegClass, VT.SimpleTy))
+ return std::make_pair(VReg, &RISCV::VMRegClass);
+ if (TRI->isTypeLegalForClass(RISCV::VRRegClass, VT.SimpleTy))
+ return std::make_pair(VReg, &RISCV::VRRegClass);
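+ // For types requiring a register group (LMUL > 1), translate the named
+ // register to the VRM2/VRM4/VRM8 group that starts at it.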
+ for (const auto *RC :
+ {&RISCV::VRM2RegClass, &RISCV::VRM4RegClass, &RISCV::VRM8RegClass}) {
+ if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) {
+ VReg = TRI->getMatchingSuperReg(VReg, RISCV::sub_vrm1_0, RC);
+ return std::make_pair(VReg, RC);
+ }
+ }
+ }
+ }
+
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
@@ -4301,6 +8702,15 @@
DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
}
return;
+ case 'S':
+ if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
+ Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
+ GA->getValueType(0)));
+ } else if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
+ Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
+ BA->getValueType(0)));
+ }
+ return;
default:
break;
}
@@ -4308,7 +8718,7 @@
TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
-Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (isa<LoadInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
@@ -4318,7 +8728,7 @@
return nullptr;
}
-Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (isa<LoadInst>(Inst) && isAcquireOrStronger(Ord))
@@ -4392,7 +8802,7 @@
}
Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic(
- IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
+ IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
unsigned XLen = Subtarget.getXLen();
Value *Ordering =
@@ -4444,7 +8854,7 @@
}
Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
- IRBuilder<> &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
+ IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
unsigned XLen = Subtarget.getXLen();
Value *Ordering = Builder.getIntN(XLen, static_cast<uint64_t>(Ord));
@@ -4465,6 +8875,10 @@
return Result;
}
+bool RISCVTargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
+ return false;
+}
+
bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
VT = VT.getScalarType();
@@ -4545,6 +8959,105 @@
return false;
}
+bool RISCVTargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
+ bool *Fast) const {
+ if (!VT.isVector())
+ return false;
+
+ EVT ElemVT = VT.getVectorElementType();
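+ // Vector accesses aligned to at least the element size are supported and
+ // reported as fast.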
+ if (Alignment >= ElemVT.getStoreSize()) {
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
+
+ return false;
+}
+
+bool RISCVTargetLowering::splitValueIntoRegisterParts(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
+ unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
+ bool IsABIRegCopy = CC.hasValue();
+ EVT ValueVT = Val.getValueType();
+ if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) {
+ // Cast the f16 to i16, extend to i32, pad with ones to make a float NaN,
+ // and cast to f32.
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Val);
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Val);
+ Val = DAG.getNode(ISD::OR, DL, MVT::i32, Val,
+ DAG.getConstant(0xFFFF0000, DL, MVT::i32));
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
+ Parts[0] = Val;
+ return true;
+ }
+
+ if (ValueVT.isScalableVector() && PartVT.isScalableVector()) {
+ LLVMContext &Context = *DAG.getContext();
+ EVT ValueEltVT = ValueVT.getVectorElementType();
+ EVT PartEltVT = PartVT.getVectorElementType();
+ unsigned ValueVTBitSize = ValueVT.getSizeInBits().getKnownMinSize();
+ unsigned PartVTBitSize = PartVT.getSizeInBits().getKnownMinSize();
+ if (PartVTBitSize % ValueVTBitSize == 0) {
+ // If the element types are different, bitcast to the same element type of
+ // PartVT first.
+ if (ValueEltVT != PartEltVT) {
+ unsigned Count = ValueVTBitSize / PartEltVT.getSizeInBits();
+ assert(Count != 0 && "The number of elements should not be zero.");
+ EVT SameEltTypeVT =
+ EVT::getVectorVT(Context, PartEltVT, Count, /*IsScalable=*/true);
+ Val = DAG.getNode(ISD::BITCAST, DL, SameEltTypeVT, Val);
+ }
+ Val = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PartVT, DAG.getUNDEF(PartVT),
+ Val, DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+ Parts[0] = Val;
+ return true;
+ }
+ }
+ return false;
+}
+
+SDValue RISCVTargetLowering::joinRegisterPartsIntoValue(
+ SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
+ bool IsABIRegCopy = CC.hasValue();
+ if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) {
+ SDValue Val = Parts[0];
+
+ // Cast the f32 to i32, truncate to i16, and cast back to f16.
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Val);
+ Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Val);
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::f16, Val);
+ return Val;
+ }
+
+ if (ValueVT.isScalableVector() && PartVT.isScalableVector()) {
+ LLVMContext &Context = *DAG.getContext();
+ SDValue Val = Parts[0];
+ EVT ValueEltVT = ValueVT.getVectorElementType();
+ EVT PartEltVT = PartVT.getVectorElementType();
+ unsigned ValueVTBitSize = ValueVT.getSizeInBits().getKnownMinSize();
+ unsigned PartVTBitSize = PartVT.getSizeInBits().getKnownMinSize();
+ if (PartVTBitSize % ValueVTBitSize == 0) {
+ EVT SameEltTypeVT = ValueVT;
+ // If the element types are different, convert it to the same element type
+ // of PartVT.
+ if (ValueEltVT != PartEltVT) {
+ unsigned Count = ValueVTBitSize / PartEltVT.getSizeInBits();
+ assert(Count != 0 && "The number of elements should not be zero.");
+ SameEltTypeVT =
+ EVT::getVectorVT(Context, PartEltVT, Count, /*IsScalable=*/true);
+ }
+ Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SameEltTypeVT, Val,
+ DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+ if (ValueEltVT != PartEltVT)
+ Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+ return Val;
+ }
+ }
+ return SDValue();
+}
+
#define GET_REGISTER_MATCHER
#include "RISCVGenAsmMatcher.inc"
@@ -4572,10 +9085,4 @@
} // namespace RISCVVIntrinsicsTable
-namespace RISCVZvlssegTable {
-
-#define GET_RISCVZvlssegTable_IMPL
-#include "RISCVGenSearchableTables.inc"
-
-} // namespace RISCVZvlssegTable
} // namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h b/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 40b1a45..0e71220 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -15,11 +15,13 @@
#define LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H
#include "RISCV.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
class RISCVSubtarget;
+struct RISCVRegisterInfo;
namespace RISCVISD {
enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
@@ -35,9 +37,12 @@
/// The lhs and rhs are XLenVT integers. The true and false values can be
/// integer or floating point.
SELECT_CC,
+ BR_CC,
BuildPairF64,
SplitF64,
TAIL,
+ // Multiply high for signed x unsigned.
+ MULHSU,
// RV64I shifts, directly matching the semantics of the named RISC-V
// instructions.
SLLW,
@@ -53,6 +58,14 @@
// instructions.
ROLW,
RORW,
+ // RV64IZbb bit counting instructions directly matching the semantics of the
+ // named RISC-V instructions.
+ CLZW,
+ CTZW,
+ // RV64IB/RV32IB funnel shifts, with the semantics of the named RISC-V
+ // instructions, but the same operand order as fshl/fshr intrinsics.
+ FSR,
+ FSL,
// RV64IB funnel shifts, with the semantics of the named RISC-V instructions,
// but the same operand order as fshl/fshr intrinsics.
FSRW,
@@ -71,46 +84,204 @@
FMV_X_ANYEXTH,
FMV_W_X_RV64,
FMV_X_ANYEXTW_RV64,
+ // FP to 32 bit int conversions for RV64. These are used to keep track of the
+ // result being sign extended to 64 bit.
+ FCVT_W_RV64,
+ FCVT_WU_RV64,
// READ_CYCLE_WIDE - A read of the 64-bit cycle CSR on a 32-bit target
// (returns (Lo, Hi)). It takes a chain operand.
READ_CYCLE_WIDE,
// Generalized Reverse and Generalized Or-Combine - directly matching the
// semantics of the named RISC-V instructions. Lowered as custom nodes as
// TableGen chokes when faced with commutative permutations in deeply-nested
- // DAGs. Each node takes an input operand and a TargetConstant immediate
- // shift amount, and outputs a bit-manipulated version of input. All operands
- // are of type XLenVT.
- GREVI,
- GREVIW,
- GORCI,
- GORCIW,
+ // DAGs. Each node takes an input operand and a control operand and outputs a
+ // bit-manipulated version of input. All operands are i32 or XLenVT.
+ GREV,
+ GREVW,
+ GORC,
+ GORCW,
+ SHFL,
+ SHFLW,
+ UNSHFL,
+ UNSHFLW,
+ // Bit Compress/Decompress implement the generic bit extract and bit deposit
+ // functions. This operation is also referred to as bit gather/scatter, bit
+ // pack/unpack, parallel extract/deposit, compress/expand, or right
+ // compress/right expand.
+ BCOMPRESS,
+ BCOMPRESSW,
+ BDECOMPRESS,
+ BDECOMPRESSW,
// Vector Extension
- // VMV_X_S matches the semantics of vmv.x.s. The result is always XLenVT
- // sign extended from the vector element size. NOTE: The result size will
- // never be less than the vector element size.
+ // VMV_V_X_VL matches the semantics of vmv.v.x but includes an extra operand
+ // for the VL value to be used for the operation.
+ VMV_V_X_VL,
+ // VFMV_V_F_VL matches the semantics of vfmv.v.f but includes an extra operand
+ // for the VL value to be used for the operation.
+ VFMV_V_F_VL,
+ // VMV_X_S matches the semantics of vmv.x.s. The result is always XLenVT sign
+ // extended from the vector element size.
VMV_X_S,
+ // VMV_S_X_VL matches the semantics of vmv.s.x. It carries a VL operand.
+ VMV_S_X_VL,
+ // VFMV_S_F_VL matches the semantics of vfmv.s.f. It carries a VL operand.
+ VFMV_S_F_VL,
// Splats an i64 scalar to a vector type (with element type i64) where the
// scalar is a sign-extended i32.
SPLAT_VECTOR_I64,
+ // Splats a 64-bit value that has been split into two i32 parts. This is
+ // expanded late to two scalar stores and a stride 0 vector load.
+ SPLAT_VECTOR_SPLIT_I64_VL,
// Read VLENB CSR
READ_VLENB,
- // Truncates a RVV integer vector by one power-of-two.
- TRUNCATE_VECTOR,
- // Unit-stride fault-only-first load
- VLEFF,
- VLEFF_MASK,
- // Unit-stride fault-only-first segment load
- VLSEGFF,
- VLSEGFF_MASK,
- // read vl CSR
- READ_VL,
+ // Truncates a RVV integer vector by one power-of-two. Carries both an extra
+ // mask and VL operand.
+ TRUNCATE_VECTOR_VL,
// Matches the semantics of vslideup/vslidedown. The first operand is the
- // pass-thru operand, the second is the source vector, and the third is the
- // XLenVT index (either constant or non-constant).
- VSLIDEUP,
- VSLIDEDOWN,
- // Matches the semantics of the unmasked vid.v instruction.
- VID,
+ // pass-thru operand, the second is the source vector, the third is the
+ // XLenVT index (either constant or non-constant), the fourth is the mask
+ // and the fifth the VL.
+ VSLIDEUP_VL,
+ VSLIDEDOWN_VL,
+ // Matches the semantics of vslide1up/vslide1down. The first operand is the
+ // source vector, the second is the XLenVT scalar value. The third and fourth
+ // operands are the mask and VL operands.
+ VSLIDE1UP_VL,
+ VSLIDE1DOWN_VL,
+ // Matches the semantics of the vid.v instruction, with a mask and VL
+ // operand.
+ VID_VL,
+ // Matches the semantics of the vfncvt.rod instruction (Convert double-width
+ // float to single-width float, rounding towards odd). Takes a double-width
+ // float vector and produces a single-width float vector. Also has a mask and
+ // VL operand.
+ VFNCVT_ROD_VL,
+ // These nodes match the semantics of the corresponding RVV vector reduction
+ // instructions. They produce a vector result which is the reduction
+ // performed over the first vector operand plus the first element of the
+ // second vector operand. The first operand is an unconstrained vector type,
+ // and the result and second operand's types are expected to be the
+ // corresponding full-width LMUL=1 type for the first operand:
+ // nxv8i8 = vecreduce_add nxv32i8, nxv8i8
+ // nxv2i32 = vecreduce_add nxv8i32, nxv2i32
+ // The difference in types does introduce extra vsetvli instructions, but
+ // it similarly reduces the number of registers consumed per reduction.
+ // Also has a mask and VL operand.
+ VECREDUCE_ADD_VL,
+ VECREDUCE_UMAX_VL,
+ VECREDUCE_SMAX_VL,
+ VECREDUCE_UMIN_VL,
+ VECREDUCE_SMIN_VL,
+ VECREDUCE_AND_VL,
+ VECREDUCE_OR_VL,
+ VECREDUCE_XOR_VL,
+ VECREDUCE_FADD_VL,
+ VECREDUCE_SEQ_FADD_VL,
+ VECREDUCE_FMIN_VL,
+ VECREDUCE_FMAX_VL,
+
+ // Vector binary and unary ops with a mask as a third operand, and VL as a
+ // fourth operand.
+ // FIXME: Can we replace these with ISD::VP_*?
+ ADD_VL,
+ AND_VL,
+ MUL_VL,
+ OR_VL,
+ SDIV_VL,
+ SHL_VL,
+ SREM_VL,
+ SRA_VL,
+ SRL_VL,
+ SUB_VL,
+ UDIV_VL,
+ UREM_VL,
+ XOR_VL,
+
+ SADDSAT_VL,
+ UADDSAT_VL,
+ SSUBSAT_VL,
+ USUBSAT_VL,
+
+ FADD_VL,
+ FSUB_VL,
+ FMUL_VL,
+ FDIV_VL,
+ FNEG_VL,
+ FABS_VL,
+ FSQRT_VL,
+ FMA_VL,
+ FCOPYSIGN_VL,
+ SMIN_VL,
+ SMAX_VL,
+ UMIN_VL,
+ UMAX_VL,
+ FMINNUM_VL,
+ FMAXNUM_VL,
+ MULHS_VL,
+ MULHU_VL,
+ FP_TO_SINT_VL,
+ FP_TO_UINT_VL,
+ SINT_TO_FP_VL,
+ UINT_TO_FP_VL,
+ FP_ROUND_VL,
+ FP_EXTEND_VL,
+
+ // Widening instructions
+ VWMUL_VL,
+ VWMULU_VL,
+
+ // Vector compare producing a mask. Fourth operand is input mask. Fifth
+ // operand is VL.
+ SETCC_VL,
+
+ // Vector select with an additional VL operand. This operation is unmasked.
+ VSELECT_VL,
+
+ // Mask binary operators.
+ VMAND_VL,
+ VMOR_VL,
+ VMXOR_VL,
+
+ // Set mask vector to all zeros or ones.
+ VMCLR_VL,
+ VMSET_VL,
+
+ // Matches the semantics of vrgather.vx and vrgather.vv with an extra operand
+ // for VL.
+ VRGATHER_VX_VL,
+ VRGATHER_VV_VL,
+ VRGATHEREI16_VV_VL,
+
+ // Vector sign/zero extend with additional mask & VL operands.
+ VSEXT_VL,
+ VZEXT_VL,
+
+ // vpopc.m with additional mask and VL operands.
+ VPOPC_VL,
+
+ // Reads the value of a CSR.
+ // The first operand is a chain pointer. The second specifies the address of
+ // the required CSR. Two results are produced: the read value and the new
+ // chain pointer.
+ READ_CSR,
+ // Writes a value to a CSR.
+ // The first operand is a chain pointer, the second specifies the address of
+ // the required CSR and the third is the value to write. The result is the
+ // new chain pointer.
+ WRITE_CSR,
+ // Reads and writes the value of a CSR.
+ // The first operand is a chain pointer, the second specifies the address of
+ // the required CSR and the third is the value to write. Two results are
+ // produced: the value read before the modification and the new chain pointer.
+ SWAP_CSR,
+
+ // Memory opcodes start here.
+ VLE_VL = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ VSE_VL,
+
+ // WARNING: Do not add anything in the end unless you want the node to
+ // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
+ // opcodes will be thought as target memory ops!
};
} // namespace RISCVISD
@@ -140,7 +311,27 @@
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
+ bool softPromoteHalfType() const override { return true; }
+
+ /// Return the register type for a given MVT, ensuring vectors are treated
+ /// as a series of gpr sized integers.
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
+ EVT VT) const override;
+
+ /// Return the number of registers for a given MVT, ensuring vectors are
+ /// treated as a series of gpr sized integers.
+ unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const override;
+
+ /// Return true if the given shuffle mask can be codegen'd directly, or if it
+ /// should be stack expanded.
+ bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
+
bool hasBitPreservingFPLogic(EVT VT) const override;
+ bool
+ shouldExpandBuildVectorWithShuffles(EVT VT,
+ unsigned DefinedValues) const override;
// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -193,9 +384,9 @@
bool shouldInsertFencesForAtomic(const Instruction *I) const override {
return isa<LoadInst>(I) || isa<StoreInst>(I);
}
- Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
@@ -266,25 +457,67 @@
TargetLowering::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
- Value *emitMaskedAtomicRMWIntrinsic(IRBuilder<> &Builder, AtomicRMWInst *AI,
+ Value *emitMaskedAtomicRMWIntrinsic(IRBuilderBase &Builder, AtomicRMWInst *AI,
Value *AlignedAddr, Value *Incr,
Value *Mask, Value *ShiftAmt,
AtomicOrdering Ord) const override;
TargetLowering::AtomicExpansionKind
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CI) const override;
- Value *emitMaskedAtomicCmpXchgIntrinsic(IRBuilder<> &Builder,
+ Value *emitMaskedAtomicCmpXchgIntrinsic(IRBuilderBase &Builder,
AtomicCmpXchgInst *CI,
Value *AlignedAddr, Value *CmpVal,
Value *NewVal, Value *Mask,
AtomicOrdering Ord) const override;
+ /// Returns true if the target allows unaligned memory accesses of the
+ /// specified type.
+ bool allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace = 0, Align Alignment = Align(1),
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ bool *Fast = nullptr) const override;
+
+ bool splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Val, SDValue *Parts,
+ unsigned NumParts, MVT PartVT,
+ Optional<CallingConv::ID> CC) const override;
+
+ SDValue
+ joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL,
+ const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT,
+ Optional<CallingConv::ID> CC) const override;
+
+ static RISCVII::VLMUL getLMUL(MVT VT);
+ static unsigned getRegClassIDForLMUL(RISCVII::VLMUL LMul);
+ static unsigned getSubregIndexByMVT(MVT VT, unsigned Index);
+ static unsigned getRegClassIDForVecVT(MVT VT);
+ static std::pair<unsigned, unsigned>
+ decomposeSubvectorInsertExtractToSubRegs(MVT VecVT, MVT SubVecVT,
+ unsigned InsertExtractIdx,
+ const RISCVRegisterInfo *TRI);
+ MVT getContainerForFixedLengthVector(MVT VT) const;
+
+ bool shouldRemoveExtendFromGSIndex(EVT VT) const override;
+
private:
+ /// RISCVCCAssignFn - This target-specific function extends the default
+ /// CCValAssign with additional information used to lower RISC-V calling
+ /// conventions.
+ typedef bool RISCVCCAssignFn(const DataLayout &DL, RISCVABI::ABI,
+ unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State,
+ bool IsFixed, bool IsRet, Type *OrigTy,
+ const RISCVTargetLowering &TLI,
+ Optional<unsigned> FirstMaskArgument);
+
void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- bool IsRet) const;
+ const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet,
+ RISCVCCAssignFn Fn) const;
void analyzeOutputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- bool IsRet, CallLoweringInfo *CLI) const;
+ bool IsRet, CallLoweringInfo *CLI,
+ RISCVCCAssignFn Fn) const;
template <class NodeTy>
SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const;
@@ -299,12 +532,14 @@
SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const;
- SDValue lowerSPLATVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSPLAT_VECTOR_PARTS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVectorMaskSplat(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG,
int64_t ExtTrueVal) const;
SDValue lowerVectorMaskTrunc(SDValue Op, SelectionDAG &DAG) const;
@@ -312,6 +547,39 @@
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVectorMaskVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSTEP_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVECTOR_REVERSE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerMLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerMSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op,
+ SelectionDAG &DAG) const;
+ SDValue lowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFixedLengthVectorSetccToRVV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFixedLengthVectorLogicOpToRVV(SDValue Op, SelectionDAG &DAG,
+ unsigned MaskOpc,
+ unsigned VecOpc) const;
+ SDValue lowerFixedLengthVectorShiftToRVV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFixedLengthVectorSelectToRVV(SDValue Op,
+ SelectionDAG &DAG) const;
+ SDValue lowerToScalableOp(SDValue Op, SelectionDAG &DAG, unsigned NewOpc,
+ bool HasMask = true) const;
+ SDValue lowerVPOp(SDValue Op, SelectionDAG &DAG, unsigned RISCVISDOpc) const;
+ SDValue lowerFixedLengthVectorExtendToRVV(SDValue Op, SelectionDAG &DAG,
+ unsigned ExtendOpc) const;
+ SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(
CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
@@ -322,13 +590,30 @@
void validateCCReservedRegs(
const SmallVectorImpl<std::pair<llvm::Register, llvm::SDValue>> &Regs,
MachineFunction &MF) const;
+
+ bool useRVVForFixedLengthVectorVT(MVT VT) const;
+
+ MVT getVPExplicitVectorLengthTy() const override;
+
+ /// RVV code generation for fixed length vectors does not lower all
+ /// BUILD_VECTORs. This makes BUILD_VECTOR legalisation a source of stores to
+ /// merge. However, merging them creates a BUILD_VECTOR that is just as
+ /// illegal as the original, thus leading to an infinite legalisation loop.
+ /// NOTE: Once BUILD_VECTOR can be custom lowered for all legal vector types,
+ /// this override can be removed.
+ bool mergeStoresAfterLegalization(EVT VT) const override;
};
+namespace RISCV {
+// We use 64 bits as the known part in the scalable vector types.
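+// That is, each unit of vscale corresponds to 64 bits of a vector register, so
+// VLEN == vscale * RVVBitsPerBlock.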
+static constexpr unsigned RVVBitsPerBlock = 64;
+} // namespace RISCV
+
namespace RISCVVIntrinsicsTable {
struct RISCVVIntrinsicInfo {
- unsigned int IntrinsicID;
- unsigned int ExtendedOperand;
+ unsigned IntrinsicID;
+ uint8_t SplatOperand;
};
using namespace RISCV;
@@ -338,22 +623,6 @@
} // end namespace RISCVVIntrinsicsTable
-namespace RISCVZvlssegTable {
-
-struct RISCVZvlsseg {
- unsigned int IntrinsicID;
- unsigned int SEW;
- unsigned int LMUL;
- unsigned int IndexLMUL;
- unsigned int Pseudo;
-};
-
-using namespace RISCV;
-
-#define GET_RISCVZvlssegTable_DECL
-#include "RISCVGenSearchableTables.inc"
-
-} // namespace RISCVZvlssegTable
-}
+} // end namespace llvm
#endif
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
new file mode 100644
index 0000000..fb7cb408
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -0,0 +1,745 @@
+//===- RISCVInsertVSETVLI.cpp - Insert VSETVLI instructions ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a function pass that inserts VSETVLI instructions where
+// needed.
+//
+// This pass consists of 3 phases:
+//
+// Phase 1 collects how each basic block affects VL/VTYPE.
+//
+// Phase 2 uses the information from phase 1 to do a data flow analysis to
+// propagate the VL/VTYPE changes through the function. This gives us the
+// VL/VTYPE at the start of each basic block.
+//
+// Phase 3 inserts VSETVLI instructions in each basic block. Information from
+// phase 2 is used to prevent inserting a VSETVLI before the first vector
+// instruction in the block if possible.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVSubtarget.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include <queue>
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-insert-vsetvli"
+#define RISCV_INSERT_VSETVLI_NAME "RISCV Insert VSETVLI pass"
+
+static cl::opt<bool> DisableInsertVSETVLPHIOpt(
+ "riscv-disable-insert-vsetvl-phi-opt", cl::init(false), cl::Hidden,
+ cl::desc("Disable looking through phis when inserting vsetvlis."));
+
+namespace {
+
+class VSETVLIInfo {
+ union {
+ Register AVLReg;
+ unsigned AVLImm;
+ };
+
+ enum : uint8_t {
+ Uninitialized,
+ AVLIsReg,
+ AVLIsImm,
+ Unknown,
+ } State = Uninitialized;
+
+ // Fields from VTYPE.
+ RISCVII::VLMUL VLMul = RISCVII::LMUL_1;
+ uint8_t SEW = 0;
+ uint8_t TailAgnostic : 1;
+ uint8_t MaskAgnostic : 1;
+ uint8_t MaskRegOp : 1;
+ uint8_t SEWLMULRatioOnly : 1;
+
+public:
+ VSETVLIInfo()
+ : AVLImm(0), TailAgnostic(false), MaskAgnostic(false), MaskRegOp(false),
+ SEWLMULRatioOnly(false) {}
+
+ static VSETVLIInfo getUnknown() {
+ VSETVLIInfo Info;
+ Info.setUnknown();
+ return Info;
+ }
+
+ bool isValid() const { return State != Uninitialized; }
+ void setUnknown() { State = Unknown; }
+ bool isUnknown() const { return State == Unknown; }
+
+ void setAVLReg(Register Reg) {
+ AVLReg = Reg;
+ State = AVLIsReg;
+ }
+
+ void setAVLImm(unsigned Imm) {
+ AVLImm = Imm;
+ State = AVLIsImm;
+ }
+
+ bool hasAVLImm() const { return State == AVLIsImm; }
+ bool hasAVLReg() const { return State == AVLIsReg; }
+ Register getAVLReg() const {
+ assert(hasAVLReg());
+ return AVLReg;
+ }
+ unsigned getAVLImm() const {
+ assert(hasAVLImm());
+ return AVLImm;
+ }
+
+ bool hasSameAVL(const VSETVLIInfo &Other) const {
+ assert(isValid() && Other.isValid() &&
+ "Can't compare invalid VSETVLIInfos");
+ assert(!isUnknown() && !Other.isUnknown() &&
+ "Can't compare AVL in unknown state");
+ if (hasAVLReg() && Other.hasAVLReg())
+ return getAVLReg() == Other.getAVLReg();
+
+ if (hasAVLImm() && Other.hasAVLImm())
+ return getAVLImm() == Other.getAVLImm();
+
+ return false;
+ }
+
+ void setVTYPE(unsigned VType) {
+ assert(isValid() && !isUnknown() &&
+ "Can't set VTYPE for uninitialized or unknown");
+ VLMul = RISCVVType::getVLMUL(VType);
+ SEW = RISCVVType::getSEW(VType);
+ TailAgnostic = RISCVVType::isTailAgnostic(VType);
+ MaskAgnostic = RISCVVType::isMaskAgnostic(VType);
+ }
+ void setVTYPE(RISCVII::VLMUL L, unsigned S, bool TA, bool MA, bool MRO) {
+ assert(isValid() && !isUnknown() &&
+ "Can't set VTYPE for uninitialized or unknown");
+ VLMul = L;
+ SEW = S;
+ TailAgnostic = TA;
+ MaskAgnostic = MA;
+ MaskRegOp = MRO;
+ }
+
+ unsigned encodeVTYPE() const {
+ assert(isValid() && !isUnknown() && !SEWLMULRatioOnly &&
+ "Can't encode VTYPE for uninitialized or unknown");
+ return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic);
+ }
+
+ bool hasSEWLMULRatioOnly() const { return SEWLMULRatioOnly; }
+
+ bool hasSameVTYPE(const VSETVLIInfo &Other) const {
+ assert(isValid() && Other.isValid() &&
+ "Can't compare invalid VSETVLIInfos");
+ assert(!isUnknown() && !Other.isUnknown() &&
+ "Can't compare VTYPE in unknown state");
+ assert(!SEWLMULRatioOnly && !Other.SEWLMULRatioOnly &&
+ "Can't compare when only LMUL/SEW ratio is valid.");
+ return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic) ==
+ std::tie(Other.VLMul, Other.SEW, Other.TailAgnostic,
+ Other.MaskAgnostic);
+ }
+
+ // Convert VLMUL to a fixed point value with 3 bits of fraction.
+ unsigned getSEWLMULRatio() const {
+ assert(isValid() && !isUnknown() &&
+ "Can't use VTYPE for uninitialized or unknown");
+ unsigned LMul;
+ bool Fractional;
+ std::tie(LMul, Fractional) = RISCVVType::decodeVLMUL(VLMul);
+
+ // Convert LMul to a fixed point value with 3 fractional bits.
+ LMul = Fractional ? (8 / LMul) : (LMul * 8);
+
+ assert(SEW >= 8 && "Unexpected SEW value");
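+    // E.g. SEW=16 with LMUL=1/2 gives a fixed-point LMul of 4, so the ratio is
+    // (16 * 8) / 4 = 32, matching the real SEW/LMUL ratio of 16 / 0.5.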
+ return (SEW * 8) / LMul;
+ }
+
+ // Check if the VTYPE for these two VSETVLIInfos produce the same VLMAX.
+ bool hasSameVLMAX(const VSETVLIInfo &Other) const {
+ assert(isValid() && Other.isValid() &&
+ "Can't compare invalid VSETVLIInfos");
+ assert(!isUnknown() && !Other.isUnknown() &&
+ "Can't compare VTYPE in unknown state");
+ return getSEWLMULRatio() == Other.getSEWLMULRatio();
+ }
+
+  // Determine whether the vector instruction's requirements, represented by
+  // InstrInfo, are compatible with the previous vsetvli instruction
+  // represented by this object.
+ bool isCompatible(const VSETVLIInfo &InstrInfo) const {
+ assert(isValid() && InstrInfo.isValid() &&
+ "Can't compare invalid VSETVLIInfos");
+ assert(!InstrInfo.SEWLMULRatioOnly &&
+ "Expected a valid VTYPE for instruction!");
+ // Nothing is compatible with Unknown.
+ if (isUnknown() || InstrInfo.isUnknown())
+ return false;
+
+ // If only our VLMAX ratio is valid, then this isn't compatible.
+ if (SEWLMULRatioOnly)
+ return false;
+
+ // If the instruction doesn't need an AVLReg and the SEW matches, consider
+ // it compatible.
+ if (InstrInfo.hasAVLReg() && InstrInfo.AVLReg == RISCV::NoRegister) {
+ if (SEW == InstrInfo.SEW)
+ return true;
+ }
+
+    // VTYPEs must match unless the instruction is a mask reg operation, in
+    // which case it only cares about VLMAX.
+ // FIXME: Mask reg operations are probably ok if "this" VLMAX is larger
+ // than "InstrInfo".
+ if (!hasSameVTYPE(InstrInfo) &&
+ !(InstrInfo.MaskRegOp && hasSameVLMAX(InstrInfo) &&
+ TailAgnostic == InstrInfo.TailAgnostic &&
+ MaskAgnostic == InstrInfo.MaskAgnostic))
+ return false;
+
+ return hasSameAVL(InstrInfo);
+ }
+
+ bool operator==(const VSETVLIInfo &Other) const {
+ // Uninitialized is only equal to another Uninitialized.
+ if (!isValid())
+ return !Other.isValid();
+ if (!Other.isValid())
+ return !isValid();
+
+ // Unknown is only equal to another Unknown.
+ if (isUnknown())
+ return Other.isUnknown();
+ if (Other.isUnknown())
+ return isUnknown();
+
+ if (!hasSameAVL(Other))
+ return false;
+
+ // If only the VLMAX is valid, check that it is the same.
+ if (SEWLMULRatioOnly && Other.SEWLMULRatioOnly)
+ return hasSameVLMAX(Other);
+
+ // If the full VTYPE is valid, check that it is the same.
+ if (!SEWLMULRatioOnly && !Other.SEWLMULRatioOnly)
+ return hasSameVTYPE(Other);
+
+ // If the SEWLMULRatioOnly bits are different, then they aren't equal.
+ return false;
+ }
+
+ // Calculate the VSETVLIInfo visible to a block assuming this and Other are
+ // both predecessors.
+ VSETVLIInfo intersect(const VSETVLIInfo &Other) const {
+ // If the new value isn't valid, ignore it.
+ if (!Other.isValid())
+ return *this;
+
+    // If this value isn't valid, this must be the first predecessor; use it.
+ if (!isValid())
+ return Other;
+
+ // If either is unknown, the result is unknown.
+ if (isUnknown() || Other.isUnknown())
+ return VSETVLIInfo::getUnknown();
+
+    // If we have an exact match, return this.
+ if (*this == Other)
+ return *this;
+
+ // Not an exact match, but maybe the AVL and VLMAX are the same. If so,
+ // return an SEW/LMUL ratio only value.
+ if (hasSameAVL(Other) && hasSameVLMAX(Other)) {
+ VSETVLIInfo MergeInfo = *this;
+ MergeInfo.SEWLMULRatioOnly = true;
+ return MergeInfo;
+ }
+
+ // Otherwise the result is unknown.
+ return VSETVLIInfo::getUnknown();
+ }
+
+  // Calculate the VSETVLIInfo visible at the end of the block assuming this
+  // is the predecessor value, and Other is the change made by this block.
+ VSETVLIInfo merge(const VSETVLIInfo &Other) const {
+ assert(isValid() && "Can only merge with a valid VSETVLInfo");
+
+ // Nothing changed from the predecessor, keep it.
+ if (!Other.isValid())
+ return *this;
+
+ // If the change is compatible with the input, we won't create a VSETVLI
+ // and should keep the predecessor.
+ if (isCompatible(Other))
+ return *this;
+
+ // Otherwise just use whatever is in this block.
+ return Other;
+ }
+};
+
+struct BlockData {
+ // The VSETVLIInfo that represents the net changes to the VL/VTYPE registers
+ // made by this block. Calculated in Phase 1.
+ VSETVLIInfo Change;
+
+ // The VSETVLIInfo that represents the VL/VTYPE settings on exit from this
+ // block. Calculated in Phase 2.
+ VSETVLIInfo Exit;
+
+ // The VSETVLIInfo that represents the VL/VTYPE settings from all predecessor
+ // blocks. Calculated in Phase 2, and used by Phase 3.
+ VSETVLIInfo Pred;
+
+ // Keeps track of whether the block is already in the queue.
+ bool InQueue = false;
+
+ BlockData() {}
+};
+
+class RISCVInsertVSETVLI : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+
+ std::vector<BlockData> BlockInfo;
+ std::queue<const MachineBasicBlock *> WorkList;
+
+public:
+ static char ID;
+
+ RISCVInsertVSETVLI() : MachineFunctionPass(ID) {
+ initializeRISCVInsertVSETVLIPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return RISCV_INSERT_VSETVLI_NAME; }
+
+private:
+ bool needVSETVLI(const VSETVLIInfo &Require, const VSETVLIInfo &CurInfo);
+ bool needVSETVLIPHI(const VSETVLIInfo &Require, const MachineBasicBlock &MBB);
+ void insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI,
+ const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo);
+
+ bool computeVLVTYPEChanges(const MachineBasicBlock &MBB);
+ void computeIncomingVLVTYPE(const MachineBasicBlock &MBB);
+ void emitVSETVLIs(MachineBasicBlock &MBB);
+};
+
+} // end anonymous namespace
+
+char RISCVInsertVSETVLI::ID = 0;
+
+INITIALIZE_PASS(RISCVInsertVSETVLI, DEBUG_TYPE, RISCV_INSERT_VSETVLI_NAME,
+ false, false)
+
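+// Walk through a chain of full COPYs to find the underlying defining
+// instruction, returning nullptr if a copy source is not a virtual register.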
+static MachineInstr *elideCopies(MachineInstr *MI,
+ const MachineRegisterInfo *MRI) {
+ while (true) {
+ if (!MI->isFullCopy())
+ return MI;
+ if (!Register::isVirtualRegister(MI->getOperand(1).getReg()))
+ return nullptr;
+ MI = MRI->getVRegDef(MI->getOperand(1).getReg());
+ if (!MI)
+ return nullptr;
+ }
+}
+
+static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
+ const MachineRegisterInfo *MRI) {
+ VSETVLIInfo InstrInfo;
+ unsigned NumOperands = MI.getNumExplicitOperands();
+
+ RISCVII::VLMUL VLMul = RISCVII::getLMul(TSFlags);
+
+ unsigned Log2SEW = MI.getOperand(NumOperands - 1).getImm();
+ // A Log2SEW of 0 is an operation on mask registers only.
+ bool MaskRegOp = Log2SEW == 0;
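+  // Mask-register operations carry no real element width; fall back to SEW=8,
+  // the smallest legal value, as a placeholder.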
+ unsigned SEW = Log2SEW ? 1 << Log2SEW : 8;
+ assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
+
+  // Default to tail agnostic unless the destination is tied to a source. If it
+  // is tied to a non-undef source, the user has some control over the tail
+  // values, so the tail must be preserved. Some pseudo instructions force a
+  // tail agnostic policy despite having a tied def.
+ bool ForceTailAgnostic = RISCVII::doesForceTailAgnostic(TSFlags);
+ bool TailAgnostic = true;
+ unsigned UseOpIdx;
+ if (!ForceTailAgnostic && MI.isRegTiedToUseOperand(0, &UseOpIdx)) {
+ TailAgnostic = false;
+ // If the tied operand is an IMPLICIT_DEF we can keep TailAgnostic.
+ const MachineOperand &UseMO = MI.getOperand(UseOpIdx);
+ MachineInstr *UseMI = MRI->getVRegDef(UseMO.getReg());
+ if (UseMI) {
+ UseMI = elideCopies(UseMI, MRI);
+ if (UseMI && UseMI->isImplicitDef())
+ TailAgnostic = true;
+ }
+ }
+
+ if (RISCVII::hasVLOp(TSFlags)) {
+ const MachineOperand &VLOp = MI.getOperand(MI.getNumExplicitOperands() - 2);
+ if (VLOp.isImm())
+ InstrInfo.setAVLImm(VLOp.getImm());
+ else
+ InstrInfo.setAVLReg(VLOp.getReg());
+ } else
+ InstrInfo.setAVLReg(RISCV::NoRegister);
+ InstrInfo.setVTYPE(VLMul, SEW, /*TailAgnostic*/ TailAgnostic,
+ /*MaskAgnostic*/ false, MaskRegOp);
+
+ return InstrInfo;
+}
+
+void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI,
+ const VSETVLIInfo &Info,
+ const VSETVLIInfo &PrevInfo) {
+ DebugLoc DL = MI.getDebugLoc();
+
+ // Use X0, X0 form if the AVL is the same and the SEW+LMUL gives the same
+ // VLMAX.
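+  // The x0, x0 form of vsetvli updates VTYPE while preserving the current VL;
+  // it is only used here because the unchanged VLMAX keeps that VL valid.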
+ if (PrevInfo.isValid() && !PrevInfo.isUnknown() &&
+ Info.hasSameAVL(PrevInfo) && Info.hasSameVLMAX(PrevInfo)) {
+ BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETVLI))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(Info.encodeVTYPE())
+ .addReg(RISCV::VL, RegState::Implicit);
+ return;
+ }
+
+ if (Info.hasAVLImm()) {
+ BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETIVLI))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addImm(Info.getAVLImm())
+ .addImm(Info.encodeVTYPE());
+ return;
+ }
+
+ Register AVLReg = Info.getAVLReg();
+ if (AVLReg == RISCV::NoRegister) {
+ // We can only use x0, x0 if there's no chance of the vtype change causing
+ // the previous vl to become invalid.
+ if (PrevInfo.isValid() && !PrevInfo.isUnknown() &&
+ Info.hasSameVLMAX(PrevInfo)) {
+ BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETVLI))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(Info.encodeVTYPE())
+ .addReg(RISCV::VL, RegState::Implicit);
+ return;
+ }
+ // Otherwise use an AVL of 0 to avoid depending on previous vl.
+ BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETIVLI))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addImm(0)
+ .addImm(Info.encodeVTYPE());
+ return;
+ }
+
+ // Use X0 as the DestReg unless AVLReg is X0.
+ Register DestReg = RISCV::X0;
+ if (AVLReg == RISCV::X0)
+ DestReg = MRI->createVirtualRegister(&RISCV::GPRRegClass);
+ BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETVLI))
+ .addReg(DestReg, RegState::Define | RegState::Dead)
+ .addReg(AVLReg)
+ .addImm(Info.encodeVTYPE());
+}
+
+// Return a VSETVLIInfo representing the changes made by this VSETVLI or
+// VSETIVLI instruction.
+static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) {
+ VSETVLIInfo NewInfo;
+ if (MI.getOpcode() == RISCV::PseudoVSETVLI) {
+ Register AVLReg = MI.getOperand(1).getReg();
+ assert((AVLReg != RISCV::X0 || MI.getOperand(0).getReg() != RISCV::X0) &&
+ "Can't handle X0, X0 vsetvli yet");
+ NewInfo.setAVLReg(AVLReg);
+ } else {
+ assert(MI.getOpcode() == RISCV::PseudoVSETIVLI);
+ NewInfo.setAVLImm(MI.getOperand(1).getImm());
+ }
+ NewInfo.setVTYPE(MI.getOperand(2).getImm());
+
+ return NewInfo;
+}
+
+bool RISCVInsertVSETVLI::needVSETVLI(const VSETVLIInfo &Require,
+ const VSETVLIInfo &CurInfo) {
+ if (CurInfo.isCompatible(Require))
+ return false;
+
+ // We didn't find a compatible value. If our AVL is a virtual register,
+ // it might be defined by a VSET(I)VLI. If it has the same VTYPE we need
+ // and the last VL/VTYPE we observed is the same, we don't need a
+ // VSETVLI here.
+ if (!CurInfo.isUnknown() && Require.hasAVLReg() &&
+ Require.getAVLReg().isVirtual() && !CurInfo.hasSEWLMULRatioOnly() &&
+ Require.hasSameVTYPE(CurInfo)) {
+ if (MachineInstr *DefMI = MRI->getVRegDef(Require.getAVLReg())) {
+ if (DefMI->getOpcode() == RISCV::PseudoVSETVLI ||
+ DefMI->getOpcode() == RISCV::PseudoVSETIVLI) {
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
+ if (DefInfo.hasSameAVL(CurInfo) && DefInfo.hasSameVTYPE(CurInfo))
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB) {
+ bool HadVectorOp = false;
+
+ BlockData &BBInfo = BlockInfo[MBB.getNumber()];
+ for (const MachineInstr &MI : MBB) {
+ // If this is an explicit VSETVLI or VSETIVLI, update our state.
+ if (MI.getOpcode() == RISCV::PseudoVSETVLI ||
+ MI.getOpcode() == RISCV::PseudoVSETIVLI) {
+ HadVectorOp = true;
+ BBInfo.Change = getInfoForVSETVLI(MI);
+ continue;
+ }
+
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+ if (RISCVII::hasSEWOp(TSFlags)) {
+ HadVectorOp = true;
+
+ VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, MRI);
+
+ if (!BBInfo.Change.isValid()) {
+ BBInfo.Change = NewInfo;
+ } else {
+ // If this instruction isn't compatible with the previous VL/VTYPE
+ // we need to insert a VSETVLI.
+ if (needVSETVLI(NewInfo, BBInfo.Change))
+ BBInfo.Change = NewInfo;
+ }
+ }
+
+ // If this is something that updates VL/VTYPE that we don't know about, set
+ // the state to unknown.
+ if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) ||
+ MI.modifiesRegister(RISCV::VTYPE)) {
+ BBInfo.Change = VSETVLIInfo::getUnknown();
+ }
+ }
+
+ // Initial exit state is whatever change we found in the block.
+ BBInfo.Exit = BBInfo.Change;
+
+ return HadVectorOp;
+}
+
+void RISCVInsertVSETVLI::computeIncomingVLVTYPE(const MachineBasicBlock &MBB) {
+ BlockData &BBInfo = BlockInfo[MBB.getNumber()];
+
+ BBInfo.InQueue = false;
+
+ VSETVLIInfo InInfo;
+ if (MBB.pred_empty()) {
+ // There are no predecessors, so use the default starting status.
+ InInfo.setUnknown();
+ } else {
+ for (MachineBasicBlock *P : MBB.predecessors())
+ InInfo = InInfo.intersect(BlockInfo[P->getNumber()].Exit);
+ }
+
+ // If we don't have any valid predecessor value, wait until we do.
+ if (!InInfo.isValid())
+ return;
+
+ BBInfo.Pred = InInfo;
+
+ VSETVLIInfo TmpStatus = BBInfo.Pred.merge(BBInfo.Change);
+
+ // If the new exit value matches the old exit value, we don't need to revisit
+ // any blocks.
+ if (BBInfo.Exit == TmpStatus)
+ return;
+
+ BBInfo.Exit = TmpStatus;
+
+ // Add the successors to the work list so we can propagate the changed exit
+ // status.
+ for (MachineBasicBlock *S : MBB.successors())
+ if (!BlockInfo[S->getNumber()].InQueue)
+ WorkList.push(S);
+}
+
+// If we weren't able to prove a vsetvli was directly unneeded, it might still
+// be unneeded if the AVL is a phi node where all incoming values are VL
+// outputs from the last VSETVLI in their respective basic blocks.
+bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
+ const MachineBasicBlock &MBB) {
+ if (DisableInsertVSETVLPHIOpt)
+ return true;
+
+ if (!Require.hasAVLReg())
+ return true;
+
+ Register AVLReg = Require.getAVLReg();
+ if (!AVLReg.isVirtual())
+ return true;
+
+  // We need the AVL to be produced by a PHI node in this basic block.
+ MachineInstr *PHI = MRI->getVRegDef(AVLReg);
+ if (!PHI || PHI->getOpcode() != RISCV::PHI || PHI->getParent() != &MBB)
+ return true;
+
+ for (unsigned PHIOp = 1, NumOps = PHI->getNumOperands(); PHIOp != NumOps;
+ PHIOp += 2) {
+ Register InReg = PHI->getOperand(PHIOp).getReg();
+ MachineBasicBlock *PBB = PHI->getOperand(PHIOp + 1).getMBB();
+ const BlockData &PBBInfo = BlockInfo[PBB->getNumber()];
+    // If the exit from the predecessor has the VTYPE we are looking for,
+    // we might be able to avoid a VSETVLI.
+ if (PBBInfo.Exit.isUnknown() || !PBBInfo.Exit.hasSameVTYPE(Require))
+ return true;
+
+    // We need the PHI input to be the output of a VSET(I)VLI.
+ MachineInstr *DefMI = MRI->getVRegDef(InReg);
+ if (!DefMI || (DefMI->getOpcode() != RISCV::PseudoVSETVLI &&
+ DefMI->getOpcode() != RISCV::PseudoVSETIVLI))
+ return true;
+
+    // We found a VSET(I)VLI; make sure it matches the output of the
+    // predecessor block.
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
+ if (!DefInfo.hasSameAVL(PBBInfo.Exit) ||
+ !DefInfo.hasSameVTYPE(PBBInfo.Exit))
+ return true;
+ }
+
+ // If all the incoming values to the PHI checked out, we don't need
+ // to insert a VSETVLI.
+ return false;
+}
+
+void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
+ VSETVLIInfo CurInfo;
+
+ for (MachineInstr &MI : MBB) {
+ // If this is an explicit VSETVLI or VSETIVLI, update our state.
+ if (MI.getOpcode() == RISCV::PseudoVSETVLI ||
+ MI.getOpcode() == RISCV::PseudoVSETIVLI) {
+ // Conservatively, mark the VL and VTYPE as live.
+ assert(MI.getOperand(3).getReg() == RISCV::VL &&
+ MI.getOperand(4).getReg() == RISCV::VTYPE &&
+ "Unexpected operands where VL and VTYPE should be");
+ MI.getOperand(3).setIsDead(false);
+ MI.getOperand(4).setIsDead(false);
+ CurInfo = getInfoForVSETVLI(MI);
+ continue;
+ }
+
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+ if (RISCVII::hasSEWOp(TSFlags)) {
+ VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, MRI);
+ if (RISCVII::hasVLOp(TSFlags)) {
+ MachineOperand &VLOp = MI.getOperand(MI.getNumExplicitOperands() - 2);
+ if (VLOp.isReg()) {
+ // Erase the AVL operand from the instruction.
+ VLOp.setReg(RISCV::NoRegister);
+ VLOp.setIsKill(false);
+ }
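+        // The instruction executes under the VL configured by the governing
+        // vsetvli, so record VL as an implicit use.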
+ MI.addOperand(MachineOperand::CreateReg(RISCV::VL, /*isDef*/ false,
+ /*isImp*/ true));
+ }
+ MI.addOperand(MachineOperand::CreateReg(RISCV::VTYPE, /*isDef*/ false,
+ /*isImp*/ true));
+
+ if (!CurInfo.isValid()) {
+ // We haven't found any vector instructions or VL/VTYPE changes yet,
+ // use the predecessor information.
+ assert(BlockInfo[MBB.getNumber()].Pred.isValid() &&
+ "Expected a valid predecessor state.");
+ if (needVSETVLI(NewInfo, BlockInfo[MBB.getNumber()].Pred) &&
+ needVSETVLIPHI(NewInfo, MBB)) {
+ insertVSETVLI(MBB, MI, NewInfo, BlockInfo[MBB.getNumber()].Pred);
+ CurInfo = NewInfo;
+ }
+ } else {
+ // If this instruction isn't compatible with the previous VL/VTYPE
+ // we need to insert a VSETVLI.
+ if (needVSETVLI(NewInfo, CurInfo)) {
+ insertVSETVLI(MBB, MI, NewInfo, CurInfo);
+ CurInfo = NewInfo;
+ }
+ }
+ }
+
+    // If this is something that updates VL/VTYPE that we don't know about, set
+    // the state to unknown.
+ if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) ||
+ MI.modifiesRegister(RISCV::VTYPE)) {
+ CurInfo = VSETVLIInfo::getUnknown();
+ }
+ }
+}
+
+bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
+ // Skip if the vector extension is not enabled.
+ const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
+ if (!ST.hasStdExtV())
+ return false;
+
+ TII = ST.getInstrInfo();
+ MRI = &MF.getRegInfo();
+
+ assert(BlockInfo.empty() && "Expect empty block infos");
+ BlockInfo.resize(MF.getNumBlockIDs());
+
+ bool HaveVectorOp = false;
+
+  // Phase 1 - determine how VL/VTYPE are affected by each block.
+ for (const MachineBasicBlock &MBB : MF)
+ HaveVectorOp |= computeVLVTYPEChanges(MBB);
+
+ // If we didn't find any instructions that need VSETVLI, we're done.
+ if (HaveVectorOp) {
+ // Phase 2 - determine the exit VL/VTYPE from each block. We add all
+ // blocks to the list here, but will also add any that need to be revisited
+ // during Phase 2 processing.
+ for (const MachineBasicBlock &MBB : MF) {
+ WorkList.push(&MBB);
+ BlockInfo[MBB.getNumber()].InQueue = true;
+ }
+ while (!WorkList.empty()) {
+ const MachineBasicBlock &MBB = *WorkList.front();
+ WorkList.pop();
+ computeIncomingVLVTYPE(MBB);
+ }
+
+ // Phase 3 - add any vsetvli instructions needed in the block. Use the
+ // Phase 2 information to avoid adding vsetvlis before the first vector
+ // instruction in the block if the VL/VTYPE is satisfied by its
+ // predecessors.
+ for (MachineBasicBlock &MBB : MF)
+ emitVSETVLIs(MBB);
+ }
+
+ BlockInfo.clear();
+
+ return HaveVectorOp;
+}
+
+/// Returns an instance of the Insert VSETVLI pass.
+FunctionPass *llvm::createRISCVInsertVSETVLIPass() {
+ return new RISCVInsertVSETVLI();
+}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index 7be74b7..8e9d245 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -167,8 +167,8 @@
bit HasDummyMask = 0;
let TSFlags{11} = HasDummyMask;
- bit WritesElement0 = 0;
- let TSFlags{12} = WritesElement0;
+ bit ForceTailAgnostic = false;
+ let TSFlags{12} = ForceTailAgnostic;
bit HasMergeOp = 0;
let TSFlags{13} = HasMergeOp;
@@ -236,8 +236,25 @@
let Opcode = opcode.Value;
}
-class RVInstR4<bits<2> funct2, RISCVOpcode opcode, dag outs, dag ins,
- string opcodestr, string argstr>
+class RVInstR4<bits<2> funct2, bits<3> funct3, RISCVOpcode opcode, dag outs,
+ dag ins, string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR4> {
+ bits<5> rs3;
+ bits<5> rs2;
+ bits<5> rs1;
+ bits<5> rd;
+
+ let Inst{31-27} = rs3;
+ let Inst{26-25} = funct2;
+ let Inst{24-20} = rs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-7} = rd;
+ let Opcode = opcode.Value;
+}
+
+class RVInstR4Frm<bits<2> funct2, RISCVOpcode opcode, dag outs, dag ins,
+ string opcodestr, string argstr>
: RVInst<outs, ins, opcodestr, argstr, [], InstFormatR4> {
bits<5> rs3;
bits<5> rs2;
@@ -302,16 +319,15 @@
let Opcode = opcode.Value;
}
-class RVInstIShift<bit arithshift, bits<3> funct3, RISCVOpcode opcode,
+class RVInstIShift<bits<5> imm11_7, bits<3> funct3, RISCVOpcode opcode,
dag outs, dag ins, string opcodestr, string argstr>
: RVInst<outs, ins, opcodestr, argstr, [], InstFormatI> {
bits<6> shamt;
bits<5> rs1;
bits<5> rd;
- let Inst{31} = 0;
- let Inst{30} = arithshift;
- let Inst{29-26} = 0;
+ let Inst{31-27} = imm11_7;
+ let Inst{26} = 0;
let Inst{25-20} = shamt;
let Inst{19-15} = rs1;
let Inst{14-12} = funct3;
@@ -319,16 +335,14 @@
let Opcode = opcode.Value;
}
-class RVInstIShiftW<bit arithshift, bits<3> funct3, RISCVOpcode opcode,
+class RVInstIShiftW<bits<7> imm11_5, bits<3> funct3, RISCVOpcode opcode,
dag outs, dag ins, string opcodestr, string argstr>
: RVInst<outs, ins, opcodestr, argstr, [], InstFormatI> {
bits<5> shamt;
bits<5> rs1;
bits<5> rd;
- let Inst{31} = 0;
- let Inst{30} = arithshift;
- let Inst{29-25} = 0;
+ let Inst{31-25} = imm11_5;
let Inst{24-20} = shamt;
let Inst{19-15} = rs1;
let Inst{14-12} = funct3;
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 45a5e10..a541daa 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -13,14 +13,18 @@
#include "RISCVInstrInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "RISCV.h"
+#include "RISCVMachineFunctionInfo.h"
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
@@ -32,10 +36,30 @@
#define GET_INSTRINFO_CTOR_DTOR
#include "RISCVGenInstrInfo.inc"
+namespace llvm {
+namespace RISCVVPseudosTable {
+
+using namespace RISCV;
+
+#define GET_RISCVVPseudosTable_IMPL
+#include "RISCVGenSearchableTables.inc"
+
+} // namespace RISCVVPseudosTable
+} // namespace llvm
+
RISCVInstrInfo::RISCVInstrInfo(RISCVSubtarget &STI)
: RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP),
STI(STI) {}
+MCInst RISCVInstrInfo::getNop() const {
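+  // Prefer the 2-byte compressed c.nop when the C extension is available;
+  // otherwise use the canonical addi x0, x0, 0 encoding of nop.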
+ if (STI.getFeatureBits()[RISCV::FeatureStdExtC])
+ return MCInstBuilder(RISCV::C_NOP);
+ return MCInstBuilder(RISCV::ADDI)
+ .addReg(RISCV::X0)
+ .addReg(RISCV::X0)
+ .addImm(0);
+}
+
unsigned RISCVInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {
@@ -87,6 +111,13 @@
return 0;
}
+static bool forwardCopyWillClobberTuple(unsigned DstReg, unsigned SrcReg,
+ unsigned NumRegs) {
+  // We really want the positive remainder mod 32 here, which happens to be
+  // easily obtainable with a mask.
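+  // E.g. for a 3-register tuple copied from encoding 2 to encoding 3,
+  // (3 - 2) & 0x1f == 1 < 3, so an ascending copy would overwrite source
+  // registers before they are read.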
+ return ((DstReg - SrcReg) & 0x1f) < NumRegs;
+}
+
void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, MCRegister DstReg,
@@ -100,35 +131,113 @@
// FPR->FPR copies and VR->VR copies.
unsigned Opc;
- bool IsScalableVector = false;
- if (RISCV::FPR16RegClass.contains(DstReg, SrcReg))
+ bool IsScalableVector = true;
+ unsigned NF = 1;
+ unsigned LMul = 1;
+ unsigned SubRegIdx = RISCV::sub_vrm1_0;
+ if (RISCV::FPR16RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::FSGNJ_H;
- else if (RISCV::FPR32RegClass.contains(DstReg, SrcReg))
+ IsScalableVector = false;
+ } else if (RISCV::FPR32RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::FSGNJ_S;
- else if (RISCV::FPR64RegClass.contains(DstReg, SrcReg))
+ IsScalableVector = false;
+ } else if (RISCV::FPR64RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::FSGNJ_D;
- else if (RISCV::VRRegClass.contains(DstReg, SrcReg)) {
+ IsScalableVector = false;
+ } else if (RISCV::VRRegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV1R_V;
- IsScalableVector = true;
} else if (RISCV::VRM2RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV2R_V;
- IsScalableVector = true;
} else if (RISCV::VRM4RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV4R_V;
- IsScalableVector = true;
} else if (RISCV::VRM8RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV8R_V;
- IsScalableVector = true;
- } else
+ } else if (RISCV::VRN2M1RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV1R_V;
+ SubRegIdx = RISCV::sub_vrm1_0;
+ NF = 2;
+ LMul = 1;
+ } else if (RISCV::VRN2M2RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV2R_V;
+ SubRegIdx = RISCV::sub_vrm2_0;
+ NF = 2;
+ LMul = 2;
+ } else if (RISCV::VRN2M4RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV4R_V;
+ SubRegIdx = RISCV::sub_vrm4_0;
+ NF = 2;
+ LMul = 4;
+ } else if (RISCV::VRN3M1RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV1R_V;
+ SubRegIdx = RISCV::sub_vrm1_0;
+ NF = 3;
+ LMul = 1;
+ } else if (RISCV::VRN3M2RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV2R_V;
+ SubRegIdx = RISCV::sub_vrm2_0;
+ NF = 3;
+ LMul = 2;
+ } else if (RISCV::VRN4M1RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV1R_V;
+ SubRegIdx = RISCV::sub_vrm1_0;
+ NF = 4;
+ LMul = 1;
+ } else if (RISCV::VRN4M2RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV2R_V;
+ SubRegIdx = RISCV::sub_vrm2_0;
+ NF = 4;
+ LMul = 2;
+ } else if (RISCV::VRN5M1RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV1R_V;
+ SubRegIdx = RISCV::sub_vrm1_0;
+ NF = 5;
+ LMul = 1;
+ } else if (RISCV::VRN6M1RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV1R_V;
+ SubRegIdx = RISCV::sub_vrm1_0;
+ NF = 6;
+ LMul = 1;
+ } else if (RISCV::VRN7M1RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV1R_V;
+ SubRegIdx = RISCV::sub_vrm1_0;
+ NF = 7;
+ LMul = 1;
+ } else if (RISCV::VRN8M1RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV1R_V;
+ SubRegIdx = RISCV::sub_vrm1_0;
+ NF = 8;
+ LMul = 1;
+ } else {
llvm_unreachable("Impossible reg-to-reg copy");
+ }
- if (IsScalableVector)
- BuildMI(MBB, MBBI, DL, get(Opc), DstReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else
+ if (IsScalableVector) {
+ if (NF == 1) {
+ BuildMI(MBB, MBBI, DL, get(Opc), DstReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+
+ int I = 0, End = NF, Incr = 1;
+ unsigned SrcEncoding = TRI->getEncodingValue(SrcReg);
+ unsigned DstEncoding = TRI->getEncodingValue(DstReg);
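+      // If an ascending copy would overwrite source sub-registers before they
+      // are read, copy the tuple in descending order instead.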
+ if (forwardCopyWillClobberTuple(DstEncoding, SrcEncoding, NF * LMul)) {
+ I = NF - 1;
+ End = -1;
+ Incr = -1;
+ }
+
+ for (; I != End; I += Incr) {
+ BuildMI(MBB, MBBI, DL, get(Opc), TRI->getSubReg(DstReg, SubRegIdx + I))
+ .addReg(TRI->getSubReg(SrcReg, SubRegIdx + I),
+ getKillRegState(KillSrc));
+ }
+ }
+ } else {
BuildMI(MBB, MBBI, DL, get(Opc), DstReg)
.addReg(SrcReg, getKillRegState(KillSrc))
.addReg(SrcReg, getKillRegState(KillSrc));
+ }
}
void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -141,29 +250,88 @@
DL = I->getDebugLoc();
MachineFunction *MF = MBB.getParent();
- const MachineFrameInfo &MFI = MF->getFrameInfo();
- MachineMemOperand *MMO = MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
- MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+ MachineFrameInfo &MFI = MF->getFrameInfo();
unsigned Opcode;
- if (RISCV::GPRRegClass.hasSubClassEq(RC))
+ bool IsScalableVector = true;
+ bool IsZvlsseg = true;
+ if (RISCV::GPRRegClass.hasSubClassEq(RC)) {
Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ?
RISCV::SW : RISCV::SD;
- else if (RISCV::FPR16RegClass.hasSubClassEq(RC))
+ IsScalableVector = false;
+ } else if (RISCV::FPR16RegClass.hasSubClassEq(RC)) {
Opcode = RISCV::FSH;
- else if (RISCV::FPR32RegClass.hasSubClassEq(RC))
+ IsScalableVector = false;
+ } else if (RISCV::FPR32RegClass.hasSubClassEq(RC)) {
Opcode = RISCV::FSW;
- else if (RISCV::FPR64RegClass.hasSubClassEq(RC))
+ IsScalableVector = false;
+ } else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) {
Opcode = RISCV::FSD;
+ IsScalableVector = false;
+ } else if (RISCV::VRRegClass.hasSubClassEq(RC)) {
+ Opcode = RISCV::PseudoVSPILL_M1;
+ IsZvlsseg = false;
+ } else if (RISCV::VRM2RegClass.hasSubClassEq(RC)) {
+ Opcode = RISCV::PseudoVSPILL_M2;
+ IsZvlsseg = false;
+ } else if (RISCV::VRM4RegClass.hasSubClassEq(RC)) {
+ Opcode = RISCV::PseudoVSPILL_M4;
+ IsZvlsseg = false;
+ } else if (RISCV::VRM8RegClass.hasSubClassEq(RC)) {
+ Opcode = RISCV::PseudoVSPILL_M8;
+ IsZvlsseg = false;
+ } else if (RISCV::VRN2M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVSPILL2_M1;
+ else if (RISCV::VRN2M2RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVSPILL2_M2;
+ else if (RISCV::VRN2M4RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVSPILL2_M4;
+ else if (RISCV::VRN3M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVSPILL3_M1;
+ else if (RISCV::VRN3M2RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVSPILL3_M2;
+ else if (RISCV::VRN4M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVSPILL4_M1;
+ else if (RISCV::VRN4M2RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVSPILL4_M2;
+ else if (RISCV::VRN5M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVSPILL5_M1;
+ else if (RISCV::VRN6M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVSPILL6_M1;
+ else if (RISCV::VRN7M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVSPILL7_M1;
+ else if (RISCV::VRN8M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVSPILL8_M1;
else
llvm_unreachable("Can't store this register to stack slot");
- BuildMI(MBB, I, DL, get(Opcode))
- .addReg(SrcReg, getKillRegState(IsKill))
- .addFrameIndex(FI)
- .addImm(0)
- .addMemOperand(MMO);
+ if (IsScalableVector) {
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
+ MemoryLocation::UnknownSize, MFI.getObjectAlign(FI));
+
+ MFI.setStackID(FI, TargetStackID::ScalableVector);
+ auto MIB = BuildMI(MBB, I, DL, get(Opcode))
+ .addReg(SrcReg, getKillRegState(IsKill))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO);
+ if (IsZvlsseg) {
+ // For spilling/reloading Zvlsseg registers, append the dummy field for
+ // the scaled vector length. The argument will be used when expanding
+ // these pseudo instructions.
+ MIB.addReg(RISCV::X0);
+ }
+ } else {
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+
+ BuildMI(MBB, I, DL, get(Opcode))
+ .addReg(SrcReg, getKillRegState(IsKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+ }
}
void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -176,28 +344,86 @@
DL = I->getDebugLoc();
MachineFunction *MF = MBB.getParent();
- const MachineFrameInfo &MFI = MF->getFrameInfo();
- MachineMemOperand *MMO = MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+ MachineFrameInfo &MFI = MF->getFrameInfo();
unsigned Opcode;
- if (RISCV::GPRRegClass.hasSubClassEq(RC))
+ bool IsScalableVector = true;
+ bool IsZvlsseg = true;
+ if (RISCV::GPRRegClass.hasSubClassEq(RC)) {
Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ?
RISCV::LW : RISCV::LD;
- else if (RISCV::FPR16RegClass.hasSubClassEq(RC))
+ IsScalableVector = false;
+ } else if (RISCV::FPR16RegClass.hasSubClassEq(RC)) {
Opcode = RISCV::FLH;
- else if (RISCV::FPR32RegClass.hasSubClassEq(RC))
+ IsScalableVector = false;
+ } else if (RISCV::FPR32RegClass.hasSubClassEq(RC)) {
Opcode = RISCV::FLW;
- else if (RISCV::FPR64RegClass.hasSubClassEq(RC))
+ IsScalableVector = false;
+ } else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) {
Opcode = RISCV::FLD;
+ IsScalableVector = false;
+ } else if (RISCV::VRRegClass.hasSubClassEq(RC)) {
+ Opcode = RISCV::PseudoVRELOAD_M1;
+ IsZvlsseg = false;
+ } else if (RISCV::VRM2RegClass.hasSubClassEq(RC)) {
+ Opcode = RISCV::PseudoVRELOAD_M2;
+ IsZvlsseg = false;
+ } else if (RISCV::VRM4RegClass.hasSubClassEq(RC)) {
+ Opcode = RISCV::PseudoVRELOAD_M4;
+ IsZvlsseg = false;
+ } else if (RISCV::VRM8RegClass.hasSubClassEq(RC)) {
+ Opcode = RISCV::PseudoVRELOAD_M8;
+ IsZvlsseg = false;
+ } else if (RISCV::VRN2M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVRELOAD2_M1;
+ else if (RISCV::VRN2M2RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVRELOAD2_M2;
+ else if (RISCV::VRN2M4RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVRELOAD2_M4;
+ else if (RISCV::VRN3M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVRELOAD3_M1;
+ else if (RISCV::VRN3M2RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVRELOAD3_M2;
+ else if (RISCV::VRN4M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVRELOAD4_M1;
+ else if (RISCV::VRN4M2RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVRELOAD4_M2;
+ else if (RISCV::VRN5M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVRELOAD5_M1;
+ else if (RISCV::VRN6M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVRELOAD6_M1;
+ else if (RISCV::VRN7M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVRELOAD7_M1;
+ else if (RISCV::VRN8M1RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::PseudoVRELOAD8_M1;
else
llvm_unreachable("Can't load this register from stack slot");
- BuildMI(MBB, I, DL, get(Opcode), DstReg)
- .addFrameIndex(FI)
- .addImm(0)
- .addMemOperand(MMO);
+ if (IsScalableVector) {
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, MFI.getObjectAlign(FI));
+
+ MFI.setStackID(FI, TargetStackID::ScalableVector);
+ auto MIB = BuildMI(MBB, I, DL, get(Opcode), DstReg)
+ .addFrameIndex(FI)
+ .addMemOperand(MMO);
+ if (IsZvlsseg) {
+ // For spilling/reloading Zvlsseg registers, append the dummy field for
+ // the scaled vector length. The argument will be used when expanding
+ // these pseudo instructions.
+ MIB.addReg(RISCV::X0);
+ }
+ } else {
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+
+ BuildMI(MBB, I, DL, get(Opcode), DstReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+ }
}
void RISCVInstrInfo::movImm(MachineBasicBlock &MBB,
@@ -206,17 +432,16 @@
MachineInstr::MIFlag Flag) const {
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- bool IsRV64 = MF->getSubtarget<RISCVSubtarget>().is64Bit();
Register SrcReg = RISCV::X0;
Register Result = MRI.createVirtualRegister(&RISCV::GPRRegClass);
unsigned Num = 0;
- if (!IsRV64 && !isInt<32>(Val))
+ if (!STI.is64Bit() && !isInt<32>(Val))
report_fatal_error("Should only materialize 32-bit constants for RV32");
- RISCVMatInt::InstSeq Seq;
- RISCVMatInt::generateInstSeq(Val, IsRV64, Seq);
- assert(Seq.size() > 0);
+ RISCVMatInt::InstSeq Seq =
+ RISCVMatInt::generateInstSeq(Val, STI.getFeatureBits());
+ assert(!Seq.empty());
for (RISCVMatInt::Inst &Inst : Seq) {
// Write the final result to DstReg if it's the last instruction in the Seq.
@@ -228,6 +453,11 @@
BuildMI(MBB, MBBI, DL, get(RISCV::LUI), Result)
.addImm(Inst.Imm)
.setMIFlag(Flag);
+ } else if (Inst.Opc == RISCV::ADDUW) {
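+      // add.uw with x0 as the addend zero-extends the low 32 bits of the
+      // intermediate result.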
+ BuildMI(MBB, MBBI, DL, get(RISCV::ADDUW), Result)
+ .addReg(SrcReg, RegState::Kill)
+ .addReg(RISCV::X0)
+ .setMIFlag(Flag);
} else {
BuildMI(MBB, MBBI, DL, get(Inst.Opc), Result)
.addReg(SrcReg, RegState::Kill)
@@ -546,6 +776,33 @@
return getInlineAsmLength(MI.getOperand(0).getSymbolName(),
*TM.getMCAsmInfo());
}
+ case RISCV::PseudoVSPILL2_M1:
+ case RISCV::PseudoVSPILL2_M2:
+ case RISCV::PseudoVSPILL2_M4:
+ case RISCV::PseudoVSPILL3_M1:
+ case RISCV::PseudoVSPILL3_M2:
+ case RISCV::PseudoVSPILL4_M1:
+ case RISCV::PseudoVSPILL4_M2:
+ case RISCV::PseudoVSPILL5_M1:
+ case RISCV::PseudoVSPILL6_M1:
+ case RISCV::PseudoVSPILL7_M1:
+ case RISCV::PseudoVSPILL8_M1:
+ case RISCV::PseudoVRELOAD2_M1:
+ case RISCV::PseudoVRELOAD2_M2:
+ case RISCV::PseudoVRELOAD2_M4:
+ case RISCV::PseudoVRELOAD3_M1:
+ case RISCV::PseudoVRELOAD3_M2:
+ case RISCV::PseudoVRELOAD4_M1:
+ case RISCV::PseudoVRELOAD4_M2:
+ case RISCV::PseudoVRELOAD5_M1:
+ case RISCV::PseudoVRELOAD6_M1:
+ case RISCV::PseudoVRELOAD7_M1:
+ case RISCV::PseudoVRELOAD8_M1: {
+    // The values are determined based on expandVSPILL and expandVRELOAD, which
+    // expand the pseudos into a sequence whose length depends on NF.
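+    // Each expansion is 2 * NF - 1 four-byte instructions: one whole-register
+    // access per field plus a pointer increment between consecutive fields.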
+ unsigned NF = isRVVSpillForZvlsseg(Opcode)->first;
+ return 4 * (2 * NF - 1);
+ }
}
}
@@ -879,3 +1136,482 @@
RISCVII::MO_CALL));
return It;
}
+
+// clang-format off
+#define CASE_VFMA_OPCODE_COMMON(OP, TYPE, LMUL) \
+ RISCV::PseudoV##OP##_##TYPE##_##LMUL##_COMMUTABLE
+
+#define CASE_VFMA_OPCODE_LMULS(OP, TYPE) \
+ CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF8): \
+ case CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF4): \
+ case CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF2): \
+ case CASE_VFMA_OPCODE_COMMON(OP, TYPE, M1): \
+ case CASE_VFMA_OPCODE_COMMON(OP, TYPE, M2): \
+ case CASE_VFMA_OPCODE_COMMON(OP, TYPE, M4): \
+ case CASE_VFMA_OPCODE_COMMON(OP, TYPE, M8)
+
+#define CASE_VFMA_SPLATS(OP) \
+ CASE_VFMA_OPCODE_LMULS(OP, VF16): \
+ case CASE_VFMA_OPCODE_LMULS(OP, VF32): \
+ case CASE_VFMA_OPCODE_LMULS(OP, VF64)
+// clang-format on
+
+bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ const MCInstrDesc &Desc = MI.getDesc();
+ if (!Desc.isCommutable())
+ return false;
+
+ switch (MI.getOpcode()) {
+ case CASE_VFMA_SPLATS(FMADD):
+ case CASE_VFMA_SPLATS(FMSUB):
+ case CASE_VFMA_SPLATS(FMACC):
+ case CASE_VFMA_SPLATS(FMSAC):
+ case CASE_VFMA_SPLATS(FNMADD):
+ case CASE_VFMA_SPLATS(FNMSUB):
+ case CASE_VFMA_SPLATS(FNMACC):
+ case CASE_VFMA_SPLATS(FNMSAC):
+ case CASE_VFMA_OPCODE_LMULS(FMACC, VV):
+ case CASE_VFMA_OPCODE_LMULS(FMSAC, VV):
+ case CASE_VFMA_OPCODE_LMULS(FNMACC, VV):
+ case CASE_VFMA_OPCODE_LMULS(FNMSAC, VV):
+ case CASE_VFMA_OPCODE_LMULS(MADD, VX):
+ case CASE_VFMA_OPCODE_LMULS(NMSUB, VX):
+ case CASE_VFMA_OPCODE_LMULS(MACC, VX):
+ case CASE_VFMA_OPCODE_LMULS(NMSAC, VX):
+ case CASE_VFMA_OPCODE_LMULS(MACC, VV):
+ case CASE_VFMA_OPCODE_LMULS(NMSAC, VV): {
+ // For these instructions we can only swap operand 1 and operand 3 by
+ // changing the opcode.
+ unsigned CommutableOpIdx1 = 1;
+ unsigned CommutableOpIdx2 = 3;
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
+ CommutableOpIdx2))
+ return false;
+ return true;
+ }
+ case CASE_VFMA_OPCODE_LMULS(FMADD, VV):
+ case CASE_VFMA_OPCODE_LMULS(FMSUB, VV):
+ case CASE_VFMA_OPCODE_LMULS(FNMADD, VV):
+ case CASE_VFMA_OPCODE_LMULS(FNMSUB, VV):
+ case CASE_VFMA_OPCODE_LMULS(MADD, VV):
+ case CASE_VFMA_OPCODE_LMULS(NMSUB, VV): {
+ // For these instructions we have more freedom. We can commute with the
+ // other multiplicand or with the addend/subtrahend/minuend.
+
+ // Any fixed operand must be from source 1, 2 or 3.
+ if (SrcOpIdx1 != CommuteAnyOperandIndex && SrcOpIdx1 > 3)
+ return false;
+ if (SrcOpIdx2 != CommuteAnyOperandIndex && SrcOpIdx2 > 3)
+ return false;
+
+    // If both ops are fixed, one must be the tied source.
+ if (SrcOpIdx1 != CommuteAnyOperandIndex &&
+ SrcOpIdx2 != CommuteAnyOperandIndex && SrcOpIdx1 != 1 && SrcOpIdx2 != 1)
+ return false;
+
+ // Look for two different register operands assumed to be commutable
+ // regardless of the FMA opcode. The FMA opcode is adjusted later if
+ // needed.
+ if (SrcOpIdx1 == CommuteAnyOperandIndex ||
+ SrcOpIdx2 == CommuteAnyOperandIndex) {
+ // At least one of operands to be commuted is not specified and
+ // this method is free to choose appropriate commutable operands.
+ unsigned CommutableOpIdx1 = SrcOpIdx1;
+ if (SrcOpIdx1 == SrcOpIdx2) {
+        // Neither operand is fixed. Set one of the commutable
+        // operands to the tied source.
+ CommutableOpIdx1 = 1;
+ } else if (SrcOpIdx1 == CommutableOpIdx1) {
+ // Only one of the operands is not fixed.
+ CommutableOpIdx1 = SrcOpIdx2;
+ }
+
+ // CommutableOpIdx1 is well defined now. Let's choose another commutable
+ // operand and assign its index to CommutableOpIdx2.
+ unsigned CommutableOpIdx2;
+ if (CommutableOpIdx1 != 1) {
+ // If we haven't already used the tied source, we must use it now.
+ CommutableOpIdx2 = 1;
+ } else {
+ Register Op1Reg = MI.getOperand(CommutableOpIdx1).getReg();
+
+ // The commuted operands should have different registers.
+ // Otherwise, the commute transformation does not change anything and
+ // is useless. We use this as a hint to make our decision.
+ if (Op1Reg != MI.getOperand(2).getReg())
+ CommutableOpIdx2 = 2;
+ else
+ CommutableOpIdx2 = 3;
+ }
+
+ // Assign the found pair of commutable indices to SrcOpIdx1 and
+ // SrcOpIdx2 to return those values.
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
+ CommutableOpIdx2))
+ return false;
+ }
+
+ return true;
+ }
+ }
+
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+}
+
+#define CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, LMUL) \
+ case RISCV::PseudoV##OLDOP##_##TYPE##_##LMUL##_COMMUTABLE: \
+ Opc = RISCV::PseudoV##NEWOP##_##TYPE##_##LMUL##_COMMUTABLE; \
+ break;
+
+#define CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, TYPE) \
+ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF8) \
+ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF4) \
+ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF2) \
+ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M1) \
+ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M2) \
+ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M4) \
+ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M8)
+
+#define CASE_VFMA_CHANGE_OPCODE_SPLATS(OLDOP, NEWOP) \
+ CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, VF16) \
+ CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, VF32) \
+ CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, VF64)
+
+MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+ bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
+ auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
+ if (NewMI)
+ return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
+ return MI;
+ };
+
+ switch (MI.getOpcode()) {
+ case CASE_VFMA_SPLATS(FMACC):
+ case CASE_VFMA_SPLATS(FMADD):
+ case CASE_VFMA_SPLATS(FMSAC):
+ case CASE_VFMA_SPLATS(FMSUB):
+ case CASE_VFMA_SPLATS(FNMACC):
+ case CASE_VFMA_SPLATS(FNMADD):
+ case CASE_VFMA_SPLATS(FNMSAC):
+ case CASE_VFMA_SPLATS(FNMSUB):
+ case CASE_VFMA_OPCODE_LMULS(FMACC, VV):
+ case CASE_VFMA_OPCODE_LMULS(FMSAC, VV):
+ case CASE_VFMA_OPCODE_LMULS(FNMACC, VV):
+ case CASE_VFMA_OPCODE_LMULS(FNMSAC, VV):
+ case CASE_VFMA_OPCODE_LMULS(MADD, VX):
+ case CASE_VFMA_OPCODE_LMULS(NMSUB, VX):
+ case CASE_VFMA_OPCODE_LMULS(MACC, VX):
+ case CASE_VFMA_OPCODE_LMULS(NMSAC, VX):
+ case CASE_VFMA_OPCODE_LMULS(MACC, VV):
+ case CASE_VFMA_OPCODE_LMULS(NMSAC, VV): {
+    // It only makes sense to toggle these between clobbering the
+    // addend/subtrahend/minuend and clobbering one of the multiplicands.
+ assert((OpIdx1 == 1 || OpIdx2 == 1) && "Unexpected opcode index");
+ assert((OpIdx1 == 3 || OpIdx2 == 3) && "Unexpected opcode index");
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ CASE_VFMA_CHANGE_OPCODE_SPLATS(FMACC, FMADD)
+ CASE_VFMA_CHANGE_OPCODE_SPLATS(FMADD, FMACC)
+ CASE_VFMA_CHANGE_OPCODE_SPLATS(FMSAC, FMSUB)
+ CASE_VFMA_CHANGE_OPCODE_SPLATS(FMSUB, FMSAC)
+ CASE_VFMA_CHANGE_OPCODE_SPLATS(FNMACC, FNMADD)
+ CASE_VFMA_CHANGE_OPCODE_SPLATS(FNMADD, FNMACC)
+ CASE_VFMA_CHANGE_OPCODE_SPLATS(FNMSAC, FNMSUB)
+ CASE_VFMA_CHANGE_OPCODE_SPLATS(FNMSUB, FNMSAC)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(FMACC, FMADD, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(FMSAC, FMSUB, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(FNMACC, FNMADD, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(FNMSAC, FNMSUB, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(MACC, MADD, VX)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(MADD, MACC, VX)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(NMSAC, NMSUB, VX)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(NMSUB, NMSAC, VX)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(MACC, MADD, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(NMSAC, NMSUB, VV)
+ }
+
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case CASE_VFMA_OPCODE_LMULS(FMADD, VV):
+ case CASE_VFMA_OPCODE_LMULS(FMSUB, VV):
+ case CASE_VFMA_OPCODE_LMULS(FNMADD, VV):
+ case CASE_VFMA_OPCODE_LMULS(FNMSUB, VV):
+ case CASE_VFMA_OPCODE_LMULS(MADD, VV):
+ case CASE_VFMA_OPCODE_LMULS(NMSUB, VV): {
+ assert((OpIdx1 == 1 || OpIdx2 == 1) && "Unexpected opcode index");
+    // If one of the operands is the addend, we need to change the opcode.
+    // Otherwise we're just swapping two of the multiplicands.
+ if (OpIdx1 == 3 || OpIdx2 == 3) {
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ CASE_VFMA_CHANGE_OPCODE_LMULS(FMADD, FMACC, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(FMSUB, FMSAC, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(FNMADD, FNMACC, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(FNMSUB, FNMSAC, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(MADD, MACC, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS(NMSUB, NMSAC, VV)
+ }
+
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ // Let the default code handle it.
+ break;
+ }
+ }
+
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+}
+
+#undef CASE_VFMA_CHANGE_OPCODE_SPLATS
+#undef CASE_VFMA_CHANGE_OPCODE_LMULS
+#undef CASE_VFMA_CHANGE_OPCODE_COMMON
+#undef CASE_VFMA_SPLATS
+#undef CASE_VFMA_OPCODE_LMULS
+#undef CASE_VFMA_OPCODE_COMMON
+
+// clang-format off
+#define CASE_WIDEOP_OPCODE_COMMON(OP, LMUL) \
+ RISCV::PseudoV##OP##_##LMUL##_TIED
+
+#define CASE_WIDEOP_OPCODE_LMULS(OP) \
+ CASE_WIDEOP_OPCODE_COMMON(OP, MF8): \
+ case CASE_WIDEOP_OPCODE_COMMON(OP, MF4): \
+ case CASE_WIDEOP_OPCODE_COMMON(OP, MF2): \
+ case CASE_WIDEOP_OPCODE_COMMON(OP, M1): \
+ case CASE_WIDEOP_OPCODE_COMMON(OP, M2): \
+ case CASE_WIDEOP_OPCODE_COMMON(OP, M4)
+// clang-format on
+
+#define CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, LMUL) \
+ case RISCV::PseudoV##OP##_##LMUL##_TIED: \
+ NewOpc = RISCV::PseudoV##OP##_##LMUL; \
+ break;
+
+#define CASE_WIDEOP_CHANGE_OPCODE_LMULS(OP) \
+ CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF8) \
+ CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF4) \
+ CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF2) \
+ CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M1) \
+ CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2) \
+ CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4)
+
+MachineInstr *RISCVInstrInfo::convertToThreeAddress(
+ MachineFunction::iterator &MBB, MachineInstr &MI, LiveVariables *LV) const {
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case CASE_WIDEOP_OPCODE_LMULS(FWADD_WV):
+ case CASE_WIDEOP_OPCODE_LMULS(FWSUB_WV):
+ case CASE_WIDEOP_OPCODE_LMULS(WADD_WV):
+ case CASE_WIDEOP_OPCODE_LMULS(WADDU_WV):
+ case CASE_WIDEOP_OPCODE_LMULS(WSUB_WV):
+ case CASE_WIDEOP_OPCODE_LMULS(WSUBU_WV): {
+ // clang-format off
+ unsigned NewOpc;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ CASE_WIDEOP_CHANGE_OPCODE_LMULS(FWADD_WV)
+ CASE_WIDEOP_CHANGE_OPCODE_LMULS(FWSUB_WV)
+ CASE_WIDEOP_CHANGE_OPCODE_LMULS(WADD_WV)
+ CASE_WIDEOP_CHANGE_OPCODE_LMULS(WADDU_WV)
+ CASE_WIDEOP_CHANGE_OPCODE_LMULS(WSUB_WV)
+ CASE_WIDEOP_CHANGE_OPCODE_LMULS(WSUBU_WV)
+ }
+    // clang-format on
+
+ MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4));
+ MIB.copyImplicitOps(MI);
+
+ if (LV) {
+ unsigned NumOps = MI.getNumOperands();
+ for (unsigned I = 1; I < NumOps; ++I) {
+ MachineOperand &Op = MI.getOperand(I);
+ if (Op.isReg() && Op.isKill())
+ LV->replaceKillInstruction(Op.getReg(), MI, *MIB);
+ }
+ }
+
+ return MIB;
+ }
+ }
+
+ return nullptr;
+}
+
+#undef CASE_WIDEOP_CHANGE_OPCODE_LMULS
+#undef CASE_WIDEOP_CHANGE_OPCODE_COMMON
+#undef CASE_WIDEOP_OPCODE_LMULS
+#undef CASE_WIDEOP_OPCODE_COMMON
+
+Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II,
+ const DebugLoc &DL,
+ int64_t Amount,
+ MachineInstr::MIFlag Flag) const {
+ assert(Amount > 0 && "There is no need to get VLEN scaled value.");
+ assert(Amount % 8 == 0 &&
+ "Reserve the stack by the multiple of one vector size.");
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const RISCVInstrInfo *TII = MF.getSubtarget<RISCVSubtarget>().getInstrInfo();
+ int64_t NumOfVReg = Amount / 8;
+
+ Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VL)
+ .setMIFlag(Flag);
+ assert(isInt<32>(NumOfVReg) &&
+ "Expect the number of vector registers within 32-bits.");
+ if (isPowerOf2_32(NumOfVReg)) {
+ uint32_t ShiftAmount = Log2_32(NumOfVReg);
+ if (ShiftAmount == 0)
+ return VL;
+ BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VL)
+ .addReg(VL, RegState::Kill)
+ .addImm(ShiftAmount)
+ .setMIFlag(Flag);
+ } else if (isPowerOf2_32(NumOfVReg - 1)) {
+ Register ScaledRegister = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ uint32_t ShiftAmount = Log2_32(NumOfVReg - 1);
+ BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), ScaledRegister)
+ .addReg(VL)
+ .addImm(ShiftAmount)
+ .setMIFlag(Flag);
+ BuildMI(MBB, II, DL, TII->get(RISCV::ADD), VL)
+ .addReg(ScaledRegister, RegState::Kill)
+ .addReg(VL, RegState::Kill)
+ .setMIFlag(Flag);
+ } else if (isPowerOf2_32(NumOfVReg + 1)) {
+ Register ScaledRegister = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ uint32_t ShiftAmount = Log2_32(NumOfVReg + 1);
+ BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), ScaledRegister)
+ .addReg(VL)
+ .addImm(ShiftAmount)
+ .setMIFlag(Flag);
+ BuildMI(MBB, II, DL, TII->get(RISCV::SUB), VL)
+ .addReg(ScaledRegister, RegState::Kill)
+ .addReg(VL, RegState::Kill)
+ .setMIFlag(Flag);
+ } else {
+ Register N = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ if (!isInt<12>(NumOfVReg))
+ movImm(MBB, II, DL, N, NumOfVReg);
+ else {
+ BuildMI(MBB, II, DL, TII->get(RISCV::ADDI), N)
+ .addReg(RISCV::X0)
+ .addImm(NumOfVReg)
+ .setMIFlag(Flag);
+ }
+ if (!MF.getSubtarget<RISCVSubtarget>().hasStdExtM())
+ MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{
+ MF.getFunction(),
+ "M-extension must be enabled to calculate the vscaled size/offset."});
+ BuildMI(MBB, II, DL, TII->get(RISCV::MUL), VL)
+ .addReg(VL, RegState::Kill)
+ .addReg(N, RegState::Kill)
+ .setMIFlag(Flag);
+ }
+
+ return VL;
+}
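
// A standalone C++ sketch (not part of the imported patch) mirroring the
// strategy getVLENFactoredAmount uses above to scale VLENB by Amount/8
// without a MUL when the factor has a convenient shape. The helper names
// below are local illustrations, not LLVM APIs.
#include <cassert>
#include <cstdint>
#include <cstdio>

static bool isPow2(uint64_t X) { return X && !(X & (X - 1)); }
static unsigned log2u(uint64_t X) { unsigned L = 0; while (X >>= 1) ++L; return L; }

// Value a scratch register would hold after the emitted sequence, given the
// VLENB CSR value and the requested Amount (a positive multiple of 8).
static uint64_t scaleVLENB(uint64_t VLENB, int64_t Amount) {
  assert(Amount > 0 && Amount % 8 == 0);
  uint64_t N = Amount / 8;                    // number of vector registers
  if (isPow2(N))
    return VLENB << log2u(N);                 // single SLLI (nothing if N == 1)
  if (isPow2(N - 1))
    return (VLENB << log2u(N - 1)) + VLENB;   // SLLI + ADD
  if (isPow2(N + 1))
    return (VLENB << log2u(N + 1)) - VLENB;   // SLLI + SUB
  return VLENB * N;                           // general case: materialize N, MUL
}

int main() {
  // e.g. Amount = 24 -> N = 3 = 2^2 - 1, so SLLI by 2 then SUB.
  printf("%llu\n", (unsigned long long)scaleVLENB(16, 24)); // 48
  // Amount = 40 -> N = 5 = 2^2 + 1, so SLLI by 2 then ADD.
  printf("%llu\n", (unsigned long long)scaleVLENB(16, 40)); // 80
  return 0;
}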
+
+static bool isRVVWholeLoadStore(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+ case RISCV::VS1R_V:
+ case RISCV::VS2R_V:
+ case RISCV::VS4R_V:
+ case RISCV::VS8R_V:
+ case RISCV::VL1RE8_V:
+ case RISCV::VL2RE8_V:
+ case RISCV::VL4RE8_V:
+ case RISCV::VL8RE8_V:
+ case RISCV::VL1RE16_V:
+ case RISCV::VL2RE16_V:
+ case RISCV::VL4RE16_V:
+ case RISCV::VL8RE16_V:
+ case RISCV::VL1RE32_V:
+ case RISCV::VL2RE32_V:
+ case RISCV::VL4RE32_V:
+ case RISCV::VL8RE32_V:
+ case RISCV::VL1RE64_V:
+ case RISCV::VL2RE64_V:
+ case RISCV::VL4RE64_V:
+ case RISCV::VL8RE64_V:
+ return true;
+ }
+}
+
+bool RISCVInstrInfo::isRVVSpill(const MachineInstr &MI, bool CheckFIs) const {
+ // RVV lacks any support for immediate addressing for stack addresses, so be
+ // conservative.
+ unsigned Opcode = MI.getOpcode();
+ if (!RISCVVPseudosTable::getPseudoInfo(Opcode) &&
+ !isRVVWholeLoadStore(Opcode) && !isRVVSpillForZvlsseg(Opcode))
+ return false;
+ return !CheckFIs || any_of(MI.operands(), [](const MachineOperand &MO) {
+ return MO.isFI();
+ });
+}
+
+Optional<std::pair<unsigned, unsigned>>
+RISCVInstrInfo::isRVVSpillForZvlsseg(unsigned Opcode) const {
+ switch (Opcode) {
+ default:
+ return None;
+ case RISCV::PseudoVSPILL2_M1:
+ case RISCV::PseudoVRELOAD2_M1:
+ return std::make_pair(2u, 1u);
+ case RISCV::PseudoVSPILL2_M2:
+ case RISCV::PseudoVRELOAD2_M2:
+ return std::make_pair(2u, 2u);
+ case RISCV::PseudoVSPILL2_M4:
+ case RISCV::PseudoVRELOAD2_M4:
+ return std::make_pair(2u, 4u);
+ case RISCV::PseudoVSPILL3_M1:
+ case RISCV::PseudoVRELOAD3_M1:
+ return std::make_pair(3u, 1u);
+ case RISCV::PseudoVSPILL3_M2:
+ case RISCV::PseudoVRELOAD3_M2:
+ return std::make_pair(3u, 2u);
+ case RISCV::PseudoVSPILL4_M1:
+ case RISCV::PseudoVRELOAD4_M1:
+ return std::make_pair(4u, 1u);
+ case RISCV::PseudoVSPILL4_M2:
+ case RISCV::PseudoVRELOAD4_M2:
+ return std::make_pair(4u, 2u);
+ case RISCV::PseudoVSPILL5_M1:
+ case RISCV::PseudoVRELOAD5_M1:
+ return std::make_pair(5u, 1u);
+ case RISCV::PseudoVSPILL6_M1:
+ case RISCV::PseudoVRELOAD6_M1:
+ return std::make_pair(6u, 1u);
+ case RISCV::PseudoVSPILL7_M1:
+ case RISCV::PseudoVRELOAD7_M1:
+ return std::make_pair(7u, 1u);
+ case RISCV::PseudoVSPILL8_M1:
+ case RISCV::PseudoVRELOAD8_M1:
+ return std::make_pair(8u, 1u);
+ }
+}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 0b03421..d80fc48 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -15,6 +15,7 @@
#include "RISCVRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
#define GET_INSTRINFO_HEADER
#include "RISCVGenInstrInfo.inc"
@@ -28,6 +29,8 @@
public:
explicit RISCVInstrInfo(RISCVSubtarget &STI);
+ MCInst getNop() const override;
+
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
unsigned isStoreToStackSlot(const MachineInstr &MI,
@@ -133,9 +136,45 @@
insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &It, MachineFunction &MF,
const outliner::Candidate &C) const override;
+
+ bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const override;
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const override;
+
+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB,
+ MachineInstr &MI,
+ LiveVariables *LV) const override;
+
+ Register getVLENFactoredAmount(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II, const DebugLoc &DL, int64_t Amount,
+ MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const;
+
+ // Returns true if the given MI is an RVV instruction opcode for which we may
+ // expect to see a FrameIndex operand. When CheckFIs is true, the instruction
+ // must contain at least one FrameIndex operand.
+ bool isRVVSpill(const MachineInstr &MI, bool CheckFIs) const;
+
+ Optional<std::pair<unsigned, unsigned>>
+ isRVVSpillForZvlsseg(unsigned Opcode) const;
+
protected:
const RISCVSubtarget &STI;
};
+namespace RISCVVPseudosTable {
+
+struct PseudoInfo {
+ uint16_t Pseudo;
+ uint16_t BaseInstr;
+};
+
+#define GET_RISCVVPseudosTable_DECL
+#include "RISCVGenSearchableTables.inc"
+
+} // end namespace RISCVVPseudosTable
+
} // end namespace llvm
#endif
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index a07b589..949fff2 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -25,8 +25,24 @@
def SDT_RISCVSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>,
SDTCisSameAs<0, 4>,
SDTCisSameAs<4, 5>]>;
+def SDT_RISCVBrCC : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
+ SDTCisVT<2, OtherVT>,
+ SDTCisVT<3, OtherVT>]>;
+def SDT_RISCVReadCSR : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDT_RISCVWriteCSR : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDT_RISCVSwapCSR : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisInt<2>]>;
def SDT_RISCVReadCycleWide : SDTypeProfile<2, 0, [SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
+def SDT_RISCVIntUnaryOpW : SDTypeProfile<1, 1, [
+ SDTCisSameAs<0, 1>, SDTCisVT<0, i64>
+]>;
+def SDT_RISCVIntBinOpW : SDTypeProfile<1, 2, [
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>
+]>;
+def SDT_RISCVIntShiftDOpW : SDTypeProfile<1, 3, [
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>, SDTCisVT<3, i64>
+]>;
// Target-independent nodes, but with target-specific formats.
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
@@ -47,12 +63,20 @@
def riscv_mret_flag : SDNode<"RISCVISD::MRET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
def riscv_selectcc : SDNode<"RISCVISD::SELECT_CC", SDT_RISCVSelectCC>;
+def riscv_brcc : SDNode<"RISCVISD::BR_CC", SDT_RISCVBrCC,
+ [SDNPHasChain]>;
def riscv_tail : SDNode<"RISCVISD::TAIL", SDT_RISCVCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
-def riscv_sllw : SDNode<"RISCVISD::SLLW", SDTIntShiftOp>;
-def riscv_sraw : SDNode<"RISCVISD::SRAW", SDTIntShiftOp>;
-def riscv_srlw : SDNode<"RISCVISD::SRLW", SDTIntShiftOp>;
+def riscv_sllw : SDNode<"RISCVISD::SLLW", SDT_RISCVIntBinOpW>;
+def riscv_sraw : SDNode<"RISCVISD::SRAW", SDT_RISCVIntBinOpW>;
+def riscv_srlw : SDNode<"RISCVISD::SRLW", SDT_RISCVIntBinOpW>;
+def riscv_read_csr : SDNode<"RISCVISD::READ_CSR", SDT_RISCVReadCSR,
+ [SDNPHasChain]>;
+def riscv_write_csr : SDNode<"RISCVISD::WRITE_CSR", SDT_RISCVWriteCSR,
+ [SDNPHasChain]>;
+def riscv_swap_csr : SDNode<"RISCVISD::SWAP_CSR", SDT_RISCVSwapCSR,
+ [SDNPHasChain]>;
def riscv_read_cycle_wide : SDNode<"RISCVISD::READ_CYCLE_WIDE",
SDT_RISCVReadCycleWide,
@@ -121,7 +145,7 @@
if (!MCOp.evaluateAsConstantImm(Imm))
return false;
if (STI.getTargetTriple().isArch64Bit())
- return isUInt<6>(Imm);
+ return isUInt<6>(Imm);
return isUInt<5>(Imm);
}];
let OperandType = "OPERAND_UIMMLOG2XLEN";
@@ -149,20 +173,6 @@
let OperandNamespace = "RISCVOp";
}
-// A 12-bit signed immediate plus one where the imm range will be -2047~2048.
-def simm12_plus1 : Operand<XLenVT>, ImmLeaf<XLenVT,
- [{return (isInt<12>(Imm) && Imm != -2048) || Imm == 2048;}]> {
- let ParserMatchClass = SImmAsmOperand<12>;
- let EncoderMethod = "getImmOpValue";
- let DecoderMethod = "decodeSImmOperand<12>";
- let MCOperandPredicate = [{
- int64_t Imm;
- if (MCOp.evaluateAsConstantImm(Imm))
- return (isInt<12>(Imm) && Imm != -2048) || Imm == 2048;
- return MCOp.isBareSymbolRef();
- }];
-}
-
// A 13-bit signed immediate where the least significant bit is zero.
def simm13_lsb0 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<13, "Lsb0">;
@@ -287,14 +297,10 @@
}
// Standalone (codegen-only) immleaf patterns.
-def simm32 : ImmLeaf<XLenVT, [{return isInt<32>(Imm);}]>;
-def simm32hi20 : ImmLeaf<XLenVT, [{return isShiftedInt<20, 12>(Imm);}]>;
-// A mask value that won't affect significant shift bits.
-def immbottomxlenset : ImmLeaf<XLenVT, [{
- if (Subtarget->is64Bit())
- return countTrailingOnes<uint64_t>(Imm) >= 6;
- return countTrailingOnes<uint64_t>(Imm) >= 5;
-}]>;
+
+// A 12-bit signed immediate plus one where the imm range will be -2047~2048.
+def simm12_plus1 : ImmLeaf<XLenVT,
+ [{return (isInt<12>(Imm) && Imm != -2048) || Imm == 2048;}]>;
// A 6-bit constant greater than 32.
def uimm6gt32 : ImmLeaf<XLenVT, [{
@@ -304,21 +310,7 @@
// Addressing modes.
// Necessary because a frameindex can't be matched directly in a pattern.
def AddrFI : ComplexPattern<iPTR, 1, "SelectAddrFI", [frameindex], []>;
-
-// Extract least significant 12 bits from an immediate value and sign extend
-// them.
-def LO12Sext : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(SignExtend64<12>(N->getZExtValue()),
- SDLoc(N), N->getValueType(0));
-}]>;
-
-// Extract the most significant 20 bits from an immediate value. Add 1 if bit
-// 11 is 1, to compensate for the low 12 bits in the matching immediate addi
-// or ld/st being negative.
-def HI20 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(((N->getZExtValue()+0x800) >> 12) & 0xfffff,
- SDLoc(N), N->getValueType(0));
-}]>;
+def BaseAddr : ComplexPattern<iPTR, 1, "SelectBaseAddr">;
// Return the negation of an immediate value.
def NegImm : SDNodeXForm<imm, [{
@@ -332,6 +324,12 @@
N->getValueType(0));
}]>;
+// Return an immediate value plus 32.
+def ImmPlus32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() + 32, SDLoc(N),
+ N->getValueType(0));
+}]>;
+
// Return an immediate subtracted from XLen.
def ImmSubFromXLen : SDNodeXForm<imm, [{
uint64_t XLen = Subtarget->getXLen();
@@ -345,6 +343,29 @@
N->getValueType(0));
}]>;
+// Check if (add r, imm) can be optimized to (ADDI (ADDI r, imm0), imm1),
+// in which imm = imm0 + imm1 and both imm0 and imm1 are simm12.
+def AddiPair : PatLeaf<(imm), [{
+ if (!N->hasOneUse())
+ return false;
+ // The immediate operand must be in range [-4096,-2049] or [2048,4094].
+ int64_t Imm = N->getSExtValue();
+ return (-4096 <= Imm && Imm <= -2049) || (2048 <= Imm && Imm <= 4094);
+}]>;
+
+// Return imm/2.
+def AddiPairImmA : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() / 2, SDLoc(N),
+ N->getValueType(0));
+}]>;
+
+// Return imm - imm/2.
+def AddiPairImmB : SDNodeXForm<imm, [{
+ int64_t Imm = N->getSExtValue();
+ return CurDAG->getTargetConstant(Imm - Imm / 2, SDLoc(N),
+ N->getValueType(0));
+}]>;
+
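// A standalone C++ sketch (not part of the imported patch) checking that the
// AddiPair split above always yields two legal simm12 immediates. The range
// test and the imm/2 vs. imm - imm/2 split mirror AddiPair, AddiPairImmA and
// AddiPairImmB; the helper names here are illustrative only.
#include <cassert>
#include <cstdint>

static bool isSImm12(int64_t I) { return I >= -2048 && I <= 2047; }

int main() {
  for (int64_t Imm = -4096; Imm <= 4094; ++Imm) {
    bool InRange = (Imm >= -4096 && Imm <= -2049) || (Imm >= 2048 && Imm <= 4094);
    if (!InRange)
      continue;
    int64_t A = Imm / 2;       // AddiPairImmA
    int64_t B = Imm - Imm / 2; // AddiPairImmB
    assert(isSImm12(A) && isSImm12(B) && A + B == Imm);
  }
  return 0;
}
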
//===----------------------------------------------------------------------===//
// Instruction Formats
//===----------------------------------------------------------------------===//
@@ -386,11 +407,11 @@
Sched<[WriteIALU, ReadIALU]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class Shift_ri<bit arithshift, bits<3> funct3, string opcodestr>
- : RVInstIShift<arithshift, funct3, OPC_OP_IMM, (outs GPR:$rd),
+class Shift_ri<bits<5> imm11_7, bits<3> funct3, string opcodestr>
+ : RVInstIShift<imm11_7, funct3, OPC_OP_IMM, (outs GPR:$rd),
(ins GPR:$rs1, uimmlog2xlen:$shamt), opcodestr,
"$rd, $rs1, $shamt">,
- Sched<[WriteShift, ReadShift]>;
+ Sched<[WriteShiftImm, ReadShiftImm]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class ALU_rr<bits<7> funct7, bits<3> funct3, string opcodestr>
@@ -411,11 +432,11 @@
opcodestr, "$rd, $imm12, $rs1">, Sched<[WriteCSR]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class ShiftW_ri<bit arithshift, bits<3> funct3, string opcodestr>
- : RVInstIShiftW<arithshift, funct3, OPC_OP_IMM_32, (outs GPR:$rd),
+class ShiftW_ri<bits<7> imm11_5, bits<3> funct3, string opcodestr>
+ : RVInstIShiftW<imm11_5, funct3, OPC_OP_IMM_32, (outs GPR:$rd),
(ins GPR:$rs1, uimm5:$shamt), opcodestr,
"$rd, $rs1, $shamt">,
- Sched<[WriteShift32, ReadShift32]>;
+ Sched<[WriteShiftImm32, ReadShiftImm32]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class ALUW_rr<bits<7> funct7, bits<3> funct3, string opcodestr>
@@ -482,18 +503,18 @@
def ANDI : ALU_ri<0b111, "andi">;
-def SLLI : Shift_ri<0, 0b001, "slli">;
-def SRLI : Shift_ri<0, 0b101, "srli">;
-def SRAI : Shift_ri<1, 0b101, "srai">;
+def SLLI : Shift_ri<0b00000, 0b001, "slli">;
+def SRLI : Shift_ri<0b00000, 0b101, "srli">;
+def SRAI : Shift_ri<0b01000, 0b101, "srai">;
def ADD : ALU_rr<0b0000000, 0b000, "add">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
def SUB : ALU_rr<0b0100000, 0b000, "sub">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
-def SLL : ALU_rr<0b0000000, 0b001, "sll">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def SLL : ALU_rr<0b0000000, 0b001, "sll">, Sched<[WriteShiftReg, ReadShiftReg, ReadShiftReg]>;
def SLT : ALU_rr<0b0000000, 0b010, "slt">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
def SLTU : ALU_rr<0b0000000, 0b011, "sltu">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
def XOR : ALU_rr<0b0000000, 0b100, "xor">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
-def SRL : ALU_rr<0b0000000, 0b101, "srl">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
-def SRA : ALU_rr<0b0100000, 0b101, "sra">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def SRL : ALU_rr<0b0000000, 0b101, "srl">, Sched<[WriteShiftReg, ReadShiftReg, ReadShiftReg]>;
+def SRA : ALU_rr<0b0100000, 0b101, "sra">, Sched<[WriteShiftReg, ReadShiftReg, ReadShiftReg]>;
def OR : ALU_rr<0b0000000, 0b110, "or">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
def AND : ALU_rr<0b0000000, 0b111, "and">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
@@ -566,20 +587,20 @@
"addiw", "$rd, $rs1, $imm12">,
Sched<[WriteIALU32, ReadIALU32]>;
-def SLLIW : ShiftW_ri<0, 0b001, "slliw">;
-def SRLIW : ShiftW_ri<0, 0b101, "srliw">;
-def SRAIW : ShiftW_ri<1, 0b101, "sraiw">;
+def SLLIW : ShiftW_ri<0b0000000, 0b001, "slliw">;
+def SRLIW : ShiftW_ri<0b0000000, 0b101, "srliw">;
+def SRAIW : ShiftW_ri<0b0100000, 0b101, "sraiw">;
def ADDW : ALUW_rr<0b0000000, 0b000, "addw">,
Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
def SUBW : ALUW_rr<0b0100000, 0b000, "subw">,
Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
def SLLW : ALUW_rr<0b0000000, 0b001, "sllw">,
- Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
+ Sched<[WriteShiftReg32, ReadShiftReg32, ReadShiftReg32]>;
def SRLW : ALUW_rr<0b0000000, 0b101, "srlw">,
- Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
+ Sched<[WriteShiftReg32, ReadShiftReg32, ReadShiftReg32]>;
def SRAW : ALUW_rr<0b0100000, 0b101, "sraw">,
- Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
+ Sched<[WriteShiftReg32, ReadShiftReg32, ReadShiftReg32]>;
} // Predicates = [IsRV64]
//===----------------------------------------------------------------------===//
@@ -837,13 +858,18 @@
/// Generic pattern classes
+class PatGpr<SDPatternOperator OpNode, RVInst Inst>
+ : Pat<(OpNode GPR:$rs1), (Inst GPR:$rs1)>;
class PatGprGpr<SDPatternOperator OpNode, RVInst Inst>
: Pat<(OpNode GPR:$rs1, GPR:$rs2), (Inst GPR:$rs1, GPR:$rs2)>;
+
+class PatGprImm<SDPatternOperator OpNode, RVInst Inst, ImmLeaf ImmType>
+ : Pat<(XLenVT (OpNode (XLenVT GPR:$rs1), ImmType:$imm)),
+ (Inst GPR:$rs1, ImmType:$imm)>;
class PatGprSimm12<SDPatternOperator OpNode, RVInstI Inst>
- : Pat<(OpNode GPR:$rs1, simm12:$imm12), (Inst GPR:$rs1, simm12:$imm12)>;
+ : PatGprImm<OpNode, Inst, simm12>;
class PatGprUimmLog2XLen<SDPatternOperator OpNode, RVInstIShift Inst>
- : Pat<(OpNode GPR:$rs1, uimmlog2xlen:$shamt),
- (Inst GPR:$rs1, uimmlog2xlen:$shamt)>;
+ : PatGprImm<OpNode, Inst, uimmlog2xlen>;
/// Predicates
@@ -853,35 +879,20 @@
def assertsexti32 : PatFrag<(ops node:$src), (assertsext node:$src), [{
return cast<VTSDNode>(N->getOperand(1))->getVT().bitsLE(MVT::i32);
}]>;
-def sexti32 : PatFrags<(ops node:$src),
- [(sext_inreg node:$src, i32),
- (assertsexti32 node:$src)]>;
+def sexti32 : ComplexPattern<i64, 1, "selectSExti32">;
def assertzexti32 : PatFrag<(ops node:$src), (assertzext node:$src), [{
return cast<VTSDNode>(N->getOperand(1))->getVT().bitsLE(MVT::i32);
}]>;
-def zexti32 : PatFrags<(ops node:$src),
- [(and node:$src, 0xffffffff),
- (assertzexti32 node:$src)]>;
+def zexti32 : ComplexPattern<i64, 1, "selectZExti32">;
-def SRLIWPat : PatFrag<(ops node:$A, node:$B),
- (srl (and node:$A, imm), node:$B), [{
- return MatchSRLIW(N);
+def add_oneuse : PatFrag<(ops node:$A, node:$B), (add node:$A, node:$B), [{
+ return N->hasOneUse();
}]>;
-// Check that it is a SLLIUW (Shift Logical Left Immediate Unsigned i32
-// on RV64). Also used to optimize the same sequence without SLLIUW.
-def SLLIUWPat : PatFrag<(ops node:$A, node:$B),
- (and (shl node:$A, node:$B), imm), [{
- return MatchSLLIUW(N);
+def mul_oneuse : PatFrag<(ops node:$A, node:$B), (mul node:$A, node:$B), [{
+ return N->hasOneUse();
}]>;
-/// Immediates
-
-def : Pat<(simm12:$imm), (ADDI X0, simm12:$imm)>;
-def : Pat<(simm32hi20:$imm), (LUI (HI20 imm:$imm))>;
-def : Pat<(simm32:$imm), (ADDI (LUI (HI20 imm:$imm)), (LO12Sext imm:$imm))>,
- Requires<[IsRV32]>;
-
/// Simple arithmetic operations
def : PatGprGpr<add, ADD>;
@@ -901,14 +912,15 @@
// typically introduced when the legalizer promotes the shift amount and
// zero-extends it). For RISC-V, the mask is unnecessary as shifts in the base
// ISA only read the least significant 5 bits (RV32I) or 6 bits (RV64I).
+def shiftMaskXLen : ComplexPattern<XLenVT, 1, "selectShiftMaskXLen", [], [], 0>;
+def shiftMask32 : ComplexPattern<i64, 1, "selectShiftMask32", [], [], 0>;
+
class shiftop<SDPatternOperator operator>
- : PatFrags<(ops node:$val, node:$count),
- [(operator node:$val, node:$count),
- (operator node:$val, (and node:$count, immbottomxlenset))]>;
+ : PatFrag<(ops node:$val, node:$count),
+ (operator node:$val, (XLenVT (shiftMaskXLen node:$count)))>;
class shiftopw<SDPatternOperator operator>
- : PatFrags<(ops node:$val, node:$count),
- [(operator node:$val, node:$count),
- (operator node:$val, (and node:$count, (XLenVT 31)))]>;
+ : PatFrag<(ops node:$val, node:$count),
+ (operator node:$val, (i64 (shiftMask32 node:$count)))>;
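
// A standalone C++ sketch (not part of the imported patch) of why the AND that
// the legalizer places on a shift amount is redundant here: the hardware shift
// already ignores all but the low log2(XLEN) bits, which is what the
// shiftMaskXLen / shiftMask32 ComplexPatterns appear to rely on when the mask
// is dropped. rv64Sll is a local model of RV64 SLL semantics, not an LLVM API.
#include <cassert>
#include <cstdint>

// RV64 SLL semantics: only bits [5:0] of the shift amount are read.
static uint64_t rv64Sll(uint64_t Val, uint64_t Amt) { return Val << (Amt & 63); }

int main() {
  uint64_t Val = 0x0123456789abcdefULL;
  for (uint64_t Amt = 0; Amt < 256; ++Amt) {
    // Pre-masking the amount (what the legalizer emits) changes nothing.
    assert(rv64Sll(Val, Amt & 63) == rv64Sll(Val, Amt));
  }
  return 0;
}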
def : PatGprGpr<shiftop<shl>, SLL>;
def : PatGprGpr<shiftop<srl>, SRL>;
@@ -960,55 +972,35 @@
(ins cmpty:$lhs, cmpty:$rhs, ixlenimm:$imm,
valty:$truev, valty:$falsev),
[(set valty:$dst, (riscv_selectcc cmpty:$lhs, cmpty:$rhs,
- (XLenVT imm:$imm), valty:$truev, valty:$falsev))]>;
+ (XLenVT timm:$imm), valty:$truev, valty:$falsev))]>;
def Select_GPR_Using_CC_GPR : SelectCC_rrirr<GPR, GPR>;
/// Branches and jumps
-// Match `(brcond (CondOp ..), ..)` and lower to the appropriate RISC-V branch
-// instruction.
-class BccPat<PatFrag CondOp, RVInstB Inst>
- : Pat<(brcond (XLenVT (CondOp GPR:$rs1, GPR:$rs2)), bb:$imm12),
+// Match `riscv_brcc` and lower to the appropriate RISC-V branch instruction.
+class BccPat<CondCode Cond, RVInstB Inst>
+ : Pat<(riscv_brcc GPR:$rs1, GPR:$rs2, Cond, bb:$imm12),
(Inst GPR:$rs1, GPR:$rs2, simm13_lsb0:$imm12)>;
-def : BccPat<seteq, BEQ>;
-def : BccPat<setne, BNE>;
-def : BccPat<setlt, BLT>;
-def : BccPat<setge, BGE>;
-def : BccPat<setult, BLTU>;
-def : BccPat<setuge, BGEU>;
-
-class BccSwapPat<PatFrag CondOp, RVInst InstBcc>
- : Pat<(brcond (XLenVT (CondOp GPR:$rs1, GPR:$rs2)), bb:$imm12),
- (InstBcc GPR:$rs2, GPR:$rs1, bb:$imm12)>;
-
-// Condition codes that don't have matching RISC-V branch instructions, but
-// are trivially supported by swapping the two input operands
-def : BccSwapPat<setgt, BLT>;
-def : BccSwapPat<setle, BGE>;
-def : BccSwapPat<setugt, BLTU>;
-def : BccSwapPat<setule, BGEU>;
-
-// Extra patterns are needed for a brcond without a setcc (i.e. where the
-// condition was calculated elsewhere).
-def : Pat<(brcond GPR:$cond, bb:$imm12), (BNE GPR:$cond, X0, bb:$imm12)>;
-// In this pattern, the `(xor $cond, 1)` functions like (boolean) `not`, as the
-// `brcond` only uses the lowest bit.
-def : Pat<(brcond (XLenVT (xor GPR:$cond, 1)), bb:$imm12),
- (BEQ GPR:$cond, X0, bb:$imm12)>;
+def : BccPat<SETEQ, BEQ>;
+def : BccPat<SETNE, BNE>;
+def : BccPat<SETLT, BLT>;
+def : BccPat<SETGE, BGE>;
+def : BccPat<SETULT, BLTU>;
+def : BccPat<SETUGE, BGEU>;
let isBarrier = 1, isBranch = 1, isTerminator = 1 in
def PseudoBR : Pseudo<(outs), (ins simm21_lsb0_jal:$imm20), [(br bb:$imm20)]>,
PseudoInstExpansion<(JAL X0, simm21_lsb0_jal:$imm20)>;
let isBarrier = 1, isBranch = 1, isIndirectBranch = 1, isTerminator = 1 in
-def PseudoBRIND : Pseudo<(outs), (ins GPR:$rs1, simm12:$imm12), []>,
+def PseudoBRIND : Pseudo<(outs), (ins GPRJALR:$rs1, simm12:$imm12), []>,
PseudoInstExpansion<(JALR X0, GPR:$rs1, simm12:$imm12)>;
-def : Pat<(brind GPR:$rs1), (PseudoBRIND GPR:$rs1, 0)>;
-def : Pat<(brind (add GPR:$rs1, simm12:$imm12)),
- (PseudoBRIND GPR:$rs1, simm12:$imm12)>;
+def : Pat<(brind GPRJALR:$rs1), (PseudoBRIND GPRJALR:$rs1, 0)>;
+def : Pat<(brind (add GPRJALR:$rs1, simm12:$imm12)),
+ (PseudoBRIND GPRJALR:$rs1, simm12:$imm12)>;
// PseudoCALLReg is a generic pseudo instruction for calls which will eventually
// expand to auipc and jalr while encoding, with any given register used as the
@@ -1040,8 +1032,8 @@
def : Pat<(riscv_mret_flag), (MRET X0, X0)>;
let isCall = 1, Defs = [X1] in
-def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rs1),
- [(riscv_call GPR:$rs1)]>,
+def PseudoCALLIndirect : Pseudo<(outs), (ins GPRJALR:$rs1),
+ [(riscv_call GPRJALR:$rs1)]>,
PseudoInstExpansion<(JALR X1, GPR:$rs1, 0)>;
let isBarrier = 1, isReturn = 1, isTerminator = 1 in
@@ -1114,14 +1106,11 @@
/// Loads
-multiclass LdPat<PatFrag LoadOp, RVInst Inst> {
- def : Pat<(LoadOp GPR:$rs1), (Inst GPR:$rs1, 0)>;
- def : Pat<(LoadOp AddrFI:$rs1), (Inst AddrFI:$rs1, 0)>;
- def : Pat<(LoadOp (add GPR:$rs1, simm12:$imm12)),
- (Inst GPR:$rs1, simm12:$imm12)>;
- def : Pat<(LoadOp (add AddrFI:$rs1, simm12:$imm12)),
- (Inst AddrFI:$rs1, simm12:$imm12)>;
- def : Pat<(LoadOp (IsOrAdd AddrFI:$rs1, simm12:$imm12)),
+multiclass LdPat<PatFrag LoadOp, RVInst Inst, ValueType vt = XLenVT> {
+ def : Pat<(vt (LoadOp BaseAddr:$rs1)), (Inst BaseAddr:$rs1, 0)>;
+ def : Pat<(vt (LoadOp (add BaseAddr:$rs1, simm12:$imm12))),
+ (Inst BaseAddr:$rs1, simm12:$imm12)>;
+ def : Pat<(vt (LoadOp (IsOrAdd AddrFI:$rs1, simm12:$imm12))),
(Inst AddrFI:$rs1, simm12:$imm12)>;
}
@@ -1129,26 +1118,25 @@
defm : LdPat<extloadi8, LB>;
defm : LdPat<sextloadi16, LH>;
defm : LdPat<extloadi16, LH>;
-defm : LdPat<load, LW>, Requires<[IsRV32]>;
+defm : LdPat<load, LW, i32>, Requires<[IsRV32]>;
defm : LdPat<zextloadi8, LBU>;
defm : LdPat<zextloadi16, LHU>;
/// Stores
-multiclass StPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy> {
- def : Pat<(StoreOp StTy:$rs2, GPR:$rs1), (Inst StTy:$rs2, GPR:$rs1, 0)>;
- def : Pat<(StoreOp StTy:$rs2, AddrFI:$rs1), (Inst StTy:$rs2, AddrFI:$rs1, 0)>;
- def : Pat<(StoreOp StTy:$rs2, (add GPR:$rs1, simm12:$imm12)),
- (Inst StTy:$rs2, GPR:$rs1, simm12:$imm12)>;
- def : Pat<(StoreOp StTy:$rs2, (add AddrFI:$rs1, simm12:$imm12)),
- (Inst StTy:$rs2, AddrFI:$rs1, simm12:$imm12)>;
- def : Pat<(StoreOp StTy:$rs2, (IsOrAdd AddrFI:$rs1, simm12:$imm12)),
+multiclass StPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy,
+ ValueType vt> {
+ def : Pat<(StoreOp (vt StTy:$rs2), BaseAddr:$rs1),
+ (Inst StTy:$rs2, BaseAddr:$rs1, 0)>;
+ def : Pat<(StoreOp (vt StTy:$rs2), (add BaseAddr:$rs1, simm12:$imm12)),
+ (Inst StTy:$rs2, BaseAddr:$rs1, simm12:$imm12)>;
+ def : Pat<(StoreOp (vt StTy:$rs2), (IsOrAdd AddrFI:$rs1, simm12:$imm12)),
(Inst StTy:$rs2, AddrFI:$rs1, simm12:$imm12)>;
}
-defm : StPat<truncstorei8, SB, GPR>;
-defm : StPat<truncstorei16, SH, GPR>;
-defm : StPat<store, SW, GPR>, Requires<[IsRV32]>;
+defm : StPat<truncstorei8, SB, GPR, XLenVT>;
+defm : StPat<truncstorei16, SH, GPR, XLenVT>;
+defm : StPat<store, SW, GPR, i32>, Requires<[IsRV32]>;
/// Fences
@@ -1170,6 +1158,58 @@
// present. This is necessary as it isn't valid to mix __atomic_* libcalls
// with inline atomic operations for the same object.
+/// Access to system registers
+
+// Helpers for defining specific operations. They are defined for each system
+// register separately. Side effects are not used because dependencies are
+// expressed via use-def properties.
+
+class ReadSysReg<SysReg SR, list<Register> Regs>
+ : Pseudo<(outs GPR:$rd), (ins),
+ [(set GPR:$rd, (riscv_read_csr (XLenVT SR.Encoding)))]>,
+ PseudoInstExpansion<(CSRRS GPR:$rd, SR.Encoding, X0)> {
+ let hasSideEffects = 0;
+ let Uses = Regs;
+}
+
+class WriteSysReg<SysReg SR, list<Register> Regs>
+ : Pseudo<(outs), (ins GPR:$val),
+ [(riscv_write_csr (XLenVT SR.Encoding), GPR:$val)]>,
+ PseudoInstExpansion<(CSRRW X0, SR.Encoding, GPR:$val)> {
+ let hasSideEffects = 0;
+ let Defs = Regs;
+}
+
+class WriteSysRegImm<SysReg SR, list<Register> Regs>
+ : Pseudo<(outs), (ins uimm5:$val),
+ [(riscv_write_csr (XLenVT SR.Encoding), uimm5:$val)]>,
+ PseudoInstExpansion<(CSRRWI X0, SR.Encoding, uimm5:$val)> {
+ let hasSideEffects = 0;
+ let Defs = Regs;
+}
+
+class SwapSysReg<SysReg SR, list<Register> Regs>
+ : Pseudo<(outs GPR:$rd), (ins GPR:$val),
+ [(set GPR:$rd, (riscv_swap_csr (XLenVT SR.Encoding), GPR:$val))]>,
+ PseudoInstExpansion<(CSRRW GPR:$rd, SR.Encoding, GPR:$val)> {
+ let hasSideEffects = 0;
+ let Uses = Regs;
+ let Defs = Regs;
+}
+
+class SwapSysRegImm<SysReg SR, list<Register> Regs>
+ : Pseudo<(outs GPR:$rd), (ins uimm5:$val),
+ [(set GPR:$rd, (riscv_swap_csr (XLenVT SR.Encoding), uimm5:$val))]>,
+ PseudoInstExpansion<(CSRRWI GPR:$rd, SR.Encoding, uimm5:$val)> {
+ let hasSideEffects = 0;
+ let Uses = Regs;
+ let Defs = Regs;
+}
+
+def ReadFRM : ReadSysReg<SysRegFRM, [FRM]>;
+def WriteFRM : WriteSysReg<SysRegFRM, [FRM]>;
+def WriteFRMImm : WriteSysRegImm<SysRegFRM, [FRM]>;
+
/// Other pseudo-instructions
// Pessimistically assume the stack pointer will be clobbered
@@ -1183,14 +1223,11 @@
/// RV64 patterns
let Predicates = [IsRV64, NotHasStdExtZba] in {
-def : Pat<(and GPR:$rs1, 0xffffffff), (SRLI (SLLI GPR:$rs1, 32), 32)>;
+def : Pat<(i64 (and GPR:$rs1, 0xffffffff)), (SRLI (SLLI GPR:$rs1, 32), 32)>;
// If we're shifting a 32-bit zero extended value left by 0-31 bits, use 2
// shifts instead of 3. This can occur when unsigned is used to index an array.
-def : Pat<(shl (and GPR:$rs1, 0xffffffff), uimm5:$shamt),
- (SRLI (SLLI GPR:$rs1, 32), (ImmSubFrom32 uimm5:$shamt))>;
-// shl/and can appear in the other order too.
-def : Pat<(SLLIUWPat GPR:$rs1, uimm5:$shamt),
+def : Pat<(i64 (shl (and GPR:$rs1, 0xffffffff), uimm5:$shamt)),
(SRLI (SLLI GPR:$rs1, 32), (ImmSubFrom32 uimm5:$shamt))>;
}
@@ -1210,13 +1247,13 @@
(SUBW GPR:$rs1, GPR:$rs2)>;
def : Pat<(sext_inreg (shl GPR:$rs1, uimm5:$shamt), i32),
(SLLIW GPR:$rs1, uimm5:$shamt)>;
-def : Pat<(SRLIWPat GPR:$rs1, uimm5:$shamt),
+def : Pat<(i64 (srl (and GPR:$rs1, 0xffffffff), uimm5:$shamt)),
(SRLIW GPR:$rs1, uimm5:$shamt)>;
-def : Pat<(srl (shl GPR:$rs1, (i64 32)), uimm6gt32:$shamt),
+def : Pat<(i64 (srl (shl GPR:$rs1, (i64 32)), uimm6gt32:$shamt)),
(SRLIW GPR:$rs1, (ImmSub32 uimm6gt32:$shamt))>;
def : Pat<(sra (sext_inreg GPR:$rs1, i32), uimm5:$shamt),
(SRAIW GPR:$rs1, uimm5:$shamt)>;
-def : Pat<(sra (shl GPR:$rs1, (i64 32)), uimm6gt32:$shamt),
+def : Pat<(i64 (sra (shl GPR:$rs1, (i64 32)), uimm6gt32:$shamt)),
(SRAIW GPR:$rs1, (ImmSub32 uimm6gt32:$shamt))>;
def : PatGprGpr<shiftopw<riscv_sllw>, SLLW>;
@@ -1225,21 +1262,21 @@
/// Loads
-defm : LdPat<sextloadi32, LW>;
-defm : LdPat<extloadi32, LW>;
-defm : LdPat<zextloadi32, LWU>;
-defm : LdPat<load, LD>;
+defm : LdPat<sextloadi32, LW, i64>;
+defm : LdPat<extloadi32, LW, i64>;
+defm : LdPat<zextloadi32, LWU, i64>;
+defm : LdPat<load, LD, i64>;
/// Stores
-defm : StPat<truncstorei32, SW, GPR>;
-defm : StPat<store, SD, GPR>;
+defm : StPat<truncstorei32, SW, GPR, i64>;
+defm : StPat<store, SD, GPR, i64>;
} // Predicates = [IsRV64]
/// readcyclecounter
// On RV64, we can directly read the 64-bit "cycle" CSR.
let Predicates = [IsRV64] in
-def : Pat<(readcyclecounter), (CSRRS CYCLE.Encoding, X0)>;
+def : Pat<(i64 (readcyclecounter)), (CSRRS CYCLE.Encoding, X0)>;
// On RV32, ReadCycleWide will be expanded to the suggested loop reading both
// halves of the 64-bit "cycle" CSR.
let Predicates = [IsRV32], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
@@ -1257,6 +1294,17 @@
// debugger if possible.
def : Pat<(debugtrap), (EBREAK)>;
+/// Simple optimization
+def : Pat<(add GPR:$rs1, (AddiPair:$rs2)),
+ (ADDI (ADDI GPR:$rs1, (AddiPairImmB AddiPair:$rs2)),
+ (AddiPairImmA GPR:$rs2))>;
+
+let Predicates = [IsRV64] in {
+def : Pat<(sext_inreg (add_oneuse GPR:$rs1, (AddiPair:$rs2)), i32),
+ (ADDIW (ADDIW GPR:$rs1, (AddiPairImmB AddiPair:$rs2)),
+ (AddiPairImmA AddiPair:$rs2))>;
+}
+
//===----------------------------------------------------------------------===//
// Standard extensions
//===----------------------------------------------------------------------===//
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
index 7fce375..ee10c3a 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -61,14 +61,13 @@
def _AQ_RL : AMO_rr<funct5, 1, 1, funct3, opcodestr # ".aqrl">;
}
-multiclass AtomicStPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy> {
- def : Pat<(StoreOp GPR:$rs1, StTy:$rs2), (Inst StTy:$rs2, GPR:$rs1, 0)>;
- def : Pat<(StoreOp AddrFI:$rs1, StTy:$rs2), (Inst StTy:$rs2, AddrFI:$rs1, 0)>;
- def : Pat<(StoreOp (add GPR:$rs1, simm12:$imm12), StTy:$rs2),
- (Inst StTy:$rs2, GPR:$rs1, simm12:$imm12)>;
- def : Pat<(StoreOp (add AddrFI:$rs1, simm12:$imm12), StTy:$rs2),
- (Inst StTy:$rs2, AddrFI:$rs1, simm12:$imm12)>;
- def : Pat<(StoreOp (IsOrAdd AddrFI:$rs1, simm12:$imm12), StTy:$rs2),
+multiclass AtomicStPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy,
+ ValueType vt = XLenVT> {
+ def : Pat<(StoreOp BaseAddr:$rs1, (vt StTy:$rs2)),
+ (Inst StTy:$rs2, BaseAddr:$rs1, 0)>;
+ def : Pat<(StoreOp (add BaseAddr:$rs1, simm12:$imm12), (vt StTy:$rs2)),
+ (Inst StTy:$rs2, BaseAddr:$rs1, simm12:$imm12)>;
+ def : Pat<(StoreOp (IsOrAdd AddrFI:$rs1, simm12:$imm12), (vt StTy:$rs2)),
(Inst StTy:$rs2, AddrFI:$rs1, simm12:$imm12)>;
}
@@ -320,8 +319,8 @@
// Fences will be inserted for atomic load/stores according to the logic in
// RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}.
-defm : LdPat<atomic_load_64, LD>;
-defm : AtomicStPat<atomic_store_64, SD, GPR>;
+defm : LdPat<atomic_load_64, LD, i64>;
+defm : AtomicStPat<atomic_store_64, SD, GPR, i64>;
defm : AMOPat<"atomic_swap_64", "AMOSWAP_D">;
defm : AMOPat<"atomic_load_add_64", "AMOADD_D">;
@@ -335,15 +334,15 @@
/// 64-bit AMOs
-def : Pat<(atomic_load_sub_64_monotonic GPR:$addr, GPR:$incr),
+def : Pat<(i64 (atomic_load_sub_64_monotonic GPR:$addr, GPR:$incr)),
(AMOADD_D GPR:$addr, (SUB X0, GPR:$incr))>;
-def : Pat<(atomic_load_sub_64_acquire GPR:$addr, GPR:$incr),
+def : Pat<(i64 (atomic_load_sub_64_acquire GPR:$addr, GPR:$incr)),
(AMOADD_D_AQ GPR:$addr, (SUB X0, GPR:$incr))>;
-def : Pat<(atomic_load_sub_64_release GPR:$addr, GPR:$incr),
+def : Pat<(i64 (atomic_load_sub_64_release GPR:$addr, GPR:$incr)),
(AMOADD_D_RL GPR:$addr, (SUB X0, GPR:$incr))>;
-def : Pat<(atomic_load_sub_64_acq_rel GPR:$addr, GPR:$incr),
+def : Pat<(i64 (atomic_load_sub_64_acq_rel GPR:$addr, GPR:$incr)),
(AMOADD_D_AQ_RL GPR:$addr, (SUB X0, GPR:$incr))>;
-def : Pat<(atomic_load_sub_64_seq_cst GPR:$addr, GPR:$incr),
+def : Pat<(i64 (atomic_load_sub_64_seq_cst GPR:$addr, GPR:$incr)),
(AMOADD_D_AQ_RL GPR:$addr, (SUB X0, GPR:$incr))>;
/// 64-bit pseudo AMOs
@@ -351,15 +350,15 @@
def PseudoAtomicLoadNand64 : PseudoAMO;
// Ordering constants must be kept in sync with the AtomicOrdering enum in
// AtomicOrdering.h.
-def : Pat<(atomic_load_nand_64_monotonic GPR:$addr, GPR:$incr),
+def : Pat<(i64 (atomic_load_nand_64_monotonic GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 2)>;
-def : Pat<(atomic_load_nand_64_acquire GPR:$addr, GPR:$incr),
+def : Pat<(i64 (atomic_load_nand_64_acquire GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 4)>;
-def : Pat<(atomic_load_nand_64_release GPR:$addr, GPR:$incr),
+def : Pat<(i64 (atomic_load_nand_64_release GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 5)>;
-def : Pat<(atomic_load_nand_64_acq_rel GPR:$addr, GPR:$incr),
+def : Pat<(i64 (atomic_load_nand_64_acq_rel GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 6)>;
-def : Pat<(atomic_load_nand_64_seq_cst GPR:$addr, GPR:$incr),
+def : Pat<(i64 (atomic_load_nand_64_seq_cst GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 7)>;
def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_xchg_i64,
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
index 7888ac7..7359e56 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
@@ -17,10 +17,26 @@
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
-def riscv_rolw : SDNode<"RISCVISD::ROLW", SDTIntShiftOp>;
-def riscv_rorw : SDNode<"RISCVISD::RORW", SDTIntShiftOp>;
-def riscv_fslw : SDNode<"RISCVISD::FSLW", SDTIntShiftDOp>;
-def riscv_fsrw : SDNode<"RISCVISD::FSRW", SDTIntShiftDOp>;
+def riscv_clzw : SDNode<"RISCVISD::CLZW", SDT_RISCVIntUnaryOpW>;
+def riscv_ctzw : SDNode<"RISCVISD::CTZW", SDT_RISCVIntUnaryOpW>;
+def riscv_rolw : SDNode<"RISCVISD::ROLW", SDT_RISCVIntBinOpW>;
+def riscv_rorw : SDNode<"RISCVISD::RORW", SDT_RISCVIntBinOpW>;
+def riscv_fslw : SDNode<"RISCVISD::FSLW", SDT_RISCVIntShiftDOpW>;
+def riscv_fsrw : SDNode<"RISCVISD::FSRW", SDT_RISCVIntShiftDOpW>;
+def riscv_fsl : SDNode<"RISCVISD::FSL", SDTIntShiftDOp>;
+def riscv_fsr : SDNode<"RISCVISD::FSR", SDTIntShiftDOp>;
+def riscv_grev : SDNode<"RISCVISD::GREV", SDTIntBinOp>;
+def riscv_grevw : SDNode<"RISCVISD::GREVW", SDT_RISCVIntBinOpW>;
+def riscv_gorc : SDNode<"RISCVISD::GORC", SDTIntBinOp>;
+def riscv_gorcw : SDNode<"RISCVISD::GORCW", SDT_RISCVIntBinOpW>;
+def riscv_shfl : SDNode<"RISCVISD::SHFL", SDTIntBinOp>;
+def riscv_shflw : SDNode<"RISCVISD::SHFLW", SDT_RISCVIntBinOpW>;
+def riscv_unshfl : SDNode<"RISCVISD::UNSHFL", SDTIntBinOp>;
+def riscv_unshflw: SDNode<"RISCVISD::UNSHFLW",SDT_RISCVIntBinOpW>;
+def riscv_bcompress : SDNode<"RISCVISD::BCOMPRESS", SDTIntBinOp>;
+def riscv_bcompressw : SDNode<"RISCVISD::BCOMPRESSW", SDT_RISCVIntBinOpW>;
+def riscv_bdecompress : SDNode<"RISCVISD::BDECOMPRESS", SDTIntBinOp>;
+def riscv_bdecompressw : SDNode<"RISCVISD::BDECOMPRESSW",SDT_RISCVIntBinOpW>;
def UImmLog2XLenHalfAsmOperand : AsmOperandClass {
let Name = "UImmLog2XLenHalf";
@@ -45,20 +61,6 @@
}];
}
-// Checks if this mask has a single 0 bit and cannot be used with ANDI.
-def BCLRMask : ImmLeaf<XLenVT, [{
- if (Subtarget->is64Bit())
- return !isInt<12>(Imm) && isPowerOf2_64(~Imm);
- return !isInt<12>(Imm) && isPowerOf2_32(~Imm);
-}]>;
-
-// Checks if this mask has a single 1 bit and cannot be used with ORI/XORI.
-def BSETINVMask : ImmLeaf<XLenVT, [{
- if (Subtarget->is64Bit())
- return !isInt<12>(Imm) && isPowerOf2_64(Imm);
- return !isInt<12>(Imm) && isPowerOf2_32(Imm);
-}]>;
-
def BCLRXForm : SDNodeXForm<imm, [{
// Find the lowest 0.
return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingOnes(),
@@ -71,16 +73,117 @@
SDLoc(N), N->getValueType(0));
}]>;
-// Similar to above, but makes sure the immediate has 33 sign bits. When used
-// with an AND/OR/XOR where the other operand has at least 33 sign bits, the
-// result will have 33 sign bits. This can match BCLRIW/BSETIW/BINVIW.
-def BCLRWMask : ImmLeaf<i64, [{
- // After checking the sign bits, truncate to 32 bits for power of 2 check.
- return isInt<32>(Imm) && !isInt<12>(Imm) && isPowerOf2_32(~Imm);
+// Checks if this mask has a single 0 bit and cannot be used with ANDI.
+def BCLRMask : ImmLeaf<XLenVT, [{
+ if (Subtarget->is64Bit())
+ return !isInt<12>(Imm) && isPowerOf2_64(~Imm);
+ return !isInt<12>(Imm) && isPowerOf2_32(~Imm);
+}], BCLRXForm>;
+
+// Checks if this mask has a single 1 bit and cannot be used with ORI/XORI.
+def BSETINVMask : ImmLeaf<XLenVT, [{
+ if (Subtarget->is64Bit())
+ return !isInt<12>(Imm) && isPowerOf2_64(Imm);
+ return !isInt<12>(Imm) && isPowerOf2_32(Imm);
+}], BSETINVXForm>;
+
+// Check if (or r, i) can be optimized to (BSETI (BSETI r, i0), i1),
+// in which i = (1 << i0) | (1 << i1).
+def BSETINVTwoBitsMask : PatLeaf<(imm), [{
+ if (!N->hasOneUse())
+ return false;
+ // The immediate should not be a simm12.
+ if (isInt<12>(N->getSExtValue()))
+ return false;
+ // The immediate must have exactly two bits set.
+ return countPopulation(N->getZExtValue()) == 2;
}]>;
-def BSETINVWMask : ImmLeaf<i64, [{
- return isInt<32>(Imm) && !isInt<12>(Imm) && isPowerOf2_32(Imm);
+def TrailingZerosXForm : SDNodeXForm<imm, [{
+ uint64_t I = N->getZExtValue();
+ return CurDAG->getTargetConstant(countTrailingZeros(I), SDLoc(N),
+ N->getValueType(0));
+}]>;
+
+def BSETINVTwoBitsMaskHigh : SDNodeXForm<imm, [{
+ uint64_t I = N->getZExtValue();
+ return CurDAG->getTargetConstant(63 - countLeadingZeros(I), SDLoc(N),
+ N->getValueType(0));
+}]>;
+
+// Check if (or r, imm) can be optimized to (BSETI (ORI r, i0), i1),
+// in which imm = i0 | (1 << i1).
+def BSETINVORIMask : PatLeaf<(imm), [{
+ if (!N->hasOneUse())
+ return false;
+ // The immediate should not be a simm12.
+ if (isInt<12>(N->getSExtValue()))
+ return false;
+ // There should be only one set bit from bit 11 to the top.
+ return isPowerOf2_64(N->getZExtValue() & ~0x7ff);
+}]>;
+
+def BSETINVORIMaskLow : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 0x7ff,
+ SDLoc(N), N->getValueType(0));
+}]>;
+
+// Check if (and r, i) can be optimized to (BCLRI (BCLRI r, i0), i1),
+// in which i = ~((1<<i0) | (1<<i1)).
+def BCLRITwoBitsMask : PatLeaf<(imm), [{
+ if (!N->hasOneUse())
+ return false;
+ // The immediate should not be a simm12.
+ if (isInt<12>(N->getSExtValue()))
+ return false;
+ // The immediate must have exactly two bits clear.
+ return countPopulation(N->getZExtValue()) == Subtarget->getXLen() - 2;
+}]>;
+
+def BCLRITwoBitsMaskLow : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(countTrailingZeros(~N->getZExtValue()),
+ SDLoc(N), N->getValueType(0));
+}]>;
+
+def BCLRITwoBitsMaskHigh : SDNodeXForm<imm, [{
+ uint64_t I = N->getSExtValue();
+ if (!Subtarget->is64Bit())
+ I |= 0xffffffffull << 32;
+ return CurDAG->getTargetConstant(63 - countLeadingZeros(~I), SDLoc(N),
+ N->getValueType(0));
+}]>;
+
+// Check if (and r, i) can be optimized to (BCLRI (ANDI r, i0), i1),
+// in which i = i0 & ~(1<<i1).
+def BCLRIANDIMask : PatLeaf<(imm), [{
+ if (!N->hasOneUse())
+ return false;
+ // The immediate should not be a simm12.
+ if (isInt<12>(N->getSExtValue()))
+ return false;
+ // There should be only one clear bit from bit 11 to the top.
+ uint64_t I = N->getZExtValue() | 0x7ff;
+ return Subtarget->is64Bit() ? isPowerOf2_64(~I) : isPowerOf2_32(~I);
+}]>;
+
+def BCLRIANDIMaskLow : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() & 0x7ff) | ~0x7ffull,
+ SDLoc(N), N->getValueType(0));
+}]>;
+
+def C3LeftShift : PatLeaf<(imm), [{
+ uint64_t C = N->getZExtValue();
+ return C > 3 && ((C % 3) == 0) && isPowerOf2_64(C / 3);
+}]>;
+
+def C5LeftShift : PatLeaf<(imm), [{
+ uint64_t C = N->getZExtValue();
+ return C > 5 && ((C % 5) == 0) && isPowerOf2_64(C / 5);
+}]>;
+
+def C9LeftShift : PatLeaf<(imm), [{
+ uint64_t C = N->getZExtValue();
+ return C > 9 && ((C % 9) == 0) && isPowerOf2_64(C / 9);
}]>;
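
// A standalone C++ sketch (not part of the imported patch) of the decomposition
// the C3/C5/C9 PatLeafs above appear to enable: a multiply by C = 3*2^k
// (or 5*2^k, 9*2^k) can be done as a shNadd-style "x + (x << N)" followed by a
// left shift by k, avoiding a MUL. The assumed lowering is shown with plain
// integer arithmetic; the helper names are illustrative only.
#include <cassert>
#include <cstdint>

static uint64_t mulBy3Shift(uint64_t X, unsigned K) { return (X + (X << 1)) << K; } // sh1add, slli
static uint64_t mulBy5Shift(uint64_t X, unsigned K) { return (X + (X << 2)) << K; } // sh2add, slli
static uint64_t mulBy9Shift(uint64_t X, unsigned K) { return (X + (X << 3)) << K; } // sh3add, slli

int main() {
  uint64_t X = 12345;
  assert(mulBy3Shift(X, 2) == X * 12);  // C = 12 = 3 << 2
  assert(mulBy5Shift(X, 3) == X * 40);  // C = 40 = 5 << 3
  assert(mulBy9Shift(X, 1) == X * 18);  // C = 18 = 9 << 1
  return 0;
}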
//===----------------------------------------------------------------------===//
@@ -95,84 +198,75 @@
RISCVOpcode opcode, string opcodestr>
: RVInstR<funct7, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1),
opcodestr, "$rd, $rs1"> {
- let Inst{24-20} = funct5;
+ let rs2 = funct5;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVBShift_ri<bits<5> funct5, bits<3> funct3, RISCVOpcode opcode,
+class RVBShift_ri<bits<5> imm11_7, bits<3> funct3, RISCVOpcode opcode,
string opcodestr>
- : RVInstI<funct3, opcode, (outs GPR:$rd),
- (ins GPR:$rs1, uimmlog2xlen:$shamt), opcodestr,
- "$rd, $rs1, $shamt"> {
- bits<6> shamt;
-
- let Inst{31-27} = funct5;
- // NOTE: the bit op(26)=1 is used to select funnel shifts. All other
- // shifts operations and operations that live in the encoding space
- // of the shifts (single bit operations, grev, gorc) use op(26) = 0
- let Inst{26} = 0;
- let Inst{25-20} = shamt;
-}
+ : RVInstIShift<imm11_7, funct3, opcode, (outs GPR:$rd),
+ (ins GPR:$rs1, uimmlog2xlen:$shamt), opcodestr,
+ "$rd, $rs1, $shamt">;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVBShiftW_ri<bits<7> funct7, bits<3> funct3, RISCVOpcode opcode,
+class RVBShiftW_ri<bits<7> imm11_5, bits<3> funct3, RISCVOpcode opcode,
string opcodestr>
- : RVInstI<funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, uimm5:$shamt),
- opcodestr, "$rd, $rs1, $shamt"> {
- bits<5> shamt;
+ : RVInstIShiftW<imm11_5, funct3, opcode, (outs GPR:$rd),
+ (ins GPR:$rs1, uimm5:$shamt), opcodestr,
+ "$rd, $rs1, $shamt">;
- let Inst{31-25} = funct7;
- let Inst{24-20} = shamt;
-}
-
+// Using RVInstIShiftW since it allocates 5 bits instead of 6 to shamt.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVBShfl_ri<bits<6> funct6, bits<3> funct3, RISCVOpcode opcode,
+class RVBShfl_ri<bits<7> imm11_5, bits<3> funct3, RISCVOpcode opcode,
string opcodestr>
- : RVInstI<funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, shfl_uimm:$shamt),
- opcodestr, "$rd, $rs1, $shamt"> {
- bits<6> shamt;
-
- let Inst{31-26} = funct6;
- let Inst{25-20} = shamt;
-}
+ : RVInstIShiftW<imm11_5, funct3, opcode, (outs GPR:$rd),
+ (ins GPR:$rs1, shfl_uimm:$shamt), opcodestr,
+ "$rd, $rs1, $shamt">;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVBTernaryR<bits<2> funct2, bits<3> funct3_b, RISCVOpcode opcode,
+class RVBTernaryR<bits<2> funct2, bits<3> funct3, RISCVOpcode opcode,
string opcodestr, string argstr>
- : RVInstR4<funct2, opcode, (outs GPR:$rd),
- (ins GPR:$rs1, GPR:$rs2, GPR:$rs3), opcodestr, argstr> {
- let Inst{14-12} = funct3_b;
-}
+ : RVInstR4<funct2, funct3, opcode, (outs GPR:$rd),
+ (ins GPR:$rs1, GPR:$rs2, GPR:$rs3), opcodestr, argstr>;
// Currently used by FSRI only
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVBTernaryImm6<bits<3> funct3_b, RISCVOpcode opcode,
+class RVBTernaryImm6<bits<3> funct3, RISCVOpcode opcode,
string opcodestr, string argstr>
- : RVInstR4<0b10, opcode, (outs GPR:$rd),
- (ins GPR:$rs1, GPR:$rs3, uimmlog2xlen:$shamt),
- opcodestr, argstr> {
+ : RVInst<(outs GPR:$rd), (ins GPR:$rs1, GPR:$rs3, uimmlog2xlen:$shamt),
+ opcodestr, argstr, [], InstFormatR4> {
+ bits<5> rs3;
bits<6> shamt;
+ bits<5> rs1;
+ bits<5> rd;
- // NOTE: the first argument of RVInstR4 is hardcoded to 0b10 like the other
- // funnel shift instructions. The second bit of the argument though is
- // overwritten by the shamt as the encoding of this particular instruction
- // requires. This is to obtain op(26) = 1 as required by funnel shift
- // instructions without the need of a confusing argument in the definition
- // of the instruction.
+ let Inst{31-27} = rs3;
+ let Inst{26} = 1;
let Inst{25-20} = shamt;
- let Inst{14-12} = funct3_b;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-7} = rd;
+ let Opcode = opcode.Value;
}
// Currently used by FSRIW only
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVBTernaryImm5<bits<2> funct2, bits<3> funct3_b, RISCVOpcode opcode,
+class RVBTernaryImm5<bits<2> funct2, bits<3> funct3, RISCVOpcode opcode,
string opcodestr, string argstr>
- : RVInstR4<funct2, opcode, (outs GPR:$rd),
- (ins GPR:$rs1, GPR:$rs3, uimm5:$shamt), opcodestr, argstr> {
+ : RVInst<(outs GPR:$rd), (ins GPR:$rs1, GPR:$rs3, uimm5:$shamt),
+ opcodestr, argstr, [], InstFormatR4> {
+ bits<5> rs3;
bits<5> shamt;
+ bits<5> rs1;
+ bits<5> rd;
+ let Inst{31-27} = rs3;
+ let Inst{26-25} = funct2;
let Inst{24-20} = shamt;
- let Inst{14-12} = funct3_b;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-7} = rd;
+ let Opcode = opcode.Value;
}
//===----------------------------------------------------------------------===//
@@ -180,20 +274,28 @@
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtZbbOrZbp] in {
-def ANDN : ALU_rr<0b0100000, 0b111, "andn">, Sched<[]>;
-def ORN : ALU_rr<0b0100000, 0b110, "orn">, Sched<[]>;
-def XNOR : ALU_rr<0b0100000, 0b100, "xnor">, Sched<[]>;
+def ANDN : ALU_rr<0b0100000, 0b111, "andn">,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def ORN : ALU_rr<0b0100000, 0b110, "orn">,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def XNOR : ALU_rr<0b0100000, 0b100, "xnor">,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]>;
} // Predicates = [HasStdExtZbbOrZbp]
let Predicates = [HasStdExtZba] in {
-def SH1ADD : ALU_rr<0b0010000, 0b010, "sh1add">, Sched<[]>;
-def SH2ADD : ALU_rr<0b0010000, 0b100, "sh2add">, Sched<[]>;
-def SH3ADD : ALU_rr<0b0010000, 0b110, "sh3add">, Sched<[]>;
+def SH1ADD : ALU_rr<0b0010000, 0b010, "sh1add">,
+ Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>;
+def SH2ADD : ALU_rr<0b0010000, 0b100, "sh2add">,
+ Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>;
+def SH3ADD : ALU_rr<0b0010000, 0b110, "sh3add">,
+ Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>;
} // Predicates = [HasStdExtZba]
let Predicates = [HasStdExtZbbOrZbp] in {
-def ROL : ALU_rr<0b0110000, 0b001, "rol">, Sched<[]>;
-def ROR : ALU_rr<0b0110000, 0b101, "ror">, Sched<[]>;
+def ROL : ALU_rr<0b0110000, 0b001, "rol">,
+ Sched<[WriteRotateReg, ReadRotateReg, ReadRotateReg]>;
+def ROR : ALU_rr<0b0110000, 0b101, "ror">,
+ Sched<[WriteRotateReg, ReadRotateReg, ReadRotateReg]>;
} // Predicates = [HasStdExtZbbOrZbp]
let Predicates = [HasStdExtZbs] in {
@@ -215,7 +317,8 @@
} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbbOrZbp] in
-def RORI : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">, Sched<[]>;
+def RORI : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">,
+ Sched<[WriteRotateImm, ReadRotateImm]>;
let Predicates = [HasStdExtZbs] in {
def BCLRI : RVBShift_ri<0b01001, 0b001, OPC_OP_IMM, "bclri">, Sched<[]>;
@@ -244,11 +347,11 @@
let Predicates = [HasStdExtZbb] in {
def CLZ : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0010011>, "clz">,
- Sched<[]>;
+ Sched<[WriteCLZ, ReadCLZ]>;
def CTZ : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0010011>, "ctz">,
- Sched<[]>;
+ Sched<[WriteCTZ, ReadCTZ]>;
def CPOP : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0010011>, "cpop">,
- Sched<[]>;
+ Sched<[WriteCPOP, ReadCPOP]>;
} // Predicates = [HasStdExtZbb]
let Predicates = [HasStdExtZbm, IsRV64] in
@@ -257,9 +360,9 @@
let Predicates = [HasStdExtZbb] in {
def SEXTB : RVBUnary<0b0110000, 0b00100, 0b001, RISCVOpcode<0b0010011>,
- "sext.b">, Sched<[]>;
+ "sext.b">, Sched<[WriteIALU, ReadIALU]>;
def SEXTH : RVBUnary<0b0110000, 0b00101, 0b001, RISCVOpcode<0b0010011>,
- "sext.h">, Sched<[]>;
+ "sext.h">, Sched<[WriteIALU, ReadIALU]>;
} // Predicates = [HasStdExtZbb]
let Predicates = [HasStdExtZbr] in {
@@ -295,10 +398,14 @@
} // Predicates = [HasStdExtZbc]
let Predicates = [HasStdExtZbb] in {
-def MIN : ALU_rr<0b0000101, 0b100, "min">, Sched<[]>;
-def MINU : ALU_rr<0b0000101, 0b101, "minu">, Sched<[]>;
-def MAX : ALU_rr<0b0000101, 0b110, "max">, Sched<[]>;
-def MAXU : ALU_rr<0b0000101, 0b111, "maxu">, Sched<[]>;
+def MIN : ALU_rr<0b0000101, 0b100, "min">,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def MINU : ALU_rr<0b0000101, 0b101, "minu">,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def MAX : ALU_rr<0b0000101, 0b110, "max">,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def MAXU : ALU_rr<0b0000101, 0b111, "maxu">,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]>;
} // Predicates = [HasStdExtZbb]
let Predicates = [HasStdExtZbp] in {
@@ -328,24 +435,33 @@
def BFP : ALU_rr<0b0100100, 0b111, "bfp">, Sched<[]>;
let Predicates = [HasStdExtZbp] in {
-def SHFLI : RVBShfl_ri<0b000010, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>;
-def UNSHFLI : RVBShfl_ri<0b000010, 0b101, OPC_OP_IMM, "unshfli">, Sched<[]>;
+def SHFLI : RVBShfl_ri<0b0000100, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>;
+def UNSHFLI : RVBShfl_ri<0b0000100, 0b101, OPC_OP_IMM, "unshfli">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZba, IsRV64] in {
-def SLLIUW : RVBShift_ri<0b00001, 0b001, OPC_OP_IMM_32, "slli.uw">, Sched<[]>;
-def ADDUW : ALUW_rr<0b0000100, 0b000, "add.uw">, Sched<[]>;
-def SH1ADDUW : ALUW_rr<0b0010000, 0b010, "sh1add.uw">, Sched<[]>;
-def SH2ADDUW : ALUW_rr<0b0010000, 0b100, "sh2add.uw">, Sched<[]>;
-def SH3ADDUW : ALUW_rr<0b0010000, 0b110, "sh3add.uw">, Sched<[]>;
+def SLLIUW : RVBShift_ri<0b00001, 0b001, OPC_OP_IMM_32, "slli.uw">,
+ Sched<[WriteShiftImm32, ReadShiftImm32]>;
+def ADDUW : ALUW_rr<0b0000100, 0b000, "add.uw">,
+ Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
+def SH1ADDUW : ALUW_rr<0b0010000, 0b010, "sh1add.uw">,
+ Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>;
+def SH2ADDUW : ALUW_rr<0b0010000, 0b100, "sh2add.uw">,
+ Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>;
+def SH3ADDUW : ALUW_rr<0b0010000, 0b110, "sh3add.uw">,
+ Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>;
} // Predicates = [HasStdExtZba, IsRV64]
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
-def ROLW : ALUW_rr<0b0110000, 0b001, "rolw">, Sched<[]>;
-def RORW : ALUW_rr<0b0110000, 0b101, "rorw">, Sched<[]>;
+def ROLW : ALUW_rr<0b0110000, 0b001, "rolw">,
+ Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>;
+def RORW : ALUW_rr<0b0110000, 0b101, "rorw">,
+ Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
let Predicates = [HasStdExtZbs, IsRV64] in {
+// NOTE: These instructions have been removed from the 0.94 spec. As a result
+// we have no isel patterns for them.
def BCLRW : ALUW_rr<0b0100100, 0b001, "bclrw">, Sched<[]>;
def BSETW : ALUW_rr<0b0010100, 0b001, "bsetw">, Sched<[]>;
def BINVW : ALUW_rr<0b0110100, 0b001, "binvw">, Sched<[]>;
@@ -362,9 +478,12 @@
} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
-def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">, Sched<[]>;
+def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">,
+ Sched<[WriteRotateImm32, ReadRotateImm32]>;
let Predicates = [HasStdExtZbs, IsRV64] in {
+// NOTE: These instructions have been removed from the 0.94 spec. As a result
+// we have no isel patterns for them.
def BCLRIW : RVBShiftW_ri<0b0100100, 0b001, OPC_OP_IMM_32, "bclriw">,
Sched<[]>;
def BSETIW : RVBShiftW_ri<0b0010100, 0b001, OPC_OP_IMM_32, "bsetiw">,
@@ -389,11 +508,11 @@
let Predicates = [HasStdExtZbb, IsRV64] in {
def CLZW : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0011011>,
- "clzw">, Sched<[]>;
+ "clzw">, Sched<[WriteCLZ32, ReadCLZ32]>;
def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0011011>,
- "ctzw">, Sched<[]>;
+ "ctzw">, Sched<[WriteCTZ32, ReadCTZ32]>;
def CPOPW : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0011011>,
- "cpopw">, Sched<[]>;
+ "cpopw">, Sched<[WriteCPOP32, ReadCPOP32]>;
} // Predicates = [HasStdExtZbb, IsRV64]
let Predicates = [HasStdExtZbp, IsRV64] in {
@@ -419,7 +538,8 @@
let Predicates = [HasStdExtZbbOrZbp, IsRV32] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def ZEXTH_RV32 : RVInstR<0b0000100, 0b100, OPC_OP, (outs GPR:$rd),
- (ins GPR:$rs1), "zext.h", "$rd, $rs1">, Sched<[]> {
+ (ins GPR:$rs1), "zext.h", "$rd, $rs1">,
+ Sched<[WriteIALU, ReadIALU]> {
let rs2 = 0b00000;
}
} // Predicates = [HasStdExtZbbOrZbp, IsRV32]
@@ -427,7 +547,8 @@
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def ZEXTH_RV64 : RVInstR<0b0000100, 0b100, OPC_OP_32, (outs GPR:$rd),
- (ins GPR:$rs1), "zext.h", "$rd, $rs1">, Sched<[]> {
+ (ins GPR:$rs1), "zext.h", "$rd, $rs1">,
+ Sched<[WriteIALU, ReadIALU]> {
let rs2 = 0b00000;
}
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
@@ -442,7 +563,7 @@
let Predicates = [HasStdExtZbbOrZbp, IsRV32] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def REV8_RV32 : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1),
- "rev8", "$rd, $rs1">, Sched<[]> {
+ "rev8", "$rd, $rs1">, Sched<[WriteREV8, ReadREV8]> {
let imm12 = { 0b01101, 0b0011000 };
}
} // Predicates = [HasStdExtZbbOrZbp, IsRV32]
@@ -450,7 +571,7 @@
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def REV8_RV64 : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1),
- "rev8", "$rd, $rs1">, Sched<[]> {
+ "rev8", "$rd, $rs1">, Sched<[WriteREV8, ReadREV8]> {
let imm12 = { 0b01101, 0b0111000 };
}
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
@@ -458,7 +579,7 @@
let Predicates = [HasStdExtZbbOrZbp] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def ORCB : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1),
- "orc.b", "$rd, $rs1">, Sched<[]> {
+ "orc.b", "$rd, $rs1">, Sched<[WriteORCB, ReadORCB]> {
let imm12 = { 0b00101, 0b0000111 };
}
} // Predicates = [HasStdExtZbbOrZbp]
@@ -609,6 +730,45 @@
def : InstAlias<"orc $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111111)>;
} // Predicates = [HasStdExtZbp, IsRV64]
+let Predicates = [HasStdExtZbbOrZbp] in {
+def : InstAlias<"ror $rd, $rs1, $shamt",
+ (RORI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>;
+} // Predicates = [HasStdExtZbbOrZbp]
+
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
+def : InstAlias<"rorw $rd, $rs1, $shamt",
+ (RORIW GPR:$rd, GPR:$rs1, uimm5:$shamt), 0>;
+} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
+
+let Predicates = [HasStdExtZbp] in {
+def : InstAlias<"grev $rd, $rs1, $shamt",
+ (GREVI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>;
+def : InstAlias<"gorc $rd, $rs1, $shamt",
+ (GORCI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>;
+def : InstAlias<"shfl $rd, $rs1, $shamt",
+ (SHFLI GPR:$rd, GPR:$rs1, shfl_uimm:$shamt), 0>;
+def : InstAlias<"unshfl $rd, $rs1, $shamt",
+ (UNSHFLI GPR:$rd, GPR:$rs1, shfl_uimm:$shamt), 0>;
+} // Predicates = [HasStdExtZbp]
+
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def : InstAlias<"grevw $rd, $rs1, $shamt",
+ (GREVIW GPR:$rd, GPR:$rs1, uimm5:$shamt), 0>;
+def : InstAlias<"gorcw $rd, $rs1, $shamt",
+ (GORCIW GPR:$rd, GPR:$rs1, uimm5:$shamt), 0>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+
+let Predicates = [HasStdExtZbs] in {
+def : InstAlias<"bset $rd, $rs1, $shamt",
+ (BSETI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>;
+def : InstAlias<"bclr $rd, $rs1, $shamt",
+ (BCLRI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>;
+def : InstAlias<"binv $rd, $rs1, $shamt",
+ (BINVI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>;
+def : InstAlias<"bext $rd, $rs1, $shamt",
+ (BEXTI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>;
+} // Predicates = [HasStdExtZbs]
+
//===----------------------------------------------------------------------===//
// Compressed Instruction patterns
//===----------------------------------------------------------------------===//
@@ -635,8 +795,8 @@
} // Predicates = [HasStdExtZbbOrZbp]
let Predicates = [HasStdExtZbbOrZbp] in {
-def : Pat<(rotl GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>;
-def : Pat<(rotr GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>;
+def : PatGprGpr<rotl, ROL>;
+def : PatGprGpr<rotr, ROR>;
} // Predicates = [HasStdExtZbbOrZbp]
let Predicates = [HasStdExtZbs] in {
@@ -654,49 +814,73 @@
(BSET X0, GPR:$rs2)>;
def : Pat<(and GPR:$rs1, BCLRMask:$mask),
- (BCLRI GPR:$rs1, (BCLRXForm imm:$mask))>;
+ (BCLRI GPR:$rs1, BCLRMask:$mask)>;
def : Pat<(or GPR:$rs1, BSETINVMask:$mask),
- (BSETI GPR:$rs1, (BSETINVXForm imm:$mask))>;
+ (BSETI GPR:$rs1, BSETINVMask:$mask)>;
def : Pat<(xor GPR:$rs1, BSETINVMask:$mask),
- (BINVI GPR:$rs1, (BSETINVXForm imm:$mask))>;
+ (BINVI GPR:$rs1, BSETINVMask:$mask)>;
def : Pat<(and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1)),
(BEXTI GPR:$rs1, uimmlog2xlen:$shamt)>;
+
+def : Pat<(or GPR:$r, BSETINVTwoBitsMask:$i),
+ (BSETI (BSETI GPR:$r, (TrailingZerosXForm BSETINVTwoBitsMask:$i)),
+ (BSETINVTwoBitsMaskHigh BSETINVTwoBitsMask:$i))>;
+def : Pat<(xor GPR:$r, BSETINVTwoBitsMask:$i),
+ (BINVI (BINVI GPR:$r, (TrailingZerosXForm BSETINVTwoBitsMask:$i)),
+ (BSETINVTwoBitsMaskHigh BSETINVTwoBitsMask:$i))>;
+def : Pat<(or GPR:$r, BSETINVORIMask:$i),
+ (BSETI (ORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i)),
+ (BSETINVTwoBitsMaskHigh BSETINVORIMask:$i))>;
+def : Pat<(xor GPR:$r, BSETINVORIMask:$i),
+ (BINVI (XORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i)),
+ (BSETINVTwoBitsMaskHigh BSETINVORIMask:$i))>;
+def : Pat<(and GPR:$r, BCLRITwoBitsMask:$i),
+ (BCLRI (BCLRI GPR:$r, (BCLRITwoBitsMaskLow BCLRITwoBitsMask:$i)),
+ (BCLRITwoBitsMaskHigh BCLRITwoBitsMask:$i))>;
+def : Pat<(and GPR:$r, BCLRIANDIMask:$i),
+ (BCLRI (ANDI GPR:$r, (BCLRIANDIMaskLow BCLRIANDIMask:$i)),
+ (BCLRITwoBitsMaskHigh BCLRIANDIMask:$i))>;
}
// There's no encoding for roli in the 'B' extension as it can be
// implemented with rori by negating the immediate.
let Predicates = [HasStdExtZbbOrZbp] in {
-def : Pat<(rotr GPR:$rs1, uimmlog2xlen:$shamt),
- (RORI GPR:$rs1, uimmlog2xlen:$shamt)>;
+def : PatGprImm<rotr, RORI, uimmlog2xlen>;
def : Pat<(rotl GPR:$rs1, uimmlog2xlen:$shamt),
(RORI GPR:$rs1, (ImmSubFromXLen uimmlog2xlen:$shamt))>;
+
+// We treat orc.b as a separate instruction, so match it directly. We also
+// lower the Zbb orc.b intrinsic to this.
+def : Pat<(riscv_gorc GPR:$rs1, 7), (ORCB GPR:$rs1)>;
}
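
An aside on the two rewrites in this block: rotl-by-immediate is lowered to RORI with the immediate subtracted from XLEN, and gorc with shift amount 7 is matched as orc.b. A minimal host-side sketch in C++ (illustrative only, helper names are ad hoc, not part of the patch) spells out both identities:

#include <cstdint>

// rotl(x, k) == rotr(x, XLEN - k), which is why the rotl pattern above
// selects RORI with ImmSubFromXLen applied to the shift amount.
uint64_t rotl64(uint64_t x, unsigned k) {
  k &= 63;
  return k ? (x << k) | (x >> (64 - k)) : x;
}
uint64_t rotr64(uint64_t x, unsigned k) {
  k &= 63;
  return k ? (x >> k) | (x << (64 - k)) : x;
}

// gorc with shift amount 7 OR-combines the bits within each byte, so every
// non-zero byte becomes 0xFF and every zero byte stays 0x00, which is the
// orc.b behaviour matched directly above.
uint64_t orc_b(uint64_t x) {
  uint64_t r = 0;
  for (int i = 0; i < 64; i += 8)
    if ((x >> i) & 0xFF)
      r |= UINT64_C(0xFF) << i;
  return r;
}
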
-def riscv_grevi : SDNode<"RISCVISD::GREVI", SDTIntBinOp, []>;
-def riscv_greviw : SDNode<"RISCVISD::GREVIW", SDTIntBinOp, []>;
-def riscv_gorci : SDNode<"RISCVISD::GORCI", SDTIntBinOp, []>;
-def riscv_gorciw : SDNode<"RISCVISD::GORCIW", SDTIntBinOp, []>;
-
let Predicates = [HasStdExtZbp] in {
-def : Pat<(riscv_grevi GPR:$rs1, timm:$shamt), (GREVI GPR:$rs1, timm:$shamt)>;
-def : Pat<(riscv_gorci GPR:$rs1, timm:$shamt), (GORCI GPR:$rs1, timm:$shamt)>;
-
-// We treat orc.b as a separate instruction, so match it directly.
-def : Pat<(riscv_gorci GPR:$rs1, (XLenVT 7)), (ORCB GPR:$rs1)>;
+def : PatGprGpr<riscv_grev, GREV>;
+def : PatGprGpr<riscv_gorc, GORC>;
+def : PatGprGpr<riscv_shfl, SHFL>;
+def : PatGprGpr<riscv_unshfl, UNSHFL>;
+def : PatGprGpr<int_riscv_xperm_n, XPERMN>;
+def : PatGprGpr<int_riscv_xperm_b, XPERMB>;
+def : PatGprGpr<int_riscv_xperm_h, XPERMH>;
+def : PatGprGpr<int_riscv_xperm_w, XPERMW>;
+def : PatGprImm<riscv_shfl, SHFLI, shfl_uimm>;
+def : PatGprImm<riscv_unshfl, UNSHFLI, shfl_uimm>;
+def : PatGprImm<riscv_grev, GREVI, uimmlog2xlen>;
+def : PatGprImm<riscv_gorc, GORCI, uimmlog2xlen>;
} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbp, IsRV32] in {
-def : Pat<(rotr (riscv_grevi GPR:$rs1, (i32 24)), (i32 16)), (GREVI GPR:$rs1, 8)>;
-def : Pat<(rotl (riscv_grevi GPR:$rs1, (i32 24)), (i32 16)), (GREVI GPR:$rs1, 8)>;
+def : Pat<(i32 (rotr (riscv_grev GPR:$rs1, 24), (i32 16))), (GREVI GPR:$rs1, 8)>;
+def : Pat<(i32 (rotl (riscv_grev GPR:$rs1, 24), (i32 16))), (GREVI GPR:$rs1, 8)>;
// We treat rev8 as a separate instruction, so match it directly.
-def : Pat<(riscv_grevi GPR:$rs1, (i32 24)), (REV8_RV32 GPR:$rs1)>;
+def : Pat<(i32 (riscv_grev GPR:$rs1, 24)), (REV8_RV32 GPR:$rs1)>;
} // Predicates = [HasStdExtZbp, IsRV32]
let Predicates = [HasStdExtZbp, IsRV64] in {
// We treat rev8 as a separate instruction, so match it directly.
-def : Pat<(riscv_grevi GPR:$rs1, (i64 56)), (REV8_RV64 GPR:$rs1)>;
+def : Pat<(i64 (riscv_grev GPR:$rs1, 56)), (REV8_RV64 GPR:$rs1)>;
} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbt] in {
@@ -731,26 +915,12 @@
// instruction use different orders. fshl will return its first operand for
// shift of zero, fshr will return its second operand. fsl and fsr both return
// $rs1 so the patterns need to have different operand orders.
-//
-// fshl and fshr only read the lower log2(xlen) bits of the shift amount, but
-// fsl/fsr instructions read log2(xlen)+1 bits. DAG combine may have removed
-// an AND mask on the shift amount that we need to add back to avoid a one in
-// the extra bit.
-// FIXME: If we can prove that the extra bit in the shift amount is zero, we
-// don't need this mask.
-let Predicates = [HasStdExtZbt, IsRV32] in {
-def : Pat<(fshl GPR:$rs1, GPR:$rs3, GPR:$rs2),
- (FSL GPR:$rs1, (ANDI GPR:$rs2, 31), GPR:$rs3)>;
-def : Pat<(fshr GPR:$rs3, GPR:$rs1, GPR:$rs2),
- (FSR GPR:$rs1, (ANDI GPR:$rs2, 31), GPR:$rs3)>;
-}
-let Predicates = [HasStdExtZbt, IsRV64] in {
-def : Pat<(fshl GPR:$rs1, GPR:$rs3, GPR:$rs2),
- (FSL GPR:$rs1, (ANDI GPR:$rs2, 63), GPR:$rs3)>;
-def : Pat<(fshr GPR:$rs3, GPR:$rs1, GPR:$rs2),
- (FSR GPR:$rs1, (ANDI GPR:$rs2, 63), GPR:$rs3)>;
-}
let Predicates = [HasStdExtZbt] in {
+def : Pat<(riscv_fsl GPR:$rs1, GPR:$rs3, GPR:$rs2),
+ (FSL GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(riscv_fsr GPR:$rs3, GPR:$rs1, GPR:$rs2),
+ (FSR GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+
def : Pat<(fshr GPR:$rs3, GPR:$rs1, uimmlog2xlen:$shamt),
(FSRI GPR:$rs1, GPR:$rs3, uimmlog2xlen:$shamt)>;
// We can use FSRI for fshl by immediate if we subtract the immediate from
@@ -760,9 +930,9 @@
} // Predicates = [HasStdExtZbt]
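
The operand-order remark above is easiest to see against a scalar model of LLVM's funnel-shift nodes. The sketch below (illustrative C++, ad hoc helper names, not part of the patch) shows that fshl returns its first operand and fshr its second when the shift amount is zero, while fsl/fsr both return rs1, which is why the two Pats place $rs1 differently:

#include <cstdint>

// LLVM fshl(a, b, s): shift the concatenation a:b left by s, keep the high
// word. For s == 0 the result is a, the first operand.
uint64_t fshl64(uint64_t a, uint64_t b, unsigned s) {
  s &= 63;
  return s ? (a << s) | (b >> (64 - s)) : a;
}

// LLVM fshr(a, b, s): shift a:b right by s, keep the low word. For s == 0
// the result is b, the second operand.
uint64_t fshr64(uint64_t a, uint64_t b, unsigned s) {
  s &= 63;
  return s ? (b >> s) | (a << (64 - s)) : b;
}
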
let Predicates = [HasStdExtZbb] in {
-def : Pat<(ctlz GPR:$rs1), (CLZ GPR:$rs1)>;
-def : Pat<(cttz GPR:$rs1), (CTZ GPR:$rs1)>;
-def : Pat<(ctpop GPR:$rs1), (CPOP GPR:$rs1)>;
+def : PatGpr<ctlz, CLZ>;
+def : PatGpr<cttz, CTZ>;
+def : PatGpr<ctpop, CPOP>;
} // Predicates = [HasStdExtZbb]
let Predicates = [HasStdExtZbb] in {
@@ -771,162 +941,126 @@
}
let Predicates = [HasStdExtZbb] in {
-def : Pat<(smin GPR:$rs1, GPR:$rs2), (MIN GPR:$rs1, GPR:$rs2)>;
-def : Pat<(smax GPR:$rs1, GPR:$rs2), (MAX GPR:$rs1, GPR:$rs2)>;
-def : Pat<(umin GPR:$rs1, GPR:$rs2), (MINU GPR:$rs1, GPR:$rs2)>;
-def : Pat<(umax GPR:$rs1, GPR:$rs2), (MAXU GPR:$rs1, GPR:$rs2)>;
+def : PatGprGpr<smin, MIN>;
+def : PatGprGpr<smax, MAX>;
+def : PatGprGpr<umin, MINU>;
+def : PatGprGpr<umax, MAXU>;
} // Predicates = [HasStdExtZbb]
let Predicates = [HasStdExtZbb, IsRV32] in {
-def : Pat<(bswap GPR:$rs1), (REV8_RV32 GPR:$rs1)>;
+def : Pat<(i32 (bswap GPR:$rs1)), (REV8_RV32 GPR:$rs1)>;
} // Predicates = [HasStdExtZbb, IsRV32]
let Predicates = [HasStdExtZbb, IsRV64] in {
-def : Pat<(bswap GPR:$rs1), (REV8_RV64 GPR:$rs1)>;
+def : Pat<(i64 (bswap GPR:$rs1)), (REV8_RV64 GPR:$rs1)>;
} // Predicates = [HasStdExtZbb, IsRV64]
-let Predicates = [HasStdExtZbp, IsRV32] in
-def : Pat<(or (and GPR:$rs1, 0x0000FFFF), (shl GPR:$rs2, (i32 16))),
+let Predicates = [HasStdExtZbp, IsRV32] in {
+def : Pat<(i32 (or (and GPR:$rs1, 0x0000FFFF), (shl GPR:$rs2, (i32 16)))),
(PACK GPR:$rs1, GPR:$rs2)>;
-let Predicates = [HasStdExtZbp, IsRV64] in
-def : Pat<(or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32))),
+def : Pat<(i32 (or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16)))),
+ (PACKU GPR:$rs1, GPR:$rs2)>;
+
+}
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def : Pat<(i64 (or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32)))),
(PACK GPR:$rs1, GPR:$rs2)>;
-let Predicates = [HasStdExtZbp, IsRV32] in
-def : Pat<(or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16))),
+def : Pat<(i64 (or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32)))),
(PACKU GPR:$rs1, GPR:$rs2)>;
-let Predicates = [HasStdExtZbp, IsRV64] in
-def : Pat<(or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32))),
- (PACKU GPR:$rs1, GPR:$rs2)>;
+}
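
For reference, a scalar model of what the RV64 pack patterns above select (illustrative C++ with ad hoc names, not part of the patch): pack concatenates the low 32-bit halves of rs1 and rs2, packu concatenates the upper halves, matching the or/and/shift forms in the Pats.

#include <cstdint>

// RV64 pack:  rd = (rs2[31:0] << 32) | rs1[31:0]
uint64_t pack64(uint64_t rs1, uint64_t rs2) {
  return (rs2 << 32) | (rs1 & 0xFFFFFFFFu);
}

// RV64 packu: rd = (rs2[63:32] << 32) | rs1[63:32]
uint64_t packu64(uint64_t rs1, uint64_t rs2) {
  return (rs2 & UINT64_C(0xFFFFFFFF00000000)) | (rs1 >> 32);
}
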
let Predicates = [HasStdExtZbp] in
-def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFF00),
+def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFFFF),
(and GPR:$rs1, 0x00FF)),
(PACKH GPR:$rs1, GPR:$rs2)>;
let Predicates = [HasStdExtZbbOrZbp, IsRV32] in
-def : Pat<(and GPR:$rs, 0x0000FFFF), (ZEXTH_RV32 GPR:$rs)>;
-let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
-def : Pat<(and GPR:$rs, 0x000000000000FFFF), (ZEXTH_RV64 GPR:$rs)>;
-}
+def : Pat<(i32 (and GPR:$rs, 0xFFFF)), (ZEXTH_RV32 GPR:$rs)>;
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
+def : Pat<(i64 (and GPR:$rs, 0xFFFF)), (ZEXTH_RV64 GPR:$rs)>;
-let Predicates = [HasStdExtZbp, IsRV32] in {
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 8)), (i32 0x00FF0000)),
- (and GPR:$rs1, (i32 0xFF0000FF))),
- (and (srl GPR:$rs1, (i32 8)), (i32 0x0000FF00))),
- (SHFLI GPR:$rs1, (i32 8))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 4)), (i32 0x0F000F00)),
- (and GPR:$rs1, (i32 0xF00FF00F))),
- (and (srl GPR:$rs1, (i32 4)), (i32 0x00F000F0))),
- (SHFLI GPR:$rs1, (i32 4))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 2)), (i32 0x30303030)),
- (and GPR:$rs1, (i32 0xC3C3C3C3))),
- (and (srl GPR:$rs1, (i32 2)), (i32 0x0C0C0C0C))),
- (SHFLI GPR:$rs1, (i32 2))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 1)), (i32 0x44444444)),
- (and GPR:$rs1, (i32 0x99999999))),
- (and (srl GPR:$rs1, (i32 1)), (i32 0x22222222))),
- (SHFLI GPR:$rs1, (i32 1))>;
-} // Predicates = [HasStdExtZbp, IsRV32]
-
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 16)), (i64 0x0000FFFF00000000)),
- (and GPR:$rs1, (i64 0xFFFF00000000FFFF))),
- (and (srl GPR:$rs1, (i64 16)), (i64 0x00000000FFFF0000))),
- (SHFLI GPR:$rs1, (i64 16))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 8)), (i64 0x00FF000000FF0000)),
- (and GPR:$rs1, (i64 0xFF0000FFFF0000FF))),
- (and (srl GPR:$rs1, (i64 8)), (i64 0x0000FF000000FF00))),
- (SHFLI GPR:$rs1, (i64 8))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 4)), (i64 0x0F000F000F000F00)),
- (and GPR:$rs1, (i64 0xF00FF00FF00FF00F))),
- (and (srl GPR:$rs1, (i64 4)), (i64 0x00F000F000F000F0))),
- (SHFLI GPR:$rs1, (i64 4))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 2)), (i64 0x3030303030303030)),
- (and GPR:$rs1, (i64 0xC3C3C3C3C3C3C3C3))),
- (and (srl GPR:$rs1, (i64 2)), (i64 0x0C0C0C0C0C0C0C0C))),
- (SHFLI GPR:$rs1, (i64 2))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 1)), (i64 0x4444444444444444)),
- (and GPR:$rs1, (i64 0x9999999999999999))),
- (and (srl GPR:$rs1, (i64 1)), (i64 0x2222222222222222))),
- (SHFLI GPR:$rs1, (i64 1))>;
-} // Predicates = [HasStdExtZbp, IsRV64]
+// Pattern to exclude simm12 immediates from matching.
+def non_imm12 : PatLeaf<(XLenVT GPR:$a), [{
+ auto *C = dyn_cast<ConstantSDNode>(N);
+ return !C || !isInt<12>(C->getSExtValue());
+}]>;
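
The non_imm12 leaf is simply the complement of the signed 12-bit immediate range; constants that do fit can be folded into ADDI and are left to the ordinary add patterns. A hedged host-side equivalent of the check (illustrative C++; fitsSImm12 mirrors isInt<12> and is not an LLVM API):

#include <cstdint>

// True when V is representable as a signed 12-bit immediate, i.e. it could
// be folded directly into an ADDI.
bool fitsSImm12(int64_t V) { return V >= -2048 && V <= 2047; }

// non_imm12 accepts any register operand, and any constant that does NOT
// fit the ADDI immediate.
bool acceptedByNonImm12(int64_t V) { return !fitsSImm12(V); }
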
let Predicates = [HasStdExtZba] in {
-def : Pat<(add (shl GPR:$rs1, (XLenVT 1)), GPR:$rs2),
+def : Pat<(add (shl GPR:$rs1, (XLenVT 1)), non_imm12:$rs2),
(SH1ADD GPR:$rs1, GPR:$rs2)>;
-def : Pat<(add (shl GPR:$rs1, (XLenVT 2)), GPR:$rs2),
+def : Pat<(add (shl GPR:$rs1, (XLenVT 2)), non_imm12:$rs2),
(SH2ADD GPR:$rs1, GPR:$rs2)>;
-def : Pat<(add (shl GPR:$rs1, (XLenVT 3)), GPR:$rs2),
+def : Pat<(add (shl GPR:$rs1, (XLenVT 3)), non_imm12:$rs2),
(SH3ADD GPR:$rs1, GPR:$rs2)>;
+
+def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 6)), GPR:$rs2),
+ (SH1ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 10)), GPR:$rs2),
+ (SH1ADD (SH2ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 18)), GPR:$rs2),
+ (SH1ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 12)), GPR:$rs2),
+ (SH2ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 20)), GPR:$rs2),
+ (SH2ADD (SH2ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 36)), GPR:$rs2),
+ (SH2ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 24)), GPR:$rs2),
+ (SH3ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 40)), GPR:$rs2),
+ (SH3ADD (SH2ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 72)), GPR:$rs2),
+ (SH3ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+
+def : Pat<(mul GPR:$r, C3LeftShift:$i),
+ (SLLI (SH1ADD GPR:$r, GPR:$r),
+ (TrailingZerosXForm C3LeftShift:$i))>;
+def : Pat<(mul GPR:$r, C5LeftShift:$i),
+ (SLLI (SH2ADD GPR:$r, GPR:$r),
+ (TrailingZerosXForm C5LeftShift:$i))>;
+def : Pat<(mul GPR:$r, C9LeftShift:$i),
+ (SLLI (SH3ADD GPR:$r, GPR:$r),
+ (TrailingZerosXForm C9LeftShift:$i))>;
} // Predicates = [HasStdExtZba]
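
The multiply-by-constant rewrites above are arithmetic identities; a short check (illustrative C++, ad hoc names, not part of the patch) makes a few of them explicit: shNadd rd, rs1, rs2 computes (rs1 << N) + rs2, so shNadd of a register with itself yields 3x, 5x or 9x, and the CnLeftShift patterns tack on a final SLLI.

#include <cstdint>

uint64_t sh1add(uint64_t a, uint64_t b) { return (a << 1) + b; }
uint64_t sh2add(uint64_t a, uint64_t b) { return (a << 2) + b; }
uint64_t sh3add(uint64_t a, uint64_t b) { return (a << 3) + b; }

// x * 6 + y  == sh1add(sh1add(x, x), y)   since sh1add(x, x) == 3 * x.
uint64_t mul6_add(uint64_t x, uint64_t y)  { return sh1add(sh1add(x, x), y); }
// x * 20 + y == sh2add(sh2add(x, x), y)   since sh2add(x, x) == 5 * x.
uint64_t mul20_add(uint64_t x, uint64_t y) { return sh2add(sh2add(x, x), y); }
// x * (9 << n) == sh3add(x, x) << n       since sh3add(x, x) == 9 * x.
uint64_t mul9_shifted(uint64_t x, unsigned n) { return sh3add(x, x) << n; }
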
let Predicates = [HasStdExtZba, IsRV64] in {
-def : Pat<(SLLIUWPat GPR:$rs1, uimm5:$shamt),
+def : Pat<(i64 (shl (and GPR:$rs1, 0xFFFFFFFF), uimm5:$shamt)),
(SLLIUW GPR:$rs1, uimm5:$shamt)>;
-def : Pat<(shl (and GPR:$rs1, 0xFFFFFFFF), uimm5:$shamt),
- (SLLIUW GPR:$rs1, uimm5:$shamt)>;
-def : Pat<(add (and GPR:$rs1, (i64 0xFFFFFFFF)), GPR:$rs2),
+def : Pat<(i64 (add (and GPR:$rs1, 0xFFFFFFFF), non_imm12:$rs2)),
(ADDUW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(and GPR:$rs, 0x00000000FFFFFFFF), (ADDUW GPR:$rs, X0)>;
+def : Pat<(i64 (and GPR:$rs, 0xFFFFFFFF)), (ADDUW GPR:$rs, X0)>;
-def : Pat<(add (shl (and GPR:$rs1, (i64 0xFFFFFFFF)), (XLenVT 1)), GPR:$rs2),
+def : Pat<(i64 (add (shl (and GPR:$rs1, 0xFFFFFFFF), (i64 1)), non_imm12:$rs2)),
(SH1ADDUW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(add (shl (and GPR:$rs1, (i64 0xFFFFFFFF)), (XLenVT 2)), GPR:$rs2),
+def : Pat<(i64 (add (shl (and GPR:$rs1, 0xFFFFFFFF), (i64 2)), non_imm12:$rs2)),
(SH2ADDUW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(add (shl (and GPR:$rs1, (i64 0xFFFFFFFF)), (XLenVT 3)), GPR:$rs2),
+def : Pat<(i64 (add (shl (and GPR:$rs1, 0xFFFFFFFF), (i64 3)), non_imm12:$rs2)),
(SH3ADDUW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(add (SLLIUWPat GPR:$rs1, (XLenVT 1)), GPR:$rs2),
+def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 1)), 0x1FFFFFFFF), non_imm12:$rs2)),
(SH1ADDUW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(add (SLLIUWPat GPR:$rs1, (XLenVT 2)), GPR:$rs2),
+def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 2)), 0x3FFFFFFFF), non_imm12:$rs2)),
(SH2ADDUW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(add (SLLIUWPat GPR:$rs1, (XLenVT 3)), GPR:$rs2),
+def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 3)), 0x7FFFFFFFF), non_imm12:$rs2)),
(SH3ADDUW GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtZba, IsRV64]
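
The two matched forms for each shNadd.uw above are the same computation, masking either before or after the shift. A small model (illustrative C++, ad hoc names, not part of the patch):

#include <cstdint>
#include <cassert>

// sh1add.uw rd, rs1, rs2 == (zext32(rs1) << 1) + rs2.
uint64_t sh1add_uw(uint64_t rs1, uint64_t rs2) {
  return ((rs1 & 0xFFFFFFFFu) << 1) + rs2;
}

// The second matched form masks after the shift; the results are identical:
//   (rs1 & 0xFFFFFFFF) << 1  ==  (rs1 << 1) & 0x1FFFFFFFF
void checkEquivalence(uint64_t rs1, uint64_t rs2) {
  assert(sh1add_uw(rs1, rs2) ==
         ((rs1 << 1) & UINT64_C(0x1FFFFFFFF)) + rs2);
}
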
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
-def : Pat<(riscv_rolw GPR:$rs1, GPR:$rs2),
- (ROLW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(riscv_rorw GPR:$rs1, GPR:$rs2),
- (RORW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(riscv_rorw GPR:$rs1, uimm5:$rs2),
- (RORIW GPR:$rs1, uimm5:$rs2)>;
+def : PatGprGpr<riscv_rolw, ROLW>;
+def : PatGprGpr<riscv_rorw, RORW>;
+def : PatGprImm<riscv_rorw, RORIW, uimm5>;
def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2),
(RORIW GPR:$rs1, (ImmSubFrom32 uimm5:$rs2))>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
-let Predicates = [HasStdExtZbs, IsRV64] in {
-def : Pat<(and (not (riscv_sllw 1, GPR:$rs2)), (assertsexti32 GPR:$rs1)),
- (BCLRW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(sext_inreg (and (not (riscv_sllw 1, GPR:$rs2)), GPR:$rs1), i32),
- (BCLRW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(or (riscv_sllw 1, GPR:$rs2), (assertsexti32 GPR:$rs1)),
- (BSETW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(sext_inreg (or (riscv_sllw 1, GPR:$rs2), GPR:$rs1), i32),
- (BSETW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(xor (riscv_sllw 1, GPR:$rs2), (assertsexti32 GPR:$rs1)),
- (BINVW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(sext_inreg (xor (riscv_sllw 1, GPR:$rs2), GPR:$rs1), i32),
- (BINVW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(and (riscv_srlw GPR:$rs1, GPR:$rs2), 1),
- (BEXTW GPR:$rs1, GPR:$rs2)>;
-
-def : Pat<(riscv_sllw 1, GPR:$rs2),
- (BSETW X0, GPR:$rs2)>;
-
-def : Pat<(and (assertsexti32 GPR:$rs1), BCLRWMask:$mask),
- (BCLRIW GPR:$rs1, (BCLRXForm imm:$mask))>;
-def : Pat<(or (assertsexti32 GPR:$rs1), BSETINVWMask:$mask),
- (BSETIW GPR:$rs1, (BSETINVXForm imm:$mask))>;
-def : Pat<(xor (assertsexti32 GPR:$rs1), BSETINVWMask:$mask),
- (BINVIW GPR:$rs1, (BSETINVXForm imm:$mask))>;
-
-} // Predicates = [HasStdExtZbs, IsRV64]
-
let Predicates = [HasStdExtZbp, IsRV64] in {
-def : Pat<(riscv_rorw (riscv_greviw GPR:$rs1, 24), (i64 16)), (GREVIW GPR:$rs1, 8)>;
-def : Pat<(riscv_rolw (riscv_greviw GPR:$rs1, 24), (i64 16)), (GREVIW GPR:$rs1, 8)>;
-def : Pat<(riscv_greviw GPR:$rs1, timm:$shamt), (GREVIW GPR:$rs1, timm:$shamt)>;
-def : Pat<(riscv_gorciw GPR:$rs1, timm:$shamt), (GORCIW GPR:$rs1, timm:$shamt)>;
+def : Pat<(riscv_rorw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>;
+def : Pat<(riscv_rolw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>;
+def : PatGprGpr<riscv_grevw, GREVW>;
+def : PatGprGpr<riscv_gorcw, GORCW>;
+def : PatGprGpr<riscv_shflw, SHFLW>;
+def : PatGprGpr<riscv_unshflw, UNSHFLW>;
+def : PatGprImm<riscv_grevw, GREVIW, uimm5>;
+def : PatGprImm<riscv_gorcw, GORCIW, uimm5>;
} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbt, IsRV64] in {
@@ -941,24 +1075,47 @@
} // Predicates = [HasStdExtZbt, IsRV64]
let Predicates = [HasStdExtZbb, IsRV64] in {
-def : Pat<(add (ctlz (and GPR:$rs1, (i64 0xFFFFFFFF))), (i64 -32)),
- (CLZW GPR:$rs1)>;
-// computeKnownBits can't figure out that the and mask on the add result is
-// unnecessary so we need to pattern match it away.
-def : Pat<(and (add (ctlz (and GPR:$rs1, (i64 0xFFFFFFFF))), (i64 -32)),
- (i64 0xFFFFFFFF)),
- (CLZW GPR:$rs1)>;
-def : Pat<(cttz (or GPR:$rs1, (i64 0x100000000))),
- (CTZW GPR:$rs1)>;
-def : Pat<(ctpop (and GPR:$rs1, (i64 0xFFFFFFFF))), (CPOPW GPR:$rs1)>;
+def : PatGpr<riscv_clzw, CLZW>;
+def : PatGpr<riscv_ctzw, CTZW>;
+def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>;
} // Predicates = [HasStdExtZbb, IsRV64]
let Predicates = [HasStdExtZbp, IsRV64] in {
-def : Pat<(sext_inreg (or (shl GPR:$rs2, (i64 16)),
- (and GPR:$rs1, 0x000000000000FFFF)),
- i32),
+def : Pat<(i64 (sext_inreg (or (shl GPR:$rs2, (i64 16)),
+ (and GPR:$rs1, 0x000000000000FFFF)),
+ i32)),
(PACKW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000),
- (SRLIWPat GPR:$rs1, (i64 16))),
+def : Pat<(i64 (or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000),
+ (srl (and GPR:$rs1, 0xFFFFFFFF), (i64 16)))),
(PACKUW GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtZbp, IsRV64]
+
+let Predicates = [HasStdExtZbc] in {
+def : PatGprGpr<int_riscv_clmul, CLMUL>;
+def : PatGprGpr<int_riscv_clmulh, CLMULH>;
+def : PatGprGpr<int_riscv_clmulr, CLMULR>;
+} // Predicates = [HasStdExtZbc]
+
+let Predicates = [HasStdExtZbe] in {
+def : PatGprGpr<riscv_bcompress, BCOMPRESS>;
+def : PatGprGpr<riscv_bdecompress, BDECOMPRESS>;
+} // Predicates = [HasStdExtZbe]
+
+let Predicates = [HasStdExtZbe, IsRV64] in {
+def : PatGprGpr<riscv_bcompressw, BCOMPRESSW>;
+def : PatGprGpr<riscv_bdecompressw, BDECOMPRESSW>;
+} // Predicates = [HasStdExtZbe, IsRV64]
+
+let Predicates = [HasStdExtZbr] in {
+def : PatGpr<int_riscv_crc32_b, CRC32B>;
+def : PatGpr<int_riscv_crc32_h, CRC32H>;
+def : PatGpr<int_riscv_crc32_w, CRC32W>;
+def : PatGpr<int_riscv_crc32c_b, CRC32CB>;
+def : PatGpr<int_riscv_crc32c_h, CRC32CH>;
+def : PatGpr<int_riscv_crc32c_w, CRC32CW>;
+} // Predicates = [HasStdExtZbr]
+
+let Predicates = [HasStdExtZbr, IsRV64] in {
+def : PatGpr<int_riscv_crc32_d, CRC32D>;
+def : PatGpr<int_riscv_crc32c_d, CRC32CD>;
+} // Predicates = [HasStdExtZbr, IsRV64]
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index 30df455..86f96c1 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -435,9 +435,9 @@
}
def C_SRLI : Shift_right<0b00, "c.srli", GPRC, uimmlog2xlennonzero>,
- Sched<[WriteShift, ReadShift]>;
+ Sched<[WriteShiftImm, ReadShiftImm]>;
def C_SRAI : Shift_right<0b01, "c.srai", GPRC, uimmlog2xlennonzero>,
- Sched<[WriteShift, ReadShift]>;
+ Sched<[WriteShiftImm, ReadShiftImm]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ANDI : RVInst16CB<0b100, 0b01, (outs GPRC:$rs1_wb), (ins GPRC:$rs1, simm6:$imm),
@@ -479,8 +479,8 @@
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb),
(ins GPRNoX0:$rd, uimmlog2xlennonzero:$imm),
- "c.slli" ,"$rd, $imm">,
- Sched<[WriteShift, ReadShift]> {
+ "c.slli", "$rd, $imm">,
+ Sched<[WriteShiftImm, ReadShiftImm]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = imm{4-0};
}
@@ -652,8 +652,8 @@
def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb),
(ins GPRX0:$rd, uimmlog2xlennonzero:$imm),
- "c.slli" ,"$rd, $imm">,
- Sched<[WriteShift, ReadShift]> {
+ "c.slli", "$rd, $imm">,
+ Sched<[WriteShiftImm, ReadShiftImm]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = imm{4-0};
let Inst{11-7} = 0;
@@ -661,8 +661,8 @@
}
def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd),
- "c.slli64" ,"$rd">,
- Sched<[WriteShift, ReadShift]> {
+ "c.slli64", "$rd">,
+ Sched<[WriteShiftImm, ReadShiftImm]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = 0;
let Inst{12} = 0;
@@ -671,7 +671,7 @@
def C_SRLI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb),
(ins GPRC:$rd),
"c.srli64", "$rd">,
- Sched<[WriteShift, ReadShift]> {
+ Sched<[WriteShiftImm, ReadShiftImm]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = 0;
let Inst{11-10} = 0;
@@ -681,7 +681,7 @@
def C_SRAI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb),
(ins GPRC:$rd),
"c.srai64", "$rd">,
- Sched<[WriteShift, ReadShift]> {
+ Sched<[WriteShiftImm, ReadShiftImm]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = 0;
let Inst{11-10} = 1;
@@ -792,9 +792,9 @@
} // Predicates = [HasStdExtC]
let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in {
-def : CompressPat<(FSW FPR32C:$rs2, GPRC:$rs1,uimm7_lsb00:$imm),
+def : CompressPat<(FSW FPR32C:$rs2, GPRC:$rs1, uimm7_lsb00:$imm),
(C_FSW FPR32C:$rs2, GPRC:$rs1, uimm7_lsb00:$imm)>;
-} // Predicate = [HasStdExtC, HasStdExtF, IsRV32]
+} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
let Predicates = [HasStdExtC, IsRV64] in {
def : CompressPat<(SD GPRC:$rs2, GPRC:$rs1, uimm8_lsb000:$imm),
@@ -848,7 +848,7 @@
let isCompressOnly = true in
def : CompressPat<(AND GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
(C_AND GPRC:$rs1, GPRC:$rs2)>;
-} // Predicates = [HasStdExtC]
+} // Predicates = [HasStdExtC]
let Predicates = [HasStdExtC, IsRV64] in {
let isCompressOnly = true in
@@ -870,13 +870,13 @@
(C_BEQZ GPRC:$rs1, simm9_lsb0:$imm)>;
def : CompressPat<(BNE GPRC:$rs1, X0, simm9_lsb0:$imm),
(C_BNEZ GPRC:$rs1, simm9_lsb0:$imm)>;
-} // Predicates = [HasStdExtC]
+} // Predicates = [HasStdExtC]
// Quadrant 2
let Predicates = [HasStdExtC] in {
def : CompressPat<(SLLI GPRNoX0:$rs1, GPRNoX0:$rs1, uimmlog2xlennonzero:$imm),
(C_SLLI GPRNoX0:$rs1, uimmlog2xlennonzero:$imm)>;
-} // Predicates = [HasStdExtC]
+} // Predicates = [HasStdExtC]
let Predicates = [HasStdExtC, HasStdExtD] in {
def : CompressPat<(FLD FPR64:$rd, SP:$rs1, uimm9_lsb000:$imm),
@@ -938,4 +938,4 @@
let Predicates = [HasStdExtC, IsRV64] in {
def : CompressPat<(SD GPR:$rs2, SP:$rs1, uimm9_lsb000:$imm),
(C_SDSP GPR:$rs2, SP:$rs1, uimm9_lsb000:$imm)>;
-} // Predicates = [HasStdExtC, IsRV64]
+} // Predicates = [HasStdExtC, IsRV64]
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index 133599e..41eff2e 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -31,9 +31,9 @@
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class FPFMAD_rrr_frm<RISCVOpcode opcode, string opcodestr>
- : RVInstR4<0b01, opcode, (outs FPR64:$rd),
- (ins FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, frmarg:$funct3),
- opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">;
+ : RVInstR4Frm<0b01, opcode, (outs FPR64:$rd),
+ (ins FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, frmarg:$funct3),
+ opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">;
class FPFMADDynFrmAlias<FPFMAD_rrr_frm Inst, string OpcodeStr>
: InstAlias<OpcodeStr#" $rd, $rs1, $rs2, $rs3",
@@ -82,16 +82,16 @@
Sched<[WriteFST64, ReadStoreData, ReadFMemBase]>;
def FMADD_D : FPFMAD_rrr_frm<OPC_MADD, "fmadd.d">,
- Sched<[WriteFMulAdd64, ReadFMulAdd64, ReadFMulAdd64, ReadFMulAdd64]>;
+ Sched<[WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64]>;
def : FPFMADDynFrmAlias<FMADD_D, "fmadd.d">;
def FMSUB_D : FPFMAD_rrr_frm<OPC_MSUB, "fmsub.d">,
- Sched<[WriteFMulSub64, ReadFMulSub64, ReadFMulSub64, ReadFMulSub64]>;
+ Sched<[WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64]>;
def : FPFMADDynFrmAlias<FMSUB_D, "fmsub.d">;
def FNMSUB_D : FPFMAD_rrr_frm<OPC_NMSUB, "fnmsub.d">,
- Sched<[WriteFMulSub64, ReadFMulSub64, ReadFMulSub64, ReadFMulSub64]>;
+ Sched<[WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64]>;
def : FPFMADDynFrmAlias<FNMSUB_D, "fnmsub.d">;
def FNMADD_D : FPFMAD_rrr_frm<OPC_NMADD, "fnmadd.d">,
- Sched<[WriteFMulAdd64, ReadFMulAdd64, ReadFMulAdd64, ReadFMulAdd64]>;
+ Sched<[WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64]>;
def : FPFMADDynFrmAlias<FNMADD_D, "fnmadd.d">;
def FADD_D : FPALUD_rr_frm<0b0000001, "fadd.d">,
@@ -281,11 +281,8 @@
def : Pat<(fma (fneg FPR64:$rs1), FPR64:$rs2, (fneg FPR64:$rs3)),
(FNMADD_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>;
-// The RISC-V 2.2 user-level ISA spec defines fmin and fmax as returning the
-// canonical NaN when giving a signaling NaN. This doesn't match the LLVM
-// behaviour (see https://bugs.llvm.org/show_bug.cgi?id=27363). However, the
-// draft 2.3 ISA spec changes the definition of fmin and fmax in a way that
-// matches LLVM's fminnum and fmaxnum
+// The ratified 20191213 ISA spec defines fmin and fmax in a way that matches
+// LLVM's fminnum and fmaxnum.
// <https://github.com/riscv/riscv-isa-manual/commit/cd20cee7efd9bac7c5aa127ec3b451749d2b3cce>.
def : PatFpr64Fpr64<fminnum, FMIN_D>;
def : PatFpr64Fpr64<fmaxnum, FMAX_D>;
@@ -303,11 +300,11 @@
/// Loads
-defm : LdPat<load, FLD>;
+defm : LdPat<load, FLD, f64>;
/// Stores
-defm : StPat<store, FSD, FPR64>;
+defm : StPat<store, FSD, FPR64, f64>;
/// Pseudo-instructions needed for the soft-float ABI with RV32D
@@ -328,41 +325,55 @@
let Predicates = [HasStdExtD, IsRV32] in {
/// Float constants
-def : Pat<(f64 (fpimm0)), (FCVT_D_W X0)>;
+def : Pat<(f64 (fpimm0)), (FCVT_D_W (i32 X0))>;
// double->[u]int. Round-to-zero must be used.
-def : Pat<(fp_to_sint FPR64:$rs1), (FCVT_W_D FPR64:$rs1, 0b001)>;
-def : Pat<(fp_to_uint FPR64:$rs1), (FCVT_WU_D FPR64:$rs1, 0b001)>;
+def : Pat<(i32 (fp_to_sint FPR64:$rs1)), (FCVT_W_D FPR64:$rs1, 0b001)>;
+def : Pat<(i32 (fp_to_uint FPR64:$rs1)), (FCVT_WU_D FPR64:$rs1, 0b001)>;
+
+// double->int32 with current rounding mode.
+def : Pat<(i32 (lrint FPR64:$rs1)), (FCVT_W_D $rs1, 0b111)>;
+
+// double->int32 rounded to nearest with ties rounded away from zero.
+def : Pat<(i32 (lround FPR64:$rs1)), (FCVT_W_D $rs1, 0b100)>;
// [u]int->double.
-def : Pat<(sint_to_fp GPR:$rs1), (FCVT_D_W GPR:$rs1)>;
-def : Pat<(uint_to_fp GPR:$rs1), (FCVT_D_WU GPR:$rs1)>;
+def : Pat<(sint_to_fp (i32 GPR:$rs1)), (FCVT_D_W GPR:$rs1)>;
+def : Pat<(uint_to_fp (i32 GPR:$rs1)), (FCVT_D_WU GPR:$rs1)>;
} // Predicates = [HasStdExtD, IsRV32]
let Predicates = [HasStdExtD, IsRV64] in {
/// Float constants
-def : Pat<(f64 (fpimm0)), (FMV_D_X X0)>;
+def : Pat<(f64 (fpimm0)), (FMV_D_X (i64 X0))>;
// Moves (no conversion)
-def : Pat<(bitconvert GPR:$rs1), (FMV_D_X GPR:$rs1)>;
-def : Pat<(bitconvert FPR64:$rs1), (FMV_X_D FPR64:$rs1)>;
+def : Pat<(bitconvert (i64 GPR:$rs1)), (FMV_D_X GPR:$rs1)>;
+def : Pat<(i64 (bitconvert FPR64:$rs1)), (FMV_X_D FPR64:$rs1)>;
-// FP->[u]int32 is mostly handled by the FP->[u]int64 patterns. This is safe
-// because fpto[u|s]i produce poison if the value can't fit into the target.
-// We match the single case below because fcvt.wu.d sign-extends its result so
-// is cheaper than fcvt.lu.d+sext.w.
-def : Pat<(sext_inreg (assertzexti32 (fp_to_uint FPR64:$rs1)), i32),
- (FCVT_WU_D $rs1, 0b001)>;
+// Use target specific isd nodes to help us remember the result is sign
+// extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be
+// duplicated if it has another user that didn't need the sign_extend.
+def : Pat<(riscv_fcvt_w_rv64 FPR64:$rs1), (FCVT_W_D $rs1, 0b001)>;
+def : Pat<(riscv_fcvt_wu_rv64 FPR64:$rs1), (FCVT_WU_D $rs1, 0b001)>;
// [u]int32->fp
-def : Pat<(sint_to_fp (sexti32 GPR:$rs1)), (FCVT_D_W $rs1)>;
-def : Pat<(uint_to_fp (zexti32 GPR:$rs1)), (FCVT_D_WU $rs1)>;
+def : Pat<(sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_D_W $rs1)>;
+def : Pat<(uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_D_WU $rs1)>;
-def : Pat<(fp_to_sint FPR64:$rs1), (FCVT_L_D FPR64:$rs1, 0b001)>;
-def : Pat<(fp_to_uint FPR64:$rs1), (FCVT_LU_D FPR64:$rs1, 0b001)>;
+// double->[u]int64. Round-to-zero must be used.
+def : Pat<(i64 (fp_to_sint FPR64:$rs1)), (FCVT_L_D FPR64:$rs1, 0b001)>;
+def : Pat<(i64 (fp_to_uint FPR64:$rs1)), (FCVT_LU_D FPR64:$rs1, 0b001)>;
+
+// double->int64 with current rounding mode.
+def : Pat<(i64 (lrint FPR64:$rs1)), (FCVT_L_D $rs1, 0b111)>;
+def : Pat<(i64 (llrint FPR64:$rs1)), (FCVT_L_D $rs1, 0b111)>;
+
+// double->int64 rounded to nearest with ties rounded away from zero.
+def : Pat<(i64 (lround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>;
+def : Pat<(i64 (llround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>;
// [u]int64->fp. Match GCC and default to using dynamic rounding mode.
-def : Pat<(sint_to_fp GPR:$rs1), (FCVT_D_L GPR:$rs1, 0b111)>;
-def : Pat<(uint_to_fp GPR:$rs1), (FCVT_D_LU GPR:$rs1, 0b111)>;
+def : Pat<(sint_to_fp (i64 GPR:$rs1)), (FCVT_D_L GPR:$rs1, 0b111)>;
+def : Pat<(uint_to_fp (i64 GPR:$rs1)), (FCVT_D_LU GPR:$rs1, 0b111)>;
} // Predicates = [HasStdExtD, IsRV64]
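
The rounding-mode encodings chosen above track the libm semantics of the matched nodes: fp_to_sint truncates (rm 0b001, RTZ), lrint/llrint follow the dynamic rounding mode (rm 0b111), and lround/llround round to nearest with ties away from zero (rm 0b100, RMM). A host-side illustration (plain C++, not part of the patch):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  double x = 2.5;
  // Truncation, as selected for fp_to_sint (FCVT rm = 0b001).
  int64_t t = (int64_t)x;       // 2
  // Current rounding mode, as selected for lrint (rm = 0b111); with the
  // default round-to-nearest-even this also gives 2 for 2.5.
  long r = std::lrint(x);       // 2
  // Nearest with ties away from zero, as selected for lround (rm = 0b100).
  long l = std::lround(x);      // 3
  std::printf("%lld %ld %ld\n", (long long)t, r, l);
  return 0;
}
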
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
index 4529949..6b5c961 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -19,11 +19,17 @@
: SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i64>]>;
def SDT_RISCVFMV_X_ANYEXTW_RV64
: SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>;
+def STD_RISCVFCVT_W_RV64
+ : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisFP<1>]>;
def riscv_fmv_w_x_rv64
: SDNode<"RISCVISD::FMV_W_X_RV64", SDT_RISCVFMV_W_X_RV64>;
def riscv_fmv_x_anyextw_rv64
: SDNode<"RISCVISD::FMV_X_ANYEXTW_RV64", SDT_RISCVFMV_X_ANYEXTW_RV64>;
+def riscv_fcvt_w_rv64
+ : SDNode<"RISCVISD::FCVT_W_RV64", STD_RISCVFCVT_W_RV64>;
+def riscv_fcvt_wu_rv64
+ : SDNode<"RISCVISD::FCVT_WU_RV64", STD_RISCVFCVT_W_RV64>;
//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
@@ -49,9 +55,9 @@
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class FPFMAS_rrr_frm<RISCVOpcode opcode, string opcodestr>
- : RVInstR4<0b00, opcode, (outs FPR32:$rd),
- (ins FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, frmarg:$funct3),
- opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">;
+ : RVInstR4Frm<0b00, opcode, (outs FPR32:$rd),
+ (ins FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, frmarg:$funct3),
+ opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">;
class FPFMASDynFrmAlias<FPFMAS_rrr_frm Inst, string OpcodeStr>
: InstAlias<OpcodeStr#" $rd, $rs1, $rs2, $rs3",
@@ -117,16 +123,16 @@
Sched<[WriteFST32, ReadStoreData, ReadFMemBase]>;
def FMADD_S : FPFMAS_rrr_frm<OPC_MADD, "fmadd.s">,
- Sched<[WriteFMulAdd32, ReadFMulAdd32, ReadFMulAdd32, ReadFMulAdd32]>;
+ Sched<[WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32]>;
def : FPFMASDynFrmAlias<FMADD_S, "fmadd.s">;
def FMSUB_S : FPFMAS_rrr_frm<OPC_MSUB, "fmsub.s">,
- Sched<[WriteFMulSub32, ReadFMulSub32, ReadFMulSub32, ReadFMulSub32]>;
+ Sched<[WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32]>;
def : FPFMASDynFrmAlias<FMSUB_S, "fmsub.s">;
def FNMSUB_S : FPFMAS_rrr_frm<OPC_NMSUB, "fnmsub.s">,
- Sched<[WriteFMulSub32, ReadFMulSub32, ReadFMulSub32, ReadFMulSub32]>;
+ Sched<[WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32]>;
def : FPFMASDynFrmAlias<FNMSUB_S, "fnmsub.s">;
def FNMADD_S : FPFMAS_rrr_frm<OPC_NMADD, "fnmadd.s">,
- Sched<[WriteFMulAdd32, ReadFMulAdd32, ReadFMulAdd32, ReadFMulAdd32]>;
+ Sched<[WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32]>;
def : FPFMASDynFrmAlias<FNMADD_S, "fnmadd.s">;
def FADD_S : FPALUS_rr_frm<0b0000000, "fadd.s">,
@@ -251,27 +257,27 @@
// The following csr instructions actually alias instructions from the base ISA.
// However, it only makes sense to support them when the F extension is enabled.
// NOTE: "frcsr", "frrm", and "frflags" are more specialized version of "csrr".
-def : InstAlias<"frcsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 2>;
-def : InstAlias<"fscsr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs)>;
-def : InstAlias<"fscsr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 2>;
+def : InstAlias<"frcsr $rd", (CSRRS GPR:$rd, SysRegFCSR.Encoding, X0), 2>;
+def : InstAlias<"fscsr $rd, $rs", (CSRRW GPR:$rd, SysRegFCSR.Encoding, GPR:$rs)>;
+def : InstAlias<"fscsr $rs", (CSRRW X0, SysRegFCSR.Encoding, GPR:$rs), 2>;
// frsr, fssr are obsolete aliases replaced by frcsr, fscsr, so give them
// zero weight.
-def : InstAlias<"frsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 0>;
-def : InstAlias<"fssr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs), 0>;
-def : InstAlias<"fssr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 0>;
+def : InstAlias<"frsr $rd", (CSRRS GPR:$rd, SysRegFCSR.Encoding, X0), 0>;
+def : InstAlias<"fssr $rd, $rs", (CSRRW GPR:$rd, SysRegFCSR.Encoding, GPR:$rs), 0>;
+def : InstAlias<"fssr $rs", (CSRRW X0, SysRegFCSR.Encoding, GPR:$rs), 0>;
-def : InstAlias<"frrm $rd", (CSRRS GPR:$rd, FRM.Encoding, X0), 2>;
-def : InstAlias<"fsrm $rd, $rs", (CSRRW GPR:$rd, FRM.Encoding, GPR:$rs)>;
-def : InstAlias<"fsrm $rs", (CSRRW X0, FRM.Encoding, GPR:$rs), 2>;
-def : InstAlias<"fsrmi $rd, $imm", (CSRRWI GPR:$rd, FRM.Encoding, uimm5:$imm)>;
-def : InstAlias<"fsrmi $imm", (CSRRWI X0, FRM.Encoding, uimm5:$imm), 2>;
+def : InstAlias<"frrm $rd", (CSRRS GPR:$rd, SysRegFRM.Encoding, X0), 2>;
+def : InstAlias<"fsrm $rd, $rs", (CSRRW GPR:$rd, SysRegFRM.Encoding, GPR:$rs)>;
+def : InstAlias<"fsrm $rs", (CSRRW X0, SysRegFRM.Encoding, GPR:$rs), 2>;
+def : InstAlias<"fsrmi $rd, $imm", (CSRRWI GPR:$rd, SysRegFRM.Encoding, uimm5:$imm)>;
+def : InstAlias<"fsrmi $imm", (CSRRWI X0, SysRegFRM.Encoding, uimm5:$imm), 2>;
-def : InstAlias<"frflags $rd", (CSRRS GPR:$rd, FFLAGS.Encoding, X0), 2>;
-def : InstAlias<"fsflags $rd, $rs", (CSRRW GPR:$rd, FFLAGS.Encoding, GPR:$rs)>;
-def : InstAlias<"fsflags $rs", (CSRRW X0, FFLAGS.Encoding, GPR:$rs), 2>;
-def : InstAlias<"fsflagsi $rd, $imm", (CSRRWI GPR:$rd, FFLAGS.Encoding, uimm5:$imm)>;
-def : InstAlias<"fsflagsi $imm", (CSRRWI X0, FFLAGS.Encoding, uimm5:$imm), 2>;
+def : InstAlias<"frflags $rd", (CSRRS GPR:$rd, SysRegFFLAGS.Encoding, X0), 2>;
+def : InstAlias<"fsflags $rd, $rs", (CSRRW GPR:$rd, SysRegFFLAGS.Encoding, GPR:$rs)>;
+def : InstAlias<"fsflags $rs", (CSRRW X0, SysRegFFLAGS.Encoding, GPR:$rs), 2>;
+def : InstAlias<"fsflagsi $rd, $imm", (CSRRWI GPR:$rd, SysRegFFLAGS.Encoding, uimm5:$imm)>;
+def : InstAlias<"fsflagsi $imm", (CSRRWI X0, SysRegFFLAGS.Encoding, uimm5:$imm), 2>;
// fmv.w.x and fmv.x.w were previously known as fmv.s.x and fmv.x.s. Both
// spellings should be supported by standard tools.
@@ -337,11 +343,8 @@
def : Pat<(fma (fneg FPR32:$rs1), FPR32:$rs2, (fneg FPR32:$rs3)),
(FNMADD_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>;
-// The RISC-V 2.2 user-level ISA spec defines fmin and fmax as returning the
-// canonical NaN when given a signaling NaN. This doesn't match the LLVM
-// behaviour (see https://bugs.llvm.org/show_bug.cgi?id=27363). However, the
-// draft 2.3 ISA spec changes the definition of fmin and fmax in a way that
-// matches LLVM's fminnum and fmaxnum
+// The ratified 20191213 ISA spec defines fmin and fmax in a way that matches
+// LLVM's fminnum and fmaxnum
// <https://github.com/riscv/riscv-isa-manual/commit/cd20cee7efd9bac7c5aa127ec3b451749d2b3cce>.
def : PatFpr32Fpr32<fminnum, FMIN_S>;
def : PatFpr32Fpr32<fmaxnum, FMAX_S>;
@@ -359,26 +362,32 @@
/// Loads
-defm : LdPat<load, FLW>;
+defm : LdPat<load, FLW, f32>;
/// Stores
-defm : StPat<store, FSW, FPR32>;
+defm : StPat<store, FSW, FPR32, f32>;
} // Predicates = [HasStdExtF]
let Predicates = [HasStdExtF, IsRV32] in {
// Moves (no conversion)
-def : Pat<(bitconvert GPR:$rs1), (FMV_W_X GPR:$rs1)>;
-def : Pat<(bitconvert FPR32:$rs1), (FMV_X_W FPR32:$rs1)>;
+def : Pat<(bitconvert (i32 GPR:$rs1)), (FMV_W_X GPR:$rs1)>;
+def : Pat<(i32 (bitconvert FPR32:$rs1)), (FMV_X_W FPR32:$rs1)>;
// float->[u]int. Round-to-zero must be used.
-def : Pat<(fp_to_sint FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>;
-def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>;
+def : Pat<(i32 (fp_to_sint FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>;
+def : Pat<(i32 (fp_to_uint FPR32:$rs1)), (FCVT_WU_S $rs1, 0b001)>;
+
+// float->int32 with current rounding mode.
+def : Pat<(i32 (lrint FPR32:$rs1)), (FCVT_W_S $rs1, 0b111)>;
+
+// float->int32 rounded to nearest with ties rounded away from zero.
+def : Pat<(i32 (lround FPR32:$rs1)), (FCVT_W_S $rs1, 0b100)>;
// [u]int->float. Match GCC and default to using dynamic rounding mode.
-def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_W $rs1, 0b111)>;
-def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_WU $rs1, 0b111)>;
+def : Pat<(sint_to_fp (i32 GPR:$rs1)), (FCVT_S_W $rs1, 0b111)>;
+def : Pat<(uint_to_fp (i32 GPR:$rs1)), (FCVT_S_WU $rs1, 0b111)>;
} // Predicates = [HasStdExtF, IsRV32]
let Predicates = [HasStdExtF, IsRV64] in {
@@ -388,20 +397,27 @@
def : Pat<(sext_inreg (riscv_fmv_x_anyextw_rv64 FPR32:$src), i32),
(FMV_X_W FPR32:$src)>;
-// FP->[u]int32 is mostly handled by the FP->[u]int64 patterns. This is safe
-// because fpto[u|s]i produces poison if the value can't fit into the target.
-// We match the single case below because fcvt.wu.s sign-extends its result so
-// is cheaper than fcvt.lu.s+sext.w.
-def : Pat<(sext_inreg (assertzexti32 (fp_to_uint FPR32:$rs1)), i32),
- (FCVT_WU_S $rs1, 0b001)>;
+// Use target specific isd nodes to help us remember the result is sign
+// extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be
+// duplicated if it has another user that didn't need the sign_extend.
+def : Pat<(riscv_fcvt_w_rv64 FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>;
+def : Pat<(riscv_fcvt_wu_rv64 FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>;
-// FP->[u]int64
-def : Pat<(fp_to_sint FPR32:$rs1), (FCVT_L_S $rs1, 0b001)>;
-def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_LU_S $rs1, 0b001)>;
+// float->[u]int64. Round-to-zero must be used.
+def : Pat<(i64 (fp_to_sint FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>;
+def : Pat<(i64 (fp_to_uint FPR32:$rs1)), (FCVT_LU_S $rs1, 0b001)>;
+
+// float->int64 with current rounding mode.
+def : Pat<(i64 (lrint FPR32:$rs1)), (FCVT_L_S $rs1, 0b111)>;
+def : Pat<(i64 (llrint FPR32:$rs1)), (FCVT_L_S $rs1, 0b111)>;
+
+// float->int64 rounded to nearest with ties rounded away from zero.
+def : Pat<(i64 (lround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>;
+def : Pat<(i64 (llround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>;
// [u]int->fp. Match GCC and default to using dynamic rounding mode.
-def : Pat<(sint_to_fp (sexti32 GPR:$rs1)), (FCVT_S_W $rs1, 0b111)>;
-def : Pat<(uint_to_fp (zexti32 GPR:$rs1)), (FCVT_S_WU $rs1, 0b111)>;
-def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_L $rs1, 0b111)>;
-def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_LU $rs1, 0b111)>;
+def : Pat<(sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_S_W $rs1, 0b111)>;
+def : Pat<(uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_S_WU $rs1, 0b111)>;
+def : Pat<(sint_to_fp (i64 GPR:$rs1)), (FCVT_S_L $rs1, 0b111)>;
+def : Pat<(uint_to_fp (i64 GPR:$rs1)), (FCVT_S_LU $rs1, 0b111)>;
} // Predicates = [HasStdExtF, IsRV64]
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
index 8cfb903..f654ed1 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -15,9 +15,10 @@
// RISC-V specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def riscv_divw : SDNode<"RISCVISD::DIVW", SDTIntBinOp>;
-def riscv_divuw : SDNode<"RISCVISD::DIVUW", SDTIntBinOp>;
-def riscv_remuw : SDNode<"RISCVISD::REMUW", SDTIntBinOp>;
+def riscv_mulhsu : SDNode<"RISCVISD::MULHSU", SDTIntBinOp>;
+def riscv_divw : SDNode<"RISCVISD::DIVW", SDT_RISCVIntBinOpW>;
+def riscv_divuw : SDNode<"RISCVISD::DIVUW", SDT_RISCVIntBinOpW>;
+def riscv_remuw : SDNode<"RISCVISD::REMUW", SDT_RISCVIntBinOpW>;
//===----------------------------------------------------------------------===//
// Instructions
@@ -63,7 +64,7 @@
def : PatGprGpr<mul, MUL>;
def : PatGprGpr<mulhs, MULH>;
def : PatGprGpr<mulhu, MULHU>;
-// No ISDOpcode for mulhsu
+def : PatGprGpr<riscv_mulhsu, MULHSU>;
def : PatGprGpr<sdiv, DIV>;
def : PatGprGpr<udiv, DIVU>;
def : PatGprGpr<srem, REM>;
@@ -91,9 +92,24 @@
// Although the sexti32 operands may not have originated from an i32 srem,
// this pattern is safe as it is impossible for two sign extended inputs to
// produce a result where res[63:32]=0 and res[31]=1.
-def : Pat<(srem (sexti32 GPR:$rs1), (sexti32 GPR:$rs2)),
- (REMW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(sext_inreg (srem (sexti32 GPR:$rs1),
- (sexti32 GPR:$rs2)), i32),
+def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))),
(REMW GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtM, IsRV64]
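
The REMW pattern above rests on a range argument: when both inputs are sign-extended 32-bit values, the magnitude of the remainder is strictly below 2^31, so the 64-bit result already equals its own 32-bit sign extension and agrees with REMW. A minimal sketch (illustrative C++, not part of the patch; division by zero and the INT32_MIN / -1 overflow case are defined separately by the ISA and not modelled here):

#include <cstdint>

int64_t remw_model(int64_t rs1, int64_t rs2) {
  int32_t a = (int32_t)rs1;          // the sexti32 operands
  int32_t b = (int32_t)rs2;
  // |a % b| < |b| <= 2^31, so the remainder fits in int32 and sign-extending
  // the low 32 bits reproduces the full 64-bit srem result.
  return (int64_t)(int32_t)(a % b);
}
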
+
+let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in {
+// Special case for calculating the full 64-bit product of a 32x32 unsigned
+// multiply where the inputs aren't known to be zero extended. We can shift the
+// inputs left by 32 and use a MULHU. This saves two SRLIs needed to finish
+// zeroing the upper 32 bits.
+// TODO: If one of the operands is zero extended and the other isn't, we might
+// still be better off shifting both left by 32.
+def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))),
+ (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>;
+// Prevent matching the first part of this pattern to mulw. The mul here has
+// additional users or the ANDs would have been removed. The above pattern
+// will be used for the other users. If we form a mulw we'll keep the ANDs alive
+// and they'll still become SLLI+SRLI.
+def : Pat<(sext_inreg (mul (and GPR:$rs1, 0xffffffff),
+ (and GPR:$rs2, 0xffffffff)), i32),
+ (ADDIW (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32)), 0)>;
+} // Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba]
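
The MULHU rewrite above is plain arithmetic: with a and b the low 32 bits of the operands, (a << 32) * (b << 32) = (a * b) * 2^64, so the upper 64 bits of the 128-bit product are exactly a * b. A hedged host-side check (illustrative C++ relying on the GCC/Clang __int128 extension, not part of the patch):

#include <cstdint>

// Model of the selected sequence SLLI / SLLI / MULHU.
uint64_t mul32x32_via_mulhu(uint64_t rs1, uint64_t rs2) {
  uint64_t a = rs1 << 32;                               // SLLI rs1, 32
  uint64_t b = rs2 << 32;                               // SLLI rs2, 32
  return (uint64_t)(((unsigned __int128)a * b) >> 64);  // MULHU
}

// Reference: zero-extend the low words and multiply; both functions agree
// for all inputs, which is the identity the pattern depends on.
uint64_t mul32x32_ref(uint64_t rs1, uint64_t rs2) {
  return (rs1 & 0xFFFFFFFFu) * (rs2 & 0xFFFFFFFFu);
}
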
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index b3fc76a..8af3c8f 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -67,17 +67,50 @@
}
def simm5_plus1 : Operand<XLenVT>, ImmLeaf<XLenVT,
- [{return isInt<5>(Imm - 1);}]> {
+ [{return (isInt<5>(Imm) && Imm != -16) || Imm == 16;}]> {
let ParserMatchClass = SImm5Plus1AsmOperand;
let MCOperandPredicate = [{
int64_t Imm;
if (MCOp.evaluateAsConstantImm(Imm))
- return isInt<5>(Imm - 1);
+ return (isInt<5>(Imm) && Imm != -16) || Imm == 16;
return MCOp.isBareSymbolRef();
}];
}
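
Both spellings of the simm5_plus1 predicate accept the same immediates, [-15, 16]; the rewritten form just never computes Imm - 1 (which overflows for INT64_MIN). A small check (illustrative C++, not part of the patch):

#include <cstdint>

// simm5 covers [-16, 15], so "simm5 plus 1" covers [-15, 16].
//   old form: isInt<5>(Imm - 1)
//   new form: (isInt<5>(Imm) && Imm != -16) || Imm == 16
bool isSImm5Plus1(int64_t Imm) { return Imm >= -15 && Imm <= 16; }
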
//===----------------------------------------------------------------------===//
+// Scheduling definitions.
+//===----------------------------------------------------------------------===//
+
+class VMVRSched<int n>: Sched <[!cast<SchedReadWrite>("WriteVMov" # n # "V"),
+ !cast<SchedReadWrite>("ReadVMov" # n # "V")]>;
+
+class VLESched<int n> : Sched <[!cast<SchedReadWrite>("WriteVLDE" # n),
+ ReadVLDX, ReadVMask]>;
+
+class VSESched<int n> : Sched <[!cast<SchedReadWrite>("WriteVSTE" # n),
+ !cast<SchedReadWrite>("ReadVSTE" # n # "V"),
+ ReadVSTX, ReadVMask]>;
+
+class VLSSched<int n> : Sched <[!cast<SchedReadWrite>("WriteVLDS" # n),
+ ReadVLDX, ReadVLDSX, ReadVMask]>;
+
+class VSSSched<int n> : Sched <[!cast<SchedReadWrite>("WriteVSTS" # n),
+ !cast<SchedReadWrite>("ReadVSTS" # n # "V"),
+ ReadVSTX, ReadVSTSX, ReadVMask]>;
+
+class VLXSched<int n, string o> :
+ Sched <[!cast<SchedReadWrite>("WriteVLD" # o # "X" # n),
+ ReadVLDX, !cast<SchedReadWrite>("ReadVLD" # o # "XV"), ReadVMask]>;
+
+class VSXSched<int n, string o> :
+ Sched <[!cast<SchedReadWrite>("WriteVST" # o # "X" # n),
+ !cast<SchedReadWrite>("ReadVST" # o # "X" # n),
+ ReadVSTX, !cast<SchedReadWrite>("ReadVST" # o # "XV"), ReadVMask]>;
+
+class VLFSched<int n> : Sched <[!cast<SchedReadWrite>("WriteVLDFF" # n),
+ ReadVLDX, ReadVMask]>;
+
+//===----------------------------------------------------------------------===//
// Instruction class templates
//===----------------------------------------------------------------------===//
@@ -86,7 +119,10 @@
class VUnitStrideLoadMask<string opcodestr>
: RVInstVLU<0b000, LSWidth8.Value{3}, LUMOPUnitStrideMask, LSWidth8.Value{2-0},
(outs VR:$vd),
- (ins GPR:$rs1), opcodestr, "$vd, (${rs1})">;
+ (ins GPR:$rs1), opcodestr, "$vd, (${rs1})"> {
+ let vm = 1;
+ let RVVConstraint = NoConstraint;
+}
// load vd, (rs1), vm
class VUnitStrideLoad<RISCVLSUMOP lumop, RISCVWidth width,
@@ -110,9 +146,9 @@
"$vd, (${rs1}), $vs2$vm">;
// vl<nf>r.v vd, (rs1)
-class VWholeLoad<bits<3> nf, RISCVWidth width, string opcodestr>
+class VWholeLoad<bits<3> nf, RISCVWidth width, string opcodestr, RegisterClass VRC>
: RVInstVLU<nf, width.Value{3}, LUMOPUnitStrideWholeReg,
- width.Value{2-0}, (outs VR:$vd), (ins GPR:$rs1),
+ width.Value{2-0}, (outs VRC:$vd), (ins GPR:$rs1),
opcodestr, "$vd, (${rs1})"> {
let vm = 1;
let Uses = [];
@@ -143,11 +179,13 @@
} // hasSideEffects = 0, mayLoad = 1, mayStore = 0
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
-// store vd, vs3, (rs1), vm
+// store vd, vs3, (rs1)
class VUnitStrideStoreMask<string opcodestr>
: RVInstVSU<0b000, LSWidth8.Value{3}, SUMOPUnitStrideMask, LSWidth8.Value{2-0},
(outs), (ins VR:$vs3, GPR:$rs1), opcodestr,
- "$vs3, (${rs1})">;
+ "$vs3, (${rs1})"> {
+ let vm = 1;
+}
// store vd, vs3, (rs1), vm
class VUnitStrideStore<RISCVLSUMOP sumop, RISCVWidth width,
@@ -169,9 +207,9 @@
opcodestr, "$vs3, (${rs1}), $vs2$vm">;
// vs<nf>r.v vd, (rs1)
-class VWholeStore<bits<3> nf, string opcodestr>
+class VWholeStore<bits<3> nf, string opcodestr, RegisterClass VRC>
: RVInstVSU<nf, 0, SUMOPUnitStrideWholeReg,
- 0b000, (outs), (ins VR:$vs3, GPR:$rs1),
+ 0b000, (outs), (ins VRC:$vs3, GPR:$rs1),
opcodestr, "$vs3, (${rs1})"> {
let vm = 1;
let Uses = [];
@@ -323,106 +361,417 @@
// Use these multiclasses to define instructions more easily.
//===----------------------------------------------------------------------===//
multiclass VALU_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
- def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">;
- def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
- def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>;
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVIALUI, ReadVIALUV, ReadVMask]>;
}
multiclass VALU_IV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">;
- def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
-}
-
-multiclass VALUr_IV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def V : VALUrVV<funct6, OPIVV, opcodestr # "." # vw # "v">;
- def X : VALUrVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>;
}
multiclass VALU_IV_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
- def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
- def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>;
-}
-
-multiclass VALU_IV_V<string opcodestr, bits<6> funct6> {
- def _VS : VALUVV<funct6, OPIVV, opcodestr # ".vs">;
-}
-
-multiclass VALUr_IV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def X : VALUrVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVIALUI, ReadVIALUV, ReadVMask]>;
}
multiclass VALU_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">;
- def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">;
+ def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIWALUV, ReadVIWALUV, ReadVIWALUV, ReadVMask]>;
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIWALUX, ReadVIWALUV, ReadVIWALUX, ReadVMask]>;
}
-multiclass VALU_MV_V<string opcodestr, bits<6> funct6> {
- def _VS : VALUVV<funct6, OPMVV, opcodestr # ".vs">;
+multiclass VMAC_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUrVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIMulAddV, ReadVIMulAddV, ReadVIMulAddV, ReadVMask]>;
+ def X : VALUrVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIMulAddX, ReadVIMulAddV, ReadVIMulAddX, ReadVMask]>;
}
-multiclass VALU_MV_Mask<string opcodestr, bits<6> funct6, string vm = "v"> {
- def M : VALUVVNoVm<funct6, OPMVV, opcodestr # "." # vm # "m">;
+multiclass VWMAC_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUrVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIWMulAddV, ReadVIWMulAddV, ReadVIWMulAddV, ReadVMask]>;
+ def X : VALUrVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIWMulAddX, ReadVIWMulAddV, ReadVIWMulAddX, ReadVMask]>;
}
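
The TwoBitsMask patterns just above split a constant with exactly two interesting bits into a pair of single-bit Zbs instructions instead of materialising the constant. A minimal sketch of the OR case (illustrative C++, not part of the patch; the lo/hi parameters stand in for what TrailingZerosXForm and BSETINVTwoBitsMaskHigh extract from the mask):

#include <cstdint>

// x | ((1 << lo) | (1 << hi)) lowered as two BSETI instructions.
uint64_t orTwoBits(uint64_t x, unsigned lo, unsigned hi) {
  x |= UINT64_C(1) << lo;   // bseti x, x, lo
  x |= UINT64_C(1) << hi;   // bseti x, x, hi
  return x;
}
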
-multiclass VALU_MV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">;
-}
-
-multiclass VALUr_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def V : VALUrVV<funct6, OPMVV, opcodestr # "." # vw # "v">;
- def X : VALUrVX<funct6, OPMVX, opcodestr # "." # vw # "x">;
-}
-
-multiclass VALUr_MV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def X : VALUrVX<funct6, OPMVX, opcodestr # "." # vw # "x">;
+multiclass VWMAC_MV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def X : VALUrVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIWMulAddX, ReadVIWMulAddV, ReadVIWMulAddX, ReadVMask]>;
}
multiclass VALU_MV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
- def "" : VALUVs2<funct6, vs1, OPMVV, opcodestr>;
+ def "" : VALUVs2<funct6, vs1, OPMVV, opcodestr>,
+ Sched<[WriteVExtV, ReadVExtV, ReadVMask]>;
}
multiclass VALUm_IV_V_X_I<string opcodestr, bits<6> funct6> {
- def VM : VALUmVV<funct6, OPIVV, opcodestr # ".vvm">;
- def XM : VALUmVX<funct6, OPIVX, opcodestr # ".vxm">;
- def IM : VALUmVI<funct6, opcodestr # ".vim">;
+ def VM : VALUmVV<funct6, OPIVV, opcodestr # ".vvm">,
+ Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>;
+ def XM : VALUmVX<funct6, OPIVX, opcodestr # ".vxm">,
+ Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>;
+ def IM : VALUmVI<funct6, opcodestr # ".vim">,
+ Sched<[WriteVICALUI, ReadVIALUCV, ReadVMask]>;
+}
+
+multiclass VMRG_IV_V_X_I<string opcodestr, bits<6> funct6> {
+ def VM : VALUmVV<funct6, OPIVV, opcodestr # ".vvm">,
+ Sched<[WriteVIMergeV, ReadVIMergeV, ReadVIMergeV, ReadVMask]>;
+ def XM : VALUmVX<funct6, OPIVX, opcodestr # ".vxm">,
+ Sched<[WriteVIMergeX, ReadVIMergeV, ReadVIMergeX, ReadVMask]>;
+ def IM : VALUmVI<funct6, opcodestr # ".vim">,
+ Sched<[WriteVIMergeI, ReadVIMergeV, ReadVMask]>;
}
multiclass VALUm_IV_V_X<string opcodestr, bits<6> funct6> {
- def VM : VALUmVV<funct6, OPIVV, opcodestr # ".vvm">;
- def XM : VALUmVX<funct6, OPIVX, opcodestr # ".vxm">;
+ def VM : VALUmVV<funct6, OPIVV, opcodestr # ".vvm">,
+ Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>;
+ def XM : VALUmVX<funct6, OPIVX, opcodestr # ".vxm">,
+ Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>;
}
multiclass VALUNoVm_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5> {
- def V : VALUVVNoVm<funct6, OPIVV, opcodestr # ".vv">;
- def X : VALUVXNoVm<funct6, OPIVX, opcodestr # ".vx">;
- def I : VALUVINoVm<funct6, opcodestr # ".vi", optype>;
+ def V : VALUVVNoVm<funct6, OPIVV, opcodestr # ".vv">,
+ Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV]>;
+ def X : VALUVXNoVm<funct6, OPIVX, opcodestr # ".vx">,
+ Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX]>;
+ def I : VALUVINoVm<funct6, opcodestr # ".vi", optype>,
+ Sched<[WriteVICALUI, ReadVIALUCV]>;
}
multiclass VALUNoVm_IV_V_X<string opcodestr, bits<6> funct6> {
- def V : VALUVVNoVm<funct6, OPIVV, opcodestr # ".vv">;
- def X : VALUVXNoVm<funct6, OPIVX, opcodestr # ".vx">;
+ def V : VALUVVNoVm<funct6, OPIVV, opcodestr # ".vv">,
+ Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV]>;
+ def X : VALUVXNoVm<funct6, OPIVX, opcodestr # ".vx">,
+ Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX]>;
}
multiclass VALU_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
- def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">;
- def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">;
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFALUV, ReadVFALUV, ReadVFALUV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFALUF, ReadVFALUV, ReadVFALUF, ReadVMask]>;
}
multiclass VALU_FV_F<string opcodestr, bits<6> funct6, string vw = "v"> {
- def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFALUF, ReadVFALUV, ReadVFALUF, ReadVMask]>;
}
-multiclass VALUr_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
- def V : VALUrVV<funct6, OPFVV, opcodestr # "." # vw # "v">;
- def F : VALUrVF<funct6, OPFVF, opcodestr # "." # vw # "f">;
+multiclass VWALU_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFWALUV, ReadVFWALUV, ReadVFWALUV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFWALUF, ReadVFWALUV, ReadVFWALUF, ReadVMask]>;
}
-multiclass VALU_FV_V<string opcodestr, bits<6> funct6> {
- def _VS : VALUVV<funct6, OPFVV, opcodestr # ".vs">;
+multiclass VMUL_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFMulV, ReadVFMulV, ReadVFMulV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFMulF, ReadVFMulV, ReadVFMulF, ReadVMask]>;
}
-multiclass VALU_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
- def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>;
+multiclass VDIV_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFDivV, ReadVFDivV, ReadVFDivV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFDivF, ReadVFDivV, ReadVFDivF, ReadVMask]>;
+}
+
+multiclass VRDIV_FV_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFDivF, ReadVFDivV, ReadVFDivF, ReadVMask]>;
+}
+
+multiclass VWMUL_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFWMulV, ReadVFWMulV, ReadVFWMulV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFWMulF, ReadVFWMulV, ReadVFWMulF, ReadVMask]>;
+}
+
+multiclass VMAC_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUrVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFMulAddV, ReadVFMulAddV, ReadVFMulAddV, ReadVMask]>;
+ def F : VALUrVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFMulAddF, ReadVFMulAddV, ReadVFMulAddF, ReadVMask]>;
+}
+
+multiclass VWMAC_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUrVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddV, ReadVMask]>;
+ def F : VALUrVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFWMulAddF, ReadVFWMulAddV, ReadVFWMulAddF, ReadVMask]>;
+}
+
+multiclass VSQR_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFSqrtV, ReadVFSqrtV, ReadVMask]>;
+}
+
+multiclass VRCP_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFRecpV, ReadVFRecpV, ReadVMask]>;
+}
+
+multiclass VCMP_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFCmpV, ReadVFCmpV, ReadVFCmpV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>;
+}
+
+multiclass VCMP_FV_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>;
+}
+
+multiclass VSGNJ_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFSgnjV, ReadVFSgnjV, ReadVFSgnjV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFSgnjF, ReadVFSgnjV, ReadVFSgnjF, ReadVMask]>;
+}
+
+multiclass VCLS_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFClassV, ReadVFClassV, ReadVMask]>;
+}
+
+multiclass VCVTF_IV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFCvtIToFV, ReadVFCvtIToFV, ReadVMask]>;
+}
+
+multiclass VCVTI_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFCvtFToIV, ReadVFCvtFToIV, ReadVMask]>;
+}
+
+multiclass VWCVTF_IV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFWCvtIToFV, ReadVFWCvtIToFV, ReadVMask]>;
+}
+
+multiclass VWCVTI_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFWCvtFToIV, ReadVFWCvtFToIV, ReadVMask]>;
+}
+
+multiclass VWCVTF_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFWCvtFToFV, ReadVFWCvtFToFV, ReadVMask]>;
+}
+
+multiclass VNCVTF_IV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFNCvtIToFV, ReadVFNCvtIToFV, ReadVMask]>;
+}
+
+multiclass VNCVTI_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFNCvtFToIV, ReadVFNCvtFToIV, ReadVMask]>;
+}
+
+multiclass VNCVTF_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFNCvtFToFV, ReadVFNCvtFToFV, ReadVMask]>;
+}
+
+multiclass VRED_MV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPMVV, opcodestr # ".vs">,
+ Sched<[WriteVIRedV, ReadVIRedV, ReadVIRedV0, ReadVMask]>;
+}
+
+multiclass VWRED_IV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPIVV, opcodestr # ".vs">,
+ Sched<[WriteVIWRedV, ReadVIWRedV, ReadVIWRedV0, ReadVMask]>;
+}
+
+multiclass VRED_FV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPFVV, opcodestr # ".vs">,
+ Sched<[WriteVFRedV, ReadVFRedV, ReadVFRedV0, ReadVMask]>;
+}
+
+multiclass VREDO_FV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPFVV, opcodestr # ".vs">,
+ Sched<[WriteVFRedOV, ReadVFRedOV, ReadVFRedOV0, ReadVMask]>;
+}
+
+multiclass VWRED_FV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPFVV, opcodestr # ".vs">,
+ Sched<[WriteVFWRedV, ReadVFWRedV, ReadVFWRedV0, ReadVMask]>;
+}
+
+multiclass VWREDO_FV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPFVV, opcodestr # ".vs">,
+ Sched<[WriteVFWRedOV, ReadVFWRedOV, ReadVFWRedOV0, ReadVMask]>;
+}
+
+multiclass VMALU_MV_Mask<string opcodestr, bits<6> funct6, string vm = "v"> {
+ def M : VALUVVNoVm<funct6, OPMVV, opcodestr # "." # vm # "m">,
+ Sched<[WriteVMALUV, ReadVMALUV, ReadVMALUV]>;
+}
+
+multiclass VMSFS_MV_V<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPMVV, opcodestr>,
+ Sched<[WriteVMSFSV, ReadVMSFSV, ReadVMask]>;
+}
+
+multiclass VMIOT_MV_V<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPMVV, opcodestr>,
+ Sched<[WriteVMIotV, ReadVMIotV, ReadVMask]>;
+}
+
+multiclass VSHT_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVShiftV, ReadVShiftV, ReadVShiftV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVShiftX, ReadVShiftV, ReadVShiftX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVShiftI, ReadVShiftV, ReadVMask]>;
+}
+
+multiclass VNSHT_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVNShiftV, ReadVNShiftV, ReadVNShiftV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVNShiftX, ReadVNShiftV, ReadVNShiftX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVNShiftI, ReadVNShiftV, ReadVMask]>;
+}
+
+multiclass VCMP_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVICmpI, ReadVICmpV, ReadVMask]>;
+}
+
+multiclass VCMP_IV_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVICmpI, ReadVICmpV, ReadVMask]>;
+}
+
+multiclass VCMP_IV_V_X<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>;
+}
+
+multiclass VMUL_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIMulV, ReadVIMulV, ReadVIMulV, ReadVMask]>;
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIMulX, ReadVIMulV, ReadVIMulX, ReadVMask]>;
+}
+
+multiclass VWMUL_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIWMulV, ReadVIWMulV, ReadVIWMulV, ReadVMask]>;
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIWMulX, ReadVIWMulV, ReadVIWMulX, ReadVMask]>;
+}
+
+multiclass VDIV_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIDivV, ReadVIDivV, ReadVIDivV, ReadVMask]>;
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIDivX, ReadVIDivV, ReadVIDivX, ReadVMask]>;
+}
+
+multiclass VSALU_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVSALUV, ReadVSALUV, ReadVSALUV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVSALUX, ReadVSALUV, ReadVSALUX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVSALUI, ReadVSALUV, ReadVMask]>;
+}
+
+multiclass VSALU_IV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVSALUV, ReadVSALUV, ReadVSALUV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVSALUX, ReadVSALUV, ReadVSALUX, ReadVMask]>;
+}
+
+multiclass VAALU_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVAALUV, ReadVAALUV, ReadVAALUV, ReadVMask]>;
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVAALUX, ReadVAALUV, ReadVAALUX, ReadVMask]>;
+}
+
+multiclass VSMUL_IV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVSMulV, ReadVSMulV, ReadVSMulV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVSMulX, ReadVSMulV, ReadVSMulX, ReadVMask]>;
+}
+
+multiclass VSSHF_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVSShiftV, ReadVSShiftV, ReadVSShiftV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVSShiftX, ReadVSShiftV, ReadVSShiftX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVSShiftI, ReadVSShiftV, ReadVMask]>;
+}
+
+multiclass VNCLP_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVNClipV, ReadVNClipV, ReadVNClipV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVNClipX, ReadVNClipV, ReadVNClipX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVNClipI, ReadVNClipV, ReadVMask]>;
+}
+
+multiclass VSLD_IV_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVISlideX, ReadVISlideV, ReadVISlideX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVISlideI, ReadVISlideV, ReadVMask]>;
+}
+
+multiclass VSLD1_MV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVISlide1X, ReadVISlideV, ReadVISlideX, ReadVMask]>;
+}
+
+multiclass VSLD1_FV_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFSlide1F, ReadVFSlideV, ReadVFSlideF, ReadVMask]>;
+}
+
+multiclass VGTR_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVGatherV, ReadVGatherV, ReadVGatherV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVGatherX, ReadVGatherV, ReadVGatherX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVGatherI, ReadVGatherV, ReadVMask]>;
+}
+
+multiclass VCPR_MV_Mask<string opcodestr, bits<6> funct6, string vm = "v"> {
+ def M : VALUVVNoVm<funct6, OPMVV, opcodestr # "." # vm # "m">,
+ Sched<[WriteVCompressV, ReadVCompressV, ReadVCompressV]>;
}
multiclass VAMO<RISCVAMOOP amoop, RISCVWidth width, string opcodestr> {
@@ -430,11 +779,48 @@
def _UNWD : VAMONoWd<amoop, width, opcodestr>;
}
-multiclass VWholeLoad<bits<3> nf, string opcodestr> {
- def E8_V : VWholeLoad<nf, LSWidth8, opcodestr # "e8.v">;
- def E16_V : VWholeLoad<nf, LSWidth16, opcodestr # "e16.v">;
- def E32_V : VWholeLoad<nf, LSWidth32, opcodestr # "e32.v">;
- def E64_V : VWholeLoad<nf, LSWidth64, opcodestr # "e64.v">;
+multiclass VWholeLoad1<string opcodestr, RegisterClass VRC> {
+ def E8_V : VWholeLoad<0, LSWidth8, opcodestr # "e8.v", VRC>,
+ Sched<[WriteVLD1R8, ReadVLDX]>;
+ def E16_V : VWholeLoad<0, LSWidth16, opcodestr # "e16.v", VRC>,
+ Sched<[WriteVLD1R16, ReadVLDX]>;
+ def E32_V : VWholeLoad<0, LSWidth32, opcodestr # "e32.v", VRC>,
+ Sched<[WriteVLD1R32, ReadVLDX]>;
+ def E64_V : VWholeLoad<0, LSWidth64, opcodestr # "e64.v", VRC>,
+ Sched<[WriteVLD1R64, ReadVLDX]>;
+}
+
+multiclass VWholeLoad2<string opcodestr, RegisterClass VRC> {
+ def E8_V : VWholeLoad<1, LSWidth8, opcodestr # "e8.v", VRC>,
+ Sched<[WriteVLD2R8, ReadVLDX]>;
+ def E16_V : VWholeLoad<1, LSWidth16, opcodestr # "e16.v", VRC>,
+ Sched<[WriteVLD2R16, ReadVLDX]>;
+ def E32_V : VWholeLoad<1, LSWidth32, opcodestr # "e32.v", VRC>,
+ Sched<[WriteVLD2R32, ReadVLDX]>;
+ def E64_V : VWholeLoad<1, LSWidth64, opcodestr # "e64.v", VRC>,
+ Sched<[WriteVLD2R64, ReadVLDX]>;
+}
+
+multiclass VWholeLoad4<string opcodestr, RegisterClass VRC> {
+ def E8_V : VWholeLoad<3, LSWidth8, opcodestr # "e8.v", VRC>,
+ Sched<[WriteVLD4R8, ReadVLDX]>;
+ def E16_V : VWholeLoad<3, LSWidth16, opcodestr # "e16.v", VRC>,
+ Sched<[WriteVLD4R16, ReadVLDX]>;
+ def E32_V : VWholeLoad<3, LSWidth32, opcodestr # "e32.v", VRC>,
+ Sched<[WriteVLD4R32, ReadVLDX]>;
+ def E64_V : VWholeLoad<3, LSWidth64, opcodestr # "e64.v", VRC>,
+ Sched<[WriteVLD4R64, ReadVLDX]>;
+}
+
+multiclass VWholeLoad8<string opcodestr, RegisterClass VRC> {
+ def E8_V : VWholeLoad<7, LSWidth8, opcodestr # "e8.v", VRC>,
+ Sched<[WriteVLD8R8, ReadVLDX]>;
+ def E16_V : VWholeLoad<7, LSWidth16, opcodestr # "e16.v", VRC>,
+ Sched<[WriteVLD8R16, ReadVLDX]>;
+ def E32_V : VWholeLoad<7, LSWidth32, opcodestr # "e32.v", VRC>,
+ Sched<[WriteVLD8R32, ReadVLDX]>;
+ def E64_V : VWholeLoad<7, LSWidth64, opcodestr # "e64.v", VRC>,
+ Sched<[WriteVLD8R64, ReadVLDX]>;
}
//===----------------------------------------------------------------------===//
@@ -454,69 +840,94 @@
} // hasSideEffects = 1, mayLoad = 0, mayStore = 0
// Vector Unit-Stride Instructions
-def VLE8_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth8, "vle8.v">;
-def VLE16_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth16, "vle16.v">;
-def VLE32_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth32, "vle32.v">;
-def VLE64_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth64, "vle64.v">;
+def VLE8_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth8, "vle8.v">,
+ VLESched<8>;
+def VLE16_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth16, "vle16.v">,
+ VLESched<16>;
+def VLE32_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth32, "vle32.v">,
+ VLESched<32>;
+def VLE64_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth64, "vle64.v">,
+ VLESched<64>;
-def VLE8FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth8, "vle8ff.v">;
-def VLE16FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth16, "vle16ff.v">;
-def VLE32FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth32, "vle32ff.v">;
-def VLE64FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth64, "vle64ff.v">;
+def VLE8FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth8, "vle8ff.v">,
+ VLFSched<8>;
+def VLE16FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth16, "vle16ff.v">,
+ VLFSched<16>;
+def VLE32FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth32, "vle32ff.v">,
+ VLFSched<32>;
+def VLE64FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth64, "vle64ff.v">,
+ VLFSched<64>;
-def VLE1_V : VUnitStrideLoadMask<"vle1.v">;
-def VSE1_V : VUnitStrideStoreMask<"vse1.v">;
+def VLE1_V : VUnitStrideLoadMask<"vle1.v">,
+ Sched<[WriteVLDM, ReadVLDX]>;
+def VSE1_V : VUnitStrideStoreMask<"vse1.v">,
+ Sched<[WriteVSTM, ReadVSTM, ReadVSTX]>;
-def VSE8_V : VUnitStrideStore<SUMOPUnitStride, LSWidth8, "vse8.v">;
-def VSE16_V : VUnitStrideStore<SUMOPUnitStride, LSWidth16, "vse16.v">;
-def VSE32_V : VUnitStrideStore<SUMOPUnitStride, LSWidth32, "vse32.v">;
-def VSE64_V : VUnitStrideStore<SUMOPUnitStride, LSWidth64, "vse64.v">;
+def VSE8_V : VUnitStrideStore<SUMOPUnitStride, LSWidth8, "vse8.v">,
+ VSESched<8>;
+def VSE16_V : VUnitStrideStore<SUMOPUnitStride, LSWidth16, "vse16.v">,
+ VSESched<16>;
+def VSE32_V : VUnitStrideStore<SUMOPUnitStride, LSWidth32, "vse32.v">,
+ VSESched<32>;
+def VSE64_V : VUnitStrideStore<SUMOPUnitStride, LSWidth64, "vse64.v">,
+ VSESched<64>;
// Vector Strided Instructions
-def VLSE8_V : VStridedLoad<LSWidth8, "vlse8.v">;
-def VLSE16_V : VStridedLoad<LSWidth16, "vlse16.v">;
-def VLSE32_V : VStridedLoad<LSWidth32, "vlse32.v">;
-def VLSE64_V : VStridedLoad<LSWidth64, "vlse64.v">;
+def VLSE8_V : VStridedLoad<LSWidth8, "vlse8.v">,
+ VLSSched<8>;
+def VLSE16_V : VStridedLoad<LSWidth16, "vlse16.v">,
+ VLSSched<16>;
+def VLSE32_V : VStridedLoad<LSWidth32, "vlse32.v">,
+ VLSSched<32>;
+def VLSE64_V : VStridedLoad<LSWidth64, "vlse64.v">,
+ VLSSched<64>;
-def VSSE8_V : VStridedStore<LSWidth8, "vsse8.v">;
-def VSSE16_V : VStridedStore<LSWidth16, "vsse16.v">;
-def VSSE32_V : VStridedStore<LSWidth32, "vsse32.v">;
-def VSSE64_V : VStridedStore<LSWidth64, "vsse64.v">;
+def VSSE8_V : VStridedStore<LSWidth8, "vsse8.v">,
+ VSSSched<8>;
+def VSSE16_V : VStridedStore<LSWidth16, "vsse16.v">,
+ VSSSched<16>;
+def VSSE32_V : VStridedStore<LSWidth32, "vsse32.v">,
+ VSSSched<32>;
+def VSSE64_V : VStridedStore<LSWidth64, "vsse64.v">,
+ VSSSched<64>;
// Vector Indexed Instructions
-def VLUXEI8_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth8, "vluxei8.v">;
-def VLUXEI16_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth16, "vluxei16.v">;
-def VLUXEI32_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth32, "vluxei32.v">;
-def VLUXEI64_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth64, "vluxei64.v">;
+foreach n = [8, 16, 32, 64] in {
+defvar w = !cast<RISCVWidth>("LSWidth" # n);
-def VLOXEI8_V : VIndexedLoad<MOPLDIndexedOrder, LSWidth8, "vloxei8.v">;
-def VLOXEI16_V : VIndexedLoad<MOPLDIndexedOrder, LSWidth16, "vloxei16.v">;
-def VLOXEI32_V : VIndexedLoad<MOPLDIndexedOrder, LSWidth32, "vloxei32.v">;
-def VLOXEI64_V : VIndexedLoad<MOPLDIndexedOrder, LSWidth64, "vloxei64.v">;
+def VLUXEI # n # _V :
+ VIndexedLoad<MOPLDIndexedUnord, w, "vluxei" # n # ".v">,
+ VLXSched<n, "U">;
+def VLOXEI # n # _V :
+ VIndexedLoad<MOPLDIndexedOrder, w, "vloxei" # n # ".v">,
+ VLXSched<n, "O">;
-def VSUXEI8_V : VIndexedStore<MOPSTIndexedUnord, LSWidth8, "vsuxei8.v">;
-def VSUXEI16_V : VIndexedStore<MOPSTIndexedUnord, LSWidth16, "vsuxei16.v">;
-def VSUXEI32_V : VIndexedStore<MOPSTIndexedUnord, LSWidth32, "vsuxei32.v">;
-def VSUXEI64_V : VIndexedStore<MOPSTIndexedUnord, LSWidth64, "vsuxei64.v">;
+def VSUXEI # n # _V :
+ VIndexedStore<MOPSTIndexedUnord, w, "vsuxei" # n # ".v">,
+ VSXSched<n, "U">;
+def VSOXEI # n # _V :
+ VIndexedStore<MOPSTIndexedOrder, w, "vsoxei" # n # ".v">,
+ VSXSched<n, "O">;
+}
-def VSOXEI8_V : VIndexedStore<MOPSTIndexedOrder, LSWidth8, "vsoxei8.v">;
-def VSOXEI16_V : VIndexedStore<MOPSTIndexedOrder, LSWidth16, "vsoxei16.v">;
-def VSOXEI32_V : VIndexedStore<MOPSTIndexedOrder, LSWidth32, "vsoxei32.v">;
-def VSOXEI64_V : VIndexedStore<MOPSTIndexedOrder, LSWidth64, "vsoxei64.v">;
+defm VL1R : VWholeLoad1<"vl1r", VR>;
+defm VL2R : VWholeLoad2<"vl2r", VRM2>;
+defm VL4R : VWholeLoad4<"vl4r", VRM4>;
+defm VL8R : VWholeLoad8<"vl8r", VRM8>;
-defm VL1R : VWholeLoad<0, "vl1r">;
-defm VL2R : VWholeLoad<1, "vl2r">;
-defm VL4R : VWholeLoad<3, "vl4r">;
-defm VL8R : VWholeLoad<7, "vl8r">;
def : InstAlias<"vl1r.v $vd, (${rs1})", (VL1RE8_V VR:$vd, GPR:$rs1)>;
-def : InstAlias<"vl2r.v $vd, (${rs1})", (VL2RE8_V VR:$vd, GPR:$rs1)>;
-def : InstAlias<"vl4r.v $vd, (${rs1})", (VL4RE8_V VR:$vd, GPR:$rs1)>;
-def : InstAlias<"vl8r.v $vd, (${rs1})", (VL8RE8_V VR:$vd, GPR:$rs1)>;
+def : InstAlias<"vl2r.v $vd, (${rs1})", (VL2RE8_V VRM2:$vd, GPR:$rs1)>;
+def : InstAlias<"vl4r.v $vd, (${rs1})", (VL4RE8_V VRM4:$vd, GPR:$rs1)>;
+def : InstAlias<"vl8r.v $vd, (${rs1})", (VL8RE8_V VRM8:$vd, GPR:$rs1)>;
-def VS1R_V : VWholeStore<0, "vs1r.v">;
-def VS2R_V : VWholeStore<1, "vs2r.v">;
-def VS4R_V : VWholeStore<3, "vs4r.v">;
-def VS8R_V : VWholeStore<7, "vs8r.v">;
+def VS1R_V : VWholeStore<0, "vs1r.v", VR>,
+ Sched<[WriteVST1R, ReadVST1R, ReadVSTX]>;
+def VS2R_V : VWholeStore<1, "vs2r.v", VRM2>,
+ Sched<[WriteVST2R, ReadVST2R, ReadVSTX]>;
+def VS4R_V : VWholeStore<3, "vs4r.v", VRM4>,
+ Sched<[WriteVST4R, ReadVST4R, ReadVSTX]>;
+def VS8R_V : VWholeStore<7, "vs8r.v", VRM8>,
+ Sched<[WriteVST8R, ReadVST8R, ReadVSTX]>;
// Vector Single-Width Integer Add and Subtract
defm VADD_V : VALU_IV_V_X_I<"vadd", 0b000000>;
@@ -583,9 +994,9 @@
(VXOR_VI VR:$vd, VR:$vs, -1, VMaskOp:$vm)>;
// Vector Single-Width Bit Shift Instructions
-defm VSLL_V : VALU_IV_V_X_I<"vsll", 0b100101, uimm5>;
-defm VSRL_V : VALU_IV_V_X_I<"vsrl", 0b101000, uimm5>;
-defm VSRA_V : VALU_IV_V_X_I<"vsra", 0b101001, uimm5>;
+defm VSLL_V : VSHT_IV_V_X_I<"vsll", 0b100101, uimm5>;
+defm VSRL_V : VSHT_IV_V_X_I<"vsrl", 0b101000, uimm5>;
+defm VSRA_V : VSHT_IV_V_X_I<"vsra", 0b101001, uimm5>;
// Vector Narrowing Integer Right Shift Instructions
// Refer to 11.3. Narrowing Vector Arithmetic Instructions
@@ -593,8 +1004,8 @@
// vector register group (specified by vs2). The destination vector register
// group cannot overlap the mask register if used, unless LMUL=1.
let Constraints = "@earlyclobber $vd" in {
-defm VNSRL_W : VALU_IV_V_X_I<"vnsrl", 0b101100, uimm5, "w">;
-defm VNSRA_W : VALU_IV_V_X_I<"vnsra", 0b101101, uimm5, "w">;
+defm VNSRL_W : VNSHT_IV_V_X_I<"vnsrl", 0b101100, uimm5, "w">;
+defm VNSRA_W : VNSHT_IV_V_X_I<"vnsra", 0b101101, uimm5, "w">;
} // Constraints = "@earlyclobber $vd"
def : InstAlias<"vncvt.x.x.w $vd, $vs$vm",
@@ -602,14 +1013,14 @@
// Vector Integer Comparison Instructions
let RVVConstraint = NoConstraint in {
-defm VMSEQ_V : VALU_IV_V_X_I<"vmseq", 0b011000>;
-defm VMSNE_V : VALU_IV_V_X_I<"vmsne", 0b011001>;
-defm VMSLTU_V : VALU_IV_V_X<"vmsltu", 0b011010>;
-defm VMSLT_V : VALU_IV_V_X<"vmslt", 0b011011>;
-defm VMSLEU_V : VALU_IV_V_X_I<"vmsleu", 0b011100>;
-defm VMSLE_V : VALU_IV_V_X_I<"vmsle", 0b011101>;
-defm VMSGTU_V : VALU_IV_X_I<"vmsgtu", 0b011110>;
-defm VMSGT_V : VALU_IV_X_I<"vmsgt", 0b011111>;
+defm VMSEQ_V : VCMP_IV_V_X_I<"vmseq", 0b011000>;
+defm VMSNE_V : VCMP_IV_V_X_I<"vmsne", 0b011001>;
+defm VMSLTU_V : VCMP_IV_V_X<"vmsltu", 0b011010>;
+defm VMSLT_V : VCMP_IV_V_X<"vmslt", 0b011011>;
+defm VMSLEU_V : VCMP_IV_V_X_I<"vmsleu", 0b011100>;
+defm VMSLE_V : VCMP_IV_V_X_I<"vmsle", 0b011101>;
+defm VMSGTU_V : VCMP_IV_X_I<"vmsgtu", 0b011110>;
+defm VMSGT_V : VCMP_IV_X_I<"vmsgt", 0b011111>;
} // RVVConstraint = NoConstraint
def : InstAlias<"vmsgtu.vv $vd, $va, $vb$vm",
@@ -658,93 +1069,96 @@
def PseudoVMSGE_VX_M : Pseudo<(outs VRNoV0:$vd),
(ins VR:$vs2, GPR:$rs1, VMaskOp:$vm),
[], "vmsge.vx", "$vd, $vs2, $rs1$vm">;
-def PseudoVMSGEU_VX_M_T : Pseudo<(outs VMV0:$vd, VR:$scratch),
+def PseudoVMSGEU_VX_M_T : Pseudo<(outs VR:$vd, VRNoV0:$scratch),
(ins VR:$vs2, GPR:$rs1, VMaskOp:$vm),
[], "vmsgeu.vx", "$vd, $vs2, $rs1$vm, $scratch">;
-def PseudoVMSGE_VX_M_T : Pseudo<(outs VMV0:$vd, VR:$scratch),
+def PseudoVMSGE_VX_M_T : Pseudo<(outs VR:$vd, VRNoV0:$scratch),
(ins VR:$vs2, GPR:$rs1, VMaskOp:$vm),
[], "vmsge.vx", "$vd, $vs2, $rs1$vm, $scratch">;
}
// Vector Integer Min/Max Instructions
-defm VMINU_V : VALU_IV_V_X<"vminu", 0b000100>;
-defm VMIN_V : VALU_IV_V_X<"vmin", 0b000101>;
-defm VMAXU_V : VALU_IV_V_X<"vmaxu", 0b000110>;
-defm VMAX_V : VALU_IV_V_X<"vmax", 0b000111>;
+defm VMINU_V : VCMP_IV_V_X<"vminu", 0b000100>;
+defm VMIN_V : VCMP_IV_V_X<"vmin", 0b000101>;
+defm VMAXU_V : VCMP_IV_V_X<"vmaxu", 0b000110>;
+defm VMAX_V : VCMP_IV_V_X<"vmax", 0b000111>;
// Vector Single-Width Integer Multiply Instructions
-defm VMUL_V : VALU_MV_V_X<"vmul", 0b100101>;
-defm VMULH_V : VALU_MV_V_X<"vmulh", 0b100111>;
-defm VMULHU_V : VALU_MV_V_X<"vmulhu", 0b100100>;
-defm VMULHSU_V : VALU_MV_V_X<"vmulhsu", 0b100110>;
+defm VMUL_V : VMUL_MV_V_X<"vmul", 0b100101>;
+defm VMULH_V : VMUL_MV_V_X<"vmulh", 0b100111>;
+defm VMULHU_V : VMUL_MV_V_X<"vmulhu", 0b100100>;
+defm VMULHSU_V : VMUL_MV_V_X<"vmulhsu", 0b100110>;
// Vector Integer Divide Instructions
-defm VDIVU_V : VALU_MV_V_X<"vdivu", 0b100000>;
-defm VDIV_V : VALU_MV_V_X<"vdiv", 0b100001>;
-defm VREMU_V : VALU_MV_V_X<"vremu", 0b100010>;
-defm VREM_V : VALU_MV_V_X<"vrem", 0b100011>;
+defm VDIVU_V : VDIV_MV_V_X<"vdivu", 0b100000>;
+defm VDIV_V : VDIV_MV_V_X<"vdiv", 0b100001>;
+defm VREMU_V : VDIV_MV_V_X<"vremu", 0b100010>;
+defm VREM_V : VDIV_MV_V_X<"vrem", 0b100011>;
// Vector Widening Integer Multiply Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in {
-defm VWMUL_V : VALU_MV_V_X<"vwmul", 0b111011>;
-defm VWMULU_V : VALU_MV_V_X<"vwmulu", 0b111000>;
-defm VWMULSU_V : VALU_MV_V_X<"vwmulsu", 0b111010>;
+defm VWMUL_V : VWMUL_MV_V_X<"vwmul", 0b111011>;
+defm VWMULU_V : VWMUL_MV_V_X<"vwmulu", 0b111000>;
+defm VWMULSU_V : VWMUL_MV_V_X<"vwmulsu", 0b111010>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
// Vector Single-Width Integer Multiply-Add Instructions
-defm VMACC_V : VALUr_MV_V_X<"vmacc", 0b101101>;
-defm VNMSAC_V : VALUr_MV_V_X<"vnmsac", 0b101111>;
-defm VMADD_V : VALUr_MV_V_X<"vmadd", 0b101001>;
-defm VNMSUB_V : VALUr_MV_V_X<"vnmsub", 0b101011>;
+defm VMACC_V : VMAC_MV_V_X<"vmacc", 0b101101>;
+defm VNMSAC_V : VMAC_MV_V_X<"vnmsac", 0b101111>;
+defm VMADD_V : VMAC_MV_V_X<"vmadd", 0b101001>;
+defm VNMSUB_V : VMAC_MV_V_X<"vnmsub", 0b101011>;
// Vector Widening Integer Multiply-Add Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in {
-defm VWMACCU_V : VALUr_MV_V_X<"vwmaccu", 0b111100>;
-defm VWMACC_V : VALUr_MV_V_X<"vwmacc", 0b111101>;
-defm VWMACCSU_V : VALUr_MV_V_X<"vwmaccsu", 0b111111>;
-defm VWMACCUS_V : VALUr_MV_X<"vwmaccus", 0b111110>;
+defm VWMACCU_V : VWMAC_MV_V_X<"vwmaccu", 0b111100>;
+defm VWMACC_V : VWMAC_MV_V_X<"vwmacc", 0b111101>;
+defm VWMACCSU_V : VWMAC_MV_V_X<"vwmaccsu", 0b111111>;
+defm VWMACCUS_V : VWMAC_MV_X<"vwmaccus", 0b111110>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
// Vector Integer Merge Instructions
-defm VMERGE_V : VALUm_IV_V_X_I<"vmerge", 0b010111>;
+defm VMERGE_V : VMRG_IV_V_X_I<"vmerge", 0b010111>;
// Vector Integer Move Instructions
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vs2 = 0, vm = 1,
RVVConstraint = NoConstraint in {
// op vd, vs1
def VMV_V_V : RVInstVV<0b010111, OPIVV, (outs VR:$vd),
- (ins VR:$vs1), "vmv.v.v", "$vd, $vs1">;
+ (ins VR:$vs1), "vmv.v.v", "$vd, $vs1">,
+ Sched<[WriteVIMovV, ReadVIMovV]>;
// op vd, rs1
def VMV_V_X : RVInstVX<0b010111, OPIVX, (outs VR:$vd),
- (ins GPR:$rs1), "vmv.v.x", "$vd, $rs1">;
+ (ins GPR:$rs1), "vmv.v.x", "$vd, $rs1">,
+ Sched<[WriteVIMovX, ReadVIMovX]>;
// op vd, imm
def VMV_V_I : RVInstIVI<0b010111, (outs VR:$vd),
- (ins simm5:$imm), "vmv.v.i", "$vd, $imm">;
+ (ins simm5:$imm), "vmv.v.i", "$vd, $imm">,
+ Sched<[WriteVIMovI]>;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
// Vector Fixed-Point Arithmetic Instructions
-defm VSADDU_V : VALU_IV_V_X_I<"vsaddu", 0b100000>;
-defm VSADD_V : VALU_IV_V_X_I<"vsadd", 0b100001>;
-defm VSSUBU_V : VALU_IV_V_X<"vssubu", 0b100010>;
-defm VSSUB_V : VALU_IV_V_X<"vssub", 0b100011>;
+defm VSADDU_V : VSALU_IV_V_X_I<"vsaddu", 0b100000>;
+defm VSADD_V : VSALU_IV_V_X_I<"vsadd", 0b100001>;
+defm VSSUBU_V : VSALU_IV_V_X<"vssubu", 0b100010>;
+defm VSSUB_V : VSALU_IV_V_X<"vssub", 0b100011>;
// Vector Single-Width Averaging Add and Subtract
-defm VAADDU_V : VALU_MV_V_X<"vaaddu", 0b001000>;
-defm VAADD_V : VALU_MV_V_X<"vaadd", 0b001001>;
-defm VASUBU_V : VALU_MV_V_X<"vasubu", 0b001010>;
-defm VASUB_V : VALU_MV_V_X<"vasub", 0b001011>;
+defm VAADDU_V : VAALU_MV_V_X<"vaaddu", 0b001000>;
+defm VAADD_V : VAALU_MV_V_X<"vaadd", 0b001001>;
+defm VASUBU_V : VAALU_MV_V_X<"vasubu", 0b001010>;
+defm VASUB_V : VAALU_MV_V_X<"vasub", 0b001011>;
// Vector Single-Width Fractional Multiply with Rounding and Saturation
-defm VSMUL_V : VALU_IV_V_X<"vsmul", 0b100111>;
+defm VSMUL_V : VSMUL_IV_V_X<"vsmul", 0b100111>;
// Vector Single-Width Scaling Shift Instructions
-defm VSSRL_V : VALU_IV_V_X_I<"vssrl", 0b101010, uimm5>;
-defm VSSRA_V : VALU_IV_V_X_I<"vssra", 0b101011, uimm5>;
+defm VSSRL_V : VSSHF_IV_V_X_I<"vssrl", 0b101010, uimm5>;
+defm VSSRA_V : VSSHF_IV_V_X_I<"vssra", 0b101011, uimm5>;
// Vector Narrowing Fixed-Point Clip Instructions
let Constraints = "@earlyclobber $vd" in {
-defm VNCLIPU_W : VALU_IV_V_X_I<"vnclipu", 0b101110, uimm5, "w">;
-defm VNCLIP_W : VALU_IV_V_X_I<"vnclip", 0b101111, uimm5, "w">;
+defm VNCLIPU_W : VNCLP_IV_V_X_I<"vnclipu", 0b101110, uimm5, "w">;
+defm VNCLIP_W : VNCLP_IV_V_X_I<"vnclip", 0b101111, uimm5, "w">;
} // Constraints = "@earlyclobber $vd"
} // Predicates = [HasStdExtV]
@@ -757,72 +1171,74 @@
// Vector Widening Floating-Point Add/Subtract Instructions
let Constraints = "@earlyclobber $vd" in {
let RVVConstraint = WidenV in {
-defm VFWADD_V : VALU_FV_V_F<"vfwadd", 0b110000>;
-defm VFWSUB_V : VALU_FV_V_F<"vfwsub", 0b110010>;
+defm VFWADD_V : VWALU_FV_V_F<"vfwadd", 0b110000>;
+defm VFWSUB_V : VWALU_FV_V_F<"vfwsub", 0b110010>;
} // RVVConstraint = WidenV
// Set earlyclobber for following instructions for second and mask operands.
// This has the downside that the earlyclobber constraint is too coarse and
// will impose unnecessary restrictions by not allowing the destination to
// overlap with the first (wide) operand.
let RVVConstraint = WidenW in {
-defm VFWADD_W : VALU_FV_V_F<"vfwadd", 0b110100, "w">;
-defm VFWSUB_W : VALU_FV_V_F<"vfwsub", 0b110110, "w">;
+defm VFWADD_W : VWALU_FV_V_F<"vfwadd", 0b110100, "w">;
+defm VFWSUB_W : VWALU_FV_V_F<"vfwsub", 0b110110, "w">;
} // RVVConstraint = WidenW
} // Constraints = "@earlyclobber $vd"
// Vector Single-Width Floating-Point Multiply/Divide Instructions
-defm VFMUL_V : VALU_FV_V_F<"vfmul", 0b100100>;
-defm VFDIV_V : VALU_FV_V_F<"vfdiv", 0b100000>;
-defm VFRDIV_V : VALU_FV_F<"vfrdiv", 0b100001>;
+defm VFMUL_V : VMUL_FV_V_F<"vfmul", 0b100100>;
+defm VFDIV_V : VDIV_FV_V_F<"vfdiv", 0b100000>;
+defm VFRDIV_V : VRDIV_FV_F<"vfrdiv", 0b100001>;
// Vector Widening Floating-Point Multiply
let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in {
-defm VFWMUL_V : VALU_FV_V_F<"vfwmul", 0b111000>;
+defm VFWMUL_V : VWMUL_FV_V_F<"vfwmul", 0b111000>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
// Vector Single-Width Floating-Point Fused Multiply-Add Instructions
-defm VFMACC_V : VALUr_FV_V_F<"vfmacc", 0b101100>;
-defm VFNMACC_V : VALUr_FV_V_F<"vfnmacc", 0b101101>;
-defm VFMSAC_V : VALUr_FV_V_F<"vfmsac", 0b101110>;
-defm VFNMSAC_V : VALUr_FV_V_F<"vfnmsac", 0b101111>;
-defm VFMADD_V : VALUr_FV_V_F<"vfmadd", 0b101000>;
-defm VFNMADD_V : VALUr_FV_V_F<"vfnmadd", 0b101001>;
-defm VFMSUB_V : VALUr_FV_V_F<"vfmsub", 0b101010>;
-defm VFNMSUB_V : VALUr_FV_V_F<"vfnmsub", 0b101011>;
+defm VFMACC_V : VMAC_FV_V_F<"vfmacc", 0b101100>;
+defm VFNMACC_V : VMAC_FV_V_F<"vfnmacc", 0b101101>;
+defm VFMSAC_V : VMAC_FV_V_F<"vfmsac", 0b101110>;
+defm VFNMSAC_V : VMAC_FV_V_F<"vfnmsac", 0b101111>;
+defm VFMADD_V : VMAC_FV_V_F<"vfmadd", 0b101000>;
+defm VFNMADD_V : VMAC_FV_V_F<"vfnmadd", 0b101001>;
+defm VFMSUB_V : VMAC_FV_V_F<"vfmsub", 0b101010>;
+defm VFNMSUB_V : VMAC_FV_V_F<"vfnmsub", 0b101011>;
// Vector Widening Floating-Point Fused Multiply-Add Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in {
-defm VFWMACC_V : VALUr_FV_V_F<"vfwmacc", 0b111100>;
-defm VFWNMACC_V : VALUr_FV_V_F<"vfwnmacc", 0b111101>;
-defm VFWMSAC_V : VALUr_FV_V_F<"vfwmsac", 0b111110>;
-defm VFWNMSAC_V : VALUr_FV_V_F<"vfwnmsac", 0b111111>;
+defm VFWMACC_V : VWMAC_FV_V_F<"vfwmacc", 0b111100>;
+defm VFWNMACC_V : VWMAC_FV_V_F<"vfwnmacc", 0b111101>;
+defm VFWMSAC_V : VWMAC_FV_V_F<"vfwmsac", 0b111110>;
+defm VFWNMSAC_V : VWMAC_FV_V_F<"vfwnmsac", 0b111111>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
// Vector Floating-Point Square-Root Instruction
-defm VFSQRT_V : VALU_FV_VS2<"vfsqrt.v", 0b010011, 0b00000>;
-defm VFRSQRT7_V : VALU_FV_VS2<"vfrsqrt7.v", 0b010011, 0b00100>;
-defm VFREC7_V : VALU_FV_VS2<"vfrec7.v", 0b010011, 0b00101>;
+defm VFSQRT_V : VSQR_FV_VS2<"vfsqrt.v", 0b010011, 0b00000>;
+defm VFRSQRT7_V : VRCP_FV_VS2<"vfrsqrt7.v", 0b010011, 0b00100>;
+defm VFREC7_V : VRCP_FV_VS2<"vfrec7.v", 0b010011, 0b00101>;
// Vector Floating-Point MIN/MAX Instructions
-defm VFMIN_V : VALU_FV_V_F<"vfmin", 0b000100>;
-defm VFMAX_V : VALU_FV_V_F<"vfmax", 0b000110>;
+defm VFMIN_V : VCMP_FV_V_F<"vfmin", 0b000100>;
+defm VFMAX_V : VCMP_FV_V_F<"vfmax", 0b000110>;
// Vector Floating-Point Sign-Injection Instructions
-defm VFSGNJ_V : VALU_FV_V_F<"vfsgnj", 0b001000>;
-defm VFSGNJN_V : VALU_FV_V_F<"vfsgnjn", 0b001001>;
-defm VFSGNJX_V : VALU_FV_V_F<"vfsgnjx", 0b001010>;
+defm VFSGNJ_V : VSGNJ_FV_V_F<"vfsgnj", 0b001000>;
+defm VFSGNJN_V : VSGNJ_FV_V_F<"vfsgnjn", 0b001001>;
+defm VFSGNJX_V : VSGNJ_FV_V_F<"vfsgnjx", 0b001010>;
def : InstAlias<"vfneg.v $vd, $vs$vm",
(VFSGNJN_VV VR:$vd, VR:$vs, VR:$vs, VMaskOp:$vm)>;
+def : InstAlias<"vfabs.v $vd, $vs$vm",
+ (VFSGNJX_VV VR:$vd, VR:$vs, VR:$vs, VMaskOp:$vm)>;
// Vector Floating-Point Compare Instructions
let RVVConstraint = NoConstraint in {
-defm VMFEQ_V : VALU_FV_V_F<"vmfeq", 0b011000>;
-defm VMFNE_V : VALU_FV_V_F<"vmfne", 0b011100>;
-defm VMFLT_V : VALU_FV_V_F<"vmflt", 0b011011>;
-defm VMFLE_V : VALU_FV_V_F<"vmfle", 0b011001>;
-defm VMFGT_V : VALU_FV_F<"vmfgt", 0b011101>;
-defm VMFGE_V : VALU_FV_F<"vmfge", 0b011111>;
+defm VMFEQ_V : VCMP_FV_V_F<"vmfeq", 0b011000>;
+defm VMFNE_V : VCMP_FV_V_F<"vmfne", 0b011100>;
+defm VMFLT_V : VCMP_FV_V_F<"vmflt", 0b011011>;
+defm VMFLE_V : VCMP_FV_V_F<"vmfle", 0b011001>;
+defm VMFGT_V : VCMP_FV_F<"vmfgt", 0b011101>;
+defm VMFGE_V : VCMP_FV_F<"vmfge", 0b011111>;
} // RVVConstraint = NoConstraint
def : InstAlias<"vmfgt.vv $vd, $va, $vb$vm",
@@ -831,68 +1247,70 @@
(VMFLE_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>;
// Vector Floating-Point Classify Instruction
-defm VFCLASS_V : VALU_FV_VS2<"vfclass.v", 0b010011, 0b10000>;
+defm VFCLASS_V : VCLS_FV_VS2<"vfclass.v", 0b010011, 0b10000>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+
// Vector Floating-Point Merge Instruction
+let vm = 0 in
def VFMERGE_VFM : RVInstVX<0b010111, OPFVF, (outs VR:$vd),
(ins VR:$vs2, FPR32:$rs1, VMV0:$v0),
- "vfmerge.vfm", "$vd, $vs2, $rs1, v0"> {
- let vm = 0;
-}
+ "vfmerge.vfm", "$vd, $vs2, $rs1, v0">,
+ Sched<[WriteVFMergeV, ReadVFMergeV, ReadVFMergeF, ReadVMask]>;
// Vector Floating-Point Move Instruction
let RVVConstraint = NoConstraint in
+let vm = 1, vs2 = 0 in
def VFMV_V_F : RVInstVX<0b010111, OPFVF, (outs VR:$vd),
- (ins FPR32:$rs1), "vfmv.v.f", "$vd, $rs1"> {
- let vs2 = 0;
- let vm = 1;
-}
+ (ins FPR32:$rs1), "vfmv.v.f", "$vd, $rs1">,
+ Sched<[WriteVFMovV, ReadVFMovF]>;
+
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
// Single-Width Floating-Point/Integer Type-Convert Instructions
-defm VFCVT_XU_F_V : VALU_FV_VS2<"vfcvt.xu.f.v", 0b010010, 0b00000>;
-defm VFCVT_X_F_V : VALU_FV_VS2<"vfcvt.x.f.v", 0b010010, 0b00001>;
-defm VFCVT_RTZ_XU_F_V : VALU_FV_VS2<"vfcvt.rtz.xu.f.v", 0b010010, 0b00110>;
-defm VFCVT_RTZ_X_F_V : VALU_FV_VS2<"vfcvt.rtz.x.f.v", 0b010010, 0b00111>;
-defm VFCVT_F_XU_V : VALU_FV_VS2<"vfcvt.f.xu.v", 0b010010, 0b00010>;
-defm VFCVT_F_X_V : VALU_FV_VS2<"vfcvt.f.x.v", 0b010010, 0b00011>;
+defm VFCVT_XU_F_V : VCVTI_FV_VS2<"vfcvt.xu.f.v", 0b010010, 0b00000>;
+defm VFCVT_X_F_V : VCVTI_FV_VS2<"vfcvt.x.f.v", 0b010010, 0b00001>;
+defm VFCVT_RTZ_XU_F_V : VCVTI_FV_VS2<"vfcvt.rtz.xu.f.v", 0b010010, 0b00110>;
+defm VFCVT_RTZ_X_F_V : VCVTI_FV_VS2<"vfcvt.rtz.x.f.v", 0b010010, 0b00111>;
+defm VFCVT_F_XU_V : VCVTF_IV_VS2<"vfcvt.f.xu.v", 0b010010, 0b00010>;
+defm VFCVT_F_X_V : VCVTF_IV_VS2<"vfcvt.f.x.v", 0b010010, 0b00011>;
// Widening Floating-Point/Integer Type-Convert Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt in {
-defm VFWCVT_XU_F_V : VALU_FV_VS2<"vfwcvt.xu.f.v", 0b010010, 0b01000>;
-defm VFWCVT_X_F_V : VALU_FV_VS2<"vfwcvt.x.f.v", 0b010010, 0b01001>;
-defm VFWCVT_RTZ_XU_F_V : VALU_FV_VS2<"vfwcvt.rtz.xu.f.v", 0b010010, 0b01110>;
-defm VFWCVT_RTZ_X_F_V : VALU_FV_VS2<"vfwcvt.rtz.x.f.v", 0b010010, 0b01111>;
-defm VFWCVT_F_XU_V : VALU_FV_VS2<"vfwcvt.f.xu.v", 0b010010, 0b01010>;
-defm VFWCVT_F_X_V : VALU_FV_VS2<"vfwcvt.f.x.v", 0b010010, 0b01011>;
-defm VFWCVT_F_F_V : VALU_FV_VS2<"vfwcvt.f.f.v", 0b010010, 0b01100>;
+defm VFWCVT_XU_F_V : VWCVTI_FV_VS2<"vfwcvt.xu.f.v", 0b010010, 0b01000>;
+defm VFWCVT_X_F_V : VWCVTI_FV_VS2<"vfwcvt.x.f.v", 0b010010, 0b01001>;
+defm VFWCVT_RTZ_XU_F_V : VWCVTI_FV_VS2<"vfwcvt.rtz.xu.f.v", 0b010010, 0b01110>;
+defm VFWCVT_RTZ_X_F_V : VWCVTI_FV_VS2<"vfwcvt.rtz.x.f.v", 0b010010, 0b01111>;
+defm VFWCVT_F_XU_V : VWCVTF_IV_VS2<"vfwcvt.f.xu.v", 0b010010, 0b01010>;
+defm VFWCVT_F_X_V : VWCVTF_IV_VS2<"vfwcvt.f.x.v", 0b010010, 0b01011>;
+defm VFWCVT_F_F_V : VWCVTF_FV_VS2<"vfwcvt.f.f.v", 0b010010, 0b01100>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt
// Narrowing Floating-Point/Integer Type-Convert Instructions
let Constraints = "@earlyclobber $vd" in {
-defm VFNCVT_XU_F_W : VALU_FV_VS2<"vfncvt.xu.f.w", 0b010010, 0b10000>;
-defm VFNCVT_X_F_W : VALU_FV_VS2<"vfncvt.x.f.w", 0b010010, 0b10001>;
-defm VFNCVT_RTZ_XU_F_W : VALU_FV_VS2<"vfncvt.rtz.xu.f.w", 0b010010, 0b10110>;
-defm VFNCVT_RTZ_X_F_W : VALU_FV_VS2<"vfncvt.rtz.x.f.w", 0b010010, 0b10111>;
-defm VFNCVT_F_XU_W : VALU_FV_VS2<"vfncvt.f.xu.w", 0b010010, 0b10010>;
-defm VFNCVT_F_X_W : VALU_FV_VS2<"vfncvt.f.x.w", 0b010010, 0b10011>;
-defm VFNCVT_F_F_W : VALU_FV_VS2<"vfncvt.f.f.w", 0b010010, 0b10100>;
-defm VFNCVT_ROD_F_F_W : VALU_FV_VS2<"vfncvt.rod.f.f.w", 0b010010, 0b10101>;
+defm VFNCVT_XU_F_W : VNCVTI_FV_VS2<"vfncvt.xu.f.w", 0b010010, 0b10000>;
+defm VFNCVT_X_F_W : VNCVTI_FV_VS2<"vfncvt.x.f.w", 0b010010, 0b10001>;
+defm VFNCVT_RTZ_XU_F_W : VNCVTI_FV_VS2<"vfncvt.rtz.xu.f.w", 0b010010, 0b10110>;
+defm VFNCVT_RTZ_X_F_W : VNCVTI_FV_VS2<"vfncvt.rtz.x.f.w", 0b010010, 0b10111>;
+defm VFNCVT_F_XU_W : VNCVTF_IV_VS2<"vfncvt.f.xu.w", 0b010010, 0b10010>;
+defm VFNCVT_F_X_W : VNCVTF_IV_VS2<"vfncvt.f.x.w", 0b010010, 0b10011>;
+defm VFNCVT_F_F_W : VNCVTF_FV_VS2<"vfncvt.f.f.w", 0b010010, 0b10100>;
+defm VFNCVT_ROD_F_F_W : VNCVTF_FV_VS2<"vfncvt.rod.f.f.w", 0b010010, 0b10101>;
} // Constraints = "@earlyclobber $vd"
} // Predicates = [HasStdExtV, HasStdExtF]
let Predicates = [HasStdExtV] in {
+
// Vector Single-Width Integer Reduction Instructions
let RVVConstraint = NoConstraint in {
-defm VREDSUM : VALU_MV_V<"vredsum", 0b000000>;
-defm VREDMAXU : VALU_MV_V<"vredmaxu", 0b000110>;
-defm VREDMAX : VALU_MV_V<"vredmax", 0b000111>;
-defm VREDMINU : VALU_MV_V<"vredminu", 0b000100>;
-defm VREDMIN : VALU_MV_V<"vredmin", 0b000101>;
-defm VREDAND : VALU_MV_V<"vredand", 0b000001>;
-defm VREDOR : VALU_MV_V<"vredor", 0b000010>;
-defm VREDXOR : VALU_MV_V<"vredxor", 0b000011>;
+defm VREDSUM : VRED_MV_V<"vredsum", 0b000000>;
+defm VREDMAXU : VRED_MV_V<"vredmaxu", 0b000110>;
+defm VREDMAX : VRED_MV_V<"vredmax", 0b000111>;
+defm VREDMINU : VRED_MV_V<"vredminu", 0b000100>;
+defm VREDMIN : VRED_MV_V<"vredmin", 0b000101>;
+defm VREDAND : VRED_MV_V<"vredand", 0b000001>;
+defm VREDOR : VRED_MV_V<"vredor", 0b000010>;
+defm VREDXOR : VRED_MV_V<"vredxor", 0b000011>;
} // RVVConstraint = NoConstraint
// Vector Widening Integer Reduction Instructions
@@ -901,18 +1319,19 @@
// This has the downside that the earlyclobber constraint is too coarse and
// will impose unnecessary restrictions by not allowing the destination to
// overlap with the first (wide) operand.
-defm VWREDSUMU : VALU_IV_V<"vwredsumu", 0b110000>;
-defm VWREDSUM : VALU_IV_V<"vwredsum", 0b110001>;
+defm VWREDSUMU : VWRED_IV_V<"vwredsumu", 0b110000>;
+defm VWREDSUM : VWRED_IV_V<"vwredsum", 0b110001>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint
+
} // Predicates = [HasStdExtV]
let Predicates = [HasStdExtV, HasStdExtF] in {
// Vector Single-Width Floating-Point Reduction Instructions
let RVVConstraint = NoConstraint in {
-defm VFREDOSUM : VALU_FV_V<"vfredosum", 0b000011>;
-defm VFREDSUM : VALU_FV_V<"vfredsum", 0b000001>;
-defm VFREDMAX : VALU_FV_V<"vfredmax", 0b000111>;
-defm VFREDMIN : VALU_FV_V<"vfredmin", 0b000101>;
+defm VFREDOSUM : VREDO_FV_V<"vfredosum", 0b000011>;
+defm VFREDSUM : VRED_FV_V<"vfredsum", 0b000001>;
+defm VFREDMAX : VRED_FV_V<"vfredmax", 0b000111>;
+defm VFREDMIN : VRED_FV_V<"vfredmin", 0b000101>;
} // RVVConstraint = NoConstraint
// Vector Widening Floating-Point Reduction Instructions
@@ -921,22 +1340,22 @@
// This has the downside that the earlyclobber constraint is too coarse and
// will impose unnecessary restrictions by not allowing the destination to
// overlap with the first (wide) operand.
-defm VFWREDOSUM : VALU_FV_V<"vfwredosum", 0b110011>;
-defm VFWREDSUM : VALU_FV_V<"vfwredsum", 0b110001>;
+defm VFWREDOSUM : VWREDO_FV_V<"vfwredosum", 0b110011>;
+defm VFWREDSUM : VWRED_FV_V<"vfwredsum", 0b110001>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint
} // Predicates = [HasStdExtV, HasStdExtF]
let Predicates = [HasStdExtV] in {
// Vector Mask-Register Logical Instructions
let RVVConstraint = NoConstraint in {
-defm VMAND_M : VALU_MV_Mask<"vmand", 0b011001, "m">;
-defm VMNAND_M : VALU_MV_Mask<"vmnand", 0b011101, "m">;
-defm VMANDNOT_M : VALU_MV_Mask<"vmandnot", 0b011000, "m">;
-defm VMXOR_M : VALU_MV_Mask<"vmxor", 0b011011, "m">;
-defm VMOR_M : VALU_MV_Mask<"vmor", 0b011010, "m">;
-defm VMNOR_M : VALU_MV_Mask<"vmnor", 0b011110, "m">;
-defm VMORNOT_M : VALU_MV_Mask<"vmornot", 0b011100, "m">;
-defm VMXNOR_M : VALU_MV_Mask<"vmxnor", 0b011111, "m">;
+defm VMAND_M : VMALU_MV_Mask<"vmand", 0b011001, "m">;
+defm VMNAND_M : VMALU_MV_Mask<"vmnand", 0b011101, "m">;
+defm VMANDNOT_M : VMALU_MV_Mask<"vmandnot", 0b011000, "m">;
+defm VMXOR_M : VMALU_MV_Mask<"vmxor", 0b011011, "m">;
+defm VMOR_M : VMALU_MV_Mask<"vmor", 0b011010, "m">;
+defm VMNOR_M : VMALU_MV_Mask<"vmnor", 0b011110, "m">;
+defm VMORNOT_M : VMALU_MV_Mask<"vmornot", 0b011100, "m">;
+defm VMXNOR_M : VMALU_MV_Mask<"vmxnor", 0b011111, "m">;
}
def : InstAlias<"vmmv.m $vd, $vs",
@@ -950,98 +1369,113 @@
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
RVVConstraint = NoConstraint in {
+
// Vector mask population count vpopc
def VPOPC_M : RVInstV<0b010000, 0b10000, OPMVV, (outs GPR:$vd),
- (ins VR:$vs2, VMaskOp:$vm),
- "vpopc.m", "$vd, $vs2$vm">;
+ (ins VR:$vs2, VMaskOp:$vm),
+ "vpopc.m", "$vd, $vs2$vm">,
+ Sched<[WriteVMPopV, ReadVMPopV, ReadVMask]>;
// vfirst find-first-set mask bit
def VFIRST_M : RVInstV<0b010000, 0b10001, OPMVV, (outs GPR:$vd),
- (ins VR:$vs2, VMaskOp:$vm),
- "vfirst.m", "$vd, $vs2$vm">;
+ (ins VR:$vs2, VMaskOp:$vm),
+ "vfirst.m", "$vd, $vs2$vm">,
+ Sched<[WriteVMFFSV, ReadVMFFSV, ReadVMask]>;
+
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
let Constraints = "@earlyclobber $vd", RVVConstraint = Iota in {
+
// vmsbf.m set-before-first mask bit
-defm VMSBF_M : VALU_MV_VS2<"vmsbf.m", 0b010100, 0b00001>;
+defm VMSBF_M : VMSFS_MV_V<"vmsbf.m", 0b010100, 0b00001>;
// vmsif.m set-including-first mask bit
-defm VMSIF_M : VALU_MV_VS2<"vmsif.m", 0b010100, 0b00011>;
+defm VMSIF_M : VMSFS_MV_V<"vmsif.m", 0b010100, 0b00011>;
// vmsof.m set-only-first mask bit
-defm VMSOF_M : VALU_MV_VS2<"vmsof.m", 0b010100, 0b00010>;
+defm VMSOF_M : VMSFS_MV_V<"vmsof.m", 0b010100, 0b00010>;
// Vector Iota Instruction
-defm VIOTA_M : VALU_MV_VS2<"viota.m", 0b010100, 0b10000>;
+defm VIOTA_M : VMIOT_MV_V<"viota.m", 0b010100, 0b10000>;
+
} // Constraints = "@earlyclobber $vd", RVVConstraint = Iota
// Vector Element Index Instruction
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+
+let vs2 = 0 in
def VID_V : RVInstV<0b010100, 0b10001, OPMVV, (outs VR:$vd),
- (ins VMaskOp:$vm), "vid.v", "$vd$vm"> {
- let vs2 = 0;
-}
+ (ins VMaskOp:$vm), "vid.v", "$vd$vm">,
+ Sched<[WriteVMIdxV, ReadVMask]>;
// Integer Scalar Move Instructions
let vm = 1, RVVConstraint = NoConstraint in {
def VMV_X_S : RVInstV<0b010000, 0b00000, OPMVV, (outs GPR:$vd),
- (ins VR:$vs2), "vmv.x.s", "$vd, $vs2">;
+ (ins VR:$vs2), "vmv.x.s", "$vd, $vs2">,
+ Sched<[WriteVIMovVX, ReadVIMovVX]>;
let Constraints = "$vd = $vd_wb" in
def VMV_S_X : RVInstV2<0b010000, 0b00000, OPMVX, (outs VR:$vd_wb),
- (ins VR:$vd, GPR:$rs1), "vmv.s.x", "$vd, $rs1">;
-
+ (ins VR:$vd, GPR:$rs1), "vmv.s.x", "$vd, $rs1">,
+ Sched<[WriteVIMovXV, ReadVIMovXV, ReadVIMovXX]>;
}
+
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
+
} // Predicates = [HasStdExtV]
let Predicates = [HasStdExtV, HasStdExtF] in {
+
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1,
RVVConstraint = NoConstraint in {
// Floating-Point Scalar Move Instructions
def VFMV_F_S : RVInstV<0b010000, 0b00000, OPFVV, (outs FPR32:$vd),
- (ins VR:$vs2), "vfmv.f.s", "$vd, $vs2">;
+ (ins VR:$vs2), "vfmv.f.s", "$vd, $vs2">,
+ Sched<[WriteVFMovVF, ReadVFMovVF]>;
let Constraints = "$vd = $vd_wb" in
def VFMV_S_F : RVInstV2<0b010000, 0b00000, OPFVF, (outs VR:$vd_wb),
- (ins VR:$vd, FPR32:$rs1), "vfmv.s.f", "$vd, $rs1">;
+ (ins VR:$vd, FPR32:$rs1), "vfmv.s.f", "$vd, $rs1">,
+ Sched<[WriteVFMovFV, ReadVFMovFV, ReadVFMovFX]>;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1
+
} // Predicates = [HasStdExtV, HasStdExtF]
let Predicates = [HasStdExtV] in {
// Vector Slide Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in {
-defm VSLIDEUP_V : VALU_IV_X_I<"vslideup", 0b001110, uimm5>;
-defm VSLIDE1UP_V : VALU_MV_X<"vslide1up", 0b001110>;
+defm VSLIDEUP_V : VSLD_IV_X_I<"vslideup", 0b001110, uimm5>;
+defm VSLIDE1UP_V : VSLD1_MV_X<"vslide1up", 0b001110>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp
-defm VSLIDEDOWN_V : VALU_IV_X_I<"vslidedown", 0b001111, uimm5>;
-defm VSLIDE1DOWN_V : VALU_MV_X<"vslide1down", 0b001111>;
+defm VSLIDEDOWN_V : VSLD_IV_X_I<"vslidedown", 0b001111, uimm5>;
+defm VSLIDE1DOWN_V : VSLD1_MV_X<"vslide1down", 0b001111>;
} // Predicates = [HasStdExtV]
let Predicates = [HasStdExtV, HasStdExtF] in {
let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in {
-defm VFSLIDE1UP_V : VALU_FV_F<"vfslide1up", 0b001110>;
+defm VFSLIDE1UP_V : VSLD1_FV_F<"vfslide1up", 0b001110>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp
-defm VFSLIDE1DOWN_V : VALU_FV_F<"vfslide1down", 0b001111>;
+defm VFSLIDE1DOWN_V : VSLD1_FV_F<"vfslide1down", 0b001111>;
} // Predicates = [HasStdExtV, HasStdExtF]
let Predicates = [HasStdExtV] in {
// Vector Register Gather Instruction
let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in {
-defm VRGATHER_V : VALU_IV_V_X_I<"vrgather", 0b001100, uimm5>;
-def VRGATHEREI16_VV : VALUVV<0b001110, OPIVV, "vrgatherei16.vv">;
+defm VRGATHER_V : VGTR_IV_V_X_I<"vrgather", 0b001100, uimm5>;
+def VRGATHEREI16_VV : VALUVV<0b001110, OPIVV, "vrgatherei16.vv">,
+ Sched<[WriteVGatherV, ReadVGatherV, ReadVGatherV]>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather
// Vector Compress Instruction
let Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress in {
-defm VCOMPRESS_V : VALU_MV_Mask<"vcompress", 0b010111>;
+defm VCOMPRESS_V : VCPR_MV_Mask<"vcompress", 0b010111>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
RVVConstraint = NoConstraint in {
-foreach nf = [1, 2, 4, 8] in {
- def VMV#nf#R_V : RVInstV<0b100111, !add(nf, -1), OPIVI, (outs VR:$vd),
- (ins VR:$vs2), "vmv" # nf # "r.v",
- "$vd, $vs2"> {
- let Uses = [];
- let vm = 1;
- }
+foreach n = [1, 2, 4, 8] in {
+ def VMV#n#R_V : RVInstV<0b100111, !add(n, -1), OPIVI, (outs VR:$vd),
+ (ins VR:$vs2), "vmv" # n # "r.v", "$vd, $vs2">,
+ VMVRSched<n> {
+ let Uses = [];
+ let vm = 1;
+}
}
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
} // Predicates = [HasStdExtV]
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 5c22882..0284ff6 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -20,22 +20,13 @@
def riscv_read_vlenb : SDNode<"RISCVISD::READ_VLENB",
SDTypeProfile<1, 0, [SDTCisVT<0, XLenVT>]>>;
-def riscv_vleff : SDNode<"RISCVISD::VLEFF",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisPtrTy<1>,
- SDTCisVT<2, XLenVT>]>,
- [SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
- SDNPSideEffect]>;
-def riscv_vleff_mask : SDNode<"RISCVISD::VLEFF_MASK",
- SDTypeProfile<1, 4, [SDTCisVec<0>,
- SDTCisSameAs<0, 1>,
- SDTCisPtrTy<2>,
- SDTCVecEltisVT<3, i1>,
- SDTCisVT<4, XLenVT>]>,
- [SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
- SDNPSideEffect]>;
-def riscv_read_vl : SDNode<"RISCVISD::READ_VL",
- SDTypeProfile<1, 0, [SDTCisVT<0, XLenVT>]>,
- [SDNPInGlue]>;
+// Operand that is allowed to be a register or a 5 bit immediate.
+// This allows us to pick between VSETIVLI and VSETVLI opcodes using the same
+// pseudo instructions.
+def AVL : RegisterOperand<GPR> {
+ let OperandNamespace = "RISCVOp";
+ let OperandType = "OPERAND_AVL";
+}
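// Editor's illustration (not part of the imported patch): the AVL operand lets
// one pseudo carry the vector length whether it is a register or a small
// immediate. A hypothetical pseudo following the operand layout used elsewhere
// in this file might look like
//   def PseudoFooVV : Pseudo<(outs VR:$vd),
//                            (ins VR:$vs2, VR:$vs1, AVL:$vl, ixlenimm:$sew), []>;
// so later lowering can pick VSETIVLI when $vl is a 5-bit immediate and
// VSETVLI when it is a GPR. "PseudoFooVV" is a made-up name for illustration.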
// X0 has special meaning for vsetvl/vsetvli.
// rd | rs1 | AVL value | Effect on vl
@@ -54,7 +45,7 @@
//===----------------------------------------------------------------------===//
// This class describes information associated to the LMUL.
-class LMULInfo<int lmul, VReg regclass, VReg wregclass,
+class LMULInfo<int lmul, int oct, VReg regclass, VReg wregclass,
VReg f2regclass, VReg f4regclass, VReg f8regclass, string mx> {
bits<3> value = lmul; // This is encoded as the vlmul field of vtype.
VReg vrclass = regclass;
@@ -63,22 +54,39 @@
VReg f4vrclass = f4regclass;
VReg f2vrclass = f2regclass;
string MX = mx;
+ int octuple = oct;
}
// Associate LMUL with tablegen records of register classes.
-def V_M1 : LMULInfo<0b000, VR, VRM2, VR, VR, VR, "M1">;
-def V_M2 : LMULInfo<0b001, VRM2, VRM4, VR, VR, VR, "M2">;
-def V_M4 : LMULInfo<0b010, VRM4, VRM8, VRM2, VR, VR, "M4">;
-def V_M8 : LMULInfo<0b011, VRM8,/*NoVReg*/VR, VRM4, VRM2, VR, "M8">;
+def V_M1 : LMULInfo<0b000, 8, VR, VRM2, VR, VR, VR, "M1">;
+def V_M2 : LMULInfo<0b001, 16, VRM2, VRM4, VR, VR, VR, "M2">;
+def V_M4 : LMULInfo<0b010, 32, VRM4, VRM8, VRM2, VR, VR, "M4">;
+def V_M8 : LMULInfo<0b011, 64, VRM8,/*NoVReg*/VR, VRM4, VRM2, VR, "M8">;
-def V_MF8 : LMULInfo<0b101, VR, VR,/*NoVReg*/VR,/*NoVReg*/VR,/*NoVReg*/VR, "MF8">;
-def V_MF4 : LMULInfo<0b110, VR, VR, VR,/*NoVReg*/VR,/*NoVReg*/VR, "MF4">;
-def V_MF2 : LMULInfo<0b111, VR, VR, VR, VR,/*NoVReg*/VR, "MF2">;
+def V_MF8 : LMULInfo<0b101, 1, VR, VR,/*NoVReg*/VR,/*NoVReg*/VR,/*NoVReg*/VR, "MF8">;
+def V_MF4 : LMULInfo<0b110, 2, VR, VR, VR,/*NoVReg*/VR,/*NoVReg*/VR, "MF4">;
+def V_MF2 : LMULInfo<0b111, 4, VR, VR, VR, VR,/*NoVReg*/VR, "MF2">;
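// Editor's illustration (not part of the imported patch): the new octuple
// field is LMUL scaled by 8, so every LMUL, fractional or whole, maps to an
// integer: MF8 -> 1, MF4 -> 2, MF2 -> 4, M1 -> 8, M2 -> 16, M4 -> 32, M8 -> 64.
// That lets TableGen reason about LMUL with plain integer arithmetic, e.g.
//   !srl(V_M2.octuple, 3)   // 16 >> 3 == 2, i.e. LMUL = 2
// instead of string-matching on MX as the removed octuple_from_str class did.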
// Used to iterate over all possible LMULs.
def MxList {
list<LMULInfo> m = [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8];
}
+// Used for widening and narrowing instructions as it doesn't contain M8.
+def MxListW {
+ list<LMULInfo> m = [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4];
+}
+// Use for zext/sext.vf2
+def MxListVF2 {
+ list<LMULInfo> m = [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8];
+}
+// Use for zext/sext.vf4
+def MxListVF4 {
+ list<LMULInfo> m = [V_MF2, V_M1, V_M2, V_M4, V_M8];
+}
+// Use for zext/sext.vf8
+def MxListVF8 {
+ list<LMULInfo> m = [V_M1, V_M2, V_M4, V_M8];
+}
class FPR_Info<RegisterClass regclass, string fx> {
RegisterClass fprclass = regclass;
@@ -92,6 +100,10 @@
def FPList {
list<FPR_Info> fpinfo = [SCALAR_F16, SCALAR_F32, SCALAR_F64];
}
+// Used for widening instructions. It excludes F64.
+def FPListW {
+ list<FPR_Info> fpinfo = [SCALAR_F16, SCALAR_F32];
+}
class MxSet<int eew> {
list<LMULInfo> m = !cond(!eq(eew, 8) : [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8],
@@ -107,18 +119,8 @@
true: [2, 3, 4, 5, 6, 7, 8]);
}
-class shift_amount<int num> {
- int val = !if(!eq(num, 1), 0, !add(1, shift_amount<!srl(num, 1)>.val));
-}
-
-class octuple_from_str<string MX> {
- int ret = !cond(!eq(MX, "MF8") : 1,
- !eq(MX, "MF4") : 2,
- !eq(MX, "MF2") : 4,
- !eq(MX, "M1") : 8,
- !eq(MX, "M2") : 16,
- !eq(MX, "M4") : 32,
- !eq(MX, "M8") : 64);
+class log2<int num> {
+ int val = !if(!eq(num, 1), 0, !add(1, log2<!srl(num, 1)>.val));
}
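// Worked example (editor's illustration, not part of the imported patch): the
// recursion shifts num right by one and adds 1 each step, so
//   log2<1>.val = 0, log2<2>.val = 1, log2<4>.val = 2, log2<8>.val = 3, ...
// VTypeInfo below uses it as Log2SEW = log2<Sew>.val, e.g. SEW=64 yields
// Log2SEW=6, which fits the 3-bit Log2SEW fields of the pseudo search tables.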
class octuple_to_str<int octuple> {
@@ -132,6 +134,8 @@
"NoDef")))))));
}
+def VLOpFrag : PatFrag<(ops), (XLenVT (VLOp (XLenVT AVL:$vl)))>;
+
// Output pattern for X0 used to represent VLMAX in the pseudo instructions.
def VLMax : OutPatFrag<(ops), (XLenVT X0)>;
@@ -155,6 +159,7 @@
ValueType Vector = Vec;
ValueType Mask = Mas;
int SEW = Sew;
+ int Log2SEW = log2<Sew>.val;
VReg RegClass = Reg;
LMULInfo LMul = M;
ValueType Scalar = Scal;
@@ -180,14 +185,16 @@
defset list<VTypeInfo> AllVectors = {
defset list<VTypeInfo> AllIntegerVectors = {
defset list<VTypeInfo> NoGroupIntegerVectors = {
- def VI8MF8: VTypeInfo<vint8mf8_t, vbool64_t, 8, VR, V_MF8>;
- def VI8MF4: VTypeInfo<vint8mf4_t, vbool32_t, 8, VR, V_MF4>;
- def VI8MF2: VTypeInfo<vint8mf2_t, vbool16_t, 8, VR, V_MF2>;
+ defset list<VTypeInfo> FractionalGroupIntegerVectors = {
+ def VI8MF8: VTypeInfo<vint8mf8_t, vbool64_t, 8, VR, V_MF8>;
+ def VI8MF4: VTypeInfo<vint8mf4_t, vbool32_t, 8, VR, V_MF4>;
+ def VI8MF2: VTypeInfo<vint8mf2_t, vbool16_t, 8, VR, V_MF2>;
+ def VI16MF4: VTypeInfo<vint16mf4_t, vbool64_t, 16, VR, V_MF4>;
+ def VI16MF2: VTypeInfo<vint16mf2_t, vbool32_t, 16, VR, V_MF2>;
+ def VI32MF2: VTypeInfo<vint32mf2_t, vbool64_t, 32, VR, V_MF2>;
+ }
def VI8M1: VTypeInfo<vint8m1_t, vbool8_t, 8, VR, V_M1>;
- def VI16MF4: VTypeInfo<vint16mf4_t, vbool64_t, 16, VR, V_MF4>;
- def VI16MF2: VTypeInfo<vint16mf2_t, vbool32_t, 16, VR, V_MF2>;
def VI16M1: VTypeInfo<vint16m1_t, vbool16_t, 16, VR, V_M1>;
- def VI32MF2: VTypeInfo<vint32mf2_t, vbool64_t, 32, VR, V_MF2>;
def VI32M1: VTypeInfo<vint32m1_t, vbool32_t, 32, VR, V_M1>;
def VI64M1: VTypeInfo<vint64m1_t, vbool64_t, 64, VR, V_M1>;
}
@@ -212,13 +219,13 @@
defset list<VTypeInfo> AllFloatVectors = {
defset list<VTypeInfo> NoGroupFloatVectors = {
- def VF16MF4: VTypeInfo<vfloat16mf4_t, vbool64_t, 16, VR, V_MF4, f16, FPR16>;
- def VF16MF2: VTypeInfo<vfloat16mf2_t, vbool32_t, 16, VR, V_MF2, f16, FPR16>;
+ defset list<VTypeInfo> FractionalGroupFloatVectors = {
+ def VF16MF4: VTypeInfo<vfloat16mf4_t, vbool64_t, 16, VR, V_MF4, f16, FPR16>;
+ def VF16MF2: VTypeInfo<vfloat16mf2_t, vbool32_t, 16, VR, V_MF2, f16, FPR16>;
+ def VF32MF2: VTypeInfo<vfloat32mf2_t,vbool64_t, 32, VR, V_MF2, f32, FPR32>;
+ }
def VF16M1: VTypeInfo<vfloat16m1_t, vbool16_t, 16, VR, V_M1, f16, FPR16>;
-
- def VF32MF2: VTypeInfo<vfloat32mf2_t,vbool64_t, 32, VR, V_MF2, f32, FPR32>;
def VF32M1: VTypeInfo<vfloat32m1_t, vbool32_t, 32, VR, V_M1, f32, FPR32>;
-
def VF64M1: VTypeInfo<vfloat64m1_t, vbool64_t, 64, VR, V_M1, f64, FPR64>;
}
@@ -260,8 +267,10 @@
class MTypeInfo<ValueType Mas, LMULInfo M, string Bx> {
ValueType Mask = Mas;
// {SEW, VLMul} values set a valid VType to deal with this mask type.
- // we assume SEW=8 and set corresponding LMUL.
- int SEW = 8;
+ // we assume SEW=1 and set corresponding LMUL. vsetvli insertion will
+ // look for SEW=1 to optimize based on surrounding instructions.
+ int SEW = 1;
+ int Log2SEW = 0;
LMULInfo LMul = M;
string BX = Bx; // Appendix of mask operations.
// The pattern fragment which produces the AVL operand, representing the
@@ -403,29 +412,149 @@
let Fields = [ "Pseudo", "BaseInstr" ];
let PrimaryKey = [ "Pseudo" ];
let PrimaryKeyName = "getPseudoInfo";
+ let PrimaryKeyEarlyOut = true;
}
def RISCVVIntrinsicsTable : GenericTable {
let FilterClass = "RISCVVIntrinsic";
let CppTypeName = "RISCVVIntrinsicInfo";
- let Fields = ["IntrinsicID", "ExtendOperand"];
+ let Fields = ["IntrinsicID", "SplatOperand"];
let PrimaryKey = ["IntrinsicID"];
let PrimaryKeyName = "getRISCVVIntrinsicInfo";
}
-class RISCVZvlsseg<string IntrName, bits<11> S, bits<3> L, bits<3> IL = V_M1.value> {
- Intrinsic IntrinsicID = !cast<Intrinsic>(IntrName);
- bits<11> SEW = S;
+class RISCVVLE<bit M, bit Str, bit F, bits<3> S, bits<3> L> {
+ bits<1> Masked = M;
+ bits<1> Strided = Str;
+ bits<1> FF = F;
+ bits<3> Log2SEW = S;
+ bits<3> LMUL = L;
+ Pseudo Pseudo = !cast<Pseudo>(NAME);
+}
+
+def RISCVVLETable : GenericTable {
+ let FilterClass = "RISCVVLE";
+ let CppTypeName = "VLEPseudo";
+ let Fields = ["Masked", "Strided", "FF", "Log2SEW", "LMUL", "Pseudo"];
+ let PrimaryKey = ["Masked", "Strided", "FF", "Log2SEW", "LMUL"];
+ let PrimaryKeyName = "getVLEPseudo";
+}
+
+class RISCVVSE<bit M, bit Str, bits<3> S, bits<3> L> {
+ bits<1> Masked = M;
+ bits<1> Strided = Str;
+ bits<3> Log2SEW = S;
+ bits<3> LMUL = L;
+ Pseudo Pseudo = !cast<Pseudo>(NAME);
+}
+
+def RISCVVSETable : GenericTable {
+ let FilterClass = "RISCVVSE";
+ let CppTypeName = "VSEPseudo";
+ let Fields = ["Masked", "Strided", "Log2SEW", "LMUL", "Pseudo"];
+ let PrimaryKey = ["Masked", "Strided", "Log2SEW", "LMUL"];
+ let PrimaryKeyName = "getVSEPseudo";
+}
+
+class RISCVVLX_VSX<bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> {
+ bits<1> Masked = M;
+ bits<1> Ordered = O;
+ bits<3> Log2SEW = S;
bits<3> LMUL = L;
bits<3> IndexLMUL = IL;
Pseudo Pseudo = !cast<Pseudo>(NAME);
}
-def RISCVZvlssegTable : GenericTable {
- let FilterClass = "RISCVZvlsseg";
- let Fields = ["IntrinsicID", "SEW", "LMUL", "IndexLMUL", "Pseudo"];
- let PrimaryKey = ["IntrinsicID", "SEW", "LMUL", "IndexLMUL"];
- let PrimaryKeyName = "getPseudo";
+class RISCVVLX<bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> :
+ RISCVVLX_VSX<M, O, S, L, IL>;
+class RISCVVSX<bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> :
+ RISCVVLX_VSX<M, O, S, L, IL>;
+
+class RISCVVLX_VSXTable : GenericTable {
+ let CppTypeName = "VLX_VSXPseudo";
+ let Fields = ["Masked", "Ordered", "Log2SEW", "LMUL", "IndexLMUL", "Pseudo"];
+ let PrimaryKey = ["Masked", "Ordered", "Log2SEW", "LMUL", "IndexLMUL"];
+}
+
+def RISCVVLXTable : RISCVVLX_VSXTable {
+ let FilterClass = "RISCVVLX";
+ let PrimaryKeyName = "getVLXPseudo";
+}
+
+def RISCVVSXTable : RISCVVLX_VSXTable {
+ let FilterClass = "RISCVVSX";
+ let PrimaryKeyName = "getVSXPseudo";
+}
+
+class RISCVVLSEG<bits<4> N, bit M, bit Str, bit F, bits<3> S, bits<3> L> {
+ bits<4> NF = N;
+ bits<1> Masked = M;
+ bits<1> Strided = Str;
+ bits<1> FF = F;
+ bits<3> Log2SEW = S;
+ bits<3> LMUL = L;
+ Pseudo Pseudo = !cast<Pseudo>(NAME);
+}
+
+def RISCVVLSEGTable : GenericTable {
+ let FilterClass = "RISCVVLSEG";
+ let CppTypeName = "VLSEGPseudo";
+ let Fields = ["NF", "Masked", "Strided", "FF", "Log2SEW", "LMUL", "Pseudo"];
+ let PrimaryKey = ["NF", "Masked", "Strided", "FF", "Log2SEW", "LMUL"];
+ let PrimaryKeyName = "getVLSEGPseudo";
+}
+
+class RISCVVLXSEG<bits<4> N, bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> {
+ bits<4> NF = N;
+ bits<1> Masked = M;
+ bits<1> Ordered = O;
+ bits<3> Log2SEW = S;
+ bits<3> LMUL = L;
+ bits<3> IndexLMUL = IL;
+ Pseudo Pseudo = !cast<Pseudo>(NAME);
+}
+
+def RISCVVLXSEGTable : GenericTable {
+ let FilterClass = "RISCVVLXSEG";
+ let CppTypeName = "VLXSEGPseudo";
+ let Fields = ["NF", "Masked", "Ordered", "Log2SEW", "LMUL", "IndexLMUL", "Pseudo"];
+ let PrimaryKey = ["NF", "Masked", "Ordered", "Log2SEW", "LMUL", "IndexLMUL"];
+ let PrimaryKeyName = "getVLXSEGPseudo";
+}
+
+class RISCVVSSEG<bits<4> N, bit M, bit Str, bits<3> S, bits<3> L> {
+ bits<4> NF = N;
+ bits<1> Masked = M;
+ bits<1> Strided = Str;
+ bits<3> Log2SEW = S;
+ bits<3> LMUL = L;
+ Pseudo Pseudo = !cast<Pseudo>(NAME);
+}
+
+def RISCVVSSEGTable : GenericTable {
+ let FilterClass = "RISCVVSSEG";
+ let CppTypeName = "VSSEGPseudo";
+ let Fields = ["NF", "Masked", "Strided", "Log2SEW", "LMUL", "Pseudo"];
+ let PrimaryKey = ["NF", "Masked", "Strided", "Log2SEW", "LMUL"];
+ let PrimaryKeyName = "getVSSEGPseudo";
+}
+
+class RISCVVSXSEG<bits<4> N, bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> {
+ bits<4> NF = N;
+ bits<1> Masked = M;
+ bits<1> Ordered = O;
+ bits<3> Log2SEW = S;
+ bits<3> LMUL = L;
+ bits<3> IndexLMUL = IL;
+ Pseudo Pseudo = !cast<Pseudo>(NAME);
+}
+
+def RISCVVSXSEGTable : GenericTable {
+ let FilterClass = "RISCVVSXSEG";
+ let CppTypeName = "VSXSEGPseudo";
+ let Fields = ["NF", "Masked", "Ordered", "Log2SEW", "LMUL", "IndexLMUL", "Pseudo"];
+ let PrimaryKey = ["NF", "Masked", "Ordered", "Log2SEW", "LMUL", "IndexLMUL"];
+ let PrimaryKeyName = "getVSXSEGPseudo";
}
//===----------------------------------------------------------------------===//
@@ -448,39 +577,13 @@
!subst("_B32", "",
!subst("_B64", "",
!subst("_MASK", "",
+ !subst("_COMMUTABLE", "",
+ !subst("_TA", "",
+ !subst("_TIED", "",
!subst("F16", "F",
!subst("F32", "F",
!subst("F64", "F",
- !subst("Pseudo", "", PseudoInst)))))))))))))))))));
-}
-
-class ToLowerCase<string Upper> {
- string L = !subst("FF", "ff",
- !subst("VLSEG", "vlseg",
- !subst("VLSSEG", "vlsseg",
- !subst("VSSEG", "vsseg",
- !subst("VSSSEG", "vssseg",
- !subst("VLOXSEG", "vloxseg",
- !subst("VLUXSEG", "vluxseg",
- !subst("VSOXSEG", "vsoxseg",
- !subst("VSUXSEG", "vsuxseg", Upper)))))))));
-}
-
-// Example: PseudoVLSEG2E32_V_M2 -> int_riscv_vlseg2
-// Example: PseudoVLSEG2E32_V_M2_MASK -> int_riscv_vlseg2_mask
-class PseudoToIntrinsic<string PseudoInst, bit IsMasked> {
- string Intrinsic = !strconcat("int_riscv_",
- ToLowerCase<
- !subst("E8", "",
- !subst("E16", "",
- !subst("E32", "",
- !subst("E64", "",
- !subst("EI8", "",
- !subst("EI16", "",
- !subst("EI32", "",
- !subst("EI64", "",
- !subst("_V", "", PseudoToVInst<PseudoInst>.VInst)))))))))>.L,
- !if(IsMasked, "_mask", ""));
+ !subst("Pseudo", "", PseudoInst))))))))))))))))))))));
}
// The destination vector register group for a masked vector instruction cannot
@@ -492,7 +595,18 @@
!eq(VRegClass, VRM2) : VRM2NoV0,
!eq(VRegClass, VRM4) : VRM4NoV0,
!eq(VRegClass, VRM8) : VRM8NoV0,
- !eq(1, 1) : VRegClass);
+ !eq(VRegClass, VRN2M1) : VRN2M1NoV0,
+ !eq(VRegClass, VRN2M2) : VRN2M2NoV0,
+ !eq(VRegClass, VRN2M4) : VRN2M4NoV0,
+ !eq(VRegClass, VRN3M1) : VRN3M1NoV0,
+ !eq(VRegClass, VRN3M2) : VRN3M2NoV0,
+ !eq(VRegClass, VRN4M1) : VRN4M1NoV0,
+ !eq(VRegClass, VRN4M2) : VRN4M2NoV0,
+ !eq(VRegClass, VRN5M1) : VRN5M1NoV0,
+ !eq(VRegClass, VRN6M1) : VRN6M1NoV0,
+ !eq(VRegClass, VRN7M1) : VRN7M1NoV0,
+ !eq(VRegClass, VRN8M1) : VRN8M1NoV0,
+ true : VRegClass);
}
// Join strings in list using separator and ignoring empty elements
@@ -511,158 +625,151 @@
let VLMul = m.value;
}
-class VPseudoUSLoadNoMask<VReg RetClass>:
+class VPseudoUSLoadNoMask<VReg RetClass, int EEW, bit isFF> :
Pseudo<(outs RetClass:$rd),
- (ins GPR:$rs1, GPR:$vl, ixlenimm:$sew),[]>,
- RISCVVPseudo {
+ (ins GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoUSLoadMask<VReg RetClass>:
+class VPseudoUSLoadMask<VReg RetClass, int EEW, bit isFF> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge,
GPR:$rs1,
- VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
- RISCVVPseudo {
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
let Constraints = "$rd = $merge";
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoSLoadNoMask<VReg RetClass>:
+class VPseudoSLoadNoMask<VReg RetClass, int EEW>:
Pseudo<(outs RetClass:$rd),
- (ins GPR:$rs1, GPR:$rs2, GPR:$vl, ixlenimm:$sew),[]>,
- RISCVVPseudo {
+ (ins GPR:$rs1, GPR:$rs2, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVLE</*Masked*/0, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoSLoadMask<VReg RetClass>:
+class VPseudoSLoadMask<VReg RetClass, int EEW>:
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge,
GPR:$rs1, GPR:$rs2,
- VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
- RISCVVPseudo {
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVLE</*Masked*/1, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
let Constraints = "$rd = $merge";
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoILoadNoMask<VReg RetClass, VReg IdxClass>:
+class VPseudoILoadNoMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
+ bit Ordered, bit EarlyClobber>:
Pseudo<(outs RetClass:$rd),
- (ins GPR:$rs1, IdxClass:$rs2, GPR:$vl, ixlenimm:$sew),[]>,
- RISCVVPseudo {
+ (ins GPR:$rs1, IdxClass:$rs2, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVLX</*Masked*/0, Ordered, log2<EEW>.val, VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
+ let Constraints = !if(!eq(EarlyClobber, 1), "@earlyclobber $rd", "");
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoILoadMask<VReg RetClass, VReg IdxClass>:
+class VPseudoILoadMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
+ bit Ordered, bit EarlyClobber>:
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge,
GPR:$rs1, IdxClass:$rs2,
- VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
- RISCVVPseudo {
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVLX</*Masked*/1, Ordered, log2<EEW>.val, VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Constraints = "$rd = $merge";
- let Uses = [VL, VTYPE];
+ let Constraints = !if(!eq(EarlyClobber, 1), "@earlyclobber $rd, $rd = $merge", "$rd = $merge");
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoUSStoreNoMask<VReg StClass>:
+class VPseudoUSStoreNoMask<VReg StClass, int EEW>:
Pseudo<(outs),
- (ins StClass:$rd, GPR:$rs1, GPR:$vl, ixlenimm:$sew),[]>,
- RISCVVPseudo {
+ (ins StClass:$rd, GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVSE</*Masked*/0, /*Strided*/0, log2<EEW>.val, VLMul> {
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoUSStoreMask<VReg StClass>:
+class VPseudoUSStoreMask<VReg StClass, int EEW>:
Pseudo<(outs),
- (ins StClass:$rd, GPR:$rs1, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
- RISCVVPseudo {
+ (ins StClass:$rd, GPR:$rs1, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVSE</*Masked*/1, /*Strided*/0, log2<EEW>.val, VLMul> {
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoSStoreNoMask<VReg StClass>:
+class VPseudoSStoreNoMask<VReg StClass, int EEW>:
Pseudo<(outs),
- (ins StClass:$rd, GPR:$rs1, GPR:$rs2, GPR:$vl, ixlenimm:$sew),[]>,
- RISCVVPseudo {
+ (ins StClass:$rd, GPR:$rs1, GPR:$rs2, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVSE</*Masked*/0, /*Strided*/1, log2<EEW>.val, VLMul> {
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoSStoreMask<VReg StClass>:
+class VPseudoSStoreMask<VReg StClass, int EEW>:
Pseudo<(outs),
- (ins StClass:$rd, GPR:$rs1, GPR:$rs2, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
- RISCVVPseudo {
+ (ins StClass:$rd, GPR:$rs1, GPR:$rs2, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVSE</*Masked*/1, /*Strided*/1, log2<EEW>.val, VLMul> {
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
@@ -672,13 +779,11 @@
class VPseudoUnaryNoDummyMask<VReg RetClass,
DAGOperand Op2Class> :
Pseudo<(outs RetClass:$rd),
- (ins Op2Class:$rs1, GPR:$vl, ixlenimm:$sew), []>,
+ (ins Op2Class:$rs1, AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
@@ -686,13 +791,11 @@
class VPseudoNullaryNoMask<VReg RegClass>:
Pseudo<(outs RegClass:$rd),
- (ins GPR:$vl, ixlenimm:$sew),
+ (ins AVL:$vl, ixlenimm:$sew),
[]>, RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
@@ -701,14 +804,12 @@
class VPseudoNullaryMask<VReg RegClass>:
Pseudo<(outs GetVRegNoV0<RegClass>.R:$rd),
- (ins GetVRegNoV0<RegClass>.R:$merge, VMaskOp:$vm, GPR:$vl,
+ (ins GetVRegNoV0<RegClass>.R:$merge, VMaskOp:$vm, AVL:$vl,
ixlenimm:$sew), []>, RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
let Constraints ="$rd = $merge";
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
@@ -718,13 +819,11 @@
// Nullary for pseudo instructions. They are expanded in
// RISCVExpandPseudoInsts pass.
class VPseudoNullaryPseudoM<string BaseInst>
- : Pseudo<(outs VR:$rd), (ins GPR:$vl, ixlenimm:$sew), []>,
+ : Pseudo<(outs VR:$rd), (ins AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
// BaseInstr is not used in RISCVExpandPseudoInsts pass.
@@ -735,14 +834,12 @@
// RetClass could be GPR or VReg.
class VPseudoUnaryNoMask<DAGOperand RetClass, VReg OpClass, string Constraint = ""> :
Pseudo<(outs RetClass:$rd),
- (ins OpClass:$rs2, GPR:$vl, ixlenimm:$sew), []>,
+ (ins OpClass:$rs2, AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
let Constraints = Constraint;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
@@ -752,14 +849,12 @@
class VPseudoUnaryMask<VReg RetClass, VReg OpClass, string Constraint = ""> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
- VMaskOp:$vm, GPR:$vl, ixlenimm:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
@@ -769,50 +864,29 @@
// mask unary operation without maskedoff
class VPseudoMaskUnarySOutMask:
Pseudo<(outs GPR:$rd),
- (ins VR:$rs1, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew), []>,
+ (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-// Masked mask operation have no $rd=$merge constraints
-class VPseudoUnaryMOutMask:
- Pseudo<(outs VR:$rd),
- (ins VR:$merge, VR:$rs1, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew), []>,
- RISCVVPseudo {
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Constraints = "$rd = $merge";
- let Uses = [VL, VTYPE];
- let HasVLOp = 1;
- let HasSEWOp = 1;
- let HasMergeOp = 1;
- let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
-}
-
// Mask can be V0~V31
class VPseudoUnaryAnyMask<VReg RetClass,
VReg Op1Class> :
Pseudo<(outs RetClass:$rd),
(ins RetClass:$merge,
Op1Class:$rs2,
- VR:$vm, GPR:$vl, ixlenimm:$sew),
+ VR:$vm, AVL:$vl, ixlenimm:$sew),
[]>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
let Constraints = "@earlyclobber $rd, $rd = $merge";
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
@@ -824,70 +898,125 @@
DAGOperand Op2Class,
string Constraint> :
Pseudo<(outs RetClass:$rd),
- (ins Op1Class:$rs2, Op2Class:$rs1, GPR:$vl, ixlenimm:$sew), []>,
+ (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
let Constraints = Constraint;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoIStoreNoMask<VReg StClass, VReg IdxClass>:
+class VPseudoTiedBinaryNoMask<VReg RetClass,
+ DAGOperand Op2Class,
+ string Constraint> :
+ Pseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let Constraints = Join<[Constraint, "$rd = $rs2"], ",">.ret;
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let ForceTailAgnostic = 1;
+ let isConvertibleToThreeAddress = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoIStoreNoMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL,
+ bit Ordered>:
Pseudo<(outs),
- (ins StClass:$rd, GPR:$rs1, IdxClass:$rs2, GPR:$vl, ixlenimm:$sew),[]>,
- RISCVVPseudo {
+ (ins StClass:$rd, GPR:$rs1, IdxClass:$rs2, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVSX</*Masked*/0, Ordered, log2<EEW>.val, VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoIStoreMask<VReg StClass, VReg IdxClass>:
+class VPseudoIStoreMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL,
+ bit Ordered>:
Pseudo<(outs),
- (ins StClass:$rd, GPR:$rs1, IdxClass:$rs2, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
- RISCVVPseudo {
+ (ins StClass:$rd, GPR:$rs1, IdxClass:$rs2, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVSX</*Masked*/1, Ordered, log2<EEW>.val, VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
class VPseudoBinaryMask<VReg RetClass,
- VReg Op1Class,
+ RegisterClass Op1Class,
DAGOperand Op2Class,
string Constraint> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge,
Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, GPR:$vl, ixlenimm:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
+// Like VPseudoBinaryMask, but output can be V0.
+class VPseudoBinaryMOutMask<VReg RetClass,
+ RegisterClass Op1Class,
+ DAGOperand Op2Class,
+ string Constraint> :
+ Pseudo<(outs RetClass:$rd),
+ (ins RetClass:$merge,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret;
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+// Special version of VPseudoBinaryMask where we pretend the first source is
+// tied to the destination so we can work around the earlyclobber constraint.
+// This allows maskedoff and rs2 to be the same register.
+class VPseudoTiedBinaryMask<VReg RetClass,
+ DAGOperand Op2Class,
+ string Constraint> :
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$merge,
+ Op2Class:$rs1,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret;
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 0; // Merge is also rs2.
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
class VPseudoBinaryCarryIn<VReg RetClass,
VReg Op1Class,
DAGOperand Op2Class,
@@ -896,16 +1025,14 @@
string Constraint> :
Pseudo<(outs RetClass:$rd),
!if(CarryIn,
- (ins Op1Class:$rs2, Op2Class:$rs1, VMV0:$carry, GPR:$vl,
+ (ins Op1Class:$rs2, Op2Class:$rs1, VMV0:$carry, AVL:$vl,
ixlenimm:$sew),
- (ins Op1Class:$rs2, Op2Class:$rs1, GPR:$vl, ixlenimm:$sew)), []>,
+ (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew)), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
let Constraints = Constraint;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 0;
@@ -914,20 +1041,18 @@
}
class VPseudoTernaryNoMask<VReg RetClass,
- VReg Op1Class,
+ RegisterClass Op1Class,
DAGOperand Op2Class,
string Constraint> :
Pseudo<(outs RetClass:$rd),
(ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- GPR:$vl, ixlenimm:$sew),
+ AVL:$vl, ixlenimm:$sew),
[]>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
let Constraints = Join<[Constraint, "$rd = $rs3"], ",">.ret;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
@@ -941,14 +1066,12 @@
(ins GPR:$rs1,
Op1Class:$vs2,
GetVRegNoV0<RetClass>.R:$vd,
- GPR:$vl, ixlenimm:$sew), []>,
+ AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 1;
let mayStore = 1;
let hasSideEffects = 1;
- let usesCustomInserter = 1;
let Constraints = "$vd_wd = $vd";
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
@@ -961,14 +1084,12 @@
(ins GPR:$rs1,
Op1Class:$vs2,
GetVRegNoV0<RetClass>.R:$vd,
- VMaskOp:$vm, GPR:$vl, ixlenimm:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 1;
let mayStore = 1;
let hasSideEffects = 1;
- let usesCustomInserter = 1;
let Constraints = "$vd_wd = $vd";
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
@@ -981,17 +1102,15 @@
// therefore only [32, 64] is allowed here.
foreach sew = [32, 64] in {
foreach lmul = MxSet<sew>.m in {
- defvar octuple_lmul = octuple_from_str<lmul.MX>.ret;
+ defvar octuple_lmul = lmul.octuple;
// Calculate emul = eew * lmul / sew
- defvar octuple_emul = !srl(!mul(eew, octuple_lmul), shift_amount<sew>.val);
+ defvar octuple_emul = !srl(!mul(eew, octuple_lmul), log2<sew>.val);
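+    // Worked example (values chosen for illustration, not from this hunk):
+    // eew=32, lmul=M2 (octuple 16), sew=64 gives (32 * 16) >> log2(64)
+    // = 512 >> 6 = 8, i.e. EMUL=M1, so that iteration defines "..._WD_M2_M1".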
if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
defvar emulMX = octuple_to_str<octuple_emul>.ret;
- defvar lmulMX = octuple_to_str<octuple_lmul>.ret;
defvar emul= !cast<LMULInfo>("V_" # emulMX);
- defvar lmul = !cast<LMULInfo>("V_" # lmulMX);
let VLMul = lmul.value in {
- def "_WD_" # lmulMX # "_" # emulMX : VPseudoAMOWDNoMask<lmul.vrclass, emul.vrclass>;
- def "_WD_" # lmulMX # "_" # emulMX # "_MASK" : VPseudoAMOWDMask<lmul.vrclass, emul.vrclass>;
+ def "_WD_" # lmul.MX # "_" # emulMX : VPseudoAMOWDNoMask<lmul.vrclass, emul.vrclass>;
+ def "_WD_" # lmul.MX # "_" # emulMX # "_MASK" : VPseudoAMOWDMask<lmul.vrclass, emul.vrclass>;
}
}
}
@@ -1000,220 +1119,206 @@
multiclass VPseudoAMO {
foreach eew = EEWList in
- defm "EI" # eew : VPseudoAMOEI<eew>;
+ defm "EI" # eew : VPseudoAMOEI<eew>;
}
-class VPseudoUSSegLoadNoMask<VReg RetClass, bits<11> EEW>:
+class VPseudoUSSegLoadNoMask<VReg RetClass, int EEW, bits<4> NF, bit isFF>:
Pseudo<(outs RetClass:$rd),
- (ins GPR:$rs1, GPR:$vl, ixlenimm:$sew),[]>,
+ (ins GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVZvlsseg<PseudoToIntrinsic<NAME, false>.Intrinsic, EEW, VLMul> {
+ RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoUSSegLoadMask<VReg RetClass, bits<11> EEW>:
+class VPseudoUSSegLoadMask<VReg RetClass, int EEW, bits<4> NF, bit isFF>:
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge, GPR:$rs1,
- VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVZvlsseg<PseudoToIntrinsic<NAME, true>.Intrinsic, EEW, VLMul> {
+ RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
let Constraints = "$rd = $merge";
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoSSegLoadNoMask<VReg RetClass, bits<11> EEW>:
+class VPseudoSSegLoadNoMask<VReg RetClass, int EEW, bits<4> NF>:
Pseudo<(outs RetClass:$rd),
- (ins GPR:$rs1, GPR:$offset, GPR:$vl, ixlenimm:$sew),[]>,
+ (ins GPR:$rs1, GPR:$offset, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVZvlsseg<PseudoToIntrinsic<NAME, false>.Intrinsic, EEW, VLMul> {
+ RISCVVLSEG<NF, /*Masked*/0, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoSSegLoadMask<VReg RetClass, bits<11> EEW>:
+class VPseudoSSegLoadMask<VReg RetClass, int EEW, bits<4> NF>:
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge, GPR:$rs1,
- GPR:$offset, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ GPR:$offset, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVZvlsseg<PseudoToIntrinsic<NAME, true>.Intrinsic, EEW, VLMul> {
+ RISCVVLSEG<NF, /*Masked*/1, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
let Constraints = "$rd = $merge";
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoISegLoadNoMask<VReg RetClass, VReg IdxClass, bits<11> EEW, bits<3> LMUL>:
+class VPseudoISegLoadNoMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
+ bits<4> NF, bit Ordered>:
Pseudo<(outs RetClass:$rd),
- (ins GPR:$rs1, IdxClass:$offset, GPR:$vl, ixlenimm:$sew),[]>,
+ (ins GPR:$rs1, IdxClass:$offset, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVZvlsseg<PseudoToIntrinsic<NAME, false>.Intrinsic, EEW, VLMul, LMUL> {
+ RISCVVLXSEG<NF, /*Masked*/0, Ordered, log2<EEW>.val, VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
// For vector indexed segment loads, the destination vector register groups
// cannot overlap the source vector register group
let Constraints = "@earlyclobber $rd";
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoISegLoadMask<VReg RetClass, VReg IdxClass, bits<11> EEW, bits<3> LMUL>:
+class VPseudoISegLoadMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
+ bits<4> NF, bit Ordered>:
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge, GPR:$rs1,
- IdxClass:$offset, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ IdxClass:$offset, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVZvlsseg<PseudoToIntrinsic<NAME, true>.Intrinsic, EEW, VLMul, LMUL> {
+ RISCVVLXSEG<NF, /*Masked*/1, Ordered, log2<EEW>.val, VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
// For vector indexed segment loads, the destination vector register groups
// cannot overlap the source vector register group
let Constraints = "@earlyclobber $rd, $rd = $merge";
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoUSSegStoreNoMask<VReg ValClass, bits<11> EEW>:
+class VPseudoUSSegStoreNoMask<VReg ValClass, int EEW, bits<4> NF>:
Pseudo<(outs),
- (ins ValClass:$rd, GPR:$rs1, GPR:$vl, ixlenimm:$sew),[]>,
+ (ins ValClass:$rd, GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVZvlsseg<PseudoToIntrinsic<NAME, false>.Intrinsic, EEW, VLMul> {
+ RISCVVSSEG<NF, /*Masked*/0, /*Strided*/0, log2<EEW>.val, VLMul> {
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoUSSegStoreMask<VReg ValClass, bits<11> EEW>:
+class VPseudoUSSegStoreMask<VReg ValClass, int EEW, bits<4> NF>:
Pseudo<(outs),
(ins ValClass:$rd, GPR:$rs1,
- VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVZvlsseg<PseudoToIntrinsic<NAME, true>.Intrinsic, EEW, VLMul> {
+ RISCVVSSEG<NF, /*Masked*/1, /*Strided*/0, log2<EEW>.val, VLMul> {
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoSSegStoreNoMask<VReg ValClass, bits<11> EEW>:
+class VPseudoSSegStoreNoMask<VReg ValClass, int EEW, bits<4> NF>:
Pseudo<(outs),
- (ins ValClass:$rd, GPR:$rs1, GPR: $offset, GPR:$vl, ixlenimm:$sew),[]>,
+ (ins ValClass:$rd, GPR:$rs1, GPR: $offset, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVZvlsseg<PseudoToIntrinsic<NAME, false>.Intrinsic, EEW, VLMul> {
+ RISCVVSSEG<NF, /*Masked*/0, /*Strided*/1, log2<EEW>.val, VLMul> {
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoSSegStoreMask<VReg ValClass, bits<11> EEW>:
+class VPseudoSSegStoreMask<VReg ValClass, int EEW, bits<4> NF>:
Pseudo<(outs),
(ins ValClass:$rd, GPR:$rs1, GPR: $offset,
- VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVZvlsseg<PseudoToIntrinsic<NAME, true>.Intrinsic, EEW, VLMul> {
+ RISCVVSSEG<NF, /*Masked*/1, /*Strided*/1, log2<EEW>.val, VLMul> {
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoISegStoreNoMask<VReg ValClass, VReg IdxClass, bits<11> EEW, bits<3> LMUL>:
+class VPseudoISegStoreNoMask<VReg ValClass, VReg IdxClass, int EEW, bits<3> LMUL,
+ bits<4> NF, bit Ordered>:
Pseudo<(outs),
(ins ValClass:$rd, GPR:$rs1, IdxClass: $index,
- GPR:$vl, ixlenimm:$sew),[]>,
+ AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVZvlsseg<PseudoToIntrinsic<NAME, false>.Intrinsic, EEW, VLMul, LMUL> {
+ RISCVVSXSEG<NF, /*Masked*/0, Ordered, log2<EEW>.val, VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let HasDummyMask = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-class VPseudoISegStoreMask<VReg ValClass, VReg IdxClass, bits<11> EEW, bits<3> LMUL>:
+class VPseudoISegStoreMask<VReg ValClass, VReg IdxClass, int EEW, bits<3> LMUL,
+ bits<4> NF, bit Ordered>:
Pseudo<(outs),
(ins ValClass:$rd, GPR:$rs1, IdxClass: $index,
- VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVZvlsseg<PseudoToIntrinsic<NAME, true>.Intrinsic, EEW, VLMul, LMUL> {
+ RISCVVSXSEG<NF, /*Masked*/1, Ordered, log2<EEW>.val, VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
- let usesCustomInserter = 1;
- let Uses = [VL, VTYPE];
let HasVLOp = 1;
let HasSEWOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-multiclass VPseudoUSLoad {
- foreach lmul = MxList.m in {
- defvar LInfo = lmul.MX;
- defvar vreg = lmul.vrclass;
- let VLMul = lmul.value in {
- def "_V_" # LInfo : VPseudoUSLoadNoMask<vreg>;
- def "_V_" # LInfo # "_MASK" : VPseudoUSLoadMask<vreg>;
+multiclass VPseudoUSLoad<bit isFF> {
+ foreach eew = EEWList in {
+ foreach lmul = MxSet<eew>.m in {
+ defvar LInfo = lmul.MX;
+ defvar vreg = lmul.vrclass;
+ defvar FFStr = !if(isFF, "FF", "");
+ let VLMul = lmul.value in {
+ def "E" # eew # FFStr # "_V_" # LInfo :
+ VPseudoUSLoadNoMask<vreg, eew, isFF>;
+ def "E" # eew # FFStr # "_V_" # LInfo # "_MASK" :
+ VPseudoUSLoadMask<vreg, eew, isFF>;
+ }
}
}
}
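+// Illustrative note: with a defm prefix such as PseudoVL (the instantiations
+// are outside this hunk), the defs above expand to names like PseudoVLE8_V_M1
+// and PseudoVLE32FF_V_M2_MASK, one NoMask/Mask pair per legal (EEW, LMUL)
+// combination.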
@@ -1221,43 +1326,59 @@
multiclass VPseudoLoadMask {
foreach mti = AllMasks in {
let VLMul = mti.LMul.value in {
- def "_V_" # mti.BX : VPseudoUSLoadNoMask<VR>;
+ def "_V_" # mti.BX : VPseudoUSLoadNoMask<VR, /*EEW*/1, /*isFF*/0>;
}
}
}
multiclass VPseudoSLoad {
- foreach lmul = MxList.m in {
- defvar LInfo = lmul.MX;
- defvar vreg = lmul.vrclass;
- let VLMul = lmul.value in {
- def "_V_" # LInfo : VPseudoSLoadNoMask<vreg>;
- def "_V_" # LInfo # "_MASK" : VPseudoSLoadMask<vreg>;
+ foreach eew = EEWList in {
+ foreach lmul = MxSet<eew>.m in {
+ defvar LInfo = lmul.MX;
+ defvar vreg = lmul.vrclass;
+ let VLMul = lmul.value in {
+ def "E" # eew # "_V_" # LInfo : VPseudoSLoadNoMask<vreg, eew>;
+ def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSLoadMask<vreg, eew>;
+ }
}
}
}
-multiclass VPseudoILoad {
- foreach lmul = MxList.m in
- foreach idx_lmul = MxList.m in {
- defvar LInfo = lmul.MX;
- defvar Vreg = lmul.vrclass;
- defvar IdxLInfo = idx_lmul.MX;
- defvar IdxVreg = idx_lmul.vrclass;
- let VLMul = lmul.value in {
- def "_V_" # IdxLInfo # "_" # LInfo : VPseudoILoadNoMask<Vreg, IdxVreg>;
- def "_V_" # IdxLInfo # "_" # LInfo # "_MASK" : VPseudoILoadMask<Vreg, IdxVreg>;
+multiclass VPseudoILoad<bit Ordered> {
+ foreach eew = EEWList in {
+ foreach sew = EEWList in {
+ foreach lmul = MxSet<sew>.m in {
+ defvar octuple_lmul = lmul.octuple;
+ // Calculate emul = eew * lmul / sew
+ defvar octuple_emul = !srl(!mul(eew, octuple_lmul), log2<sew>.val);
+ if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
+ defvar LInfo = lmul.MX;
+ defvar IdxLInfo = octuple_to_str<octuple_emul>.ret;
+ defvar idx_lmul = !cast<LMULInfo>("V_" # IdxLInfo);
+ defvar Vreg = lmul.vrclass;
+ defvar IdxVreg = idx_lmul.vrclass;
+ defvar HasConstraint = !ne(sew, eew);
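+          // Our reading of why this matters (rationale not stated in this
+          // hunk): when the index EEW differs from the data SEW, the
+          // destination and the index operand have different EEWs, so they
+          // must not overlap and the pseudo below gets an @earlyclobber
+          // constraint.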
+ let VLMul = lmul.value in {
+ def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo :
+ VPseudoILoadNoMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>;
+ def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo # "_MASK" :
+ VPseudoILoadMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>;
+ }
+ }
+ }
}
}
}
multiclass VPseudoUSStore {
- foreach lmul = MxList.m in {
- defvar LInfo = lmul.MX;
- defvar vreg = lmul.vrclass;
- let VLMul = lmul.value in {
- def "_V_" # LInfo : VPseudoUSStoreNoMask<vreg>;
- def "_V_" # LInfo # "_MASK" : VPseudoUSStoreMask<vreg>;
+ foreach eew = EEWList in {
+ foreach lmul = MxSet<eew>.m in {
+ defvar LInfo = lmul.MX;
+ defvar vreg = lmul.vrclass;
+ let VLMul = lmul.value in {
+ def "E" # eew # "_V_" # LInfo : VPseudoUSStoreNoMask<vreg, eew>;
+ def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoUSStoreMask<vreg, eew>;
+ }
}
}
}
@@ -1265,32 +1386,45 @@
multiclass VPseudoStoreMask {
foreach mti = AllMasks in {
let VLMul = mti.LMul.value in {
- def "_V_" # mti.BX : VPseudoUSStoreNoMask<VR>;
+ def "_V_" # mti.BX : VPseudoUSStoreNoMask<VR, /*EEW*/1>;
}
}
}
multiclass VPseudoSStore {
- foreach lmul = MxList.m in {
- defvar LInfo = lmul.MX;
- defvar vreg = lmul.vrclass;
- let VLMul = lmul.value in {
- def "_V_" # LInfo : VPseudoSStoreNoMask<vreg>;
- def "_V_" # LInfo # "_MASK" : VPseudoSStoreMask<vreg>;
+ foreach eew = EEWList in {
+ foreach lmul = MxSet<eew>.m in {
+ defvar LInfo = lmul.MX;
+ defvar vreg = lmul.vrclass;
+ let VLMul = lmul.value in {
+ def "E" # eew # "_V_" # LInfo : VPseudoSStoreNoMask<vreg, eew>;
+ def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSStoreMask<vreg, eew>;
+ }
}
}
}
-multiclass VPseudoIStore {
- foreach lmul = MxList.m in
- foreach idx_lmul = MxList.m in {
- defvar LInfo = lmul.MX;
- defvar Vreg = lmul.vrclass;
- defvar IdxLInfo = idx_lmul.MX;
- defvar IdxVreg = idx_lmul.vrclass;
- let VLMul = lmul.value in {
- def "_V_" # IdxLInfo # "_" # LInfo : VPseudoIStoreNoMask<Vreg, IdxVreg>;
- def "_V_" # IdxLInfo # "_" # LInfo # "_MASK" : VPseudoIStoreMask<Vreg, IdxVreg>;
+multiclass VPseudoIStore<bit Ordered> {
+ foreach eew = EEWList in {
+ foreach sew = EEWList in {
+ foreach lmul = MxSet<sew>.m in {
+ defvar octuple_lmul = lmul.octuple;
+ // Calculate emul = eew * lmul / sew
+ defvar octuple_emul = !srl(!mul(eew, octuple_lmul), log2<sew>.val);
+ if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
+ defvar LInfo = lmul.MX;
+ defvar IdxLInfo = octuple_to_str<octuple_emul>.ret;
+ defvar idx_lmul = !cast<LMULInfo>("V_" # IdxLInfo);
+ defvar Vreg = lmul.vrclass;
+ defvar IdxVreg = idx_lmul.vrclass;
+ let VLMul = lmul.value in {
+ def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo :
+ VPseudoIStoreNoMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered>;
+ def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo # "_MASK" :
+ VPseudoIStoreMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered>;
+ }
+ }
+ }
}
}
}
@@ -1363,6 +1497,20 @@
}
}
+multiclass VPseudoBinaryM<VReg RetClass,
+ VReg Op1Class,
+ DAGOperand Op2Class,
+ LMULInfo MInfo,
+ string Constraint = ""> {
+ let VLMul = MInfo.value in {
+ def "_" # MInfo.MX : VPseudoBinaryNoMask<RetClass, Op1Class, Op2Class,
+ Constraint>;
+ let ForceTailAgnostic = true in
+ def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMOutMask<RetClass, Op1Class,
+ Op2Class, Constraint>;
+ }
+}
+
multiclass VPseudoBinaryEmul<VReg RetClass,
VReg Op1Class,
DAGOperand Op2Class,
@@ -1377,6 +1525,18 @@
}
}
+multiclass VPseudoTiedBinary<VReg RetClass,
+ DAGOperand Op2Class,
+ LMULInfo MInfo,
+ string Constraint = ""> {
+ let VLMul = MInfo.value in {
+ def "_" # MInfo.MX # "_TIED": VPseudoTiedBinaryNoMask<RetClass, Op2Class,
+ Constraint>;
+ def "_" # MInfo.MX # "_MASK_TIED" : VPseudoTiedBinaryMask<RetClass, Op2Class,
+ Constraint>;
+ }
+}
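+// Note: the "_TIED"/"_MASK_TIED" suffixes introduced here are stripped again by
+// the PseudoToVInst substitution chain above, so these pseudos resolve to the
+// same BaseInstr as their untied counterparts.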
+
multiclass VPseudoBinaryV_VV<string Constraint = ""> {
foreach m = MxList.m in
defm _VV : VPseudoBinary<m.vrclass, m.vrclass, m.vrclass, m, Constraint>;
@@ -1385,9 +1545,9 @@
multiclass VPseudoBinaryV_VV_EEW<int eew, string Constraint = ""> {
foreach m = MxList.m in {
foreach sew = EEWList in {
- defvar octuple_lmul = octuple_from_str<m.MX>.ret;
+ defvar octuple_lmul = m.octuple;
// emul = lmul * eew / sew
- defvar octuple_emul = !srl(!mul(octuple_lmul, eew), shift_amount<sew>.val);
+ defvar octuple_emul = !srl(!mul(octuple_lmul, eew), log2<sew>.val);
if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
defvar emulMX = octuple_to_str<octuple_emul>.ret;
defvar emul = !cast<LMULInfo>("V_" # emulMX);
@@ -1429,61 +1589,67 @@
// at least 1, and the overlap is in the highest-numbered part of the
// destination register group is legal. Otherwise, it is illegal.
multiclass VPseudoBinaryW_VV {
- foreach m = MxList.m[0-5] in
+ foreach m = MxListW.m in
defm _VV : VPseudoBinary<m.wvrclass, m.vrclass, m.vrclass, m,
"@earlyclobber $rd">;
}
multiclass VPseudoBinaryW_VX {
- foreach m = MxList.m[0-5] in
+ foreach m = MxListW.m in
defm "_VX" : VPseudoBinary<m.wvrclass, m.vrclass, GPR, m,
"@earlyclobber $rd">;
}
multiclass VPseudoBinaryW_VF {
- foreach m = MxList.m[0-5] in
- foreach f = FPList.fpinfo[0-1] in
+ foreach m = MxListW.m in
+ foreach f = FPListW.fpinfo in
defm "_V" # f.FX : VPseudoBinary<m.wvrclass, m.vrclass,
f.fprclass, m,
"@earlyclobber $rd">;
}
multiclass VPseudoBinaryW_WV {
- foreach m = MxList.m[0-5] in
+ foreach m = MxListW.m in {
defm _WV : VPseudoBinary<m.wvrclass, m.wvrclass, m.vrclass, m,
"@earlyclobber $rd">;
+ defm _WV : VPseudoTiedBinary<m.wvrclass, m.vrclass, m,
+ "@earlyclobber $rd">;
+ }
}
multiclass VPseudoBinaryW_WX {
- foreach m = MxList.m[0-5] in
- defm "_WX" : VPseudoBinary<m.wvrclass, m.wvrclass, GPR, m,
- "@earlyclobber $rd">;
+ foreach m = MxListW.m in
+ defm "_WX" : VPseudoBinary<m.wvrclass, m.wvrclass, GPR, m>;
}
multiclass VPseudoBinaryW_WF {
- foreach m = MxList.m[0-5] in
- foreach f = FPList.fpinfo[0-1] in
+ foreach m = MxListW.m in
+ foreach f = FPListW.fpinfo in
defm "_W" # f.FX : VPseudoBinary<m.wvrclass, m.wvrclass,
- f.fprclass, m,
- "@earlyclobber $rd">;
+ f.fprclass, m>;
}
+// Narrowing instructions like vnsrl/vnsra/vnclip(u) don't need @earlyclobber
+// if the source and destination have an LMUL<=1. This matches the overlap
+// exception from the spec:
+// "The destination EEW is smaller than the source EEW and the overlap is in the
+// lowest-numbered part of the source register group."
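+// Concretely, m.octuple is 8*LMUL, so the !if(!ge(m.octuple, 8), ...) checks
+// below drop the constraint only for fractional destinations (e.g. an MF2
+// destination with an M1 widened source), while M1 and larger destinations
+// keep @earlyclobber.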
multiclass VPseudoBinaryV_WV {
- foreach m = MxList.m[0-5] in
+ foreach m = MxListW.m in
defm _WV : VPseudoBinary<m.vrclass, m.wvrclass, m.vrclass, m,
- "@earlyclobber $rd">;
+ !if(!ge(m.octuple, 8), "@earlyclobber $rd", "")>;
}
multiclass VPseudoBinaryV_WX {
- foreach m = MxList.m[0-5] in
+ foreach m = MxListW.m in
defm _WX : VPseudoBinary<m.vrclass, m.wvrclass, GPR, m,
- "@earlyclobber $rd">;
+ !if(!ge(m.octuple, 8), "@earlyclobber $rd", "")>;
}
multiclass VPseudoBinaryV_WI {
- foreach m = MxList.m[0-5] in
+ foreach m = MxListW.m in
defm _WI : VPseudoBinary<m.vrclass, m.wvrclass, uimm5, m,
- "@earlyclobber $rd">;
+ !if(!ge(m.octuple, 8), "@earlyclobber $rd", "")>;
}
// For vadc and vsbc, the instruction encoding is reserved if the destination
@@ -1558,7 +1724,7 @@
multiclass PseudoUnaryV_VF2 {
defvar constraints = "@earlyclobber $rd";
- foreach m = MxList.m[1-6] in
+ foreach m = MxListVF2.m in
{
let VLMul = m.value in {
def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints>;
@@ -1570,7 +1736,7 @@
multiclass PseudoUnaryV_VF4 {
defvar constraints = "@earlyclobber $rd";
- foreach m = MxList.m[2-6] in
+ foreach m = MxListVF4.m in
{
let VLMul = m.value in {
def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints>;
@@ -1582,7 +1748,7 @@
multiclass PseudoUnaryV_VF8 {
defvar constraints = "@earlyclobber $rd";
- foreach m = MxList.m[3-6] in
+ foreach m = MxListVF8.m in
{
let VLMul = m.value in {
def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints>;
@@ -1592,31 +1758,42 @@
}
}
-// The destination EEW is 1.
+// The destination EEW is 1 since "For the purposes of register group overlap
+// constraints, mask elements have EEW=1."
// The source EEW is 8, 16, 32, or 64.
// When the destination EEW is different from source EEW, we need to use
// @earlyclobber to avoid the overlap between destination and source registers.
+// We don't need @earlyclobber for LMUL<=1 since that matches the overlap
+// exception from the spec:
+// "The destination EEW is smaller than the source EEW and the overlap is in the
+// lowest-numbered part of the source register group".
+// With LMUL<=1 the source and dest occupy a single register so any overlap
+// is in the lowest-numbered part.
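+// Hence the !if(!ge(m.octuple, 16), ...) checks below: octuple 16 corresponds
+// to LMUL=2, the smallest LMUL whose source register group spans more than one
+// register, so @earlyclobber is kept from there up.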
multiclass VPseudoBinaryM_VV {
foreach m = MxList.m in
- defm _VV : VPseudoBinary<VR, m.vrclass, m.vrclass, m, "@earlyclobber $rd">;
+ defm _VV : VPseudoBinaryM<VR, m.vrclass, m.vrclass, m,
+ !if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>;
}
multiclass VPseudoBinaryM_VX {
foreach m = MxList.m in
defm "_VX" :
- VPseudoBinary<VR, m.vrclass, GPR, m, "@earlyclobber $rd">;
+ VPseudoBinaryM<VR, m.vrclass, GPR, m,
+ !if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>;
}
multiclass VPseudoBinaryM_VF {
foreach m = MxList.m in
foreach f = FPList.fpinfo in
defm "_V" # f.FX :
- VPseudoBinary<VR, m.vrclass, f.fprclass, m, "@earlyclobber $rd">;
+ VPseudoBinaryM<VR, m.vrclass, f.fprclass, m,
+ !if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>;
}
multiclass VPseudoBinaryM_VI {
foreach m = MxList.m in
- defm _VI : VPseudoBinary<VR, m.vrclass, simm5, m, "@earlyclobber $rd">;
+ defm _VI : VPseudoBinaryM<VR, m.vrclass, simm5, m,
+ !if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>;
}
multiclass VPseudoBinaryV_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> {
@@ -1700,8 +1877,8 @@
}
multiclass VPseudoTernary<VReg RetClass,
- VReg Op1Class,
- RegisterClass Op2Class,
+ RegisterClass Op1Class,
+ DAGOperand Op2Class,
LMULInfo MInfo,
string Constraint = ""> {
let VLMul = MInfo.value in {
@@ -1711,8 +1888,16 @@
}
multiclass VPseudoTernaryV_VV<string Constraint = ""> {
- foreach m = MxList.m in
+ foreach m = MxList.m in {
defm _VV : VPseudoTernary<m.vrclass, m.vrclass, m.vrclass, m, Constraint>;
+
+ // Add a commutable version for use by IR mul+add.
+ let isCommutable = 1, ForceTailAgnostic = true, VLMul = m.value in
+ def "_VV_" # m.MX # "_COMMUTABLE" : VPseudoTernaryNoMask<m.vrclass,
+ m.vrclass,
+ m.vrclass,
+ Constraint>;
+ }
}
multiclass VPseudoTernaryV_VX<string Constraint = ""> {
@@ -1721,35 +1906,68 @@
}
multiclass VPseudoTernaryV_VX_AAXA<string Constraint = ""> {
- foreach m = MxList.m in
+ foreach m = MxList.m in {
defm "_VX" : VPseudoTernary<m.vrclass, GPR, m.vrclass, m, Constraint>;
+
+ // Add a commutable version for use by IR mul+add.
+ let isCommutable = 1, ForceTailAgnostic = true, VLMul = m.value in
+ def "_VX_" # m.MX # "_COMMUTABLE" :
+ VPseudoTernaryNoMask<m.vrclass, GPR, m.vrclass, Constraint>;
+ }
}
multiclass VPseudoTernaryV_VF_AAXA<string Constraint = ""> {
- foreach m = MxList.m in
- foreach f = FPList.fpinfo in
+ foreach m = MxList.m in {
+ foreach f = FPList.fpinfo in {
defm "_V" # f.FX : VPseudoTernary<m.vrclass, f.fprclass, m.vrclass,
m, Constraint>;
+
+ // Add a commutable version for use by IR mul+add.
+ let isCommutable = 1, ForceTailAgnostic = true, VLMul = m.value in
+ def "_V" # f.FX # "_" # m.MX # "_COMMUTABLE" :
+ VPseudoTernaryNoMask<m.vrclass, f.fprclass, m.vrclass, Constraint>;
+ }
+ }
}
multiclass VPseudoTernaryW_VV {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxList.m[0-5] in
+ foreach m = MxListW.m in {
defm _VV : VPseudoTernary<m.wvrclass, m.vrclass, m.vrclass, m, constraint>;
+
+    // Add a tail agnostic version for use by IR mul+add.
+ let ForceTailAgnostic = true, VLMul = m.value in
+ def "_VV_" # m.MX # "_TA" : VPseudoTernaryNoMask<m.wvrclass,
+ m.vrclass,
+ m.vrclass,
+ constraint>;
+ }
}
multiclass VPseudoTernaryW_VX {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxList.m[0-5] in
+ foreach m = MxListW.m in {
defm "_VX" : VPseudoTernary<m.wvrclass, GPR, m.vrclass, m, constraint>;
+
+ // Add a tail agnostic version for use by IR mul+add.
+ let ForceTailAgnostic = true, VLMul = m.value in
+ def "_VX_" # m.MX # "_TA" :
+ VPseudoTernaryNoMask<m.wvrclass, GPR, m.vrclass, constraint>;
+ }
}
multiclass VPseudoTernaryW_VF {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxList.m[0-5] in
- foreach f = FPList.fpinfo[0-1] in
+ foreach m = MxListW.m in
+ foreach f = FPListW.fpinfo in {
defm "_V" # f.FX : VPseudoTernary<m.wvrclass, f.fprclass, m.vrclass, m,
constraint>;
+
+ // Add a tail agnostic version for use by IR mul+add.
+ let ForceTailAgnostic = true, VLMul = m.value in
+ def "_V" # f.FX # "_" # m.MX # "_TA" :
+      VPseudoTernaryNoMask<m.wvrclass, f.fprclass, m.vrclass, constraint>;
+ }
}
multiclass VPseudoTernaryV_VI<Operand ImmType = simm5, string Constraint = ""> {
@@ -1805,7 +2023,6 @@
multiclass VPseudoReductionV_VS {
foreach m = MxList.m in {
- let WritesElement0 = 1 in
defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>;
}
}
@@ -1828,13 +2045,13 @@
multiclass VPseudoConversionW_V {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxList.m[0-5] in
+ foreach m = MxListW.m in
defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>;
}
multiclass VPseudoConversionV_W {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxList.m[0-5] in
+ foreach m = MxListW.m in
defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>;
}
@@ -1847,9 +2064,9 @@
defvar vreg = SegRegClass<lmul, nf>.RC;
defvar FFStr = !if(isFF, "FF", "");
def nf # "E" # eew # FFStr # "_V_" # LInfo :
- VPseudoUSSegLoadNoMask<vreg, eew>;
+ VPseudoUSSegLoadNoMask<vreg, eew, nf, isFF>;
def nf # "E" # eew # FFStr # "_V_" # LInfo # "_MASK" :
- VPseudoUSSegLoadMask<vreg, eew>;
+ VPseudoUSSegLoadMask<vreg, eew, nf, isFF>;
}
}
}
@@ -1863,28 +2080,37 @@
let VLMul = lmul.value in {
foreach nf = NFSet<lmul>.L in {
defvar vreg = SegRegClass<lmul, nf>.RC;
- def nf # "E" # eew # "_V_" # LInfo : VPseudoSSegLoadNoMask<vreg, eew>;
- def nf # "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSSegLoadMask<vreg, eew>;
+ def nf # "E" # eew # "_V_" # LInfo : VPseudoSSegLoadNoMask<vreg, eew, nf>;
+ def nf # "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSSegLoadMask<vreg, eew, nf>;
}
}
}
}
}
-multiclass VPseudoISegLoad {
- foreach idx_eew = EEWList in { // EEW for index argument.
- foreach idx_lmul = MxSet<idx_eew>.m in { // LMUL for index argument.
- foreach val_lmul = MxList.m in { // LMUL for the value.
- defvar IdxLInfo = idx_lmul.MX;
- defvar IdxVreg = idx_lmul.vrclass;
- defvar ValLInfo = val_lmul.MX;
- let VLMul = val_lmul.value in {
- foreach nf = NFSet<val_lmul>.L in {
- defvar ValVreg = SegRegClass<val_lmul, nf>.RC;
- def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo :
- VPseudoISegLoadNoMask<ValVreg, IdxVreg, idx_eew, idx_lmul.value>;
- def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo # "_MASK" :
- VPseudoISegLoadMask<ValVreg, IdxVreg, idx_eew, idx_lmul.value>;
+multiclass VPseudoISegLoad<bit Ordered> {
+ foreach idx_eew = EEWList in {
+ foreach sew = EEWList in {
+ foreach val_lmul = MxSet<sew>.m in {
+ defvar octuple_lmul = val_lmul.octuple;
+ // Calculate emul = eew * lmul / sew
+ defvar octuple_emul = !srl(!mul(idx_eew, octuple_lmul), log2<sew>.val);
+ if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
+ defvar ValLInfo = val_lmul.MX;
+ defvar IdxLInfo = octuple_to_str<octuple_emul>.ret;
+ defvar idx_lmul = !cast<LMULInfo>("V_" # IdxLInfo);
+ defvar Vreg = val_lmul.vrclass;
+ defvar IdxVreg = idx_lmul.vrclass;
+ let VLMul = val_lmul.value in {
+ foreach nf = NFSet<val_lmul>.L in {
+ defvar ValVreg = SegRegClass<val_lmul, nf>.RC;
+ def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo :
+ VPseudoISegLoadNoMask<ValVreg, IdxVreg, idx_eew, idx_lmul.value,
+ nf, Ordered>;
+ def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo # "_MASK" :
+ VPseudoISegLoadMask<ValVreg, IdxVreg, idx_eew, idx_lmul.value,
+ nf, Ordered>;
+ }
}
}
}
@@ -1899,8 +2125,8 @@
let VLMul = lmul.value in {
foreach nf = NFSet<lmul>.L in {
defvar vreg = SegRegClass<lmul, nf>.RC;
- def nf # "E" # eew # "_V_" # LInfo : VPseudoUSSegStoreNoMask<vreg, eew>;
- def nf # "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoUSSegStoreMask<vreg, eew>;
+ def nf # "E" # eew # "_V_" # LInfo : VPseudoUSSegStoreNoMask<vreg, eew, nf>;
+ def nf # "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoUSSegStoreMask<vreg, eew, nf>;
}
}
}
@@ -1914,28 +2140,37 @@
let VLMul = lmul.value in {
foreach nf = NFSet<lmul>.L in {
defvar vreg = SegRegClass<lmul, nf>.RC;
- def nf # "E" # eew # "_V_" # LInfo : VPseudoSSegStoreNoMask<vreg, eew>;
- def nf # "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSSegStoreMask<vreg, eew>;
+ def nf # "E" # eew # "_V_" # LInfo : VPseudoSSegStoreNoMask<vreg, eew, nf>;
+ def nf # "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSSegStoreMask<vreg, eew, nf>;
}
}
}
}
}
-multiclass VPseudoISegStore {
- foreach idx_eew = EEWList in { // EEW for index argument.
- foreach idx_lmul = MxSet<idx_eew>.m in { // LMUL for index argument.
- foreach val_lmul = MxList.m in { // LMUL for the value.
- defvar IdxLInfo = idx_lmul.MX;
- defvar IdxVreg = idx_lmul.vrclass;
- defvar ValLInfo = val_lmul.MX;
- let VLMul = val_lmul.value in {
- foreach nf = NFSet<val_lmul>.L in {
- defvar ValVreg = SegRegClass<val_lmul, nf>.RC;
- def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo :
- VPseudoISegStoreNoMask<ValVreg, IdxVreg, idx_eew, idx_lmul.value>;
- def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo # "_MASK" :
- VPseudoISegStoreMask<ValVreg, IdxVreg, idx_eew, idx_lmul.value>;
+multiclass VPseudoISegStore<bit Ordered> {
+ foreach idx_eew = EEWList in {
+ foreach sew = EEWList in {
+ foreach val_lmul = MxSet<sew>.m in {
+ defvar octuple_lmul = val_lmul.octuple;
+ // Calculate emul = eew * lmul / sew
+ defvar octuple_emul = !srl(!mul(idx_eew, octuple_lmul), log2<sew>.val);
+ if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
+ defvar ValLInfo = val_lmul.MX;
+ defvar IdxLInfo = octuple_to_str<octuple_emul>.ret;
+ defvar idx_lmul = !cast<LMULInfo>("V_" # IdxLInfo);
+ defvar Vreg = val_lmul.vrclass;
+ defvar IdxVreg = idx_lmul.vrclass;
+ let VLMul = val_lmul.value in {
+ foreach nf = NFSet<val_lmul>.L in {
+ defvar ValVreg = SegRegClass<val_lmul, nf>.RC;
+ def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo :
+ VPseudoISegStoreNoMask<ValVreg, IdxVreg, idx_eew, idx_lmul.value,
+ nf, Ordered>;
+ def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo # "_MASK" :
+ VPseudoISegStoreMask<ValVreg, IdxVreg, idx_eew, idx_lmul.value,
+ nf, Ordered>;
+ }
}
}
}
@@ -1957,7 +2192,7 @@
VReg op2_reg_class> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
(op2_type op2_reg_class:$rs2),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
(op2_type op2_reg_class:$rs2),
GPR:$vl, sew)>;
@@ -1976,7 +2211,7 @@
(result_type result_reg_class:$merge),
(op2_type op2_reg_class:$rs2),
(mask_type V0),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX#"_MASK")
(result_type result_reg_class:$merge),
(op2_type op2_reg_class:$rs2),
@@ -1987,10 +2222,10 @@
MTypeInfo mti> :
Pat<(mti.Mask (!cast<Intrinsic>(intrinsic_name)
(mti.Mask VR:$rs2),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(inst#"_M_"#mti.BX)
(mti.Mask VR:$rs2),
- GPR:$vl, mti.SEW)>;
+ GPR:$vl, mti.Log2SEW)>;
class VPatMaskUnaryMask<string intrinsic_name,
string inst,
@@ -1999,11 +2234,11 @@
(mti.Mask VR:$merge),
(mti.Mask VR:$rs2),
(mti.Mask V0),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(inst#"_M_"#mti.BX#"_MASK")
(mti.Mask VR:$merge),
(mti.Mask VR:$rs2),
- (mti.Mask V0), GPR:$vl, mti.SEW)>;
+ (mti.Mask V0), GPR:$vl, mti.Log2SEW)>;
class VPatUnaryAnyMask<string intrinsic,
string inst,
@@ -2019,7 +2254,7 @@
(result_type result_reg_class:$merge),
(op1_type op1_reg_class:$rs1),
(mask_type VR:$rs2),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
(result_type result_reg_class:$merge),
(op1_type op1_reg_class:$rs1),
@@ -2037,7 +2272,25 @@
Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
+ (!cast<Instruction>(inst)
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ GPR:$vl, sew)>;
+
+// Same as above but source operands are swapped.
+class VPatBinaryNoMaskSwapped<string intrinsic_name,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ int sew,
+ VReg op1_reg_class,
+ DAGOperand op2_kind> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
+ (op2_type op2_kind:$rs2),
+ (op1_type op1_reg_class:$rs1),
+ VLOpFrag)),
(!cast<Instruction>(inst)
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
@@ -2058,13 +2311,71 @@
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(inst#"_MASK")
(result_type result_reg_class:$merge),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0), GPR:$vl, sew)>;
+// Same as above but source operands are swapped.
+class VPatBinaryMaskSwapped<string intrinsic_name,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ VReg result_reg_class,
+ VReg op1_reg_class,
+ DAGOperand op2_kind> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
+ (result_type result_reg_class:$merge),
+ (op2_type op2_kind:$rs2),
+ (op1_type op1_reg_class:$rs1),
+ (mask_type V0),
+ VLOpFrag)),
+ (!cast<Instruction>(inst#"_MASK")
+ (result_type result_reg_class:$merge),
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (mask_type V0), GPR:$vl, sew)>;
+
+class VPatTiedBinaryNoMask<string intrinsic_name,
+ string inst,
+ ValueType result_type,
+ ValueType op2_type,
+ int sew,
+ VReg result_reg_class,
+ DAGOperand op2_kind> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
+ (result_type result_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ VLOpFrag)),
+ (!cast<Instruction>(inst#"_TIED")
+ (result_type result_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ GPR:$vl, sew)>;
+
+class VPatTiedBinaryMask<string intrinsic_name,
+ string inst,
+ ValueType result_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ VReg result_reg_class,
+ DAGOperand op2_kind> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
+ (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$merge),
+ (op2_type op2_kind:$rs2),
+ (mask_type V0),
+ VLOpFrag)),
+ (!cast<Instruction>(inst#"_MASK_TIED")
+ (result_type result_reg_class:$merge),
+ (op2_type op2_kind:$rs2),
+ (mask_type V0), GPR:$vl, sew)>;
+
class VPatTernaryNoMask<string intrinsic,
string inst,
string kind,
@@ -2081,7 +2392,7 @@
(result_type result_reg_class:$rs3),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
result_reg_class:$rs3,
(op1_type op1_reg_class:$rs1),
@@ -2105,7 +2416,7 @@
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX # "_MASK")
result_reg_class:$rs3,
(op1_type op1_reg_class:$rs1),
@@ -2125,7 +2436,7 @@
GPR:$rs1,
(op1_type op1_reg_class:$vs2),
(result_type vlmul.vrclass:$vd),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(inst # "_WD_" # vlmul.MX # "_" # emul.MX)
$rs1, $vs2, $vd,
GPR:$vl, sew)>;
@@ -2144,164 +2455,23 @@
(op1_type op1_reg_class:$vs2),
(result_type vlmul.vrclass:$vd),
(mask_type V0),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(inst # "_WD_" # vlmul.MX # "_" # emul.MX # "_MASK")
$rs1, $vs2, $vd,
(mask_type V0), GPR:$vl, sew)>;
-multiclass VPatUSLoad<string intrinsic,
- string inst,
- LLVMType type,
- LLVMType mask_type,
- int sew,
- LMULInfo vlmul,
- VReg reg_class>
-{
- defvar Intr = !cast<Intrinsic>(intrinsic);
- defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
- def : Pat<(type (Intr GPR:$rs1, (XLenVT (VLOp GPR:$vl)))),
- (Pseudo $rs1, GPR:$vl, sew)>;
- defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
- defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
- def : Pat<(type (IntrMask (type GetVRegNoV0<reg_class>.R:$merge),
- GPR:$rs1, (mask_type V0), (XLenVT (VLOp GPR:$vl)))),
- (PseudoMask $merge,
- $rs1, (mask_type V0), GPR:$vl, sew)>;
-}
-
-multiclass VPatUSLoadFF<string inst,
- LLVMType type,
- LLVMType mask_type,
- int sew,
- LMULInfo vlmul,
- VReg reg_class>
-{
- defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
- def : Pat<(type (riscv_vleff GPR:$rs1, (XLenVT (VLOp GPR:$vl)))),
- (Pseudo $rs1, GPR:$vl, sew)>;
- defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
- def : Pat<(type (riscv_vleff_mask (type GetVRegNoV0<reg_class>.R:$merge),
- GPR:$rs1, (mask_type V0), (XLenVT (VLOp GPR:$vl)))),
- (PseudoMask $merge,
- $rs1, (mask_type V0), GPR:$vl, sew)>;
-}
-
-multiclass VPatSLoad<string intrinsic,
- string inst,
- LLVMType type,
- LLVMType mask_type,
- int sew,
- LMULInfo vlmul,
- VReg reg_class>
-{
- defvar Intr = !cast<Intrinsic>(intrinsic);
- defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
- def : Pat<(type (Intr GPR:$rs1, GPR:$rs2, (XLenVT (VLOp GPR:$vl)))),
- (Pseudo $rs1, $rs2, GPR:$vl, sew)>;
- defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
- defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
- def : Pat<(type (IntrMask (type GetVRegNoV0<reg_class>.R:$merge),
- GPR:$rs1, GPR:$rs2, (mask_type V0), (XLenVT (VLOp GPR:$vl)))),
- (PseudoMask $merge,
- $rs1, $rs2, (mask_type V0), GPR:$vl, sew)>;
-}
-
-multiclass VPatILoad<string intrinsic,
- string inst,
- LLVMType type,
- LLVMType idx_type,
- LLVMType mask_type,
- int sew,
- LMULInfo vlmul,
- LMULInfo idx_vlmul,
- VReg reg_class,
- VReg idx_reg_class>
-{
- defvar Intr = !cast<Intrinsic>(intrinsic);
- defvar Pseudo = !cast<Instruction>(inst#"_V_"#idx_vlmul.MX#"_"#vlmul.MX);
- def : Pat<(type (Intr GPR:$rs1, (idx_type idx_reg_class:$rs2), (XLenVT (VLOp GPR:$vl)))),
- (Pseudo $rs1, $rs2, GPR:$vl, sew)>;
-
- defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
- defvar PseudoMask = !cast<Instruction>(inst#"_V_"#idx_vlmul.MX#"_"#vlmul.MX#"_MASK");
- def : Pat<(type (IntrMask (type GetVRegNoV0<reg_class>.R:$merge),
- GPR:$rs1, (idx_type idx_reg_class:$rs2),
- (mask_type V0), (XLenVT (VLOp GPR:$vl)))),
- (PseudoMask $merge,
- $rs1, $rs2, (mask_type V0), GPR:$vl, sew)>;
-}
-
-multiclass VPatUSStore<string intrinsic,
- string inst,
- LLVMType type,
- LLVMType mask_type,
- int sew,
- LMULInfo vlmul,
- VReg reg_class>
-{
- defvar Intr = !cast<Intrinsic>(intrinsic);
- defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
- def : Pat<(Intr (type reg_class:$rs3), GPR:$rs1, (XLenVT (VLOp GPR:$vl))),
- (Pseudo $rs3, $rs1, GPR:$vl, sew)>;
- defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
- defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
- def : Pat<(IntrMask (type reg_class:$rs3), GPR:$rs1, (mask_type V0), (XLenVT (VLOp GPR:$vl))),
- (PseudoMask $rs3, $rs1, (mask_type V0), GPR:$vl, sew)>;
-}
-
-multiclass VPatSStore<string intrinsic,
- string inst,
- LLVMType type,
- LLVMType mask_type,
- int sew,
- LMULInfo vlmul,
- VReg reg_class>
-{
- defvar Intr = !cast<Intrinsic>(intrinsic);
- defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
- def : Pat<(Intr (type reg_class:$rs3), GPR:$rs1, GPR:$rs2, (XLenVT (VLOp GPR:$vl))),
- (Pseudo $rs3, $rs1, $rs2, GPR:$vl, sew)>;
- defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
- defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
- def : Pat<(IntrMask (type reg_class:$rs3), GPR:$rs1, GPR:$rs2, (mask_type V0), (XLenVT (VLOp GPR:$vl))),
- (PseudoMask $rs3, $rs1, $rs2, (mask_type V0), GPR:$vl, sew)>;
-}
-
-multiclass VPatIStore<string intrinsic,
- string inst,
- LLVMType type,
- LLVMType idx_type,
- LLVMType mask_type,
- int sew,
- LMULInfo vlmul,
- LMULInfo idx_vlmul,
- VReg reg_class,
- VReg idx_reg_class>
-{
- defvar Intr = !cast<Intrinsic>(intrinsic);
- defvar Pseudo = !cast<Instruction>(inst#"_V_"#idx_vlmul.MX#"_"#vlmul.MX);
- def : Pat<(Intr (type reg_class:$rs3), GPR:$rs1,
- (idx_type idx_reg_class:$rs2), (XLenVT (VLOp GPR:$vl))),
- (Pseudo $rs3, $rs1, $rs2, GPR:$vl, sew)>;
- defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
- defvar PseudoMask = !cast<Instruction>(inst#"_V_"#idx_vlmul.MX#"_"#vlmul.MX#"_MASK");
- def : Pat<(IntrMask (type reg_class:$rs3), GPR:$rs1,
- (idx_type idx_reg_class:$rs2), (mask_type V0), (XLenVT (VLOp GPR:$vl))),
- (PseudoMask $rs3, $rs1, $rs2, (mask_type V0), GPR:$vl, sew)>;
-}
-
multiclass VPatUnaryS_M<string intrinsic_name,
string inst>
{
foreach mti = AllMasks in {
def : Pat<(XLenVT (!cast<Intrinsic>(intrinsic_name)
- (mti.Mask VR:$rs1), (XLenVT (VLOp GPR:$vl)))),
+ (mti.Mask VR:$rs1), VLOpFrag)),
(!cast<Instruction>(inst#"_M_"#mti.BX) $rs1,
- GPR:$vl, mti.SEW)>;
+ GPR:$vl, mti.Log2SEW)>;
def : Pat<(XLenVT (!cast<Intrinsic>(intrinsic_name # "_mask")
- (mti.Mask VR:$rs1), (mti.Mask V0), (XLenVT (VLOp GPR:$vl)))),
+ (mti.Mask VR:$rs1), (mti.Mask V0), VLOpFrag)),
(!cast<Instruction>(inst#"_M_"#mti.BX#"_MASK") $rs1,
- (mti.Mask V0), GPR:$vl, mti.SEW)>;
+ (mti.Mask V0), GPR:$vl, mti.Log2SEW)>;
}
}
@@ -2310,7 +2480,7 @@
foreach vti = vtilist in {
def : VPatUnaryAnyMask<intrinsic, instruction, "VM",
vti.Vector, vti.Vector, vti.Mask,
- vti.SEW, vti.LMul, vti.RegClass,
+ vti.Log2SEW, vti.LMul, vti.RegClass,
vti.RegClass>;
}
}
@@ -2328,9 +2498,9 @@
{
foreach vti = AllIntegerVectors in {
def : VPatUnaryNoMask<intrinsic, instruction, "M", vti.Vector, vti.Mask,
- vti.SEW, vti.LMul, VR>;
+ vti.Log2SEW, vti.LMul, VR>;
def : VPatUnaryMask<intrinsic, instruction, "M", vti.Vector, vti.Mask,
- vti.Mask, vti.SEW, vti.LMul, vti.RegClass, VR>;
+ vti.Mask, vti.Log2SEW, vti.LMul, vti.RegClass, VR>;
}
}
@@ -2343,10 +2513,10 @@
defvar fti = vtiTofti.Fti;
def : VPatUnaryNoMask<intrinsic, instruction, suffix,
vti.Vector, fti.Vector,
- vti.SEW, vti.LMul, fti.RegClass>;
+ vti.Log2SEW, vti.LMul, fti.RegClass>;
def : VPatUnaryMask<intrinsic, instruction, suffix,
vti.Vector, fti.Vector, vti.Mask,
- vti.SEW, vti.LMul, vti.RegClass, fti.RegClass>;
+ vti.Log2SEW, vti.LMul, vti.RegClass, fti.RegClass>;
}
}
@@ -2355,10 +2525,10 @@
foreach vti = vtilist in {
def : VPatUnaryNoMask<intrinsic, instruction, "V",
vti.Vector, vti.Vector,
- vti.SEW, vti.LMul, vti.RegClass>;
+ vti.Log2SEW, vti.LMul, vti.RegClass>;
def : VPatUnaryMask<intrinsic, instruction, "V",
vti.Vector, vti.Vector, vti.Mask,
- vti.SEW, vti.LMul, vti.RegClass, vti.RegClass>;
+ vti.Log2SEW, vti.LMul, vti.RegClass, vti.RegClass>;
}
}
@@ -2366,24 +2536,24 @@
{
foreach vti = AllIntegerVectors in {
def : Pat<(vti.Vector (!cast<Intrinsic>(intrinsic)
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(instruction#"_V_" # vti.LMul.MX)
- GPR:$vl, vti.SEW)>;
+ GPR:$vl, vti.Log2SEW)>;
def : Pat<(vti.Vector (!cast<Intrinsic>(intrinsic # "_mask")
(vti.Vector vti.RegClass:$merge),
- (vti.Mask V0), (XLenVT (VLOp GPR:$vl)))),
+ (vti.Mask V0), VLOpFrag)),
(!cast<Instruction>(instruction#"_V_" # vti.LMul.MX # "_MASK")
vti.RegClass:$merge, (vti.Mask V0),
- GPR:$vl, vti.SEW)>;
+ GPR:$vl, vti.Log2SEW)>;
}
}
multiclass VPatNullaryM<string intrinsic, string inst> {
foreach mti = AllMasks in
def : Pat<(mti.Mask (!cast<Intrinsic>(intrinsic)
- (XLenVT (VLOp GPR:$vl)))),
+ (XLenVT (VLOp (XLenVT (XLenVT GPR:$vl)))))),
(!cast<Instruction>(inst#"_M_"#mti.BX)
- GPR:$vl, mti.SEW)>;
+ GPR:$vl, mti.Log2SEW)>;
}
multiclass VPatBinary<string intrinsic,
@@ -2404,6 +2574,24 @@
op2_kind>;
}
+multiclass VPatBinarySwapped<string intrinsic,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ VReg result_reg_class,
+ VReg op1_reg_class,
+ DAGOperand op2_kind>
+{
+ def : VPatBinaryNoMaskSwapped<intrinsic, inst, result_type, op1_type, op2_type,
+ sew, op1_reg_class, op2_kind>;
+ def : VPatBinaryMaskSwapped<intrinsic, inst, result_type, op1_type, op2_type,
+ mask_type, sew, result_reg_class, op1_reg_class,
+ op2_kind>;
+}
+
multiclass VPatBinaryCarryIn<string intrinsic,
string inst,
string kind,
@@ -2420,7 +2608,7 @@
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
@@ -2441,7 +2629,7 @@
def : Pat<(result_type (!cast<Intrinsic>(intrinsic)
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
@@ -2470,7 +2658,7 @@
foreach vti = vtilist in
defm : VPatBinary<intrinsic, instruction # "_VV_" # vti.LMul.MX,
vti.Vector, vti.Vector, vti.Vector,vti.Mask,
- vti.SEW, vti.RegClass,
+ vti.Log2SEW, vti.RegClass,
vti.RegClass, vti.RegClass>;
}
@@ -2480,7 +2668,7 @@
defvar ivti = GetIntVTypeInfo<vti>.Vti;
defm : VPatBinary<intrinsic, instruction # "_VV_" # vti.LMul.MX,
vti.Vector, vti.Vector, ivti.Vector, vti.Mask,
- vti.SEW, vti.RegClass,
+ vti.Log2SEW, vti.RegClass,
vti.RegClass, vti.RegClass>;
}
}
@@ -2490,15 +2678,15 @@
foreach vti = vtilist in {
// emul = lmul * eew / sew
defvar vlmul = vti.LMul;
- defvar octuple_lmul = octuple_from_str<vlmul.MX>.ret;
- defvar octuple_emul = !srl(!mul(octuple_lmul, eew), shift_amount<vti.SEW>.val);
+ defvar octuple_lmul = vlmul.octuple;
+ defvar octuple_emul = !srl(!mul(octuple_lmul, eew), vti.Log2SEW);
if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
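    // Worked example of the arithmetic above (illustrative values): for
    // SEW=32 (Log2SEW=5), LMUL=2 (octuple_lmul=16) and EEW=16, octuple_emul
    // is (16*16) >> 5 = 8, i.e. EMUL=1, which satisfies this 1..64 check.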
defvar emul_str = octuple_to_str<octuple_emul>.ret;
defvar ivti = !cast<VTypeInfo>("VI" # eew # emul_str);
defvar inst = instruction # "_VV_" # vti.LMul.MX # "_" # emul_str;
defm : VPatBinary<intrinsic, inst,
vti.Vector, vti.Vector, ivti.Vector, vti.Mask,
- vti.SEW, vti.RegClass,
+ vti.Log2SEW, vti.RegClass,
vti.RegClass, ivti.RegClass>;
}
}
@@ -2510,7 +2698,7 @@
defvar kind = "V"#vti.ScalarSuffix;
defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#vti.LMul.MX,
vti.Vector, vti.Vector, vti.Scalar, vti.Mask,
- vti.SEW, vti.RegClass,
+ vti.Log2SEW, vti.RegClass,
vti.RegClass, vti.ScalarRegClass>;
}
}
@@ -2520,7 +2708,7 @@
foreach vti = vtilist in
defm : VPatBinary<intrinsic, instruction # "_VX_" # vti.LMul.MX,
vti.Vector, vti.Vector, XLenVT, vti.Mask,
- vti.SEW, vti.RegClass,
+ vti.Log2SEW, vti.RegClass,
vti.RegClass, GPR>;
}
@@ -2529,7 +2717,7 @@
foreach vti = vtilist in
defm : VPatBinary<intrinsic, instruction # "_VI_" # vti.LMul.MX,
vti.Vector, vti.Vector, XLenVT, vti.Mask,
- vti.SEW, vti.RegClass,
+ vti.Log2SEW, vti.RegClass,
vti.RegClass, imm_type>;
}
@@ -2537,7 +2725,7 @@
foreach mti = AllMasks in
def : VPatBinaryNoMask<intrinsic, instruction # "_MM_" # mti.LMul.MX,
mti.Mask, mti.Mask, mti.Mask,
- mti.SEW, VR, VR>;
+ mti.Log2SEW, VR, VR>;
}
multiclass VPatBinaryW_VV<string intrinsic, string instruction,
@@ -2547,7 +2735,7 @@
defvar Wti = VtiToWti.Wti;
defm : VPatBinary<intrinsic, instruction # "_VV_" # Vti.LMul.MX,
Wti.Vector, Vti.Vector, Vti.Vector, Vti.Mask,
- Vti.SEW, Wti.RegClass,
+ Vti.Log2SEW, Wti.RegClass,
Vti.RegClass, Vti.RegClass>;
}
}
@@ -2560,7 +2748,7 @@
defvar kind = "V"#Vti.ScalarSuffix;
defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
Wti.Vector, Vti.Vector, Vti.Scalar, Vti.Mask,
- Vti.SEW, Wti.RegClass,
+ Vti.Log2SEW, Wti.RegClass,
Vti.RegClass, Vti.ScalarRegClass>;
}
}
@@ -2570,10 +2758,17 @@
foreach VtiToWti = vtilist in {
defvar Vti = VtiToWti.Vti;
defvar Wti = VtiToWti.Wti;
- defm : VPatBinary<intrinsic, instruction # "_WV_" # Vti.LMul.MX,
- Wti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
- Vti.SEW, Wti.RegClass,
- Wti.RegClass, Vti.RegClass>;
+ def : VPatTiedBinaryNoMask<intrinsic, instruction # "_WV_" # Vti.LMul.MX,
+ Wti.Vector, Vti.Vector,
+ Vti.Log2SEW, Wti.RegClass, Vti.RegClass>;
+ let AddedComplexity = 1 in
+ def : VPatTiedBinaryMask<intrinsic, instruction # "_WV_" # Vti.LMul.MX,
+ Wti.Vector, Vti.Vector, Vti.Mask,
+ Vti.Log2SEW, Wti.RegClass, Vti.RegClass>;
+ def : VPatBinaryMask<intrinsic, instruction # "_WV_" # Vti.LMul.MX,
+ Wti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
+ Vti.Log2SEW, Wti.RegClass,
+ Wti.RegClass, Vti.RegClass>;
}
}
@@ -2585,7 +2780,7 @@
defvar kind = "W"#Vti.ScalarSuffix;
defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
Wti.Vector, Wti.Vector, Vti.Scalar, Vti.Mask,
- Vti.SEW, Wti.RegClass,
+ Vti.Log2SEW, Wti.RegClass,
Wti.RegClass, Vti.ScalarRegClass>;
}
}
@@ -2597,7 +2792,7 @@
defvar Wti = VtiToWti.Wti;
defm : VPatBinary<intrinsic, instruction # "_WV_" # Vti.LMul.MX,
Vti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
- Vti.SEW, Vti.RegClass,
+ Vti.Log2SEW, Vti.RegClass,
Wti.RegClass, Vti.RegClass>;
}
}
@@ -2610,7 +2805,7 @@
defvar kind = "W"#Vti.ScalarSuffix;
defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
Vti.Vector, Wti.Vector, Vti.Scalar, Vti.Mask,
- Vti.SEW, Vti.RegClass,
+ Vti.Log2SEW, Vti.RegClass,
Wti.RegClass, Vti.ScalarRegClass>;
}
}
@@ -2622,7 +2817,7 @@
defvar Wti = VtiToWti.Wti;
defm : VPatBinary<intrinsic, instruction # "_WI_" # Vti.LMul.MX,
Vti.Vector, Wti.Vector, XLenVT, Vti.Mask,
- Vti.SEW, Vti.RegClass,
+ Vti.Log2SEW, Vti.RegClass,
Wti.RegClass, uimm5>;
}
}
@@ -2634,7 +2829,7 @@
defm : VPatBinaryCarryIn<intrinsic, instruction, "VVM",
!if(CarryOut, vti.Mask, vti.Vector),
vti.Vector, vti.Vector, vti.Mask,
- vti.SEW, vti.LMul,
+ vti.Log2SEW, vti.LMul,
vti.RegClass, vti.RegClass>;
}
@@ -2646,7 +2841,7 @@
"V"#vti.ScalarSuffix#"M",
!if(CarryOut, vti.Mask, vti.Vector),
vti.Vector, vti.Scalar, vti.Mask,
- vti.SEW, vti.LMul,
+ vti.Log2SEW, vti.LMul,
vti.RegClass, vti.ScalarRegClass>;
}
@@ -2656,7 +2851,7 @@
defm : VPatBinaryCarryIn<intrinsic, instruction, "VIM",
!if(CarryOut, vti.Mask, vti.Vector),
vti.Vector, XLenVT, vti.Mask,
- vti.SEW, vti.LMul,
+ vti.Log2SEW, vti.LMul,
vti.RegClass, simm5>;
}
@@ -2664,7 +2859,7 @@
foreach vti = AllIntegerVectors in
defm : VPatBinaryMaskOut<intrinsic, instruction, "VV",
vti.Mask, vti.Vector, vti.Vector,
- vti.SEW, vti.LMul,
+ vti.Log2SEW, vti.LMul,
vti.RegClass, vti.RegClass>;
}
@@ -2672,7 +2867,7 @@
foreach vti = AllIntegerVectors in
defm : VPatBinaryMaskOut<intrinsic, instruction, "VX",
vti.Mask, vti.Vector, XLenVT,
- vti.SEW, vti.LMul,
+ vti.Log2SEW, vti.LMul,
vti.RegClass, GPR>;
}
@@ -2680,7 +2875,7 @@
foreach vti = AllIntegerVectors in
defm : VPatBinaryMaskOut<intrinsic, instruction, "VI",
vti.Mask, vti.Vector, XLenVT,
- vti.SEW, vti.LMul,
+ vti.Log2SEW, vti.LMul,
vti.RegClass, simm5>;
}
@@ -2689,17 +2884,26 @@
foreach vti = vtilist in
defm : VPatBinary<intrinsic, instruction # "_VV_" # vti.LMul.MX,
vti.Mask, vti.Vector, vti.Vector, vti.Mask,
- vti.SEW, VR,
+ vti.Log2SEW, VR,
vti.RegClass, vti.RegClass>;
}
+multiclass VPatBinarySwappedM_VV<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in
+ defm : VPatBinarySwapped<intrinsic, instruction # "_VV_" # vti.LMul.MX,
+ vti.Mask, vti.Vector, vti.Vector, vti.Mask,
+ vti.Log2SEW, VR,
+ vti.RegClass, vti.RegClass>;
+}
+
multiclass VPatBinaryM_VX<string intrinsic, string instruction,
list<VTypeInfo> vtilist> {
foreach vti = vtilist in {
defvar kind = "V"#vti.ScalarSuffix;
defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#vti.LMul.MX,
vti.Mask, vti.Vector, vti.Scalar, vti.Mask,
- vti.SEW, VR,
+ vti.Log2SEW, VR,
vti.RegClass, vti.ScalarRegClass>;
}
}
@@ -2709,92 +2913,68 @@
foreach vti = vtilist in
defm : VPatBinary<intrinsic, instruction # "_VI_" # vti.LMul.MX,
vti.Mask, vti.Vector, XLenVT, vti.Mask,
- vti.SEW, VR,
+ vti.Log2SEW, VR,
vti.RegClass, simm5>;
}
multiclass VPatBinaryV_VV_VX_VI<string intrinsic, string instruction,
list<VTypeInfo> vtilist, Operand ImmType = simm5>
-{
- defm "" : VPatBinaryV_VV<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryV_VX<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryV_VI<intrinsic, instruction, vtilist, ImmType>;
-}
+ : VPatBinaryV_VV<intrinsic, instruction, vtilist>,
+ VPatBinaryV_VX<intrinsic, instruction, vtilist>,
+ VPatBinaryV_VI<intrinsic, instruction, vtilist, ImmType>;
multiclass VPatBinaryV_VV_VX<string intrinsic, string instruction,
list<VTypeInfo> vtilist>
-{
- defm "" : VPatBinaryV_VV<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryV_VX<intrinsic, instruction, vtilist>;
-}
+ : VPatBinaryV_VV<intrinsic, instruction, vtilist>,
+ VPatBinaryV_VX<intrinsic, instruction, vtilist>;
multiclass VPatBinaryV_VX_VI<string intrinsic, string instruction,
list<VTypeInfo> vtilist>
-{
- defm "" : VPatBinaryV_VX<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryV_VI<intrinsic, instruction, vtilist, simm5>;
-}
+ : VPatBinaryV_VX<intrinsic, instruction, vtilist>,
+ VPatBinaryV_VI<intrinsic, instruction, vtilist, simm5>;
multiclass VPatBinaryW_VV_VX<string intrinsic, string instruction,
list<VTypeInfoToWide> vtilist>
-{
- defm "" : VPatBinaryW_VV<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryW_VX<intrinsic, instruction, vtilist>;
-}
+ : VPatBinaryW_VV<intrinsic, instruction, vtilist>,
+ VPatBinaryW_VX<intrinsic, instruction, vtilist>;
multiclass VPatBinaryW_WV_WX<string intrinsic, string instruction,
list<VTypeInfoToWide> vtilist>
-{
- defm "" : VPatBinaryW_WV<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryW_WX<intrinsic, instruction, vtilist>;
-}
+ : VPatBinaryW_WV<intrinsic, instruction, vtilist>,
+ VPatBinaryW_WX<intrinsic, instruction, vtilist>;
multiclass VPatBinaryV_WV_WX_WI<string intrinsic, string instruction,
list<VTypeInfoToWide> vtilist>
-{
- defm "" : VPatBinaryV_WV<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryV_WX<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryV_WI<intrinsic, instruction, vtilist>;
-}
+ : VPatBinaryV_WV<intrinsic, instruction, vtilist>,
+ VPatBinaryV_WX<intrinsic, instruction, vtilist>,
+ VPatBinaryV_WI<intrinsic, instruction, vtilist>;
multiclass VPatBinaryV_VM_XM_IM<string intrinsic, string instruction>
-{
- defm "" : VPatBinaryV_VM<intrinsic, instruction>;
- defm "" : VPatBinaryV_XM<intrinsic, instruction>;
- defm "" : VPatBinaryV_IM<intrinsic, instruction>;
-}
+ : VPatBinaryV_VM<intrinsic, instruction>,
+ VPatBinaryV_XM<intrinsic, instruction>,
+ VPatBinaryV_IM<intrinsic, instruction>;
multiclass VPatBinaryM_VM_XM_IM<string intrinsic, string instruction>
-{
- defm "" : VPatBinaryV_VM<intrinsic, instruction, /*CarryOut=*/1>;
- defm "" : VPatBinaryV_XM<intrinsic, instruction, /*CarryOut=*/1>;
- defm "" : VPatBinaryV_IM<intrinsic, instruction, /*CarryOut=*/1>;
-}
+ : VPatBinaryV_VM<intrinsic, instruction, /*CarryOut=*/1>,
+ VPatBinaryV_XM<intrinsic, instruction, /*CarryOut=*/1>,
+ VPatBinaryV_IM<intrinsic, instruction, /*CarryOut=*/1>;
multiclass VPatBinaryM_V_X_I<string intrinsic, string instruction>
-{
- defm "" : VPatBinaryV_V<intrinsic, instruction>;
- defm "" : VPatBinaryV_X<intrinsic, instruction>;
- defm "" : VPatBinaryV_I<intrinsic, instruction>;
-}
+ : VPatBinaryV_V<intrinsic, instruction>,
+ VPatBinaryV_X<intrinsic, instruction>,
+ VPatBinaryV_I<intrinsic, instruction>;
multiclass VPatBinaryV_VM_XM<string intrinsic, string instruction>
-{
- defm "" : VPatBinaryV_VM<intrinsic, instruction>;
- defm "" : VPatBinaryV_XM<intrinsic, instruction>;
-}
+ : VPatBinaryV_VM<intrinsic, instruction>,
+ VPatBinaryV_XM<intrinsic, instruction>;
multiclass VPatBinaryM_VM_XM<string intrinsic, string instruction>
-{
- defm "" : VPatBinaryV_VM<intrinsic, instruction, /*CarryOut=*/1>;
- defm "" : VPatBinaryV_XM<intrinsic, instruction, /*CarryOut=*/1>;
-}
+ : VPatBinaryV_VM<intrinsic, instruction, /*CarryOut=*/1>,
+ VPatBinaryV_XM<intrinsic, instruction, /*CarryOut=*/1>;
multiclass VPatBinaryM_V_X<string intrinsic, string instruction>
-{
- defm "" : VPatBinaryV_V<intrinsic, instruction>;
- defm "" : VPatBinaryV_X<intrinsic, instruction>;
-}
+ : VPatBinaryV_V<intrinsic, instruction>,
+ VPatBinaryV_X<intrinsic, instruction>;
multiclass VPatTernary<string intrinsic,
string inst,
@@ -2821,7 +3001,7 @@
foreach vti = vtilist in
defm : VPatTernary<intrinsic, instruction, "VV",
vti.Vector, vti.Vector, vti.Vector, vti.Mask,
- vti.SEW, vti.LMul, vti.RegClass,
+ vti.Log2SEW, vti.LMul, vti.RegClass,
vti.RegClass, vti.RegClass>;
}
@@ -2830,7 +3010,7 @@
foreach vti = vtilist in
defm : VPatTernary<intrinsic, instruction, "VX",
vti.Vector, vti.Vector, XLenVT, vti.Mask,
- vti.SEW, vti.LMul, vti.RegClass,
+ vti.Log2SEW, vti.LMul, vti.RegClass,
vti.RegClass, GPR>;
}
@@ -2840,7 +3020,7 @@
defm : VPatTernary<intrinsic, instruction,
"V"#vti.ScalarSuffix,
vti.Vector, vti.Scalar, vti.Vector, vti.Mask,
- vti.SEW, vti.LMul, vti.RegClass,
+ vti.Log2SEW, vti.LMul, vti.RegClass,
vti.ScalarRegClass, vti.RegClass>;
}
@@ -2849,7 +3029,7 @@
foreach vti = vtilist in
defm : VPatTernary<intrinsic, instruction, "VI",
vti.Vector, vti.Vector, XLenVT, vti.Mask,
- vti.SEW, vti.LMul, vti.RegClass,
+ vti.Log2SEW, vti.LMul, vti.RegClass,
vti.RegClass, Imm_type>;
}
@@ -2860,7 +3040,7 @@
defvar wti = vtiToWti.Wti;
defm : VPatTernary<intrinsic, instruction, "VV",
wti.Vector, vti.Vector, vti.Vector,
- vti.Mask, vti.SEW, vti.LMul,
+ vti.Mask, vti.Log2SEW, vti.LMul,
wti.RegClass, vti.RegClass, vti.RegClass>;
}
}
@@ -2873,58 +3053,47 @@
defm : VPatTernary<intrinsic, instruction,
"V"#vti.ScalarSuffix,
wti.Vector, vti.Scalar, vti.Vector,
- vti.Mask, vti.SEW, vti.LMul,
+ vti.Mask, vti.Log2SEW, vti.LMul,
wti.RegClass, vti.ScalarRegClass, vti.RegClass>;
}
}
multiclass VPatTernaryV_VV_VX_AAXA<string intrinsic, string instruction,
- list<VTypeInfo> vtilist> {
- defm "" : VPatTernaryV_VV<intrinsic, instruction, vtilist>;
- defm "" : VPatTernaryV_VX_AAXA<intrinsic, instruction, vtilist>;
-}
+ list<VTypeInfo> vtilist>
+ : VPatTernaryV_VV<intrinsic, instruction, vtilist>,
+ VPatTernaryV_VX_AAXA<intrinsic, instruction, vtilist>;
multiclass VPatTernaryV_VX_VI<string intrinsic, string instruction,
- list<VTypeInfo> vtilist, Operand Imm_type = simm5> {
- defm "" : VPatTernaryV_VX<intrinsic, instruction, vtilist>;
- defm "" : VPatTernaryV_VI<intrinsic, instruction, vtilist, Imm_type>;
-}
+ list<VTypeInfo> vtilist, Operand Imm_type = simm5>
+ : VPatTernaryV_VX<intrinsic, instruction, vtilist>,
+ VPatTernaryV_VI<intrinsic, instruction, vtilist, Imm_type>;
multiclass VPatBinaryM_VV_VX_VI<string intrinsic, string instruction,
list<VTypeInfo> vtilist>
-{
- defm "" : VPatBinaryM_VV<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryM_VX<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryM_VI<intrinsic, instruction, vtilist>;
-}
+ : VPatBinaryM_VV<intrinsic, instruction, vtilist>,
+ VPatBinaryM_VX<intrinsic, instruction, vtilist>,
+ VPatBinaryM_VI<intrinsic, instruction, vtilist>;
multiclass VPatTernaryW_VV_VX<string intrinsic, string instruction,
- list<VTypeInfoToWide> vtilist> {
- defm "" : VPatTernaryW_VV<intrinsic, instruction, vtilist>;
- defm "" : VPatTernaryW_VX<intrinsic, instruction, vtilist>;
-}
+ list<VTypeInfoToWide> vtilist>
+ : VPatTernaryW_VV<intrinsic, instruction, vtilist>,
+ VPatTernaryW_VX<intrinsic, instruction, vtilist>;
multiclass VPatBinaryM_VV_VX<string intrinsic, string instruction,
list<VTypeInfo> vtilist>
-{
- defm "" : VPatBinaryM_VV<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryM_VX<intrinsic, instruction, vtilist>;
-}
+ : VPatBinaryM_VV<intrinsic, instruction, vtilist>,
+ VPatBinaryM_VX<intrinsic, instruction, vtilist>;
multiclass VPatBinaryM_VX_VI<string intrinsic, string instruction,
list<VTypeInfo> vtilist>
-{
- defm "" : VPatBinaryM_VX<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryM_VI<intrinsic, instruction, vtilist>;
-}
+ : VPatBinaryM_VX<intrinsic, instruction, vtilist>,
+ VPatBinaryM_VI<intrinsic, instruction, vtilist>;
multiclass VPatBinaryV_VV_VX_VI_INT<string intrinsic, string instruction,
list<VTypeInfo> vtilist, Operand ImmType = simm5>
-{
- defm "" : VPatBinaryV_VV_INT<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryV_VX_INT<intrinsic, instruction, vtilist>;
- defm "" : VPatBinaryV_VI<intrinsic, instruction, vtilist, ImmType>;
-}
+ : VPatBinaryV_VV_INT<intrinsic#"_vv", instruction, vtilist>,
+ VPatBinaryV_VX_INT<intrinsic#"_vx", instruction, vtilist>,
+ VPatBinaryV_VI<intrinsic#"_vx", instruction, vtilist, ImmType>;
multiclass VPatReductionV_VS<string intrinsic, string instruction, bit IsFloat = 0> {
foreach vti = !if(IsFloat, NoGroupFloatVectors, NoGroupIntegerVectors) in
@@ -2933,7 +3102,7 @@
defm : VPatTernary<intrinsic, instruction, "VS",
vectorM1.Vector, vti.Vector,
vectorM1.Vector, vti.Mask,
- vti.SEW, vti.LMul,
+ vti.Log2SEW, vti.LMul,
VR, vti.RegClass, VR>;
}
foreach gvti = !if(IsFloat, GroupFloatVectors, GroupIntegerVectors) in
@@ -2941,7 +3110,7 @@
defm : VPatTernary<intrinsic, instruction, "VS",
gvti.VectorM1, gvti.Vector,
gvti.VectorM1, gvti.Mask,
- gvti.SEW, gvti.LMul,
+ gvti.Log2SEW, gvti.LMul,
VR, gvti.RegClass, VR>;
}
}
@@ -2955,7 +3124,7 @@
defm : VPatTernary<intrinsic, instruction, "VS",
wtiM1.Vector, vti.Vector,
wtiM1.Vector, vti.Mask,
- vti.SEW, vti.LMul,
+ vti.Log2SEW, vti.LMul,
wtiM1.RegClass, vti.RegClass,
wtiM1.RegClass>;
}
@@ -2970,7 +3139,7 @@
defvar ivti = GetIntVTypeInfo<fvti>.Vti;
defm : VPatConversion<intrinsic, instruction, "V",
- ivti.Vector, fvti.Vector, ivti.Mask, fvti.SEW,
+ ivti.Vector, fvti.Vector, ivti.Mask, fvti.Log2SEW,
fvti.LMul, ivti.RegClass, fvti.RegClass>;
}
}
@@ -2983,7 +3152,7 @@
defvar ivti = GetIntVTypeInfo<fvti>.Vti;
defm : VPatConversion<intrinsic, instruction, "V",
- fvti.Vector, ivti.Vector, fvti.Mask, ivti.SEW,
+ fvti.Vector, ivti.Vector, fvti.Mask, ivti.Log2SEW,
ivti.LMul, fvti.RegClass, ivti.RegClass>;
}
}
@@ -2995,7 +3164,7 @@
defvar iwti = GetIntVTypeInfo<fvtiToFWti.Wti>.Vti;
defm : VPatConversion<intrinsic, instruction, "V",
- iwti.Vector, fvti.Vector, iwti.Mask, fvti.SEW,
+ iwti.Vector, fvti.Vector, iwti.Mask, fvti.Log2SEW,
fvti.LMul, iwti.RegClass, fvti.RegClass>;
}
}
@@ -3007,7 +3176,7 @@
defvar fwti = vtiToWti.Wti;
defm : VPatConversion<intrinsic, instruction, "V",
- fwti.Vector, vti.Vector, fwti.Mask, vti.SEW,
+ fwti.Vector, vti.Vector, fwti.Mask, vti.Log2SEW,
vti.LMul, fwti.RegClass, vti.RegClass>;
}
}
@@ -3019,7 +3188,7 @@
defvar fwti = fvtiToFWti.Wti;
defm : VPatConversion<intrinsic, instruction, "V",
- fwti.Vector, fvti.Vector, fwti.Mask, fvti.SEW,
+ fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
fvti.LMul, fwti.RegClass, fvti.RegClass>;
}
}
@@ -3031,7 +3200,7 @@
defvar fwti = vtiToWti.Wti;
defm : VPatConversion<intrinsic, instruction, "W",
- vti.Vector, fwti.Vector, vti.Mask, vti.SEW,
+ vti.Vector, fwti.Vector, vti.Mask, vti.Log2SEW,
vti.LMul, vti.RegClass, fwti.RegClass>;
}
}
@@ -3043,7 +3212,7 @@
defvar iwti = GetIntVTypeInfo<fvtiToFWti.Wti>.Vti;
defm : VPatConversion<intrinsic, instruction, "W",
- fvti.Vector, iwti.Vector, fvti.Mask, fvti.SEW,
+ fvti.Vector, iwti.Vector, fvti.Mask, fvti.Log2SEW,
fvti.LMul, fvti.RegClass, iwti.RegClass>;
}
}
@@ -3055,7 +3224,7 @@
defvar fwti = fvtiToFWti.Wti;
defm : VPatConversion<intrinsic, instruction, "W",
- fvti.Vector, fwti.Vector, fvti.Mask, fvti.SEW,
+ fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW,
fvti.LMul, fvti.RegClass, fwti.RegClass>;
}
}
@@ -3082,16 +3251,16 @@
foreach eew = EEWList in {
foreach vti = vtilist in {
if !or(!eq(vti.SEW, 32), !eq(vti.SEW, 64)) then {
- defvar octuple_lmul = octuple_from_str<vti.LMul.MX>.ret;
+ defvar octuple_lmul = vti.LMul.octuple;
// Calculate emul = eew * lmul / sew
- defvar octuple_emul = !srl(!mul(eew, octuple_lmul), shift_amount<vti.SEW>.val);
+ defvar octuple_emul = !srl(!mul(eew, octuple_lmul), vti.Log2SEW);
if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
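      // Illustrative arithmetic for this guard: with SEW=32 (Log2SEW=5),
      // LMUL=1 (octuple_lmul=8) and EEW=64, octuple_emul is (64*8) >> 5 = 16,
      // i.e. EMUL=2; a combination such as LMUL=8 with EEW=64 would give 128
      // and is skipped.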
defvar emulMX = octuple_to_str<octuple_emul>.ret;
defvar offsetVti = !cast<VTypeInfo>("VI" # eew # emulMX);
defvar inst_ei = inst # "EI" # eew;
defm : VPatAMOWD<intrinsic, inst_ei,
vti.Vector, offsetVti.Vector,
- vti.Mask, vti.SEW, vti.LMul, offsetVti.LMul, offsetVti.RegClass>;
+ vti.Mask, vti.Log2SEW, vti.LMul, offsetVti.LMul, offsetVti.RegClass>;
}
}
}
@@ -3121,8 +3290,35 @@
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 1,
Uses = [VL] in
-def PseudoReadVL : Pseudo<(outs GPR:$rd), (ins),
- [(set GPR:$rd, (riscv_read_vl))]>;
+def PseudoReadVL : Pseudo<(outs GPR:$rd), (ins), []>;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 1, isCodeGenOnly = 1 in {
+ def PseudoVSPILL_M1 : VPseudo<VS1R_V, V_M1, (outs), (ins VR:$rs1, GPR:$rs2)>;
+ def PseudoVSPILL_M2 : VPseudo<VS2R_V, V_M2, (outs), (ins VRM2:$rs1, GPR:$rs2)>;
+ def PseudoVSPILL_M4 : VPseudo<VS4R_V, V_M4, (outs), (ins VRM4:$rs1, GPR:$rs2)>;
+ def PseudoVSPILL_M8 : VPseudo<VS8R_V, V_M8, (outs), (ins VRM8:$rs1, GPR:$rs2)>;
+}
+
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 1 in {
+ def PseudoVRELOAD_M1 : VPseudo<VL1RE8_V, V_M1, (outs VR:$rs1), (ins GPR:$rs2)>;
+ def PseudoVRELOAD_M2 : VPseudo<VL2RE8_V, V_M2, (outs VRM2:$rs1), (ins GPR:$rs2)>;
+ def PseudoVRELOAD_M4 : VPseudo<VL4RE8_V, V_M4, (outs VRM4:$rs1), (ins GPR:$rs2)>;
+ def PseudoVRELOAD_M8 : VPseudo<VL8RE8_V, V_M8, (outs VRM8:$rs1), (ins GPR:$rs2)>;
+}
+
+foreach lmul = MxList.m in {
+ foreach nf = NFSet<lmul>.L in {
+ defvar vreg = SegRegClass<lmul, nf>.RC;
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1, isCodeGenOnly = 1 in {
+ def "PseudoVSPILL" # nf # "_" # lmul.MX :
+ Pseudo<(outs), (ins vreg:$rs1, GPR:$rs2, GPR:$vlenb), []>;
+ }
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 1 in {
+ def "PseudoVRELOAD" # nf # "_" # lmul.MX :
+ Pseudo<(outs vreg:$rs1), (ins GPR:$rs2, GPR:$vlenb), []>;
+ }
+ }
+}
//===----------------------------------------------------------------------===//
// 6. Configuration-Setting Instructions
@@ -3143,10 +3339,8 @@
//===----------------------------------------------------------------------===//
// Pseudos Unit-Stride Loads and Stores
-foreach eew = EEWList in {
- defm PseudoVLE # eew : VPseudoUSLoad;
- defm PseudoVSE # eew : VPseudoUSStore;
-}
+defm PseudoVL : VPseudoUSLoad</*isFF=*/false>;
+defm PseudoVS : VPseudoUSStore;
defm PseudoVLE1 : VPseudoLoadMask;
defm PseudoVSE1 : VPseudoStoreMask;
@@ -3156,22 +3350,18 @@
//===----------------------------------------------------------------------===//
// Vector Strided Loads and Stores
-foreach eew = EEWList in {
- defm PseudoVLSE # eew : VPseudoSLoad;
- defm PseudoVSSE # eew : VPseudoSStore;
-}
+defm PseudoVLS : VPseudoSLoad;
+defm PseudoVSS : VPseudoSStore;
//===----------------------------------------------------------------------===//
// 7.6 Vector Indexed Instructions
//===----------------------------------------------------------------------===//
// Vector Indexed Loads and Stores
-foreach eew = EEWList in {
- defm PseudoVLUXEI # eew : VPseudoILoad;
- defm PseudoVLOXEI # eew : VPseudoILoad;
- defm PseudoVSOXEI # eew : VPseudoIStore;
- defm PseudoVSUXEI # eew : VPseudoIStore;
-}
+defm PseudoVLUX : VPseudoILoad</*Ordered=*/false>;
+defm PseudoVLOX : VPseudoILoad</*Ordered=*/true>;
+defm PseudoVSOX : VPseudoIStore</*Ordered=*/true>;
+defm PseudoVSUX : VPseudoIStore</*Ordered=*/false>;
//===----------------------------------------------------------------------===//
// 7.7. Unit-stride Fault-Only-First Loads
@@ -3179,25 +3369,23 @@
// vleff may update VL register
let hasSideEffects = 1, Defs = [VL] in
-foreach eew = EEWList in {
- defm PseudoVLE # eew # FF : VPseudoUSLoad;
-}
+defm PseudoVL : VPseudoUSLoad</*isFF=*/true>;
//===----------------------------------------------------------------------===//
// 7.8. Vector Load/Store Segment Instructions
//===----------------------------------------------------------------------===//
-defm PseudoVLSEG : VPseudoUSSegLoad</*fault-only-first*/false>;
+defm PseudoVLSEG : VPseudoUSSegLoad</*isFF=*/false>;
defm PseudoVLSSEG : VPseudoSSegLoad;
-defm PseudoVLOXSEG : VPseudoISegLoad;
-defm PseudoVLUXSEG : VPseudoISegLoad;
+defm PseudoVLOXSEG : VPseudoISegLoad</*Ordered=*/true>;
+defm PseudoVLUXSEG : VPseudoISegLoad</*Ordered=*/false>;
defm PseudoVSSEG : VPseudoUSSegStore;
defm PseudoVSSSEG : VPseudoSSegStore;
-defm PseudoVSOXSEG : VPseudoISegStore;
-defm PseudoVSUXSEG : VPseudoISegStore;
+defm PseudoVSOXSEG : VPseudoISegStore</*Ordered=*/true>;
+defm PseudoVSUXSEG : VPseudoISegStore</*Ordered=*/false>;
// vlseg<nf>e<eew>ff.v may update VL register
let hasSideEffects = 1, Defs = [VL] in
-defm PseudoVLSEG : VPseudoUSSegLoad</*fault-only-first*/true>;
+defm PseudoVLSEG : VPseudoUSSegLoad</*isFF=*/true>;
//===----------------------------------------------------------------------===//
// 8. Vector AMO Operations
@@ -3223,6 +3411,53 @@
defm PseudoVSUB : VPseudoBinaryV_VV_VX;
defm PseudoVRSUB : VPseudoBinaryV_VX_VI;
+foreach vti = AllIntegerVectors in {
+ // Match vrsub with 2 vector operands to vsub.vv by swapping operands. This
+  // occurs when legalizing vrsub.vx intrinsics for i64 on RV32 since we need
+ // to use a more complex splat sequence. Add the pattern for all VTs for
+ // consistency.
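+  // Illustrative reading (a sketch, not an additional pattern): vrsub
+  // computes (second operand) - (first operand) elementwise, so once the
+  // i64 scalar has been splatted, vrsub(a, splat(x)) equals splat(x) - a,
+  // which is vsub.vv with its two sources exchanged; the patterns below
+  // simply pass the operands to PseudoVSUB_VV in swapped order.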
+ def : Pat<(vti.Vector (int_riscv_vrsub (vti.Vector vti.RegClass:$rs2),
+ (vti.Vector vti.RegClass:$rs1),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVSUB_VV_"#vti.LMul.MX) vti.RegClass:$rs1,
+ vti.RegClass:$rs2,
+ GPR:$vl,
+ vti.Log2SEW)>;
+ def : Pat<(vti.Vector (int_riscv_vrsub_mask (vti.Vector vti.RegClass:$merge),
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Mask V0),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVSUB_VV_"#vti.LMul.MX#"_MASK")
+ vti.RegClass:$merge,
+ vti.RegClass:$rs1,
+ vti.RegClass:$rs2,
+ (vti.Mask V0),
+ GPR:$vl,
+ vti.Log2SEW)>;
+
+ // Match VSUB with a small immediate to vadd.vi by negating the immediate.
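+  // For example (illustrative constant): an int_riscv_vsub whose scalar
+  // operand is the constant 5 becomes vadd.vi with immediate -5 (NegImm),
+  // since x - 5 == x + (-5).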
+ def : Pat<(vti.Vector (int_riscv_vsub (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar simm5_plus1:$rs2),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVADD_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
+ (NegImm simm5_plus1:$rs2),
+ GPR:$vl,
+ vti.Log2SEW)>;
+ def : Pat<(vti.Vector (int_riscv_vsub_mask (vti.Vector vti.RegClass:$merge),
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar simm5_plus1:$rs2),
+ (vti.Mask V0),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVADD_VI_"#vti.LMul.MX#"_MASK")
+ vti.RegClass:$merge,
+ vti.RegClass:$rs1,
+ (NegImm simm5_plus1:$rs2),
+ (vti.Mask V0),
+ GPR:$vl,
+ vti.Log2SEW)>;
+}
+
//===----------------------------------------------------------------------===//
// 12.2. Vector Widening Integer Add/Subtract
//===----------------------------------------------------------------------===//
@@ -3336,12 +3571,12 @@
defm PseudoVWMACCUS : VPseudoTernaryW_VX;
//===----------------------------------------------------------------------===//
-// 12.16. Vector Integer Merge Instructions
+// 12.15. Vector Integer Merge Instructions
//===----------------------------------------------------------------------===//
defm PseudoVMERGE : VPseudoBinaryV_VM_XM_IM;
//===----------------------------------------------------------------------===//
-// 12.17. Vector Integer Move Instructions
+// 12.16. Vector Integer Move Instructions
//===----------------------------------------------------------------------===//
defm PseudoVMV_V : VPseudoUnaryV_V_X_I_NoDummyMask;
@@ -3358,7 +3593,7 @@
//===----------------------------------------------------------------------===//
// 13.2. Vector Single-Width Averaging Add and Subtract
//===----------------------------------------------------------------------===//
-let Uses = [VL, VTYPE, VXRM], hasSideEffects = 1 in {
+let Uses = [VXRM], hasSideEffects = 1 in {
defm PseudoVAADDU : VPseudoBinaryV_VV_VX;
defm PseudoVAADD : VPseudoBinaryV_VV_VX;
defm PseudoVASUBU : VPseudoBinaryV_VV_VX;
@@ -3368,14 +3603,14 @@
//===----------------------------------------------------------------------===//
// 13.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
//===----------------------------------------------------------------------===//
-let Uses = [VL, VTYPE, VXRM], Defs = [VXSAT], hasSideEffects = 1 in {
+let Uses = [VXRM], Defs = [VXSAT], hasSideEffects = 1 in {
defm PseudoVSMUL : VPseudoBinaryV_VV_VX;
}
//===----------------------------------------------------------------------===//
// 13.4. Vector Single-Width Scaling Shift Instructions
//===----------------------------------------------------------------------===//
-let Uses = [VL, VTYPE, VXRM], hasSideEffects = 1 in {
+let Uses = [VXRM], hasSideEffects = 1 in {
defm PseudoVSSRL : VPseudoBinaryV_VV_VX_VI<uimm5>;
defm PseudoVSSRA : VPseudoBinaryV_VV_VX_VI<uimm5>;
}
@@ -3383,7 +3618,7 @@
//===----------------------------------------------------------------------===//
// 13.5. Vector Narrowing Fixed-Point Clip Instructions
//===----------------------------------------------------------------------===//
-let Uses = [VL, VTYPE, VXRM], Defs = [VXSAT], hasSideEffects = 1 in {
+let Uses = [VXRM], Defs = [VXSAT], hasSideEffects = 1 in {
defm PseudoVNCLIP : VPseudoBinaryV_WV_WX_WI;
defm PseudoVNCLIPU : VPseudoBinaryV_WV_WX_WI;
}
@@ -3579,7 +3814,7 @@
defm PseudoVMORNOT: VPseudoBinaryM_MM;
defm PseudoVMXNOR: VPseudoBinaryM_MM;
-// Pseudo insturctions
+// Pseudo instructions
defm PseudoVMCLR : VPseudoNullaryPseudoM<"VMXOR">;
defm PseudoVMSET : VPseudoNullaryPseudoM<"VMXNOR">;
@@ -3629,19 +3864,18 @@
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtV] in {
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0, usesCustomInserter = 1,
- Uses = [VL, VTYPE] in {
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
foreach m = MxList.m in {
let VLMul = m.value in {
let HasSEWOp = 1, BaseInstr = VMV_X_S in
def PseudoVMV_X_S # "_" # m.MX: Pseudo<(outs GPR:$rd),
(ins m.vrclass:$rs2, ixlenimm:$sew),
[]>, RISCVVPseudo;
- let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, WritesElement0 = 1,
+ let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X,
Constraints = "$rd = $rs1" in
def PseudoVMV_S_X # "_" # m.MX: Pseudo<(outs m.vrclass:$rd),
(ins m.vrclass:$rs1, GPR:$rs2,
- GPR:$vl, ixlenimm:$sew),
+ AVL:$vl, ixlenimm:$sew),
[]>, RISCVVPseudo;
}
}
@@ -3653,8 +3887,7 @@
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtV, HasStdExtF] in {
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0, usesCustomInserter = 1,
- Uses = [VL, VTYPE] in {
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
foreach m = MxList.m in {
foreach f = FPList.fpinfo in {
let VLMul = m.value in {
@@ -3664,12 +3897,12 @@
(ins m.vrclass:$rs2,
ixlenimm:$sew),
[]>, RISCVVPseudo;
- let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, WritesElement0 = 1,
+ let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F,
Constraints = "$rd = $rs1" in
def "PseudoVFMV_S_" # f.FX # "_" # m.MX :
Pseudo<(outs m.vrclass:$rd),
(ins m.vrclass:$rs1, f.fprclass:$rs2,
- GPR:$vl, ixlenimm:$sew),
+ AVL:$vl, ixlenimm:$sew),
[]>, RISCVVPseudo;
}
}
@@ -3706,109 +3939,24 @@
//===----------------------------------------------------------------------===//
// Patterns.
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
-
-//===----------------------------------------------------------------------===//
-// 7. Vector Loads and Stores
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// 7.4 Vector Unit-Stride Instructions
-//===----------------------------------------------------------------------===//
-
-foreach vti = AllVectors in
-{
- defm : VPatUSLoad<"int_riscv_vle",
- "PseudoVLE" # vti.SEW,
- vti.Vector, vti.Mask, vti.SEW, vti.LMul, vti.RegClass>;
- defm : VPatUSLoadFF<"PseudoVLE" # vti.SEW # "FF",
- vti.Vector, vti.Mask, vti.SEW, vti.LMul, vti.RegClass>;
- defm : VPatUSStore<"int_riscv_vse",
- "PseudoVSE" # vti.SEW,
- vti.Vector, vti.Mask, vti.SEW, vti.LMul, vti.RegClass>;
-}
-
-foreach vti = AllMasks in {
- defvar PseudoVLE1 = !cast<Instruction>("PseudoVLE1_V_"#vti.BX);
- def : Pat<(vti.Mask (int_riscv_vle1 GPR:$rs1, (XLenVT (VLOp GPR:$vl)))),
- (PseudoVLE1 $rs1, GPR:$vl, vti.SEW)>;
- defvar PseudoVSE1 = !cast<Instruction>("PseudoVSE1_V_"#vti.BX);
- def : Pat<(int_riscv_vse1 (vti.Mask VR:$rs3), GPR:$rs1, (XLenVT (VLOp GPR:$vl))),
- (PseudoVSE1 $rs3, $rs1, GPR:$vl, vti.SEW)>;
-}
-
-//===----------------------------------------------------------------------===//
-// 7.5 Vector Strided Instructions
-//===----------------------------------------------------------------------===//
-
-foreach vti = AllVectors in
-{
- defm : VPatSLoad<"int_riscv_vlse",
- "PseudoVLSE" # vti.SEW,
- vti.Vector, vti.Mask, vti.SEW, vti.LMul, vti.RegClass>;
- defm : VPatSStore<"int_riscv_vsse",
- "PseudoVSSE" # vti.SEW,
- vti.Vector, vti.Mask, vti.SEW, vti.LMul, vti.RegClass>;
-}
-
-//===----------------------------------------------------------------------===//
-// 7.6 Vector Indexed Instructions
-//===----------------------------------------------------------------------===//
-
-foreach vti = AllVectors in
-foreach eew = EEWList in {
- defvar vlmul = vti.LMul;
- defvar octuple_lmul = octuple_from_str<vti.LMul.MX>.ret;
- defvar log_sew = shift_amount<vti.SEW>.val;
- // The data vector register group has EEW=SEW, EMUL=LMUL, while the offset
- // vector register group has EEW encoding in the instruction and EMUL=(EEW/SEW)*LMUL.
- // calculate octuple elmul which is (eew * octuple_lmul) >> log_sew
- defvar octuple_elmul = !srl(!mul(eew, octuple_lmul), log_sew);
- // legal octuple elmul should be more than 0 and less than equal 64
- if !gt(octuple_elmul, 0) then {
- if !le(octuple_elmul, 64) then {
- defvar elmul_str = octuple_to_str<octuple_elmul>.ret;
- defvar elmul =!cast<LMULInfo>("V_" # elmul_str);
- defvar idx_vti = !cast<VTypeInfo>("VI" # eew # elmul_str);
-
- defm : VPatILoad<"int_riscv_vluxei",
- "PseudoVLUXEI"#eew,
- vti.Vector, idx_vti.Vector, vti.Mask, vti.SEW,
- vlmul, elmul, vti.RegClass, idx_vti.RegClass>;
- defm : VPatILoad<"int_riscv_vloxei",
- "PseudoVLOXEI"#eew,
- vti.Vector, idx_vti.Vector, vti.Mask, vti.SEW,
- vlmul, elmul, vti.RegClass, idx_vti.RegClass>;
- defm : VPatIStore<"int_riscv_vsoxei",
- "PseudoVSOXEI"#eew,
- vti.Vector, idx_vti.Vector, vti.Mask, vti.SEW,
- vlmul, elmul, vti.RegClass, idx_vti.RegClass>;
- defm : VPatIStore<"int_riscv_vsuxei",
- "PseudoVSUXEI"#eew,
- vti.Vector, idx_vti.Vector, vti.Mask, vti.SEW,
- vlmul, elmul, vti.RegClass, idx_vti.RegClass>;
- }
- }
-}
-} // Predicates = [HasStdExtV]
//===----------------------------------------------------------------------===//
// 8. Vector AMO Operations
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtZvamo] in {
- defm "" : VPatAMOV_WD<"int_riscv_vamoswap", "PseudoVAMOSWAP", AllIntegerVectors>;
- defm "" : VPatAMOV_WD<"int_riscv_vamoadd", "PseudoVAMOADD", AllIntegerVectors>;
- defm "" : VPatAMOV_WD<"int_riscv_vamoxor", "PseudoVAMOXOR", AllIntegerVectors>;
- defm "" : VPatAMOV_WD<"int_riscv_vamoand", "PseudoVAMOAND", AllIntegerVectors>;
- defm "" : VPatAMOV_WD<"int_riscv_vamoor", "PseudoVAMOOR", AllIntegerVectors>;
- defm "" : VPatAMOV_WD<"int_riscv_vamomin", "PseudoVAMOMIN", AllIntegerVectors>;
- defm "" : VPatAMOV_WD<"int_riscv_vamomax", "PseudoVAMOMAX", AllIntegerVectors>;
- defm "" : VPatAMOV_WD<"int_riscv_vamominu", "PseudoVAMOMINU", AllIntegerVectors>;
- defm "" : VPatAMOV_WD<"int_riscv_vamomaxu", "PseudoVAMOMAXU", AllIntegerVectors>;
+ defm : VPatAMOV_WD<"int_riscv_vamoswap", "PseudoVAMOSWAP", AllIntegerVectors>;
+ defm : VPatAMOV_WD<"int_riscv_vamoadd", "PseudoVAMOADD", AllIntegerVectors>;
+ defm : VPatAMOV_WD<"int_riscv_vamoxor", "PseudoVAMOXOR", AllIntegerVectors>;
+ defm : VPatAMOV_WD<"int_riscv_vamoand", "PseudoVAMOAND", AllIntegerVectors>;
+ defm : VPatAMOV_WD<"int_riscv_vamoor", "PseudoVAMOOR", AllIntegerVectors>;
+ defm : VPatAMOV_WD<"int_riscv_vamomin", "PseudoVAMOMIN", AllIntegerVectors>;
+ defm : VPatAMOV_WD<"int_riscv_vamomax", "PseudoVAMOMAX", AllIntegerVectors>;
+ defm : VPatAMOV_WD<"int_riscv_vamominu", "PseudoVAMOMINU", AllIntegerVectors>;
+ defm : VPatAMOV_WD<"int_riscv_vamomaxu", "PseudoVAMOMAXU", AllIntegerVectors>;
} // Predicates = [HasStdExtZvamo]
let Predicates = [HasStdExtZvamo, HasStdExtF] in {
- defm "" : VPatAMOV_WD<"int_riscv_vamoswap", "PseudoVAMOSWAP", AllFloatVectors>;
+ defm : VPatAMOV_WD<"int_riscv_vamoswap", "PseudoVAMOSWAP", AllFloatVectors>;
} // Predicates = [HasStdExtZvamo, HasStdExtF]
//===----------------------------------------------------------------------===//
@@ -3819,256 +3967,317 @@
//===----------------------------------------------------------------------===//
// 12.1. Vector Single-Width Integer Add and Subtract
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vadd", "PseudoVADD", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vsub", "PseudoVSUB", AllIntegerVectors>;
-defm "" : VPatBinaryV_VX_VI<"int_riscv_vrsub", "PseudoVRSUB", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX_VI<"int_riscv_vadd", "PseudoVADD", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vsub", "PseudoVSUB", AllIntegerVectors>;
+defm : VPatBinaryV_VX_VI<"int_riscv_vrsub", "PseudoVRSUB", AllIntegerVectors>;
//===----------------------------------------------------------------------===//
// 12.2. Vector Widening Integer Add/Subtract
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryW_VV_VX<"int_riscv_vwaddu", "PseudoVWADDU", AllWidenableIntVectors>;
-defm "" : VPatBinaryW_VV_VX<"int_riscv_vwsubu", "PseudoVWSUBU", AllWidenableIntVectors>;
-defm "" : VPatBinaryW_VV_VX<"int_riscv_vwadd", "PseudoVWADD", AllWidenableIntVectors>;
-defm "" : VPatBinaryW_VV_VX<"int_riscv_vwsub", "PseudoVWSUB", AllWidenableIntVectors>;
-defm "" : VPatBinaryW_WV_WX<"int_riscv_vwaddu_w", "PseudoVWADDU", AllWidenableIntVectors>;
-defm "" : VPatBinaryW_WV_WX<"int_riscv_vwsubu_w", "PseudoVWSUBU", AllWidenableIntVectors>;
-defm "" : VPatBinaryW_WV_WX<"int_riscv_vwadd_w", "PseudoVWADD", AllWidenableIntVectors>;
-defm "" : VPatBinaryW_WV_WX<"int_riscv_vwsub_w", "PseudoVWSUB", AllWidenableIntVectors>;
+defm : VPatBinaryW_VV_VX<"int_riscv_vwaddu", "PseudoVWADDU", AllWidenableIntVectors>;
+defm : VPatBinaryW_VV_VX<"int_riscv_vwsubu", "PseudoVWSUBU", AllWidenableIntVectors>;
+defm : VPatBinaryW_VV_VX<"int_riscv_vwadd", "PseudoVWADD", AllWidenableIntVectors>;
+defm : VPatBinaryW_VV_VX<"int_riscv_vwsub", "PseudoVWSUB", AllWidenableIntVectors>;
+defm : VPatBinaryW_WV_WX<"int_riscv_vwaddu_w", "PseudoVWADDU", AllWidenableIntVectors>;
+defm : VPatBinaryW_WV_WX<"int_riscv_vwsubu_w", "PseudoVWSUBU", AllWidenableIntVectors>;
+defm : VPatBinaryW_WV_WX<"int_riscv_vwadd_w", "PseudoVWADD", AllWidenableIntVectors>;
+defm : VPatBinaryW_WV_WX<"int_riscv_vwsub_w", "PseudoVWSUB", AllWidenableIntVectors>;
//===----------------------------------------------------------------------===//
// 12.3. Vector Integer Extension
//===----------------------------------------------------------------------===//
-defm "" : VPatUnaryV_VF<"int_riscv_vzext", "PseudoVZEXT", "VF2",
- AllFractionableVF2IntVectors>;
-defm "" : VPatUnaryV_VF<"int_riscv_vzext", "PseudoVZEXT", "VF4",
- AllFractionableVF4IntVectors>;
-defm "" : VPatUnaryV_VF<"int_riscv_vzext", "PseudoVZEXT", "VF8",
- AllFractionableVF8IntVectors>;
-defm "" : VPatUnaryV_VF<"int_riscv_vsext", "PseudoVSEXT", "VF2",
- AllFractionableVF2IntVectors>;
-defm "" : VPatUnaryV_VF<"int_riscv_vsext", "PseudoVSEXT", "VF4",
- AllFractionableVF4IntVectors>;
-defm "" : VPatUnaryV_VF<"int_riscv_vsext", "PseudoVSEXT", "VF8",
- AllFractionableVF8IntVectors>;
+defm : VPatUnaryV_VF<"int_riscv_vzext", "PseudoVZEXT", "VF2",
+ AllFractionableVF2IntVectors>;
+defm : VPatUnaryV_VF<"int_riscv_vzext", "PseudoVZEXT", "VF4",
+ AllFractionableVF4IntVectors>;
+defm : VPatUnaryV_VF<"int_riscv_vzext", "PseudoVZEXT", "VF8",
+ AllFractionableVF8IntVectors>;
+defm : VPatUnaryV_VF<"int_riscv_vsext", "PseudoVSEXT", "VF2",
+ AllFractionableVF2IntVectors>;
+defm : VPatUnaryV_VF<"int_riscv_vsext", "PseudoVSEXT", "VF4",
+ AllFractionableVF4IntVectors>;
+defm : VPatUnaryV_VF<"int_riscv_vsext", "PseudoVSEXT", "VF8",
+ AllFractionableVF8IntVectors>;
//===----------------------------------------------------------------------===//
// 12.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VM_XM_IM<"int_riscv_vadc", "PseudoVADC">;
-defm "" : VPatBinaryM_VM_XM_IM<"int_riscv_vmadc_carry_in", "PseudoVMADC">;
-defm "" : VPatBinaryM_V_X_I<"int_riscv_vmadc", "PseudoVMADC">;
+defm : VPatBinaryV_VM_XM_IM<"int_riscv_vadc", "PseudoVADC">;
+defm : VPatBinaryM_VM_XM_IM<"int_riscv_vmadc_carry_in", "PseudoVMADC">;
+defm : VPatBinaryM_V_X_I<"int_riscv_vmadc", "PseudoVMADC">;
-defm "" : VPatBinaryV_VM_XM<"int_riscv_vsbc", "PseudoVSBC">;
-defm "" : VPatBinaryM_VM_XM<"int_riscv_vmsbc_borrow_in", "PseudoVMSBC">;
-defm "" : VPatBinaryM_V_X<"int_riscv_vmsbc", "PseudoVMSBC">;
+defm : VPatBinaryV_VM_XM<"int_riscv_vsbc", "PseudoVSBC">;
+defm : VPatBinaryM_VM_XM<"int_riscv_vmsbc_borrow_in", "PseudoVMSBC">;
+defm : VPatBinaryM_V_X<"int_riscv_vmsbc", "PseudoVMSBC">;
//===----------------------------------------------------------------------===//
// 12.5. Vector Bitwise Logical Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vand", "PseudoVAND", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vor", "PseudoVOR", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vxor", "PseudoVXOR", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX_VI<"int_riscv_vand", "PseudoVAND", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX_VI<"int_riscv_vor", "PseudoVOR", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX_VI<"int_riscv_vxor", "PseudoVXOR", AllIntegerVectors>;
//===----------------------------------------------------------------------===//
// 12.6. Vector Single-Width Bit Shift Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vsll", "PseudoVSLL", AllIntegerVectors,
- uimm5>;
-defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vsrl", "PseudoVSRL", AllIntegerVectors,
- uimm5>;
-defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vsra", "PseudoVSRA", AllIntegerVectors,
- uimm5>;
+defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsll", "PseudoVSLL", AllIntegerVectors,
+ uimm5>;
+defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsrl", "PseudoVSRL", AllIntegerVectors,
+ uimm5>;
+defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsra", "PseudoVSRA", AllIntegerVectors,
+ uimm5>;
//===----------------------------------------------------------------------===//
// 12.7. Vector Narrowing Integer Right Shift Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_WV_WX_WI<"int_riscv_vnsrl", "PseudoVNSRL", AllWidenableIntVectors>;
-defm "" : VPatBinaryV_WV_WX_WI<"int_riscv_vnsra", "PseudoVNSRA", AllWidenableIntVectors>;
+defm : VPatBinaryV_WV_WX_WI<"int_riscv_vnsrl", "PseudoVNSRL", AllWidenableIntVectors>;
+defm : VPatBinaryV_WV_WX_WI<"int_riscv_vnsra", "PseudoVNSRA", AllWidenableIntVectors>;
//===----------------------------------------------------------------------===//
// 12.8. Vector Integer Comparison Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryM_VV_VX_VI<"int_riscv_vmseq", "PseudoVMSEQ", AllIntegerVectors>;
-defm "" : VPatBinaryM_VV_VX_VI<"int_riscv_vmsne", "PseudoVMSNE", AllIntegerVectors>;
-defm "" : VPatBinaryM_VV_VX<"int_riscv_vmsltu", "PseudoVMSLTU", AllIntegerVectors>;
-defm "" : VPatBinaryM_VV_VX<"int_riscv_vmslt", "PseudoVMSLT", AllIntegerVectors>;
-defm "" : VPatBinaryM_VV_VX_VI<"int_riscv_vmsleu", "PseudoVMSLEU", AllIntegerVectors>;
-defm "" : VPatBinaryM_VV_VX_VI<"int_riscv_vmsle", "PseudoVMSLE", AllIntegerVectors>;
+defm : VPatBinaryM_VV_VX_VI<"int_riscv_vmseq", "PseudoVMSEQ", AllIntegerVectors>;
+defm : VPatBinaryM_VV_VX_VI<"int_riscv_vmsne", "PseudoVMSNE", AllIntegerVectors>;
+defm : VPatBinaryM_VV_VX<"int_riscv_vmsltu", "PseudoVMSLTU", AllIntegerVectors>;
+defm : VPatBinaryM_VV_VX<"int_riscv_vmslt", "PseudoVMSLT", AllIntegerVectors>;
+defm : VPatBinaryM_VV_VX_VI<"int_riscv_vmsleu", "PseudoVMSLEU", AllIntegerVectors>;
+defm : VPatBinaryM_VV_VX_VI<"int_riscv_vmsle", "PseudoVMSLE", AllIntegerVectors>;
-defm "" : VPatBinaryM_VX_VI<"int_riscv_vmsgtu", "PseudoVMSGTU", AllIntegerVectors>;
-defm "" : VPatBinaryM_VX_VI<"int_riscv_vmsgt", "PseudoVMSGT", AllIntegerVectors>;
+defm : VPatBinaryM_VX_VI<"int_riscv_vmsgtu", "PseudoVMSGTU", AllIntegerVectors>;
+defm : VPatBinaryM_VX_VI<"int_riscv_vmsgt", "PseudoVMSGT", AllIntegerVectors>;
+
+// Match vmsgt with 2 vector operands to vmslt with the operands swapped.
+defm : VPatBinarySwappedM_VV<"int_riscv_vmsgtu", "PseudoVMSLTU", AllIntegerVectors>;
+defm : VPatBinarySwappedM_VV<"int_riscv_vmsgt", "PseudoVMSLT", AllIntegerVectors>;
+
+defm : VPatBinarySwappedM_VV<"int_riscv_vmsgeu", "PseudoVMSLEU", AllIntegerVectors>;
+defm : VPatBinarySwappedM_VV<"int_riscv_vmsge", "PseudoVMSLE", AllIntegerVectors>;
// Match vmslt(u).vx intrinsics to vmsle(u).vi if the scalar is -15 to 16. This
// avoids the user needing to know that there is no vmslt(u).vi instruction.
-// This is limited to vmslt(u).vx as there is no vmsge().vx intrinsic or
-// instruction.
+// Similar for vmsge(u).vx intrinsics using vmsgt(u).vi.
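+// Worked instance (illustrative constant): vmslt.vx against the constant 7
+// is the predicate x < 7, i.e. x <= 6, so it maps to vmsle.vi with
+// DecImm(7) = 6; likewise vmsge.vx against 7 is x > 6 and maps to
+// vmsgt.vi 6.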
foreach vti = AllIntegerVectors in {
def : Pat<(vti.Mask (int_riscv_vmslt (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar simm5_plus1:$rs2), (XLenVT (VLOp GPR:$vl)))),
+ (vti.Scalar simm5_plus1:$rs2),
+ VLOpFrag)),
(!cast<Instruction>("PseudoVMSLE_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
(DecImm simm5_plus1:$rs2),
GPR:$vl,
- vti.SEW)>;
+ vti.Log2SEW)>;
def : Pat<(vti.Mask (int_riscv_vmslt_mask (vti.Mask VR:$merge),
(vti.Vector vti.RegClass:$rs1),
(vti.Scalar simm5_plus1:$rs2),
(vti.Mask V0),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>("PseudoVMSLE_VI_"#vti.LMul.MX#"_MASK")
VR:$merge,
vti.RegClass:$rs1,
(DecImm simm5_plus1:$rs2),
(vti.Mask V0),
GPR:$vl,
- vti.SEW)>;
+ vti.Log2SEW)>;
def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1),
(vti.Scalar simm5_plus1:$rs2),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>("PseudoVMSLEU_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
(DecImm simm5_plus1:$rs2),
GPR:$vl,
- vti.SEW)>;
+ vti.Log2SEW)>;
def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask VR:$merge),
(vti.Vector vti.RegClass:$rs1),
(vti.Scalar simm5_plus1:$rs2),
(vti.Mask V0),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>("PseudoVMSLEU_VI_"#vti.LMul.MX#"_MASK")
VR:$merge,
vti.RegClass:$rs1,
(DecImm simm5_plus1:$rs2),
(vti.Mask V0),
GPR:$vl,
- vti.SEW)>;
+ vti.Log2SEW)>;
// Special cases to avoid matching vmsltu.vi 0 (always false) to
// vmsleu.vi -1 (always true). Instead match to vmsne.vv.
def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar 0), (XLenVT (VLOp GPR:$vl)))),
+ (vti.Scalar 0), VLOpFrag)),
(!cast<Instruction>("PseudoVMSNE_VV_"#vti.LMul.MX) vti.RegClass:$rs1,
vti.RegClass:$rs1,
GPR:$vl,
- vti.SEW)>;
+ vti.Log2SEW)>;
def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask VR:$merge),
(vti.Vector vti.RegClass:$rs1),
(vti.Scalar 0),
(vti.Mask V0),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>("PseudoVMSNE_VV_"#vti.LMul.MX#"_MASK")
VR:$merge,
vti.RegClass:$rs1,
vti.RegClass:$rs1,
(vti.Mask V0),
GPR:$vl,
- vti.SEW)>;
+ vti.Log2SEW)>;
+
+ def : Pat<(vti.Mask (int_riscv_vmsge (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar simm5_plus1:$rs2),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMSGT_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
+ (DecImm simm5_plus1:$rs2),
+ GPR:$vl,
+ vti.Log2SEW)>;
+ def : Pat<(vti.Mask (int_riscv_vmsge_mask (vti.Mask VR:$merge),
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar simm5_plus1:$rs2),
+ (vti.Mask V0),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMSGT_VI_"#vti.LMul.MX#"_MASK")
+ VR:$merge,
+ vti.RegClass:$rs1,
+ (DecImm simm5_plus1:$rs2),
+ (vti.Mask V0),
+ GPR:$vl,
+ vti.Log2SEW)>;
+
+ def : Pat<(vti.Mask (int_riscv_vmsgeu (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar simm5_plus1:$rs2),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMSGTU_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
+ (DecImm simm5_plus1:$rs2),
+ GPR:$vl,
+ vti.Log2SEW)>;
+ def : Pat<(vti.Mask (int_riscv_vmsgeu_mask (vti.Mask VR:$merge),
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar simm5_plus1:$rs2),
+ (vti.Mask V0),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMSGTU_VI_"#vti.LMul.MX#"_MASK")
+ VR:$merge,
+ vti.RegClass:$rs1,
+ (DecImm simm5_plus1:$rs2),
+ (vti.Mask V0),
+ GPR:$vl,
+ vti.Log2SEW)>;
+
+ // Special cases to avoid matching vmsgeu.vi 0 (always true) to
+  // vmsgtu.vi -1 (always false). Instead match to vmseq.vv.
+ def : Pat<(vti.Mask (int_riscv_vmsgeu (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar 0), VLOpFrag)),
+ (!cast<Instruction>("PseudoVMSEQ_VV_"#vti.LMul.MX) vti.RegClass:$rs1,
+ vti.RegClass:$rs1,
+ GPR:$vl,
+ vti.Log2SEW)>;
+ def : Pat<(vti.Mask (int_riscv_vmsgeu_mask (vti.Mask VR:$merge),
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar 0),
+ (vti.Mask V0),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMSEQ_VV_"#vti.LMul.MX#"_MASK")
+ VR:$merge,
+ vti.RegClass:$rs1,
+ vti.RegClass:$rs1,
+ (vti.Mask V0),
+ GPR:$vl,
+ vti.Log2SEW)>;
}
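// Worked examples for the immediate rewrites above: vmslt.vx with scalar 5
// selects PseudoVMSLE_VI with immediate 4 (x < 5 is x <= 4), and vmsge.vx with
// scalar 5 selects PseudoVMSGT_VI with immediate 4 (x >= 5 is x > 4). The
// unsigned comparisons against 0 skip the DecImm rewrite entirely: vmsltu.vx
// with 0 is always false and selects vmsne.vv vs1, vs1, while vmsgeu.vx with 0
// is always true and selects vmseq.vv vs1, vs1.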
//===----------------------------------------------------------------------===//
// 12.9. Vector Integer Min/Max Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vminu", "PseudoVMINU", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vmin", "PseudoVMIN", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vmaxu", "PseudoVMAXU", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vmax", "PseudoVMAX", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vminu", "PseudoVMINU", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vmin", "PseudoVMIN", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vmaxu", "PseudoVMAXU", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vmax", "PseudoVMAX", AllIntegerVectors>;
//===----------------------------------------------------------------------===//
// 12.10. Vector Single-Width Integer Multiply Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vmul", "PseudoVMUL", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vmulh", "PseudoVMULH", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vmulhu", "PseudoVMULHU", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vmulhsu", "PseudoVMULHSU", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vmul", "PseudoVMUL", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vmulh", "PseudoVMULH", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vmulhu", "PseudoVMULHU", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vmulhsu", "PseudoVMULHSU", AllIntegerVectors>;
//===----------------------------------------------------------------------===//
// 12.11. Vector Integer Divide Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vdivu", "PseudoVDIVU", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vdiv", "PseudoVDIV", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vremu", "PseudoVREMU", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vrem", "PseudoVREM", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vdivu", "PseudoVDIVU", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vdiv", "PseudoVDIV", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vremu", "PseudoVREMU", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vrem", "PseudoVREM", AllIntegerVectors>;
//===----------------------------------------------------------------------===//
// 12.12. Vector Widening Integer Multiply Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryW_VV_VX<"int_riscv_vwmul", "PseudoVWMUL", AllWidenableIntVectors>;
-defm "" : VPatBinaryW_VV_VX<"int_riscv_vwmulu", "PseudoVWMULU", AllWidenableIntVectors>;
-defm "" : VPatBinaryW_VV_VX<"int_riscv_vwmulsu", "PseudoVWMULSU", AllWidenableIntVectors>;
+defm : VPatBinaryW_VV_VX<"int_riscv_vwmul", "PseudoVWMUL", AllWidenableIntVectors>;
+defm : VPatBinaryW_VV_VX<"int_riscv_vwmulu", "PseudoVWMULU", AllWidenableIntVectors>;
+defm : VPatBinaryW_VV_VX<"int_riscv_vwmulsu", "PseudoVWMULSU", AllWidenableIntVectors>;
//===----------------------------------------------------------------------===//
// 12.13. Vector Single-Width Integer Multiply-Add Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vmadd", "PseudoVMADD", AllIntegerVectors>;
-defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vnmsub", "PseudoVNMSUB", AllIntegerVectors>;
-defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vmacc", "PseudoVMACC", AllIntegerVectors>;
-defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vnmsac", "PseudoVNMSAC", AllIntegerVectors>;
+defm : VPatTernaryV_VV_VX_AAXA<"int_riscv_vmadd", "PseudoVMADD", AllIntegerVectors>;
+defm : VPatTernaryV_VV_VX_AAXA<"int_riscv_vnmsub", "PseudoVNMSUB", AllIntegerVectors>;
+defm : VPatTernaryV_VV_VX_AAXA<"int_riscv_vmacc", "PseudoVMACC", AllIntegerVectors>;
+defm : VPatTernaryV_VV_VX_AAXA<"int_riscv_vnmsac", "PseudoVNMSAC", AllIntegerVectors>;
//===----------------------------------------------------------------------===//
// 12.14. Vector Widening Integer Multiply-Add Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatTernaryW_VV_VX<"int_riscv_vwmaccu", "PseudoVWMACCU", AllWidenableIntVectors>;
-defm "" : VPatTernaryW_VV_VX<"int_riscv_vwmacc", "PseudoVWMACC", AllWidenableIntVectors>;
-defm "" : VPatTernaryW_VV_VX<"int_riscv_vwmaccsu", "PseudoVWMACCSU", AllWidenableIntVectors>;
-defm "" : VPatTernaryW_VX<"int_riscv_vwmaccus", "PseudoVWMACCUS", AllWidenableIntVectors>;
+defm : VPatTernaryW_VV_VX<"int_riscv_vwmaccu", "PseudoVWMACCU", AllWidenableIntVectors>;
+defm : VPatTernaryW_VV_VX<"int_riscv_vwmacc", "PseudoVWMACC", AllWidenableIntVectors>;
+defm : VPatTernaryW_VV_VX<"int_riscv_vwmaccsu", "PseudoVWMACCSU", AllWidenableIntVectors>;
+defm : VPatTernaryW_VX<"int_riscv_vwmaccus", "PseudoVWMACCUS", AllWidenableIntVectors>;
//===----------------------------------------------------------------------===//
-// 12.16. Vector Integer Merge Instructions
+// 12.15. Vector Integer Merge Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VM_XM_IM<"int_riscv_vmerge", "PseudoVMERGE">;
+defm : VPatBinaryV_VM_XM_IM<"int_riscv_vmerge", "PseudoVMERGE">;
//===----------------------------------------------------------------------===//
-// 12.17. Vector Integer Move Instructions
+// 12.16. Vector Integer Move Instructions
//===----------------------------------------------------------------------===//
foreach vti = AllVectors in {
def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector vti.RegClass:$rs1),
- (XLenVT (VLOp GPR:$vl)))),
+ VLOpFrag)),
(!cast<Instruction>("PseudoVMV_V_V_"#vti.LMul.MX)
- $rs1, GPR:$vl, vti.SEW)>;
-}
+ $rs1, GPR:$vl, vti.Log2SEW)>;
-foreach vti = AllIntegerVectors in {
- def : Pat<(vti.Vector (int_riscv_vmv_v_x GPR:$rs2, (XLenVT (VLOp GPR:$vl)))),
- (!cast<Instruction>("PseudoVMV_V_X_"#vti.LMul.MX)
- $rs2, GPR:$vl, vti.SEW)>;
- def : Pat<(vti.Vector (int_riscv_vmv_v_x simm5:$imm5, (XLenVT (VLOp GPR:$vl)))),
- (!cast<Instruction>("PseudoVMV_V_I_"#vti.LMul.MX)
- simm5:$imm5, GPR:$vl, vti.SEW)>;
+  // vmv.v.x/vmv.v.i are handled in RISCVInstrInfoVVLPatterns.td
}
//===----------------------------------------------------------------------===//
// 13.1. Vector Single-Width Saturating Add and Subtract
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vsaddu", "PseudoVSADDU", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vsadd", "PseudoVSADD", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vssubu", "PseudoVSSUBU", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vssub", "PseudoVSSUB", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsaddu", "PseudoVSADDU", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsadd", "PseudoVSADD", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vssubu", "PseudoVSSUBU", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vssub", "PseudoVSSUB", AllIntegerVectors>;
//===----------------------------------------------------------------------===//
// 13.2. Vector Single-Width Averaging Add and Subtract
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vaaddu", "PseudoVAADDU", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vaadd", "PseudoVAADD", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vasubu", "PseudoVASUBU", AllIntegerVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vasub", "PseudoVASUB", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vaaddu", "PseudoVAADDU", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vaadd", "PseudoVAADD", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vasubu", "PseudoVASUBU", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vasub", "PseudoVASUB", AllIntegerVectors>;
//===----------------------------------------------------------------------===//
// 13.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vsmul", "PseudoVSMUL", AllIntegerVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vsmul", "PseudoVSMUL", AllIntegerVectors>;
//===----------------------------------------------------------------------===//
// 13.4. Vector Single-Width Scaling Shift Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vssrl", "PseudoVSSRL", AllIntegerVectors,
- uimm5>;
-defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vssra", "PseudoVSSRA", AllIntegerVectors,
- uimm5>;
+defm : VPatBinaryV_VV_VX_VI<"int_riscv_vssrl", "PseudoVSSRL", AllIntegerVectors,
+ uimm5>;
+defm : VPatBinaryV_VV_VX_VI<"int_riscv_vssra", "PseudoVSSRA", AllIntegerVectors,
+ uimm5>;
//===----------------------------------------------------------------------===//
// 13.5. Vector Narrowing Fixed-Point Clip Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_WV_WX_WI<"int_riscv_vnclipu", "PseudoVNCLIPU", AllWidenableIntVectors>;
-defm "" : VPatBinaryV_WV_WX_WI<"int_riscv_vnclip", "PseudoVNCLIP", AllWidenableIntVectors>;
+defm : VPatBinaryV_WV_WX_WI<"int_riscv_vnclipu", "PseudoVNCLIPU", AllWidenableIntVectors>;
+defm : VPatBinaryV_WV_WX_WI<"int_riscv_vnclip", "PseudoVNCLIP", AllWidenableIntVectors>;
} // Predicates = [HasStdExtV]
@@ -4076,196 +4285,180 @@
//===----------------------------------------------------------------------===//
// 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vfadd", "PseudoVFADD", AllFloatVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vfsub", "PseudoVFSUB", AllFloatVectors>;
-defm "" : VPatBinaryV_VX<"int_riscv_vfrsub", "PseudoVFRSUB", AllFloatVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfadd", "PseudoVFADD", AllFloatVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfsub", "PseudoVFSUB", AllFloatVectors>;
+defm : VPatBinaryV_VX<"int_riscv_vfrsub", "PseudoVFRSUB", AllFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.3. Vector Widening Floating-Point Add/Subtract Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryW_VV_VX<"int_riscv_vfwadd", "PseudoVFWADD", AllWidenableFloatVectors>;
-defm "" : VPatBinaryW_VV_VX<"int_riscv_vfwsub", "PseudoVFWSUB", AllWidenableFloatVectors>;
-defm "" : VPatBinaryW_WV_WX<"int_riscv_vfwadd_w", "PseudoVFWADD", AllWidenableFloatVectors>;
-defm "" : VPatBinaryW_WV_WX<"int_riscv_vfwsub_w", "PseudoVFWSUB", AllWidenableFloatVectors>;
+defm : VPatBinaryW_VV_VX<"int_riscv_vfwadd", "PseudoVFWADD", AllWidenableFloatVectors>;
+defm : VPatBinaryW_VV_VX<"int_riscv_vfwsub", "PseudoVFWSUB", AllWidenableFloatVectors>;
+defm : VPatBinaryW_WV_WX<"int_riscv_vfwadd_w", "PseudoVFWADD", AllWidenableFloatVectors>;
+defm : VPatBinaryW_WV_WX<"int_riscv_vfwsub_w", "PseudoVFWSUB", AllWidenableFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vfmul", "PseudoVFMUL", AllFloatVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vfdiv", "PseudoVFDIV", AllFloatVectors>;
-defm "" : VPatBinaryV_VX<"int_riscv_vfrdiv", "PseudoVFRDIV", AllFloatVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfmul", "PseudoVFMUL", AllFloatVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfdiv", "PseudoVFDIV", AllFloatVectors>;
+defm : VPatBinaryV_VX<"int_riscv_vfrdiv", "PseudoVFRDIV", AllFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.5. Vector Widening Floating-Point Multiply
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryW_VV_VX<"int_riscv_vfwmul", "PseudoVFWMUL", AllWidenableFloatVectors>;
+defm : VPatBinaryW_VV_VX<"int_riscv_vfwmul", "PseudoVFWMUL", AllWidenableFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfmacc", "PseudoVFMACC", AllFloatVectors>;
-defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfnmacc", "PseudoVFNMACC", AllFloatVectors>;
-defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfmsac", "PseudoVFMSAC", AllFloatVectors>;
-defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfnmsac", "PseudoVFNMSAC", AllFloatVectors>;
-defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfmadd", "PseudoVFMADD", AllFloatVectors>;
-defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfnmadd", "PseudoVFNMADD", AllFloatVectors>;
-defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfmsub", "PseudoVFMSUB", AllFloatVectors>;
-defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfnmsub", "PseudoVFNMSUB", AllFloatVectors>;
+defm : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfmacc", "PseudoVFMACC", AllFloatVectors>;
+defm : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfnmacc", "PseudoVFNMACC", AllFloatVectors>;
+defm : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfmsac", "PseudoVFMSAC", AllFloatVectors>;
+defm : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfnmsac", "PseudoVFNMSAC", AllFloatVectors>;
+defm : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfmadd", "PseudoVFMADD", AllFloatVectors>;
+defm : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfnmadd", "PseudoVFNMADD", AllFloatVectors>;
+defm : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfmsub", "PseudoVFMSUB", AllFloatVectors>;
+defm : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfnmsub", "PseudoVFNMSUB", AllFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatTernaryW_VV_VX<"int_riscv_vfwmacc", "PseudoVFWMACC", AllWidenableFloatVectors>;
-defm "" : VPatTernaryW_VV_VX<"int_riscv_vfwnmacc", "PseudoVFWNMACC", AllWidenableFloatVectors>;
-defm "" : VPatTernaryW_VV_VX<"int_riscv_vfwmsac", "PseudoVFWMSAC", AllWidenableFloatVectors>;
-defm "" : VPatTernaryW_VV_VX<"int_riscv_vfwnmsac", "PseudoVFWNMSAC", AllWidenableFloatVectors>;
+defm : VPatTernaryW_VV_VX<"int_riscv_vfwmacc", "PseudoVFWMACC", AllWidenableFloatVectors>;
+defm : VPatTernaryW_VV_VX<"int_riscv_vfwnmacc", "PseudoVFWNMACC", AllWidenableFloatVectors>;
+defm : VPatTernaryW_VV_VX<"int_riscv_vfwmsac", "PseudoVFWMSAC", AllWidenableFloatVectors>;
+defm : VPatTernaryW_VV_VX<"int_riscv_vfwnmsac", "PseudoVFWNMSAC", AllWidenableFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.8. Vector Floating-Point Square-Root Instruction
//===----------------------------------------------------------------------===//
-defm "" : VPatUnaryV_V<"int_riscv_vfsqrt", "PseudoVFSQRT", AllFloatVectors>;
+defm : VPatUnaryV_V<"int_riscv_vfsqrt", "PseudoVFSQRT", AllFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
//===----------------------------------------------------------------------===//
-defm "" : VPatUnaryV_V<"int_riscv_vfrsqrt7", "PseudoVFRSQRT7", AllFloatVectors>;
+defm : VPatUnaryV_V<"int_riscv_vfrsqrt7", "PseudoVFRSQRT7", AllFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.10. Vector Floating-Point Reciprocal Estimate Instruction
//===----------------------------------------------------------------------===//
-defm "" : VPatUnaryV_V<"int_riscv_vfrec7", "PseudoVFREC7", AllFloatVectors>;
+defm : VPatUnaryV_V<"int_riscv_vfrec7", "PseudoVFREC7", AllFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.11. Vector Floating-Point Min/Max Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vfmin", "PseudoVFMIN", AllFloatVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vfmax", "PseudoVFMAX", AllFloatVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfmin", "PseudoVFMIN", AllFloatVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfmax", "PseudoVFMAX", AllFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.12. Vector Floating-Point Sign-Injection Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vfsgnj", "PseudoVFSGNJ", AllFloatVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vfsgnjn", "PseudoVFSGNJN", AllFloatVectors>;
-defm "" : VPatBinaryV_VV_VX<"int_riscv_vfsgnjx", "PseudoVFSGNJX", AllFloatVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfsgnj", "PseudoVFSGNJ", AllFloatVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfsgnjn", "PseudoVFSGNJN", AllFloatVectors>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfsgnjx", "PseudoVFSGNJX", AllFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.13. Vector Floating-Point Compare Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryM_VV_VX<"int_riscv_vmfeq", "PseudoVMFEQ", AllFloatVectors>;
-defm "" : VPatBinaryM_VV_VX<"int_riscv_vmfle", "PseudoVMFLE", AllFloatVectors>;
-defm "" : VPatBinaryM_VV_VX<"int_riscv_vmflt", "PseudoVMFLT", AllFloatVectors>;
-defm "" : VPatBinaryM_VV_VX<"int_riscv_vmfne", "PseudoVMFNE", AllFloatVectors>;
-defm "" : VPatBinaryM_VX<"int_riscv_vmfgt", "PseudoVMFGT", AllFloatVectors>;
-defm "" : VPatBinaryM_VX<"int_riscv_vmfge", "PseudoVMFGE", AllFloatVectors>;
+defm : VPatBinaryM_VV_VX<"int_riscv_vmfeq", "PseudoVMFEQ", AllFloatVectors>;
+defm : VPatBinaryM_VV_VX<"int_riscv_vmfle", "PseudoVMFLE", AllFloatVectors>;
+defm : VPatBinaryM_VV_VX<"int_riscv_vmflt", "PseudoVMFLT", AllFloatVectors>;
+defm : VPatBinaryM_VV_VX<"int_riscv_vmfne", "PseudoVMFNE", AllFloatVectors>;
+defm : VPatBinaryM_VX<"int_riscv_vmfgt", "PseudoVMFGT", AllFloatVectors>;
+defm : VPatBinaryM_VX<"int_riscv_vmfge", "PseudoVMFGE", AllFloatVectors>;
+defm : VPatBinarySwappedM_VV<"int_riscv_vmfgt", "PseudoVMFLT", AllFloatVectors>;
+defm : VPatBinarySwappedM_VV<"int_riscv_vmfge", "PseudoVMFLE", AllFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.14. Vector Floating-Point Classify Instruction
//===----------------------------------------------------------------------===//
-defm "" : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">;
+defm : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">;
//===----------------------------------------------------------------------===//
// 14.15. Vector Floating-Point Merge Instruction
//===----------------------------------------------------------------------===//
// We can use vmerge.vvm to support vector-vector vfmerge.
-defm "" : VPatBinaryV_VM<"int_riscv_vfmerge", "PseudoVMERGE",
- /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
-defm "" : VPatBinaryV_XM<"int_riscv_vfmerge", "PseudoVFMERGE",
- /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
+defm : VPatBinaryV_VM<"int_riscv_vfmerge", "PseudoVMERGE",
+ /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
+defm : VPatBinaryV_XM<"int_riscv_vfmerge", "PseudoVFMERGE",
+ /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
foreach fvti = AllFloatVectors in {
defvar instr = !cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX);
def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$rs2),
(fvti.Scalar (fpimm0)),
- (fvti.Mask V0), (XLenVT (VLOp GPR:$vl)))),
- (instr fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.SEW)>;
-}
-
-//===----------------------------------------------------------------------===//
-// 14.16. Vector Floating-Point Move Instruction
-//===----------------------------------------------------------------------===//
-foreach fvti = AllFloatVectors in {
- // If we're splatting fpimm0, use vmv.v.x vd, x0.
- def : Pat<(fvti.Vector (int_riscv_vfmv_v_f
- (fvti.Scalar (fpimm0)), (XLenVT (VLOp GPR:$vl)))),
- (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX)
- 0, GPR:$vl, fvti.SEW)>;
-
- def : Pat<(fvti.Vector (int_riscv_vfmv_v_f
- (fvti.Scalar fvti.ScalarRegClass:$rs2), (XLenVT (VLOp GPR:$vl)))),
- (!cast<Instruction>("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" #
- fvti.LMul.MX)
- (fvti.Scalar fvti.ScalarRegClass:$rs2),
- GPR:$vl, fvti.SEW)>;
+ (fvti.Mask V0), VLOpFrag)),
+ (instr fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
}
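// Illustrative note: when the vfmerge scalar is +0.0 (fpimm0), the pattern
// above reuses the integer PseudoVMERGE_VIM with immediate 0, since a splat of
// positive floating-point zero has the same all-zeros bit pattern as an
// integer zero splat.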
//===----------------------------------------------------------------------===//
// 14.17. Single-Width Floating-Point/Integer Type-Convert Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatConversionVI_VF<"int_riscv_vfcvt_xu_f_v", "PseudoVFCVT_XU_F">;
-defm "" : VPatConversionVI_VF<"int_riscv_vfcvt_rtz_xu_f_v", "PseudoVFCVT_RTZ_XU_F">;
-defm "" : VPatConversionVI_VF<"int_riscv_vfcvt_x_f_v", "PseudoVFCVT_X_F">;
-defm "" : VPatConversionVI_VF<"int_riscv_vfcvt_rtz_x_f_v", "PseudoVFCVT_RTZ_X_F">;
-defm "" : VPatConversionVF_VI<"int_riscv_vfcvt_f_x_v", "PseudoVFCVT_F_X">;
-defm "" : VPatConversionVF_VI<"int_riscv_vfcvt_f_xu_v", "PseudoVFCVT_F_XU">;
+defm : VPatConversionVI_VF<"int_riscv_vfcvt_xu_f_v", "PseudoVFCVT_XU_F">;
+defm : VPatConversionVI_VF<"int_riscv_vfcvt_rtz_xu_f_v", "PseudoVFCVT_RTZ_XU_F">;
+defm : VPatConversionVI_VF<"int_riscv_vfcvt_x_f_v", "PseudoVFCVT_X_F">;
+defm : VPatConversionVI_VF<"int_riscv_vfcvt_rtz_x_f_v", "PseudoVFCVT_RTZ_X_F">;
+defm : VPatConversionVF_VI<"int_riscv_vfcvt_f_x_v", "PseudoVFCVT_F_X">;
+defm : VPatConversionVF_VI<"int_riscv_vfcvt_f_xu_v", "PseudoVFCVT_F_XU">;
//===----------------------------------------------------------------------===//
// 14.18. Widening Floating-Point/Integer Type-Convert Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatConversionWI_VF<"int_riscv_vfwcvt_xu_f_v", "PseudoVFWCVT_XU_F">;
-defm "" : VPatConversionWI_VF<"int_riscv_vfwcvt_x_f_v", "PseudoVFWCVT_X_F">;
-defm "" : VPatConversionWI_VF<"int_riscv_vfwcvt_rtz_xu_f_v", "PseudoVFWCVT_RTZ_XU_F">;
-defm "" : VPatConversionWI_VF<"int_riscv_vfwcvt_rtz_x_f_v", "PseudoVFWCVT_RTZ_X_F">;
-defm "" : VPatConversionWF_VI<"int_riscv_vfwcvt_f_xu_v", "PseudoVFWCVT_F_XU">;
-defm "" : VPatConversionWF_VI<"int_riscv_vfwcvt_f_x_v", "PseudoVFWCVT_F_X">;
-defm "" : VPatConversionWF_VF<"int_riscv_vfwcvt_f_f_v", "PseudoVFWCVT_F_F">;
+defm : VPatConversionWI_VF<"int_riscv_vfwcvt_xu_f_v", "PseudoVFWCVT_XU_F">;
+defm : VPatConversionWI_VF<"int_riscv_vfwcvt_x_f_v", "PseudoVFWCVT_X_F">;
+defm : VPatConversionWI_VF<"int_riscv_vfwcvt_rtz_xu_f_v", "PseudoVFWCVT_RTZ_XU_F">;
+defm : VPatConversionWI_VF<"int_riscv_vfwcvt_rtz_x_f_v", "PseudoVFWCVT_RTZ_X_F">;
+defm : VPatConversionWF_VI<"int_riscv_vfwcvt_f_xu_v", "PseudoVFWCVT_F_XU">;
+defm : VPatConversionWF_VI<"int_riscv_vfwcvt_f_x_v", "PseudoVFWCVT_F_X">;
+defm : VPatConversionWF_VF<"int_riscv_vfwcvt_f_f_v", "PseudoVFWCVT_F_F">;
//===----------------------------------------------------------------------===//
// 14.19. Narrowing Floating-Point/Integer Type-Convert Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatConversionVI_WF<"int_riscv_vfncvt_xu_f_w", "PseudoVFNCVT_XU_F">;
-defm "" : VPatConversionVI_WF<"int_riscv_vfncvt_x_f_w", "PseudoVFNCVT_X_F">;
-defm "" : VPatConversionVI_WF<"int_riscv_vfncvt_rtz_xu_f_w", "PseudoVFNCVT_RTZ_XU_F">;
-defm "" : VPatConversionVI_WF<"int_riscv_vfncvt_rtz_x_f_w", "PseudoVFNCVT_RTZ_X_F">;
-defm "" : VPatConversionVF_WI <"int_riscv_vfncvt_f_xu_w", "PseudoVFNCVT_F_XU">;
-defm "" : VPatConversionVF_WI <"int_riscv_vfncvt_f_x_w", "PseudoVFNCVT_F_X">;
-defm "" : VPatConversionVF_WF<"int_riscv_vfncvt_f_f_w", "PseudoVFNCVT_F_F">;
-defm "" : VPatConversionVF_WF<"int_riscv_vfncvt_rod_f_f_w", "PseudoVFNCVT_ROD_F_F">;
+defm : VPatConversionVI_WF<"int_riscv_vfncvt_xu_f_w", "PseudoVFNCVT_XU_F">;
+defm : VPatConversionVI_WF<"int_riscv_vfncvt_x_f_w", "PseudoVFNCVT_X_F">;
+defm : VPatConversionVI_WF<"int_riscv_vfncvt_rtz_xu_f_w", "PseudoVFNCVT_RTZ_XU_F">;
+defm : VPatConversionVI_WF<"int_riscv_vfncvt_rtz_x_f_w", "PseudoVFNCVT_RTZ_X_F">;
+defm : VPatConversionVF_WI <"int_riscv_vfncvt_f_xu_w", "PseudoVFNCVT_F_XU">;
+defm : VPatConversionVF_WI <"int_riscv_vfncvt_f_x_w", "PseudoVFNCVT_F_X">;
+defm : VPatConversionVF_WF<"int_riscv_vfncvt_f_f_w", "PseudoVFNCVT_F_F">;
+defm : VPatConversionVF_WF<"int_riscv_vfncvt_rod_f_f_w", "PseudoVFNCVT_ROD_F_F">;
} // Predicates = [HasStdExtV, HasStdExtF]
let Predicates = [HasStdExtV] in {
//===----------------------------------------------------------------------===//
// 15.1. Vector Single-Width Integer Reduction Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatReductionV_VS<"int_riscv_vredsum", "PseudoVREDSUM">;
-defm "" : VPatReductionV_VS<"int_riscv_vredand", "PseudoVREDAND">;
-defm "" : VPatReductionV_VS<"int_riscv_vredor", "PseudoVREDOR">;
-defm "" : VPatReductionV_VS<"int_riscv_vredxor", "PseudoVREDXOR">;
-defm "" : VPatReductionV_VS<"int_riscv_vredminu", "PseudoVREDMINU">;
-defm "" : VPatReductionV_VS<"int_riscv_vredmin", "PseudoVREDMIN">;
-defm "" : VPatReductionV_VS<"int_riscv_vredmaxu", "PseudoVREDMAXU">;
-defm "" : VPatReductionV_VS<"int_riscv_vredmax", "PseudoVREDMAX">;
+defm : VPatReductionV_VS<"int_riscv_vredsum", "PseudoVREDSUM">;
+defm : VPatReductionV_VS<"int_riscv_vredand", "PseudoVREDAND">;
+defm : VPatReductionV_VS<"int_riscv_vredor", "PseudoVREDOR">;
+defm : VPatReductionV_VS<"int_riscv_vredxor", "PseudoVREDXOR">;
+defm : VPatReductionV_VS<"int_riscv_vredminu", "PseudoVREDMINU">;
+defm : VPatReductionV_VS<"int_riscv_vredmin", "PseudoVREDMIN">;
+defm : VPatReductionV_VS<"int_riscv_vredmaxu", "PseudoVREDMAXU">;
+defm : VPatReductionV_VS<"int_riscv_vredmax", "PseudoVREDMAX">;
//===----------------------------------------------------------------------===//
// 15.2. Vector Widening Integer Reduction Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatReductionW_VS<"int_riscv_vwredsumu", "PseudoVWREDSUMU">;
-defm "" : VPatReductionW_VS<"int_riscv_vwredsum", "PseudoVWREDSUM">;
+defm : VPatReductionW_VS<"int_riscv_vwredsumu", "PseudoVWREDSUMU">;
+defm : VPatReductionW_VS<"int_riscv_vwredsum", "PseudoVWREDSUM">;
} // Predicates = [HasStdExtV]
let Predicates = [HasStdExtV, HasStdExtF] in {
//===----------------------------------------------------------------------===//
// 15.3. Vector Single-Width Floating-Point Reduction Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatReductionV_VS<"int_riscv_vfredosum", "PseudoVFREDOSUM", /*IsFloat=*/1>;
-defm "" : VPatReductionV_VS<"int_riscv_vfredsum", "PseudoVFREDSUM", /*IsFloat=*/1>;
-defm "" : VPatReductionV_VS<"int_riscv_vfredmin", "PseudoVFREDMIN", /*IsFloat=*/1>;
-defm "" : VPatReductionV_VS<"int_riscv_vfredmax", "PseudoVFREDMAX", /*IsFloat=*/1>;
+defm : VPatReductionV_VS<"int_riscv_vfredosum", "PseudoVFREDOSUM", /*IsFloat=*/1>;
+defm : VPatReductionV_VS<"int_riscv_vfredsum", "PseudoVFREDSUM", /*IsFloat=*/1>;
+defm : VPatReductionV_VS<"int_riscv_vfredmin", "PseudoVFREDMIN", /*IsFloat=*/1>;
+defm : VPatReductionV_VS<"int_riscv_vfredmax", "PseudoVFREDMAX", /*IsFloat=*/1>;
//===----------------------------------------------------------------------===//
// 15.4. Vector Widening Floating-Point Reduction Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatReductionW_VS<"int_riscv_vfwredsum", "PseudoVFWREDSUM", /*IsFloat=*/1>;
-defm "" : VPatReductionW_VS<"int_riscv_vfwredosum", "PseudoVFWREDOSUM", /*IsFloat=*/1>;
+defm : VPatReductionW_VS<"int_riscv_vfwredsum", "PseudoVFWREDSUM", /*IsFloat=*/1>;
+defm : VPatReductionW_VS<"int_riscv_vfwredosum", "PseudoVFWREDOSUM", /*IsFloat=*/1>;
} // Predicates = [HasStdExtV, HasStdExtF]
@@ -4277,53 +4470,53 @@
//===----------------------------------------------------------------------===//
// 16.1 Vector Mask-Register Logical Instructions
//===----------------------------------------------------------------------===//
-defm "" : VPatBinaryM_MM<"int_riscv_vmand", "PseudoVMAND">;
-defm "" : VPatBinaryM_MM<"int_riscv_vmnand", "PseudoVMNAND">;
-defm "" : VPatBinaryM_MM<"int_riscv_vmandnot", "PseudoVMANDNOT">;
-defm "" : VPatBinaryM_MM<"int_riscv_vmxor", "PseudoVMXOR">;
-defm "" : VPatBinaryM_MM<"int_riscv_vmor", "PseudoVMOR">;
-defm "" : VPatBinaryM_MM<"int_riscv_vmnor", "PseudoVMNOR">;
-defm "" : VPatBinaryM_MM<"int_riscv_vmornot", "PseudoVMORNOT">;
-defm "" : VPatBinaryM_MM<"int_riscv_vmxnor", "PseudoVMXNOR">;
+defm : VPatBinaryM_MM<"int_riscv_vmand", "PseudoVMAND">;
+defm : VPatBinaryM_MM<"int_riscv_vmnand", "PseudoVMNAND">;
+defm : VPatBinaryM_MM<"int_riscv_vmandnot", "PseudoVMANDNOT">;
+defm : VPatBinaryM_MM<"int_riscv_vmxor", "PseudoVMXOR">;
+defm : VPatBinaryM_MM<"int_riscv_vmor", "PseudoVMOR">;
+defm : VPatBinaryM_MM<"int_riscv_vmnor", "PseudoVMNOR">;
+defm : VPatBinaryM_MM<"int_riscv_vmornot", "PseudoVMORNOT">;
+defm : VPatBinaryM_MM<"int_riscv_vmxnor", "PseudoVMXNOR">;
// pseudo instructions
-defm "" : VPatNullaryM<"int_riscv_vmclr", "PseudoVMCLR">;
-defm "" : VPatNullaryM<"int_riscv_vmset", "PseudoVMSET">;
+defm : VPatNullaryM<"int_riscv_vmclr", "PseudoVMCLR">;
+defm : VPatNullaryM<"int_riscv_vmset", "PseudoVMSET">;
//===----------------------------------------------------------------------===//
// 16.2. Vector mask population count vpopc
//===----------------------------------------------------------------------===//
-defm "" : VPatUnaryS_M<"int_riscv_vpopc", "PseudoVPOPC">;
+defm : VPatUnaryS_M<"int_riscv_vpopc", "PseudoVPOPC">;
//===----------------------------------------------------------------------===//
// 16.3. vfirst find-first-set mask bit
//===----------------------------------------------------------------------===//
-defm "" : VPatUnaryS_M<"int_riscv_vfirst", "PseudoVFIRST">;
+defm : VPatUnaryS_M<"int_riscv_vfirst", "PseudoVFIRST">;
//===----------------------------------------------------------------------===//
// 16.4. vmsbf.m set-before-first mask bit
//===----------------------------------------------------------------------===//
-defm "" : VPatUnaryM_M<"int_riscv_vmsbf", "PseudoVMSBF">;
+defm : VPatUnaryM_M<"int_riscv_vmsbf", "PseudoVMSBF">;
//===----------------------------------------------------------------------===//
// 16.5. vmsif.m set-including-first mask bit
//===----------------------------------------------------------------------===//
-defm "" : VPatUnaryM_M<"int_riscv_vmsif", "PseudoVMSIF">;
+defm : VPatUnaryM_M<"int_riscv_vmsif", "PseudoVMSIF">;
//===----------------------------------------------------------------------===//
// 16.6. vmsof.m set-only-first mask bit
//===----------------------------------------------------------------------===//
-defm "" : VPatUnaryM_M<"int_riscv_vmsof", "PseudoVMSOF">;
+defm : VPatUnaryM_M<"int_riscv_vmsof", "PseudoVMSOF">;
//===----------------------------------------------------------------------===//
// 16.8. Vector Iota Instruction
//===----------------------------------------------------------------------===//
-defm "" : VPatUnaryV_M<"int_riscv_viota", "PseudoVIOTA">;
+defm : VPatUnaryV_M<"int_riscv_viota", "PseudoVIOTA">;
//===----------------------------------------------------------------------===//
// 16.9. Vector Element Index Instruction
//===----------------------------------------------------------------------===//
-defm "" : VPatNullaryV<"int_riscv_vid", "PseudoVID">;
+defm : VPatNullaryV<"int_riscv_vid", "PseudoVID">;
} // Predicates = [HasStdExtV]
@@ -4338,11 +4531,8 @@
let Predicates = [HasStdExtV] in {
foreach vti = AllIntegerVectors in {
def : Pat<(riscv_vmv_x_s (vti.Vector vti.RegClass:$rs2)),
- (!cast<Instruction>("PseudoVMV_X_S_" # vti.LMul.MX) $rs2, vti.SEW)>;
- def : Pat<(vti.Vector (int_riscv_vmv_s_x (vti.Vector vti.RegClass:$rs1),
- GPR:$rs2, (XLenVT (VLOp GPR:$vl)))),
- (!cast<Instruction>("PseudoVMV_S_X_" # vti.LMul.MX)
- (vti.Vector $rs1), $rs2, GPR:$vl, vti.SEW)>;
+ (!cast<Instruction>("PseudoVMV_X_S_" # vti.LMul.MX) $rs2, vti.Log2SEW)>;
+ // vmv.s.x is handled with a custom node in RISCVInstrInfoVVLPatterns.td
}
} // Predicates = [HasStdExtV]
@@ -4355,15 +4545,15 @@
defvar instr = !cast<Instruction>("PseudoVFMV_"#fvti.ScalarSuffix#"_S_" #
fvti.LMul.MX);
def : Pat<(fvti.Scalar (int_riscv_vfmv_f_s (fvti.Vector fvti.RegClass:$rs2))),
- (instr $rs2, fvti.SEW)>;
+ (instr $rs2, fvti.Log2SEW)>;
def : Pat<(fvti.Vector (int_riscv_vfmv_s_f (fvti.Vector fvti.RegClass:$rs1),
- (fvti.Scalar fvti.ScalarRegClass:$rs2), (XLenVT (VLOp GPR:$vl)))),
+ (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)),
(!cast<Instruction>("PseudoVFMV_S_"#fvti.ScalarSuffix#"_" #
fvti.LMul.MX)
(fvti.Vector $rs1),
(fvti.Scalar fvti.ScalarRegClass:$rs2),
- GPR:$vl, fvti.SEW)>;
+ GPR:$vl, fvti.Log2SEW)>;
}
} // Predicates = [HasStdExtV, HasStdExtF]
@@ -4371,46 +4561,47 @@
// 17.3. Vector Slide Instructions
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtV] in {
- defm "" : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllIntegerVectors, uimm5>;
- defm "" : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllIntegerVectors, uimm5>;
- defm "" : VPatBinaryV_VX<"int_riscv_vslide1up", "PseudoVSLIDE1UP", AllIntegerVectors>;
- defm "" : VPatBinaryV_VX<"int_riscv_vslide1down", "PseudoVSLIDE1DOWN", AllIntegerVectors>;
+ defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllIntegerVectors, uimm5>;
+ defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllIntegerVectors, uimm5>;
+ defm : VPatBinaryV_VX<"int_riscv_vslide1up", "PseudoVSLIDE1UP", AllIntegerVectors>;
+ defm : VPatBinaryV_VX<"int_riscv_vslide1down", "PseudoVSLIDE1DOWN", AllIntegerVectors>;
} // Predicates = [HasStdExtV]
let Predicates = [HasStdExtV, HasStdExtF] in {
- defm "" : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectors, uimm5>;
- defm "" : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectors, uimm5>;
- defm "" : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP", AllFloatVectors>;
- defm "" : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN", AllFloatVectors>;
+ defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectors, uimm5>;
+ defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectors, uimm5>;
+ defm : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP", AllFloatVectors>;
+ defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN", AllFloatVectors>;
} // Predicates = [HasStdExtV, HasStdExtF]
//===----------------------------------------------------------------------===//
// 17.4. Vector Register Gather Instructions
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtV] in {
- defm "" : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
- AllIntegerVectors, uimm5>;
- defm "" : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16", "PseudoVRGATHEREI16",
- /* eew */ 16, AllIntegerVectors>;
+ defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
+ AllIntegerVectors, uimm5>;
+ defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
+ /* eew */ 16, AllIntegerVectors>;
} // Predicates = [HasStdExtV]
let Predicates = [HasStdExtV, HasStdExtF] in {
- defm "" : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
- AllFloatVectors, uimm5>;
- defm "" : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16", "PseudoVRGATHEREI16",
- /* eew */ 16, AllFloatVectors>;
+ defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
+ AllFloatVectors, uimm5>;
+ defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
+ /* eew */ 16, AllFloatVectors>;
} // Predicates = [HasStdExtV, HasStdExtF]
//===----------------------------------------------------------------------===//
// 17.5. Vector Compress Instruction
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtV] in {
- defm "" : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllIntegerVectors>;
+ defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllIntegerVectors>;
} // Predicates = [HasStdExtV]
let Predicates = [HasStdExtV, HasStdExtF] in {
- defm "" : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectors>;
+ defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectors>;
} // Predicates = [HasStdExtV, HasStdExtF]
// Include the non-intrinsic ISel patterns
include "RISCVInstrInfoVSDPatterns.td"
+include "RISCVInstrInfoVVLPatterns.td"
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index dee6770..483fc8b 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -28,43 +28,75 @@
def rv32_splat_i64 : SDNode<"RISCVISD::SPLAT_VECTOR_I64", SDTSplatI64>;
-def riscv_trunc_vector : SDNode<"RISCVISD::TRUNCATE_VECTOR",
- SDTypeProfile<1, 1,
- [SDTCisVec<0>, SDTCisVec<1>]>>;
+def SDT_RISCVVMSETCLR_VL : SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i1>,
+ SDTCisVT<1, XLenVT>]>;
+def riscv_vmclr_vl : SDNode<"RISCVISD::VMCLR_VL", SDT_RISCVVMSETCLR_VL>;
+def riscv_vmset_vl : SDNode<"RISCVISD::VMSET_VL", SDT_RISCVVMSETCLR_VL>;
-// Penalize the generic form with Complexity=1 to give the simm5/uimm5 variants
-// precedence
-def SplatPat : ComplexPattern<vAny, 1, "selectVSplat", [], [], 1>;
+def rvv_vnot : PatFrag<(ops node:$in),
+ (xor node:$in, (riscv_vmset_vl (XLenVT srcvalue)))>;
-def SplatPat_simm5 : ComplexPattern<vAny, 1, "selectVSplatSimm5", []>;
-def SplatPat_uimm5 : ComplexPattern<vAny, 1, "selectVSplatUimm5", []>;
+// Give explicit Complexity to prefer simm5/uimm5.
+def SplatPat : ComplexPattern<vAny, 1, "selectVSplat", [splat_vector, rv32_splat_i64], [], 1>;
+def SplatPat_simm5 : ComplexPattern<vAny, 1, "selectVSplatSimm5", [splat_vector, rv32_splat_i64], [], 2>;
+def SplatPat_uimm5 : ComplexPattern<vAny, 1, "selectVSplatUimm5", [splat_vector, rv32_splat_i64], [], 2>;
+def SplatPat_simm5_plus1
+ : ComplexPattern<vAny, 1, "selectVSplatSimm5Plus1",
+ [splat_vector, rv32_splat_i64], [], 2>;
+def SplatPat_simm5_plus1_nonzero
+ : ComplexPattern<vAny, 1, "selectVSplatSimm5Plus1NonZero",
+ [splat_vector, rv32_splat_i64], [], 2>;
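// Rough illustration of the Complexity values: for a splat of a constant such
// as 5, both SplatPat (Complexity 1) and SplatPat_simm5 (Complexity 2) can
// match, and the higher value makes ISel try the immediate (VI) pattern first,
// falling back to the scalar (VX) form only when the constant does not fit in
// simm5.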
class SwapHelper<dag Prefix, dag A, dag B, dag Suffix, bit swap> {
dag Value = !con(Prefix, !if(swap, B, A), !if(swap, A, B), Suffix);
}
-multiclass VPatUSLoadStoreSDNode<LLVMType type,
- LLVMType mask_type,
- int sew,
+multiclass VPatUSLoadStoreSDNode<ValueType type,
+ int log2sew,
LMULInfo vlmul,
OutPatFrag avl,
- RegisterClass reg_rs1,
- VReg reg_class>
+ VReg reg_class,
+ int sew = !shl(1, log2sew)>
{
defvar load_instr = !cast<Instruction>("PseudoVLE"#sew#"_V_"#vlmul.MX);
defvar store_instr = !cast<Instruction>("PseudoVSE"#sew#"_V_"#vlmul.MX);
// Load
- def : Pat<(type (load reg_rs1:$rs1)),
- (load_instr reg_rs1:$rs1, avl, sew)>;
+ def : Pat<(type (load BaseAddr:$rs1)),
+ (load_instr BaseAddr:$rs1, avl, log2sew)>;
// Store
- def : Pat<(store type:$rs2, reg_rs1:$rs1),
- (store_instr reg_class:$rs2, reg_rs1:$rs1, avl, sew)>;
+ def : Pat<(store type:$rs2, BaseAddr:$rs1),
+ (store_instr reg_class:$rs2, BaseAddr:$rs1, avl, log2sew)>;
}
-multiclass VPatUSLoadStoreSDNodes<RegisterClass reg_rs1> {
- foreach vti = AllVectors in
- defm "" : VPatUSLoadStoreSDNode<vti.Vector, vti.Mask, vti.SEW, vti.LMul,
- vti.AVL, reg_rs1, vti.RegClass>;
+multiclass VPatUSLoadStoreWholeVRSDNode<ValueType type,
+ int log2sew,
+ LMULInfo vlmul,
+ VReg reg_class,
+ int sew = !shl(1, log2sew)>
+{
+ defvar load_instr =
+ !cast<Instruction>("VL"#!substr(vlmul.MX, 1)#"RE"#sew#"_V");
+ defvar store_instr =
+ !cast<Instruction>("VS"#!substr(vlmul.MX, 1)#"R_V");
+
+ // Load
+ def : Pat<(type (load BaseAddr:$rs1)),
+ (load_instr BaseAddr:$rs1)>;
+ // Store
+ def : Pat<(store type:$rs2, BaseAddr:$rs1),
+ (store_instr reg_class:$rs2, BaseAddr:$rs1)>;
+}
+
+multiclass VPatUSLoadStoreMaskSDNode<MTypeInfo m>
+{
+ defvar load_instr = !cast<Instruction>("PseudoVLE1_V_"#m.BX);
+ defvar store_instr = !cast<Instruction>("PseudoVSE1_V_"#m.BX);
+ // Load
+ def : Pat<(m.Mask (load BaseAddr:$rs1)),
+ (load_instr BaseAddr:$rs1, m.AVL, m.Log2SEW)>;
+ // Store
+ def : Pat<(store m.Mask:$rs2, BaseAddr:$rs1),
+ (store_instr VR:$rs2, BaseAddr:$rs1, m.AVL, m.Log2SEW)>;
}
class VPatBinarySDNode_VV<SDNode vop,
@@ -90,7 +122,6 @@
string suffix,
ValueType result_type,
ValueType vop_type,
- ValueType xop_type,
ValueType mask_type,
int sew,
LMULInfo vlmul,
@@ -107,14 +138,13 @@
xop_kind:$rs2,
avl, sew)>;
-multiclass VPatBinarySDNode_VV_VX<SDNode vop, string instruction_name>
-{
+multiclass VPatBinarySDNode_VV_VX<SDNode vop, string instruction_name> {
foreach vti = AllIntegerVectors in {
def : VPatBinarySDNode_VV<vop, instruction_name,
- vti.Vector, vti.Vector, vti.Mask, vti.SEW,
+ vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
vti.LMul, vti.AVL, vti.RegClass, vti.RegClass>;
def : VPatBinarySDNode_XI<vop, instruction_name, "VX",
- vti.Vector, vti.Vector, XLenVT, vti.Mask, vti.SEW,
+ vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
vti.LMul, vti.AVL, vti.RegClass, vti.RegClass,
SplatPat, GPR>;
}
@@ -122,17 +152,10 @@
multiclass VPatBinarySDNode_VV_VX_VI<SDNode vop, string instruction_name,
Operand ImmType = simm5>
-{
+ : VPatBinarySDNode_VV_VX<vop, instruction_name> {
foreach vti = AllIntegerVectors in {
- def : VPatBinarySDNode_VV<vop, instruction_name,
- vti.Vector, vti.Vector, vti.Mask, vti.SEW,
- vti.LMul, vti.AVL, vti.RegClass, vti.RegClass>;
- def : VPatBinarySDNode_XI<vop, instruction_name, "VX",
- vti.Vector, vti.Vector, XLenVT, vti.Mask, vti.SEW,
- vti.LMul, vti.AVL, vti.RegClass, vti.RegClass,
- SplatPat, GPR>;
def : VPatBinarySDNode_XI<vop, instruction_name, "VI",
- vti.Vector, vti.Vector, XLenVT, vti.Mask, vti.SEW,
+ vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
vti.LMul, vti.AVL, vti.RegClass, vti.RegClass,
!cast<ComplexPattern>(SplatPat#_#ImmType),
ImmType>;
@@ -161,11 +184,11 @@
multiclass VPatBinaryFPSDNode_VV_VF<SDNode vop, string instruction_name> {
foreach vti = AllFloatVectors in {
def : VPatBinarySDNode_VV<vop, instruction_name,
- vti.Vector, vti.Vector, vti.Mask, vti.SEW,
+ vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
vti.LMul, vti.AVL, vti.RegClass, vti.RegClass>;
def : VPatBinarySDNode_VF<vop, instruction_name#"_V"#vti.ScalarSuffix,
vti.Vector, vti.Vector, vti.Scalar, vti.Mask,
- vti.SEW, vti.LMul, vti.AVL, vti.RegClass, vti.RegClass,
+ vti.Log2SEW, vti.LMul, vti.AVL, vti.RegClass, vti.RegClass,
vti.ScalarRegClass>;
}
}
@@ -177,7 +200,7 @@
(!cast<Instruction>(instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
fvti.RegClass:$rs1,
(fvti.Scalar fvti.ScalarRegClass:$rs2),
- fvti.AVL, fvti.SEW)>;
+ fvti.AVL, fvti.Log2SEW)>;
}
multiclass VPatIntegerSetCCSDNode_VV<CondCode cc,
@@ -190,7 +213,7 @@
SwapHelper<(instruction),
(instruction vti.RegClass:$rs1),
(instruction vti.RegClass:$rs2),
- (instruction vti.AVL, vti.SEW),
+ (instruction vti.AVL, vti.Log2SEW),
swap>.Value>;
}
}
@@ -208,7 +231,7 @@
SwapHelper<(instruction),
(instruction vti.RegClass:$rs1),
(instruction xop_kind:$rs2),
- (instruction vti.AVL, vti.SEW),
+ (instruction vti.AVL, vti.Log2SEW),
swap>.Value>;
}
}
@@ -240,43 +263,40 @@
SplatPat_simm5, simm5, swap>;
}
-multiclass VPatFPSetCCSDNode_VV<CondCode cc, string instruction_name> {
- foreach fvti = AllFloatVectors in
- def : Pat<(fvti.Mask (setcc (fvti.Vector fvti.RegClass:$rs1),
- (fvti.Vector fvti.RegClass:$rs2),
- cc)),
- (!cast<Instruction>(instruction_name#"_VV_"#fvti.LMul.MX)
- fvti.RegClass:$rs1, fvti.RegClass:$rs2, fvti.AVL, fvti.SEW)>;
-}
-
-multiclass VPatFPSetCCSDNode_VF<CondCode cc, string instruction_name> {
- foreach fvti = AllFloatVectors in
- def : Pat<(fvti.Mask (setcc (fvti.Vector fvti.RegClass:$rs1),
- (fvti.Vector (splat_vector fvti.ScalarRegClass:$rs2)),
- cc)),
- (!cast<Instruction>(instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
- fvti.RegClass:$rs1,
- (fvti.Scalar fvti.ScalarRegClass:$rs2),
- fvti.AVL, fvti.SEW)>;
-}
-
-multiclass VPatFPSetCCSDNode_FV<CondCode cc, string swapped_op_instruction_name> {
- foreach fvti = AllFloatVectors in
- def : Pat<(fvti.Mask (setcc (fvti.Vector (splat_vector fvti.ScalarRegClass:$rs2)),
- (fvti.Vector fvti.RegClass:$rs1),
- cc)),
- (!cast<Instruction>(swapped_op_instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
- fvti.RegClass:$rs1,
- (fvti.Scalar fvti.ScalarRegClass:$rs2),
- fvti.AVL, fvti.SEW)>;
+multiclass VPatIntegerSetCCSDNode_VIPlus1<CondCode cc, string instruction_name,
+ ComplexPattern splatpat_kind> {
+ foreach vti = AllIntegerVectors in {
+ defvar instruction = !cast<Instruction>(instruction_name#"_VI_"#vti.LMul.MX);
+ def : Pat<(vti.Mask (setcc (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector (splatpat_kind simm5:$rs2)),
+ cc)),
+ (instruction vti.RegClass:$rs1, (DecImm simm5:$rs2),
+ vti.AVL, vti.Log2SEW)>;
+ }
}
multiclass VPatFPSetCCSDNode_VV_VF_FV<CondCode cc,
string inst_name,
string swapped_op_inst_name> {
- defm : VPatFPSetCCSDNode_VV<cc, inst_name>;
- defm : VPatFPSetCCSDNode_VF<cc, inst_name>;
- defm : VPatFPSetCCSDNode_FV<cc, swapped_op_inst_name>;
+ foreach fvti = AllFloatVectors in {
+ def : Pat<(fvti.Mask (setcc (fvti.Vector fvti.RegClass:$rs1),
+ (fvti.Vector fvti.RegClass:$rs2),
+ cc)),
+ (!cast<Instruction>(inst_name#"_VV_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1, fvti.RegClass:$rs2, fvti.AVL, fvti.Log2SEW)>;
+ def : Pat<(fvti.Mask (setcc (fvti.Vector fvti.RegClass:$rs1),
+ (splat_vector fvti.ScalarRegClass:$rs2),
+ cc)),
+ (!cast<Instruction>(inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
+ fvti.AVL, fvti.Log2SEW)>;
+ def : Pat<(fvti.Mask (setcc (splat_vector fvti.ScalarRegClass:$rs2),
+ (fvti.Vector fvti.RegClass:$rs1),
+ cc)),
+ (!cast<Instruction>(swapped_op_inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
+ fvti.AVL, fvti.Log2SEW)>;
+ }
}
multiclass VPatExtendSDNode_V<list<SDNode> ops, string inst_name, string suffix,
@@ -287,7 +307,65 @@
foreach op = ops in
def : Pat<(vti.Vector (op (fti.Vector fti.RegClass:$rs2))),
(!cast<Instruction>(inst_name#"_"#suffix#"_"#vti.LMul.MX)
- fti.RegClass:$rs2, fti.AVL, vti.SEW)>;
+ fti.RegClass:$rs2, fti.AVL, vti.Log2SEW)>;
+ }
+}
+
+multiclass VPatConvertI2FPSDNode_V<SDNode vop, string instruction_name> {
+ foreach fvti = AllFloatVectors in {
+ defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+ def : Pat<(fvti.Vector (vop (ivti.Vector ivti.RegClass:$rs1))),
+ (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX)
+ ivti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>;
+ }
+}
+
+multiclass VPatConvertFP2ISDNode_V<SDNode vop, string instruction_name> {
+ foreach fvti = AllFloatVectors in {
+ defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+ def : Pat<(ivti.Vector (vop (fvti.Vector fvti.RegClass:$rs1))),
+ (!cast<Instruction>(instruction_name#"_"#ivti.LMul.MX)
+ fvti.RegClass:$rs1, ivti.AVL, ivti.Log2SEW)>;
+ }
+}
+
+multiclass VPatWConvertI2FPSDNode_V<SDNode vop, string instruction_name> {
+ foreach vtiToWti = AllWidenableIntToFloatVectors in {
+ defvar ivti = vtiToWti.Vti;
+ defvar fwti = vtiToWti.Wti;
+ def : Pat<(fwti.Vector (vop (ivti.Vector ivti.RegClass:$rs1))),
+ (!cast<Instruction>(instruction_name#"_"#ivti.LMul.MX)
+ ivti.RegClass:$rs1, ivti.AVL, ivti.Log2SEW)>;
+ }
+}
+
+multiclass VPatWConvertFP2ISDNode_V<SDNode vop, string instruction_name> {
+ foreach fvtiToFWti = AllWidenableFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar iwti = GetIntVTypeInfo<fvtiToFWti.Wti>.Vti;
+ def : Pat<(iwti.Vector (vop (fvti.Vector fvti.RegClass:$rs1))),
+ (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>;
+ }
+}
+
+multiclass VPatNConvertI2FPSDNode_V<SDNode vop, string instruction_name> {
+ foreach fvtiToFWti = AllWidenableFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar iwti = GetIntVTypeInfo<fvtiToFWti.Wti>.Vti;
+ def : Pat<(fvti.Vector (vop (iwti.Vector iwti.RegClass:$rs1))),
+ (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX)
+ iwti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>;
+ }
+}
+
+multiclass VPatNConvertFP2ISDNode_V<SDNode vop, string instruction_name> {
+ foreach vtiToWti = AllWidenableIntToFloatVectors in {
+ defvar vti = vtiToWti.Vti;
+ defvar fwti = vtiToWti.Wti;
+ def : Pat<(vti.Vector (vop (fwti.Vector fwti.RegClass:$rs1))),
+ (!cast<Instruction>(instruction_name#"_"#vti.LMul.MX)
+ fwti.RegClass:$rs1, vti.AVL, vti.Log2SEW)>;
}
}
@@ -298,144 +376,207 @@
let Predicates = [HasStdExtV] in {
// 7.4. Vector Unit-Stride Instructions
-defm "" : VPatUSLoadStoreSDNodes<GPR>;
-defm "" : VPatUSLoadStoreSDNodes<AddrFI>;
+foreach vti = !listconcat(FractionalGroupIntegerVectors,
+ FractionalGroupFloatVectors) in
+ defm : VPatUSLoadStoreSDNode<vti.Vector, vti.Log2SEW, vti.LMul,
+ vti.AVL, vti.RegClass>;
+foreach vti = [VI8M1, VI16M1, VI32M1, VI64M1, VF16M1, VF32M1, VF64M1] in
+ defm : VPatUSLoadStoreWholeVRSDNode<vti.Vector, vti.Log2SEW, vti.LMul,
+ vti.RegClass>;
+foreach vti = !listconcat(GroupIntegerVectors, GroupFloatVectors) in
+ defm : VPatUSLoadStoreWholeVRSDNode<vti.Vector, vti.Log2SEW, vti.LMul,
+ vti.RegClass>;
+foreach mti = AllMasks in
+ defm : VPatUSLoadStoreMaskSDNode<mti>;
// 12.1. Vector Single-Width Integer Add and Subtract
-defm "" : VPatBinarySDNode_VV_VX_VI<add, "PseudoVADD">;
-defm "" : VPatBinarySDNode_VV_VX<sub, "PseudoVSUB">;
+defm : VPatBinarySDNode_VV_VX_VI<add, "PseudoVADD">;
+defm : VPatBinarySDNode_VV_VX<sub, "PseudoVSUB">;
// Handle VRSUB specially since it's the only integer binary op with reversed
// pattern operands
foreach vti = AllIntegerVectors in {
- def : Pat<(sub (vti.Vector (SplatPat XLenVT:$rs2)),
+ def : Pat<(sub (vti.Vector (SplatPat GPR:$rs2)),
(vti.Vector vti.RegClass:$rs1)),
(!cast<Instruction>("PseudoVRSUB_VX_"# vti.LMul.MX)
- vti.RegClass:$rs1, GPR:$rs2, vti.AVL, vti.SEW)>;
- def : Pat<(sub (vti.Vector (SplatPat_simm5 XLenVT:$rs2)),
+ vti.RegClass:$rs1, GPR:$rs2, vti.AVL, vti.Log2SEW)>;
+ def : Pat<(sub (vti.Vector (SplatPat_simm5 simm5:$rs2)),
(vti.Vector vti.RegClass:$rs1)),
(!cast<Instruction>("PseudoVRSUB_VI_"# vti.LMul.MX)
- vti.RegClass:$rs1, simm5:$rs2, vti.AVL, vti.SEW)>;
+ vti.RegClass:$rs1, simm5:$rs2, vti.AVL, vti.Log2SEW)>;
}
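// As an illustration: (sub (splat 3), v) cannot use the ordinary VSUB
// patterns because the splat is the minuend, so it selects PseudoVRSUB_VI
// with immediate 3, computing 3 - element in every lane.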
// 12.3. Vector Integer Extension
-defm "" : VPatExtendSDNode_V<[zext, anyext], "PseudoVZEXT", "VF2",
- AllFractionableVF2IntVectors>;
-defm "" : VPatExtendSDNode_V<[sext], "PseudoVSEXT", "VF2",
- AllFractionableVF2IntVectors>;
-defm "" : VPatExtendSDNode_V<[zext, anyext], "PseudoVZEXT", "VF4",
- AllFractionableVF4IntVectors>;
-defm "" : VPatExtendSDNode_V<[sext], "PseudoVSEXT", "VF4",
- AllFractionableVF4IntVectors>;
-defm "" : VPatExtendSDNode_V<[zext, anyext], "PseudoVZEXT", "VF8",
- AllFractionableVF8IntVectors>;
-defm "" : VPatExtendSDNode_V<[sext], "PseudoVSEXT", "VF8",
- AllFractionableVF8IntVectors>;
+defm : VPatExtendSDNode_V<[zext, anyext], "PseudoVZEXT", "VF2",
+ AllFractionableVF2IntVectors>;
+defm : VPatExtendSDNode_V<[sext], "PseudoVSEXT", "VF2",
+ AllFractionableVF2IntVectors>;
+defm : VPatExtendSDNode_V<[zext, anyext], "PseudoVZEXT", "VF4",
+ AllFractionableVF4IntVectors>;
+defm : VPatExtendSDNode_V<[sext], "PseudoVSEXT", "VF4",
+ AllFractionableVF4IntVectors>;
+defm : VPatExtendSDNode_V<[zext, anyext], "PseudoVZEXT", "VF8",
+ AllFractionableVF8IntVectors>;
+defm : VPatExtendSDNode_V<[sext], "PseudoVSEXT", "VF8",
+ AllFractionableVF8IntVectors>;
// 12.5. Vector Bitwise Logical Instructions
-defm "" : VPatBinarySDNode_VV_VX_VI<and, "PseudoVAND">;
-defm "" : VPatBinarySDNode_VV_VX_VI<or, "PseudoVOR">;
-defm "" : VPatBinarySDNode_VV_VX_VI<xor, "PseudoVXOR">;
+defm : VPatBinarySDNode_VV_VX_VI<and, "PseudoVAND">;
+defm : VPatBinarySDNode_VV_VX_VI<or, "PseudoVOR">;
+defm : VPatBinarySDNode_VV_VX_VI<xor, "PseudoVXOR">;
// 12.6. Vector Single-Width Bit Shift Instructions
-defm "" : VPatBinarySDNode_VV_VX_VI<shl, "PseudoVSLL", uimm5>;
-defm "" : VPatBinarySDNode_VV_VX_VI<srl, "PseudoVSRL", uimm5>;
-defm "" : VPatBinarySDNode_VV_VX_VI<sra, "PseudoVSRA", uimm5>;
+defm : VPatBinarySDNode_VV_VX_VI<shl, "PseudoVSLL", uimm5>;
+defm : VPatBinarySDNode_VV_VX_VI<srl, "PseudoVSRL", uimm5>;
+defm : VPatBinarySDNode_VV_VX_VI<sra, "PseudoVSRA", uimm5>;
-// 12.7. Vector Narrowing Integer Right Shift Instructions
-foreach vtiTofti = AllFractionableVF2IntVectors in {
- defvar vti = vtiTofti.Vti;
- defvar fti = vtiTofti.Fti;
- def : Pat<(fti.Vector (riscv_trunc_vector (vti.Vector vti.RegClass:$rs1))),
- (!cast<Instruction>("PseudoVNSRL_WI_"#fti.LMul.MX)
- vti.RegClass:$rs1, 0, fti.AVL, fti.SEW)>;
+foreach vti = AllIntegerVectors in {
+ // Emit shift by 1 as an add since it might be faster.
+ def : Pat<(shl (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector (splat_vector (XLenVT 1)))),
+ (!cast<Instruction>("PseudoVADD_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs1, vti.RegClass:$rs1, vti.AVL, vti.Log2SEW)>;
+}
+foreach vti = [VI64M1, VI64M2, VI64M4, VI64M8] in {
+ def : Pat<(shl (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector (rv32_splat_i64 (XLenVT 1)))),
+ (!cast<Instruction>("PseudoVADD_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs1, vti.RegClass:$rs1, vti.AVL, vti.Log2SEW)>;
+
}
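// As an illustration: (shl v, splat(1)) selects PseudoVADD_VV v, v instead of
// PseudoVSLL_VI v, 1; v + v produces the same value and, as noted above, may
// be faster.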
// 12.8. Vector Integer Comparison Instructions
-defm "" : VPatIntegerSetCCSDNode_VV_VX_VI<SETEQ, "PseudoVMSEQ">;
-defm "" : VPatIntegerSetCCSDNode_VV_VX_VI<SETNE, "PseudoVMSNE">;
+defm : VPatIntegerSetCCSDNode_VV_VX_VI<SETEQ, "PseudoVMSEQ">;
+defm : VPatIntegerSetCCSDNode_VV_VX_VI<SETNE, "PseudoVMSNE">;
-// FIXME: Support immediate forms of these by choosing SLE decrementing the
-// immediate
-defm "" : VPatIntegerSetCCSDNode_VV_VX<SETLT, "PseudoVMSLT">;
-defm "" : VPatIntegerSetCCSDNode_VV_VX<SETULT, "PseudoVMSLTU">;
+defm : VPatIntegerSetCCSDNode_VV_VX<SETLT, "PseudoVMSLT">;
+defm : VPatIntegerSetCCSDNode_VV_VX<SETULT, "PseudoVMSLTU">;
+defm : VPatIntegerSetCCSDNode_VIPlus1<SETLT, "PseudoVMSLE",
+ SplatPat_simm5_plus1>;
+defm : VPatIntegerSetCCSDNode_VIPlus1<SETULT, "PseudoVMSLEU",
+ SplatPat_simm5_plus1_nonzero>;
-defm "" : VPatIntegerSetCCSDNode_VV<SETGT, "PseudoVMSLT", /*swap*/1>;
-defm "" : VPatIntegerSetCCSDNode_VV<SETUGT, "PseudoVMSLTU", /*swap*/1>;
-defm "" : VPatIntegerSetCCSDNode_VX_VI<SETGT, "PseudoVMSGT">;
-defm "" : VPatIntegerSetCCSDNode_VX_VI<SETUGT, "PseudoVMSGTU">;
+defm : VPatIntegerSetCCSDNode_VV<SETGT, "PseudoVMSLT", /*swap*/1>;
+defm : VPatIntegerSetCCSDNode_VV<SETUGT, "PseudoVMSLTU", /*swap*/1>;
+defm : VPatIntegerSetCCSDNode_VX_VI<SETGT, "PseudoVMSGT">;
+defm : VPatIntegerSetCCSDNode_VX_VI<SETUGT, "PseudoVMSGTU">;
-defm "" : VPatIntegerSetCCSDNode_VV_VX_VI<SETLE, "PseudoVMSLE">;
-defm "" : VPatIntegerSetCCSDNode_VV_VX_VI<SETULE, "PseudoVMSLEU">;
+defm : VPatIntegerSetCCSDNode_VV_VX_VI<SETLE, "PseudoVMSLE">;
+defm : VPatIntegerSetCCSDNode_VV_VX_VI<SETULE, "PseudoVMSLEU">;
-// FIXME: Support immediate forms of these by choosing SGT and decrementing the
-// immediate
-defm "" : VPatIntegerSetCCSDNode_VV<SETGE, "PseudoVMSLE", /*swap*/1>;
-defm "" : VPatIntegerSetCCSDNode_VV<SETUGE, "PseudoVMSLEU", /*swap*/1>;
+defm : VPatIntegerSetCCSDNode_VV<SETGE, "PseudoVMSLE", /*swap*/1>;
+defm : VPatIntegerSetCCSDNode_VV<SETUGE, "PseudoVMSLEU", /*swap*/1>;
+defm : VPatIntegerSetCCSDNode_VIPlus1<SETGE, "PseudoVMSGT",
+ SplatPat_simm5_plus1>;
+defm : VPatIntegerSetCCSDNode_VIPlus1<SETUGE, "PseudoVMSGTU",
+ SplatPat_simm5_plus1_nonzero>;
// 12.9. Vector Integer Min/Max Instructions
-defm "" : VPatBinarySDNode_VV_VX<umin, "PseudoVMINU">;
-defm "" : VPatBinarySDNode_VV_VX<smin, "PseudoVMIN">;
-defm "" : VPatBinarySDNode_VV_VX<umax, "PseudoVMAXU">;
-defm "" : VPatBinarySDNode_VV_VX<smax, "PseudoVMAX">;
+defm : VPatBinarySDNode_VV_VX<umin, "PseudoVMINU">;
+defm : VPatBinarySDNode_VV_VX<smin, "PseudoVMIN">;
+defm : VPatBinarySDNode_VV_VX<umax, "PseudoVMAXU">;
+defm : VPatBinarySDNode_VV_VX<smax, "PseudoVMAX">;
// 12.10. Vector Single-Width Integer Multiply Instructions
-defm "" : VPatBinarySDNode_VV_VX<mul, "PseudoVMUL">;
-defm "" : VPatBinarySDNode_VV_VX<mulhs, "PseudoVMULH">;
-defm "" : VPatBinarySDNode_VV_VX<mulhu, "PseudoVMULHU">;
+defm : VPatBinarySDNode_VV_VX<mul, "PseudoVMUL">;
+defm : VPatBinarySDNode_VV_VX<mulhs, "PseudoVMULH">;
+defm : VPatBinarySDNode_VV_VX<mulhu, "PseudoVMULHU">;
// 12.11. Vector Integer Divide Instructions
-defm "" : VPatBinarySDNode_VV_VX<udiv, "PseudoVDIVU">;
-defm "" : VPatBinarySDNode_VV_VX<sdiv, "PseudoVDIV">;
-defm "" : VPatBinarySDNode_VV_VX<urem, "PseudoVREMU">;
-defm "" : VPatBinarySDNode_VV_VX<srem, "PseudoVREM">;
+defm : VPatBinarySDNode_VV_VX<udiv, "PseudoVDIVU">;
+defm : VPatBinarySDNode_VV_VX<sdiv, "PseudoVDIV">;
+defm : VPatBinarySDNode_VV_VX<urem, "PseudoVREMU">;
+defm : VPatBinarySDNode_VV_VX<srem, "PseudoVREM">;
-// 12.16. Vector Integer Merge Instructions
+// 12.13 Vector Single-Width Integer Multiply-Add Instructions.
+foreach vti = AllIntegerVectors in {
+ // NOTE: We choose VMADD because it has the most commuting freedom. So it
+ // works best with how TwoAddressInstructionPass tries commuting.
+ defvar suffix = vti.LMul.MX # "_COMMUTABLE";
+ def : Pat<(vti.Vector (add vti.RegClass:$rs2,
+ (mul_oneuse vti.RegClass:$rs1, vti.RegClass:$rd))),
+ (!cast<Instruction>("PseudoVMADD_VV_"# suffix)
+ vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
+ vti.AVL, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (sub vti.RegClass:$rs2,
+ (mul_oneuse vti.RegClass:$rs1, vti.RegClass:$rd))),
+ (!cast<Instruction>("PseudoVNMSUB_VV_"# suffix)
+ vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
+ vti.AVL, vti.Log2SEW)>;
+
+  // The choice of VMADD here is arbitrary; vmadd.vx and vmacc.vx are equally
+  // commutable.
+ def : Pat<(vti.Vector (add vti.RegClass:$rs2,
+ (mul_oneuse (SplatPat XLenVT:$rs1),
+ vti.RegClass:$rd))),
+ (!cast<Instruction>("PseudoVMADD_VX_" # suffix)
+ vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
+ vti.AVL, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (sub vti.RegClass:$rs2,
+ (mul_oneuse (SplatPat XLenVT:$rs1),
+ vti.RegClass:$rd))),
+ (!cast<Instruction>("PseudoVNMSUB_VX_" # suffix)
+ vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
+ vti.AVL, vti.Log2SEW)>;
+}
+
+// 12.15. Vector Integer Merge Instructions
foreach vti = AllIntegerVectors in {
def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), vti.RegClass:$rs1,
vti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX)
vti.RegClass:$rs2, vti.RegClass:$rs1, VMV0:$vm,
- vti.AVL, vti.SEW)>;
+ vti.AVL, vti.Log2SEW)>;
def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), (SplatPat XLenVT:$rs1),
vti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX)
- vti.RegClass:$rs2, GPR:$rs1, VMV0:$vm, vti.AVL, vti.SEW)>;
+ vti.RegClass:$rs2, GPR:$rs1, VMV0:$vm, vti.AVL, vti.Log2SEW)>;
def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), (SplatPat_simm5 simm5:$rs1),
vti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX)
- vti.RegClass:$rs2, simm5:$rs1, VMV0:$vm, vti.AVL, vti.SEW)>;
+ vti.RegClass:$rs2, simm5:$rs1, VMV0:$vm, vti.AVL, vti.Log2SEW)>;
}
+// 12.1. Vector Single-Width Saturating Add and Subtract
+defm : VPatBinarySDNode_VV_VX_VI<saddsat, "PseudoVSADD">;
+defm : VPatBinarySDNode_VV_VX_VI<uaddsat, "PseudoVSADDU">;
+defm : VPatBinarySDNode_VV_VX<ssubsat, "PseudoVSSUB">;
+defm : VPatBinarySDNode_VV_VX<usubsat, "PseudoVSSUBU">;
+
// 16.1. Vector Mask-Register Logical Instructions
foreach mti = AllMasks in {
def : Pat<(mti.Mask (and VR:$rs1, VR:$rs2)),
(!cast<Instruction>("PseudoVMAND_MM_"#mti.LMul.MX)
- VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+ VR:$rs1, VR:$rs2, mti.AVL, mti.Log2SEW)>;
def : Pat<(mti.Mask (or VR:$rs1, VR:$rs2)),
(!cast<Instruction>("PseudoVMOR_MM_"#mti.LMul.MX)
- VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+ VR:$rs1, VR:$rs2, mti.AVL, mti.Log2SEW)>;
def : Pat<(mti.Mask (xor VR:$rs1, VR:$rs2)),
(!cast<Instruction>("PseudoVMXOR_MM_"#mti.LMul.MX)
- VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+ VR:$rs1, VR:$rs2, mti.AVL, mti.Log2SEW)>;
- def : Pat<(mti.Mask (vnot (and VR:$rs1, VR:$rs2))),
+ def : Pat<(mti.Mask (rvv_vnot (and VR:$rs1, VR:$rs2))),
(!cast<Instruction>("PseudoVMNAND_MM_"#mti.LMul.MX)
- VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
- def : Pat<(mti.Mask (vnot (or VR:$rs1, VR:$rs2))),
+ VR:$rs1, VR:$rs2, mti.AVL, mti.Log2SEW)>;
+ def : Pat<(mti.Mask (rvv_vnot (or VR:$rs1, VR:$rs2))),
(!cast<Instruction>("PseudoVMNOR_MM_"#mti.LMul.MX)
- VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
- def : Pat<(mti.Mask (vnot (xor VR:$rs1, VR:$rs2))),
+ VR:$rs1, VR:$rs2, mti.AVL, mti.Log2SEW)>;
+ def : Pat<(mti.Mask (rvv_vnot (xor VR:$rs1, VR:$rs2))),
(!cast<Instruction>("PseudoVMXNOR_MM_"#mti.LMul.MX)
- VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+ VR:$rs1, VR:$rs2, mti.AVL, mti.Log2SEW)>;
- def : Pat<(mti.Mask (and VR:$rs1, (vnot VR:$rs2))),
+ def : Pat<(mti.Mask (and VR:$rs1, (rvv_vnot VR:$rs2))),
(!cast<Instruction>("PseudoVMANDNOT_MM_"#mti.LMul.MX)
- VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
- def : Pat<(mti.Mask (or VR:$rs1, (vnot VR:$rs2))),
+ VR:$rs1, VR:$rs2, mti.AVL, mti.Log2SEW)>;
+ def : Pat<(mti.Mask (or VR:$rs1, (rvv_vnot VR:$rs2))),
(!cast<Instruction>("PseudoVMORNOT_MM_"#mti.LMul.MX)
- VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+ VR:$rs1, VR:$rs2, mti.AVL, mti.Log2SEW)>;
+
+ // Handle rvv_vnot the same as the vmnot.m pseudoinstruction.
+ def : Pat<(mti.Mask (rvv_vnot VR:$rs)),
+ (!cast<Instruction>("PseudoVMNAND_MM_"#mti.LMul.MX)
+ VR:$rs, VR:$rs, mti.AVL, mti.Log2SEW)>;
}
} // Predicates = [HasStdExtV]
@@ -443,37 +584,138 @@
let Predicates = [HasStdExtV, HasStdExtF] in {
// 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions
-defm "" : VPatBinaryFPSDNode_VV_VF<fadd, "PseudoVFADD">;
-defm "" : VPatBinaryFPSDNode_VV_VF<fsub, "PseudoVFSUB">;
-defm "" : VPatBinaryFPSDNode_R_VF<fsub, "PseudoVFRSUB">;
+defm : VPatBinaryFPSDNode_VV_VF<fadd, "PseudoVFADD">;
+defm : VPatBinaryFPSDNode_VV_VF<fsub, "PseudoVFSUB">;
+defm : VPatBinaryFPSDNode_R_VF<fsub, "PseudoVFRSUB">;
// 14.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
-defm "" : VPatBinaryFPSDNode_VV_VF<fmul, "PseudoVFMUL">;
-defm "" : VPatBinaryFPSDNode_VV_VF<fdiv, "PseudoVFDIV">;
-defm "" : VPatBinaryFPSDNode_R_VF<fdiv, "PseudoVFRDIV">;
+defm : VPatBinaryFPSDNode_VV_VF<fmul, "PseudoVFMUL">;
+defm : VPatBinaryFPSDNode_VV_VF<fdiv, "PseudoVFDIV">;
+defm : VPatBinaryFPSDNode_R_VF<fdiv, "PseudoVFRDIV">;
-// 14.11. Vector Floating-Point Compare Instructions
-defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETEQ, "PseudoVMFEQ", "PseudoVMFEQ">;
-defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETOEQ, "PseudoVMFEQ", "PseudoVMFEQ">;
+// 14.6 Vector Single-Width Floating-Point Fused Multiply-Add Instructions.
+foreach fvti = AllFloatVectors in {
+ // NOTE: We choose VFMADD because it has the most commuting freedom. So it
+ // works best with how TwoAddressInstructionPass tries commuting.
+ defvar suffix = fvti.LMul.MX # "_COMMUTABLE";
+ def : Pat<(fvti.Vector (fma fvti.RegClass:$rs1, fvti.RegClass:$rd,
+ fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVFMADD_VV_"# suffix)
+ fvti.RegClass:$rd, fvti.RegClass:$rs1, fvti.RegClass:$rs2,
+ fvti.AVL, fvti.Log2SEW)>;
+ def : Pat<(fvti.Vector (fma fvti.RegClass:$rs1, fvti.RegClass:$rd,
+ (fneg fvti.RegClass:$rs2))),
+ (!cast<Instruction>("PseudoVFMSUB_VV_"# suffix)
+ fvti.RegClass:$rd, fvti.RegClass:$rs1, fvti.RegClass:$rs2,
+ fvti.AVL, fvti.Log2SEW)>;
+ def : Pat<(fvti.Vector (fma (fneg fvti.RegClass:$rs1), fvti.RegClass:$rd,
+ (fneg fvti.RegClass:$rs2))),
+ (!cast<Instruction>("PseudoVFNMADD_VV_"# suffix)
+ fvti.RegClass:$rd, fvti.RegClass:$rs1, fvti.RegClass:$rs2,
+ fvti.AVL, fvti.Log2SEW)>;
+ def : Pat<(fvti.Vector (fma (fneg fvti.RegClass:$rs1), fvti.RegClass:$rd,
+ fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVFNMSUB_VV_"# suffix)
+ fvti.RegClass:$rd, fvti.RegClass:$rs1, fvti.RegClass:$rs2,
+ fvti.AVL, fvti.Log2SEW)>;
-defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETNE, "PseudoVMFNE", "PseudoVMFNE">;
-defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETUNE, "PseudoVMFNE", "PseudoVMFNE">;
+  // The choice of VFMADD here is arbitrary; vfmadd.vf and vfmacc.vf are equally
+  // commutable.
+ def : Pat<(fvti.Vector (fma (splat_vector fvti.ScalarRegClass:$rs1),
+ fvti.RegClass:$rd, fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVFMADD_V" # fvti.ScalarSuffix # "_" # suffix)
+ fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2,
+ fvti.AVL, fvti.Log2SEW)>;
+ def : Pat<(fvti.Vector (fma (splat_vector fvti.ScalarRegClass:$rs1),
+ fvti.RegClass:$rd, (fneg fvti.RegClass:$rs2))),
+ (!cast<Instruction>("PseudoVFMSUB_V" # fvti.ScalarSuffix # "_" # suffix)
+ fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2,
+ fvti.AVL, fvti.Log2SEW)>;
-defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETLT, "PseudoVMFLT", "PseudoVMFGT">;
-defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETOLT, "PseudoVMFLT", "PseudoVMFGT">;
+ def : Pat<(fvti.Vector (fma (splat_vector fvti.ScalarRegClass:$rs1),
+ (fneg fvti.RegClass:$rd), (fneg fvti.RegClass:$rs2))),
+ (!cast<Instruction>("PseudoVFNMADD_V" # fvti.ScalarSuffix # "_" # suffix)
+ fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2,
+ fvti.AVL, fvti.Log2SEW)>;
+ def : Pat<(fvti.Vector (fma (splat_vector fvti.ScalarRegClass:$rs1),
+ (fneg fvti.RegClass:$rd), fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVFNMSUB_V" # fvti.ScalarSuffix # "_" # suffix)
+ fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2,
+ fvti.AVL, fvti.Log2SEW)>;
-defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETLE, "PseudoVMFLE", "PseudoVMFGE">;
-defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETOLE, "PseudoVMFLE", "PseudoVMFGE">;
+ // The splat might be negated.
+ def : Pat<(fvti.Vector (fma (fneg (splat_vector fvti.ScalarRegClass:$rs1)),
+ fvti.RegClass:$rd, (fneg fvti.RegClass:$rs2))),
+ (!cast<Instruction>("PseudoVFNMADD_V" # fvti.ScalarSuffix # "_" # suffix)
+ fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2,
+ fvti.AVL, fvti.Log2SEW)>;
+ def : Pat<(fvti.Vector (fma (fneg (splat_vector fvti.ScalarRegClass:$rs1)),
+ fvti.RegClass:$rd, fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVFNMSUB_V" # fvti.ScalarSuffix # "_" # suffix)
+ fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2,
+ fvti.AVL, fvti.Log2SEW)>;
+}
+
+foreach vti = AllFloatVectors in {
+ // 14.8. Vector Floating-Point Square-Root Instruction
+ def : Pat<(fsqrt (vti.Vector vti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVFSQRT_V_"# vti.LMul.MX)
+ vti.RegClass:$rs2, vti.AVL, vti.Log2SEW)>;
+
+ // 14.12. Vector Floating-Point Sign-Injection Instructions
+ def : Pat<(fabs (vti.Vector vti.RegClass:$rs)),
+ (!cast<Instruction>("PseudoVFSGNJX_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW)>;
+ // Handle fneg with VFSGNJN using the same input for both operands.
+ def : Pat<(fneg (vti.Vector vti.RegClass:$rs)),
+ (!cast<Instruction>("PseudoVFSGNJN_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector vti.RegClass:$rs2))),
+ (!cast<Instruction>("PseudoVFSGNJ_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector (splat_vector vti.ScalarRegClass:$rs2)))),
+ (!cast<Instruction>("PseudoVFSGNJ_V"#vti.ScalarSuffix#"_"#vti.LMul.MX)
+ vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector (fneg vti.RegClass:$rs2)))),
+ (!cast<Instruction>("PseudoVFSGNJN_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector (fneg (splat_vector vti.ScalarRegClass:$rs2))))),
+ (!cast<Instruction>("PseudoVFSGNJN_V"#vti.ScalarSuffix#"_"#vti.LMul.MX)
+ vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW)>;
+}
+
+// 14.11. Vector Floating-Point MIN/MAX Instructions
+defm : VPatBinaryFPSDNode_VV_VF<fminnum, "PseudoVFMIN">;
+defm : VPatBinaryFPSDNode_VV_VF<fmaxnum, "PseudoVFMAX">;
+
+// 14.13. Vector Floating-Point Compare Instructions
+defm : VPatFPSetCCSDNode_VV_VF_FV<SETEQ, "PseudoVMFEQ", "PseudoVMFEQ">;
+defm : VPatFPSetCCSDNode_VV_VF_FV<SETOEQ, "PseudoVMFEQ", "PseudoVMFEQ">;
+
+defm : VPatFPSetCCSDNode_VV_VF_FV<SETNE, "PseudoVMFNE", "PseudoVMFNE">;
+defm : VPatFPSetCCSDNode_VV_VF_FV<SETUNE, "PseudoVMFNE", "PseudoVMFNE">;
+
+defm : VPatFPSetCCSDNode_VV_VF_FV<SETLT, "PseudoVMFLT", "PseudoVMFGT">;
+defm : VPatFPSetCCSDNode_VV_VF_FV<SETOLT, "PseudoVMFLT", "PseudoVMFGT">;
+
+defm : VPatFPSetCCSDNode_VV_VF_FV<SETLE, "PseudoVMFLE", "PseudoVMFGE">;
+defm : VPatFPSetCCSDNode_VV_VF_FV<SETOLE, "PseudoVMFLE", "PseudoVMFGE">;
// Floating-point vselects:
-// 12.16. Vector Integer Merge Instructions
-// 14.13. Vector Floating-Point Merge Instruction
+// 12.15. Vector Integer Merge Instructions
+// 14.15. Vector Floating-Point Merge Instruction
foreach fvti = AllFloatVectors in {
def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1,
fvti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
fvti.RegClass:$rs2, fvti.RegClass:$rs1, VMV0:$vm,
- fvti.AVL, fvti.SEW)>;
+ fvti.AVL, fvti.Log2SEW)>;
def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm),
(splat_vector fvti.ScalarRegClass:$rs1),
@@ -481,13 +723,45 @@
(!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
fvti.RegClass:$rs2,
(fvti.Scalar fvti.ScalarRegClass:$rs1),
- VMV0:$vm, fvti.AVL, fvti.SEW)>;
+ VMV0:$vm, fvti.AVL, fvti.Log2SEW)>;
def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm),
(splat_vector (fvti.Scalar fpimm0)),
fvti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
- fvti.RegClass:$rs2, 0, VMV0:$vm, fvti.AVL, fvti.SEW)>;
+ fvti.RegClass:$rs2, 0, VMV0:$vm, fvti.AVL, fvti.Log2SEW)>;
+}
+
+// 14.17. Vector Single-Width Floating-Point/Integer Type-Convert Instructions
+defm : VPatConvertFP2ISDNode_V<fp_to_sint, "PseudoVFCVT_RTZ_X_F_V">;
+defm : VPatConvertFP2ISDNode_V<fp_to_uint, "PseudoVFCVT_RTZ_XU_F_V">;
+defm : VPatConvertI2FPSDNode_V<sint_to_fp, "PseudoVFCVT_F_X_V">;
+defm : VPatConvertI2FPSDNode_V<uint_to_fp, "PseudoVFCVT_F_XU_V">;
+
+// 14.18. Widening Floating-Point/Integer Type-Convert Instructions
+defm : VPatWConvertFP2ISDNode_V<fp_to_sint, "PseudoVFWCVT_RTZ_X_F_V">;
+defm : VPatWConvertFP2ISDNode_V<fp_to_uint, "PseudoVFWCVT_RTZ_XU_F_V">;
+defm : VPatWConvertI2FPSDNode_V<sint_to_fp, "PseudoVFWCVT_F_X_V">;
+defm : VPatWConvertI2FPSDNode_V<uint_to_fp, "PseudoVFWCVT_F_XU_V">;
+foreach fvtiToFWti = AllWidenableFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ def : Pat<(fwti.Vector (fpextend (fvti.Vector fvti.RegClass:$rs1))),
+ (!cast<Instruction>("PseudoVFWCVT_F_F_V_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>;
+}
+
+// 14.19. Narrowing Floating-Point/Integer Type-Convert Instructions
+defm : VPatNConvertFP2ISDNode_V<fp_to_sint, "PseudoVFNCVT_RTZ_X_F_W">;
+defm : VPatNConvertFP2ISDNode_V<fp_to_uint, "PseudoVFNCVT_RTZ_XU_F_W">;
+defm : VPatNConvertI2FPSDNode_V<sint_to_fp, "PseudoVFNCVT_F_X_W">;
+defm : VPatNConvertI2FPSDNode_V<uint_to_fp, "PseudoVFNCVT_F_XU_W">;
+foreach fvtiToFWti = AllWidenableFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
+ (!cast<Instruction>("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX)
+ fwti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>;
}
} // Predicates = [HasStdExtV, HasStdExtF]
@@ -497,147 +771,39 @@
let Predicates = [HasStdExtV] in {
foreach vti = AllIntegerVectors in {
- def : Pat<(vti.Vector (splat_vector GPR:$rs1)),
+ def : Pat<(vti.Vector (SplatPat GPR:$rs1)),
(!cast<Instruction>("PseudoVMV_V_X_" # vti.LMul.MX)
- GPR:$rs1, vti.AVL, vti.SEW)>;
- def : Pat<(vti.Vector (splat_vector simm5:$rs1)),
+ GPR:$rs1, vti.AVL, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (SplatPat_simm5 simm5:$rs1)),
(!cast<Instruction>("PseudoVMV_V_I_" # vti.LMul.MX)
- simm5:$rs1, vti.AVL, vti.SEW)>;
-}
-
-foreach mti = AllMasks in {
- def : Pat<(mti.Mask immAllOnesV),
- (!cast<Instruction>("PseudoVMSET_M_"#mti.BX) mti.AVL, mti.SEW)>;
- def : Pat<(mti.Mask immAllZerosV),
- (!cast<Instruction>("PseudoVMCLR_M_"#mti.BX) mti.AVL, mti.SEW)>;
+ simm5:$rs1, vti.AVL, vti.Log2SEW)>;
}
} // Predicates = [HasStdExtV]
-let Predicates = [HasStdExtV, IsRV32] in {
-foreach vti = AllIntegerVectors in {
- if !eq(vti.SEW, 64) then {
- def : Pat<(vti.Vector (rv32_splat_i64 GPR:$rs1)),
- (!cast<Instruction>("PseudoVMV_V_X_" # vti.LMul.MX)
- GPR:$rs1, vti.AVL, vti.SEW)>;
- def : Pat<(vti.Vector (rv32_splat_i64 simm5:$rs1)),
- (!cast<Instruction>("PseudoVMV_V_I_" # vti.LMul.MX)
- simm5:$rs1, vti.AVL, vti.SEW)>;
- }
-}
-} // Predicates = [HasStdExtV, IsRV32]
-
let Predicates = [HasStdExtV, HasStdExtF] in {
foreach fvti = AllFloatVectors in {
def : Pat<(fvti.Vector (splat_vector fvti.ScalarRegClass:$rs1)),
(!cast<Instruction>("PseudoVFMV_V_"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
(fvti.Scalar fvti.ScalarRegClass:$rs1),
- fvti.AVL, fvti.SEW)>;
+ fvti.AVL, fvti.Log2SEW)>;
def : Pat<(fvti.Vector (splat_vector (fvti.Scalar fpimm0))),
(!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX)
- 0, fvti.AVL, fvti.SEW)>;
+ 0, fvti.AVL, fvti.Log2SEW)>;
}
} // Predicates = [HasStdExtV, HasStdExtF]
//===----------------------------------------------------------------------===//
-// Vector Element Inserts/Extracts
+// Vector Element Extracts
//===----------------------------------------------------------------------===//
-
-// The built-in TableGen 'extractelt' and 'insertelt' nodes must return the
-// same type as the vector element type. On RISC-V, XLenVT is the only legal
-// integer type, so for integer inserts/extracts we use a custom node which
-// returns XLenVT.
-def riscv_insert_vector_elt
- : SDNode<"ISD::INSERT_VECTOR_ELT",
- SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisVT<2, XLenVT>,
- SDTCisPtrTy<3>]>, []>;
-def riscv_extract_vector_elt
- : SDNode<"ISD::EXTRACT_VECTOR_ELT",
- SDTypeProfile<1, 2, [SDTCisVT<0, XLenVT>, SDTCisPtrTy<2>]>, []>;
-
-multiclass VPatInsertExtractElt_XI_Idx<bit IsFloat> {
- defvar vtilist = !if(IsFloat, AllFloatVectors, AllIntegerVectors);
- defvar insertelt_node = !if(IsFloat, insertelt, riscv_insert_vector_elt);
- defvar extractelt_node = !if(IsFloat, extractelt, riscv_extract_vector_elt);
- foreach vti = vtilist in {
- defvar MX = vti.LMul.MX;
- defvar vmv_xf_s_inst = !cast<Instruction>(!strconcat("PseudoV",
- !if(IsFloat, "F", ""),
- "MV_",
- vti.ScalarSuffix,
- "_S_", MX));
- defvar vmv_s_xf_inst = !cast<Instruction>(!strconcat("PseudoV",
- !if(IsFloat, "F", ""),
- "MV_S_",
- vti.ScalarSuffix,
- "_", MX));
- // Only pattern-match insert/extract-element operations where the index is
- // 0. Any other index will have been custom-lowered to slide the vector
- // correctly into place (and, in the case of insert, slide it back again
- // afterwards).
- def : Pat<(vti.Scalar (extractelt_node (vti.Vector vti.RegClass:$rs2), 0)),
- (vmv_xf_s_inst vti.RegClass:$rs2, vti.SEW)>;
-
- def : Pat<(vti.Vector (insertelt_node (vti.Vector vti.RegClass:$merge),
- vti.ScalarRegClass:$rs1, 0)),
- (vmv_s_xf_inst vti.RegClass:$merge,
- (vti.Scalar vti.ScalarRegClass:$rs1),
- vti.AVL, vti.SEW)>;
- }
-}
-
-let Predicates = [HasStdExtV] in
-defm "" : VPatInsertExtractElt_XI_Idx</*IsFloat*/0>;
let Predicates = [HasStdExtV, HasStdExtF] in
-defm "" : VPatInsertExtractElt_XI_Idx</*IsFloat*/1>;
-
-//===----------------------------------------------------------------------===//
-// Miscellaneous RISCVISD SDNodes
-//===----------------------------------------------------------------------===//
-
-def riscv_vid
- : SDNode<"RISCVISD::VID", SDTypeProfile<1, 0, [SDTCisVec<0>]>, []>;
-
-def SDTRVVSlide : SDTypeProfile<1, 3, [
- SDTCisVec<0>, SDTCisSameAs<1, 0>, SDTCisSameAs<2, 0>, SDTCisVT<3, XLenVT>
-]>;
-
-def riscv_slideup : SDNode<"RISCVISD::VSLIDEUP", SDTRVVSlide, []>;
-def riscv_slidedown : SDNode<"RISCVISD::VSLIDEDOWN", SDTRVVSlide, []>;
-
-let Predicates = [HasStdExtV] in {
-
-foreach vti = AllIntegerVectors in
- def : Pat<(vti.Vector riscv_vid),
- (!cast<Instruction>("PseudoVID_V_"#vti.LMul.MX) vti.AVL, vti.SEW)>;
-
-foreach vti = !listconcat(AllIntegerVectors, AllFloatVectors) in {
- def : Pat<(vti.Vector (riscv_slideup (vti.Vector vti.RegClass:$rs3),
- (vti.Vector vti.RegClass:$rs1),
- uimm5:$rs2)),
- (!cast<Instruction>("PseudoVSLIDEUP_VI_"#vti.LMul.MX)
- vti.RegClass:$rs3, vti.RegClass:$rs1, uimm5:$rs2,
- vti.AVL, vti.SEW)>;
-
- def : Pat<(vti.Vector (riscv_slideup (vti.Vector vti.RegClass:$rs3),
- (vti.Vector vti.RegClass:$rs1),
- GPR:$rs2)),
- (!cast<Instruction>("PseudoVSLIDEUP_VX_"#vti.LMul.MX)
- vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2,
- vti.AVL, vti.SEW)>;
-
- def : Pat<(vti.Vector (riscv_slidedown (vti.Vector vti.RegClass:$rs3),
- (vti.Vector vti.RegClass:$rs1),
- uimm5:$rs2)),
- (!cast<Instruction>("PseudoVSLIDEDOWN_VI_"#vti.LMul.MX)
- vti.RegClass:$rs3, vti.RegClass:$rs1, uimm5:$rs2,
- vti.AVL, vti.SEW)>;
-
- def : Pat<(vti.Vector (riscv_slidedown (vti.Vector vti.RegClass:$rs3),
- (vti.Vector vti.RegClass:$rs1),
- GPR:$rs2)),
- (!cast<Instruction>("PseudoVSLIDEDOWN_VX_"#vti.LMul.MX)
- vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2,
- vti.AVL, vti.SEW)>;
+foreach vti = AllFloatVectors in {
+ defvar vmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_",
+ vti.ScalarSuffix,
+ "_S_", vti.LMul.MX));
+ // Only pattern-match extract-element operations where the index is 0. Any
+ // other index will have been custom-lowered to slide the vector correctly
+ // into place.
+ def : Pat<(vti.Scalar (extractelt (vti.Vector vti.RegClass:$rs2), 0)),
+ (vmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>;
}
-} // Predicates = [HasStdExtV]
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
new file mode 100644
index 0000000..c9c4215
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -0,0 +1,1493 @@
+//===- RISCVInstrInfoVVLPatterns.td - RVV VL patterns ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file contains the required infrastructure and VL patterns to
+/// support code generation for the standard 'V' (Vector) extension, version
+/// 0.10. This version is still experimental as the 'V' extension hasn't been
+/// ratified yet.
+///
+/// This file is included from and depends upon RISCVInstrInfoVPseudos.td
+///
+/// Note: the patterns for RVV intrinsics are found in
+/// RISCVInstrInfoVPseudos.td.
+///
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Helpers to define the VL patterns.
+//===----------------------------------------------------------------------===//
+
+def SDT_RISCVVLE_VL : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisPtrTy<1>,
+ SDTCisVT<2, XLenVT>]>;
+def SDT_RISCVVSE_VL : SDTypeProfile<0, 3, [SDTCisVec<0>, SDTCisPtrTy<1>,
+ SDTCisVT<2, XLenVT>]>;
+
+def SDT_RISCVIntBinOp_VL : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisVec<0>, SDTCisInt<0>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisVT<4, XLenVT>]>;
+
+def SDT_RISCVFPUnOp_VL : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+ SDTCisVec<0>, SDTCisFP<0>,
+ SDTCVecEltisVT<2, i1>,
+ SDTCisSameNumEltsAs<0, 2>,
+ SDTCisVT<3, XLenVT>]>;
+def SDT_RISCVFPBinOp_VL : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisVec<0>, SDTCisFP<0>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisVT<4, XLenVT>]>;
+
+def riscv_vmv_v_x_vl : SDNode<"RISCVISD::VMV_V_X_VL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
+ SDTCisVT<1, XLenVT>,
+ SDTCisVT<2, XLenVT>]>>;
+def riscv_vfmv_v_f_vl : SDNode<"RISCVISD::VFMV_V_F_VL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>,
+ SDTCisEltOfVec<1, 0>,
+ SDTCisVT<2, XLenVT>]>>;
+def riscv_vmv_s_x_vl : SDNode<"RISCVISD::VMV_S_X_VL",
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+ SDTCisInt<0>,
+ SDTCisVT<2, XLenVT>,
+ SDTCisVT<3, XLenVT>]>>;
+def riscv_vfmv_s_f_vl : SDNode<"RISCVISD::VFMV_S_F_VL",
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+ SDTCisFP<0>,
+ SDTCisEltOfVec<2, 0>,
+ SDTCisVT<3, XLenVT>]>>;
+
+def riscv_vle_vl : SDNode<"RISCVISD::VLE_VL", SDT_RISCVVLE_VL,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def riscv_vse_vl : SDNode<"RISCVISD::VSE_VL", SDT_RISCVVSE_VL,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def riscv_add_vl : SDNode<"RISCVISD::ADD_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
+def riscv_sub_vl : SDNode<"RISCVISD::SUB_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_mul_vl : SDNode<"RISCVISD::MUL_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
+def riscv_mulhs_vl : SDNode<"RISCVISD::MULHS_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
+def riscv_mulhu_vl : SDNode<"RISCVISD::MULHU_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
+def riscv_and_vl : SDNode<"RISCVISD::AND_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
+def riscv_or_vl : SDNode<"RISCVISD::OR_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
+def riscv_xor_vl : SDNode<"RISCVISD::XOR_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
+def riscv_sdiv_vl : SDNode<"RISCVISD::SDIV_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_srem_vl : SDNode<"RISCVISD::SREM_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_udiv_vl : SDNode<"RISCVISD::UDIV_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_urem_vl : SDNode<"RISCVISD::UREM_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_shl_vl : SDNode<"RISCVISD::SHL_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_sra_vl : SDNode<"RISCVISD::SRA_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_srl_vl : SDNode<"RISCVISD::SRL_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_smin_vl : SDNode<"RISCVISD::SMIN_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_smax_vl : SDNode<"RISCVISD::SMAX_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_umin_vl : SDNode<"RISCVISD::UMIN_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_umax_vl : SDNode<"RISCVISD::UMAX_VL", SDT_RISCVIntBinOp_VL>;
+
+def riscv_saddsat_vl : SDNode<"RISCVISD::SADDSAT_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_uaddsat_vl : SDNode<"RISCVISD::UADDSAT_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_ssubsat_vl : SDNode<"RISCVISD::SSUBSAT_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_usubsat_vl : SDNode<"RISCVISD::USUBSAT_VL", SDT_RISCVIntBinOp_VL>;
+
+def riscv_fadd_vl : SDNode<"RISCVISD::FADD_VL", SDT_RISCVFPBinOp_VL, [SDNPCommutative]>;
+def riscv_fsub_vl : SDNode<"RISCVISD::FSUB_VL", SDT_RISCVFPBinOp_VL>;
+def riscv_fmul_vl : SDNode<"RISCVISD::FMUL_VL", SDT_RISCVFPBinOp_VL, [SDNPCommutative]>;
+def riscv_fdiv_vl : SDNode<"RISCVISD::FDIV_VL", SDT_RISCVFPBinOp_VL>;
+def riscv_fneg_vl : SDNode<"RISCVISD::FNEG_VL", SDT_RISCVFPUnOp_VL>;
+def riscv_fabs_vl : SDNode<"RISCVISD::FABS_VL", SDT_RISCVFPUnOp_VL>;
+def riscv_fsqrt_vl : SDNode<"RISCVISD::FSQRT_VL", SDT_RISCVFPUnOp_VL>;
+def riscv_fcopysign_vl : SDNode<"RISCVISD::FCOPYSIGN_VL", SDT_RISCVFPBinOp_VL>;
+def riscv_fminnum_vl : SDNode<"RISCVISD::FMINNUM_VL", SDT_RISCVFPBinOp_VL>;
+def riscv_fmaxnum_vl : SDNode<"RISCVISD::FMAXNUM_VL", SDT_RISCVFPBinOp_VL>;
+
+def SDT_RISCVVecFMA_VL : SDTypeProfile<1, 5, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisVec<0>, SDTCisFP<0>,
+ SDTCVecEltisVT<4, i1>,
+ SDTCisSameNumEltsAs<0, 4>,
+ SDTCisVT<5, XLenVT>]>;
+def riscv_fma_vl : SDNode<"RISCVISD::FMA_VL", SDT_RISCVVecFMA_VL, [SDNPCommutative]>;
+
+def SDT_RISCVFPRoundOp_VL : SDTypeProfile<1, 3, [
+ SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>,
+ SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT>
+]>;
+def SDT_RISCVFPExtendOp_VL : SDTypeProfile<1, 3, [
+ SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisSameNumEltsAs<0, 1>,
+ SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT>
+]>;
+
+def riscv_fpround_vl : SDNode<"RISCVISD::FP_ROUND_VL", SDT_RISCVFPRoundOp_VL>;
+def riscv_fpextend_vl : SDNode<"RISCVISD::FP_EXTEND_VL", SDT_RISCVFPExtendOp_VL>;
+def riscv_fncvt_rod_vl : SDNode<"RISCVISD::VFNCVT_ROD_VL", SDT_RISCVFPRoundOp_VL>;
+
+def SDT_RISCVFP2IOp_VL : SDTypeProfile<1, 3, [
+ SDTCisInt<0>, SDTCisFP<1>, SDTCisSameNumEltsAs<0, 1>,
+ SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT>
+]>;
+def SDT_RISCVI2FPOp_VL : SDTypeProfile<1, 3, [
+ SDTCisFP<0>, SDTCisInt<1>, SDTCisSameNumEltsAs<0, 1>,
+ SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT>
+]>;
+
+def riscv_fp_to_sint_vl : SDNode<"RISCVISD::FP_TO_SINT_VL", SDT_RISCVFP2IOp_VL>;
+def riscv_fp_to_uint_vl : SDNode<"RISCVISD::FP_TO_UINT_VL", SDT_RISCVFP2IOp_VL>;
+def riscv_sint_to_fp_vl : SDNode<"RISCVISD::SINT_TO_FP_VL", SDT_RISCVI2FPOp_VL>;
+def riscv_uint_to_fp_vl : SDNode<"RISCVISD::UINT_TO_FP_VL", SDT_RISCVI2FPOp_VL>;
+
+def riscv_setcc_vl : SDNode<"RISCVISD::SETCC_VL",
+ SDTypeProfile<1, 5, [SDTCVecEltisVT<0, i1>,
+ SDTCisVec<1>,
+ SDTCisSameNumEltsAs<0, 1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisVT<3, OtherVT>,
+ SDTCisSameAs<0, 4>,
+ SDTCisVT<5, XLenVT>]>>;
+
+def riscv_vrgather_vx_vl : SDNode<"RISCVISD::VRGATHER_VX_VL",
+ SDTypeProfile<1, 4, [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisVT<2, XLenVT>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisVT<4, XLenVT>]>>;
+def riscv_vrgather_vv_vl : SDNode<"RISCVISD::VRGATHER_VV_VL",
+ SDTypeProfile<1, 4, [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisInt<2>,
+ SDTCisSameNumEltsAs<0, 2>,
+ SDTCisSameSizeAs<0, 2>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisVT<4, XLenVT>]>>;
+def riscv_vrgatherei16_vv_vl : SDNode<"RISCVISD::VRGATHEREI16_VV_VL",
+ SDTypeProfile<1, 4, [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisInt<2>,
+ SDTCVecEltisVT<2, i16>,
+ SDTCisSameNumEltsAs<0, 2>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisVT<4, XLenVT>]>>;
+
+def riscv_vselect_vl : SDNode<"RISCVISD::VSELECT_VL",
+ SDTypeProfile<1, 4, [SDTCisVec<0>,
+ SDTCisVec<1>,
+ SDTCisSameNumEltsAs<0, 1>,
+ SDTCVecEltisVT<1, i1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<2, 3>,
+ SDTCisVT<4, XLenVT>]>>;
+
+def SDT_RISCVMaskBinOp_VL : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCVecEltisVT<0, i1>,
+ SDTCisVT<3, XLenVT>]>;
+def riscv_vmand_vl : SDNode<"RISCVISD::VMAND_VL", SDT_RISCVMaskBinOp_VL, [SDNPCommutative]>;
+def riscv_vmor_vl : SDNode<"RISCVISD::VMOR_VL", SDT_RISCVMaskBinOp_VL, [SDNPCommutative]>;
+def riscv_vmxor_vl : SDNode<"RISCVISD::VMXOR_VL", SDT_RISCVMaskBinOp_VL, [SDNPCommutative]>;
+
+def true_mask : PatLeaf<(riscv_vmset_vl (XLenVT srcvalue))>;
+
+def riscv_vmnot_vl : PatFrag<(ops node:$rs, node:$vl),
+ (riscv_vmxor_vl node:$rs, true_mask, node:$vl)>;
+
+def riscv_vpopc_vl : SDNode<"RISCVISD::VPOPC_VL",
+ SDTypeProfile<1, 3, [SDTCisVT<0, XLenVT>,
+ SDTCisVec<1>, SDTCisInt<1>,
+ SDTCVecEltisVT<2, i1>,
+ SDTCisSameNumEltsAs<1, 2>,
+ SDTCisVT<3, XLenVT>]>>;
+
+def SDT_RISCVVEXTEND_VL : SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisSameNumEltsAs<0, 1>,
+ SDTCisSameNumEltsAs<1, 2>,
+ SDTCVecEltisVT<2, i1>,
+ SDTCisVT<3, XLenVT>]>;
+def riscv_sext_vl : SDNode<"RISCVISD::VSEXT_VL", SDT_RISCVVEXTEND_VL>;
+def riscv_zext_vl : SDNode<"RISCVISD::VZEXT_VL", SDT_RISCVVEXTEND_VL>;
+
+def riscv_trunc_vector_vl : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL",
+ SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisVec<1>,
+ SDTCisSameNumEltsAs<0, 2>,
+ SDTCVecEltisVT<2, i1>,
+ SDTCisVT<3, XLenVT>]>>;
+
+def SDT_RISCVVWMUL_VL : SDTypeProfile<1, 4, [SDTCisVec<0>,
+ SDTCisSameNumEltsAs<0, 1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisSameNumEltsAs<1, 3>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisVT<4, XLenVT>]>;
+def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>;
+def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>;
+
+def SDTRVVVecReduce : SDTypeProfile<1, 4, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<1, 3>, SDTCisVT<4, XLenVT>
+]>;
+
+def riscv_mul_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D),
+ (riscv_mul_vl node:$A, node:$B, node:$C,
+ node:$D), [{
+ return N->hasOneUse();
+}]>;
+
+def riscv_vwmul_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D),
+ (riscv_vwmul_vl node:$A, node:$B, node:$C,
+ node:$D), [{
+ return N->hasOneUse();
+}]>;
+
+def riscv_vwmulu_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D),
+ (riscv_vwmulu_vl node:$A, node:$B, node:$C,
+ node:$D), [{
+ return N->hasOneUse();
+}]>;
+
+foreach kind = ["ADD", "UMAX", "SMAX", "UMIN", "SMIN", "AND", "OR", "XOR",
+ "FADD", "SEQ_FADD", "FMIN", "FMAX"] in
+ def rvv_vecreduce_#kind#_vl : SDNode<"RISCVISD::VECREDUCE_"#kind#"_VL", SDTRVVVecReduce>;
+
+// Ignore the vl operand.
+def SplatFPOp : PatFrag<(ops node:$op),
+ (riscv_vfmv_v_f_vl node:$op, srcvalue)>;
+
+def sew8simm5 : ComplexPattern<XLenVT, 1, "selectRVVSimm5<8>", []>;
+def sew16simm5 : ComplexPattern<XLenVT, 1, "selectRVVSimm5<16>", []>;
+def sew32simm5 : ComplexPattern<XLenVT, 1, "selectRVVSimm5<32>", []>;
+def sew64simm5 : ComplexPattern<XLenVT, 1, "selectRVVSimm5<64>", []>;
+
+multiclass VPatBinaryVL_VV<SDNode vop,
+ string instruction_name,
+ ValueType result_type,
+ ValueType op_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg RetClass,
+ VReg op_reg_class> {
+ def : Pat<(result_type (vop
+ (op_type op_reg_class:$rs1),
+ (op_type op_reg_class:$rs2),
+ (mask_type true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_VV_"# vlmul.MX)
+ op_reg_class:$rs1,
+ op_reg_class:$rs2,
+ GPR:$vl, sew)>;
+ def : Pat<(result_type (vop
+ (op_type op_reg_class:$rs1),
+ (op_type op_reg_class:$rs2),
+ (mask_type VMV0:$vm),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_VV_"# vlmul.MX#"_MASK")
+ (result_type (IMPLICIT_DEF)),
+ op_reg_class:$rs1,
+ op_reg_class:$rs2,
+ VMV0:$vm, GPR:$vl, sew)>;
+}
+
+multiclass VPatBinaryVL_XI<SDNode vop,
+ string instruction_name,
+ string suffix,
+ ValueType result_type,
+ ValueType vop_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg RetClass,
+ VReg vop_reg_class,
+ ComplexPattern SplatPatKind,
+ DAGOperand xop_kind> {
+ def : Pat<(result_type (vop
+ (vop_type vop_reg_class:$rs1),
+ (vop_type (SplatPatKind (XLenVT xop_kind:$rs2))),
+ (mask_type true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#_#suffix#_# vlmul.MX)
+ vop_reg_class:$rs1,
+ xop_kind:$rs2,
+ GPR:$vl, sew)>;
+ def : Pat<(result_type (vop
+ (vop_type vop_reg_class:$rs1),
+ (vop_type (SplatPatKind (XLenVT xop_kind:$rs2))),
+ (mask_type VMV0:$vm),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#_#suffix#_# vlmul.MX#"_MASK")
+ (result_type (IMPLICIT_DEF)),
+ vop_reg_class:$rs1,
+ xop_kind:$rs2,
+ VMV0:$vm, GPR:$vl, sew)>;
+}
+
+multiclass VPatBinaryVL_VV_VX<SDNode vop, string instruction_name> {
+ foreach vti = AllIntegerVectors in {
+ defm : VPatBinaryVL_VV<vop, instruction_name,
+ vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
+ vti.LMul, vti.RegClass, vti.RegClass>;
+ defm : VPatBinaryVL_XI<vop, instruction_name, "VX",
+ vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
+ vti.LMul, vti.RegClass, vti.RegClass,
+ SplatPat, GPR>;
+ }
+}
+
+multiclass VPatBinaryVL_VV_VX_VI<SDNode vop, string instruction_name,
+ Operand ImmType = simm5>
+ : VPatBinaryVL_VV_VX<vop, instruction_name> {
+ foreach vti = AllIntegerVectors in {
+ defm : VPatBinaryVL_XI<vop, instruction_name, "VI",
+ vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
+ vti.LMul, vti.RegClass, vti.RegClass,
+ !cast<ComplexPattern>(SplatPat#_#ImmType),
+ ImmType>;
+ }
+}
+
+multiclass VPatBinaryWVL_VV_VX<SDNode vop, string instruction_name> {
+ foreach VtiToWti = AllWidenableIntVectors in {
+ defvar vti = VtiToWti.Vti;
+ defvar wti = VtiToWti.Wti;
+ defm : VPatBinaryVL_VV<vop, instruction_name,
+ wti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
+ vti.LMul, wti.RegClass, vti.RegClass>;
+ defm : VPatBinaryVL_XI<vop, instruction_name, "VX",
+ wti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
+ vti.LMul, wti.RegClass, vti.RegClass,
+ SplatPat, GPR>;
+ }
+}
+
+class VPatBinaryVL_VF<SDNode vop,
+ string instruction_name,
+ ValueType result_type,
+ ValueType vop_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg RetClass,
+ VReg vop_reg_class,
+ RegisterClass scalar_reg_class> :
+ Pat<(result_type (vop (vop_type vop_reg_class:$rs1),
+ (vop_type (SplatFPOp scalar_reg_class:$rs2)),
+ (mask_type true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_"#vlmul.MX)
+ vop_reg_class:$rs1,
+ scalar_reg_class:$rs2,
+ GPR:$vl, sew)>;
+
+multiclass VPatBinaryFPVL_VV_VF<SDNode vop, string instruction_name> {
+ foreach vti = AllFloatVectors in {
+ defm : VPatBinaryVL_VV<vop, instruction_name,
+ vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
+ vti.LMul, vti.RegClass, vti.RegClass>;
+ def : VPatBinaryVL_VF<vop, instruction_name#"_V"#vti.ScalarSuffix,
+ vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
+ vti.LMul, vti.RegClass, vti.RegClass,
+ vti.ScalarRegClass>;
+ }
+}
+
+multiclass VPatBinaryFPVL_R_VF<SDNode vop, string instruction_name> {
+ foreach fvti = AllFloatVectors in
+ def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2),
+ fvti.RegClass:$rs1,
+ (fvti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
+ GPR:$vl, fvti.Log2SEW)>;
+}
+
+multiclass VPatIntegerSetCCVL_VV<VTypeInfo vti, string instruction_name,
+ CondCode cc> {
+ def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
+ vti.RegClass:$rs2, cc,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX)
+ vti.RegClass:$rs1, vti.RegClass:$rs2, GPR:$vl,
+ vti.Log2SEW)>;
+}
+
+// Inherits from VPatIntegerSetCCVL_VV and adds a pattern with operands swapped.
+multiclass VPatIntegerSetCCVL_VV_Swappable<VTypeInfo vti, string instruction_name,
+ CondCode cc, CondCode invcc> :
+ VPatIntegerSetCCVL_VV<vti, instruction_name, cc> {
+ def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs2),
+ vti.RegClass:$rs1, invcc,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX)
+ vti.RegClass:$rs1, vti.RegClass:$rs2, GPR:$vl,
+ vti.Log2SEW)>;
+}
+
+multiclass VPatIntegerSetCCVL_VX_Swappable<VTypeInfo vti, string instruction_name,
+ CondCode cc, CondCode invcc> {
+ defvar instruction = !cast<Instruction>(instruction_name#"_VX_"#vti.LMul.MX);
+ def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
+ (SplatPat (XLenVT GPR:$rs2)), cc,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (instruction vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat (XLenVT GPR:$rs2)),
+ (vti.Vector vti.RegClass:$rs1), invcc,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (instruction vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>;
+}
+
+multiclass VPatIntegerSetCCVL_VI_Swappable<VTypeInfo vti, string instruction_name,
+ CondCode cc, CondCode invcc> {
+ defvar instruction = !cast<Instruction>(instruction_name#"_VI_"#vti.LMul.MX);
+ def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
+ (SplatPat_simm5 simm5:$rs2), cc,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (instruction vti.RegClass:$rs1, XLenVT:$rs2, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat_simm5 simm5:$rs2),
+ (vti.Vector vti.RegClass:$rs1), invcc,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (instruction vti.RegClass:$rs1, simm5:$rs2, GPR:$vl, vti.Log2SEW)>;
+}
+
+multiclass VPatIntegerSetCCVL_VIPlus1<VTypeInfo vti, string instruction_name,
+ CondCode cc, ComplexPattern splatpat_kind> {
+ defvar instruction = !cast<Instruction>(instruction_name#"_VI_"#vti.LMul.MX);
+ def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
+ (splatpat_kind simm5:$rs2), cc,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (instruction vti.RegClass:$rs1, (DecImm simm5:$rs2),
+ GPR:$vl, vti.Log2SEW)>;
+}
+
+multiclass VPatFPSetCCVL_VV_VF_FV<CondCode cc,
+ string inst_name,
+ string swapped_op_inst_name> {
+ foreach fvti = AllFloatVectors in {
+ def : Pat<(fvti.Mask (riscv_setcc_vl (fvti.Vector fvti.RegClass:$rs1),
+ fvti.RegClass:$rs2,
+ cc,
+ (fvti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(inst_name#"_VV_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1, fvti.RegClass:$rs2, GPR:$vl, fvti.Log2SEW)>;
+ def : Pat<(fvti.Mask (riscv_setcc_vl (fvti.Vector fvti.RegClass:$rs1),
+ (SplatFPOp fvti.ScalarRegClass:$rs2),
+ cc,
+ (fvti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
+ GPR:$vl, fvti.Log2SEW)>;
+ def : Pat<(fvti.Mask (riscv_setcc_vl (SplatFPOp fvti.ScalarRegClass:$rs2),
+ (fvti.Vector fvti.RegClass:$rs1),
+ cc,
+ (fvti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(swapped_op_inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
+ GPR:$vl, fvti.Log2SEW)>;
+ }
+}
+
+multiclass VPatExtendSDNode_V_VL<SDNode vop, string inst_name, string suffix,
+ list <VTypeInfoToFraction> fraction_list> {
+ foreach vtiTofti = fraction_list in {
+ defvar vti = vtiTofti.Vti;
+ defvar fti = vtiTofti.Fti;
+ def : Pat<(vti.Vector (vop (fti.Vector fti.RegClass:$rs2),
+ true_mask, VLOpFrag)),
+ (!cast<Instruction>(inst_name#"_"#suffix#"_"#vti.LMul.MX)
+ fti.RegClass:$rs2, GPR:$vl, vti.Log2SEW)>;
+ }
+}
+
+multiclass VPatConvertFP2ISDNode_V_VL<SDNode vop, string instruction_name> {
+ foreach fvti = AllFloatVectors in {
+ defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+ def : Pat<(ivti.Vector (vop (fvti.Vector fvti.RegClass:$rs1),
+ (fvti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_"#ivti.LMul.MX)
+ fvti.RegClass:$rs1, GPR:$vl, ivti.Log2SEW)>;
+ }
+}
+
+multiclass VPatConvertI2FPSDNode_V_VL<SDNode vop, string instruction_name> {
+ foreach fvti = AllFloatVectors in {
+ defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+ def : Pat<(fvti.Vector (vop (ivti.Vector ivti.RegClass:$rs1),
+ (ivti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX)
+ ivti.RegClass:$rs1, GPR:$vl, fvti.Log2SEW)>;
+ }
+}
+
+multiclass VPatWConvertFP2ISDNode_V_VL<SDNode vop, string instruction_name> {
+ foreach fvtiToFWti = AllWidenableFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar iwti = GetIntVTypeInfo<fvtiToFWti.Wti>.Vti;
+ def : Pat<(iwti.Vector (vop (fvti.Vector fvti.RegClass:$rs1),
+ (fvti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1, GPR:$vl, fvti.Log2SEW)>;
+ }
+}
+
+multiclass VPatWConvertI2FPSDNode_V_VL<SDNode vop, string instruction_name> {
+ foreach vtiToWti = AllWidenableIntToFloatVectors in {
+ defvar ivti = vtiToWti.Vti;
+ defvar fwti = vtiToWti.Wti;
+ def : Pat<(fwti.Vector (vop (ivti.Vector ivti.RegClass:$rs1),
+ (ivti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_"#ivti.LMul.MX)
+ ivti.RegClass:$rs1, GPR:$vl, ivti.Log2SEW)>;
+ }
+}
+
+multiclass VPatNConvertFP2ISDNode_V_VL<SDNode vop, string instruction_name> {
+ foreach vtiToWti = AllWidenableIntToFloatVectors in {
+ defvar vti = vtiToWti.Vti;
+ defvar fwti = vtiToWti.Wti;
+ def : Pat<(vti.Vector (vop (fwti.Vector fwti.RegClass:$rs1),
+ (fwti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_"#vti.LMul.MX)
+ fwti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>;
+ }
+}
+
+multiclass VPatNConvertI2FPSDNode_V_VL<SDNode vop, string instruction_name> {
+ foreach fvtiToFWti = AllWidenableFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar iwti = GetIntVTypeInfo<fvtiToFWti.Wti>.Vti;
+ def : Pat<(fvti.Vector (vop (iwti.Vector iwti.RegClass:$rs1),
+ (iwti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX)
+ iwti.RegClass:$rs1, GPR:$vl, fvti.Log2SEW)>;
+ }
+}
+
+multiclass VPatReductionVL<SDNode vop, string instruction_name, bit is_float> {
+ foreach vti = !if(is_float, AllFloatVectors, AllIntegerVectors) in {
+ defvar vti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # vti.SEW # "M1");
+ def: Pat<(vti_m1.Vector (vop (vti.Vector vti.RegClass:$rs1), VR:$rs2,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX)
+ (vti_m1.Vector (IMPLICIT_DEF)),
+ (vti.Vector vti.RegClass:$rs1),
+ (vti_m1.Vector VR:$rs2),
+ GPR:$vl, vti.Log2SEW)>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Patterns.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtV] in {
+
+// 7.4. Vector Unit-Stride Instructions
+foreach vti = AllVectors in {
+ defvar load_instr = !cast<Instruction>("PseudoVLE"#vti.SEW#"_V_"#vti.LMul.MX);
+ defvar store_instr = !cast<Instruction>("PseudoVSE"#vti.SEW#"_V_"#vti.LMul.MX);
+ // Load
+ def : Pat<(vti.Vector (riscv_vle_vl BaseAddr:$rs1, VLOpFrag)),
+ (load_instr BaseAddr:$rs1, GPR:$vl, vti.Log2SEW)>;
+ // Store
+ def : Pat<(riscv_vse_vl (vti.Vector vti.RegClass:$rs2), BaseAddr:$rs1,
+ VLOpFrag),
+ (store_instr vti.RegClass:$rs2, BaseAddr:$rs1, GPR:$vl, vti.Log2SEW)>;
+}
+
+foreach mti = AllMasks in {
+ defvar load_instr = !cast<Instruction>("PseudoVLE1_V_"#mti.BX);
+ defvar store_instr = !cast<Instruction>("PseudoVSE1_V_"#mti.BX);
+ def : Pat<(mti.Mask (riscv_vle_vl BaseAddr:$rs1, VLOpFrag)),
+ (load_instr BaseAddr:$rs1, GPR:$vl, mti.Log2SEW)>;
+ def : Pat<(riscv_vse_vl (mti.Mask VR:$rs2), BaseAddr:$rs1,
+ VLOpFrag),
+ (store_instr VR:$rs2, BaseAddr:$rs1, GPR:$vl, mti.Log2SEW)>;
+}
+
+// 12.1. Vector Single-Width Integer Add and Subtract
+defm : VPatBinaryVL_VV_VX_VI<riscv_add_vl, "PseudoVADD">;
+defm : VPatBinaryVL_VV_VX<riscv_sub_vl, "PseudoVSUB">;
+// Handle VRSUB specially since it's the only integer binary op with reversed
+// pattern operands
+foreach vti = AllIntegerVectors in {
+ def : Pat<(riscv_sub_vl (vti.Vector (SplatPat (XLenVT GPR:$rs2))),
+ (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask),
+ VLOpFrag),
+ (!cast<Instruction>("PseudoVRSUB_VX_"# vti.LMul.MX)
+ vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(riscv_sub_vl (vti.Vector (SplatPat (XLenVT GPR:$rs2))),
+ (vti.Vector vti.RegClass:$rs1), (vti.Mask VMV0:$vm),
+ VLOpFrag),
+ (!cast<Instruction>("PseudoVRSUB_VX_"# vti.LMul.MX#"_MASK")
+ (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, GPR:$rs2,
+ VMV0:$vm, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(riscv_sub_vl (vti.Vector (SplatPat_simm5 simm5:$rs2)),
+ (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask),
+ VLOpFrag),
+ (!cast<Instruction>("PseudoVRSUB_VI_"# vti.LMul.MX)
+ vti.RegClass:$rs1, simm5:$rs2, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(riscv_sub_vl (vti.Vector (SplatPat_simm5 simm5:$rs2)),
+ (vti.Vector vti.RegClass:$rs1), (vti.Mask VMV0:$vm),
+ VLOpFrag),
+ (!cast<Instruction>("PseudoVRSUB_VI_"# vti.LMul.MX#"_MASK")
+ (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, simm5:$rs2,
+ VMV0:$vm, GPR:$vl, vti.Log2SEW)>;
+}
+
+// 12.3. Vector Integer Extension
+defm : VPatExtendSDNode_V_VL<riscv_zext_vl, "PseudoVZEXT", "VF2",
+ AllFractionableVF2IntVectors>;
+defm : VPatExtendSDNode_V_VL<riscv_sext_vl, "PseudoVSEXT", "VF2",
+ AllFractionableVF2IntVectors>;
+defm : VPatExtendSDNode_V_VL<riscv_zext_vl, "PseudoVZEXT", "VF4",
+ AllFractionableVF4IntVectors>;
+defm : VPatExtendSDNode_V_VL<riscv_sext_vl, "PseudoVSEXT", "VF4",
+ AllFractionableVF4IntVectors>;
+defm : VPatExtendSDNode_V_VL<riscv_zext_vl, "PseudoVZEXT", "VF8",
+ AllFractionableVF8IntVectors>;
+defm : VPatExtendSDNode_V_VL<riscv_sext_vl, "PseudoVSEXT", "VF8",
+ AllFractionableVF8IntVectors>;
+
+// 12.5. Vector Bitwise Logical Instructions
+defm : VPatBinaryVL_VV_VX_VI<riscv_and_vl, "PseudoVAND">;
+defm : VPatBinaryVL_VV_VX_VI<riscv_or_vl, "PseudoVOR">;
+defm : VPatBinaryVL_VV_VX_VI<riscv_xor_vl, "PseudoVXOR">;
+
+// 12.6. Vector Single-Width Bit Shift Instructions
+defm : VPatBinaryVL_VV_VX_VI<riscv_shl_vl, "PseudoVSLL", uimm5>;
+defm : VPatBinaryVL_VV_VX_VI<riscv_srl_vl, "PseudoVSRL", uimm5>;
+defm : VPatBinaryVL_VV_VX_VI<riscv_sra_vl, "PseudoVSRA", uimm5>;
+
+foreach vti = AllIntegerVectors in {
+ // Emit shift by 1 as an add since it might be faster.
+ def : Pat<(riscv_shl_vl (vti.Vector vti.RegClass:$rs1),
+ (riscv_vmv_v_x_vl 1, (XLenVT srcvalue)),
+ (vti.Mask true_mask),
+ VLOpFrag),
+ (!cast<Instruction>("PseudoVADD_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs1, vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>;
+}
+
+// 12.7. Vector Narrowing Integer Right Shift Instructions
+foreach vtiTowti = AllWidenableIntVectors in {
+ defvar vti = vtiTowti.Vti;
+ defvar wti = vtiTowti.Wti;
+ def : Pat<(vti.Vector (riscv_trunc_vector_vl (wti.Vector wti.RegClass:$rs1),
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVNSRL_WI_"#vti.LMul.MX)
+ wti.RegClass:$rs1, 0, GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector
+ (riscv_trunc_vector_vl
+ (wti.Vector
+ (riscv_sra_vl wti.RegClass:$rs1, (SplatPat XLenVT:$rs2),
+ true_mask, VLOpFrag)), true_mask, VLOpFrag)),
+ (!cast<Instruction>("PseudoVNSRA_WX_"#vti.LMul.MX)
+ wti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector
+ (riscv_trunc_vector_vl
+ (wti.Vector
+ (riscv_sra_vl wti.RegClass:$rs1, (SplatPat_uimm5 uimm5:$rs2),
+ true_mask, VLOpFrag)), true_mask, VLOpFrag)),
+ (!cast<Instruction>("PseudoVNSRA_WI_"#vti.LMul.MX)
+ wti.RegClass:$rs1, uimm5:$rs2, GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector
+ (riscv_trunc_vector_vl
+ (wti.Vector
+ (riscv_srl_vl wti.RegClass:$rs1, (SplatPat XLenVT:$rs2),
+ true_mask, VLOpFrag)), true_mask, VLOpFrag)),
+ (!cast<Instruction>("PseudoVNSRL_WX_"#vti.LMul.MX)
+ wti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector
+ (riscv_trunc_vector_vl
+ (wti.Vector
+ (riscv_srl_vl wti.RegClass:$rs1, (SplatPat_uimm5 uimm5:$rs2),
+ true_mask, VLOpFrag)), true_mask, VLOpFrag)),
+ (!cast<Instruction>("PseudoVNSRL_WI_"#vti.LMul.MX)
+ wti.RegClass:$rs1, uimm5:$rs2, GPR:$vl, vti.Log2SEW)>;
+}
+
+// 12.8. Vector Integer Comparison Instructions
+foreach vti = AllIntegerVectors in {
+ defm : VPatIntegerSetCCVL_VV<vti, "PseudoVMSEQ", SETEQ>;
+ defm : VPatIntegerSetCCVL_VV<vti, "PseudoVMSNE", SETNE>;
+
+ defm : VPatIntegerSetCCVL_VV_Swappable<vti, "PseudoVMSLT", SETLT, SETGT>;
+ defm : VPatIntegerSetCCVL_VV_Swappable<vti, "PseudoVMSLTU", SETULT, SETUGT>;
+ defm : VPatIntegerSetCCVL_VV_Swappable<vti, "PseudoVMSLE", SETLE, SETGE>;
+ defm : VPatIntegerSetCCVL_VV_Swappable<vti, "PseudoVMSLEU", SETULE, SETUGE>;
+
+ defm : VPatIntegerSetCCVL_VX_Swappable<vti, "PseudoVMSEQ", SETEQ, SETEQ>;
+ defm : VPatIntegerSetCCVL_VX_Swappable<vti, "PseudoVMSNE", SETNE, SETNE>;
+ defm : VPatIntegerSetCCVL_VX_Swappable<vti, "PseudoVMSLT", SETLT, SETGT>;
+ defm : VPatIntegerSetCCVL_VX_Swappable<vti, "PseudoVMSLTU", SETULT, SETUGT>;
+ defm : VPatIntegerSetCCVL_VX_Swappable<vti, "PseudoVMSLE", SETLE, SETGE>;
+ defm : VPatIntegerSetCCVL_VX_Swappable<vti, "PseudoVMSLEU", SETULE, SETUGE>;
+ defm : VPatIntegerSetCCVL_VX_Swappable<vti, "PseudoVMSGT", SETGT, SETLT>;
+ defm : VPatIntegerSetCCVL_VX_Swappable<vti, "PseudoVMSGTU", SETUGT, SETULT>;
+ // There is no VMSGE(U)_VX instruction
+
+ defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSEQ", SETEQ, SETEQ>;
+ defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSNE", SETNE, SETNE>;
+ defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSLE", SETLE, SETGE>;
+ defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSLEU", SETULE, SETUGE>;
+
+ defm : VPatIntegerSetCCVL_VIPlus1<vti, "PseudoVMSLE", SETLT,
+ SplatPat_simm5_plus1>;
+ defm : VPatIntegerSetCCVL_VIPlus1<vti, "PseudoVMSLEU", SETULT,
+ SplatPat_simm5_plus1_nonzero>;
+ defm : VPatIntegerSetCCVL_VIPlus1<vti, "PseudoVMSGT", SETGE,
+ SplatPat_simm5_plus1>;
+ defm : VPatIntegerSetCCVL_VIPlus1<vti, "PseudoVMSGTU", SETUGE,
+ SplatPat_simm5_plus1_nonzero>;
+} // foreach vti = AllIntegerVectors
+
+// 12.9. Vector Integer Min/Max Instructions
+defm : VPatBinaryVL_VV_VX<riscv_umin_vl, "PseudoVMINU">;
+defm : VPatBinaryVL_VV_VX<riscv_smin_vl, "PseudoVMIN">;
+defm : VPatBinaryVL_VV_VX<riscv_umax_vl, "PseudoVMAXU">;
+defm : VPatBinaryVL_VV_VX<riscv_smax_vl, "PseudoVMAX">;
+
+// 12.10. Vector Single-Width Integer Multiply Instructions
+defm : VPatBinaryVL_VV_VX<riscv_mul_vl, "PseudoVMUL">;
+defm : VPatBinaryVL_VV_VX<riscv_mulhs_vl, "PseudoVMULH">;
+defm : VPatBinaryVL_VV_VX<riscv_mulhu_vl, "PseudoVMULHU">;
+
+// 12.11. Vector Integer Divide Instructions
+defm : VPatBinaryVL_VV_VX<riscv_udiv_vl, "PseudoVDIVU">;
+defm : VPatBinaryVL_VV_VX<riscv_sdiv_vl, "PseudoVDIV">;
+defm : VPatBinaryVL_VV_VX<riscv_urem_vl, "PseudoVREMU">;
+defm : VPatBinaryVL_VV_VX<riscv_srem_vl, "PseudoVREM">;
+
+// 12.12. Vector Widening Integer Multiply Instructions
+defm : VPatBinaryWVL_VV_VX<riscv_vwmul_vl, "PseudoVWMUL">;
+defm : VPatBinaryWVL_VV_VX<riscv_vwmulu_vl, "PseudoVWMULU">;
+
+// 12.13 Vector Single-Width Integer Multiply-Add Instructions
+foreach vti = AllIntegerVectors in {
+ // NOTE: We choose VMADD because it has the most commuting freedom. So it
+ // works best with how TwoAddressInstructionPass tries commuting.
+ defvar suffix = vti.LMul.MX # "_COMMUTABLE";
+ def : Pat<(vti.Vector
+ (riscv_add_vl vti.RegClass:$rs2,
+ (riscv_mul_vl_oneuse vti.RegClass:$rs1,
+ vti.RegClass:$rd,
+ (vti.Mask true_mask), VLOpFrag),
+ (vti.Mask true_mask), VLOpFrag)),
+ (!cast<Instruction>("PseudoVMADD_VV_"# suffix)
+ vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector
+ (riscv_sub_vl vti.RegClass:$rs2,
+ (riscv_mul_vl_oneuse vti.RegClass:$rs1,
+ vti.RegClass:$rd,
+ (vti.Mask true_mask), VLOpFrag),
+ (vti.Mask true_mask), VLOpFrag)),
+ (!cast<Instruction>("PseudoVNMSUB_VV_"# suffix)
+ vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+
+  // The choice of VMADD here is arbitrary; vmadd.vx and vmacc.vx are equally
+  // commutable.
+ def : Pat<(vti.Vector
+ (riscv_add_vl vti.RegClass:$rs2,
+ (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1),
+ vti.RegClass:$rd,
+ (vti.Mask true_mask), VLOpFrag),
+ (vti.Mask true_mask), VLOpFrag)),
+ (!cast<Instruction>("PseudoVMADD_VX_" # suffix)
+ vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector
+ (riscv_sub_vl vti.RegClass:$rs2,
+ (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1),
+ vti.RegClass:$rd,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ (vti.Mask true_mask), VLOpFrag)),
+ (!cast<Instruction>("PseudoVNMSUB_VX_" # suffix)
+ vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+}
+
+// 12.14. Vector Widening Integer Multiply-Add Instructions
+foreach vtiTowti = AllWidenableIntVectors in {
+ defvar vti = vtiTowti.Vti;
+ defvar wti = vtiTowti.Wti;
+ def : Pat<(wti.Vector
+ (riscv_add_vl wti.RegClass:$rd,
+ (riscv_vwmul_vl_oneuse vti.RegClass:$rs1,
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask true_mask), VLOpFrag),
+ (vti.Mask true_mask), VLOpFrag)),
+ (!cast<Instruction>("PseudoVWMACC_VV_" # vti.LMul.MX # "_TA")
+ wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(wti.Vector
+ (riscv_add_vl wti.RegClass:$rd,
+ (riscv_vwmulu_vl_oneuse vti.RegClass:$rs1,
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask true_mask), VLOpFrag),
+ (vti.Mask true_mask), VLOpFrag)),
+ (!cast<Instruction>("PseudoVWMACCU_VV_" # vti.LMul.MX # "_TA")
+ wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(wti.Vector
+ (riscv_add_vl wti.RegClass:$rd,
+ (riscv_vwmul_vl_oneuse (SplatPat XLenVT:$rs1),
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask true_mask), VLOpFrag),
+ (vti.Mask true_mask), VLOpFrag)),
+ (!cast<Instruction>("PseudoVWMACC_VX_" # vti.LMul.MX # "_TA")
+ wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(wti.Vector
+ (riscv_add_vl wti.RegClass:$rd,
+ (riscv_vwmulu_vl_oneuse (SplatPat XLenVT:$rs1),
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask true_mask), VLOpFrag),
+ (vti.Mask true_mask), VLOpFrag)),
+ (!cast<Instruction>("PseudoVWMACCU_VX_" # vti.LMul.MX # "_TA")
+ wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+}
+
+// 12.15. Vector Integer Merge Instructions
+foreach vti = AllIntegerVectors in {
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ vti.RegClass:$rs1,
+ vti.RegClass:$rs2,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX)
+ vti.RegClass:$rs2, vti.RegClass:$rs1, VMV0:$vm,
+ GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ (SplatPat XLenVT:$rs1),
+ vti.RegClass:$rs2,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX)
+ vti.RegClass:$rs2, GPR:$rs1, VMV0:$vm, GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ (SplatPat_simm5 simm5:$rs1),
+ vti.RegClass:$rs2,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX)
+ vti.RegClass:$rs2, simm5:$rs1, VMV0:$vm, GPR:$vl, vti.Log2SEW)>;
+}
+
+// 12.16. Vector Integer Move Instructions
+foreach vti = AllIntegerVectors in {
+ def : Pat<(vti.Vector (riscv_vmv_v_x_vl GPR:$rs2, VLOpFrag)),
+ (!cast<Instruction>("PseudoVMV_V_X_"#vti.LMul.MX)
+ $rs2, GPR:$vl, vti.Log2SEW)>;
+ defvar ImmPat = !cast<ComplexPattern>("sew"#vti.SEW#"simm5");
+ def : Pat<(vti.Vector (riscv_vmv_v_x_vl (ImmPat XLenVT:$imm5),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMV_V_I_"#vti.LMul.MX)
+ XLenVT:$imm5, GPR:$vl, vti.Log2SEW)>;
+}
+
+// 13.1. Vector Single-Width Saturating Add and Subtract
+defm : VPatBinaryVL_VV_VX_VI<riscv_saddsat_vl, "PseudoVSADD">;
+defm : VPatBinaryVL_VV_VX_VI<riscv_uaddsat_vl, "PseudoVSADDU">;
+defm : VPatBinaryVL_VV_VX<riscv_ssubsat_vl, "PseudoVSSUB">;
+defm : VPatBinaryVL_VV_VX<riscv_usubsat_vl, "PseudoVSSUBU">;
+
+} // Predicates = [HasStdExtV]
+
+// 15.1. Vector Single-Width Integer Reduction Instructions
+let Predicates = [HasStdExtV] in {
+defm : VPatReductionVL<rvv_vecreduce_ADD_vl, "PseudoVREDSUM", /*is_float*/0>;
+defm : VPatReductionVL<rvv_vecreduce_UMAX_vl, "PseudoVREDMAXU", /*is_float*/0>;
+defm : VPatReductionVL<rvv_vecreduce_SMAX_vl, "PseudoVREDMAX", /*is_float*/0>;
+defm : VPatReductionVL<rvv_vecreduce_UMIN_vl, "PseudoVREDMINU", /*is_float*/0>;
+defm : VPatReductionVL<rvv_vecreduce_SMIN_vl, "PseudoVREDMIN", /*is_float*/0>;
+defm : VPatReductionVL<rvv_vecreduce_AND_vl, "PseudoVREDAND", /*is_float*/0>;
+defm : VPatReductionVL<rvv_vecreduce_OR_vl, "PseudoVREDOR", /*is_float*/0>;
+defm : VPatReductionVL<rvv_vecreduce_XOR_vl, "PseudoVREDXOR", /*is_float*/0>;
+} // Predicates = [HasStdExtV]
+
+// 15.3. Vector Single-Width Floating-Point Reduction Instructions
+let Predicates = [HasStdExtV, HasStdExtF] in {
+defm : VPatReductionVL<rvv_vecreduce_SEQ_FADD_vl, "PseudoVFREDOSUM", /*is_float*/1>;
+defm : VPatReductionVL<rvv_vecreduce_FADD_vl, "PseudoVFREDSUM", /*is_float*/1>;
+defm : VPatReductionVL<rvv_vecreduce_FMIN_vl, "PseudoVFREDMIN", /*is_float*/1>;
+defm : VPatReductionVL<rvv_vecreduce_FMAX_vl, "PseudoVFREDMAX", /*is_float*/1>;
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+
+// 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions
+defm : VPatBinaryFPVL_VV_VF<riscv_fadd_vl, "PseudoVFADD">;
+defm : VPatBinaryFPVL_VV_VF<riscv_fsub_vl, "PseudoVFSUB">;
+defm : VPatBinaryFPVL_R_VF<riscv_fsub_vl, "PseudoVFRSUB">;
+
+// 14.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
+defm : VPatBinaryFPVL_VV_VF<riscv_fmul_vl, "PseudoVFMUL">;
+defm : VPatBinaryFPVL_VV_VF<riscv_fdiv_vl, "PseudoVFDIV">;
+defm : VPatBinaryFPVL_R_VF<riscv_fdiv_vl, "PseudoVFRDIV">;
+
+// 14.6 Vector Single-Width Floating-Point Fused Multiply-Add Instructions.
+foreach vti = AllFloatVectors in {
+ // NOTE: We choose VFMADD because it has the most commuting freedom. So it
+ // works best with how TwoAddressInstructionPass tries commuting.
+ defvar suffix = vti.LMul.MX # "_COMMUTABLE";
+ def : Pat<(vti.Vector (riscv_fma_vl vti.RegClass:$rs1, vti.RegClass:$rd,
+ vti.RegClass:$rs2, (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFMADD_VV_"# suffix)
+ vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_fma_vl vti.RegClass:$rs1, vti.RegClass:$rd,
+ (riscv_fneg_vl vti.RegClass:$rs2,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFMSUB_VV_"# suffix)
+ vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_fma_vl (riscv_fneg_vl vti.RegClass:$rs1,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ vti.RegClass:$rd,
+ (riscv_fneg_vl vti.RegClass:$rs2,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNMADD_VV_"# suffix)
+ vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_fma_vl (riscv_fneg_vl vti.RegClass:$rs1,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ vti.RegClass:$rd, vti.RegClass:$rs2,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNMSUB_VV_"# suffix)
+ vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+
+ // The choice of VFMADD here is arbitrary; vfmadd.vf and vfmacc.vf are equally
+ // commutable.
+ def : Pat<(vti.Vector (riscv_fma_vl (SplatFPOp vti.ScalarRegClass:$rs1),
+ vti.RegClass:$rd, vti.RegClass:$rs2,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFMADD_V" # vti.ScalarSuffix # "_" # suffix)
+ vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_fma_vl (SplatFPOp vti.ScalarRegClass:$rs1),
+ vti.RegClass:$rd,
+ (riscv_fneg_vl vti.RegClass:$rs2,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFMSUB_V" # vti.ScalarSuffix # "_" # suffix)
+ vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_fma_vl (SplatFPOp vti.ScalarRegClass:$rs1),
+ (riscv_fneg_vl vti.RegClass:$rd,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ (riscv_fneg_vl vti.RegClass:$rs2,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNMADD_V" # vti.ScalarSuffix # "_" # suffix)
+ vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_fma_vl (SplatFPOp vti.ScalarRegClass:$rs1),
+ (riscv_fneg_vl vti.RegClass:$rd,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ vti.RegClass:$rs2,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNMSUB_V" # vti.ScalarSuffix # "_" # suffix)
+ vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+
+ // The splat might be negated.
+ def : Pat<(vti.Vector (riscv_fma_vl (riscv_fneg_vl (SplatFPOp vti.ScalarRegClass:$rs1),
+ (vti.Mask true_mask),
+ VLOpFrag),
+ vti.RegClass:$rd,
+ (riscv_fneg_vl vti.RegClass:$rs2,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNMADD_V" # vti.ScalarSuffix # "_" # suffix)
+ vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_fma_vl (riscv_fneg_vl (SplatFPOp vti.ScalarRegClass:$rs1),
+ (vti.Mask true_mask),
+ VLOpFrag),
+ vti.RegClass:$rd, vti.RegClass:$rs2,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNMSUB_V" # vti.ScalarSuffix # "_" # suffix)
+ vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+}
+
+// 14.11. Vector Floating-Point MIN/MAX Instructions
+defm : VPatBinaryFPVL_VV_VF<riscv_fminnum_vl, "PseudoVFMIN">;
+defm : VPatBinaryFPVL_VV_VF<riscv_fmaxnum_vl, "PseudoVFMAX">;
+
+// 14.13. Vector Floating-Point Compare Instructions
+defm : VPatFPSetCCVL_VV_VF_FV<SETEQ, "PseudoVMFEQ", "PseudoVMFEQ">;
+defm : VPatFPSetCCVL_VV_VF_FV<SETOEQ, "PseudoVMFEQ", "PseudoVMFEQ">;
+
+defm : VPatFPSetCCVL_VV_VF_FV<SETNE, "PseudoVMFNE", "PseudoVMFNE">;
+defm : VPatFPSetCCVL_VV_VF_FV<SETUNE, "PseudoVMFNE", "PseudoVMFNE">;
+
+defm : VPatFPSetCCVL_VV_VF_FV<SETLT, "PseudoVMFLT", "PseudoVMFGT">;
+defm : VPatFPSetCCVL_VV_VF_FV<SETOLT, "PseudoVMFLT", "PseudoVMFGT">;
+
+defm : VPatFPSetCCVL_VV_VF_FV<SETLE, "PseudoVMFLE", "PseudoVMFGE">;
+defm : VPatFPSetCCVL_VV_VF_FV<SETOLE, "PseudoVMFLE", "PseudoVMFGE">;
+
+foreach vti = AllFloatVectors in {
+ // 14.8. Vector Floating-Point Square-Root Instruction
+ def : Pat<(riscv_fsqrt_vl (vti.Vector vti.RegClass:$rs2), (vti.Mask true_mask),
+ VLOpFrag),
+ (!cast<Instruction>("PseudoVFSQRT_V_"# vti.LMul.MX)
+ vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW)>;
+
+ // 14.12. Vector Floating-Point Sign-Injection Instructions
+ def : Pat<(riscv_fabs_vl (vti.Vector vti.RegClass:$rs), (vti.Mask true_mask),
+ VLOpFrag),
+ (!cast<Instruction>("PseudoVFSGNJX_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs, vti.RegClass:$rs, GPR:$vl, vti.Log2SEW)>;
+ // Handle fneg with VFSGNJN using the same input for both operands.
+ def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask true_mask),
+ VLOpFrag),
+ (!cast<Instruction>("PseudoVFSGNJN_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs, vti.RegClass:$rs, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask true_mask),
+ VLOpFrag),
+ (!cast<Instruction>("PseudoVFSGNJ_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs1, vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1),
+ (riscv_fneg_vl vti.RegClass:$rs2,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ (vti.Mask true_mask),
+ VLOpFrag),
+ (!cast<Instruction>("PseudoVFSGNJN_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs1, vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1),
+ (SplatFPOp vti.ScalarRegClass:$rs2),
+ (vti.Mask true_mask),
+ VLOpFrag),
+ (!cast<Instruction>("PseudoVFSGNJ_V"#vti.ScalarSuffix#"_"# vti.LMul.MX)
+ vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, GPR:$vl, vti.Log2SEW)>;
+}
+
+foreach fvti = AllFloatVectors in {
+ // Floating-point vselects:
+ // 12.15. Vector Integer Merge Instructions
+ // 14.15. Vector Floating-Point Merge Instruction
+ def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask VMV0:$vm),
+ fvti.RegClass:$rs1,
+ fvti.RegClass:$rs2,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
+ fvti.RegClass:$rs2, fvti.RegClass:$rs1, VMV0:$vm,
+ GPR:$vl, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask VMV0:$vm),
+ (SplatFPOp fvti.ScalarRegClass:$rs1),
+ fvti.RegClass:$rs2,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
+ fvti.RegClass:$rs2,
+ (fvti.Scalar fvti.ScalarRegClass:$rs1),
+ VMV0:$vm, GPR:$vl, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask VMV0:$vm),
+ (SplatFPOp (fvti.Scalar fpimm0)),
+ fvti.RegClass:$rs2,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
+ fvti.RegClass:$rs2, 0, VMV0:$vm, GPR:$vl, fvti.Log2SEW)>;
+
+ // 14.16. Vector Floating-Point Move Instruction
+ // If we're splatting fpimm0, use vmv.v.i vd, 0.
+ def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
+ (fvti.Scalar (fpimm0)), VLOpFrag)),
+ (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX)
+ 0, GPR:$vl, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
+ (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)),
+ (!cast<Instruction>("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" #
+ fvti.LMul.MX)
+ (fvti.Scalar fvti.ScalarRegClass:$rs2),
+ GPR:$vl, fvti.Log2SEW)>;
+
+ // 14.17. Vector Single-Width Floating-Point/Integer Type-Convert Instructions
+ defm : VPatConvertFP2ISDNode_V_VL<riscv_fp_to_sint_vl, "PseudoVFCVT_RTZ_X_F_V">;
+ defm : VPatConvertFP2ISDNode_V_VL<riscv_fp_to_uint_vl, "PseudoVFCVT_RTZ_XU_F_V">;
+ defm : VPatConvertI2FPSDNode_V_VL<riscv_sint_to_fp_vl, "PseudoVFCVT_F_X_V">;
+ defm : VPatConvertI2FPSDNode_V_VL<riscv_uint_to_fp_vl, "PseudoVFCVT_F_XU_V">;
+
+ // 14.18. Widening Floating-Point/Integer Type-Convert Instructions
+ defm : VPatWConvertFP2ISDNode_V_VL<riscv_fp_to_sint_vl, "PseudoVFWCVT_RTZ_X_F_V">;
+ defm : VPatWConvertFP2ISDNode_V_VL<riscv_fp_to_uint_vl, "PseudoVFWCVT_RTZ_XU_F_V">;
+ defm : VPatWConvertI2FPSDNode_V_VL<riscv_sint_to_fp_vl, "PseudoVFWCVT_F_X_V">;
+ defm : VPatWConvertI2FPSDNode_V_VL<riscv_uint_to_fp_vl, "PseudoVFWCVT_F_XU_V">;
+ foreach fvtiToFWti = AllWidenableFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ def : Pat<(fwti.Vector (riscv_fpextend_vl (fvti.Vector fvti.RegClass:$rs1),
+ (fvti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFWCVT_F_F_V_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1, GPR:$vl, fvti.Log2SEW)>;
+ }
+
+ // 14.19 Narrowing Floating-Point/Integer Type-Convert Instructions
+ defm : VPatNConvertFP2ISDNode_V_VL<riscv_fp_to_sint_vl, "PseudoVFNCVT_RTZ_X_F_W">;
+ defm : VPatNConvertFP2ISDNode_V_VL<riscv_fp_to_uint_vl, "PseudoVFNCVT_RTZ_XU_F_W">;
+ defm : VPatNConvertI2FPSDNode_V_VL<riscv_sint_to_fp_vl, "PseudoVFNCVT_F_X_W">;
+ defm : VPatNConvertI2FPSDNode_V_VL<riscv_uint_to_fp_vl, "PseudoVFNCVT_F_XU_W">;
+ foreach fvtiToFWti = AllWidenableFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ def : Pat<(fvti.Vector (riscv_fpround_vl (fwti.Vector fwti.RegClass:$rs1),
+ (fwti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX)
+ fwti.RegClass:$rs1, GPR:$vl, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (riscv_fncvt_rod_vl (fwti.Vector fwti.RegClass:$rs1),
+ (fwti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNCVT_ROD_F_F_W_"#fvti.LMul.MX)
+ fwti.RegClass:$rs1, GPR:$vl, fvti.Log2SEW)>;
+ }
+}
+
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+let Predicates = [HasStdExtV] in {
+
+foreach mti = AllMasks in {
+ // 16.1 Vector Mask-Register Logical Instructions
+ def : Pat<(mti.Mask (riscv_vmset_vl VLOpFrag)),
+ (!cast<Instruction>("PseudoVMSET_M_" # mti.BX) GPR:$vl, mti.Log2SEW)>;
+ def : Pat<(mti.Mask (riscv_vmclr_vl VLOpFrag)),
+ (!cast<Instruction>("PseudoVMCLR_M_" # mti.BX) GPR:$vl, mti.Log2SEW)>;
+
+ def : Pat<(mti.Mask (riscv_vmand_vl VR:$rs1, VR:$rs2, VLOpFrag)),
+ (!cast<Instruction>("PseudoVMAND_MM_" # mti.LMul.MX)
+ VR:$rs1, VR:$rs2, GPR:$vl, mti.Log2SEW)>;
+ def : Pat<(mti.Mask (riscv_vmor_vl VR:$rs1, VR:$rs2, VLOpFrag)),
+ (!cast<Instruction>("PseudoVMOR_MM_" # mti.LMul.MX)
+ VR:$rs1, VR:$rs2, GPR:$vl, mti.Log2SEW)>;
+ def : Pat<(mti.Mask (riscv_vmxor_vl VR:$rs1, VR:$rs2, VLOpFrag)),
+ (!cast<Instruction>("PseudoVMXOR_MM_" # mti.LMul.MX)
+ VR:$rs1, VR:$rs2, GPR:$vl, mti.Log2SEW)>;
+
+ def : Pat<(mti.Mask (riscv_vmand_vl VR:$rs1,
+ (riscv_vmnot_vl VR:$rs2, VLOpFrag),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMANDNOT_MM_" # mti.LMul.MX)
+ VR:$rs1, VR:$rs2, GPR:$vl, mti.Log2SEW)>;
+ def : Pat<(mti.Mask (riscv_vmor_vl VR:$rs1,
+ (riscv_vmnot_vl VR:$rs2, VLOpFrag),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMORNOT_MM_" # mti.LMul.MX)
+ VR:$rs1, VR:$rs2, GPR:$vl, mti.Log2SEW)>;
+ // XOR is associative so we need 2 patterns for VMXNOR.
+ def : Pat<(mti.Mask (riscv_vmxor_vl (riscv_vmnot_vl VR:$rs1,
+ VLOpFrag),
+ VR:$rs2, VLOpFrag)),
+ (!cast<Instruction>("PseudoVMXNOR_MM_" # mti.LMul.MX)
+ VR:$rs1, VR:$rs2, GPR:$vl, mti.Log2SEW)>;
+
+ def : Pat<(mti.Mask (riscv_vmnot_vl (riscv_vmand_vl VR:$rs1, VR:$rs2,
+ VLOpFrag),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMNAND_MM_" # mti.LMul.MX)
+ VR:$rs1, VR:$rs2, GPR:$vl, mti.Log2SEW)>;
+ def : Pat<(mti.Mask (riscv_vmnot_vl (riscv_vmor_vl VR:$rs1, VR:$rs2,
+ VLOpFrag),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMNOR_MM_" # mti.LMul.MX)
+ VR:$rs1, VR:$rs2, GPR:$vl, mti.Log2SEW)>;
+ def : Pat<(mti.Mask (riscv_vmnot_vl (riscv_vmxor_vl VR:$rs1, VR:$rs2,
+ VLOpFrag),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMXNOR_MM_" # mti.LMul.MX)
+ VR:$rs1, VR:$rs2, GPR:$vl, mti.Log2SEW)>;
+
+ // Match the not idiom to the vmnot.m pseudo.
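+ // (In the spec, vmnot.m vd, vs is an alias for vmnand.mm vd, vs, vs, which
+ // is why the pattern below feeds the same source into both operands.)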
+ def : Pat<(mti.Mask (riscv_vmnot_vl VR:$rs, VLOpFrag)),
+ (!cast<Instruction>("PseudoVMNAND_MM_" # mti.LMul.MX)
+ VR:$rs, VR:$rs, GPR:$vl, mti.Log2SEW)>;
+
+ // 16.2 Vector Mask Population Count vpopc
+ def : Pat<(XLenVT (riscv_vpopc_vl (mti.Mask VR:$rs2), (mti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVPOPC_M_" # mti.BX)
+ VR:$rs2, GPR:$vl, mti.Log2SEW)>;
+}
+
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV] in {
+// 17.1. Integer Scalar Move Instructions
+// 17.4. Vector Register Gather Instruction
+foreach vti = AllIntegerVectors in {
+ def : Pat<(vti.Vector (riscv_vmv_s_x_vl (vti.Vector vti.RegClass:$merge),
+ vti.ScalarRegClass:$rs1,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMV_S_X_"#vti.LMul.MX)
+ vti.RegClass:$merge,
+ (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_vrgather_vv_vl vti.RegClass:$rs2,
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs2, vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VX_"# vti.LMul.MX)
+ vti.RegClass:$rs2, GPR:$rs1, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, uimm5:$imm,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX)
+ vti.RegClass:$rs2, uimm5:$imm, GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ (riscv_vrgather_vv_vl
+ vti.RegClass:$rs2,
+ vti.RegClass:$rs1,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ vti.RegClass:$merge,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX#"_MASK")
+ vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
+
+ // emul = lmul * 16 / sew
+ defvar vlmul = vti.LMul;
+ defvar octuple_lmul = vlmul.octuple;
+ defvar octuple_emul = !srl(!mul(octuple_lmul, 16), vti.Log2SEW);
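+ // Worked example (illustrative): for LMUL=1, octuple_lmul is 8, so with
+ // SEW=32 we get octuple_emul = (8 * 16) >> 5 = 4, i.e. EMUL=1/2 and the
+ // index operand uses the VI16MF2 type below.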
+ if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
+ defvar emul_str = octuple_to_str<octuple_emul>.ret;
+ defvar ivti = !cast<VTypeInfo>("VI16" # emul_str);
+ defvar inst = "PseudoVRGATHEREI16_VV_" # vti.LMul.MX # "_" # emul_str;
+ def : Pat<(vti.Vector (riscv_vrgatherei16_vv_vl vti.RegClass:$rs2,
+ (ivti.Vector ivti.RegClass:$rs1),
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(inst)
+ vti.RegClass:$rs2, ivti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ (riscv_vrgatherei16_vv_vl
+ vti.RegClass:$rs2,
+ (ivti.Vector ivti.RegClass:$rs1),
+ (vti.Mask true_mask),
+ VLOpFrag),
+ vti.RegClass:$merge,
+ VLOpFrag)),
+ (!cast<Instruction>(inst#"_MASK")
+ vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1,
+ vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
+ }
+}
+
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+
+// 17.2. Floating-Point Scalar Move Instructions
+foreach vti = AllFloatVectors in {
+ def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge),
+ vti.ScalarRegClass:$rs1,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFMV_S_"#vti.ScalarSuffix#"_"#vti.LMul.MX)
+ vti.RegClass:$merge,
+ (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>;
+ defvar ivti = GetIntVTypeInfo<vti>.Vti;
+ def : Pat<(vti.Vector (riscv_vrgather_vv_vl vti.RegClass:$rs2,
+ (ivti.Vector vti.RegClass:$rs1),
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX)
+ vti.RegClass:$rs2, vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VX_"# vti.LMul.MX)
+ vti.RegClass:$rs2, GPR:$rs1, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, uimm5:$imm,
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX)
+ vti.RegClass:$rs2, uimm5:$imm, GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ (riscv_vrgather_vv_vl
+ vti.RegClass:$rs2,
+ (ivti.Vector vti.RegClass:$rs1),
+ (vti.Mask true_mask),
+ VLOpFrag),
+ vti.RegClass:$merge,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX#"_MASK")
+ vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
+
+ defvar vlmul = vti.LMul;
+ defvar octuple_lmul = vlmul.octuple;
+ defvar octuple_emul = !srl(!mul(octuple_lmul, 16), vti.Log2SEW);
+ if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
+ defvar emul_str = octuple_to_str<octuple_emul>.ret;
+ defvar ivti = !cast<VTypeInfo>("VI16" # emul_str);
+ defvar inst = "PseudoVRGATHEREI16_VV_" # vti.LMul.MX # "_" # emul_str;
+ def : Pat<(vti.Vector (riscv_vrgatherei16_vv_vl vti.RegClass:$rs2,
+ (ivti.Vector ivti.RegClass:$rs1),
+ (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>(inst)
+ vti.RegClass:$rs2, ivti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ (riscv_vrgatherei16_vv_vl
+ vti.RegClass:$rs2,
+ (ivti.Vector ivti.RegClass:$rs1),
+ (vti.Mask true_mask),
+ VLOpFrag),
+ vti.RegClass:$merge,
+ VLOpFrag)),
+ (!cast<Instruction>(inst#"_MASK")
+ vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1,
+ vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
+ }
+}
+
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous RISCVISD SDNodes
+//===----------------------------------------------------------------------===//
+
+def riscv_vid_vl : SDNode<"RISCVISD::VID_VL", SDTypeProfile<1, 2,
+ [SDTCisVec<0>, SDTCVecEltisVT<1, i1>,
+ SDTCisSameNumEltsAs<0, 1>, SDTCisVT<2, XLenVT>]>, []>;
+
+def SDTRVVSlide : SDTypeProfile<1, 5, [
+ SDTCisVec<0>, SDTCisSameAs<1, 0>, SDTCisSameAs<2, 0>, SDTCisVT<3, XLenVT>,
+ SDTCVecEltisVT<4, i1>, SDTCisSameNumEltsAs<0, 4>, SDTCisVT<5, XLenVT>
+]>;
+def SDTRVVSlide1 : SDTypeProfile<1, 4, [
+ SDTCisVec<0>, SDTCisSameAs<1, 0>, SDTCisInt<0>, SDTCisVT<2, XLenVT>,
+ SDTCVecEltisVT<3, i1>, SDTCisSameNumEltsAs<0, 3>, SDTCisVT<4, XLenVT>
+]>;
+
+def riscv_slideup_vl : SDNode<"RISCVISD::VSLIDEUP_VL", SDTRVVSlide, []>;
+def riscv_slide1up_vl : SDNode<"RISCVISD::VSLIDE1UP_VL", SDTRVVSlide1, []>;
+def riscv_slidedown_vl : SDNode<"RISCVISD::VSLIDEDOWN_VL", SDTRVVSlide, []>;
+def riscv_slide1down_vl : SDNode<"RISCVISD::VSLIDE1DOWN_VL", SDTRVVSlide1, []>;
+
+let Predicates = [HasStdExtV] in {
+
+foreach vti = AllIntegerVectors in {
+ def : Pat<(vti.Vector (riscv_vid_vl (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVID_V_"#vti.LMul.MX) GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_slide1up_vl (vti.Vector vti.RegClass:$rs1),
+ GPR:$rs2, (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVSLIDE1UP_VX_"#vti.LMul.MX)
+ vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_slide1down_vl (vti.Vector vti.RegClass:$rs1),
+ GPR:$rs2, (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVSLIDE1DOWN_VX_"#vti.LMul.MX)
+ vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>;
+}
+
+foreach vti = !listconcat(AllIntegerVectors, AllFloatVectors) in {
+ def : Pat<(vti.Vector (riscv_slideup_vl (vti.Vector vti.RegClass:$rs3),
+ (vti.Vector vti.RegClass:$rs1),
+ uimm5:$rs2, (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVSLIDEUP_VI_"#vti.LMul.MX)
+ vti.RegClass:$rs3, vti.RegClass:$rs1, uimm5:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_slideup_vl (vti.Vector vti.RegClass:$rs3),
+ (vti.Vector vti.RegClass:$rs1),
+ GPR:$rs2, (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVSLIDEUP_VX_"#vti.LMul.MX)
+ vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_slidedown_vl (vti.Vector vti.RegClass:$rs3),
+ (vti.Vector vti.RegClass:$rs1),
+ uimm5:$rs2, (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVSLIDEDOWN_VI_"#vti.LMul.MX)
+ vti.RegClass:$rs3, vti.RegClass:$rs1, uimm5:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_slidedown_vl (vti.Vector vti.RegClass:$rs3),
+ (vti.Vector vti.RegClass:$rs1),
+ GPR:$rs2, (vti.Mask true_mask),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVSLIDEDOWN_VX_"#vti.LMul.MX)
+ vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2,
+ GPR:$vl, vti.Log2SEW)>;
+}
+
+} // Predicates = [HasStdExtV]
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
index 85ebe05..7316b7a 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
@@ -33,9 +33,9 @@
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class FPFMAH_rrr_frm<RISCVOpcode opcode, string opcodestr>
- : RVInstR4<0b10, opcode, (outs FPR16:$rd),
- (ins FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, frmarg:$funct3),
- opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">;
+ : RVInstR4Frm<0b10, opcode, (outs FPR16:$rd),
+ (ins FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, frmarg:$funct3),
+ opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">;
class FPFMAHDynFrmAlias<FPFMAH_rrr_frm Inst, string OpcodeStr>
: InstAlias<OpcodeStr#" $rd, $rs1, $rs2, $rs3",
@@ -60,7 +60,7 @@
class FPCmpH_rr<bits<3> funct3, string opcodestr>
: RVInstR<0b1010010, funct3, OPC_OP_FP, (outs GPR:$rd),
(ins FPR16:$rs1, FPR16:$rs2), opcodestr, "$rd, $rs1, $rs2">,
- Sched<[]>;
+ Sched<[WriteFCmp16, ReadFCmp16, ReadFCmp16]>;
//===----------------------------------------------------------------------===//
// Instructions
@@ -71,7 +71,7 @@
def FLH : RVInstI<0b001, OPC_LOAD_FP, (outs FPR16:$rd),
(ins GPR:$rs1, simm12:$imm12),
"flh", "$rd, ${imm12}(${rs1})">,
- Sched<[]>;
+ Sched<[WriteFLD16, ReadFMemBase]>;
// Operands for stores are in the order srcreg, base, offset rather than
// reflecting the order these fields are specified in the instruction
@@ -80,94 +80,93 @@
def FSH : RVInstS<0b001, OPC_STORE_FP, (outs),
(ins FPR16:$rs2, GPR:$rs1, simm12:$imm12),
"fsh", "$rs2, ${imm12}(${rs1})">,
- Sched<[]>;
+ Sched<[WriteFST16, ReadStoreData, ReadFMemBase]>;
def FMADD_H : FPFMAH_rrr_frm<OPC_MADD, "fmadd.h">,
- Sched<[]>;
+ Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>;
def : FPFMAHDynFrmAlias<FMADD_H, "fmadd.h">;
def FMSUB_H : FPFMAH_rrr_frm<OPC_MSUB, "fmsub.h">,
- Sched<[]>;
+ Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>;
def : FPFMAHDynFrmAlias<FMSUB_H, "fmsub.h">;
def FNMSUB_H : FPFMAH_rrr_frm<OPC_NMSUB, "fnmsub.h">,
- Sched<[]>;
+ Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>;
def : FPFMAHDynFrmAlias<FNMSUB_H, "fnmsub.h">;
def FNMADD_H : FPFMAH_rrr_frm<OPC_NMADD, "fnmadd.h">,
- Sched<[]>;
+ Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>;
def : FPFMAHDynFrmAlias<FNMADD_H, "fnmadd.h">;
def FADD_H : FPALUH_rr_frm<0b0000010, "fadd.h">,
- Sched<[]>;
+ Sched<[WriteFALU16, ReadFALU16, ReadFALU16]>;
def : FPALUHDynFrmAlias<FADD_H, "fadd.h">;
def FSUB_H : FPALUH_rr_frm<0b0000110, "fsub.h">,
- Sched<[]>;
+ Sched<[WriteFALU16, ReadFALU16, ReadFALU16]>;
def : FPALUHDynFrmAlias<FSUB_H, "fsub.h">;
def FMUL_H : FPALUH_rr_frm<0b0001010, "fmul.h">,
- Sched<[]>;
+ Sched<[WriteFMul16, ReadFMul16, ReadFMul16]>;
def : FPALUHDynFrmAlias<FMUL_H, "fmul.h">;
def FDIV_H : FPALUH_rr_frm<0b0001110, "fdiv.h">,
- Sched<[]>;
+ Sched<[WriteFDiv16, ReadFDiv16, ReadFDiv16]>;
def : FPALUHDynFrmAlias<FDIV_H, "fdiv.h">;
def FSQRT_H : FPUnaryOp_r_frm<0b0101110, FPR16, FPR16, "fsqrt.h">,
- Sched<[]> {
+ Sched<[WriteFSqrt16, ReadFSqrt16]> {
let rs2 = 0b00000;
}
def : FPUnaryOpDynFrmAlias<FSQRT_H, "fsqrt.h", FPR16, FPR16>;
def FSGNJ_H : FPALUH_rr<0b0010010, 0b000, "fsgnj.h">,
- Sched<[]>;
+ Sched<[WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16]>;
def FSGNJN_H : FPALUH_rr<0b0010010, 0b001, "fsgnjn.h">,
- Sched<[]>;
+ Sched<[WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16]>;
def FSGNJX_H : FPALUH_rr<0b0010010, 0b010, "fsgnjx.h">,
- Sched<[]>;
-
+ Sched<[WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16]>;
def FMIN_H : FPALUH_rr<0b0010110, 0b000, "fmin.h">,
- Sched<[]>;
+ Sched<[WriteFMinMax16, ReadFMinMax16, ReadFMinMax16]>;
def FMAX_H : FPALUH_rr<0b0010110, 0b001, "fmax.h">,
- Sched<[]>;
+ Sched<[WriteFMinMax16, ReadFMinMax16, ReadFMinMax16]>;
def FCVT_W_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.w.h">,
- Sched<[]> {
+ Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]> {
let rs2 = 0b00000;
}
def : FPUnaryOpDynFrmAlias<FCVT_W_H, "fcvt.w.h", GPR, FPR16>;
def FCVT_WU_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.wu.h">,
- Sched<[]> {
+ Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]> {
let rs2 = 0b00001;
}
def : FPUnaryOpDynFrmAlias<FCVT_WU_H, "fcvt.wu.h", GPR, FPR16>;
def FCVT_H_W : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.w">,
- Sched<[]> {
+ Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]> {
let rs2 = 0b00000;
}
def : FPUnaryOpDynFrmAlias<FCVT_H_W, "fcvt.h.w", FPR16, GPR>;
def FCVT_H_WU : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.wu">,
- Sched<[]> {
+ Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]> {
let rs2 = 0b00001;
}
def : FPUnaryOpDynFrmAlias<FCVT_H_WU, "fcvt.h.wu", FPR16, GPR>;
def FCVT_H_S : FPUnaryOp_r_frm<0b0100010, FPR16, FPR32, "fcvt.h.s">,
- Sched<[]> {
+ Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]> {
let rs2 = 0b00000;
}
def : FPUnaryOpDynFrmAlias<FCVT_H_S, "fcvt.h.s", FPR16, FPR32>;
def FCVT_S_H : FPUnaryOp_r<0b0100000, 0b000, FPR32, FPR16, "fcvt.s.h">,
- Sched<[]> {
+ Sched<[WriteFCvtF16ToF32, ReadFCvtF16ToF32]> {
let rs2 = 0b00010;
}
def FMV_X_H : FPUnaryOp_r<0b1110010, 0b000, GPR, FPR16, "fmv.x.h">,
- Sched<[]> {
+ Sched<[WriteFMovF16ToI16, ReadFMovF16ToI16]> {
let rs2 = 0b00000;
}
def FMV_H_X : FPUnaryOp_r<0b1111010, 0b000, FPR16, GPR, "fmv.h.x">,
- Sched<[]> {
+ Sched<[WriteFMovI16ToF16, ReadFMovI16ToF16]> {
let rs2 = 0b00000;
}
@@ -176,32 +175,32 @@
def FLE_H : FPCmpH_rr<0b000, "fle.h">;
def FCLASS_H : FPUnaryOp_r<0b1110010, 0b001, GPR, FPR16, "fclass.h">,
- Sched<[]> {
+ Sched<[WriteFClass16, ReadFClass16]> {
let rs2 = 0b00000;
}
} // Predicates = [HasStdExtZfh]
let Predicates = [HasStdExtZfh, IsRV64] in {
def FCVT_L_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.l.h">,
- Sched<[]> {
+ Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]> {
let rs2 = 0b00010;
}
def : FPUnaryOpDynFrmAlias<FCVT_L_H, "fcvt.l.h", GPR, FPR16>;
def FCVT_LU_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.lu.h">,
- Sched<[]> {
+ Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]> {
let rs2 = 0b00011;
}
def : FPUnaryOpDynFrmAlias<FCVT_LU_H, "fcvt.lu.h", GPR, FPR16>;
def FCVT_H_L : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.l">,
- Sched<[]> {
+ Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]> {
let rs2 = 0b00010;
}
def : FPUnaryOpDynFrmAlias<FCVT_H_L, "fcvt.h.l", FPR16, GPR>;
def FCVT_H_LU : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.lu">,
- Sched<[]> {
+ Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]> {
let rs2 = 0b00011;
}
def : FPUnaryOpDynFrmAlias<FCVT_H_LU, "fcvt.h.lu", FPR16, GPR>;
@@ -209,13 +208,13 @@
let Predicates = [HasStdExtZfh, HasStdExtD] in {
def FCVT_H_D : FPUnaryOp_r_frm<0b0100010, FPR16, FPR64, "fcvt.h.d">,
- Sched<[]> {
+ Sched<[WriteFCvtF64ToF16, ReadFCvtF64ToF16]> {
let rs2 = 0b00001;
}
def : FPUnaryOpDynFrmAlias<FCVT_H_D, "fcvt.h.d", FPR16, FPR64>;
def FCVT_D_H : FPUnaryOp_r<0b0100001, 0b000, FPR64, FPR16, "fcvt.d.h">,
- Sched<[]> {
+ Sched<[WriteFCvtF16ToF64, ReadFCvtF16ToF64]> {
let rs2 = 0b00010;
}
} // Predicates = [HasStdExtZfh, HasStdExtD]
@@ -280,10 +279,7 @@
def : Pat<(fcopysign FPR16:$rs1, (fneg FPR16:$rs2)), (FSGNJN_H $rs1, $rs2)>;
def : Pat<(fcopysign FPR16:$rs1, FPR32:$rs2),
(FSGNJ_H $rs1, (FCVT_H_S $rs2, 0b111))>;
-def : Pat<(fcopysign FPR16:$rs1, FPR64:$rs2),
- (FSGNJ_H $rs1, (FCVT_H_D $rs2, 0b111))>;
def : Pat<(fcopysign FPR32:$rs1, FPR16:$rs2), (FSGNJ_S $rs1, (FCVT_S_H $rs2))>;
-def : Pat<(fcopysign FPR64:$rs1, FPR16:$rs2), (FSGNJ_D $rs1, (FCVT_D_H $rs2))>;
// fmadd: rs1 * rs2 + rs3
def : Pat<(fma FPR16:$rs1, FPR16:$rs2, FPR16:$rs3),
@@ -301,6 +297,9 @@
def : Pat<(fma (fneg FPR16:$rs1), FPR16:$rs2, (fneg FPR16:$rs3)),
(FNMADD_H FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>;
+// The ratified 20191213 ISA spec defines fmin and fmax in a way that matches
+// LLVM's fminnum and fmaxnum
+// <https://github.com/riscv/riscv-isa-manual/commit/cd20cee7efd9bac7c5aa127ec3b451749d2b3cce>.
def : PatFpr16Fpr16<fminnum, FMIN_H>;
def : PatFpr16Fpr16<fmaxnum, FMAX_H>;
@@ -317,11 +316,11 @@
/// Loads
-defm : LdPat<load, FLH>;
+defm : LdPat<load, FLH, f16>;
/// Stores
-defm : StPat<store, FSH, FPR16>;
+defm : StPat<store, FSH, FPR16, f16>;
/// Float conversion operations
@@ -335,32 +334,45 @@
} // Predicates = [HasStdExtZfh]
let Predicates = [HasStdExtZfh, IsRV32] in {
-// float->[u]int. Round-to-zero must be used.
-def : Pat<(fp_to_sint FPR16:$rs1), (FCVT_W_H $rs1, 0b001)>;
-def : Pat<(fp_to_uint FPR16:$rs1), (FCVT_WU_H $rs1, 0b001)>;
+// half->[u]int. Round-to-zero must be used.
+def : Pat<(i32 (fp_to_sint FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>;
+def : Pat<(i32 (fp_to_uint FPR16:$rs1)), (FCVT_WU_H $rs1, 0b001)>;
-// [u]int->float. Match GCC and default to using dynamic rounding mode.
-def : Pat<(sint_to_fp GPR:$rs1), (FCVT_H_W $rs1, 0b111)>;
-def : Pat<(uint_to_fp GPR:$rs1), (FCVT_H_WU $rs1, 0b111)>;
+// half->int32 with current rounding mode.
+def : Pat<(i32 (lrint FPR16:$rs1)), (FCVT_W_H $rs1, 0b111)>;
+
+// half->int32 rounded to nearest with ties rounded away from zero.
+def : Pat<(i32 (lround FPR16:$rs1)), (FCVT_W_H $rs1, 0b100)>;
+
+// [u]int->half. Match GCC and default to using dynamic rounding mode.
+def : Pat<(sint_to_fp (i32 GPR:$rs1)), (FCVT_H_W $rs1, 0b111)>;
+def : Pat<(uint_to_fp (i32 GPR:$rs1)), (FCVT_H_WU $rs1, 0b111)>;
} // Predicates = [HasStdExtZfh, IsRV32]
let Predicates = [HasStdExtZfh, IsRV64] in {
-// FP->[u]int32 is mostly handled by the FP->[u]int64 patterns. This is safe
-// because fpto[u|s]i produces poison if the value can't fit into the target.
-// We match the single case below because fcvt.wu.s sign-extends its result so
-// is cheaper than fcvt.lu.h+sext.w.
-def : Pat<(sext_inreg (assertzexti32 (fp_to_uint FPR16:$rs1)), i32),
- (FCVT_WU_H $rs1, 0b001)>;
+// Use target-specific ISD nodes to help us remember the result is sign
+// extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be
+// duplicated if it has another user that didn't need the sign_extend.
+def : Pat<(riscv_fcvt_w_rv64 FPR16:$rs1), (FCVT_W_H $rs1, 0b001)>;
+def : Pat<(riscv_fcvt_wu_rv64 FPR16:$rs1), (FCVT_WU_H $rs1, 0b001)>;
-// FP->[u]int64
-def : Pat<(fp_to_sint FPR16:$rs1), (FCVT_L_H $rs1, 0b001)>;
-def : Pat<(fp_to_uint FPR16:$rs1), (FCVT_LU_H $rs1, 0b001)>;
+// half->[u]int64. Round-to-zero must be used.
+def : Pat<(i64 (fp_to_sint FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>;
+def : Pat<(i64 (fp_to_uint FPR16:$rs1)), (FCVT_LU_H $rs1, 0b001)>;
+
+// half->int64 with current rounding mode.
+def : Pat<(i64 (lrint FPR16:$rs1)), (FCVT_L_H $rs1, 0b111)>;
+def : Pat<(i64 (llrint FPR16:$rs1)), (FCVT_L_H $rs1, 0b111)>;
+
+// half->int64 rounded to nearest with ties rounded away from zero.
+def : Pat<(i64 (lround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>;
+def : Pat<(i64 (llround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>;
// [u]int->fp. Match GCC and default to using dynamic rounding mode.
-def : Pat<(sint_to_fp (sexti32 GPR:$rs1)), (FCVT_H_W $rs1, 0b111)>;
-def : Pat<(uint_to_fp (zexti32 GPR:$rs1)), (FCVT_H_WU $rs1, 0b111)>;
-def : Pat<(sint_to_fp GPR:$rs1), (FCVT_H_L $rs1, 0b111)>;
-def : Pat<(uint_to_fp GPR:$rs1), (FCVT_H_LU $rs1, 0b111)>;
+def : Pat<(sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_H_W $rs1, 0b111)>;
+def : Pat<(uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_H_WU $rs1, 0b111)>;
+def : Pat<(sint_to_fp (i64 GPR:$rs1)), (FCVT_H_L $rs1, 0b111)>;
+def : Pat<(uint_to_fp (i64 GPR:$rs1)), (FCVT_H_LU $rs1, 0b111)>;
} // Predicates = [HasStdExtZfh, IsRV64]
let Predicates = [HasStdExtZfh, HasStdExtD] in {
@@ -368,4 +380,9 @@
// f64 -> f16, f16 -> f64
def : Pat<(fpround FPR64:$rs1), (FCVT_H_D FPR64:$rs1, 0b111)>;
def : Pat<(fpextend FPR16:$rs1), (FCVT_D_H FPR16:$rs1)>;
+
+/// Float arithmetic operations
+def : Pat<(fcopysign FPR16:$rs1, FPR64:$rs2),
+ (FSGNJ_H $rs1, (FCVT_H_D $rs2, 0b111))>;
+def : Pat<(fcopysign FPR64:$rs1, FPR16:$rs2), (FSGNJ_D $rs1, (FCVT_D_H $rs2))>;
}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVLegalizerInfo.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVLegalizerInfo.cpp
index c92f4a3..f6256de 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVLegalizerInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVLegalizerInfo.cpp
@@ -19,5 +19,5 @@
using namespace llvm;
RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) {
- computeTables();
+ getLegacyLegalizerInfo().computeTables();
}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
index 3c38dd1..74d9246 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
@@ -109,7 +109,7 @@
MCOp = lowerSymbolOperand(MO, MO.getMBB()->getSymbol(), AP);
break;
case MachineOperand::MO_GlobalAddress:
- MCOp = lowerSymbolOperand(MO, AP.getSymbol(MO.getGlobal()), AP);
+ MCOp = lowerSymbolOperand(MO, AP.getSymbolPreferLocal(*MO.getGlobal()), AP);
break;
case MachineOperand::MO_BlockAddress:
MCOp = lowerSymbolOperand(
@@ -155,13 +155,13 @@
assert(OpNo >= 0 && "Operand number doesn't fit in an 'int' type");
// Skip VL and SEW operands which are the last two operands if present.
- if ((TSFlags & RISCVII::HasVLOpMask) && OpNo == (NumOps - 2))
+ if (RISCVII::hasVLOp(TSFlags) && OpNo == (NumOps - 2))
continue;
- if ((TSFlags & RISCVII::HasSEWOpMask) && OpNo == (NumOps - 1))
+ if (RISCVII::hasSEWOp(TSFlags) && OpNo == (NumOps - 1))
continue;
// Skip merge op. It should be the first operand after the result.
- if ((TSFlags & RISCVII::HasMergeOpMask) && OpNo == 1) {
+ if (RISCVII::hasMergeOp(TSFlags) && OpNo == 1) {
assert(MI->getNumExplicitDefs() == 1);
continue;
}
@@ -198,16 +198,16 @@
// Unmasked pseudo instructions need to append a dummy mask operand to
// V instructions. All V instructions are modeled as the masked version.
- if (TSFlags & RISCVII::HasDummyMaskOpMask)
+ if (RISCVII::hasDummyMaskOp(TSFlags))
OutMI.addOperand(MCOperand::createReg(RISCV::NoRegister));
return true;
}
-void llvm::LowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
- const AsmPrinter &AP) {
+bool llvm::lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+ AsmPrinter &AP) {
if (lowerRISCVVMachineInstrToMCInst(MI, OutMI))
- return;
+ return false;
OutMI.setOpcode(MI->getOpcode());
@@ -217,19 +217,32 @@
OutMI.addOperand(MCOp);
}
- if (OutMI.getOpcode() == RISCV::PseudoReadVLENB) {
+ switch (OutMI.getOpcode()) {
+ case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
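+ // Illustrative note (not in the original change): a function marked with
+ // the IR attribute "patchable-function-entry"="2" reaches this point and
+ // gets two NOPs emitted in place of the marker instruction.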
+ const Function &F = MI->getParent()->getParent()->getFunction();
+ if (F.hasFnAttribute("patchable-function-entry")) {
+ unsigned Num;
+ if (F.getFnAttribute("patchable-function-entry")
+ .getValueAsString()
+ .getAsInteger(10, Num))
+ return false;
+ AP.emitNops(Num);
+ return true;
+ }
+ break;
+ }
+ case RISCV::PseudoReadVLENB:
OutMI.setOpcode(RISCV::CSRRS);
OutMI.addOperand(MCOperand::createImm(
RISCVSysReg::lookupSysRegByName("VLENB")->Encoding));
OutMI.addOperand(MCOperand::createReg(RISCV::X0));
- return;
- }
-
- if (OutMI.getOpcode() == RISCV::PseudoReadVL) {
+ break;
+ case RISCV::PseudoReadVL:
OutMI.setOpcode(RISCV::CSRRS);
- OutMI.addOperand(MCOperand::createImm(
- RISCVSysReg::lookupSysRegByName("VL")->Encoding));
+ OutMI.addOperand(
+ MCOperand::createImm(RISCVSysReg::lookupSysRegByName("VL")->Encoding));
OutMI.addOperand(MCOperand::createReg(RISCV::X0));
- return;
+ break;
}
+ return false;
}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/src/llvm-project/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
index c379a8d8f..b5609e9 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
@@ -32,6 +32,12 @@
int MoveF64FrameIndex = -1;
/// Size of any opaque stack adjustment due to save/restore libcalls.
unsigned LibCallStackSize = 0;
+ /// Size of RVV stack.
+ uint64_t RVVStackSize = 0;
+ /// Padding required to keep RVV stack aligned within the main stack.
+ uint64_t RVVPadding = 0;
+ /// Size of the stack frame used to save callee-saved registers.
+ unsigned CalleeSavedStackSize = 0;
public:
RISCVMachineFunctionInfo(const MachineFunction &MF) {}
@@ -54,10 +60,20 @@
bool useSaveRestoreLibCalls(const MachineFunction &MF) const {
// We cannot use fixed locations for the callee saved spill slots if the
- // function uses a varargs save area.
+ // function uses a varargs save area, or is an interrupt handler.
return MF.getSubtarget<RISCVSubtarget>().enableSaveRestore() &&
- VarArgsSaveSize == 0 && !MF.getFrameInfo().hasTailCall();
+ VarArgsSaveSize == 0 && !MF.getFrameInfo().hasTailCall() &&
+ !MF.getFunction().hasFnAttribute("interrupt");
}
+
+ uint64_t getRVVStackSize() const { return RVVStackSize; }
+ void setRVVStackSize(uint64_t Size) { RVVStackSize = Size; }
+
+ uint64_t getRVVPadding() const { return RVVPadding; }
+ void setRVVPadding(uint64_t Padding) { RVVPadding = Padding; }
+
+ unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
+ void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
};
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 631077e..388cce0 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -101,6 +101,11 @@
markSuperRegs(Reserved, RISCV::VXSAT);
markSuperRegs(Reserved, RISCV::VXRM);
+ // Floating point environment registers.
+ markSuperRegs(Reserved, RISCV::FRM);
+ markSuperRegs(Reserved, RISCV::FFLAGS);
+ markSuperRegs(Reserved, RISCV::FCSR);
+
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
@@ -164,12 +169,13 @@
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
Register FrameReg;
- int Offset = getFrameLowering(MF)
- ->getFrameIndexReference(MF, FrameIndex, FrameReg)
- .getFixed() +
- MI.getOperand(FIOperandNum + 1).getImm();
+ StackOffset Offset =
+ getFrameLowering(MF)->getFrameIndexReference(MF, FrameIndex, FrameReg);
+ bool IsRVVSpill = TII->isRVVSpill(MI, /*CheckFIs*/ false);
+ if (!IsRVVSpill)
+ Offset += StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm());
- if (!isInt<32>(Offset)) {
+ if (!isInt<32>(Offset.getFixed())) {
report_fatal_error(
"Frame offsets outside of the signed 32-bit range not supported");
}
@@ -177,23 +183,107 @@
MachineBasicBlock &MBB = *MI.getParent();
bool FrameRegIsKill = false;
- if (!isInt<12>(Offset)) {
- assert(isInt<32>(Offset) && "Int32 expected");
+ // If required, pre-compute the scalable factor amount which will be used in
+ // later offset computation. Since this sequence requires up to two scratch
+ // registers -- after which one is made free -- this grants us better
+ // scavenging of scratch registers as only up to two are live at one time,
+ // rather than three.
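+ // For example (illustrative): a scalable offset of -3 x vlenb is handled by
+ // materializing 3 x vlenb into ScalableFactorRegister here and selecting SUB
+ // as the adjustment opcode used below.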
+ Register ScalableFactorRegister;
+ unsigned ScalableAdjOpc = RISCV::ADD;
+ if (Offset.getScalable()) {
+ int64_t ScalableValue = Offset.getScalable();
+ if (ScalableValue < 0) {
+ ScalableValue = -ScalableValue;
+ ScalableAdjOpc = RISCV::SUB;
+ }
+ // 1. Get vlenb and multiply it by the number of vector registers.
+ ScalableFactorRegister =
+ TII->getVLENFactoredAmount(MF, MBB, II, DL, ScalableValue);
+ }
+
+ if (!isInt<12>(Offset.getFixed())) {
// The offset won't fit in an immediate, so use a scratch register instead
// Modify Offset and FrameReg appropriately
Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
- TII->movImm(MBB, II, DL, ScratchReg, Offset);
+ TII->movImm(MBB, II, DL, ScratchReg, Offset.getFixed());
+ if (MI.getOpcode() == RISCV::ADDI && !Offset.getScalable()) {
+ BuildMI(MBB, II, DL, TII->get(RISCV::ADD), MI.getOperand(0).getReg())
+ .addReg(FrameReg)
+ .addReg(ScratchReg, RegState::Kill);
+ MI.eraseFromParent();
+ return;
+ }
BuildMI(MBB, II, DL, TII->get(RISCV::ADD), ScratchReg)
.addReg(FrameReg)
.addReg(ScratchReg, RegState::Kill);
- Offset = 0;
+ Offset = StackOffset::get(0, Offset.getScalable());
FrameReg = ScratchReg;
FrameRegIsKill = true;
}
- MI.getOperand(FIOperandNum)
- .ChangeToRegister(FrameReg, false, false, FrameRegIsKill);
- MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ if (!Offset.getScalable()) {
+ // Offset = (fixed offset, 0)
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(FrameReg, false, false, FrameRegIsKill);
+ if (!IsRVVSpill)
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed());
+ else {
+ if (Offset.getFixed()) {
+ Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ BuildMI(MBB, II, DL, TII->get(RISCV::ADDI), ScratchReg)
+ .addReg(FrameReg, getKillRegState(FrameRegIsKill))
+ .addImm(Offset.getFixed());
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(ScratchReg, false, false, true);
+ }
+ }
+ } else {
+ // Offset = (fixed offset, scalable offset)
+ // Step 1, the scalable offset, has already been computed.
+ assert(ScalableFactorRegister &&
+ "Expected pre-computation of scalable factor in earlier step");
+
+ // 2. Calculate address: FrameReg + result of multiply
+ if (MI.getOpcode() == RISCV::ADDI && !Offset.getFixed()) {
+ BuildMI(MBB, II, DL, TII->get(ScalableAdjOpc), MI.getOperand(0).getReg())
+ .addReg(FrameReg, getKillRegState(FrameRegIsKill))
+ .addReg(ScalableFactorRegister, RegState::Kill);
+ MI.eraseFromParent();
+ return;
+ }
+ Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ BuildMI(MBB, II, DL, TII->get(ScalableAdjOpc), VL)
+ .addReg(FrameReg, getKillRegState(FrameRegIsKill))
+ .addReg(ScalableFactorRegister, RegState::Kill);
+
+ if (IsRVVSpill && Offset.getFixed()) {
+ // Scalable load/store has no immediate argument. We need to add the
+ // fixed part into the load/store base address.
+ BuildMI(MBB, II, DL, TII->get(RISCV::ADDI), VL)
+ .addReg(VL)
+ .addImm(Offset.getFixed());
+ }
+
+ // 3. Replace address register with calculated address register
+ MI.getOperand(FIOperandNum).ChangeToRegister(VL, false, false, true);
+ if (!IsRVVSpill)
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed());
+ }
+
+ auto ZvlssegInfo = TII->isRVVSpillForZvlsseg(MI.getOpcode());
+ if (ZvlssegInfo) {
+ Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VL);
+ uint32_t ShiftAmount = Log2_32(ZvlssegInfo->second);
+ if (ShiftAmount != 0)
+ BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VL)
+ .addReg(VL)
+ .addImm(ShiftAmount);
+ // The last argument of the pseudo spilling opcode for zvlsseg is the length of
+ // one element of zvlsseg types. For example, for vint32m2x2_t, it will be
+ // the length of vint32m2_t.
+ MI.getOperand(FIOperandNum + 1).ChangeToRegister(VL, /*isDef=*/false);
+ }
}
Register RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
@@ -222,3 +312,11 @@
return CSR_ILP32D_LP64D_RegMask;
}
}
+
+const TargetRegisterClass *
+RISCVRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &) const {
+ if (RC == &RISCV::VMV0RegClass)
+ return &RISCV::VRRegClass;
+ return RC;
+}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/src/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index ffbb60a..74a5b83 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -59,6 +59,10 @@
unsigned Kind = 0) const override {
return &RISCV::GPRRegClass;
}
+
+ const TargetRegisterClass *
+ getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &) const override;
};
}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index e1a11fd..fde7520 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -51,20 +51,20 @@
def ABIRegAltName : RegAltNameIndex;
-def sub_vrm1_0 : SubRegIndex<64, -1>;
-def sub_vrm1_1 : SubRegIndex<64, -1>;
-def sub_vrm1_2 : SubRegIndex<64, -1>;
-def sub_vrm1_3 : SubRegIndex<64, -1>;
-def sub_vrm1_4 : SubRegIndex<64, -1>;
-def sub_vrm1_5 : SubRegIndex<64, -1>;
-def sub_vrm1_6 : SubRegIndex<64, -1>;
-def sub_vrm1_7 : SubRegIndex<64, -1>;
-def sub_vrm2_0 : SubRegIndex<128, -1>;
-def sub_vrm2_1 : SubRegIndex<128, -1>;
-def sub_vrm2_2 : SubRegIndex<128, -1>;
-def sub_vrm2_3 : SubRegIndex<128, -1>;
-def sub_vrm4_0 : SubRegIndex<256, -1>;
-def sub_vrm4_1 : SubRegIndex<256, -1>;
+def sub_vrm4_0 : SubRegIndex<256>;
+def sub_vrm4_1 : SubRegIndex<256, 256>;
+def sub_vrm2_0 : SubRegIndex<128>;
+def sub_vrm2_1 : SubRegIndex<128, 128>;
+def sub_vrm2_2 : ComposedSubRegIndex<sub_vrm4_1, sub_vrm2_0>;
+def sub_vrm2_3 : ComposedSubRegIndex<sub_vrm4_1, sub_vrm2_1>;
+def sub_vrm1_0 : SubRegIndex<64>;
+def sub_vrm1_1 : SubRegIndex<64, 64>;
+def sub_vrm1_2 : ComposedSubRegIndex<sub_vrm2_1, sub_vrm1_0>;
+def sub_vrm1_3 : ComposedSubRegIndex<sub_vrm2_1, sub_vrm1_1>;
+def sub_vrm1_4 : ComposedSubRegIndex<sub_vrm2_2, sub_vrm1_0>;
+def sub_vrm1_5 : ComposedSubRegIndex<sub_vrm2_2, sub_vrm1_1>;
+def sub_vrm1_6 : ComposedSubRegIndex<sub_vrm2_3, sub_vrm1_0>;
+def sub_vrm1_7 : ComposedSubRegIndex<sub_vrm2_3, sub_vrm1_1>;
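+// Illustrative example (not part of the original patch): within the LMUL=8
+// group V8M8 (V8-V15), sub_vrm4_1 is V12M4, sub_vrm2_3 is V14M2, and
+// sub_vrm1_7 is V15.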
} // Namespace = "RISCV"
@@ -78,7 +78,7 @@
let RegAltNameIndices = [ABIRegAltName] in {
def X0 : RISCVReg<0, "x0", ["zero"]>, DwarfRegNum<[0]>;
- let CostPerUse = 1 in {
+ let CostPerUse = [1] in {
def X1 : RISCVReg<1, "x1", ["ra"]>, DwarfRegNum<[1]>;
def X2 : RISCVReg<2, "x2", ["sp"]>, DwarfRegNum<[2]>;
def X3 : RISCVReg<3, "x3", ["gp"]>, DwarfRegNum<[3]>;
@@ -95,7 +95,7 @@
def X13 : RISCVReg<13,"x13", ["a3"]>, DwarfRegNum<[13]>;
def X14 : RISCVReg<14,"x14", ["a4"]>, DwarfRegNum<[14]>;
def X15 : RISCVReg<15,"x15", ["a5"]>, DwarfRegNum<[15]>;
- let CostPerUse = 1 in {
+ let CostPerUse = [1] in {
def X16 : RISCVReg<16,"x16", ["a6"]>, DwarfRegNum<[16]>;
def X17 : RISCVReg<17,"x17", ["a7"]>, DwarfRegNum<[17]>;
def X18 : RISCVReg<18,"x18", ["s2"]>, DwarfRegNum<[18]>;
@@ -117,6 +117,9 @@
def XLenVT : ValueTypeByHwMode<[RV32, RV64],
[i32, i64]>;
+def XLenRI : RegInfoByHwMode<
+ [RV32, RV64],
+ [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
// The order of registers represents the preferred allocation sequence.
// Registers are listed in the order caller-save, callee-save, specials.
@@ -128,15 +131,11 @@
(sequence "X%u", 18, 27),
(sequence "X%u", 0, 4)
)> {
- let RegInfos = RegInfoByHwMode<
- [RV32, RV64],
- [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
+ let RegInfos = XLenRI;
}
def GPRX0 : RegisterClass<"RISCV", [XLenVT], 32, (add X0)> {
- let RegInfos = RegInfoByHwMode<
- [RV32, RV64],
- [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
+ let RegInfos = XLenRI;
}
// The order of registers represents the preferred allocation sequence.
@@ -149,9 +148,7 @@
(sequence "X%u", 18, 27),
(sequence "X%u", 1, 4)
)> {
- let RegInfos = RegInfoByHwMode<
- [RV32, RV64],
- [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
+ let RegInfos = XLenRI;
}
def GPRNoX0X2 : RegisterClass<"RISCV", [XLenVT], 32, (add
@@ -162,37 +159,44 @@
(sequence "X%u", 18, 27),
X1, X3, X4
)> {
- let RegInfos = RegInfoByHwMode<
- [RV32, RV64],
- [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
+ let RegInfos = XLenRI;
+}
+
+// Don't use X1 or X5 for JALR since that is a hint to pop the return address
+// stack on some microarchitectures. Also remove the reserved registers X0, X2,
+// X3, and X4 as it reduces the number of register classes that get synthesized
+// by tablegen.
+def GPRJALR : RegisterClass<"RISCV", [XLenVT], 32, (add
+ (sequence "X%u", 10, 17),
+ (sequence "X%u", 6, 7),
+ (sequence "X%u", 28, 31),
+ (sequence "X%u", 8, 9),
+ (sequence "X%u", 18, 27)
+ )> {
+ let RegInfos = XLenRI;
}
def GPRC : RegisterClass<"RISCV", [XLenVT], 32, (add
(sequence "X%u", 10, 15),
(sequence "X%u", 8, 9)
)> {
- let RegInfos = RegInfoByHwMode<
- [RV32, RV64],
- [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
+ let RegInfos = XLenRI;
}
// For indirect tail calls, we can't use callee-saved registers, as they are
// restored to the saved value before the tail call, which would clobber a call
-// address.
+// address. We shouldn't use x5 since that is a hint to pop the return
+// address stack on some microarchitectures.
def GPRTC : RegisterClass<"RISCV", [XLenVT], 32, (add
- (sequence "X%u", 5, 7),
+ (sequence "X%u", 6, 7),
(sequence "X%u", 10, 17),
(sequence "X%u", 28, 31)
)> {
- let RegInfos = RegInfoByHwMode<
- [RV32, RV64],
- [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
+ let RegInfos = XLenRI;
}
def SP : RegisterClass<"RISCV", [XLenVT], 32, (add X2)> {
- let RegInfos = RegInfoByHwMode<
- [RV32, RV64],
- [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
+ let RegInfos = XLenRI;
}
// Floating point registers
@@ -281,8 +285,7 @@
// Vector type mapping to LLVM types.
//
-// Though the V extension allows that VLEN be as small as 8,
-// this approach assumes that VLEN>=64.
+// The V vector extension requires that VLEN >= 128 and <= 65536.
// Additionally, the only supported ELEN values are 32 and 64,
// thus `vscale` can be defined as VLEN/64,
// allowing the same types with either ELEN value.
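+// Worked example (illustrative): with VLEN=128, vscale is 2, so vint32m1_t
+// (nxv2i32) holds 2 x vscale = 4 32-bit elements, i.e. exactly VLEN bits.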
@@ -394,18 +397,24 @@
), [!mul(i, lmul)], [])));
}
-class VRegList<list<dag> LIn, int start, int nf, int lmul> {
+class VRegList<list<dag> LIn, int start, int nf, int lmul, bit NoV0> {
list<dag> L =
!if(!ge(start, nf),
LIn,
!listconcat(
[!dag(add,
- !foreach(i, IndexSet<start, nf, lmul>.R,
- !cast<Register>("V" # i # !cond(!eq(lmul, 2): "M2",
- !eq(lmul, 4): "M4",
- true: ""))),
- !listsplat("", !size(IndexSet<start, nf, lmul>.R)))],
- VRegList<LIn, !add(start, 1), nf, lmul>.L));
+ !foreach(i,
+ !if(NoV0,
+ !tail(IndexSet<start, nf, lmul>.R),
+ [!head(IndexSet<start, nf, lmul>.R)]),
+ !cast<Register>("V" # i # !cond(!eq(lmul, 2): "M2",
+ !eq(lmul, 4): "M4",
+ true: ""))),
+ !listsplat("",
+ !if(NoV0,
+ !size(!tail(IndexSet<start, nf, lmul>.R)),
+ !size([!head(IndexSet<start, nf, lmul>.R)]))))],
+ VRegList<LIn, !add(start, 1), nf, lmul, NoV0>.L));
}
// Vector registers
@@ -453,8 +462,12 @@
foreach m = [1, 2, 4] in {
foreach n = NFList<m>.L in {
- def "VN" # n # "M" # m: RegisterTuples<SubRegSet<[], 0, n, m>.L,
- VRegList<[], 0, n, m>.L>;
+ def "VN" # n # "M" # m # "NoV0": RegisterTuples<
+ SubRegSet<[], 0, n, m>.L,
+ VRegList<[], 0, n, m, 1>.L>;
+ def "VN" # n # "M" # m # "V0" : RegisterTuples<
+ SubRegSet<[], 0, n, m>.L,
+ VRegList<[], 0, n, m, 0>.L>;
}
}
@@ -467,22 +480,22 @@
int Size = !mul(Vlmul, 64);
}
-def VR : VReg<[vint8mf2_t, vint8mf4_t, vint8mf8_t,
+def VR : VReg<[vint8m1_t, vint16m1_t, vint32m1_t, vint64m1_t,
+ vfloat16m1_t, vfloat32m1_t, vfloat64m1_t,
+ vint8mf2_t, vint8mf4_t, vint8mf8_t,
vint16mf2_t, vint16mf4_t, vint32mf2_t,
- vint8m1_t, vint16m1_t, vint32m1_t, vint64m1_t,
- vfloat16mf4_t, vfloat16mf2_t, vfloat16m1_t,
- vfloat32mf2_t, vfloat32m1_t, vfloat64m1_t,
+ vfloat16mf4_t, vfloat16mf2_t, vfloat32mf2_t,
vbool64_t, vbool32_t, vbool16_t, vbool8_t, vbool4_t,
vbool2_t, vbool1_t],
(add (sequence "V%u", 25, 31),
(sequence "V%u", 8, 24),
(sequence "V%u", 0, 7)), 1>;
-def VRNoV0 : VReg<[vint8mf2_t, vint8mf4_t, vint8mf8_t,
+def VRNoV0 : VReg<[vint8m1_t, vint16m1_t, vint32m1_t, vint64m1_t,
+ vfloat16m1_t, vfloat32m1_t, vfloat64m1_t,
+ vint8mf2_t, vint8mf4_t, vint8mf8_t,
vint16mf2_t, vint16mf4_t, vint32mf2_t,
- vint8m1_t, vint16m1_t, vint32m1_t, vint64m1_t,
- vfloat16mf4_t, vfloat16mf2_t, vfloat16m1_t,
- vfloat32mf2_t, vfloat32m1_t, vfloat64m1_t,
+ vfloat16mf4_t, vfloat16mf2_t, vfloat32mf2_t,
vbool64_t, vbool32_t, vbool16_t, vbool8_t, vbool4_t,
vbool2_t, vbool1_t],
(add (sequence "V%u", 25, 31),
@@ -522,10 +535,25 @@
let Size = 64;
}
+// This register class is added to support vector mask types in inline assembly.
+def VM : VReg<[vbool1_t, vbool2_t, vbool4_t, vbool8_t, vbool16_t,
+ vbool32_t, vbool64_t],
+ (add (sequence "V%u", 25, 31),
+ (sequence "V%u", 8, 24),
+ (sequence "V%u", 0, 7)), 1>;
+
foreach m = LMULList.m in {
foreach nf = NFList<m>.L in {
- def "VRN" # nf # "M" # m : VReg<[untyped],
- (add !cast<RegisterTuples>("VN" # nf # "M" # m)),
+ def "VRN" # nf # "M" # m: VReg<[untyped],
+ (add !cast<RegisterTuples>("VN" # nf # "M" # m # "V0"), !cast<RegisterTuples>("VN" # nf # "M" # m # "NoV0")),
+ !mul(nf, m)>;
+ def "VRN" # nf # "M" # m # "NoV0": VReg<[untyped],
+ (add !cast<RegisterTuples>("VN" # nf # "M" # m # "NoV0")),
!mul(nf, m)>;
}
}
+
+// Special registers
+def FFLAGS : RISCVReg<0, "fflags">;
+def FRM : RISCVReg<0, "frm">;
+def FCSR : RISCVReg<0, "fcsr">;
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket.td
index de2cdf5..14f5915 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket.td
@@ -52,8 +52,10 @@
// Integer arithmetic and logic
def : WriteRes<WriteIALU32, [RocketUnitALU]>;
def : WriteRes<WriteIALU, [RocketUnitALU]>;
-def : WriteRes<WriteShift32, [RocketUnitALU]>;
-def : WriteRes<WriteShift, [RocketUnitALU]>;
+def : WriteRes<WriteShiftImm32, [RocketUnitALU]>;
+def : WriteRes<WriteShiftImm, [RocketUnitALU]>;
+def : WriteRes<WriteShiftReg32, [RocketUnitALU]>;
+def : WriteRes<WriteShiftReg, [RocketUnitALU]>;
// Integer multiplication
let Latency = 4 in {
@@ -143,14 +145,12 @@
// FP multiplication
let Latency = 5 in {
def : WriteRes<WriteFMul32, [RocketUnitFPALU]>;
-def : WriteRes<WriteFMulAdd32, [RocketUnitFPALU]>;
-def : WriteRes<WriteFMulSub32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFMA32, [RocketUnitFPALU]>;
}
let Latency = 7 in {
def : WriteRes<WriteFMul64, [RocketUnitFPALU]>;
-def : WriteRes<WriteFMulAdd64, [RocketUnitFPALU]>;
-def : WriteRes<WriteFMulSub64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFMA64, [RocketUnitFPALU]>;
}
// FP division
@@ -181,8 +181,10 @@
def : ReadAdvance<ReadMemBase, 0>;
def : ReadAdvance<ReadIALU, 0>;
def : ReadAdvance<ReadIALU32, 0>;
-def : ReadAdvance<ReadShift, 0>;
-def : ReadAdvance<ReadShift32, 0>;
+def : ReadAdvance<ReadShiftImm, 0>;
+def : ReadAdvance<ReadShiftImm32, 0>;
+def : ReadAdvance<ReadShiftReg, 0>;
+def : ReadAdvance<ReadShiftReg32, 0>;
def : ReadAdvance<ReadIDiv, 0>;
def : ReadAdvance<ReadIDiv32, 0>;
def : ReadAdvance<ReadIMul, 0>;
@@ -199,11 +201,9 @@
def : ReadAdvance<ReadFALU32, 0>;
def : ReadAdvance<ReadFALU64, 0>;
def : ReadAdvance<ReadFMul32, 0>;
-def : ReadAdvance<ReadFMulAdd32, 0>;
-def : ReadAdvance<ReadFMulSub32, 0>;
+def : ReadAdvance<ReadFMA32, 0>;
def : ReadAdvance<ReadFMul64, 0>;
-def : ReadAdvance<ReadFMulAdd64, 0>;
-def : ReadAdvance<ReadFMulSub64, 0>;
+def : ReadAdvance<ReadFMA64, 0>;
def : ReadAdvance<ReadFDiv32, 0>;
def : ReadAdvance<ReadFDiv64, 0>;
def : ReadAdvance<ReadFSqrt32, 0>;
@@ -230,4 +230,11 @@
def : ReadAdvance<ReadFMovI64ToF64, 0>;
def : ReadAdvance<ReadFClass32, 0>;
def : ReadAdvance<ReadFClass64, 0>;
+
+//===----------------------------------------------------------------------===//
+// Unsupported extensions
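+// Instantiating these multiclasses gives every scheduling class defined for the
+// V, Zba, Zbb, and Zfh extensions a stub entry marked Unsupported, so this
+// model does not need to describe those instructions itself.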
+defm : UnsupportedSchedV;
+defm : UnsupportedSchedZba;
+defm : UnsupportedSchedZbb;
+defm : UnsupportedSchedZfh;
}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index e57ba4f..75ca6ca 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -45,8 +45,10 @@
let Latency = 3 in {
def : WriteRes<WriteIALU, [SiFive7PipeAB]>;
def : WriteRes<WriteIALU32, [SiFive7PipeAB]>;
-def : WriteRes<WriteShift, [SiFive7PipeAB]>;
-def : WriteRes<WriteShift32, [SiFive7PipeAB]>;
+def : WriteRes<WriteShiftImm, [SiFive7PipeAB]>;
+def : WriteRes<WriteShiftImm32, [SiFive7PipeAB]>;
+def : WriteRes<WriteShiftReg, [SiFive7PipeAB]>;
+def : WriteRes<WriteShiftReg32, [SiFive7PipeAB]>;
}
// Integer multiplication
@@ -101,8 +103,7 @@
let Latency = 5 in {
def : WriteRes<WriteFALU32, [SiFive7PipeB]>;
def : WriteRes<WriteFMul32, [SiFive7PipeB]>;
-def : WriteRes<WriteFMulAdd32, [SiFive7PipeB]>;
-def : WriteRes<WriteFMulSub32, [SiFive7PipeB]>;
+def : WriteRes<WriteFMA32, [SiFive7PipeB]>;
}
let Latency = 3 in {
def : WriteRes<WriteFSGNJ32, [SiFive7PipeB]>;
@@ -118,8 +119,7 @@
let Latency = 7 in {
def : WriteRes<WriteFALU64, [SiFive7PipeB]>;
def : WriteRes<WriteFMul64, [SiFive7PipeB]>;
-def : WriteRes<WriteFMulAdd64, [SiFive7PipeB]>;
-def : WriteRes<WriteFMulSub64, [SiFive7PipeB]>;
+def : WriteRes<WriteFMA64, [SiFive7PipeB]>;
}
let Latency = 3 in {
def : WriteRes<WriteFSGNJ64, [SiFive7PipeB]>;
@@ -160,7 +160,6 @@
def : InstRW<[WriteIALU], (instrs COPY)>;
-
//===----------------------------------------------------------------------===//
// Bypass and advance
def : ReadAdvance<ReadJmp, 0>;
@@ -170,8 +169,10 @@
def : ReadAdvance<ReadMemBase, 0>;
def : ReadAdvance<ReadIALU, 0>;
def : ReadAdvance<ReadIALU32, 0>;
-def : ReadAdvance<ReadShift, 0>;
-def : ReadAdvance<ReadShift32, 0>;
+def : ReadAdvance<ReadShiftImm, 0>;
+def : ReadAdvance<ReadShiftImm32, 0>;
+def : ReadAdvance<ReadShiftReg, 0>;
+def : ReadAdvance<ReadShiftReg32, 0>;
def : ReadAdvance<ReadIDiv, 0>;
def : ReadAdvance<ReadIDiv32, 0>;
def : ReadAdvance<ReadIMul, 0>;
@@ -188,11 +189,9 @@
def : ReadAdvance<ReadFALU32, 0>;
def : ReadAdvance<ReadFALU64, 0>;
def : ReadAdvance<ReadFMul32, 0>;
-def : ReadAdvance<ReadFMulAdd32, 0>;
-def : ReadAdvance<ReadFMulSub32, 0>;
+def : ReadAdvance<ReadFMA32, 0>;
def : ReadAdvance<ReadFMul64, 0>;
-def : ReadAdvance<ReadFMulAdd64, 0>;
-def : ReadAdvance<ReadFMulSub64, 0>;
+def : ReadAdvance<ReadFMA64, 0>;
def : ReadAdvance<ReadFDiv32, 0>;
def : ReadAdvance<ReadFDiv64, 0>;
def : ReadAdvance<ReadFSqrt32, 0>;
@@ -219,4 +218,11 @@
def : ReadAdvance<ReadFMovI64ToF64, 0>;
def : ReadAdvance<ReadFClass32, 0>;
def : ReadAdvance<ReadFClass64, 0>;
+
+//===----------------------------------------------------------------------===//
+// Unsupported extensions
+defm : UnsupportedSchedV;
+defm : UnsupportedSchedZba;
+defm : UnsupportedSchedZbb;
+defm : UnsupportedSchedZfh;
}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVSchedule.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVSchedule.td
index 0806be8..4971ca1 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVSchedule.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVSchedule.td
@@ -9,8 +9,10 @@
/// Define scheduler resources associated with def operands.
def WriteIALU : SchedWrite; // 32 or 64-bit integer ALU operations
def WriteIALU32 : SchedWrite; // 32-bit integer ALU operations on RV64I
-def WriteShift32 : SchedWrite; // 32-bit shift operations on RV64Ix
-def WriteShift : SchedWrite; // 32 or 64-bit shift operations
+def WriteShiftImm : SchedWrite; // 32 or 64-bit shift by immediate operations
+def WriteShiftImm32 : SchedWrite; // 32-bit shift by immediate operations on RV64Ix
+def WriteShiftReg : SchedWrite; // 32 or 64-bit shift by register operations
+def WriteShiftReg32 : SchedWrite; // 32-bit shift by register operations on RV64Ix
def WriteIDiv : SchedWrite; // 32-bit or 64-bit divide and remainder
def WriteIDiv32 : SchedWrite; // 32-bit divide and remainder on RV64I
def WriteIMul : SchedWrite; // 32-bit or 64-bit multiply
@@ -36,26 +38,33 @@
def WriteAtomicLDD : SchedWrite; // Atomic load double word
def WriteAtomicSTW : SchedWrite; // Atomic store word
def WriteAtomicSTD : SchedWrite; // Atomic store double word
+def WriteFALU16 : SchedWrite; // FP 16-bit computation
def WriteFALU32 : SchedWrite; // FP 32-bit computation
def WriteFALU64 : SchedWrite; // FP 64-bit computation
+def WriteFMul16 : SchedWrite; // 16-bit floating point multiply
+def WriteFMA16 : SchedWrite; // 16-bit floating point fused multiply-add
def WriteFMul32 : SchedWrite; // 32-bit floating point multiply
-def WriteFMulAdd32 : SchedWrite; // 32-bit floating point multiply add
-def WriteFMulSub32 : SchedWrite; // 32-bit floating point multiply sub
+def WriteFMA32 : SchedWrite; // 32-bit floating point fused multiply-add
def WriteFMul64 : SchedWrite; // 64-bit floating point multiply
-def WriteFMulAdd64 : SchedWrite; // 64-bit floating point multiply add
-def WriteFMulSub64 : SchedWrite; // 64-bit floating point multiply sub
+def WriteFMA64 : SchedWrite; // 64-bit floating point fused multiply-add
+def WriteFDiv16 : SchedWrite; // 16-bit floating point divide
def WriteFDiv32 : SchedWrite; // 32-bit floating point divide
def WriteFDiv64 : SchedWrite; // 64-bit floating point divide
+def WriteFSqrt16 : SchedWrite; // 16-bit floating point sqrt
def WriteFSqrt32 : SchedWrite; // 32-bit floating point sqrt
def WriteFSqrt64 : SchedWrite; // 64-bit floating point sqrt
// Integer to float conversions
+def WriteFCvtI32ToF16 : SchedWrite;
def WriteFCvtI32ToF32 : SchedWrite;
def WriteFCvtI32ToF64 : SchedWrite;
+def WriteFCvtI64ToF16 : SchedWrite; // RV64I only
def WriteFCvtI64ToF32 : SchedWrite; // RV64I only
def WriteFCvtI64ToF64 : SchedWrite; // RV64I only
//Float to integer conversions
+def WriteFCvtF16ToI32 : SchedWrite;
+def WriteFCvtF16ToI64 : SchedWrite; // RV64I only
def WriteFCvtF32ToI32 : SchedWrite;
def WriteFCvtF32ToI64 : SchedWrite; // RV64I only
def WriteFCvtF64ToI32 : SchedWrite;
@@ -64,27 +73,35 @@
// Float to float conversions
def WriteFCvtF32ToF64 : SchedWrite;
def WriteFCvtF64ToF32 : SchedWrite;
+def WriteFCvtF16ToF32 : SchedWrite;
+def WriteFCvtF32ToF16 : SchedWrite;
+def WriteFCvtF16ToF64 : SchedWrite;
+def WriteFCvtF64ToF16 : SchedWrite;
-def WriteFConv32 : SchedWrite; // 32-bit floating point convert
-def WriteFConv64 : SchedWrite; // 64-bit floating point convert
+def WriteFClass16 : SchedWrite; // 16-bit floating point classify
def WriteFClass32 : SchedWrite; // 32-bit floating point classify
def WriteFClass64 : SchedWrite; // 64-bit floating point classify
+def WriteFCmp16 : SchedWrite; // 16-bit floating point compare
def WriteFCmp32 : SchedWrite; // 32-bit floating point compare
def WriteFCmp64 : SchedWrite; // 64-bit floating point compare
+def WriteFSGNJ16 : SchedWrite; // 16-bit floating point sign-injection
def WriteFSGNJ32 : SchedWrite; // 32-bit floating point sign-injection
def WriteFSGNJ64 : SchedWrite; // 64-bit floating point sign-injection
+def WriteFMinMax16 : SchedWrite; // 16-bit floating point min or max
def WriteFMinMax32 : SchedWrite; // 32-bit floating point min or max
def WriteFMinMax64 : SchedWrite; // 64-bit floating point min or max
+def WriteFMovF16ToI16 : SchedWrite;
+def WriteFMovI16ToF16 : SchedWrite;
def WriteFMovF32ToI32 : SchedWrite;
def WriteFMovI32ToF32 : SchedWrite;
def WriteFMovF64ToI64 : SchedWrite; // RV64I only
def WriteFMovI64ToF64 : SchedWrite; // RV64I only
-def WriteFMov32 : SchedWrite; // 32-bit floating point move
-def WriteFMov64 : SchedWrite; // 64-bit floating point move
+def WriteFLD16 : SchedWrite; // Floating point hp load
def WriteFLD32 : SchedWrite; // Floating point sp load
def WriteFLD64 : SchedWrite; // Floating point dp load
+def WriteFST16 : SchedWrite; // Floating point hp store
def WriteFST32 : SchedWrite; // Floating point sp store
def WriteFST64 : SchedWrite; // Floating point dp store
@@ -97,8 +114,10 @@
def ReadStoreData : SchedRead;
def ReadIALU : SchedRead;
def ReadIALU32 : SchedRead; // 32-bit integer ALU operations on RV64I
-def ReadShift : SchedRead;
-def ReadShift32 : SchedRead; // 32-bit shift operations on RV64Ix
+def ReadShiftImm : SchedRead;
+def ReadShiftImm32 : SchedRead; // 32-bit shift by immediate operations on RV64Ix
+def ReadShiftReg : SchedRead;
+def ReadShiftReg32 : SchedRead; // 32-bit shift by register operations on RV64Ix
def ReadIDiv : SchedRead;
def ReadIDiv32 : SchedRead;
def ReadIMul : SchedRead;
@@ -111,37 +130,104 @@
def ReadAtomicLDD : SchedRead; // Atomic load double word
def ReadAtomicSTW : SchedRead; // Atomic store word
def ReadAtomicSTD : SchedRead; // Atomic store double word
+def ReadFALU16 : SchedRead; // FP 16-bit computation
def ReadFALU32 : SchedRead; // FP 32-bit computation
def ReadFALU64 : SchedRead; // FP 64-bit computation
+def ReadFMul16 : SchedRead; // 16-bit floating point multiply
+def ReadFMA16 : SchedRead; // 16-bit floating point fused multiply-add
def ReadFMul32 : SchedRead; // 32-bit floating point multiply
-def ReadFMulAdd32 : SchedRead; // 32-bit floating point multiply add
-def ReadFMulSub32 : SchedRead; // 32-bit floating point multiply sub
+def ReadFMA32 : SchedRead; // 32-bit floating point fused multiply-add
def ReadFMul64 : SchedRead; // 64-bit floating point multiply
-def ReadFMulAdd64 : SchedRead; // 64-bit floating point multiply add
-def ReadFMulSub64 : SchedRead; // 64-bit floating point multiply sub
+def ReadFMA64 : SchedRead; // 64-bit floating point fused multiply-add
+def ReadFDiv16 : SchedRead; // 16-bit floating point divide
def ReadFDiv32 : SchedRead; // 32-bit floating point divide
def ReadFDiv64 : SchedRead; // 64-bit floating point divide
+def ReadFSqrt16 : SchedRead; // 16-bit floating point sqrt
def ReadFSqrt32 : SchedRead; // 32-bit floating point sqrt
def ReadFSqrt64 : SchedRead; // 64-bit floating point sqrt
+def ReadFCmp16 : SchedRead;
def ReadFCmp32 : SchedRead;
def ReadFCmp64 : SchedRead;
+def ReadFSGNJ16 : SchedRead;
def ReadFSGNJ32 : SchedRead;
def ReadFSGNJ64 : SchedRead;
+def ReadFMinMax16 : SchedRead;
def ReadFMinMax32 : SchedRead;
def ReadFMinMax64 : SchedRead;
+def ReadFCvtF16ToI32 : SchedRead;
+def ReadFCvtF16ToI64 : SchedRead;
def ReadFCvtF32ToI32 : SchedRead;
def ReadFCvtF32ToI64 : SchedRead;
def ReadFCvtF64ToI32 : SchedRead;
def ReadFCvtF64ToI64 : SchedRead;
+def ReadFCvtI32ToF16 : SchedRead;
def ReadFCvtI32ToF32 : SchedRead;
def ReadFCvtI32ToF64 : SchedRead;
+def ReadFCvtI64ToF16 : SchedRead;
def ReadFCvtI64ToF32 : SchedRead;
def ReadFCvtI64ToF64 : SchedRead;
+def ReadFMovF16ToI16 : SchedRead;
+def ReadFMovI16ToF16 : SchedRead;
def ReadFMovF32ToI32 : SchedRead;
def ReadFMovI32ToF32 : SchedRead;
def ReadFMovF64ToI64 : SchedRead;
def ReadFMovI64ToF64 : SchedRead;
def ReadFCvtF32ToF64 : SchedRead;
def ReadFCvtF64ToF32 : SchedRead;
+def ReadFCvtF16ToF32 : SchedRead;
+def ReadFCvtF32ToF16 : SchedRead;
+def ReadFCvtF16ToF64 : SchedRead;
+def ReadFCvtF64ToF16 : SchedRead;
+def ReadFClass16 : SchedRead;
def ReadFClass32 : SchedRead;
def ReadFClass64 : SchedRead;
+
+multiclass UnsupportedSchedZfh {
+let Unsupported = true in {
+def : WriteRes<WriteFALU16, []>;
+def : WriteRes<WriteFClass16, []>;
+def : WriteRes<WriteFCvtF16ToF64, []>;
+def : WriteRes<WriteFCvtF64ToF16, []>;
+def : WriteRes<WriteFCvtI64ToF16, []>;
+def : WriteRes<WriteFCvtF32ToF16, []>;
+def : WriteRes<WriteFCvtI32ToF16, []>;
+def : WriteRes<WriteFCvtF16ToI64, []>;
+def : WriteRes<WriteFCvtF16ToF32, []>;
+def : WriteRes<WriteFCvtF16ToI32, []>;
+def : WriteRes<WriteFDiv16, []>;
+def : WriteRes<WriteFCmp16, []>;
+def : WriteRes<WriteFLD16, []>;
+def : WriteRes<WriteFMA16, []>;
+def : WriteRes<WriteFMinMax16, []>;
+def : WriteRes<WriteFMul16, []>;
+def : WriteRes<WriteFMovI16ToF16, []>;
+def : WriteRes<WriteFMovF16ToI16, []>;
+def : WriteRes<WriteFSGNJ16, []>;
+def : WriteRes<WriteFST16, []>;
+def : WriteRes<WriteFSqrt16, []>;
+
+def : ReadAdvance<ReadFALU16, 0>;
+def : ReadAdvance<ReadFClass16, 0>;
+def : ReadAdvance<ReadFCvtF16ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF16, 0>;
+def : ReadAdvance<ReadFCvtI64ToF16, 0>;
+def : ReadAdvance<ReadFCvtF32ToF16, 0>;
+def : ReadAdvance<ReadFCvtI32ToF16, 0>;
+def : ReadAdvance<ReadFCvtF16ToI64, 0>;
+def : ReadAdvance<ReadFCvtF16ToF32, 0>;
+def : ReadAdvance<ReadFCvtF16ToI32, 0>;
+def : ReadAdvance<ReadFDiv16, 0>;
+def : ReadAdvance<ReadFCmp16, 0>;
+def : ReadAdvance<ReadFMA16, 0>;
+def : ReadAdvance<ReadFMinMax16, 0>;
+def : ReadAdvance<ReadFMul16, 0>;
+def : ReadAdvance<ReadFMovI16ToF16, 0>;
+def : ReadAdvance<ReadFMovF16ToI16, 0>;
+def : ReadAdvance<ReadFSGNJ16, 0>;
+def : ReadAdvance<ReadFSqrt16, 0>;
+} // Unsupported = true
+}
+
+// Include the scheduler resources for other instruction extensions.
+include "RISCVScheduleB.td"
+include "RISCVScheduleV.td"
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVScheduleB.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVScheduleB.td
new file mode 100644
index 0000000..b668b0a
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVScheduleB.td
@@ -0,0 +1,89 @@
+//===-- RISCVScheduleB.td - RISCV Scheduling Definitions B -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/// Define scheduler resources associated with def operands.
+
+// Zba extension
+def WriteSHXADD : SchedWrite; // sh1add/sh2add/sh3add
+def WriteSHXADD32 : SchedWrite; // sh1add.uw/sh2add.uw/sh3add.uw
+
+// Zbb extension
+def WriteRotateImm : SchedWrite;
+def WriteRotateImm32 : SchedWrite;
+def WriteRotateReg : SchedWrite;
+def WriteRotateReg32 : SchedWrite;
+def WriteCLZ : SchedWrite;
+def WriteCLZ32 : SchedWrite;
+def WriteCTZ : SchedWrite;
+def WriteCTZ32 : SchedWrite;
+def WriteCPOP : SchedWrite;
+def WriteCPOP32 : SchedWrite;
+def WriteREV8 : SchedWrite;
+def WriteORCB : SchedWrite;
+
+/// Define scheduler resources associated with use operands.
+
+// Zba extension
+def ReadSHXADD : SchedRead; // sh1add/sh2add/sh3add
+def ReadSHXADD32 : SchedRead; // sh1add.uw/sh2add.uw/sh3add.uw
+
+// Zbb extension
+def ReadRotateImm : SchedRead;
+def ReadRotateImm32 : SchedRead;
+def ReadRotateReg : SchedRead;
+def ReadRotateReg32 : SchedRead;
+def ReadCLZ : SchedRead;
+def ReadCLZ32 : SchedRead;
+def ReadCTZ : SchedRead;
+def ReadCTZ32 : SchedRead;
+def ReadCPOP : SchedRead;
+def ReadCPOP32 : SchedRead;
+def ReadREV8 : SchedRead;
+def ReadORCB : SchedRead;
+
+/// Define default scheduler resources for B.
+
+multiclass UnsupportedSchedZba {
+let Unsupported = true in {
+def : WriteRes<WriteSHXADD, []>;
+def : WriteRes<WriteSHXADD32, []>;
+
+def : ReadAdvance<ReadSHXADD, 0>;
+def : ReadAdvance<ReadSHXADD32, 0>;
+}
+}
+
+multiclass UnsupportedSchedZbb {
+let Unsupported = true in {
+def : WriteRes<WriteRotateImm, []>;
+def : WriteRes<WriteRotateImm32, []>;
+def : WriteRes<WriteRotateReg, []>;
+def : WriteRes<WriteRotateReg32, []>;
+def : WriteRes<WriteCLZ, []>;
+def : WriteRes<WriteCLZ32, []>;
+def : WriteRes<WriteCTZ, []>;
+def : WriteRes<WriteCTZ32, []>;
+def : WriteRes<WriteCPOP, []>;
+def : WriteRes<WriteCPOP32, []>;
+def : WriteRes<WriteREV8, []>;
+def : WriteRes<WriteORCB, []>;
+
+def : ReadAdvance<ReadRotateImm, 0>;
+def : ReadAdvance<ReadRotateImm32, 0>;
+def : ReadAdvance<ReadRotateReg, 0>;
+def : ReadAdvance<ReadRotateReg32, 0>;
+def : ReadAdvance<ReadCLZ, 0>;
+def : ReadAdvance<ReadCLZ32, 0>;
+def : ReadAdvance<ReadCTZ, 0>;
+def : ReadAdvance<ReadCTZ32, 0>;
+def : ReadAdvance<ReadCPOP, 0>;
+def : ReadAdvance<ReadCPOP32, 0>;
+def : ReadAdvance<ReadREV8, 0>;
+def : ReadAdvance<ReadORCB, 0>;
+}
+}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVScheduleV.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVScheduleV.td
new file mode 100644
index 0000000..43af180
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -0,0 +1,820 @@
+//===-- RISCVScheduleV.td - RISCV Scheduling Definitions V -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+/// Define scheduler resources associated with def operands.
+
+// 7. Vector Loads and Stores
+// 7.4. Vector Unit-Stride Instructions
+def WriteVLDE8 : SchedWrite;
+def WriteVLDE16 : SchedWrite;
+def WriteVLDE32 : SchedWrite;
+def WriteVLDE64 : SchedWrite;
+def WriteVSTE8 : SchedWrite;
+def WriteVSTE16 : SchedWrite;
+def WriteVSTE32 : SchedWrite;
+def WriteVSTE64 : SchedWrite;
+// 7.4.1. Vector Unit-Strided Mask
+def WriteVLDM : SchedWrite;
+def WriteVSTM : SchedWrite;
+// 7.5. Vector Strided Instructions
+def WriteVLDS8 : SchedWrite;
+def WriteVLDS16 : SchedWrite;
+def WriteVLDS32 : SchedWrite;
+def WriteVLDS64 : SchedWrite;
+def WriteVSTS8 : SchedWrite;
+def WriteVSTS16 : SchedWrite;
+def WriteVSTS32 : SchedWrite;
+def WriteVSTS64 : SchedWrite;
+// 7.6. Vector Indexed Instructions
+def WriteVLDUX8 : SchedWrite;
+def WriteVLDUX16 : SchedWrite;
+def WriteVLDUX32 : SchedWrite;
+def WriteVLDUX64 : SchedWrite;
+def WriteVLDOX8 : SchedWrite;
+def WriteVLDOX16 : SchedWrite;
+def WriteVLDOX32 : SchedWrite;
+def WriteVLDOX64 : SchedWrite;
+def WriteVSTUX8 : SchedWrite;
+def WriteVSTUX16 : SchedWrite;
+def WriteVSTUX32 : SchedWrite;
+def WriteVSTUX64 : SchedWrite;
+def WriteVSTOX8 : SchedWrite;
+def WriteVSTOX16 : SchedWrite;
+def WriteVSTOX32 : SchedWrite;
+def WriteVSTOX64 : SchedWrite;
+// 7.7. Vector Unit-stride Fault-Only-First Loads
+def WriteVLDFF8 : SchedWrite;
+def WriteVLDFF16 : SchedWrite;
+def WriteVLDFF32 : SchedWrite;
+def WriteVLDFF64 : SchedWrite;
+// 7.9. Vector Whole Register Instructions
+def WriteVLD1R8 : SchedWrite;
+def WriteVLD1R16 : SchedWrite;
+def WriteVLD1R32 : SchedWrite;
+def WriteVLD1R64 : SchedWrite;
+def WriteVLD2R8 : SchedWrite;
+def WriteVLD2R16 : SchedWrite;
+def WriteVLD2R32 : SchedWrite;
+def WriteVLD2R64 : SchedWrite;
+def WriteVLD4R8 : SchedWrite;
+def WriteVLD4R16 : SchedWrite;
+def WriteVLD4R32 : SchedWrite;
+def WriteVLD4R64 : SchedWrite;
+def WriteVLD8R8 : SchedWrite;
+def WriteVLD8R16 : SchedWrite;
+def WriteVLD8R32 : SchedWrite;
+def WriteVLD8R64 : SchedWrite;
+def WriteVST1R : SchedWrite;
+def WriteVST2R : SchedWrite;
+def WriteVST4R : SchedWrite;
+def WriteVST8R : SchedWrite;
+
+// 11. Vector Integer Arithmetic Instructions
+// 11.1. Vector Single-Width Integer Add and Subtract
+// 11.5. Vector Bitwise Logical Instructions
+def WriteVIALUV : SchedWrite;
+def WriteVIALUX : SchedWrite;
+def WriteVIALUI : SchedWrite;
+// 11.2. Vector Widening Integer Add/Subtract
+def WriteVIWALUV : SchedWrite;
+def WriteVIWALUX : SchedWrite;
+def WriteVIWALUI : SchedWrite;
+// 11.3. Vector Integer Extension
+def WriteVExtV : SchedWrite;
+// 11.4. Vector Integer Arithmetic with Carry or Borrow Instructions
+def WriteVICALUV : SchedWrite;
+def WriteVICALUX : SchedWrite;
+def WriteVICALUI : SchedWrite;
+// 11.6. Vector Single-Width Bit Shift Instructions
+def WriteVShiftV : SchedWrite;
+def WriteVShiftX : SchedWrite;
+def WriteVShiftI : SchedWrite;
+// 11.7. Vector Narrowing Integer Right Shift Instructions
+def WriteVNShiftV : SchedWrite;
+def WriteVNShiftX : SchedWrite;
+def WriteVNShiftI : SchedWrite;
+// 11.8. Vector Integer Comparison Instructions
+// 11.9. Vector Integer Min/Max Instructions
+def WriteVICmpV : SchedWrite;
+def WriteVICmpX : SchedWrite;
+def WriteVICmpI : SchedWrite;
+// 11.10. Vector Single-Width Integer Multiply Instructions
+def WriteVIMulV : SchedWrite;
+def WriteVIMulX : SchedWrite;
+// 11.11. Vector Integer Divide Instructions
+def WriteVIDivV : SchedWrite;
+def WriteVIDivX : SchedWrite;
+// 11.12. Vector Widening Integer Multiply Instructions
+def WriteVIWMulV : SchedWrite;
+def WriteVIWMulX : SchedWrite;
+// 11.13. Vector Single-Width Integer Multiply-Add Instructions
+def WriteVIMulAddV : SchedWrite;
+def WriteVIMulAddX : SchedWrite;
+// 11.14. Vector Widening Integer Multiply-Add Instructions
+def WriteVIWMulAddV : SchedWrite;
+def WriteVIWMulAddX : SchedWrite;
+// 11.15. Vector Integer Merge Instructions
+def WriteVIMergeV : SchedWrite;
+def WriteVIMergeX : SchedWrite;
+def WriteVIMergeI : SchedWrite;
+// 11.16. Vector Integer Move Instructions
+def WriteVIMovV : SchedWrite;
+def WriteVIMovX : SchedWrite;
+def WriteVIMovI : SchedWrite;
+
+// 12. Vector Fixed-Point Arithmetic Instructions
+// 12.1. Vector Single-Width Saturating Add and Subtract
+def WriteVSALUV : SchedWrite;
+def WriteVSALUX : SchedWrite;
+def WriteVSALUI : SchedWrite;
+// 12.2. Vector Single-Width Averaging Add and Subtract
+def WriteVAALUV : SchedWrite;
+def WriteVAALUX : SchedWrite;
+// 12.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
+def WriteVSMulV : SchedWrite;
+def WriteVSMulX : SchedWrite;
+// 12.4. Vector Single-Width Scaling Shift Instructions
+def WriteVSShiftV : SchedWrite;
+def WriteVSShiftX : SchedWrite;
+def WriteVSShiftI : SchedWrite;
+// 12.5. Vector Narrowing Fixed-Point Clip Instructions
+def WriteVNClipV : SchedWrite;
+def WriteVNClipX : SchedWrite;
+def WriteVNClipI : SchedWrite;
+
+// 13. Vector Floating-Point Instructions
+// 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions
+def WriteVFALUV : SchedWrite;
+def WriteVFALUF : SchedWrite;
+// 13.3. Vector Widening Floating-Point Add/Subtract Instructions
+def WriteVFWALUV : SchedWrite;
+def WriteVFWALUF : SchedWrite;
+// 13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
+def WriteVFMulV : SchedWrite;
+def WriteVFMulF : SchedWrite;
+def WriteVFDivV : SchedWrite;
+def WriteVFDivF : SchedWrite;
+// 13.5. Vector Widening Floating-Point Multiply
+def WriteVFWMulV : SchedWrite;
+def WriteVFWMulF : SchedWrite;
+// 13.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions
+def WriteVFMulAddV : SchedWrite;
+def WriteVFMulAddF : SchedWrite;
+// 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
+def WriteVFWMulAddV : SchedWrite;
+def WriteVFWMulAddF : SchedWrite;
+// 13.8. Vector Floating-Point Square-Root Instruction
+def WriteVFSqrtV : SchedWrite;
+// 13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
+// 13.10. Vector Floating-Point Reciprocal Estimate Instruction
+def WriteVFRecpV : SchedWrite;
+// 13.11. Vector Floating-Point MIN/MAX Instructions
+// 13.13. Vector Floating-Point Compare Instructions
+def WriteVFCmpV : SchedWrite;
+def WriteVFCmpF : SchedWrite;
+// 13.12. Vector Floating-Point Sign-Injection Instructions
+def WriteVFSgnjV : SchedWrite;
+def WriteVFSgnjF : SchedWrite;
+// 13.14. Vector Floating-Point Classify Instruction
+def WriteVFClassV : SchedWrite;
+// 13.15. Vector Floating-Point Merge Instruction
+def WriteVFMergeV : SchedWrite;
+// 13.16. Vector Floating-Point Move Instruction
+def WriteVFMovV : SchedWrite;
+// 13.17. Single-Width Floating-Point/Integer Type-Convert Instructions
+def WriteVFCvtIToFV : SchedWrite;
+def WriteVFCvtFToIV : SchedWrite;
+def WriteVFCvtFToFV : SchedWrite;
+// 13.18. Widening Floating-Point/Integer Type-Convert Instructions
+def WriteVFWCvtIToFV : SchedWrite;
+def WriteVFWCvtFToIV : SchedWrite;
+def WriteVFWCvtFToFV : SchedWrite;
+// 13.19. Narrowing Floating-Point/Integer Type-Convert Instructions
+def WriteVFNCvtIToFV : SchedWrite;
+def WriteVFNCvtFToIV : SchedWrite;
+def WriteVFNCvtFToFV : SchedWrite;
+
+// 14. Vector Reduction Operations
+// 14.1. Vector Single-Width Integer Reduction Instructions
+def WriteVIRedV : SchedWrite;
+// 14.2. Vector Widening Integer Reduction Instructions
+def WriteVIWRedV : SchedWrite;
+// 14.3. Vector Single-Width Floating-Point Reduction Instructions
+def WriteVFRedV : SchedWrite;
+def WriteVFRedOV : SchedWrite;
+// 14.4. Vector Widening Floating-Point Reduction Instructions
+def WriteVFWRedV : SchedWrite;
+def WriteVFWRedOV : SchedWrite;
+
+// 15. Vector Mask Instructions
+// 15.1. Vector Mask-Register Logical Instructions
+def WriteVMALUV : SchedWrite;
+// 15.2. Vector Mask Population Count
+def WriteVMPopV : SchedWrite;
+// 15.3. Vector Find-First-Set Mask Bit
+def WriteVMFFSV : SchedWrite;
+// 15.4. Vector Set-Before-First Mask Bit
+// 15.5. Vector Set-Including-First Mask Bit
+// 15.6. Vector Set-only-First Mask Bit
+def WriteVMSFSV : SchedWrite;
+// 15.8. Vector Iota Instruction
+def WriteVMIotV : SchedWrite;
+// 15.9. Vector Element Index Instruction
+def WriteVMIdxV : SchedWrite;
+
+// 16. Vector Permutation Instructions
+// 16.1. Integer Scalar Move Instructions
+def WriteVIMovVX : SchedWrite;
+def WriteVIMovXV : SchedWrite;
+// 16.2. Floating-Point Scalar Move Instructions
+def WriteVFMovVF : SchedWrite;
+def WriteVFMovFV : SchedWrite;
+// 16.3. Vector Slide Instructions
+def WriteVISlideX : SchedWrite;
+def WriteVISlideI : SchedWrite;
+def WriteVISlide1X : SchedWrite;
+def WriteVFSlide1F : SchedWrite;
+// 16.4. Vector Register Gather Instructions
+def WriteVGatherV : SchedWrite;
+def WriteVGatherX : SchedWrite;
+def WriteVGatherI : SchedWrite;
+// 16.5. Vector Compress Instruction
+def WriteVCompressV : SchedWrite;
+// 16.6. Whole Vector Register Move
+def WriteVMov1V : SchedWrite;
+def WriteVMov2V : SchedWrite;
+def WriteVMov4V : SchedWrite;
+def WriteVMov8V : SchedWrite;
+
+//===----------------------------------------------------------------------===//
+/// Define scheduler resources associated with use operands.
+
+// 7. Vector Loads and Stores
+def ReadVLDX : SchedRead;
+def ReadVSTX : SchedRead;
+// 7.4. Vector Unit-Stride Instructions
+def ReadVSTE8V : SchedRead;
+def ReadVSTE16V : SchedRead;
+def ReadVSTE32V : SchedRead;
+def ReadVSTE64V : SchedRead;
+// 7.4.1. Vector Unit-Strided Mask
+def ReadVSTM : SchedRead;
+// 7.5. Vector Strided Instructions
+def ReadVLDSX : SchedRead;
+def ReadVSTSX : SchedRead;
+def ReadVSTS8V : SchedRead;
+def ReadVSTS16V : SchedRead;
+def ReadVSTS32V : SchedRead;
+def ReadVSTS64V : SchedRead;
+// 7.6. Vector Indexed Instructions
+def ReadVLDUXV : SchedRead;
+def ReadVLDOXV : SchedRead;
+def ReadVSTUX8 : SchedRead;
+def ReadVSTUX16 : SchedRead;
+def ReadVSTUX32 : SchedRead;
+def ReadVSTUX64 : SchedRead;
+def ReadVSTUXV : SchedRead;
+def ReadVSTUX8V : SchedRead;
+def ReadVSTUX16V : SchedRead;
+def ReadVSTUX32V : SchedRead;
+def ReadVSTUX64V : SchedRead;
+def ReadVSTOX8 : SchedRead;
+def ReadVSTOX16 : SchedRead;
+def ReadVSTOX32 : SchedRead;
+def ReadVSTOX64 : SchedRead;
+def ReadVSTOXV : SchedRead;
+def ReadVSTOX8V : SchedRead;
+def ReadVSTOX16V : SchedRead;
+def ReadVSTOX32V : SchedRead;
+def ReadVSTOX64V : SchedRead;
+// 7.9. Vector Whole Register Instructions
+def ReadVST1R : SchedRead;
+def ReadVST2R : SchedRead;
+def ReadVST4R : SchedRead;
+def ReadVST8R : SchedRead;
+
+// 11. Vector Integer Arithmetic Instructions
+// 11.1. Vector Single-Width Integer Add and Subtract
+// 11.5. Vector Bitwise Logical Instructions
+def ReadVIALUV : SchedRead;
+def ReadVIALUX : SchedRead;
+// 11.2. Vector Widening Integer Add/Subtract
+def ReadVIWALUV : SchedRead;
+def ReadVIWALUX : SchedRead;
+// 11.3. Vector Integer Extension
+def ReadVExtV : SchedRead;
+// 11.4. Vector Integer Arithmetic with Carry or Borrow Instructions
+def ReadVIALUCV : SchedRead;
+def ReadVIALUCX : SchedRead;
+// 11.6. Vector Single-Width Bit Shift Instructions
+def ReadVShiftV : SchedRead;
+def ReadVShiftX : SchedRead;
+// 11.7. Vector Narrowing Integer Right Shift Instructions
+def ReadVNShiftV : SchedRead;
+def ReadVNShiftX : SchedRead;
+// 11.8. Vector Integer Comparison Instructions
+// 11.9. Vector Integer Min/Max Instructions
+def ReadVICmpV : SchedRead;
+def ReadVICmpX : SchedRead;
+// 11.10. Vector Single-Width Integer Multiply Instructions
+def ReadVIMulV : SchedRead;
+def ReadVIMulX : SchedRead;
+// 11.11. Vector Integer Divide Instructions
+def ReadVIDivV : SchedRead;
+def ReadVIDivX : SchedRead;
+// 11.12. Vector Widening Integer Multiply Instructions
+def ReadVIWMulV : SchedRead;
+def ReadVIWMulX : SchedRead;
+// 11.13. Vector Single-Width Integer Multiply-Add Instructions
+def ReadVIMulAddV : SchedRead;
+def ReadVIMulAddX : SchedRead;
+// 11.14. Vector Widening Integer Multiply-Add Instructions
+def ReadVIWMulAddV : SchedRead;
+def ReadVIWMulAddX : SchedRead;
+// 11.15. Vector Integer Merge Instructions
+def ReadVIMergeV : SchedRead;
+def ReadVIMergeX : SchedRead;
+// 11.16. Vector Integer Move Instructions
+def ReadVIMovV : SchedRead;
+def ReadVIMovX : SchedRead;
+
+// 12. Vector Fixed-Point Arithmetic Instructions
+// 12.1. Vector Single-Width Saturating Add and Subtract
+def ReadVSALUV : SchedRead;
+def ReadVSALUX : SchedRead;
+// 12.2. Vector Single-Width Averaging Add and Subtract
+def ReadVAALUV : SchedRead;
+def ReadVAALUX : SchedRead;
+// 12.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
+def ReadVSMulV : SchedRead;
+def ReadVSMulX : SchedRead;
+// 12.4. Vector Single-Width Scaling Shift Instructions
+def ReadVSShiftV : SchedRead;
+def ReadVSShiftX : SchedRead;
+// 12.5. Vector Narrowing Fixed-Point Clip Instructions
+def ReadVNClipV : SchedRead;
+def ReadVNClipX : SchedRead;
+
+// 13. Vector Floating-Point Instructions
+// 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions
+def ReadVFALUV : SchedRead;
+def ReadVFALUF : SchedRead;
+// 13.3. Vector Widening Floating-Point Add/Subtract Instructions
+def ReadVFWALUV : SchedRead;
+def ReadVFWALUF : SchedRead;
+// 13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
+def ReadVFMulV : SchedRead;
+def ReadVFMulF : SchedRead;
+def ReadVFDivV : SchedRead;
+def ReadVFDivF : SchedRead;
+// 13.5. Vector Widening Floating-Point Multiply
+def ReadVFWMulV : SchedRead;
+def ReadVFWMulF : SchedRead;
+// 13.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions
+def ReadVFMulAddV : SchedRead;
+def ReadVFMulAddF : SchedRead;
+// 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
+def ReadVFWMulAddV : SchedRead;
+def ReadVFWMulAddF : SchedRead;
+// 13.8. Vector Floating-Point Square-Root Instruction
+def ReadVFSqrtV : SchedRead;
+// 13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
+// 13.10. Vector Floating-Point Reciprocal Estimate Instruction
+def ReadVFRecpV : SchedRead;
+// 13.11. Vector Floating-Point MIN/MAX Instructions
+// 13.13. Vector Floating-Point Compare Instructions
+def ReadVFCmpV : SchedRead;
+def ReadVFCmpF : SchedRead;
+// 13.12. Vector Floating-Point Sign-Injection Instructions
+def ReadVFSgnjV : SchedRead;
+def ReadVFSgnjF : SchedRead;
+// 13.14. Vector Floating-Point Classify Instruction
+def ReadVFClassV : SchedRead;
+// 13.15. Vector Floating-Point Merge Instruction
+def ReadVFMergeV : SchedRead;
+def ReadVFMergeF : SchedRead;
+// 13.16. Vector Floating-Point Move Instruction
+def ReadVFMovF : SchedRead;
+// 13.17. Single-Width Floating-Point/Integer Type-Convert Instructions
+def ReadVFCvtIToFV : SchedRead;
+def ReadVFCvtFToIV : SchedRead;
+// 13.18. Widening Floating-Point/Integer Type-Convert Instructions
+def ReadVFWCvtIToFV : SchedRead;
+def ReadVFWCvtFToIV : SchedRead;
+def ReadVFWCvtFToFV : SchedRead;
+// 13.19. Narrowing Floating-Point/Integer Type-Convert Instructions
+def ReadVFNCvtIToFV : SchedRead;
+def ReadVFNCvtFToIV : SchedRead;
+def ReadVFNCvtFToFV : SchedRead;
+
+// 14. Vector Reduction Operations
+// 14.1. Vector Single-Width Integer Reduction Instructions
+def ReadVIRedV : SchedRead;
+def ReadVIRedV0 : SchedRead;
+// 14.2. Vector Widening Integer Reduction Instructions
+def ReadVIWRedV : SchedRead;
+def ReadVIWRedV0 : SchedRead;
+// 14.3. Vector Single-Width Floating-Point Reduction Instructions
+def ReadVFRedV : SchedRead;
+def ReadVFRedV0 : SchedRead;
+def ReadVFRedOV : SchedRead;
+def ReadVFRedOV0 : SchedRead;
+// 14.4. Vector Widening Floating-Point Reduction Instructions
+def ReadVFWRedV : SchedRead;
+def ReadVFWRedV0 : SchedRead;
+def ReadVFWRedOV : SchedRead;
+def ReadVFWRedOV0 : SchedRead;
+
+// 15. Vector Mask Instructions
+// 15.1. Vector Mask-Register Logical Instructions
+def ReadVMALUV : SchedRead;
+// 15.2. Vector Mask Population Count
+def ReadVMPopV : SchedRead;
+// 15.3. Vector Find-First-Set Mask Bit
+def ReadVMFFSV : SchedRead;
+// 15.4. Vector Set-Before-First Mask Bit
+// 15.5. Vector Set-Including-First Mask Bit
+// 15.6. Vector Set-only-First Mask Bit
+def ReadVMSFSV : SchedRead;
+// 15.8. Vector Iota Instruction
+def ReadVMIotV : SchedRead;
+
+// 16. Vector Permutation Instructions
+// 16.1. Integer Scalar Move Instructions
+def ReadVIMovVX : SchedRead;
+def ReadVIMovXV : SchedRead;
+def ReadVIMovXX : SchedRead;
+// 16.2. Floating-Point Scalar Move Instructions
+def ReadVFMovVF : SchedRead;
+def ReadVFMovFV : SchedRead;
+def ReadVFMovFX : SchedRead;
+// 16.3. Vector Slide Instructions
+def ReadVISlideV : SchedRead;
+def ReadVISlideX : SchedRead;
+def ReadVFSlideV : SchedRead;
+def ReadVFSlideF : SchedRead;
+// 16.4. Vector Register Gather Instructions
+def ReadVGatherV : SchedRead;
+def ReadVGatherX : SchedRead;
+// 16.5. Vector Compress Instruction
+def ReadVCompressV : SchedRead;
+// 16.6. Whole Vector Register Move
+def ReadVMov1V : SchedRead;
+def ReadVMov2V : SchedRead;
+def ReadVMov4V : SchedRead;
+def ReadVMov8V : SchedRead;
+
+// Others
+def ReadVMask : SchedRead;
+
+//===----------------------------------------------------------------------===//
+/// Define default scheduler resources for V.
+
+multiclass UnsupportedSchedV {
+let Unsupported = true in {
+
+// 7. Vector Loads and Stores
+def : WriteRes<WriteVLDE8, []>;
+def : WriteRes<WriteVLDE16, []>;
+def : WriteRes<WriteVLDE32, []>;
+def : WriteRes<WriteVLDE64, []>;
+def : WriteRes<WriteVSTE8, []>;
+def : WriteRes<WriteVSTE16, []>;
+def : WriteRes<WriteVSTE32, []>;
+def : WriteRes<WriteVSTE64, []>;
+def : WriteRes<WriteVLDM, []>;
+def : WriteRes<WriteVSTM, []>;
+def : WriteRes<WriteVLDS8, []>;
+def : WriteRes<WriteVLDS16, []>;
+def : WriteRes<WriteVLDS32, []>;
+def : WriteRes<WriteVLDS64, []>;
+def : WriteRes<WriteVSTS8, []>;
+def : WriteRes<WriteVSTS16, []>;
+def : WriteRes<WriteVSTS32, []>;
+def : WriteRes<WriteVSTS64, []>;
+def : WriteRes<WriteVLDUX8, []>;
+def : WriteRes<WriteVLDUX16, []>;
+def : WriteRes<WriteVLDUX32, []>;
+def : WriteRes<WriteVLDUX64, []>;
+def : WriteRes<WriteVLDOX8, []>;
+def : WriteRes<WriteVLDOX16, []>;
+def : WriteRes<WriteVLDOX32, []>;
+def : WriteRes<WriteVLDOX64, []>;
+def : WriteRes<WriteVSTUX8, []>;
+def : WriteRes<WriteVSTUX16, []>;
+def : WriteRes<WriteVSTUX32, []>;
+def : WriteRes<WriteVSTUX64, []>;
+def : WriteRes<WriteVSTOX8, []>;
+def : WriteRes<WriteVSTOX16, []>;
+def : WriteRes<WriteVSTOX32, []>;
+def : WriteRes<WriteVSTOX64, []>;
+def : WriteRes<WriteVLDFF8, []>;
+def : WriteRes<WriteVLDFF16, []>;
+def : WriteRes<WriteVLDFF32, []>;
+def : WriteRes<WriteVLDFF64, []>;
+def : WriteRes<WriteVLD1R8, []>;
+def : WriteRes<WriteVLD1R16, []>;
+def : WriteRes<WriteVLD1R32, []>;
+def : WriteRes<WriteVLD1R64, []>;
+def : WriteRes<WriteVLD2R8, []>;
+def : WriteRes<WriteVLD2R16, []>;
+def : WriteRes<WriteVLD2R32, []>;
+def : WriteRes<WriteVLD2R64, []>;
+def : WriteRes<WriteVLD4R8, []>;
+def : WriteRes<WriteVLD4R16, []>;
+def : WriteRes<WriteVLD4R32, []>;
+def : WriteRes<WriteVLD4R64, []>;
+def : WriteRes<WriteVLD8R8, []>;
+def : WriteRes<WriteVLD8R16, []>;
+def : WriteRes<WriteVLD8R32, []>;
+def : WriteRes<WriteVLD8R64, []>;
+def : WriteRes<WriteVST1R, []>;
+def : WriteRes<WriteVST2R, []>;
+def : WriteRes<WriteVST4R, []>;
+def : WriteRes<WriteVST8R, []>;
+
+// 11. Vector Integer Arithmetic Instructions
+def : WriteRes<WriteVIALUV, []>;
+def : WriteRes<WriteVIALUX, []>;
+def : WriteRes<WriteVIALUI, []>;
+def : WriteRes<WriteVIWALUV, []>;
+def : WriteRes<WriteVIWALUX, []>;
+def : WriteRes<WriteVIWALUI, []>;
+def : WriteRes<WriteVExtV, []>;
+def : WriteRes<WriteVICALUV, []>;
+def : WriteRes<WriteVICALUX, []>;
+def : WriteRes<WriteVICALUI, []>;
+def : WriteRes<WriteVShiftV, []>;
+def : WriteRes<WriteVShiftX, []>;
+def : WriteRes<WriteVShiftI, []>;
+def : WriteRes<WriteVNShiftV, []>;
+def : WriteRes<WriteVNShiftX, []>;
+def : WriteRes<WriteVNShiftI, []>;
+def : WriteRes<WriteVICmpV, []>;
+def : WriteRes<WriteVICmpX, []>;
+def : WriteRes<WriteVICmpI, []>;
+def : WriteRes<WriteVIMulV, []>;
+def : WriteRes<WriteVIMulX, []>;
+def : WriteRes<WriteVIDivV, []>;
+def : WriteRes<WriteVIDivX, []>;
+def : WriteRes<WriteVIWMulV, []>;
+def : WriteRes<WriteVIWMulX, []>;
+def : WriteRes<WriteVIMulAddV, []>;
+def : WriteRes<WriteVIMulAddX, []>;
+def : WriteRes<WriteVIWMulAddV, []>;
+def : WriteRes<WriteVIWMulAddX, []>;
+def : WriteRes<WriteVIMergeV, []>;
+def : WriteRes<WriteVIMergeX, []>;
+def : WriteRes<WriteVIMergeI, []>;
+def : WriteRes<WriteVIMovV, []>;
+def : WriteRes<WriteVIMovX, []>;
+def : WriteRes<WriteVIMovI, []>;
+
+// 12. Vector Fixed-Point Arithmetic Instructions
+def : WriteRes<WriteVSALUV, []>;
+def : WriteRes<WriteVSALUX, []>;
+def : WriteRes<WriteVSALUI, []>;
+def : WriteRes<WriteVAALUV, []>;
+def : WriteRes<WriteVAALUX, []>;
+def : WriteRes<WriteVSMulV, []>;
+def : WriteRes<WriteVSMulX, []>;
+def : WriteRes<WriteVSShiftV, []>;
+def : WriteRes<WriteVSShiftX, []>;
+def : WriteRes<WriteVSShiftI, []>;
+def : WriteRes<WriteVNClipV, []>;
+def : WriteRes<WriteVNClipX, []>;
+def : WriteRes<WriteVNClipI, []>;
+
+// 13. Vector Floating-Point Instructions
+def : WriteRes<WriteVFALUV, []>;
+def : WriteRes<WriteVFALUF, []>;
+def : WriteRes<WriteVFWALUV, []>;
+def : WriteRes<WriteVFWALUF, []>;
+def : WriteRes<WriteVFMulV, []>;
+def : WriteRes<WriteVFMulF, []>;
+def : WriteRes<WriteVFDivV, []>;
+def : WriteRes<WriteVFDivF, []>;
+def : WriteRes<WriteVFWMulV, []>;
+def : WriteRes<WriteVFWMulF, []>;
+def : WriteRes<WriteVFMulAddV, []>;
+def : WriteRes<WriteVFMulAddF, []>;
+def : WriteRes<WriteVFWMulAddV, []>;
+def : WriteRes<WriteVFWMulAddF, []>;
+def : WriteRes<WriteVFSqrtV, []>;
+def : WriteRes<WriteVFRecpV, []>;
+def : WriteRes<WriteVFCmpV, []>;
+def : WriteRes<WriteVFCmpF, []>;
+def : WriteRes<WriteVFSgnjV, []>;
+def : WriteRes<WriteVFSgnjF, []>;
+def : WriteRes<WriteVFClassV, []>;
+def : WriteRes<WriteVFMergeV, []>;
+def : WriteRes<WriteVFMovV, []>;
+def : WriteRes<WriteVFCvtIToFV, []>;
+def : WriteRes<WriteVFCvtFToIV, []>;
+def : WriteRes<WriteVFCvtFToFV, []>;
+def : WriteRes<WriteVFWCvtIToFV, []>;
+def : WriteRes<WriteVFWCvtFToIV, []>;
+def : WriteRes<WriteVFWCvtFToFV, []>;
+def : WriteRes<WriteVFNCvtIToFV, []>;
+def : WriteRes<WriteVFNCvtFToIV, []>;
+def : WriteRes<WriteVFNCvtFToFV, []>;
+
+// 14. Vector Reduction Operations
+def : WriteRes<WriteVIRedV, []>;
+def : WriteRes<WriteVIWRedV, []>;
+def : WriteRes<WriteVFRedV, []>;
+def : WriteRes<WriteVFRedOV, []>;
+def : WriteRes<WriteVFWRedV, []>;
+def : WriteRes<WriteVFWRedOV, []>;
+
+// 15. Vector Mask Instructions
+def : WriteRes<WriteVMALUV, []>;
+def : WriteRes<WriteVMPopV, []>;
+def : WriteRes<WriteVMFFSV, []>;
+def : WriteRes<WriteVMSFSV, []>;
+def : WriteRes<WriteVMIotV, []>;
+def : WriteRes<WriteVMIdxV, []>;
+
+// 16. Vector Permutation Instructions
+def : WriteRes<WriteVIMovVX, []>;
+def : WriteRes<WriteVIMovXV, []>;
+def : WriteRes<WriteVFMovVF, []>;
+def : WriteRes<WriteVFMovFV, []>;
+def : WriteRes<WriteVISlideX, []>;
+def : WriteRes<WriteVISlideI, []>;
+def : WriteRes<WriteVISlide1X, []>;
+def : WriteRes<WriteVFSlide1F, []>;
+def : WriteRes<WriteVGatherV, []>;
+def : WriteRes<WriteVGatherX, []>;
+def : WriteRes<WriteVGatherI, []>;
+def : WriteRes<WriteVCompressV, []>;
+def : WriteRes<WriteVMov1V, []>;
+def : WriteRes<WriteVMov2V, []>;
+def : WriteRes<WriteVMov4V, []>;
+def : WriteRes<WriteVMov8V, []>;
+
+// 7. Vector Loads and Stores
+def : ReadAdvance<ReadVLDX, 0>;
+def : ReadAdvance<ReadVSTX, 0>;
+def : ReadAdvance<ReadVSTE8V, 0>;
+def : ReadAdvance<ReadVSTE16V, 0>;
+def : ReadAdvance<ReadVSTE32V, 0>;
+def : ReadAdvance<ReadVSTE64V, 0>;
+def : ReadAdvance<ReadVSTM, 0>;
+def : ReadAdvance<ReadVLDSX, 0>;
+def : ReadAdvance<ReadVSTSX, 0>;
+def : ReadAdvance<ReadVSTS8V, 0>;
+def : ReadAdvance<ReadVSTS16V, 0>;
+def : ReadAdvance<ReadVSTS32V, 0>;
+def : ReadAdvance<ReadVSTS64V, 0>;
+def : ReadAdvance<ReadVLDUXV, 0>;
+def : ReadAdvance<ReadVLDOXV, 0>;
+def : ReadAdvance<ReadVSTUXV, 0>;
+def : ReadAdvance<ReadVSTUX8, 0>;
+def : ReadAdvance<ReadVSTUX16, 0>;
+def : ReadAdvance<ReadVSTUX32, 0>;
+def : ReadAdvance<ReadVSTUX64, 0>;
+def : ReadAdvance<ReadVSTUX8V, 0>;
+def : ReadAdvance<ReadVSTUX16V, 0>;
+def : ReadAdvance<ReadVSTUX32V, 0>;
+def : ReadAdvance<ReadVSTUX64V, 0>;
+def : ReadAdvance<ReadVSTOX8, 0>;
+def : ReadAdvance<ReadVSTOX16, 0>;
+def : ReadAdvance<ReadVSTOX32, 0>;
+def : ReadAdvance<ReadVSTOX64, 0>;
+def : ReadAdvance<ReadVSTOXV, 0>;
+def : ReadAdvance<ReadVSTOX8V, 0>;
+def : ReadAdvance<ReadVSTOX16V, 0>;
+def : ReadAdvance<ReadVSTOX32V, 0>;
+def : ReadAdvance<ReadVSTOX64V, 0>;
+def : ReadAdvance<ReadVST1R, 0>;
+def : ReadAdvance<ReadVST2R, 0>;
+def : ReadAdvance<ReadVST4R, 0>;
+def : ReadAdvance<ReadVST8R, 0>;
+
+// 11. Vector Integer Arithmetic Instructions
+def : ReadAdvance<ReadVIALUV, 0>;
+def : ReadAdvance<ReadVIALUX, 0>;
+def : ReadAdvance<ReadVIWALUV, 0>;
+def : ReadAdvance<ReadVIWALUX, 0>;
+def : ReadAdvance<ReadVExtV, 0>;
+def : ReadAdvance<ReadVIALUCV, 0>;
+def : ReadAdvance<ReadVIALUCX, 0>;
+def : ReadAdvance<ReadVShiftV, 0>;
+def : ReadAdvance<ReadVShiftX, 0>;
+def : ReadAdvance<ReadVNShiftV, 0>;
+def : ReadAdvance<ReadVNShiftX, 0>;
+def : ReadAdvance<ReadVICmpV, 0>;
+def : ReadAdvance<ReadVICmpX, 0>;
+def : ReadAdvance<ReadVIMulV, 0>;
+def : ReadAdvance<ReadVIMulX, 0>;
+def : ReadAdvance<ReadVIDivV, 0>;
+def : ReadAdvance<ReadVIDivX, 0>;
+def : ReadAdvance<ReadVIWMulV, 0>;
+def : ReadAdvance<ReadVIWMulX, 0>;
+def : ReadAdvance<ReadVIMulAddV, 0>;
+def : ReadAdvance<ReadVIMulAddX, 0>;
+def : ReadAdvance<ReadVIWMulAddV, 0>;
+def : ReadAdvance<ReadVIWMulAddX, 0>;
+def : ReadAdvance<ReadVIMergeV, 0>;
+def : ReadAdvance<ReadVIMergeX, 0>;
+def : ReadAdvance<ReadVIMovV, 0>;
+def : ReadAdvance<ReadVIMovX, 0>;
+
+// 12. Vector Fixed-Point Arithmetic Instructions
+def : ReadAdvance<ReadVSALUV, 0>;
+def : ReadAdvance<ReadVSALUX, 0>;
+def : ReadAdvance<ReadVAALUV, 0>;
+def : ReadAdvance<ReadVAALUX, 0>;
+def : ReadAdvance<ReadVSMulV, 0>;
+def : ReadAdvance<ReadVSMulX, 0>;
+def : ReadAdvance<ReadVSShiftV, 0>;
+def : ReadAdvance<ReadVSShiftX, 0>;
+def : ReadAdvance<ReadVNClipV, 0>;
+def : ReadAdvance<ReadVNClipX, 0>;
+
+// 13. Vector Floating-Point Instructions
+def : ReadAdvance<ReadVFALUV, 0>;
+def : ReadAdvance<ReadVFALUF, 0>;
+def : ReadAdvance<ReadVFWALUV, 0>;
+def : ReadAdvance<ReadVFWALUF, 0>;
+def : ReadAdvance<ReadVFMulV, 0>;
+def : ReadAdvance<ReadVFMulF, 0>;
+def : ReadAdvance<ReadVFDivV, 0>;
+def : ReadAdvance<ReadVFDivF, 0>;
+def : ReadAdvance<ReadVFWMulV, 0>;
+def : ReadAdvance<ReadVFWMulF, 0>;
+def : ReadAdvance<ReadVFMulAddV, 0>;
+def : ReadAdvance<ReadVFMulAddF, 0>;
+def : ReadAdvance<ReadVFWMulAddV, 0>;
+def : ReadAdvance<ReadVFWMulAddF, 0>;
+def : ReadAdvance<ReadVFSqrtV, 0>;
+def : ReadAdvance<ReadVFRecpV, 0>;
+def : ReadAdvance<ReadVFCmpV, 0>;
+def : ReadAdvance<ReadVFCmpF, 0>;
+def : ReadAdvance<ReadVFSgnjV, 0>;
+def : ReadAdvance<ReadVFSgnjF, 0>;
+def : ReadAdvance<ReadVFClassV, 0>;
+def : ReadAdvance<ReadVFMergeV, 0>;
+def : ReadAdvance<ReadVFMergeF, 0>;
+def : ReadAdvance<ReadVFMovF, 0>;
+def : ReadAdvance<ReadVFCvtIToFV, 0>;
+def : ReadAdvance<ReadVFCvtFToIV, 0>;
+def : ReadAdvance<ReadVFWCvtIToFV, 0>;
+def : ReadAdvance<ReadVFWCvtFToIV, 0>;
+def : ReadAdvance<ReadVFWCvtFToFV, 0>;
+def : ReadAdvance<ReadVFNCvtIToFV, 0>;
+def : ReadAdvance<ReadVFNCvtFToIV, 0>;
+def : ReadAdvance<ReadVFNCvtFToFV, 0>;
+
+// 14. Vector Reduction Operations
+def : ReadAdvance<ReadVIRedV, 0>;
+def : ReadAdvance<ReadVIRedV0, 0>;
+def : ReadAdvance<ReadVIWRedV, 0>;
+def : ReadAdvance<ReadVIWRedV0, 0>;
+def : ReadAdvance<ReadVFRedV, 0>;
+def : ReadAdvance<ReadVFRedV0, 0>;
+def : ReadAdvance<ReadVFRedOV, 0>;
+def : ReadAdvance<ReadVFRedOV0, 0>;
+def : ReadAdvance<ReadVFWRedV, 0>;
+def : ReadAdvance<ReadVFWRedV0, 0>;
+def : ReadAdvance<ReadVFWRedOV, 0>;
+def : ReadAdvance<ReadVFWRedOV0, 0>;
+
+// 15. Vector Mask Instructions
+def : ReadAdvance<ReadVMALUV, 0>;
+def : ReadAdvance<ReadVMPopV, 0>;
+def : ReadAdvance<ReadVMFFSV, 0>;
+def : ReadAdvance<ReadVMSFSV, 0>;
+def : ReadAdvance<ReadVMIotV, 0>;
+
+// 16. Vector Permutation Instructions
+def : ReadAdvance<ReadVIMovVX, 0>;
+def : ReadAdvance<ReadVIMovXV, 0>;
+def : ReadAdvance<ReadVIMovXX, 0>;
+def : ReadAdvance<ReadVFMovVF, 0>;
+def : ReadAdvance<ReadVFMovFV, 0>;
+def : ReadAdvance<ReadVFMovFX, 0>;
+def : ReadAdvance<ReadVISlideV, 0>;
+def : ReadAdvance<ReadVISlideX, 0>;
+def : ReadAdvance<ReadVFSlideV, 0>;
+def : ReadAdvance<ReadVFSlideF, 0>;
+def : ReadAdvance<ReadVGatherV, 0>;
+def : ReadAdvance<ReadVGatherX, 0>;
+def : ReadAdvance<ReadVCompressV, 0>;
+def : ReadAdvance<ReadVMov1V, 0>;
+def : ReadAdvance<ReadVMov2V, 0>;
+def : ReadAdvance<ReadVMov4V, 0>;
+def : ReadAdvance<ReadVMov8V, 0>;
+
+// Others
+def : ReadAdvance<ReadVMask, 0>;
+
+} // Unsupported
+} // UnsupportedSchedV
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index df11d23..b19fdcb 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -27,19 +27,42 @@
#define GET_SUBTARGETINFO_CTOR
#include "RISCVGenSubtargetInfo.inc"
+static cl::opt<unsigned> RVVVectorBitsMax(
+ "riscv-v-vector-bits-max",
+ cl::desc("Assume V extension vector registers are at most this big, "
+ "with zero meaning no maximum size is assumed."),
+ cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> RVVVectorBitsMin(
+ "riscv-v-vector-bits-min",
+ cl::desc("Assume V extension vector registers are at least this big, "
+ "with zero meaning no minimum size is assumed."),
+ cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> RVVVectorLMULMax(
+ "riscv-v-fixed-length-vector-lmul-max",
+ cl::desc("The maximum LMUL value to use for fixed length vectors. "
+ "Fractional LMUL values are not supported."),
+ cl::init(8), cl::Hidden);
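+// These are hidden backend flags; when experimenting they can be passed to llc
+// directly (e.g. -riscv-v-vector-bits-min=128) or through clang via -mllvm.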
+
void RISCVSubtarget::anchor() {}
-RISCVSubtarget &RISCVSubtarget::initializeSubtargetDependencies(
- const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, StringRef ABIName) {
+RISCVSubtarget &
+RISCVSubtarget::initializeSubtargetDependencies(const Triple &TT, StringRef CPU,
+ StringRef TuneCPU, StringRef FS,
+ StringRef ABIName) {
// Determine default and user-specified characteristics
bool Is64Bit = TT.isArch64Bit();
- std::string CPUName = std::string(CPU);
- std::string TuneCPUName = std::string(TuneCPU);
- if (CPUName.empty())
- CPUName = Is64Bit ? "generic-rv64" : "generic-rv32";
- if (TuneCPUName.empty())
- TuneCPUName = CPUName;
- ParseSubtargetFeatures(CPUName, TuneCPUName, FS);
+ if (CPU.empty())
+ CPU = Is64Bit ? "generic-rv64" : "generic-rv32";
+ if (CPU == "generic")
+ report_fatal_error(Twine("CPU 'generic' is not supported. Use ") +
+ (Is64Bit ? "generic-rv64" : "generic-rv32"));
+
+ if (TuneCPU.empty())
+ TuneCPU = CPU;
+
+ ParseSubtargetFeatures(CPU, TuneCPU, FS);
if (Is64Bit) {
XLenVT = MVT::i64;
XLen = 64;
@@ -81,3 +104,47 @@
const RegisterBankInfo *RISCVSubtarget::getRegBankInfo() const {
return RegBankInfo.get();
}
+
+unsigned RISCVSubtarget::getMaxRVVVectorSizeInBits() const {
+ assert(hasStdExtV() && "Tried to get vector length without V support!");
+ if (RVVVectorBitsMax == 0)
+ return 0;
+ assert(RVVVectorBitsMax >= 128 && RVVVectorBitsMax <= 65536 &&
+ isPowerOf2_32(RVVVectorBitsMax) &&
+ "V extension requires vector length to be in the range of 128 to "
+ "65536 and a power of 2!");
+ assert(RVVVectorBitsMax >= RVVVectorBitsMin &&
+ "Minimum V extension vector length should not be larger than its "
+ "maximum!");
+ unsigned Max = std::max(RVVVectorBitsMin, RVVVectorBitsMax);
+ return PowerOf2Floor((Max < 128 || Max > 65536) ? 0 : Max);
+}
+
+unsigned RISCVSubtarget::getMinRVVVectorSizeInBits() const {
+ assert(hasStdExtV() &&
+ "Tried to get vector length without V extension support!");
+ assert((RVVVectorBitsMin == 0 ||
+ (RVVVectorBitsMin >= 128 && RVVVectorBitsMin <= 65536 &&
+ isPowerOf2_32(RVVVectorBitsMin))) &&
+ "V extension requires vector length to be in the range of 128 to "
+ "65536 and a power of 2!");
+ assert((RVVVectorBitsMax >= RVVVectorBitsMin || RVVVectorBitsMax == 0) &&
+ "Minimum V extension vector length should not be larger than its "
+ "maximum!");
+ unsigned Min = RVVVectorBitsMin;
+ if (RVVVectorBitsMax != 0)
+ Min = std::min(RVVVectorBitsMin, RVVVectorBitsMax);
+ return PowerOf2Floor((Min < 128 || Min > 65536) ? 0 : Min);
+}
+
+unsigned RISCVSubtarget::getMaxLMULForFixedLengthVectors() const {
+ assert(hasStdExtV() &&
+ "Tried to get maximum LMUL without V extension support!");
+ assert(RVVVectorLMULMax <= 8 && isPowerOf2_32(RVVVectorLMULMax) &&
+ "V extension requires a LMUL to be at most 8 and a power of 2!");
+ return PowerOf2Floor(std::max<unsigned>(RVVVectorLMULMax, 1));
+}
+
+bool RISCVSubtarget::useRVVForFixedLengthVectors() const {
+ return hasStdExtV() && getMinRVVVectorSizeInBits() != 0;
+}
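+
+// For example, with -riscv-v-vector-bits-min=256 and the V extension enabled,
+// getMinRVVVectorSizeInBits() returns 256 and useRVVForFixedLengthVectors()
+// becomes true.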
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h b/src/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 561b04c..ce36331 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -62,6 +62,7 @@
bool EnableSaveRestore = false;
unsigned XLen = 32;
MVT XLenVT = MVT::i32;
+ uint8_t MaxInterleaveFactor = 2;
RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown;
BitVector UserReservedRegister;
RISCVFrameLowering FrameLowering;
@@ -134,6 +135,9 @@
assert(i < RISCV::NUM_TARGET_REGS && "Register out of range");
return UserReservedRegister[i];
}
+ unsigned getMaxInterleaveFactor() const {
+ return hasStdExtV() ? MaxInterleaveFactor : 1;
+ }
protected:
// GlobalISel related APIs.
@@ -147,6 +151,14 @@
InstructionSelector *getInstructionSelector() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
+
+ // Return the known range for the bit length of RVV data registers. A value
+ // of 0 means nothing is known about that particular limit beyond what's
+ // implied by the architecture.
+ unsigned getMaxRVVVectorSizeInBits() const;
+ unsigned getMinRVVVectorSizeInBits() const;
+ unsigned getMaxLMULForFixedLengthVectors() const;
+ bool useRVVForFixedLengthVectors() const;
};
} // End llvm namespace
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/src/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index 16399fe..a561772 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -19,9 +19,13 @@
class SysReg<string name, bits<12> op> {
string Name = name;
- bits<12> Encoding = op;
// A maximum of one alias is supported right now.
string AltName = name;
+ // A maximum of one deprecated name is supported right now. Unlike the
+ // `AltName` alias, a `DeprecatedName` generates a diagnostic when the name is
+ // used, to encourage software to migrate away from the name.
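+ // For example, `mbadaddr` below is retained as the deprecated name for
+ // `mtval` (encoding 0x343); assembly that still uses `mbadaddr` resolves to
+ // the same encoding but triggers the deprecation diagnostic.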
+ string DeprecatedName = "";
+ bits<12> Encoding = op;
// FIXME: add these additional fields when needed.
// Privilege Access: Read and Write = 0, 1, 2; Read-Only = 3.
// Privilege Mode: User = 0, System = 1 or Machine = 3.
@@ -38,7 +42,10 @@
def SysRegsList : GenericTable {
let FilterClass = "SysReg";
// FIXME: add "ReadWrite", "Mode", "Extra", "Number" fields when needed.
- let Fields = [ "Name", "Encoding", "AltName", "FeaturesRequired", "isRV32Only" ];
+ let Fields = [
+ "Name", "AltName", "DeprecatedName", "Encoding", "FeaturesRequired",
+ "isRV32Only",
+ ];
let PrimaryKey = [ "Encoding" ];
let PrimaryKeyName = "lookupSysRegByEncoding";
@@ -54,6 +61,11 @@
let Key = [ "AltName" ];
}
+def lookupSysRegByDeprecatedName : SearchIndex {
+ let Table = SysRegsList;
+ let Key = [ "DeprecatedName" ];
+}
+
// The following CSR encodings match those given in Tables 2.2,
// 2.3, 2.4 and 2.5 in the RISC-V Instruction Set Manual
// Volume II: Privileged Architecture.
@@ -71,6 +83,7 @@
def : SysReg<"uscratch", 0x040>;
def : SysReg<"uepc", 0x041>;
def : SysReg<"ucause", 0x042>;
+let DeprecatedName = "ubadaddr" in
def : SysReg<"utval", 0x043>;
def : SysReg<"uip", 0x044>;
@@ -78,9 +91,9 @@
// User Floating-Point CSRs
//===--------------------------
-def FFLAGS : SysReg<"fflags", 0x001>;
-def FRM : SysReg<"frm", 0x002>;
-def FCSR : SysReg<"fcsr", 0x003>;
+def SysRegFFLAGS : SysReg<"fflags", 0x001>;
+def SysRegFRM : SysReg<"frm", 0x002>;
+def SysRegFCSR : SysReg<"fcsr", 0x003>;
//===--------------------------
// User Counter/Timers
@@ -171,12 +184,14 @@
def : SysReg<"sscratch", 0x140>;
def : SysReg<"sepc", 0x141>;
def : SysReg<"scause", 0x142>;
+let DeprecatedName = "sbadaddr" in
def : SysReg<"stval", 0x143>;
def : SysReg<"sip", 0x144>;
//===-------------------------------------
// Supervisor Protection and Translation
//===-------------------------------------
+let DeprecatedName = "sptbr" in
def : SysReg<"satp", 0x180>;
//===-----------------------------
@@ -205,6 +220,7 @@
def : SysReg<"mscratch", 0x340>;
def : SysReg<"mepc", 0x341>;
def : SysReg<"mcause", 0x342>;
+let DeprecatedName = "mbadaddr" in
def : SysReg<"mtval", 0x343>;
def : SysReg<"mip", 0x344>;
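
The `DeprecatedName` field added above lets the assembler keep accepting legacy CSR names (`ubadaddr`, `sbadaddr`, `sptbr`, `mbadaddr`) while steering users toward the current ones. The following is a hypothetical, self-contained sketch of that lookup-and-warn pattern; `SysRegEntry`, `lookupByAnyName`, and the table below are illustrative and not the TableGen-generated `SysRegsList`:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Illustrative table entry; not the generated SysRegsList layout.
struct SysRegEntry {
  const char *Name;
  const char *DeprecatedName;
  uint16_t Encoding;
};

static const SysRegEntry SysRegs[] = {
    {"utval", "ubadaddr", 0x043},
    {"stval", "sbadaddr", 0x143},
    {"satp", "sptbr", 0x180},
    {"mtval", "mbadaddr", 0x343},
};

// Resolve either the current or the deprecated name, reporting which one hit.
static const SysRegEntry *lookupByAnyName(const char *N, bool &WasDeprecated) {
  for (const SysRegEntry &E : SysRegs) {
    if (std::strcmp(E.Name, N) == 0) {
      WasDeprecated = false;
      return &E;
    }
    if (std::strcmp(E.DeprecatedName, N) == 0) {
      WasDeprecated = true;
      return &E;
    }
  }
  return nullptr;
}

int main() {
  bool Deprecated = false;
  if (const SysRegEntry *E = lookupByAnyName("sptbr", Deprecated))
    if (Deprecated)
      std::printf("warning: 'sptbr' is deprecated; use '%s' (csr 0x%03x)\n",
                  E->Name, E->Encoding);
}
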
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 32fb7cd..b18ee60 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -39,7 +39,7 @@
initializeGlobalISel(*PR);
initializeRISCVMergeBaseOffsetOptPass(*PR);
initializeRISCVExpandPseudoPass(*PR);
- initializeRISCVCleanupVSETVLIPass(*PR);
+ initializeRISCVInsertVSETVLIPass(*PR);
}
static StringRef computeDataLayout(const Triple &TT) {
@@ -174,7 +174,7 @@
}
bool RISCVPassConfig::addGlobalInstructionSelect() {
- addPass(new InstructionSelect());
+ addPass(new InstructionSelect(getOptLevel()));
return false;
}
@@ -191,8 +191,7 @@
}
void RISCVPassConfig::addPreRegAlloc() {
- if (TM->getOptLevel() != CodeGenOpt::None) {
+ if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createRISCVMergeBaseOffsetOptPass());
- addPass(createRISCVCleanupVSETVLIPass());
- }
+ addPass(createRISCVInsertVSETVLIPass());
}
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
index fba86b4..5208371 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
@@ -84,6 +84,7 @@
}
void RISCVELFTargetObjectFile::getModuleMetadata(Module &M) {
+ TargetLoweringObjectFileELF::getModuleMetadata(M);
SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
M.getModuleFlagsMetadata(ModuleFlags);
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 27714cf..fd110db 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -15,8 +15,8 @@
#define DEBUG_TYPE "riscvtti"
-int RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy() &&
"getIntImmCost can only estimate cost of materialising integers");
@@ -27,13 +27,13 @@
// Otherwise, we check how many instructions it will take to materialise.
const DataLayout &DL = getDataLayout();
return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
- getST()->is64Bit());
+ getST()->getFeatureBits());
}
-int RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind,
- Instruction *Inst) {
+InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
assert(Ty->isIntegerTy() &&
"getIntImmCost can only estimate cost of materialising integers");
@@ -88,9 +88,70 @@
return TTI::TCC_Free;
}
-int RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
// Prevent hoisting in unknown cases.
return TTI::TCC_Free;
}
+
+TargetTransformInfo::PopcntSupportKind
+RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+ return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
+}
+
+bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
+ // Currently, the ExpandReductions pass can't expand scalable-vector
+ // reductions, but we still request expansion as RVV doesn't support certain
+ // reductions and the SelectionDAG can't legalize them either.
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+ // These reductions have no equivalent in RVV
+ case Intrinsic::vector_reduce_mul:
+ case Intrinsic::vector_reduce_fmul:
+ return true;
+ }
+}
+
+Optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
+ // The V specification makes no assumption about the maximum vector length,
+ // so we use the value specified by the user as the maximum vector length.
+ // This function uses that assumed maximum vector length to compute the
+ // maximum vscale for the LoopVectorizer.
+ // If the user does not specify a maximum vector length, we have no way of
+ // knowing whether it is safe for the LoopVectorizer to proceed.
+ // We only consider using a single vector register (LMUL = 1) to vectorize.
+ unsigned MaxVectorSizeInBits = ST->getMaxRVVVectorSizeInBits();
+ if (ST->hasStdExtV() && MaxVectorSizeInBits != 0)
+ return MaxVectorSizeInBits / RISCV::RVVBitsPerBlock;
+ return BaseT::getMaxVScale();
+}
+
+InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
+ unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
+ Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+ Alignment, CostKind, I);
+
+ if ((Opcode == Instruction::Load &&
+ !isLegalMaskedGather(DataTy, Align(Alignment))) ||
+ (Opcode == Instruction::Store &&
+ !isLegalMaskedScatter(DataTy, Align(Alignment))))
+ return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+ Alignment, CostKind, I);
+
+ // FIXME: Only supporting fixed vectors for now.
+ if (!isa<FixedVectorType>(DataTy))
+ return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+ Alignment, CostKind, I);
+
+ auto *VTy = cast<FixedVectorType>(DataTy);
+ unsigned NumLoads = VTy->getNumElements();
+ InstructionCost MemOpCost =
+ getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0, CostKind, I);
+ return NumLoads * MemOpCost;
+}
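
To make the vscale computation above concrete: assuming RVVBitsPerBlock is 64 bits (restated here as an assumption rather than taken from the RISCV namespace), a user-specified maximum VLEN translates directly into the maximum vscale seen by the LoopVectorizer. A small standalone sketch:

#include <cstdio>

// Assumed block size: one vscale unit corresponds to 64 bits of VLEN.
static const unsigned RVVBitsPerBlock = 64;

static unsigned maxVScale(unsigned MaxVectorSizeInBits) {
  // Mirrors the division above; a value of 0 means "unknown" and the caller
  // falls back to the base implementation instead.
  return MaxVectorSizeInBits / RVVBitsPerBlock;
}

int main() {
  // e.g. a declared maximum VLEN of 512 bits gives a maximum vscale of 8.
  std::printf("%u %u\n", maxVScale(128), maxVScale(512)); // prints "2 8"
}
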
diff --git a/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 8d077e9..95dacb1 100644
--- a/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -41,12 +41,146 @@
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
- int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind,
- Instruction *Inst = nullptr);
- int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind);
+ InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
+ InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
+
+ TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
+ bool shouldExpandReduction(const IntrinsicInst *II) const;
+ bool supportsScalableVectors() const { return ST->hasStdExtV(); }
+ Optional<unsigned> getMaxVScale() const;
+
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(ST->getXLen());
+ case TargetTransformInfo::RGK_FixedWidthVector:
+ return TypeSize::getFixed(
+ ST->hasStdExtV() ? ST->getMinRVVVectorSizeInBits() : 0);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(
+ ST->hasStdExtV() ? ST->getMinRVVVectorSizeInBits() : 0);
+ }
+
+ llvm_unreachable("Unsupported register kind");
+ }
+
+ InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I);
+
+ bool isLegalElementTypeForRVV(Type *ScalarTy) const {
+ if (ScalarTy->isPointerTy())
+ return true;
+
+ if (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
+ ScalarTy->isIntegerTy(32) || ScalarTy->isIntegerTy(64))
+ return true;
+
+ if (ScalarTy->isHalfTy())
+ return ST->hasStdExtZfh();
+ if (ScalarTy->isFloatTy())
+ return ST->hasStdExtF();
+ if (ScalarTy->isDoubleTy())
+ return ST->hasStdExtD();
+
+ return false;
+ }
+
+ bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
+ if (!ST->hasStdExtV())
+ return false;
+
+ // Only support fixed vectors if we know the minimum vector size.
+ if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
+ return false;
+
+ if (Alignment <
+ DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize())
+ return false;
+
+ return isLegalElementTypeForRVV(DataType->getScalarType());
+ }
+
+ bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
+ return isLegalMaskedLoadStore(DataType, Alignment);
+ }
+ bool isLegalMaskedStore(Type *DataType, Align Alignment) {
+ return isLegalMaskedLoadStore(DataType, Alignment);
+ }
+
+ bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) {
+ if (!ST->hasStdExtV())
+ return false;
+
+ // Only support fixed vectors if we know the minimum vector size.
+ if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
+ return false;
+
+ if (Alignment <
+ DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize())
+ return false;
+
+ return isLegalElementTypeForRVV(DataType->getScalarType());
+ }
+
+ bool isLegalMaskedGather(Type *DataType, Align Alignment) {
+ return isLegalMaskedGatherScatter(DataType, Alignment);
+ }
+ bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
+ return isLegalMaskedGatherScatter(DataType, Alignment);
+ }
+
+ /// \returns How the target needs this vector-predicated operation to be
+ /// transformed.
+ TargetTransformInfo::VPLegalization
+ getVPLegalizationStrategy(const VPIntrinsic &PI) const {
+ using VPLegalization = TargetTransformInfo::VPLegalization;
+ return VPLegalization(VPLegalization::Legal, VPLegalization::Legal);
+ }
+
+ bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
+ ElementCount VF) const {
+ if (!ST->hasStdExtV())
+ return false;
+
+ if (!VF.isScalable())
+ return true;
+
+ Type *Ty = RdxDesc.getRecurrenceType();
+ if (!isLegalElementTypeForRVV(Ty))
+ return false;
+
+ switch (RdxDesc.getRecurrenceKind()) {
+ case RecurKind::Add:
+ case RecurKind::FAdd:
+ case RecurKind::And:
+ case RecurKind::Or:
+ case RecurKind::Xor:
+ case RecurKind::SMin:
+ case RecurKind::SMax:
+ case RecurKind::UMin:
+ case RecurKind::UMax:
+ case RecurKind::FMin:
+ case RecurKind::FMax:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ unsigned getMaxInterleaveFactor(unsigned VF) {
+ return ST->getMaxInterleaveFactor();
+ }
};
} // end namespace llvm
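
The masked load/store and gather/scatter hooks above all apply the same two checks: the element type must be one RVV supports, and the access must be at least element-aligned. A compact sketch of that predicate, with the element-size lookup replaced by an explicit byte count (an assumption for illustration):

#include <cstdio>

// ElementBytes stands in for DL.getTypeStoreSize(ScalarTy).getFixedSize().
static bool isLegalMaskedAccess(bool HasV, bool ElementTypeSupported,
                                unsigned AlignmentBytes,
                                unsigned ElementBytes) {
  if (!HasV)
    return false;
  if (AlignmentBytes < ElementBytes)
    return false;
  return ElementTypeSupported;
}

int main() {
  // An i64 access with only 4-byte alignment is rejected; with natural
  // 8-byte alignment it is accepted.
  std::printf("%d %d\n", isLegalMaskedAccess(true, true, 4, 8),
              isLegalMaskedAccess(true, true, 8, 8)); // prints "0 1"
}
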
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/src/llvm-project/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 5f1bf31..9a2df8c 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/src/llvm-project/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -1034,6 +1034,9 @@
case Sparc::TBR:
Op = SparcOperand::CreateToken("%tbr", S);
break;
+ case Sparc::PC:
+ Op = SparcOperand::CreateToken("%pc", S);
+ break;
case Sparc::ICC:
if (name == "xcc")
Op = SparcOperand::CreateToken("%xcc", S);
@@ -1054,18 +1057,12 @@
case AsmToken::Integer:
case AsmToken::LParen:
case AsmToken::Dot:
- if (!getParser().parseExpression(EVal, E))
- Op = SparcOperand::CreateImm(EVal, S, E);
- break;
+ case AsmToken::Identifier:
+ if (getParser().parseExpression(EVal, E))
+ break;
- case AsmToken::Identifier: {
- StringRef Identifier;
- if (!getParser().parseIdentifier(Identifier)) {
- E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
- MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
-
- const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
- getContext());
+ int64_t Res;
+ if (!EVal->evaluateAsAbsolute(Res)) {
SparcMCExpr::VariantKind Kind = SparcMCExpr::VK_Sparc_13;
if (getContext().getObjectFileInfo()->isPositionIndependent()) {
@@ -1074,14 +1071,11 @@
else
Kind = SparcMCExpr::VK_Sparc_GOT13;
}
-
- Res = SparcMCExpr::create(Kind, Res, getContext());
-
- Op = SparcOperand::CreateImm(Res, S, E);
+ EVal = SparcMCExpr::create(Kind, EVal, getContext());
}
+ Op = SparcOperand::CreateImm(EVal, S, E);
break;
}
- }
return (Op) ? MatchOperand_Success : MatchOperand_ParseFail;
}
@@ -1131,9 +1125,8 @@
return true;
}
- if (name.substr(0, 3).equals_lower("asr")
- && !name.substr(3).getAsInteger(10, intVal)
- && intVal > 0 && intVal < 32) {
+ if (name.substr(0, 3).equals_insensitive("asr") &&
+ !name.substr(3).getAsInteger(10, intVal) && intVal > 0 && intVal < 32) {
RegNo = ASRRegs[intVal];
RegKind = SparcOperand::rk_Special;
return true;
@@ -1202,9 +1195,8 @@
}
// %fcc0 - %fcc3
- if (name.substr(0, 3).equals_lower("fcc")
- && !name.substr(3).getAsInteger(10, intVal)
- && intVal < 4) {
+ if (name.substr(0, 3).equals_insensitive("fcc") &&
+ !name.substr(3).getAsInteger(10, intVal) && intVal < 4) {
// FIXME: check 64bit and handle %fcc1 - %fcc3
RegNo = Sparc::FCC0 + intVal;
RegKind = SparcOperand::rk_Special;
@@ -1212,46 +1204,42 @@
}
// %g0 - %g7
- if (name.substr(0, 1).equals_lower("g")
- && !name.substr(1).getAsInteger(10, intVal)
- && intVal < 8) {
+ if (name.substr(0, 1).equals_insensitive("g") &&
+ !name.substr(1).getAsInteger(10, intVal) && intVal < 8) {
RegNo = IntRegs[intVal];
RegKind = SparcOperand::rk_IntReg;
return true;
}
// %o0 - %o7
- if (name.substr(0, 1).equals_lower("o")
- && !name.substr(1).getAsInteger(10, intVal)
- && intVal < 8) {
+ if (name.substr(0, 1).equals_insensitive("o") &&
+ !name.substr(1).getAsInteger(10, intVal) && intVal < 8) {
RegNo = IntRegs[8 + intVal];
RegKind = SparcOperand::rk_IntReg;
return true;
}
- if (name.substr(0, 1).equals_lower("l")
- && !name.substr(1).getAsInteger(10, intVal)
- && intVal < 8) {
+ if (name.substr(0, 1).equals_insensitive("l") &&
+ !name.substr(1).getAsInteger(10, intVal) && intVal < 8) {
RegNo = IntRegs[16 + intVal];
RegKind = SparcOperand::rk_IntReg;
return true;
}
- if (name.substr(0, 1).equals_lower("i")
- && !name.substr(1).getAsInteger(10, intVal)
- && intVal < 8) {
+ if (name.substr(0, 1).equals_insensitive("i") &&
+ !name.substr(1).getAsInteger(10, intVal) && intVal < 8) {
RegNo = IntRegs[24 + intVal];
RegKind = SparcOperand::rk_IntReg;
return true;
}
// %f0 - %f31
- if (name.substr(0, 1).equals_lower("f")
- && !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 32) {
+ if (name.substr(0, 1).equals_insensitive("f") &&
+ !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 32) {
RegNo = FloatRegs[intVal];
RegKind = SparcOperand::rk_FloatReg;
return true;
}
// %f32 - %f62
- if (name.substr(0, 1).equals_lower("f")
- && !name.substr(1, 2).getAsInteger(10, intVal)
- && intVal >= 32 && intVal <= 62 && (intVal % 2 == 0)) {
+ if (name.substr(0, 1).equals_insensitive("f") &&
+ !name.substr(1, 2).getAsInteger(10, intVal) && intVal >= 32 &&
+ intVal <= 62 && (intVal % 2 == 0)) {
// FIXME: Check V9
RegNo = DoubleRegs[intVal/2];
RegKind = SparcOperand::rk_DoubleReg;
@@ -1259,17 +1247,16 @@
}
// %r0 - %r31
- if (name.substr(0, 1).equals_lower("r")
- && !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 31) {
+ if (name.substr(0, 1).equals_insensitive("r") &&
+ !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 31) {
RegNo = IntRegs[intVal];
RegKind = SparcOperand::rk_IntReg;
return true;
}
// %c0 - %c31
- if (name.substr(0, 1).equals_lower("c")
- && !name.substr(1).getAsInteger(10, intVal)
- && intVal < 32) {
+ if (name.substr(0, 1).equals_insensitive("c") &&
+ !name.substr(1).getAsInteger(10, intVal) && intVal < 32) {
RegNo = CoprocRegs[intVal];
RegKind = SparcOperand::rk_CoprocReg;
return true;
@@ -1350,6 +1337,11 @@
RegKind = SparcOperand::rk_Special;
return true;
}
+ if (name.equals("pc")) {
+ RegNo = Sparc::PC;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
}
return false;
}
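
The register-name matching above repeatedly applies the same pattern: a case-insensitive prefix test followed by a bounded integer suffix. A standalone sketch of that pattern, under the assumption that the '%' has already been consumed (matchPrefixedReg is illustrative, not the parser's API):

#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Returns true if Name is Prefix (case-insensitively) followed by a decimal
// index strictly below UpperBound, and stores the index in Index.
static bool matchPrefixedReg(const char *Name, const char *Prefix,
                             unsigned UpperBound, unsigned &Index) {
  size_t PLen = std::strlen(Prefix);
  if (std::strlen(Name) <= PLen)
    return false;
  for (size_t I = 0; I < PLen; ++I)
    if (std::tolower((unsigned char)Name[I]) !=
        std::tolower((unsigned char)Prefix[I]))
      return false;
  char *End = nullptr;
  unsigned long Val = std::strtoul(Name + PLen, &End, 10);
  if (*End != '\0' || Val >= UpperBound)
    return false;
  Index = static_cast<unsigned>(Val);
  return true;
}

int main() {
  unsigned Idx;
  std::printf("%d", matchPrefixedReg("G3", "g", 8, Idx));   // 1, Idx == 3
  std::printf(" %d\n", matchPrefixedReg("g9", "g", 8, Idx)); // 0 (out of range)
}
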
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/src/llvm-project/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index 1caa333..5c4419c 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/src/llvm-project/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -113,7 +113,7 @@
static const unsigned PRRegDecoderTable[] = {
SP::TPC, SP::TNPC, SP::TSTATE, SP::TT, SP::TICK, SP::TBA, SP::PSTATE,
SP::TL, SP::PIL, SP::CWP, SP::CANSAVE, SP::CANRESTORE, SP::CLEANWIN,
- SP::OTHERWIN, SP::WSTATE
+ SP::OTHERWIN, SP::WSTATE, SP::PC
};
static const uint16_t IntPairDecoderTable[] = {
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp b/src/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp
index 6ad6940..fa05a41 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp
+++ b/src/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp
@@ -87,7 +87,7 @@
if (MO.isGlobal()) {
StringRef FuncName = MO.getGlobal()->getName();
- if (FuncName.compare_lower("fesetround") == 0) {
+ if (FuncName.compare_insensitive("fesetround") == 0) {
errs() << "Error: You are using the detectroundchange "
"option to detect rounding changes that will "
"cause LEON errata. The only way to fix this "
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index 5a9ecfe..51eccfa 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -52,6 +52,7 @@
case Sparc::fixup_sparc_tls_ldm_hi22:
case Sparc::fixup_sparc_tls_ie_hi22:
case Sparc::fixup_sparc_hi22:
+ case Sparc::fixup_sparc_lm:
return (Value >> 10) & 0x3fffff;
case Sparc::fixup_sparc_got13:
@@ -146,6 +147,7 @@
{ "fixup_sparc_l44", 20, 12, 0 },
{ "fixup_sparc_hh", 10, 22, 0 },
{ "fixup_sparc_hm", 22, 10, 0 },
+ { "fixup_sparc_lm", 10, 22, 0 },
{ "fixup_sparc_pc22", 10, 22, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_pc10", 22, 10, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_got22", 10, 22, 0 },
@@ -187,6 +189,7 @@
{ "fixup_sparc_l44", 0, 12, 0 },
{ "fixup_sparc_hh", 0, 22, 0 },
{ "fixup_sparc_hm", 0, 10, 0 },
+ { "fixup_sparc_lm", 0, 22, 0 },
{ "fixup_sparc_pc22", 0, 22, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_pc10", 0, 10, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_got22", 0, 22, 0 },
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index c97a30e..bc508b4 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -86,6 +86,7 @@
case Sparc::fixup_sparc_l44: return ELF::R_SPARC_L44;
case Sparc::fixup_sparc_hh: return ELF::R_SPARC_HH22;
case Sparc::fixup_sparc_hm: return ELF::R_SPARC_HM10;
+ case Sparc::fixup_sparc_lm: return ELF::R_SPARC_LM22;
case Sparc::fixup_sparc_got22: return ELF::R_SPARC_GOT22;
case Sparc::fixup_sparc_got10: return ELF::R_SPARC_GOT10;
case Sparc::fixup_sparc_got13: return ELF::R_SPARC_GOT13;
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h b/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
index b5fac02..e0a4309 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
+++ b/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
@@ -54,6 +54,9 @@
/// fixup_sparc_hm - 10-bit fixup corresponding to %hm(foo)
fixup_sparc_hm,
+ /// fixup_sparc_lm - 22-bit fixup corresponding to %lm(foo)
+ fixup_sparc_lm,
+
/// fixup_sparc_pc22 - 22-bit fixup corresponding to %pc22(foo)
fixup_sparc_pc22,
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index b84ecf0..c2db452 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -50,6 +50,7 @@
case VK_Sparc_L44: OS << "%l44("; return true;
case VK_Sparc_HH: OS << "%hh("; return true;
case VK_Sparc_HM: OS << "%hm("; return true;
+ case VK_Sparc_LM: OS << "%lm("; return true;
// FIXME: use %pc22/%pc10, if system assembler supports them.
case VK_Sparc_PC22: OS << "%hi("; return true;
case VK_Sparc_PC10: OS << "%lo("; return true;
@@ -93,6 +94,7 @@
.Case("l44", VK_Sparc_L44)
.Case("hh", VK_Sparc_HH)
.Case("hm", VK_Sparc_HM)
+ .Case("lm", VK_Sparc_LM)
.Case("pc22", VK_Sparc_PC22)
.Case("pc10", VK_Sparc_PC10)
.Case("got22", VK_Sparc_GOT22)
@@ -130,6 +132,7 @@
case VK_Sparc_L44: return Sparc::fixup_sparc_l44;
case VK_Sparc_HH: return Sparc::fixup_sparc_hh;
case VK_Sparc_HM: return Sparc::fixup_sparc_hm;
+ case VK_Sparc_LM: return Sparc::fixup_sparc_lm;
case VK_Sparc_PC22: return Sparc::fixup_sparc_pc22;
case VK_Sparc_PC10: return Sparc::fixup_sparc_pc10;
case VK_Sparc_GOT22: return Sparc::fixup_sparc_got22;
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index 7660353..504e959 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -31,6 +31,7 @@
VK_Sparc_L44,
VK_Sparc_HH,
VK_Sparc_HM,
+ VK_Sparc_LM,
VK_Sparc_PC22,
VK_Sparc_PC10,
VK_Sparc_GOT22,
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h b/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h
index 9f729a6..27976d1 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h
+++ b/src/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h
@@ -13,6 +13,9 @@
#include "llvm/MC/MCStreamer.h"
namespace llvm {
+
+class formatted_raw_ostream;
+
class SparcTargetStreamer : public MCTargetStreamer {
virtual void anchor();
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/src/llvm-project/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index ee0b852..2006c9b 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -303,6 +303,7 @@
assert((TF == SparcMCExpr::VK_Sparc_HI
|| TF == SparcMCExpr::VK_Sparc_H44
|| TF == SparcMCExpr::VK_Sparc_HH
+ || TF == SparcMCExpr::VK_Sparc_LM
|| TF == SparcMCExpr::VK_Sparc_TLS_GD_HI22
|| TF == SparcMCExpr::VK_Sparc_TLS_LDM_HI22
|| TF == SparcMCExpr::VK_Sparc_TLS_LDO_HIX22
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
index 63187fd..d165052 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -97,14 +97,9 @@
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
DebugLoc dl;
- bool NeedsStackRealignment = RegInfo.needsStackRealignment(MF);
+ bool NeedsStackRealignment = RegInfo.shouldRealignStack(MF);
- // FIXME: unfortunately, returning false from canRealignStack
- // actually just causes needsStackRealignment to return false,
- // rather than reporting an error, as would be sensible. This is
- // poor, but fixing that bogosity is going to be a large project.
- // For now, just see if it's lied, and report an error here.
- if (!NeedsStackRealignment && MFI.getMaxAlign() > getStackAlign())
+ if (NeedsStackRealignment && !RegInfo.canRealignStack(MF))
report_fatal_error("Function \"" + Twine(MF.getName()) + "\" required "
"stack re-alignment, but LLVM couldn't handle it "
"(probably because it has a dynamic alloca).");
@@ -252,9 +247,8 @@
const MachineFrameInfo &MFI = MF.getFrameInfo();
return MF.getTarget().Options.DisableFramePointerElim(MF) ||
- RegInfo->needsStackRealignment(MF) ||
- MFI.hasVarSizedObjects() ||
- MFI.isFrameAddressTaken();
+ RegInfo->hasStackRealignment(MF) || MFI.hasVarSizedObjects() ||
+ MFI.isFrameAddressTaken();
}
StackOffset
@@ -280,7 +274,7 @@
} else if (isFixed) {
// Otherwise, argument access should always use %fp.
UseFP = true;
- } else if (RegInfo->needsStackRealignment(MF)) {
+ } else if (RegInfo->hasStackRealignment(MF)) {
// If there is dynamic stack realignment, all local object
// references need to be via %sp, to take account of the
// re-alignment.
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/src/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index e5c7794..2007303 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1480,6 +1480,7 @@
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
@@ -1525,6 +1526,8 @@
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
setOperationAction(ISD::BITCAST, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i32, Expand);
@@ -2985,9 +2988,10 @@
}
static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
- if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
- // Expand with a fence.
- return SDValue();
+ if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering())) {
+ // Expand with a fence.
+ return SDValue();
+ }
// Monotonic load/stores are legal.
return Op;
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.h b/src/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.h
index c6d0011..5c97038 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -82,13 +82,6 @@
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
- unsigned
- getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
- if (ConstraintCode == "o")
- return InlineAsm::Constraint_o;
- return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
- }
-
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/SparcInstrAliases.td b/src/llvm-project/llvm/lib/Target/Sparc/SparcInstrAliases.td
index 4a0e8c8..1b3ec19 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/SparcInstrAliases.td
+++ b/src/llvm-project/llvm/lib/Target/Sparc/SparcInstrAliases.td
@@ -444,6 +444,7 @@
def : InstAlias<"mov %psr, $rd", (RDPSR IntRegs:$rd), 0>;
def : InstAlias<"mov %wim, $rd", (RDWIM IntRegs:$rd), 0>;
def : InstAlias<"mov %tbr, $rd", (RDTBR IntRegs:$rd), 0>;
+def : InstAlias<"mov %pc, $rd", (RDPC IntRegs:$rd), 0>;
// mov reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg
def : InstAlias<"mov $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>;
@@ -491,6 +492,7 @@
def : MnemonicAlias<"stuha", "stha">;
def : MnemonicAlias<"stsha", "stha">;
+def : MnemonicAlias<"stw", "st">, Requires<[HasV9]>;
def : MnemonicAlias<"lduw", "ld">, Requires<[HasV9]>;
def : MnemonicAlias<"lduwa", "lda">, Requires<[HasV9]>;
@@ -521,3 +523,6 @@
// signx reg, rd -> sra reg, %g0, rd
def : InstAlias<"signx $rs1, $rd", (SRArr IntRegs:$rd, IntRegs:$rs1, G0), 0>, Requires<[HasV9]>;
+
+// sir -> sir 0
+def : InstAlias<"sir", (SIR 0), 0>;
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/SparcInstrFormats.td b/src/llvm-project/llvm/lib/Target/Sparc/SparcInstrFormats.td
index da53307..259ce96 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/SparcInstrFormats.td
+++ b/src/llvm-project/llvm/lib/Target/Sparc/SparcInstrFormats.td
@@ -224,7 +224,7 @@
// Define rr and ri shift instructions with patterns.
multiclass F3_S<string OpcStr, bits<6> Op3Val, bit XVal, SDNode OpNode,
- ValueType VT, ValueType SIT, RegisterClass RC,
+ ValueType VT, Operand SIT, RegisterClass RC,
InstrItinClass itin = IIC_iu_instr> {
def rr : F3_Sr<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, IntRegs:$rs2),
!strconcat(OpcStr, " $rs1, $rs2, $rd"),
@@ -237,7 +237,7 @@
}
class F4<bits<6> op3, dag outs, dag ins, string asmstr, list<dag> pattern,
- InstrItinClass itin = NoItinerary>
+ InstrItinClass itin = NoItinerary>
: InstSP<outs, ins, asmstr, pattern, itin> {
bits<5> rd;
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/SparcInstrInfo.td b/src/llvm-project/llvm/lib/Target/Sparc/SparcInstrInfo.td
index d1190ae..b161e2a 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -1056,6 +1056,14 @@
"rd %tbr, $rd", []>;
}
+// PC doesn't exist on SparcV8, only on V9.
+let Predicates = [HasV9] in {
+ let rs2 = 0, rs1 = 5 in
+ def RDPC : F3_1<2, 0b101000,
+ (outs IntRegs:$rd), (ins),
+ "rd %pc, $rd", []>;
+}
+
// Section B.29 - Write State Register Instructions
def WRASRrr : F3_1<2, 0b110000,
(outs ASRRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
@@ -1534,6 +1542,11 @@
def MEMBARi : F3_2<2, 0b101000, (outs), (ins MembarTag:$simm13),
"membar $simm13", []>;
+let Predicates = [HasV9], rd = 15, rs1 = 0b00000 in
+ def SIR: F3_2<2, 0b110000, (outs),
+ (ins simm13Op:$simm13),
+ "sir $simm13", []>;
+
// The CAS instruction, unlike other instructions, only comes in a
// form which requires an ASI be provided. The ASI value hardcoded
// here is ASI_PRIMARY, the default unprivileged ASI for SparcV9.
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/src/llvm-project/llvm/lib/Target/Sparc/SparcRegisterInfo.td
index 8225bc2..9453efb 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/src/llvm-project/llvm/lib/Target/Sparc/SparcRegisterInfo.td
@@ -106,6 +106,8 @@
def PSR : SparcCtrlReg<0, "PSR">;
def WIM : SparcCtrlReg<0, "WIM">;
def TBR : SparcCtrlReg<0, "TBR">;
+// PC, on the other hand, is only available on SparcV9.
+def PC : SparcCtrlReg<5, "PC">;
def TPC : SparcCtrlReg<0, "TPC">;
def TNPC : SparcCtrlReg<1, "TNPC">;
diff --git a/src/llvm-project/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
index ae5228d..083339b 100644
--- a/src/llvm-project/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -117,9 +117,7 @@
// FIXME: This is related to the code below to reset the target options,
// we need to know whether or not the soft float flag is set on the
// function, so we can enable it as a subtarget feature.
- bool softFloat =
- F.hasFnAttribute("use-soft-float") &&
- F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ bool softFloat = F.getFnAttribute("use-soft-float").getValueAsBool();
if (softFloat)
FS += FS.empty() ? "+soft-float" : ",+soft-float";
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 2b815a3..0de2424 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -7,11 +7,13 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/SystemZInstPrinter.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "TargetInfo/SystemZTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -428,6 +430,42 @@
bool parseOperand(OperandVector &Operands, StringRef Mnemonic);
+ // Both the HLASM and ATT variants still rely on the basic GNU asm
+ // format with respect to inputs, clobbers, outputs, etc.
+ //
+ // However, calling the overridden getAssemblerDialect() method in
+ // AsmParser is problematic. It returns the AssemblerDialect field in the
+ // MCAsmInfo instance if the AssemblerDialect field in AsmParser is unset;
+ // otherwise it returns the private AssemblerDialect field in AsmParser.
+ //
+ // This is a problem because we forcibly set the inline asm dialect in the
+ // AsmParser instance in AsmPrinterInlineAsm.cpp, so any query through the
+ // overridden getAssemblerDialect function in AsmParser.cpp will not return
+ // the assembler dialect set in the respective MCAsmInfo instance.
+ //
+ // For this reason, we explicitly query the SystemZMCAsmInfo instance here
+ // to get the "correct" assembler dialect, and use it in the various
+ // functions below.
+ unsigned getMAIAssemblerDialect() {
+ return Parser.getContext().getAsmInfo()->getAssemblerDialect();
+ }
+
+ // An alphabetic character in HLASM is a letter from 'A' through 'Z',
+ // or from 'a' through 'z', or '$', '_','#', or '@'.
+ inline bool isHLASMAlpha(char C) {
+ return isAlpha(C) || llvm::is_contained("_@#$", C);
+ }
+
+ // An alphanumeric character in HLASM is an alphabetic character or a digit
+ // from 0 through 9.
+ inline bool isHLASMAlnum(char C) { return isHLASMAlpha(C) || isDigit(C); }
+
+ // Are we parsing using the AD_HLASM dialect?
+ inline bool isParsingHLASM() { return getMAIAssemblerDialect() == AD_HLASM; }
+
+ // Are we parsing using the AD_ATT dialect?
+ inline bool isParsingATT() { return getMAIAssemblerDialect() == AD_ATT; }
+
public:
SystemZAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
const MCInstrInfo &MII,
@@ -455,6 +493,7 @@
OperandVector &Operands, MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) override;
+ bool isLabel(AsmToken &Token) override;
// Used by the TableGen code to parse particular operand types.
OperandMatchResultTy parseGR32(OperandVector &Operands) {
@@ -791,7 +830,7 @@
}
// Handle register names of the form %<prefix><number>
- if (Parser.getTok().is(AsmToken::Percent)) {
+ if (isParsingATT() && Parser.getTok().is(AsmToken::Percent)) {
if (parseRegister(Reg))
return MatchOperand_ParseFail;
@@ -873,6 +912,9 @@
Operands.push_back(SystemZOperand::createImm(Register, StartLoc, EndLoc));
}
else {
+ if (isParsingHLASM())
+ return MatchOperand_NoMatch;
+
Register Reg;
if (parseRegister(Reg))
return MatchOperand_ParseFail;
@@ -980,7 +1022,7 @@
if (getLexer().is(AsmToken::LParen)) {
Parser.Lex();
- if (getLexer().is(AsmToken::Percent)) {
+ if (isParsingATT() && getLexer().is(AsmToken::Percent)) {
// Parse the first register.
HaveReg1 = true;
if (parseRegister(Reg1))
@@ -1023,7 +1065,7 @@
if (parseIntegerRegister(Reg2, RegGR))
return true;
} else {
- if (parseRegister(Reg2))
+ if (isParsingATT() && parseRegister(Reg2))
return true;
}
}
@@ -1325,7 +1367,7 @@
// Apply mnemonic aliases first, before doing anything else, in
// case the target uses it.
- applyMnemonicAliases(Name, getAvailableFeatures(), 0 /*VariantID*/);
+ applyMnemonicAliases(Name, getAvailableFeatures(), getMAIAssemblerDialect());
Operands.push_back(SystemZOperand::createToken(Name, NameLoc));
@@ -1339,10 +1381,38 @@
// Read any subsequent operands.
while (getLexer().is(AsmToken::Comma)) {
Parser.Lex();
+
+ if (isParsingHLASM() && getLexer().is(AsmToken::Space))
+ return Error(
+ Parser.getTok().getLoc(),
+ "No space allowed between comma that separates operand entries");
+
if (parseOperand(Operands, Name)) {
return true;
}
}
+
+ // Under the HLASM variant, we could have a remark field.
+ // The remark field occurs after the operation entries and is
+ // separated from them by a space.
+ if (isParsingHLASM() && getTok().is(AsmToken::Space)) {
+ // We've confirmed that there is a Remark field.
+ StringRef Remark(getLexer().LexUntilEndOfStatement());
+ Parser.Lex();
+
+ // If there is nothing after the space, then there is nothing to emit.
+ // We could have a situation like this:
+ // " \n"
+ // After the lexing above, we will be left with:
+ // "\n"
+ // This isn't an explicit remark field, so we don't have to output it
+ // as a comment.
+ if (Remark.size())
+ // Output the entire Remarks Field as a comment
+ getStreamer().AddComment(Remark);
+ }
+
if (getLexer().isNot(AsmToken::EndOfStatement)) {
SMLoc Loc = getLexer().getLoc();
return Error(Loc, "unexpected token in argument list");
@@ -1380,7 +1450,7 @@
// a context-dependent parse routine, which gives the required register
// class. The code is here to mop up other cases, like those where
// the instruction isn't recognized.
- if (Parser.getTok().is(AsmToken::Percent)) {
+ if (isParsingATT() && Parser.getTok().is(AsmToken::Percent)) {
Register Reg;
if (parseRegister(Reg))
return true;
@@ -1428,9 +1498,11 @@
MCInst Inst;
unsigned MatchResult;
+ unsigned Dialect = getMAIAssemblerDialect();
+
FeatureBitset MissingFeatures;
- MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
- MissingFeatures, MatchingInlineAsm);
+ MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,
+ MatchingInlineAsm, Dialect);
switch (MatchResult) {
case Match_Success:
Inst.setLoc(IDLoc);
@@ -1467,7 +1539,7 @@
case Match_MnemonicFail: {
FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
std::string Suggestion = SystemZMnemonicSpellCheck(
- ((SystemZOperand &)*Operands[0]).getToken(), FBS);
+ ((SystemZOperand &)*Operands[0]).getToken(), FBS, Dialect);
return Error(IDLoc, "invalid instruction" + Suggestion,
((SystemZOperand &)*Operands[0]).getLocRange());
}
@@ -1498,6 +1570,10 @@
// For consistency with the GNU assembler, treat immediates as offsets
// from ".".
if (auto *CE = dyn_cast<MCConstantExpr>(Expr)) {
+ if (isParsingHLASM()) {
+ Error(StartLoc, "Expected PC-relative expression");
+ return MatchOperand_ParseFail;
+ }
if (isOutOfRangeConstant(CE)) {
Error(StartLoc, "offset out of range");
return MatchOperand_ParseFail;
@@ -1570,6 +1646,47 @@
return MatchOperand_Success;
}
+bool SystemZAsmParser::isLabel(AsmToken &Token) {
+ if (isParsingATT())
+ return true;
+
+ // HLASM labels are ordinary symbols.
+ // An HLASM label always starts at column 1.
+ // The syntax of an ordinary symbol is as follows:
+ // 1. It has to start with an "alphabetic character", which can be followed
+ // by up to 62 alphanumeric characters. An "alphabetic character", in this
+ // scenario, is a letter from 'A' through 'Z' or from 'a' through 'z',
+ // or '$', '_', '#', or '@'.
+ // 2. Labels are case-insensitive. E.g. "lab123", "LAB123", "lAb123", etc.
+ // are all treated as the same symbol. However, the case folding itself is
+ // not done in this function.
+ StringRef RawLabel = Token.getString();
+ SMLoc Loc = Token.getLoc();
+
+ // An HLASM label cannot be empty.
+ if (!RawLabel.size())
+ return !Error(Loc, "HLASM Label cannot be empty");
+
+ // An HLASM label cannot exceed 63 characters.
+ if (RawLabel.size() > 63)
+ return !Error(Loc, "Maximum length for HLASM Label is 63 characters");
+
+ // A label must start with an "alphabetic character".
+ if (!isHLASMAlpha(RawLabel[0]))
+ return !Error(Loc, "HLASM Label has to start with an alphabetic "
+ "character or the underscore character");
+
+ // Now that we've established that the length is valid and the first
+ // character is alphabetic, check whether the rest of the string is
+ // alphanumeric.
+ for (unsigned I = 1; I < RawLabel.size(); ++I)
+ if (!isHLASMAlnum(RawLabel[I]))
+ return !Error(Loc, "HLASM Label has to be alphanumeric");
+
+ return true;
+}
+
// Force static initialization.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZAsmParser() {
RegisterMCAsmParser<SystemZAsmParser> X(getTheSystemZTarget());
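
As a standalone illustration of the HLASM ordinary-symbol rules documented in isLabel above (alphabetic start, alphanumeric rest, at most 63 characters), separate from the parser implementation itself:

#include <cctype>
#include <cstdio>
#include <cstring>

// '$', '_', '#', and '@' count as alphabetic characters in HLASM.
static bool isHLASMAlpha(char C) {
  return std::isalpha(static_cast<unsigned char>(C)) ||
         std::strchr("_@#$", C) != nullptr;
}

static bool isHLASMAlnum(char C) {
  return isHLASMAlpha(C) || std::isdigit(static_cast<unsigned char>(C));
}

static bool isValidHLASMLabel(const char *Label) {
  size_t Len = std::strlen(Label);
  if (Len == 0 || Len > 63)
    return false;
  if (!isHLASMAlpha(Label[0]))
    return false;
  for (size_t I = 1; I < Len; ++I)
    if (!isHLASMAlnum(Label[I]))
      return false;
  return true;
}

int main() {
  std::printf("%d %d %d\n", isValidHLASMLabel("LAB123"),
              isValidHLASMLabel("#tag@1"),
              isValidHLASMLabel("1bad")); // prints "1 1 0"
}
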
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp
index fac363c..f3f3f09 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp
@@ -23,18 +23,19 @@
#include "SystemZGenAsmWriter.inc"
-void SystemZInstPrinter::printAddress(unsigned Base, int64_t Disp,
- unsigned Index, raw_ostream &O) {
+void SystemZInstPrinter::printAddress(const MCAsmInfo *MAI, unsigned Base,
+ int64_t Disp, unsigned Index,
+ raw_ostream &O) {
O << Disp;
if (Base || Index) {
O << '(';
if (Index) {
- O << '%' << getRegisterName(Index);
+ printFormattedRegName(MAI, Index, O);
if (Base)
O << ',';
}
if (Base)
- O << '%' << getRegisterName(Base);
+ printFormattedRegName(MAI, Base, O);
O << ')';
}
}
@@ -45,7 +46,7 @@
if (!MO.getReg())
O << '0';
else
- O << '%' << getRegisterName(MO.getReg());
+ printFormattedRegName(MAI, MO.getReg(), O);
}
else if (MO.isImm())
O << MO.getImm();
@@ -55,6 +56,17 @@
llvm_unreachable("Invalid operand");
}
+void SystemZInstPrinter::printFormattedRegName(const MCAsmInfo *MAI,
+ unsigned RegNo, raw_ostream &O) {
+ const char *RegName = getRegisterName(RegNo);
+ if (MAI->getAssemblerDialect() == AD_HLASM) {
+ // Skip the register prefix so that only the register number is left.
+ assert(isalpha(RegName[0]) && isdigit(RegName[1]));
+ O << (RegName + 1);
+ } else
+ O << '%' << RegName;
+}
+
void SystemZInstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annot, const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -62,10 +74,6 @@
printAnnotation(O, Annot);
}
-void SystemZInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
- O << '%' << getRegisterName(RegNo);
-}
-
template <unsigned N>
static void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) {
int64_t Value = MI->getOperand(OpNum).getImm();
@@ -186,13 +194,13 @@
void SystemZInstPrinter::printBDAddrOperand(const MCInst *MI, int OpNum,
raw_ostream &O) {
- printAddress(MI->getOperand(OpNum).getReg(),
+ printAddress(&MAI, MI->getOperand(OpNum).getReg(),
MI->getOperand(OpNum + 1).getImm(), 0, O);
}
void SystemZInstPrinter::printBDXAddrOperand(const MCInst *MI, int OpNum,
raw_ostream &O) {
- printAddress(MI->getOperand(OpNum).getReg(),
+ printAddress(&MAI, MI->getOperand(OpNum).getReg(),
MI->getOperand(OpNum + 1).getImm(),
MI->getOperand(OpNum + 2).getReg(), O);
}
@@ -203,8 +211,10 @@
uint64_t Disp = MI->getOperand(OpNum + 1).getImm();
uint64_t Length = MI->getOperand(OpNum + 2).getImm();
O << Disp << '(' << Length;
- if (Base)
- O << ",%" << getRegisterName(Base);
+ if (Base) {
+ O << ",";
+ printRegName(O, Base);
+ }
O << ')';
}
@@ -213,15 +223,18 @@
unsigned Base = MI->getOperand(OpNum).getReg();
uint64_t Disp = MI->getOperand(OpNum + 1).getImm();
unsigned Length = MI->getOperand(OpNum + 2).getReg();
- O << Disp << "(%" << getRegisterName(Length);
- if (Base)
- O << ",%" << getRegisterName(Base);
+ O << Disp << "(";
+ printRegName(O, Length);
+ if (Base) {
+ O << ",";
+ printRegName(O, Base);
+ }
O << ')';
}
void SystemZInstPrinter::printBDVAddrOperand(const MCInst *MI, int OpNum,
raw_ostream &O) {
- printAddress(MI->getOperand(OpNum).getReg(),
+ printAddress(&MAI, MI->getOperand(OpNum).getReg(),
MI->getOperand(OpNum + 1).getImm(),
MI->getOperand(OpNum + 2).getReg(), O);
}
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
index 0db7279..0a57ca0 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZINSTPRINTER_H
#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZINSTPRINTER_H
+#include "SystemZMCAsmInfo.h"
#include "llvm/MC/MCInstPrinter.h"
#include <cstdint>
@@ -32,15 +33,21 @@
static const char *getRegisterName(unsigned RegNo);
// Print an address with the given base, displacement and index.
- static void printAddress(unsigned Base, int64_t Disp, unsigned Index,
- raw_ostream &O);
+ static void printAddress(const MCAsmInfo *MAI, unsigned Base, int64_t Disp,
+ unsigned Index, raw_ostream &O);
// Print the given operand.
static void printOperand(const MCOperand &MO, const MCAsmInfo *MAI,
raw_ostream &O);
+ static void printFormattedRegName(const MCAsmInfo *MAI, unsigned RegNo,
+ raw_ostream &O);
+
// Override MCInstPrinter.
- void printRegName(raw_ostream &O, unsigned RegNo) const override;
+ inline void printRegName(raw_ostream &O, unsigned RegNo) const override {
+ printFormattedRegName(&MAI, RegNo, O);
+ }
+
void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
const MCSubtargetInfo &STI, raw_ostream &O) override;
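
The register-name formatting split above boils down to one difference: the ATT variant prints "%r1" while the HLASM variant strips the type prefix and prints just the number. A minimal sketch of that choice (Dialect and printReg are illustrative, not the printer's API):

#include <cstdio>

enum Dialect { ATT, HLASM };

static void printReg(Dialect D, const char *RegName) {
  if (D == HLASM)
    std::printf("%s", RegName + 1); // drop the leading letter, keep the number
  else
    std::printf("%%%s", RegName);
}

int main() {
  printReg(ATT, "r1"); // prints %r1
  std::printf(" ");
  printReg(HLASM, "r1"); // prints 1
  std::printf("\n");
}
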
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 5f276f7..134c85e 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -8,6 +8,7 @@
#include "MCTargetDesc/SystemZMCFixups.h"
#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
@@ -49,7 +50,10 @@
unsigned getNumFixupKinds() const override {
return SystemZ::NumTargetFixupKinds;
}
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
uint64_t Value, bool IsResolved,
@@ -67,6 +71,22 @@
};
} // end anonymous namespace
+Optional<MCFixupKind> SystemZMCAsmBackend::getFixupKind(StringRef Name) const {
+ unsigned Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/SystemZ.def"
+#undef ELF_RELOC
+ .Case("BFD_RELOC_NONE", ELF::R_390_NONE)
+ .Case("BFD_RELOC_8", ELF::R_390_8)
+ .Case("BFD_RELOC_16", ELF::R_390_16)
+ .Case("BFD_RELOC_32", ELF::R_390_32)
+ .Case("BFD_RELOC_64", ELF::R_390_64)
+ .Default(-1u);
+ if (Type != -1u)
+ return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type);
+ return None;
+}
+
const MCFixupKindInfo &
SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
const static MCFixupKindInfo Infos[SystemZ::NumTargetFixupKinds] = {
@@ -77,6 +97,11 @@
{ "FK_390_TLS_CALL", 0, 0, 0 }
};
+ // Fixup kinds from the .reloc directive are like R_390_NONE. They
+ // do not require any extra processing.
+ if (Kind >= FirstLiteralRelocationKind)
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -85,6 +110,12 @@
return Infos[Kind - FirstTargetFixupKind];
}
+bool SystemZMCAsmBackend::shouldForceRelocation(const MCAssembler &,
+ const MCFixup &Fixup,
+ const MCValue &) {
+ return Fixup.getKind() >= FirstLiteralRelocationKind;
+}
+
void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target,
@@ -92,6 +123,8 @@
bool IsResolved,
const MCSubtargetInfo *STI) const {
MCFixupKind Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return;
unsigned Offset = Fixup.getOffset();
unsigned BitSize = getFixupKindInfo(Kind).TargetSize;
unsigned Size = (BitSize + 7) / 8;
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index 76df8cf..fa48642 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -17,12 +17,32 @@
CalleeSaveStackSlotSize = 8;
IsLittleEndian = false;
+ AssemblerDialect = TT.isOSzOS() ? AD_HLASM : AD_ATT;
+
MaxInstLength = 6;
- CommentString = "#";
+ CommentString = AssemblerDialect == AD_HLASM ? "*" : "#";
+ RestrictCommentStringToStartOfStatement = (AssemblerDialect == AD_HLASM);
+ AllowAdditionalComments = (AssemblerDialect == AD_ATT);
+ AllowAtAtStartOfIdentifier = (AssemblerDialect == AD_HLASM);
+ AllowDollarAtStartOfIdentifier = (AssemblerDialect == AD_HLASM);
+ AllowHashAtStartOfIdentifier = (AssemblerDialect == AD_HLASM);
+ DotIsPC = (AssemblerDialect == AD_ATT);
+ StarIsPC = (AssemblerDialect == AD_HLASM);
+ EmitGNUAsmStartIndentationMarker = (AssemblerDialect == AD_ATT);
+ AllowAtInName = (AssemblerDialect == AD_HLASM);
+ EmitLabelsInUpperCase = (AssemblerDialect == AD_HLASM);
+
ZeroDirective = "\t.space\t";
Data64bitsDirective = "\t.quad\t";
UsesELFSectionDirectiveForBSS = true;
SupportsDebugInformation = true;
ExceptionsType = ExceptionHandling::DwarfCFI;
}
+
+bool SystemZMCAsmInfo::isAcceptableChar(char C) const {
+ if (AssemblerDialect == AD_ATT)
+ return MCAsmInfo::isAcceptableChar(C);
+
+ return MCAsmInfo::isAcceptableChar(C) || C == '#';
+}
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index b8818a6..389575d 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -14,10 +14,12 @@
namespace llvm {
class Triple;
+enum SystemZAsmDialect { AD_ATT = 0, AD_HLASM = 1 };
class SystemZMCAsmInfo : public MCAsmInfoELF {
public:
explicit SystemZMCAsmInfo(const Triple &TT);
+ bool isAcceptableChar(char C) const override;
};
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index 49b6fc4..0b3e7b1 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -117,8 +117,10 @@
const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
- MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
unsigned Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return Kind - FirstLiteralRelocationKind;
+ MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
switch (Modifier) {
case MCSymbolRefExpr::VK_None:
if (IsPCRel)
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 5c191d1..2a53dda 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -152,7 +152,7 @@
MCAsmInfo *MAI = new SystemZMCAsmInfo(TT);
MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(
nullptr, MRI.getDwarfRegNum(SystemZ::R15D, true),
- SystemZMC::CFAOffsetFromInitialSP);
+ SystemZMC::ELFCFAOffsetFromInitialSP);
MAI->addInitialFrameState(Inst);
return MAI;
}
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index 8f720c5..899fec6 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -32,10 +32,10 @@
namespace SystemZMC {
// How many bytes are in the ABI-defined, caller-allocated part of
// a stack frame.
-const int64_t CallFrameSize = 160;
+const int64_t ELFCallFrameSize = 160;
// The offset of the DWARF CFA from the incoming stack pointer.
-const int64_t CFAOffsetFromInitialSP = CallFrameSize;
+const int64_t ELFCFAOffsetFromInitialSP = ELFCallFrameSize;
// Maps of asm register numbers to LLVM register numbers, with 0 indicating
// an invalid register. In principle we could use 32-bit and 64-bit register
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZ.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZ.td
index ebbc6ff..e18deed 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZ.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZ.td
@@ -67,6 +67,20 @@
let ShouldEmitMatchRegisterName = 0;
}
+def ATTAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+
+ // Variant name.
+ string Name = "att";
+}
+
+def HLASMAsmParserVariant : AsmParserVariant {
+ int Variant = 1;
+
+ // Variant name.
+ string Name = "hlasm";
+}
+
//===----------------------------------------------------------------------===//
// Top-level target declaration
//===----------------------------------------------------------------------===//
@@ -74,5 +88,6 @@
def SystemZ : Target {
let InstructionSet = SystemZInstrInfo;
let AssemblyParsers = [SystemZAsmParser];
+ let AssemblyParserVariants = [ATTAsmParserVariant, HLASMAsmParserVariant];
let AllowRegisterRenaming = 1;
}
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 584737e..46ccd21 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -126,10 +126,15 @@
void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
SystemZMCInstLower Lower(MF->getContext(), *this);
+ const SystemZSubtarget *Subtarget = &MF->getSubtarget<SystemZSubtarget>();
MCInst LoweredMI;
switch (MI->getOpcode()) {
case SystemZ::Return:
- LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R14D);
+ if (Subtarget->isTargetXPLINK64())
+ LoweredMI =
+ MCInstBuilder(SystemZ::B).addReg(SystemZ::R7D).addImm(2).addReg(0);
+ else
+ LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R14D);
break;
case SystemZ::CondReturn:
@@ -211,6 +216,26 @@
.addImm(0);
break;
+ case SystemZ::CallBRASL_XPLINK64:
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(SystemZ::BRASL)
+ .addReg(SystemZ::R7D)
+ .addExpr(Lower.getExpr(MI->getOperand(0),
+ MCSymbolRefExpr::VK_PLT)));
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(SystemZ::BCRAsm).addImm(0).addReg(SystemZ::R3D));
+ return;
+
+ case SystemZ::CallBASR_XPLINK64:
+ EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BASR)
+ .addReg(SystemZ::R7D)
+ .addReg(MI->getOperand(0).getReg()));
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(SystemZ::BCRAsm).addImm(0).addReg(SystemZ::R0D));
+ return;
+
case SystemZ::CallBRASL:
LoweredMI = MCInstBuilder(SystemZ::BRASL)
.addReg(SystemZ::R14D)
@@ -516,6 +541,30 @@
LowerPATCHPOINT(*MI, Lower);
return;
+ case SystemZ::EXRL_Pseudo: {
+ unsigned TargetInsOpc = MI->getOperand(0).getImm();
+ Register LenMinus1Reg = MI->getOperand(1).getReg();
+ Register DestReg = MI->getOperand(2).getReg();
+ int64_t DestDisp = MI->getOperand(3).getImm();
+ Register SrcReg = MI->getOperand(4).getReg();
+ int64_t SrcDisp = MI->getOperand(5).getImm();
+
+ MCSymbol *DotSym = nullptr;
+ MCInst ET = MCInstBuilder(TargetInsOpc).addReg(DestReg)
+ .addImm(DestDisp).addImm(1).addReg(SrcReg).addImm(SrcDisp);
+ MCInstSTIPair ET_STI(ET, &MF->getSubtarget());
+ EXRLT2SymMap::iterator I = EXRLTargets2Sym.find(ET_STI);
+ if (I != EXRLTargets2Sym.end())
+ DotSym = I->second;
+ else
+ EXRLTargets2Sym[ET_STI] = DotSym = OutContext.createTempSymbol();
+ const MCSymbolRefExpr *Dot = MCSymbolRefExpr::create(DotSym, OutContext);
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(SystemZ::EXRL).addReg(LenMinus1Reg).addExpr(Dot));
+ return;
+ }
+
default:
Lower.lower(MI, LoweredMI);
break;
@@ -673,6 +722,19 @@
getSubtargetInfo());
}
+void SystemZAsmPrinter::emitEXRLTargetInstructions() {
+ if (EXRLTargets2Sym.empty())
+ return;
+ // Switch to the .text section.
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+ for (auto &I : EXRLTargets2Sym) {
+ OutStreamer->emitLabel(I.second);
+ const MCInstSTIPair &MCI_STI = I.first;
+ OutStreamer->emitInstruction(MCI_STI.first, *MCI_STI.second);
+ }
+ EXRLTargets2Sym.clear();
+}
+
// Convert a SystemZ-specific constant pool modifier into the associated
// MCSymbolRefExpr variant kind.
static MCSymbolRefExpr::VariantKind
@@ -702,11 +764,21 @@
bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode,
raw_ostream &OS) {
- if (ExtraCode)
- return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS);
- SystemZMCInstLower Lower(MF->getContext(), *this);
- MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo)));
- SystemZInstPrinter::printOperand(MO, MAI, OS);
+ const MCRegisterInfo &MRI = *TM.getMCRegisterInfo();
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ MCOperand MCOp;
+ if (ExtraCode) {
+ if (ExtraCode[0] == 'N' && !ExtraCode[1] && MO.isReg() &&
+ SystemZ::GR128BitRegClass.contains(MO.getReg()))
+ MCOp =
+ MCOperand::createReg(MRI.getSubReg(MO.getReg(), SystemZ::subreg_l64));
+ else
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS);
+ } else {
+ SystemZMCInstLower Lower(MF->getContext(), *this);
+ MCOp = Lower.lowerOperand(MO);
+ }
+ SystemZInstPrinter::printOperand(MCOp, MAI, OS);
return false;
}
@@ -714,13 +786,14 @@
unsigned OpNo,
const char *ExtraCode,
raw_ostream &OS) {
- SystemZInstPrinter::printAddress(MI->getOperand(OpNo).getReg(),
+ SystemZInstPrinter::printAddress(MAI, MI->getOperand(OpNo).getReg(),
MI->getOperand(OpNo + 1).getImm(),
MI->getOperand(OpNo + 2).getReg(), OS);
return false;
}
void SystemZAsmPrinter::emitEndOfAsmFile(Module &M) {
+ emitEXRLTargetInstructions();
emitStackMaps(SM);
}
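
The EXRL_Pseudo expansion above follows an intern-and-flush pattern: the first use of a given target instruction creates a temporary label, later identical targets reuse it, and emitEndOfAsmFile() flushes every deduplicated target into the .text section via emitEXRLTargetInstructions(). A minimal standalone sketch of that pattern, using a string key and textual output as stand-ins for the (MCInst, MCSubtargetInfo*) key and the MCStreamer used in the patch:

#include <iostream>
#include <map>
#include <string>

// Illustrative pool: maps a target instruction to the label of its single
// out-of-line copy, and emits all copies once at end of file.
class ExrlTargetPool {
  std::map<std::string, std::string> Targets2Label;
  unsigned NextLabelId = 0;

public:
  // Return the label for this target instruction, creating it on first use.
  std::string getOrCreateLabel(const std::string &TargetInsn) {
    auto It = Targets2Label.find(TargetInsn);
    if (It != Targets2Label.end())
      return It->second;
    std::string Label = ".LEXRL" + std::to_string(NextLabelId++);
    Targets2Label.emplace(TargetInsn, Label);
    return Label;
  }

  // Called once at end of file: emit every deduplicated target instruction.
  void flush() {
    for (const auto &Entry : Targets2Label)
      std::cout << Entry.second << ":\n\t" << Entry.first << "\n";
    Targets2Label.clear();
  }
};

int main() {
  ExrlTargetPool Pool;
  // Two EXRLs with the same target end up sharing one out-of-line copy.
  std::cout << "\texrl %r0, " << Pool.getOrCreateLabel("mvc 0(1,%r4),0(%r5)") << "\n";
  std::cout << "\texrl %r1, " << Pool.getOrCreateLabel("mvc 0(1,%r4),0(%r5)") << "\n";
  Pool.flush();
}
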
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
index 2d7562c..11b7311 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -9,10 +9,11 @@
#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
-#include "SystemZTargetMachine.h"
#include "SystemZMCInstLower.h"
+#include "SystemZTargetMachine.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Compiler.h"
namespace llvm {
@@ -26,6 +27,33 @@
private:
StackMaps SM;
+ typedef std::pair<MCInst, const MCSubtargetInfo *> MCInstSTIPair;
+ struct CmpMCInst {
+ bool operator()(const MCInstSTIPair &MCI_STI_A,
+ const MCInstSTIPair &MCI_STI_B) const {
+ if (MCI_STI_A.second != MCI_STI_B.second)
+ return uintptr_t(MCI_STI_A.second) < uintptr_t(MCI_STI_B.second);
+ const MCInst &A = MCI_STI_A.first;
+ const MCInst &B = MCI_STI_B.first;
+ assert(A.getNumOperands() == B.getNumOperands() &&
+ A.getNumOperands() == 5 && A.getOperand(2).getImm() == 1 &&
+ B.getOperand(2).getImm() == 1 && "Unexpected EXRL target MCInst");
+ if (A.getOpcode() != B.getOpcode())
+ return A.getOpcode() < B.getOpcode();
+ if (A.getOperand(0).getReg() != B.getOperand(0).getReg())
+ return A.getOperand(0).getReg() < B.getOperand(0).getReg();
+ if (A.getOperand(1).getImm() != B.getOperand(1).getImm())
+ return A.getOperand(1).getImm() < B.getOperand(1).getImm();
+ if (A.getOperand(3).getReg() != B.getOperand(3).getReg())
+ return A.getOperand(3).getReg() < B.getOperand(3).getReg();
+ if (A.getOperand(4).getImm() != B.getOperand(4).getImm())
+ return A.getOperand(4).getImm() < B.getOperand(4).getImm();
+ return false;
+ }
+ };
+ typedef std::map<MCInstSTIPair, MCSymbol *, CmpMCInst> EXRLT2SymMap;
+ EXRLT2SymMap EXRLTargets2Sym;
+
public:
SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)), SM(*this) {}
@@ -49,6 +77,7 @@
void LowerFENTRY_CALL(const MachineInstr &MI, SystemZMCInstLower &MCIL);
void LowerSTACKMAP(const MachineInstr &MI);
void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower);
+ void emitEXRLTargetInstructions();
};
} // end namespace llvm
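
CmpMCInst above is a hand-written lexicographic strict weak ordering over the subtarget pointer, the opcode and the varying operands, which is what std::map needs to deduplicate EXRL targets. The same ordering can be sketched with std::tie over a simplified record; the field names here are illustrative, not LLVM's MCInst API:

#include <cstdint>
#include <map>
#include <tuple>

// Simplified stand-in for the (MCInst, MCSubtargetInfo *) key: the opcode plus
// the four varying operands of an EXRL target, and the owning subtarget
// (compared as an integer, as the patch does via uintptr_t).
struct ExrlKey {
  std::uintptr_t Subtarget;
  unsigned Opcode;
  unsigned DestReg;
  std::int64_t DestDisp;
  unsigned SrcReg;
  std::int64_t SrcDisp;
};

struct CmpExrlKey {
  bool operator()(const ExrlKey &A, const ExrlKey &B) const {
    // std::tie yields the same lexicographic strict weak ordering as the
    // chain of if/return comparisons in CmpMCInst.
    return std::tie(A.Subtarget, A.Opcode, A.DestReg, A.DestDisp, A.SrcReg,
                    A.SrcDisp) <
           std::tie(B.Subtarget, B.Opcode, B.DestReg, B.DestDisp, B.SrcReg,
                    B.SrcDisp);
  }
};

using ExrlMap = std::map<ExrlKey, int, CmpExrlKey>;

int main() {
  ExrlMap Map;
  ExrlKey K{0x1000, 42, 2, 0, 3, 0};
  Map[K] = 1;                        // first use creates the entry
  return Map.count(K) == 1 ? 0 : 1;  // an identical key finds the same entry
}
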
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
index 91c7fae..86eb836 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
@@ -11,10 +11,20 @@
using namespace llvm;
-const MCPhysReg SystemZ::ArgGPRs[SystemZ::NumArgGPRs] = {
+const MCPhysReg SystemZ::ELFArgGPRs[SystemZ::ELFNumArgGPRs] = {
SystemZ::R2D, SystemZ::R3D, SystemZ::R4D, SystemZ::R5D, SystemZ::R6D
};
-const MCPhysReg SystemZ::ArgFPRs[SystemZ::NumArgFPRs] = {
+const MCPhysReg SystemZ::ELFArgFPRs[SystemZ::ELFNumArgFPRs] = {
SystemZ::F0D, SystemZ::F2D, SystemZ::F4D, SystemZ::F6D
};
+
+// The XPLINK64 ABI-defined param passing general purpose registers
+const MCPhysReg SystemZ::XPLINK64ArgGPRs[SystemZ::XPLINK64NumArgGPRs] = {
+ SystemZ::R1D, SystemZ::R2D, SystemZ::R3D
+};
+
+// The XPLINK64 ABI-defined param passing floating point registers
+const MCPhysReg SystemZ::XPLINK64ArgFPRs[SystemZ::XPLINK64NumArgFPRs] = {
+ SystemZ::F0D, SystemZ::F2D, SystemZ::F4D, SystemZ::F6D
+};
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZCallingConv.h
index d4c7ce0..96c1080 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -9,17 +9,24 @@
#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H
+#include "SystemZSubtarget.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/MC/MCRegisterInfo.h"
namespace llvm {
namespace SystemZ {
- const unsigned NumArgGPRs = 5;
- extern const MCPhysReg ArgGPRs[NumArgGPRs];
+ const unsigned ELFNumArgGPRs = 5;
+ extern const MCPhysReg ELFArgGPRs[ELFNumArgGPRs];
- const unsigned NumArgFPRs = 4;
- extern const MCPhysReg ArgFPRs[NumArgFPRs];
+ const unsigned ELFNumArgFPRs = 4;
+ extern const MCPhysReg ELFArgFPRs[ELFNumArgFPRs];
+
+ const unsigned XPLINK64NumArgGPRs = 3;
+ extern const MCPhysReg XPLINK64ArgGPRs[XPLINK64NumArgGPRs];
+
+ const unsigned XPLINK64NumArgFPRs = 4;
+ extern const MCPhysReg XPLINK64ArgFPRs[XPLINK64NumArgFPRs];
} // end namespace SystemZ
class SystemZCCState : public CCState {
@@ -107,7 +114,16 @@
// OK, we've collected all parts in the pending list. Allocate
// the location (register or stack slot) for the indirect pointer.
// (This duplicates the usual i64 calling convention rules.)
- unsigned Reg = State.AllocateReg(SystemZ::ArgGPRs);
+ unsigned Reg;
+ const SystemZSubtarget &Subtarget =
+ State.getMachineFunction().getSubtarget<SystemZSubtarget>();
+ if (Subtarget.isTargetELF())
+ Reg = State.AllocateReg(SystemZ::ELFArgGPRs);
+ else if (Subtarget.isTargetXPLINK64())
+ Reg = State.AllocateReg(SystemZ::XPLINK64ArgGPRs);
+ else
+ llvm_unreachable("Unknown Calling Convention!");
+
unsigned Offset = Reg ? 0 : State.AllocateStack(8, Align(8));
// Use that same location for all the pending parts.
@@ -124,6 +140,80 @@
return true;
}
+inline bool CC_XPLINK64_Shadow_Reg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ if (LocVT == MVT::f32 || LocVT == MVT::f64) {
+ State.AllocateReg(SystemZ::XPLINK64ArgGPRs);
+ }
+ if (LocVT == MVT::f128 || LocVT.is128BitVector()) {
+ // Shadow next two GPRs, if available.
+ State.AllocateReg(SystemZ::XPLINK64ArgGPRs);
+ State.AllocateReg(SystemZ::XPLINK64ArgGPRs);
+
+ // Quad-precision floating point needs to
+ // go in a predefined FPR pair.
+ if (LocVT == MVT::f128) {
+ for (unsigned I = 0; I < SystemZ::XPLINK64NumArgFPRs; I += 2)
+ if (State.isAllocated(SystemZ::XPLINK64ArgFPRs[I]))
+ State.AllocateReg(SystemZ::XPLINK64ArgFPRs[I + 1]);
+ }
+ }
+ return false;
+}
+
+inline bool CC_XPLINK64_Allocate128BitVararg(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ if (LocVT.getSizeInBits() < 128)
+ return false;
+
+ if (static_cast<SystemZCCState *>(&State)->IsFixed(ValNo))
+ return false;
+
+ // In any C or C++ program the first argument cannot be variadic, so the
+ // first fixed argument will already have claimed GPR1, either by
+ // shadowing it or by using it for parameter passing. The allocation
+ // below is therefore expected to be a no-op.
+ State.AllocateReg(SystemZ::R1D);
+
+ bool AllocGPR2 = State.AllocateReg(SystemZ::R2D);
+ bool AllocGPR3 = State.AllocateReg(SystemZ::R3D);
+
+ // If GPR2 and GPR3 are available, then we may pass vararg in R2Q.
+ if (AllocGPR2 && AllocGPR3) {
+ State.addLoc(
+ CCValAssign::getReg(ValNo, ValVT, SystemZ::R2Q, LocVT, LocInfo));
+ return true;
+ }
+
+ // If only GPR3 is available, we allocate on the stack but need custom
+ // handling to copy the high bits into GPR3.
+ if (!AllocGPR2 && AllocGPR3) {
+ auto Offset = State.AllocateStack(16, Align(8));
+ State.addLoc(
+ CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return true;
+ }
+
+ return false;
+}
+
+inline bool RetCC_SystemZ_Error(unsigned &, MVT &, MVT &,
+ CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+ CCState &) {
+ llvm_unreachable("Return value calling convention currently unsupported.");
+}
+
+inline bool CC_SystemZ_Error(unsigned &, MVT &, MVT &, CCValAssign::LocInfo &,
+ ISD::ArgFlagsTy &, CCState &) {
+ llvm_unreachable("Argument calling convention currently unsupported.");
+}
+
inline bool CC_SystemZ_GHC_Error(unsigned &, MVT &, MVT &,
CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
CCState &) {
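
CC_XPLINK64_Allocate128BitVararg boils down to a three-way decision on the remaining argument GPRs after R1D has been claimed. A standalone sketch of that decision, using a toy allocator in place of CCState (the register names are just labels here):

#include <iostream>
#include <set>
#include <string>

// Toy allocator: tracks which of the XPLINK64 argument GPRs are still free.
struct ToyCCState {
  std::set<std::string> FreeGPRs{"R1D", "R2D", "R3D"};
  bool allocate(const std::string &Reg) { return FreeGPRs.erase(Reg) != 0; }
};

// Mirrors the shape of CC_XPLINK64_Allocate128BitVararg: R1D is claimed (in
// practice it is already taken by the first fixed argument), and the outcome
// then depends on whether R2D and R3D are still available.
std::string place128BitVararg(ToyCCState &State) {
  State.allocate("R1D");
  bool GotR2 = State.allocate("R2D");
  bool GotR3 = State.allocate("R3D");
  if (GotR2 && GotR3)
    return "register pair R2D:R3D (R2Q)";
  if (!GotR2 && GotR3)
    return "16-byte stack slot, high 8 bytes also copied into R3D";
  return "16-byte stack slot only";
}

int main() {
  ToyCCState Fresh;                 // no fixed argument used R2D/R3D yet
  std::cout << place128BitVararg(Fresh) << "\n";

  ToyCCState OneTaken;
  OneTaken.allocate("R2D");         // a fixed argument already took R2D
  std::cout << place128BitVararg(OneTaken) << "\n";
}
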
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index b1b7ad4..45e22b0 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -20,6 +20,10 @@
class CCIfFixed<CCAction A>
: CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;
+// Match if this specific argument is not a fixed (i.e. vararg) argument.
+class CCIfNotFixed<CCAction A>
+ : CCIf<"!(static_cast<SystemZCCState *>(&State)->IsFixed(ValNo))", A>;
+
// Match if this specific argument was widened from a short vector type.
class CCIfShortVector<CCAction A>
: CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;
@@ -28,7 +32,7 @@
//===----------------------------------------------------------------------===//
// z/Linux return value calling convention
//===----------------------------------------------------------------------===//
-def RetCC_SystemZ : CallingConv<[
+def RetCC_SystemZ_ELF : CallingConv<[
// Promote i32 to i64 if it has an explicit extension type.
CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
@@ -83,7 +87,7 @@
//===----------------------------------------------------------------------===//
// z/Linux argument calling conventions
//===----------------------------------------------------------------------===//
-def CC_SystemZ : CallingConv<[
+def CC_SystemZ_ELF : CallingConv<[
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_SystemZ_GHC>>,
// Promote i32 to i64 if it has an explicit extension type.
@@ -139,11 +143,11 @@
//===----------------------------------------------------------------------===//
// z/Linux callee-saved registers
//===----------------------------------------------------------------------===//
-def CSR_SystemZ : CalleeSavedRegs<(add (sequence "R%dD", 6, 15),
+def CSR_SystemZ_ELF : CalleeSavedRegs<(add (sequence "R%dD", 6, 15),
(sequence "F%dD", 8, 15))>;
// R9 is used to return SwiftError; remove it from CSR.
-def CSR_SystemZ_SwiftError : CalleeSavedRegs<(sub CSR_SystemZ, R9D)>;
+def CSR_SystemZ_SwiftError : CalleeSavedRegs<(sub CSR_SystemZ_ELF, R9D)>;
// "All registers" as used by the AnyReg calling convention.
// Note that registers 0 and 1 are still defined as intra-call scratch
@@ -155,3 +159,152 @@
def CSR_SystemZ_NoRegs : CalleeSavedRegs<(add)>;
+//===----------------------------------------------------------------------===//
+// z/OS XPLINK64 callee-saved registers
+//===----------------------------------------------------------------------===//
+def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 8, 15),
+ (sequence "F%dD", 8, 15))>;
+
+def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add (sequence "R%dD", 8, 15),
+ (sequence "F%dD", 15, 8),
+ (sequence "V%d", 23, 16))>;
+
+//===----------------------------------------------------------------------===//
+// z/OS XPLINK64 return value calling convention
+//===----------------------------------------------------------------------===//
+def RetCC_SystemZ_XPLINK64 : CallingConv<[
+ // XPLINK64 ABI compliant code widens integral types smaller than i64
+ // to i64.
+ CCIfType<[i32], CCPromoteToType<i64>>,
+
+ // Structs of size 1-24 bytes are returned in R1D, R2D, and R3D.
+ CCIfType<[i64], CCIfInReg<CCAssignToReg<[R1D, R2D, R3D]>>>,
+ // An i64 is returned in R3D. R2D and R1D provided for ABI non-compliant
+ // code.
+ CCIfType<[i64], CCAssignToReg<[R3D, R2D, R1D]>>,
+
+ // ABI compliant code returns floating point values in FPR0, FPR2, FPR4
+ // and FPR6, using as many registers as required.
+ // All floating point return-value registers are call-clobbered.
+ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
+
+ // ABI compliant code returns f128 in the F0D/F2D pair, hence F0Q.
+ // The F4D/F6D pair, hence F4Q, is used for complex long double types.
+ CCIfType<[f128], CCAssignToReg<[F0Q,F4Q]>>,
+
+ // ABI compliant code returns vectors in VR24 but other registers
+ // are provided for code that does not care about the ABI.
+ CCIfSubtarget<"hasVector()",
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[V24, V25, V26, V27, V28, V29, V30, V31]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// z/OS XPLINK64 argument calling conventions
+//===----------------------------------------------------------------------===//
+// XPLink uses a logical argument list consisting of contiguous register-size
+// words (8 bytes in 64-bit mode) where some arguments are passed in registers
+// and some in storage.
+// Even though 3 GPRs, 4 FPRs, and 8 VRs may be used,
+// space must be reserved for all of the arguments on the stack.
+// The first three register-sized words of the parameter area are passed in
+// GPRs 1-3. FP values and vector-type arguments are instead passed in FPRs
+// and VRs respectively, but if a FP value or vector argument occupies one of
+// the first three register-sized words of the parameter area, the corresponding
+// GPR's value is not used to pass arguments.
+//
+// The XPLINK64 Calling Convention is fully specified in Chapter 22 of the z/OS
+// Language Environment Vendor Interfaces. Appendix B of the same document contains
+// examples.
+
+def CC_SystemZ_XPLINK64 : CallingConv<[
+ // XPLINK64 ABI compliant code widens integral types smaller than i64
+ // to i64 before placing the parameters either on the stack or in registers.
+ CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+
+ // A SwiftSelf is passed in callee-saved R10.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R10D]>>>,
+
+ // A SwiftError is passed in R0.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R0D]>>>,
+
+ // Handle i128 values first. They have already been split into two i64
+ // parts at this point, so a custom handler is needed to assign them to
+ // registers where possible.
+ CCIfType<[i64], CCCustom<"CC_SystemZ_I128Indirect">>,
+ // The first 3 integer arguments are passed in registers R1D-R3D.
+ // The rest will be passed in the user area. The address offset of the user
+ // area can be found in register R4D.
+ CCIfType<[i32], CCAssignToReg<[R1L, R2L, R3L]>>,
+ CCIfType<[i64], CCAssignToReg<[R1D, R2D, R3D]>>,
+
+ // The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
+ // are passed in the same way, but they're widened to one of these types
+ // during type legalization.
+ CCIfSubtarget<"hasVector()",
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
+ CCIfSubtarget<"hasVector()",
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfFixed<CCAssignToReg<[V24, V25, V26, V27,
+ V28, V29, V30, V31]>>>>,
+
+ // The first 4 named float and double arguments are passed in registers FPR0-FPR6.
+ // The rest will be passed in the user area.
+ CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
+ CCIfType<[f32], CCIfFixed<CCAssignToReg<[F0S, F2S, F4S, F6S]>>>,
+ CCIfType<[f64], CCIfFixed<CCAssignToReg<[F0D, F2D, F4D, F6D]>>>,
+ // The first 2 long double arguments are passed in register FPR0/FPR2
+ // and FPR4/FPR6. The rest will be passed in the user area.
+ CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
+ CCIfType<[f128], CCIfFixed<CCAssignToReg<[F0Q, F4Q]>>>,
+
+ // Non-fixed floats are passed in GPRs.
+ // Promote f32 to f64 if it needs to be passed in GPRs.
+ CCIfType<[f32], CCIfNotFixed<CCPromoteToType<f64>>>,
+ // Assign f64 varargs to their proper GPRs.
+ CCIfType<[f64], CCIfNotFixed<CCAssignToReg<[R1D, R2D, R3D]>>>,
+ // A long double can only be passed in GPR2 and GPR3, if available,
+ // hence R2Q.
+ CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
+
+ // Non-fixed vector arguments are treated in the same way as long
+ // doubles.
+ CCIfSubtarget<"hasVector()",
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>,
+
+ // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+ // Other f128 arguments are passed in 8-byte-aligned 16-byte stack slots.
+ CCIfType<[f128], CCAssignToStack<16, 8>>,
+ // Vector arguments are passed in 8-byte-aligned 16-byte stack slots too.
+ CCIfSubtarget<"hasVector()",
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToStack<16, 8>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// s390x return value calling convention
+//===----------------------------------------------------------------------===//
+
+def RetCC_SystemZ : CallingConv<[
+ // zOS XPLINK64
+ CCIfSubtarget<"isTargetXPLINK64()", CCDelegateTo<RetCC_SystemZ_XPLINK64>>,
+
+ // ELF Linux SystemZ
+ CCIfSubtarget<"isTargetELF()", CCDelegateTo<RetCC_SystemZ_ELF>>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// s390x argument calling conventions
+//===----------------------------------------------------------------------===//
+def CC_SystemZ : CallingConv<[
+ // zOS XPLINK64
+ CCIfSubtarget<"isTargetXPLINK64()", CCDelegateTo<CC_SystemZ_XPLINK64>>,
+
+ // ELF Linux SystemZ
+ CCIfSubtarget<"isTargetELF()", CCDelegateTo<CC_SystemZ_ELF>>
+]>;
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZFeatures.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZFeatures.td
index b1706a4..78b8394 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZFeatures.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZFeatures.td
@@ -293,6 +293,45 @@
//===----------------------------------------------------------------------===//
//
+// New features added in the Fourteenth Edition of the z/Architecture
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureVectorPackedDecimalEnhancement2 : SystemZFeature<
+ "vector-packed-decimal-enhancement-2", "VectorPackedDecimalEnhancement2", (all_of FeatureVectorPackedDecimalEnhancement2),
+ "Assume that the vector packed decimal enhancement facility 2 is installed"
+>;
+
+def FeatureNNPAssist : SystemZFeature<
+ "nnp-assist", "NNPAssist", (all_of FeatureNNPAssist),
+ "Assume that the NNP-assist facility is installed"
+>;
+
+def FeatureBEAREnhancement : SystemZFeature<
+ "bear-enhancement", "BEAREnhancement", (all_of FeatureBEAREnhancement),
+ "Assume that the BEAR-enhancement facility is installed"
+>;
+
+def FeatureResetDATProtection : SystemZFeature<
+ "reset-dat-protection", "ResetDATProtection", (all_of FeatureResetDATProtection),
+ "Assume that the reset-DAT-protection facility is installed"
+>;
+
+def FeatureProcessorActivityInstrumentation : SystemZFeature<
+ "processor-activity-instrumentation", "ProcessorActivityInstrumentation", (all_of FeatureProcessorActivityInstrumentation),
+ "Assume that the processor-activity-instrumentation facility is installed"
+>;
+
+def Arch14NewFeatures : SystemZFeatureList<[
+ FeatureVectorPackedDecimalEnhancement2,
+ FeatureNNPAssist,
+ FeatureBEAREnhancement,
+ FeatureResetDATProtection,
+ FeatureProcessorActivityInstrumentation
+]>;
+
+//===----------------------------------------------------------------------===//
+//
// Cumulative supported and unsupported feature sets
//
//===----------------------------------------------------------------------===//
@@ -309,9 +348,13 @@
: SystemZFeatureAdd<Arch11SupportedFeatures.List, Arch12NewFeatures.List>;
def Arch13SupportedFeatures
: SystemZFeatureAdd<Arch12SupportedFeatures.List, Arch13NewFeatures.List>;
+def Arch14SupportedFeatures
+ : SystemZFeatureAdd<Arch13SupportedFeatures.List, Arch14NewFeatures.List>;
-def Arch13UnsupportedFeatures
+def Arch14UnsupportedFeatures
: SystemZFeatureList<[]>;
+def Arch13UnsupportedFeatures
+ : SystemZFeatureAdd<Arch14UnsupportedFeatures.List, Arch14NewFeatures.List>;
def Arch12UnsupportedFeatures
: SystemZFeatureAdd<Arch13UnsupportedFeatures.List, Arch13NewFeatures.List>;
def Arch11UnsupportedFeatures
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 994f471..d2f6ff9 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -23,7 +23,7 @@
namespace {
// The ABI-defined register save slots, relative to the CFA (i.e.
-// incoming stack pointer + SystemZMC::CallFrameSize).
+// incoming stack pointer + SystemZMC::ELFCallFrameSize).
static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
{ SystemZ::R2D, 0x10 },
{ SystemZ::R3D, 0x18 },
@@ -75,7 +75,7 @@
unsigned LowGPR = 0;
unsigned HighGPR = SystemZ::R15D;
- int StartSPOffset = SystemZMC::CallFrameSize;
+ int StartSPOffset = SystemZMC::ELFCallFrameSize;
for (auto &CS : CSI) {
unsigned Reg = CS.getReg();
int Offset = getRegSpillOffset(MF, Reg);
@@ -84,7 +84,7 @@
LowGPR = Reg;
StartSPOffset = Offset;
}
- Offset -= SystemZMC::CallFrameSize;
+ Offset -= SystemZMC::ELFCallFrameSize;
int FrameIdx = MFFrame.CreateFixedSpillStackObject(8, Offset);
CS.setFrameIdx(FrameIdx);
} else
@@ -99,8 +99,8 @@
// already be included, but we also need to handle the call-clobbered
// argument registers.
unsigned FirstGPR = ZFI->getVarArgsFirstGPR();
- if (FirstGPR < SystemZ::NumArgGPRs) {
- unsigned Reg = SystemZ::ArgGPRs[FirstGPR];
+ if (FirstGPR < SystemZ::ELFNumArgGPRs) {
+ unsigned Reg = SystemZ::ELFArgGPRs[FirstGPR];
int Offset = getRegSpillOffset(MF, Reg);
if (StartSPOffset > Offset) {
LowGPR = Reg; StartSPOffset = Offset;
@@ -110,7 +110,7 @@
ZFI->setSpillGPRRegs(LowGPR, HighGPR, StartSPOffset);
// Create fixed stack objects for the remaining registers.
- int CurrOffset = -SystemZMC::CallFrameSize;
+ int CurrOffset = -SystemZMC::ELFCallFrameSize;
if (usePackedStack(MF))
CurrOffset += StartSPOffset;
@@ -146,8 +146,8 @@
// Record these pending uses, which typically include the call-saved
// argument register R6D.
if (IsVarArg)
- for (unsigned I = MFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
- SavedRegs.set(SystemZ::ArgGPRs[I]);
+ for (unsigned I = MFI->getVarArgsFirstGPR(); I < SystemZ::ELFNumArgGPRs; ++I)
+ SavedRegs.set(SystemZ::ELFArgGPRs[I]);
// If there are any landing pads, entering them will modify r6/r7.
if (!MF.getLandingPads().empty()) {
@@ -234,8 +234,8 @@
// ...likewise GPR varargs.
if (IsVarArg)
- for (unsigned I = ZFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
- addSavedGPR(MBB, MIB, SystemZ::ArgGPRs[I], true);
+ for (unsigned I = ZFI->getVarArgsFirstGPR(); I < SystemZ::ELFNumArgGPRs; ++I)
+ addSavedGPR(MBB, MIB, SystemZ::ELFArgGPRs[I], true);
}
// Save FPRs/VRs in the normal TargetInstrInfo way.
@@ -326,7 +326,7 @@
// Get the size of our stack frame to be allocated ...
uint64_t StackSize = (MFFrame.estimateStackSize(MF) +
- SystemZMC::CallFrameSize);
+ SystemZMC::ELFCallFrameSize);
// ... and the maximum offset we may need to reach into the
// caller's frame to access the save area or stack arguments.
int64_t MaxArgOffset = 0;
@@ -437,7 +437,7 @@
report_fatal_error(
"In GHC calling convention a frame pointer is not supported");
}
- MFFrame.setStackSize(MFFrame.getStackSize() + SystemZMC::CallFrameSize);
+ MFFrame.setStackSize(MFFrame.getStackSize() + SystemZMC::ELFCallFrameSize);
return;
}
@@ -446,7 +446,7 @@
DebugLoc DL;
// The current offset of the stack pointer from the CFA.
- int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP;
+ int64_t SPOffsetFromCFA = -SystemZMC::ELFCFAOffsetFromInitialSP;
if (ZFI->getSpillGPRRegs().LowGPR) {
// Skip over the GPR saves.
@@ -480,10 +480,10 @@
break;
}
if (HasStackObject || MFFrame.hasCalls())
- StackSize += SystemZMC::CallFrameSize;
+ StackSize += SystemZMC::ELFCallFrameSize;
// Don't allocate the incoming reg save area.
- StackSize = StackSize > SystemZMC::CallFrameSize
- ? StackSize - SystemZMC::CallFrameSize
+ StackSize = StackSize > SystemZMC::ELFCallFrameSize
+ ? StackSize - SystemZMC::ELFCallFrameSize
: 0;
MFFrame.setStackSize(StackSize);
@@ -638,7 +638,7 @@
const unsigned ProbeSize = TLI.getStackProbeSize(MF);
uint64_t NumFullBlocks = StackSize / ProbeSize;
uint64_t Residual = StackSize % ProbeSize;
- int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP;
+ int64_t SPOffsetFromCFA = -SystemZMC::ELFCFAOffsetFromInitialSP;
MachineBasicBlock *MBB = &PrologMBB;
MachineBasicBlock::iterator MBBI = StackAllocMI;
const DebugLoc DL = StackAllocMI->getDebugLoc();
@@ -682,7 +682,7 @@
.addReg(SystemZ::R15D);
buildDefCFAReg(*MBB, MBBI, DL, SystemZ::R0D, ZII);
emitIncrement(*MBB, MBBI, DL, SystemZ::R0D, -int64_t(LoopAlloc), ZII);
- buildCFAOffs(*MBB, MBBI, DL, -int64_t(SystemZMC::CallFrameSize + LoopAlloc),
+ buildCFAOffs(*MBB, MBBI, DL, -int64_t(SystemZMC::ELFCallFrameSize + LoopAlloc),
ZII);
DoneMBB = SystemZ::splitBlockBefore(MBBI, MBB);
@@ -737,11 +737,11 @@
StackOffset
SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
Register &FrameReg) const {
- // Our incoming SP is actually SystemZMC::CallFrameSize below the CFA, so
+ // Our incoming SP is actually SystemZMC::ELFCallFrameSize below the CFA, so
// add that difference here.
StackOffset Offset =
TargetFrameLowering::getFrameIndexReference(MF, FI, FrameReg);
- return Offset + StackOffset::getFixed(SystemZMC::CallFrameSize);
+ return Offset + StackOffset::getFixed(SystemZMC::ELFCallFrameSize);
}
MachineBasicBlock::iterator SystemZFrameLowering::
@@ -784,7 +784,7 @@
int FI = ZFI->getFramePointerSaveIndex();
if (!FI) {
MachineFrameInfo &MFFrame = MF.getFrameInfo();
- int Offset = getBackchainOffset(MF) - SystemZMC::CallFrameSize;
+ int Offset = getBackchainOffset(MF) - SystemZMC::ELFCallFrameSize;
FI = MFFrame.CreateFixedObject(8, Offset, false);
ZFI->setFramePointerSaveIndex(FI);
}
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
index 085c31c..c8312b8 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -68,7 +68,7 @@
// Return the offset of the backchain.
unsigned getBackchainOffset(MachineFunction &MF) const {
// The back chain is stored topmost with packed-stack.
- return usePackedStack(MF) ? SystemZMC::CallFrameSize - 8 : 0;
+ return usePackedStack(MF) ? SystemZMC::ELFCallFrameSize - 8 : 0;
}
};
} // end namespace llvm
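
getBackchainOffset() above is a two-case formula: with packed-stack the backchain occupies the topmost 8-byte slot of the register save area, otherwise the bottommost. A tiny worked sketch, assuming the 160-byte ELF call frame that SystemZMC::ELFCallFrameSize denotes:

#include <cassert>

// Assumption: the s390x ELF ABI call frame (register save area) is 160 bytes,
// which is what SystemZMC::ELFCallFrameSize names in this patch.
constexpr unsigned ELFCallFrameSize = 160;

// Mirrors getBackchainOffset(): topmost slot with packed-stack, offset 0
// otherwise.
constexpr unsigned backchainOffset(bool PackedStack) {
  return PackedStack ? ELFCallFrameSize - 8 : 0;
}

int main() {
  assert(backchainOffset(false) == 0);
  assert(backchainOffset(true) == 152);
  return 0;
}
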
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 9d90a49..39a82e2 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -1432,8 +1432,8 @@
if (V1 == V2 && End1 == End2)
return false;
- return !AA->alias(MemoryLocation(V1, End1, Load->getAAInfo()),
- MemoryLocation(V2, End2, Store->getAAInfo()));
+ return AA->isNoAlias(MemoryLocation(V1, End1, Load->getAAInfo()),
+ MemoryLocation(V2, End2, Store->getAAInfo()));
}
bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const {
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 270134d..d70d486 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -853,7 +853,7 @@
}
bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned, unsigned, MachineMemOperand::Flags, bool *Fast) const {
+ EVT VT, unsigned, Align, MachineMemOperand::Flags, bool *Fast) const {
// Unaligned accesses should never be slower than the expanded version.
// We check specifically for aligned accesses in the few cases where
// they are required.
@@ -1368,6 +1368,55 @@
}
}
+static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
+ SDLoc DL(In);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
+ DAG.getIntPtrConstant(1, DL));
+ SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL,
+ MVT::Untyped, Hi, Lo);
+ return SDValue(Pair, 0);
+}
+
+static SDValue lowerGR128ToI128(SelectionDAG &DAG, SDValue In) {
+ SDLoc DL(In);
+ SDValue Hi = DAG.getTargetExtractSubreg(SystemZ::subreg_h64,
+ DL, MVT::i64, In);
+ SDValue Lo = DAG.getTargetExtractSubreg(SystemZ::subreg_l64,
+ DL, MVT::i64, In);
+ return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi);
+}
+
+bool SystemZTargetLowering::splitValueIntoRegisterParts(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
+ unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
+ EVT ValueVT = Val.getValueType();
+ assert((ValueVT != MVT::i128 ||
+ ((NumParts == 1 && PartVT == MVT::Untyped) ||
+ (NumParts == 2 && PartVT == MVT::i64))) &&
+ "Unknown handling of i128 value.");
+ if (ValueVT == MVT::i128 && NumParts == 1) {
+ // Inline assembly operand.
+ Parts[0] = lowerI128ToGR128(DAG, Val);
+ return true;
+ }
+ return false;
+}
+
+SDValue SystemZTargetLowering::joinRegisterPartsIntoValue(
+ SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
+ assert((ValueVT != MVT::i128 ||
+ ((NumParts == 1 && PartVT == MVT::Untyped) ||
+ (NumParts == 2 && PartVT == MVT::i64))) &&
+ "Unknown handling of i128 value.");
+ if (ValueVT == MVT::i128 && NumParts == 1)
+ // Inline assembly operand.
+ return lowerGR128ToI128(DAG, Parts[0]);
+ return SDValue();
+}
+
SDValue SystemZTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -1485,20 +1534,20 @@
// ...and a similar frame index for the caller-allocated save area
// that will be used to store the incoming registers.
int64_t RegSaveOffset =
- -SystemZMC::CallFrameSize + TFL->getRegSpillOffset(MF, SystemZ::R2D) - 16;
+ -SystemZMC::ELFCallFrameSize + TFL->getRegSpillOffset(MF, SystemZ::R2D) - 16;
unsigned RegSaveIndex = MFI.CreateFixedObject(1, RegSaveOffset, true);
FuncInfo->setRegSaveFrameIndex(RegSaveIndex);
// Store the FPR varargs in the reserved frame slots. (We store the
// GPRs as part of the prologue.)
- if (NumFixedFPRs < SystemZ::NumArgFPRs && !useSoftFloat()) {
- SDValue MemOps[SystemZ::NumArgFPRs];
- for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) {
- unsigned Offset = TFL->getRegSpillOffset(MF, SystemZ::ArgFPRs[I]);
+ if (NumFixedFPRs < SystemZ::ELFNumArgFPRs && !useSoftFloat()) {
+ SDValue MemOps[SystemZ::ELFNumArgFPRs];
+ for (unsigned I = NumFixedFPRs; I < SystemZ::ELFNumArgFPRs; ++I) {
+ unsigned Offset = TFL->getRegSpillOffset(MF, SystemZ::ELFArgFPRs[I]);
int FI =
- MFI.CreateFixedObject(8, -SystemZMC::CallFrameSize + Offset, true);
+ MFI.CreateFixedObject(8, -SystemZMC::ELFCallFrameSize + Offset, true);
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I],
+ unsigned VReg = MF.addLiveIn(SystemZ::ELFArgFPRs[I],
&SystemZ::FP64BitRegClass);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
@@ -1507,7 +1556,7 @@
// Join the stores, which are independent of one another.
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
makeArrayRef(&MemOps[NumFixedFPRs],
- SystemZ::NumArgFPRs-NumFixedFPRs));
+ SystemZ::ELFNumArgFPRs-NumFixedFPRs));
}
}
@@ -1631,7 +1680,7 @@
// floats are passed as right-justified 8-byte values.
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, PtrVT);
- unsigned Offset = SystemZMC::CallFrameSize + VA.getLocMemOffset();
+ unsigned Offset = SystemZMC::ELFCallFrameSize + VA.getLocMemOffset();
if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
Offset += 4;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
@@ -3281,12 +3330,10 @@
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
- // Return null if the back chain is not present.
- bool HasBackChain = MF.getFunction().hasFnAttribute("backchain");
- if (TFL->usePackedStack(MF) && !HasBackChain)
- return DAG.getConstant(0, DL, PtrVT);
-
- // By definition, the frame address is the address of the back chain.
+ // By definition, the frame address is the address of the back chain. (In
+ // the case of a packed stack without a backchain, return the address where
+ // the backchain would have been stored. This will either be unused space
+ // or contain a saved register.)
int BackChainIdx = TFL->getOrCreateFramePointerSaveIndex(MF);
SDValue BackChain = DAG.getFrameIndex(BackChainIdx, PtrVT);
@@ -3885,7 +3932,7 @@
Node->getMemOperand());
// We have to enforce sequential consistency by performing a
// serialization operation after the store.
- if (Node->getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ if (Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent)
Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op),
MVT::Other, Chain), 0);
return Chain;
@@ -4059,7 +4106,10 @@
SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
SystemZ::CCMASK_ICMP, SystemZ::CCMASK_CMP_EQ);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
+ // emitAtomicCmpSwapW() will zero extend the result (original value).
+ SDValue OrigVal = DAG.getNode(ISD::AssertZext, DL, WideVT, AtomicOp.getValue(0),
+ DAG.getValueType(NarrowVT));
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), OrigVal);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
return SDValue();
@@ -5488,27 +5538,6 @@
// Lower operations with invalid operand or result types (currently used
// only for 128-bit integer types).
-
-static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
- SDLoc DL(In);
- SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
- DAG.getIntPtrConstant(0, DL));
- SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
- DAG.getIntPtrConstant(1, DL));
- SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL,
- MVT::Untyped, Hi, Lo);
- return SDValue(Pair, 0);
-}
-
-static SDValue lowerGR128ToI128(SelectionDAG &DAG, SDValue In) {
- SDLoc DL(In);
- SDValue Hi = DAG.getTargetExtractSubreg(SystemZ::subreg_h64,
- DL, MVT::i64, In);
- SDValue Lo = DAG.getTargetExtractSubreg(SystemZ::subreg_l64,
- DL, MVT::i64, In);
- return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi);
-}
-
void
SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
@@ -5536,7 +5565,7 @@
DL, Tys, Ops, MVT::i128, MMO);
// We have to enforce sequential consistency by performing a
// serialization operation after the store.
- if (cast<AtomicSDNode>(N)->getOrdering() ==
+ if (cast<AtomicSDNode>(N)->getSuccessOrdering() ==
AtomicOrdering::SequentiallyConsistent)
Res = SDValue(DAG.getMachineNode(SystemZ::Serialize, DL,
MVT::Other, Res), 0);
@@ -7394,7 +7423,7 @@
// StartMBB:
// ...
// %OrigVal = L Disp(%Base)
- // # fall through to LoopMMB
+ // # fall through to LoopMBB
MBB = StartMBB;
BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
MBB->addSuccessor(LoopMBB);
@@ -7406,7 +7435,7 @@
// %NewVal = RLL %RotatedNewVal, 0(%NegBitShift)
// %Dest = CS %OldVal, %NewVal, Disp(%Base)
// JNE LoopMBB
- // # fall through to DoneMMB
+ // # fall through to DoneMBB
MBB = LoopMBB;
BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
.addReg(OrigVal).addMBB(StartMBB)
@@ -7514,7 +7543,7 @@
// StartMBB:
// ...
// %OrigVal = L Disp(%Base)
- // # fall through to LoopMMB
+ // # fall through to LoopMBB
MBB = StartMBB;
BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
MBB->addSuccessor(LoopMBB);
@@ -7540,7 +7569,7 @@
// UseAltMBB:
// %RotatedAltVal = RISBG %RotatedOldVal, %Src2, 32, 31 + BitSize, 0
- // # fall through to UpdateMMB
+ // # fall through to UpdateMBB
MBB = UseAltMBB;
if (IsSubWord)
BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedAltVal)
@@ -7554,7 +7583,7 @@
// %NewVal = RLL %RotatedNewVal, 0(%NegBitShift)
// %Dest = CS %OldVal, %NewVal, Disp(%Base)
// JNE LoopMBB
- // # fall through to DoneMMB
+ // # fall through to DoneMBB
MBB = UpdateMBB;
BuildMI(MBB, DL, TII->get(SystemZ::PHI), RotatedNewVal)
.addReg(RotatedOldVal).addMBB(LoopMBB)
@@ -7581,7 +7610,6 @@
MachineBasicBlock *
SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
MachineBasicBlock *MBB) const {
-
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
@@ -7591,7 +7619,7 @@
Register Dest = MI.getOperand(0).getReg();
MachineOperand Base = earlyUseOperand(MI.getOperand(1));
int64_t Disp = MI.getOperand(2).getImm();
- Register OrigCmpVal = MI.getOperand(3).getReg();
+ Register CmpVal = MI.getOperand(3).getReg();
Register OrigSwapVal = MI.getOperand(4).getReg();
Register BitShift = MI.getOperand(5).getReg();
Register NegBitShift = MI.getOperand(6).getReg();
@@ -7600,19 +7628,19 @@
const TargetRegisterClass *RC = &SystemZ::GR32BitRegClass;
- // Get the right opcodes for the displacement.
+ // Get the right opcodes for the displacement and zero-extension.
unsigned LOpcode = TII->getOpcodeForOffset(SystemZ::L, Disp);
unsigned CSOpcode = TII->getOpcodeForOffset(SystemZ::CS, Disp);
+ unsigned ZExtOpcode = BitSize == 8 ? SystemZ::LLCR : SystemZ::LLHR;
assert(LOpcode && CSOpcode && "Displacement out of range");
// Create virtual registers for temporary results.
Register OrigOldVal = MRI.createVirtualRegister(RC);
Register OldVal = MRI.createVirtualRegister(RC);
- Register CmpVal = MRI.createVirtualRegister(RC);
Register SwapVal = MRI.createVirtualRegister(RC);
Register StoreVal = MRI.createVirtualRegister(RC);
+ Register OldValRot = MRI.createVirtualRegister(RC);
Register RetryOldVal = MRI.createVirtualRegister(RC);
- Register RetryCmpVal = MRI.createVirtualRegister(RC);
Register RetrySwapVal = MRI.createVirtualRegister(RC);
// Insert 2 basic blocks for the loop.
@@ -7624,7 +7652,7 @@
// StartMBB:
// ...
// %OrigOldVal = L Disp(%Base)
- // # fall through to LoopMMB
+ // # fall through to LoopMBB
MBB = StartMBB;
BuildMI(MBB, DL, TII->get(LOpcode), OrigOldVal)
.add(Base)
@@ -7634,34 +7662,32 @@
// LoopMBB:
// %OldVal = phi [ %OrigOldVal, EntryBB ], [ %RetryOldVal, SetMBB ]
- // %CmpVal = phi [ %OrigCmpVal, EntryBB ], [ %RetryCmpVal, SetMBB ]
// %SwapVal = phi [ %OrigSwapVal, EntryBB ], [ %RetrySwapVal, SetMBB ]
- // %Dest = RLL %OldVal, BitSize(%BitShift)
+ // %OldValRot = RLL %OldVal, BitSize(%BitShift)
// ^^ The low BitSize bits contain the field
// of interest.
- // %RetryCmpVal = RISBG32 %CmpVal, %Dest, 32, 63-BitSize, 0
+ // %RetrySwapVal = RISBG32 %SwapVal, %OldValRot, 32, 63-BitSize, 0
// ^^ Replace the upper 32-BitSize bits of the
- // comparison value with those that we loaded,
- // so that we can use a full word comparison.
- // CR %Dest, %RetryCmpVal
+ // swap value with those that we loaded and rotated.
+ // %Dest = LL[CH] %OldValRot
+ // CR %Dest, %CmpVal
// JNE DoneMBB
// # Fall through to SetMBB
MBB = LoopMBB;
BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
.addReg(OrigOldVal).addMBB(StartMBB)
.addReg(RetryOldVal).addMBB(SetMBB);
- BuildMI(MBB, DL, TII->get(SystemZ::PHI), CmpVal)
- .addReg(OrigCmpVal).addMBB(StartMBB)
- .addReg(RetryCmpVal).addMBB(SetMBB);
BuildMI(MBB, DL, TII->get(SystemZ::PHI), SwapVal)
.addReg(OrigSwapVal).addMBB(StartMBB)
.addReg(RetrySwapVal).addMBB(SetMBB);
- BuildMI(MBB, DL, TII->get(SystemZ::RLL), Dest)
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), OldValRot)
.addReg(OldVal).addReg(BitShift).addImm(BitSize);
- BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetryCmpVal)
- .addReg(CmpVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetrySwapVal)
+ .addReg(SwapVal).addReg(OldValRot).addImm(32).addImm(63 - BitSize).addImm(0);
+ BuildMI(MBB, DL, TII->get(ZExtOpcode), Dest)
+ .addReg(OldValRot);
BuildMI(MBB, DL, TII->get(SystemZ::CR))
- .addReg(Dest).addReg(RetryCmpVal);
+ .addReg(Dest).addReg(CmpVal);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP)
.addImm(SystemZ::CCMASK_CMP_NE).addMBB(DoneMBB);
@@ -7669,17 +7695,12 @@
MBB->addSuccessor(SetMBB);
// SetMBB:
- // %RetrySwapVal = RISBG32 %SwapVal, %Dest, 32, 63-BitSize, 0
- // ^^ Replace the upper 32-BitSize bits of the new
- // value with those that we loaded.
- // %StoreVal = RLL %RetrySwapVal, -BitSize(%NegBitShift)
+ // %StoreVal = RLL %RetrySwapVal, -BitSize(%NegBitShift)
// ^^ Rotate the new field to its proper position.
- // %RetryOldVal = CS %Dest, %StoreVal, Disp(%Base)
+ // %RetryOldVal = CS %OldVal, %StoreVal, Disp(%Base)
// JNE LoopMBB
- // # fall through to ExitMMB
+ // # fall through to ExitMBB
MBB = SetMBB;
- BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetrySwapVal)
- .addReg(SwapVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
BuildMI(MBB, DL, TII->get(SystemZ::RLL), StoreVal)
.addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
@@ -7774,43 +7795,99 @@
uint64_t DestDisp = MI.getOperand(1).getImm();
MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
uint64_t SrcDisp = MI.getOperand(3).getImm();
- uint64_t Length = MI.getOperand(4).getImm();
+ MachineOperand &LengthMO = MI.getOperand(4);
+ uint64_t ImmLength = LengthMO.isImm() ? LengthMO.getImm() : 0;
+ Register LenMinus1Reg =
+ LengthMO.isReg() ? LengthMO.getReg() : SystemZ::NoRegister;
// When generating more than one CLC, all but the last will need to
// branch to the end when a difference is found.
- MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
- SystemZ::splitBlockAfter(MI, MBB) : nullptr);
+ MachineBasicBlock *EndMBB = (ImmLength > 256 && Opcode == SystemZ::CLC
+ ? SystemZ::splitBlockAfter(MI, MBB)
+ : nullptr);
// Check for the loop form, in which operand 5 is the trip count.
if (MI.getNumExplicitOperands() > 5) {
+ Register StartCountReg = MI.getOperand(5).getReg();
bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
- Register StartCountReg = MI.getOperand(5).getReg();
- Register StartSrcReg = forceReg(MI, SrcBase, TII);
- Register StartDestReg = (HaveSingleBase ? StartSrcReg :
- forceReg(MI, DestBase, TII));
+ auto loadZeroAddress = [&]() -> MachineOperand {
+ Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0);
+ return MachineOperand::CreateReg(Reg, false);
+ };
+ if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister)
+ DestBase = loadZeroAddress();
+ if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister)
+ SrcBase = HaveSingleBase ? DestBase : loadZeroAddress();
+
+ MachineBasicBlock *StartMBB = nullptr;
+ MachineBasicBlock *LoopMBB = nullptr;
+ MachineBasicBlock *NextMBB = nullptr;
+ MachineBasicBlock *DoneMBB = nullptr;
+ MachineBasicBlock *AllDoneMBB = nullptr;
+
+ Register StartSrcReg = forceReg(MI, SrcBase, TII);
+ Register StartDestReg =
+ (HaveSingleBase ? StartSrcReg : forceReg(MI, DestBase, TII));
const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
Register ThisSrcReg = MRI.createVirtualRegister(RC);
- Register ThisDestReg = (HaveSingleBase ? ThisSrcReg :
- MRI.createVirtualRegister(RC));
+ Register ThisDestReg =
+ (HaveSingleBase ? ThisSrcReg : MRI.createVirtualRegister(RC));
Register NextSrcReg = MRI.createVirtualRegister(RC);
- Register NextDestReg = (HaveSingleBase ? NextSrcReg :
- MRI.createVirtualRegister(RC));
-
+ Register NextDestReg =
+ (HaveSingleBase ? NextSrcReg : MRI.createVirtualRegister(RC));
RC = &SystemZ::GR64BitRegClass;
Register ThisCountReg = MRI.createVirtualRegister(RC);
Register NextCountReg = MRI.createVirtualRegister(RC);
- MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
- MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
- MachineBasicBlock *NextMBB =
- (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB);
+ if (LengthMO.isReg()) {
+ AllDoneMBB = SystemZ::splitBlockBefore(MI, MBB);
+ StartMBB = SystemZ::emitBlockAfter(MBB);
+ LoopMBB = SystemZ::emitBlockAfter(StartMBB);
+ NextMBB = LoopMBB;
+ DoneMBB = SystemZ::emitBlockAfter(LoopMBB);
- // StartMBB:
- // # fall through to LoopMMB
- MBB->addSuccessor(LoopMBB);
+ // MBB:
+ // # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall through to StartMBB.
+ BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+ .addReg(LenMinus1Reg).addImm(-1);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
+ .addMBB(AllDoneMBB);
+ MBB->addSuccessor(AllDoneMBB);
+ MBB->addSuccessor(StartMBB);
+
+ // StartMBB:
+ // # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB.
+ MBB = StartMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+ .addReg(StartCountReg).addImm(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
+ .addMBB(DoneMBB);
+ MBB->addSuccessor(DoneMBB);
+ MBB->addSuccessor(LoopMBB);
+ }
+ else {
+ StartMBB = MBB;
+ DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
+ LoopMBB = SystemZ::emitBlockAfter(StartMBB);
+ NextMBB = (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB);
+
+ // StartMBB:
+ // # fall through to LoopMBB
+ MBB->addSuccessor(LoopMBB);
+
+ DestBase = MachineOperand::CreateReg(NextDestReg, false);
+ SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
+ ImmLength &= 255;
+ if (EndMBB && !ImmLength)
+ // If the loop handled the whole CLC range, DoneMBB will be empty with
+ // CC live-through into EndMBB, so add it as live-in.
+ DoneMBB->addLiveIn(SystemZ::CC);
+ }
// LoopMBB:
// %ThisDestReg = phi [ %StartDestReg, StartMBB ],
@@ -7825,7 +7902,6 @@
//
// The prefetch is used only for MVC. The JLH is used only for CLC.
MBB = LoopMBB;
-
BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
.addReg(StartDestReg).addMBB(StartMBB)
.addReg(NextDestReg).addMBB(NextMBB);
@@ -7857,11 +7933,10 @@
// %NextCountReg = AGHI %ThisCountReg, -1
// CGHI %NextCountReg, 0
// JLH LoopMBB
- // # fall through to DoneMMB
+ // # fall through to DoneMBB
//
// The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
MBB = NextMBB;
-
BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
.addReg(ThisDestReg).addImm(256).addReg(0);
if (!HaveSingleBase)
@@ -7877,18 +7952,39 @@
MBB->addSuccessor(LoopMBB);
MBB->addSuccessor(DoneMBB);
- DestBase = MachineOperand::CreateReg(NextDestReg, false);
- SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
- Length &= 255;
- if (EndMBB && !Length)
- // If the loop handled the whole CLC range, DoneMBB will be empty with
- // CC live-through into EndMBB, so add it as live-in.
- DoneMBB->addLiveIn(SystemZ::CC);
MBB = DoneMBB;
+ if (LengthMO.isReg()) {
+ // DoneMBB:
+ // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run.
+ // # Use EXecute Relative Long for the remainder of the bytes. The target
+ // instruction of the EXRL will have a length field of 1 since 0 is an
+ // illegal value. The number of bytes processed becomes (%LenMinus1Reg &
+ // 0xff) + 1.
+ // # Fall through to AllDoneMBB.
+ Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ Register RemDestReg = HaveSingleBase ? RemSrcReg
+ : MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemDestReg)
+ .addReg(StartDestReg).addMBB(StartMBB)
+ .addReg(NextDestReg).addMBB(LoopMBB);
+ if (!HaveSingleBase)
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
+ .addReg(StartSrcReg).addMBB(StartMBB)
+ .addReg(NextSrcReg).addMBB(LoopMBB);
+ MRI.constrainRegClass(LenMinus1Reg, &SystemZ::ADDR64BitRegClass);
+ BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
+ .addImm(Opcode)
+ .addReg(LenMinus1Reg)
+ .addReg(RemDestReg).addImm(DestDisp)
+ .addReg(RemSrcReg).addImm(SrcDisp);
+ MBB->addSuccessor(AllDoneMBB);
+ MBB = AllDoneMBB;
+ }
}
+
// Handle any remaining bytes with straight-line code.
- while (Length > 0) {
- uint64_t ThisLength = std::min(Length, uint64_t(256));
+ while (ImmLength > 0) {
+ uint64_t ThisLength = std::min(ImmLength, uint64_t(256));
// The previous iteration might have created out-of-range displacements.
// Apply them using LAY if so.
if (!isUInt<12>(DestDisp)) {
@@ -7918,10 +8014,10 @@
.setMemRefs(MI.memoperands());
DestDisp += ThisLength;
SrcDisp += ThisLength;
- Length -= ThisLength;
+ ImmLength -= ThisLength;
// If there's another CLC to go, branch to the end if a difference
// was found.
- if (EndMBB && Length > 0) {
+ if (EndMBB && ImmLength > 0) {
MachineBasicBlock *NextMBB = SystemZ::splitBlockBefore(MI, MBB);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
@@ -7966,7 +8062,7 @@
MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
// StartMBB:
- // # fall through to LoopMMB
+ // # fall through to LoopMBB
MBB->addSuccessor(LoopMBB);
// LoopMBB:
@@ -7975,7 +8071,7 @@
// R0L = %CharReg
// %End1Reg, %End2Reg = CLST %This1Reg, %This2Reg -- uses R0L
// JO LoopMBB
- // # fall through to DoneMMB
+ // # fall through to DoneMBB
//
// The load of R0L can be hoisted by post-RA LICM.
MBB = LoopMBB;
@@ -8412,6 +8508,7 @@
return emitMemMemWrapper(MI, MBB, SystemZ::OC);
case SystemZ::XCSequence:
case SystemZ::XCLoop:
+ case SystemZ::XCLoopVarLen:
return emitMemMemWrapper(MI, MBB, SystemZ::XC);
case SystemZ::CLCSequence:
case SystemZ::CLCLoop:
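
The variable-length path added to emitMemMemWrapper() above, together with its comments, implies the decomposition of a length L >= 1 as L = 256 * ((L-1) >> 8) + (((L-1) & 0xff) + 1): the loop handles full 256-byte blocks and one EXRL-executed instruction handles the tail, whose length field is (L-1) & 0xff ("bytes minus one", since 0 encodes 1 byte), while L == 0 (LenMinus1 == -1) branches straight to AllDoneMBB. A small arithmetic sketch of that decomposition; how the trip-count register itself is set up during instruction selection is assumed, not shown:

#include <cassert>
#include <cstdint>

struct VarLenPlan {
  uint64_t LoopIterations;  // 256-byte blocks processed by the loop
  uint64_t RemainderBytes;  // bytes covered by the trailing EXRL
  bool SkipEverything;      // length was zero
};

VarLenPlan planMemMem(uint64_t Length) {
  if (Length == 0)
    return {0, 0, true};
  uint64_t LenMinus1 = Length - 1;
  return {LenMinus1 >> 8, (LenMinus1 & 0xff) + 1, false};
}

int main() {
  assert(planMemMem(0).SkipEverything);
  assert(planMemMem(1).LoopIterations == 0 && planMemMem(1).RemainderBytes == 1);
  assert(planMemMem(256).LoopIterations == 0 && planMemMem(256).RemainderBytes == 256);
  assert(planMemMem(257).LoopIterations == 1 && planMemMem(257).RemainderBytes == 1);
  for (uint64_t L = 1; L < 2000; ++L) {
    VarLenPlan P = planMemMem(L);
    assert(P.LoopIterations * 256 + P.RemainderBytes == L); // pieces add up
  }
  return 0;
}
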
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 955587d..248efc1 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -422,7 +422,28 @@
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+ unsigned
+ getNumRegisters(LLVMContext &Context, EVT VT,
+ Optional<MVT> RegisterVT) const override {
+ // i128 inline assembly operand.
+ if (VT == MVT::i128 &&
+ RegisterVT.hasValue() && RegisterVT.getValue() == MVT::Untyped)
+ return 1;
+ return TargetLowering::getNumRegisters(Context, VT);
+ }
bool isCheapToSpeculateCtlz() const override { return true; }
+ bool preferZeroCompareBranch() const override { return true; }
+ bool hasBitPreservingFPLogic(EVT VT) const override {
+ EVT ScVT = VT.getScalarType();
+ return ScVT == MVT::f32 || ScVT == MVT::f64 || ScVT == MVT::f128;
+ }
+ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override {
+ ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+ return Mask && Mask->getValue().isIntN(16);
+ }
+ bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+ return VT.isScalarInteger();
+ }
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT) const override;
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
@@ -435,8 +456,7 @@
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I = nullptr) const override;
- bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
- unsigned Align,
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
MachineMemOperand::Flags Flags,
bool *Fast) const override;
bool isTruncateFree(Type *, Type *) const override;
@@ -518,6 +538,15 @@
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
bool allowTruncateForTailCall(Type *, Type *) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+ bool splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Val, SDValue *Parts,
+ unsigned NumParts, MVT PartVT,
+ Optional<CallingConv::ID> CC) const override;
+ SDValue
+ joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL,
+ const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT,
+ Optional<CallingConv::ID> CC) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -553,6 +582,9 @@
ISD::NodeType getExtendForAtomicOps() const override {
return ISD::ANY_EXTEND;
}
+ ISD::NodeType getExtendForAtomicCmpSwapArg() const override {
+ return ISD::ZERO_EXTEND;
+ }
bool supportSwiftError() const override {
return true;
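
getExtendForAtomicCmpSwapArg() returning ZERO_EXTEND works together with the LLCR/LLHR zero-extension and the AssertZext added in the .cpp: the loaded subword field and the expected value must use the same (zero) extension before the full-word CR comparison, or a sign-extended expected value could never match. A small standalone illustration in plain C++; the word layout and values are made up:

#include <cassert>
#include <cstdint>

// Extract an 8- or 16-bit field from a 32-bit word, zero-extended (the role
// LLCR/LLHR play in the patch).
uint32_t extractField(uint32_t Word, unsigned BitPos, unsigned BitSize) {
  return (Word >> BitPos) & ((1u << BitSize) - 1);
}

int main() {
  uint32_t Word = 0x1280ff34;   // the byte of interest is 0x80
  uint32_t Field = extractField(Word, 16, 8);
  assert(Field == 0x80);

  int8_t Expected = -128;       // same bit pattern as 0x80
  uint32_t SignExtended = static_cast<uint32_t>(int32_t(Expected)); // 0xffffff80
  uint32_t ZeroExtended = static_cast<uint8_t>(Expected);           // 0x00000080

  assert(Field != SignExtended); // a full-word compare would spuriously fail
  assert(Field == ZeroExtended); // zero-extending both sides makes it exact
  return 0;
}
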
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index 95e94c4..5cb46cd 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -1438,6 +1438,55 @@
let Inst{7-0} = op{7-0};
}
+class InstVRRj<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V2;
+ bits<5> V3;
+ bits<4> M4;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = V1{3-0};
+ let Inst{35-32} = V2{3-0};
+ let Inst{31-28} = V3{3-0};
+ let Inst{27-24} = 0;
+ let Inst{23-20} = M4;
+ let Inst{19-16} = 0;
+ let Inst{15-12} = 0;
+ let Inst{11} = V1{4};
+ let Inst{10} = V2{4};
+ let Inst{9} = V3{4};
+ let Inst{8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstVRRk<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V2;
+ bits<4> M3;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = V1{3-0};
+ let Inst{35-32} = V2{3-0};
+ let Inst{31-28} = 0;
+ let Inst{27-24} = 0;
+ let Inst{23-20} = M3;
+ let Inst{19-16} = 0;
+ let Inst{15-12} = 0;
+ let Inst{11} = V1{4};
+ let Inst{10} = V2{4};
+ let Inst{9} = 0;
+ let Inst{8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
class InstVRSa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -1845,7 +1894,8 @@
//===----------------------------------------------------------------------===//
// A class to describe a variant of an instruction with condition mask.
-class CondVariant<bits<4> ccmaskin, string suffixin, bit alternatein> {
+class CondVariant<bits<4> ccmaskin, string suffixin, bit alternatein,
+ string asmvariantin = ""> {
// The fixed condition mask to use.
bits<4> ccmask = ccmaskin;
@@ -1854,6 +1904,11 @@
// Whether this is an alternate that needs to be marked isAsmParserOnly.
bit alternate = alternatein;
+
+ // Whether this needs to be restricted to a specific dialect.
+ // Valid values are "att" and "hlasm", which when passed in
+ // will set AsmVariantName.
+ string asmvariant = asmvariantin;
}
// Condition mask 15 means "always true", which is used to define
@@ -1864,20 +1919,20 @@
def CondVariantO : CondVariant<1, "o", 0>;
def CondVariantH : CondVariant<2, "h", 0>;
def CondVariantP : CondVariant<2, "p", 1>;
-def CondVariantNLE : CondVariant<3, "nle", 0>;
+def CondVariantNLE : CondVariant<3, "nle", 0, "att">;
def CondVariantL : CondVariant<4, "l", 0>;
def CondVariantM : CondVariant<4, "m", 1>;
-def CondVariantNHE : CondVariant<5, "nhe", 0>;
-def CondVariantLH : CondVariant<6, "lh", 0>;
+def CondVariantNHE : CondVariant<5, "nhe", 0, "att">;
+def CondVariantLH : CondVariant<6, "lh", 0, "att">;
def CondVariantNE : CondVariant<7, "ne", 0>;
def CondVariantNZ : CondVariant<7, "nz", 1>;
def CondVariantE : CondVariant<8, "e", 0>;
def CondVariantZ : CondVariant<8, "z", 1>;
-def CondVariantNLH : CondVariant<9, "nlh", 0>;
-def CondVariantHE : CondVariant<10, "he", 0>;
+def CondVariantNLH : CondVariant<9, "nlh", 0, "att">;
+def CondVariantHE : CondVariant<10, "he", 0, "att">;
def CondVariantNL : CondVariant<11, "nl", 0>;
def CondVariantNM : CondVariant<11, "nm", 1>;
-def CondVariantLE : CondVariant<12, "le", 0>;
+def CondVariantLE : CondVariant<12, "le", 0, "att">;
def CondVariantNH : CondVariant<13, "nh", 0>;
def CondVariantNP : CondVariant<13, "np", 1>;
def CondVariantNO : CondVariant<14, "no", 0>;
@@ -1886,35 +1941,42 @@
class CV<string name>
: CondVariant<!cast<CondVariant>("CondVariant"#name).ccmask,
!cast<CondVariant>("CondVariant"#name).suffix,
- !cast<CondVariant>("CondVariant"#name).alternate>;
+ !cast<CondVariant>("CondVariant"#name).alternate,
+ !cast<CondVariant>("CondVariant"#name).asmvariant>;
// Condition masks for integer instructions (e.g. compare-and-branch).
// This is like the list above, except that condition 3 is not possible
// and that the low bit of the mask is therefore always 0. This means
// that each condition has two names. Conditions "o" and "no" are not used.
def IntCondVariantH : CondVariant<2, "h", 0>;
-def IntCondVariantNLE : CondVariant<2, "nle", 1>;
+def IntCondVariantNLE : CondVariant<2, "nle", 1, "att">;
def IntCondVariantL : CondVariant<4, "l", 0>;
-def IntCondVariantNHE : CondVariant<4, "nhe", 1>;
-def IntCondVariantLH : CondVariant<6, "lh", 0>;
+def IntCondVariantNHE : CondVariant<4, "nhe", 1, "att">;
+def IntCondVariantLH : CondVariant<6, "lh", 0, "att">;
def IntCondVariantNE : CondVariant<6, "ne", 1>;
def IntCondVariantE : CondVariant<8, "e", 0>;
-def IntCondVariantNLH : CondVariant<8, "nlh", 1>;
-def IntCondVariantHE : CondVariant<10, "he", 0>;
+def IntCondVariantNLH : CondVariant<8, "nlh", 1, "att">;
+def IntCondVariantHE : CondVariant<10, "he", 0, "att">;
def IntCondVariantNL : CondVariant<10, "nl", 1>;
-def IntCondVariantLE : CondVariant<12, "le", 0>;
+def IntCondVariantLE : CondVariant<12, "le", 0, "att">;
def IntCondVariantNH : CondVariant<12, "nh", 1>;
// A helper class to look up one of the above by name.
class ICV<string name>
: CondVariant<!cast<CondVariant>("IntCondVariant"#name).ccmask,
!cast<CondVariant>("IntCondVariant"#name).suffix,
- !cast<CondVariant>("IntCondVariant"#name).alternate>;
+ !cast<CondVariant>("IntCondVariant"#name).alternate,
+ !cast<CondVariant>("IntCondVariant"#name).asmvariant>;
// Defines a class that makes it easier to define
// a MnemonicAlias when CondVariant's are involved.
-class MnemonicCondBranchAlias<CondVariant V, string from, string to>
- : MnemonicAlias<!subst("#", V.suffix, from), !subst("#", V.suffix, to)>;
+multiclass MnemonicCondBranchAlias<CondVariant V, string from, string to,
+ string asmvariant = V.asmvariant> {
+ if !or(!eq(V.asmvariant, ""), !eq(V.asmvariant, asmvariant)) then
+ def "" : MnemonicAlias<!subst("#", V.suffix, from),
+ !subst("#", V.suffix, to),
+ asmvariant>;
+}
//===----------------------------------------------------------------------===//
// Instruction definitions with semantics
@@ -2125,6 +2187,7 @@
: InstRIc<opcode, (outs), (ins brtarget16:$RI2),
!subst("#", V.suffix, mnemonic)#"\t$RI2", [(operator bb:$RI2)]> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M1 = V.ccmask;
}
@@ -2142,6 +2205,7 @@
: InstRILc<opcode, (outs), (ins brtarget32:$RI2),
!subst("#", V.suffix, mnemonic)#"\t$RI2", []> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M1 = V.ccmask;
}
@@ -2160,6 +2224,7 @@
: InstRR<opcode, (outs), (ins ADDR64:$R2),
!subst("#", V.suffix, mnemonic)#"\t$R2", [(operator ADDR64:$R2)]> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let R1 = V.ccmask;
}
@@ -2177,6 +2242,7 @@
: InstRXb<opcode, (outs), (ins bdxaddr12only:$XBD2),
!subst("#", V.suffix, mnemonic)#"\t$XBD2", []> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M1 = V.ccmask;
}
@@ -2199,6 +2265,7 @@
!subst("#", V.suffix, mnemonic)#"\t$XBD2",
[(operator (load bdxaddr20only:$XBD2))]> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M1 = V.ccmask;
let mayLoad = 1;
}
@@ -2218,6 +2285,7 @@
: InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2),
mnemonic#V.suffix#"\t$R1, $I2", []> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
}
@@ -2245,6 +2313,7 @@
: InstRIEb<opcode, (outs), (ins cls:$R1, cls:$R2, brtarget16:$RI4),
mnemonic#V.suffix#"\t$R1, $R2, $RI4", []> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
}
@@ -2272,6 +2341,7 @@
: InstRIEc<opcode, (outs), (ins cls:$R1, imm:$I2, brtarget16:$RI4),
mnemonic#V.suffix#"\t$R1, $I2, $RI4", []> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
}
@@ -2304,6 +2374,7 @@
: InstRRFc<opcode, (outs), (ins cls:$R1, cls:$R2),
mnemonic#V.suffix#"\t$R1, $R2", []> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
}
@@ -2324,6 +2395,7 @@
: InstRRS<opcode, (outs), (ins cls:$R1, cls:$R2, bdaddr12only:$BD4),
mnemonic#V.suffix#"\t$R1, $R2, $BD4", []> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
}
@@ -2351,6 +2423,7 @@
: InstRIS<opcode, (outs), (ins cls:$R1, imm:$I2, bdaddr12only:$BD4),
mnemonic#V.suffix#"\t$R1, $I2, $BD4", []> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
}
@@ -2383,6 +2456,7 @@
: InstRSYb<opcode, (outs), (ins cls:$R1, bdaddr20only:$BD2),
mnemonic#V.suffix#"\t$R1, $BD2", []> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
}
@@ -2717,6 +2791,7 @@
let mayStore = 1;
let AccessBytes = bytes;
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
}
@@ -2754,6 +2829,16 @@
let AccessBytes = bytes;
}
+class SideEffectUnarySIY<string mnemonic, bits<16> opcode,
+ bits<5> bytes,
+ AddressingMode mode = bdaddr20only>
+ : InstSIY<opcode, (outs), (ins mode:$BD1),
+ mnemonic#"\t$BD1", []> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+ let I2 = 0;
+}
+
class SideEffectAddressS<string mnemonic, bits<16> opcode,
SDPatternOperator operator,
AddressingMode mode = bdaddr12only>
@@ -2891,6 +2976,7 @@
let mayLoad = 1;
let AccessBytes = bytes;
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
}
@@ -3294,6 +3380,7 @@
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
}
@@ -3332,6 +3419,7 @@
: InstRRFa<opcode, (outs cls1:$R1), (ins cls3:$R3, cls2:$R2),
mnemonic#V.suffix#"\t$R1, $R2, $R3", []> {
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M4 = V.ccmask;
}
@@ -3401,6 +3489,7 @@
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
let isAsmParserOnly = V.alternate;
+ let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
}
@@ -3630,6 +3719,17 @@
let M5 = modifier;
}
+class BinaryExtraVRRb<string mnemonic, bits<16> opcode, bits<4> type = 0>
+ : InstVRRb<opcode, (outs VR128:$V1), (ins VR128:$V2, VR128:$V3, imm32zx4:$M5),
+ mnemonic#"\t$V1, $V2, $V3, $M5", []> {
+ let M4 = type;
+}
+
+class BinaryExtraVRRbGeneric<string mnemonic, bits<16> opcode>
+ : InstVRRb<opcode, (outs VR128:$V1),
+ (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5),
+ mnemonic#"\t$V1, $V2, $V3, $M4, $M5", []>;
+
// Declare a pair of instructions, one which sets CC and one which doesn't.
// The CC-setting form ends with "S" and sets the low bit of M5.
multiclass BinaryVRRbSPair<string mnemonic, bits<16> opcode,
@@ -3743,6 +3843,10 @@
let M4 = 0;
}
+class BinaryVRRk<string mnemonic, bits<16> opcode>
+ : InstVRRk<opcode, (outs VR128:$V1), (ins VR128:$V2, imm32zx4:$M3),
+ mnemonic#"\t$V1, $V2, $M3", []>;
+
class BinaryVRSa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type>
: InstVRSa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V3, shift12only:$BD2),
@@ -4504,6 +4608,11 @@
imm32zx4:$M3, imm32zx4:$M4),
mnemonic#"\t$R1, $V2, $M3, $M4", []>;
+class TernaryVRRj<string mnemonic, bits<16> opcode>
+ : InstVRRj<opcode, (outs VR128:$V1), (ins VR128:$V2,
+ VR128:$V3, imm32zx4:$M4),
+ mnemonic#"\t$V1, $V2, $V3, $M4", []>;
+
class TernaryVRSbGeneric<string mnemonic, bits<16> opcode>
: InstVRSb<opcode, (outs VR128:$V1),
(ins VR128:$V1src, GR64:$R3, shift12only:$BD2, imm32zx4:$M4),
@@ -5223,6 +5332,7 @@
// The Sequence form uses a straight-line sequence of instructions and
// the Loop form uses a loop of length-256 instructions followed by
// another instruction to handle the excess.
+// The LoopVarLen form is for a loop with a non-constant length parameter.
multiclass MemorySS<string mnemonic, bits<8> opcode,
SDPatternOperator sequence, SDPatternOperator loop> {
def "" : SideEffectBinarySSa<mnemonic, opcode>;
@@ -5235,6 +5345,10 @@
imm64:$length, GR64:$count256),
[(loop bdaddr12only:$dest, bdaddr12only:$src,
imm64:$length, GR64:$count256)]>;
+ def LoopVarLen : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ GR64:$length, GR64:$count256),
+ [(loop bdaddr12only:$dest, bdaddr12only:$src,
+ GR64:$length, GR64:$count256)]>;
}
}
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index bf01c26..b9f6419 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -56,9 +56,9 @@
void SystemZInstrInfo::anchor() {}
SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti)
- : SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP),
- RI(), STI(sti) {
-}
+ : SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP),
+ RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister()),
+ STI(sti) {}
// MI is a 128-bit load or store. Split it into two 64-bit loads or stores,
// each having the opcode given by NewOpcode.
@@ -120,7 +120,7 @@
MachineOperand &OffsetMO = MI->getOperand(2);
uint64_t Offset = (MFFrame.getMaxCallFrameSize() +
- SystemZMC::CallFrameSize +
+ SystemZMC::ELFCallFrameSize +
OffsetMO.getImm());
unsigned NewOpcode = getOpcodeForOffset(SystemZ::LA, Offset);
assert(NewOpcode && "No support for huge argument lists yet");
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index 6e4f9e7..7df7cc9 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -6,6 +6,9 @@
//
//===----------------------------------------------------------------------===//
+def IsTargetXPLINK64 : Predicate<"Subtarget->isTargetXPLINK64()">;
+def IsTargetELF : Predicate<"Subtarget->isTargetELF()">;
+
//===----------------------------------------------------------------------===//
// Stack allocation
//===----------------------------------------------------------------------===//
@@ -78,7 +81,7 @@
foreach V = [ "E", "NE", "H", "NH", "L", "NL", "HE", "NHE", "LE", "NLE",
"Z", "NZ", "P", "NP", "M", "NM", "LH", "NLH", "O", "NO" ] in {
def JAsm#V : FixedCondBranchRI <CV<V>, "j#", 0xA74>;
- def JGAsm#V : FixedCondBranchRIL<CV<V>, "jg#", 0xC04>;
+ def JGAsm#V : FixedCondBranchRIL<CV<V>, "j{g|l}#", 0xC04>;
let isIndirectBranch = 1 in {
def BAsm#V : FixedCondBranchRX <CV<V>, "b#", 0x47>;
def BRAsm#V : FixedCondBranchRR <CV<V>, "b#r", 0x07>;
@@ -92,7 +95,7 @@
// conditional branches with the condition mask set to "always".
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
def J : FixedCondBranchRI <CondAlways, "j", 0xA74, br>;
- def JG : FixedCondBranchRIL<CondAlways, "jg", 0xC04>;
+ def JG : FixedCondBranchRIL<CondAlways, "j{g|lu}", 0xC04>;
let isIndirectBranch = 1 in {
def B : FixedCondBranchRX<CondAlways, "b", 0x47>;
def BR : FixedCondBranchRR<CondAlways, "br", 0x07, brind>;
@@ -114,7 +117,8 @@
def JNOP : InstAlias<"jnop\t$RI2", (BRCAsm 0, brtarget16:$RI2), 0>;
// An alias of BRCL 0, label
-def JGNOP : InstAlias<"jgnop\t$RI2", (BRCLAsm 0, brtarget32:$RI2), 0>;
+// jgnop on att ; jlnop on hlasm
+def JGNOP : InstAlias<"{jgnop|jlnop}\t$RI2", (BRCLAsm 0, brtarget32:$RI2), 0>;
// Fused compare-and-branch instructions.
//
@@ -273,24 +277,38 @@
def BASR : CallRR <"basr", 0x0D>;
}
+// z/OS XPLINK
+let Predicates = [IsTargetXPLINK64] in {
+ let isCall = 1, Defs = [R7D, CC], Uses = [FPC] in {
+ def CallBRASL_XPLINK64 : Alias<8, (outs), (ins pcrel32:$I2, variable_ops),
+ [(z_call pcrel32:$I2)]>;
+ def CallBASR_XPLINK64 : Alias<4, (outs), (ins ADDR64:$R2, variable_ops),
+ [(z_call ADDR64:$R2)]>;
+ }
+}
+
// Regular calls.
-let isCall = 1, Defs = [R14D, CC], Uses = [FPC] in {
- def CallBRASL : Alias<6, (outs), (ins pcrel32:$I2, variable_ops),
- [(z_call pcrel32:$I2)]>;
- def CallBASR : Alias<2, (outs), (ins ADDR64:$R2, variable_ops),
- [(z_call ADDR64:$R2)]>;
+// z/Linux ELF
+let Predicates = [IsTargetELF] in {
+ let isCall = 1, Defs = [R14D, CC], Uses = [FPC] in {
+ def CallBRASL : Alias<6, (outs), (ins pcrel32:$I2, variable_ops),
+ [(z_call pcrel32:$I2)]>;
+ def CallBASR : Alias<2, (outs), (ins ADDR64:$R2, variable_ops),
+ [(z_call ADDR64:$R2)]>;
+ }
+
+ // TLS calls. These will be lowered into a call to __tls_get_offset,
+ // with an extra relocation specifying the TLS symbol.
+ let isCall = 1, Defs = [R14D, CC] in {
+ def TLS_GDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops),
+ [(z_tls_gdcall tglobaltlsaddr:$I2)]>;
+ def TLS_LDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops),
+ [(z_tls_ldcall tglobaltlsaddr:$I2)]>;
+ }
}
-// TLS calls. These will be lowered into a call to __tls_get_offset,
-// with an extra relocation specifying the TLS symbol.
-let isCall = 1, Defs = [R14D, CC] in {
- def TLS_GDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops),
- [(z_tls_gdcall tglobaltlsaddr:$I2)]>;
- def TLS_LDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops),
- [(z_tls_ldcall tglobaltlsaddr:$I2)]>;
-}
-
-// Sibling calls.
+// Sibling calls. Indirect sibling calls must be via R6 for XPLink,
+// while R1 is used for ELF.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
def CallJG : Alias<6, (outs), (ins pcrel32:$I2),
[(z_sibcall pcrel32:$I2)]>;
@@ -318,7 +336,7 @@
def CLGIBCall : Alias<6, (outs), (ins GR64:$R1, imm64zx8:$I2, cond4:$M3, ADDR64:$R4), []>;
}
-// A return instruction (br %r14).
+// A return instruction (br %r14) for ELF and (b 2 %r7) for XPLink.
let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
def Return : Alias<2, (outs), (ins), [(z_retflag)]>;
@@ -2145,10 +2163,19 @@
def DFLTCC : SideEffectTernaryMemMemRRFa<"dfltcc", 0xB939,
GR128, GR128, GR64>;
+// NNPA.
+let Predicates = [FeatureNNPAssist],
+ mayLoad = 1, mayStore = 1, Defs = [R0D, CC], Uses = [R0D, R1D] in
+ def NNPA : SideEffectInherentRRE<"nnpa", 0xB93B>;
+
// Execute.
let hasSideEffects = 1 in {
- def EX : SideEffectBinaryRX<"ex", 0x44, GR64>;
- def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, GR64>;
+ def EX : SideEffectBinaryRX<"ex", 0x44, ADDR64>;
+ def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, ADDR64>;
+ let hasNoSchedulingInfo = 1 in
+ def EXRL_Pseudo : Pseudo<(outs), (ins i64imm:$TargetOpc, ADDR64:$lenMinus1,
+ bdaddr12only:$bdl1, bdaddr12only:$bd2),
+ []>;
}
//===----------------------------------------------------------------------===//
@@ -2358,10 +2385,12 @@
def JXLEG : MnemonicAlias<"jxleg", "brxlg">;
def BRU : MnemonicAlias<"bru", "j">;
-def BRUL : MnemonicAlias<"brul", "jg">;
+def BRUL : MnemonicAlias<"brul", "jg", "att">;
+def BRUL_HLASM : MnemonicAlias<"brul", "jlu", "hlasm">;
foreach V = [ "E", "NE", "H", "NH", "L", "NL", "HE", "NHE", "LE", "NLE",
"Z", "NZ", "P", "NP", "M", "NM", "LH", "NLH", "O", "NO" ] in {
- def BRUAsm#V : MnemonicCondBranchAlias <CV<V>, "br#", "j#">;
- def BRULAsm#V : MnemonicCondBranchAlias <CV<V>, "br#l", "jg#">;
+ defm BRUAsm#V : MnemonicCondBranchAlias <CV<V>, "br#", "j#">;
+ defm BRULAsm#V : MnemonicCondBranchAlias <CV<V>, "br#l", "jg#", "att">;
+ defm BRUL_HLASMAsm#V : MnemonicCondBranchAlias <CV<V>, "br#l", "jl#", "hlasm">;
}
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrSystem.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrSystem.td
index ecce16c..e26417d 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrSystem.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrSystem.td
@@ -26,6 +26,8 @@
def LPSW : SideEffectUnaryS<"lpsw", 0x8200, null_frag, 8>;
def LPSWE : SideEffectUnaryS<"lpswe", 0xB2B2, null_frag, 16>;
}
+let Predicates = [FeatureBEAREnhancement], hasSideEffects = 1, Defs = [CC] in
+ def LPSWEY : SideEffectUnarySIY<"lpswey", 0xEB71, 16>;
// Insert PSW key.
let Uses = [R2L], Defs = [R2L] in
@@ -104,6 +106,20 @@
def STPX : StoreInherentS<"stpx", 0xB211, null_frag, 4>;
//===----------------------------------------------------------------------===//
+// Breaking-Event-Address-Register Instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureBEAREnhancement] in {
+ // Load BEAR.
+ let hasSideEffects = 1 in
+ def LBEAR : SideEffectUnaryS<"lbear", 0xB200, null_frag, 8>;
+
+ // Store BEAR.
+ let hasSideEffects = 1 in
+ def STBEAR : StoreInherentS<"stbear", 0xB201, null_frag, 8>;
+}
+
+//===----------------------------------------------------------------------===//
// Storage-Key and Real Memory Instructions.
//===----------------------------------------------------------------------===//
@@ -157,6 +173,10 @@
let hasSideEffects = 1 in
defm IDTE : SideEffectQuaternaryRRFbOpt<"idte", 0xB98E, GR64, GR64, GR64>;
+// Reset DAT protection.
+let Predicates = [FeatureResetDATProtection], hasSideEffects = 1 in
+ defm RDP : SideEffectQuaternaryRRFbOpt<"rdp", 0xB98B, GR64, GR64, GR64>;
+
// Compare and replace DAT table entry.
let Predicates = [FeatureEnhancedDAT2], hasSideEffects = 1, Defs = [CC] in
defm CRDTE : SideEffectQuaternaryRRFbOpt<"crdte", 0xB98F, GR128, GR128, GR64>;
@@ -372,6 +392,11 @@
hasSideEffects = 1, Uses = [R0L, R1D] in
def PCKMO : SideEffectInherentRRE<"pckmo", 0xB928>;
+// Query processor activity counter information.
+let Predicates = [FeatureProcessorActivityInstrumentation],
+ hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in
+ def QPACI : StoreInherentS<"qpaci", 0xB28F, null_frag, 0>;
+
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//===----------------------------------------------------------------------===//
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index a85eb16..2e9524a 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -1750,6 +1750,37 @@
}
//===----------------------------------------------------------------------===//
+// NNP assist instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector, FeatureNNPAssist] in {
+ let Uses = [FPC], mayRaiseFPException = 1 in
+ def VCFN : UnaryVRRaFloatGeneric<"vcfn", 0xE65D>;
+ def : Pat<(int_s390_vcfn VR128:$x, imm32zx4_timm:$m),
+ (VCFN VR128:$x, 1, imm32zx4:$m)>;
+
+ let Uses = [FPC], mayRaiseFPException = 1 in
+ def VCLFNL : UnaryVRRaFloatGeneric<"vclfnl", 0xE65E>;
+ def : Pat<(int_s390_vclfnls VR128:$x, imm32zx4_timm:$m),
+ (VCLFNL VR128:$x, 2, imm32zx4:$m)>;
+
+ let Uses = [FPC], mayRaiseFPException = 1 in
+ def VCLFNH : UnaryVRRaFloatGeneric<"vclfnh", 0xE656>;
+ def : Pat<(int_s390_vclfnhs VR128:$x, imm32zx4_timm:$m),
+ (VCLFNH VR128:$x, 2, imm32zx4:$m)>;
+
+ let Uses = [FPC], mayRaiseFPException = 1 in
+ def VCNF : UnaryVRRaFloatGeneric<"vcnf", 0xE655>;
+ def : Pat<(int_s390_vcnf VR128:$x, imm32zx4_timm:$m),
+ (VCNF VR128:$x, imm32zx4:$m, 1)>;
+
+ let Uses = [FPC], mayRaiseFPException = 1 in
+ def VCRNF : BinaryVRRcFloatGeneric<"vcrnf", 0xE675>;
+ def : Pat<(int_s390_vcrnfs VR128:$x, VR128:$y, imm32zx4_timm:$m),
+ (VCRNF VR128:$x, VR128:$y, imm32zx4:$m, 2)>;
+}
+
+//===----------------------------------------------------------------------===//
// Packed-decimal instructions
//===----------------------------------------------------------------------===//
@@ -1786,3 +1817,26 @@
def VCP : CompareVRRh<"vcp", 0xE677>;
}
}
+
+let Predicates = [FeatureVectorPackedDecimalEnhancement2] in {
+ def VSCHP : BinaryExtraVRRbGeneric<"vschp", 0xE674>;
+ def VSCHSP : BinaryExtraVRRb<"vschsp", 0xE674, 2>;
+ def VSCHDP : BinaryExtraVRRb<"vschdp", 0xE674, 3>;
+ def VSCHXP : BinaryExtraVRRb<"vschxp", 0xE674, 4>;
+
+ def VSCSHP : BinaryVRRb<"vscshp", 0xE67C, null_frag, v128b, v128b>;
+
+ def VCSPH : TernaryVRRj<"vcsph", 0xE67D>;
+
+ let Defs = [CC] in
+ def VCLZDP : BinaryVRRk<"vclzdp", 0xE651>;
+
+ let Defs = [CC] in
+ def VSRPR : QuaternaryVRIf<"vsrpr", 0xE672>;
+
+ let Defs = [CC] in {
+ def VPKZR : QuaternaryVRIf<"vpkzr", 0xE670>;
+ def VUPKZH : BinaryVRRk<"vupkzh", 0xE654>;
+ def VUPKZL : BinaryVRRk<"vupkzl", 0xE65C>;
+ }
+}
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZProcessors.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZProcessors.td
index 57c2411..4fceaa1 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZProcessors.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZProcessors.td
@@ -38,3 +38,5 @@
def : ProcessorModel<"arch13", Z15Model, Arch13SupportedFeatures.List>;
def : ProcessorModel<"z15", Z15Model, Arch13SupportedFeatures.List>;
+def : ProcessorModel<"arch14", Z15Model, Arch14SupportedFeatures.List>;
+
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 5139cc3..0062e39 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -9,21 +9,19 @@
#include "SystemZRegisterInfo.h"
#include "SystemZInstrInfo.h"
#include "SystemZSubtarget.h"
-#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/DebugInfoMetadata.h"
using namespace llvm;
#define GET_REGINFO_TARGET_DESC
#include "SystemZGenRegisterInfo.inc"
-SystemZRegisterInfo::SystemZRegisterInfo()
- : SystemZGenRegisterInfo(SystemZ::R14D) {}
-
// Given that MO is a GRX32 operand, return either GR32 or GRH32 if MO
// somehow belongs in it. Otherwise, return GRX32.
static const TargetRegisterClass *getRC32(MachineOperand &MO,
@@ -191,7 +189,12 @@
}
const MCPhysReg *
-SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+SystemZXPLINK64Registers::getCalleeSavedRegs(const MachineFunction *MF) const {
+ return CSR_SystemZ_XPLINK64_SaveList;
+}
+
+const MCPhysReg *
+SystemZELFRegisters::getCalleeSavedRegs(const MachineFunction *MF) const {
const SystemZSubtarget &Subtarget = MF->getSubtarget<SystemZSubtarget>();
if (MF->getFunction().getCallingConv() == CallingConv::GHC)
return CSR_SystemZ_NoRegs_SaveList;
@@ -202,11 +205,17 @@
MF->getFunction().getAttributes().hasAttrSomewhere(
Attribute::SwiftError))
return CSR_SystemZ_SwiftError_SaveList;
- return CSR_SystemZ_SaveList;
+ return CSR_SystemZ_ELF_SaveList;
}
const uint32_t *
-SystemZRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+SystemZXPLINK64Registers::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ return CSR_SystemZ_XPLINK64_RegMask;
+}
+
+const uint32_t *
+SystemZELFRegisters::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
if (CC == CallingConv::GHC)
@@ -218,27 +227,46 @@
MF.getFunction().getAttributes().hasAttrSomewhere(
Attribute::SwiftError))
return CSR_SystemZ_SwiftError_RegMask;
- return CSR_SystemZ_RegMask;
+ return CSR_SystemZ_ELF_RegMask;
+}
+
+SystemZRegisterInfo::SystemZRegisterInfo(unsigned int RA)
+ : SystemZGenRegisterInfo(RA) {}
+
+const MCPhysReg *
+SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+
+ const SystemZSubtarget *Subtarget = &MF->getSubtarget<SystemZSubtarget>();
+ SystemZCallingConventionRegisters *Regs = Subtarget->getSpecialRegisters();
+
+ return Regs->getCalleeSavedRegs(MF);
+}
+
+const uint32_t *
+SystemZRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+
+ const SystemZSubtarget *Subtarget = &MF.getSubtarget<SystemZSubtarget>();
+ SystemZCallingConventionRegisters *Regs = Subtarget->getSpecialRegisters();
+ return Regs->getCallPreservedMask(MF, CC);
}
BitVector
SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const SystemZFrameLowering *TFI = getFrameLowering(MF);
+ const SystemZSubtarget *Subtarget = &MF.getSubtarget<SystemZSubtarget>();
+ SystemZCallingConventionRegisters *Regs = Subtarget->getSpecialRegisters();
+ if (TFI->hasFP(MF))
+ // The frame pointer. Reserve all aliases.
+ for (MCRegAliasIterator AI(Regs->getFramePointerRegister(), this, true);
+ AI.isValid(); ++AI)
+ Reserved.set(*AI);
- if (TFI->hasFP(MF)) {
- // R11D is the frame pointer. Reserve all aliases.
- Reserved.set(SystemZ::R11D);
- Reserved.set(SystemZ::R11L);
- Reserved.set(SystemZ::R11H);
- Reserved.set(SystemZ::R10Q);
- }
-
- // R15D is the stack pointer. Reserve all aliases.
- Reserved.set(SystemZ::R15D);
- Reserved.set(SystemZ::R15L);
- Reserved.set(SystemZ::R15H);
- Reserved.set(SystemZ::R14Q);
+ // Reserve all aliases for the stack pointer.
+ for (MCRegAliasIterator AI(Regs->getStackPointerRegister(), this, true);
+ AI.isValid(); ++AI)
+ Reserved.set(*AI);
// A0 and A1 hold the thread pointer.
Reserved.set(SystemZ::A0);
@@ -273,7 +301,16 @@
// Special handling of dbg_value instructions.
if (MI->isDebugValue()) {
MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, /*isDef*/ false);
- MI->getDebugOffset().ChangeToImmediate(Offset);
+ if (MI->isNonListDebugValue()) {
+ MI->getDebugOffset().ChangeToImmediate(Offset);
+ } else {
+ unsigned OpIdx = MI->getDebugOperandIndex(&MI->getOperand(FIOperandNum));
+ SmallVector<uint64_t, 3> Ops;
+ DIExpression::appendOffset(
+ Ops, TFI->getFrameIndexReference(MF, FrameIndex, BasePtr).getFixed());
+ MI->getDebugExpressionOp().setMetadata(
+ DIExpression::appendOpsToArg(MI->getDebugExpression(), Ops, OpIdx));
+ }
return;
}
@@ -410,7 +447,11 @@
Register
SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const SystemZFrameLowering *TFI = getFrameLowering(MF);
- return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D;
+ const SystemZSubtarget *Subtarget = &MF.getSubtarget<SystemZSubtarget>();
+ SystemZCallingConventionRegisters *Regs = Subtarget->getSpecialRegisters();
+
+ return TFI->hasFP(MF) ? Regs->getFramePointerRegister()
+ : Regs->getStackPointerRegister();
}
const TargetRegisterClass *
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
index 9f2cca0..122504d 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -39,9 +39,84 @@
}
} // end namespace SystemZ
+/// A SystemZ-specific class detailing the special-use registers
+/// particular to each calling convention.
+/// It is abstract; every calling convention must override and
+/// define the pure virtual member functions declared in this class.
+class SystemZCallingConventionRegisters {
+public:
+ /// \returns the register that keeps the
+ /// return function address.
+ virtual int getReturnFunctionAddressRegister() = 0;
+
+ /// \returns the register that keeps the
+ /// stack pointer address.
+ virtual int getStackPointerRegister() = 0;
+
+ /// \returns the register that keeps the
+ /// frame pointer address.
+ virtual int getFramePointerRegister() = 0;
+
+ /// \returns an array of all the callee saved registers.
+ virtual const MCPhysReg *
+ getCalleeSavedRegs(const MachineFunction *MF) const = 0;
+
+ /// \returns the mask of all the call preserved registers.
+ virtual const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const = 0;
+
+ /// Destroys the object. Bogus destructor allowing derived classes
+ /// to override it.
+ virtual ~SystemZCallingConventionRegisters(){};
+};
+
+/// XPLINK64 calling convention specific use registers
+/// Particular to z/OS when in 64 bit mode
+class SystemZXPLINK64Registers : public SystemZCallingConventionRegisters {
+public:
+ int getReturnFunctionAddressRegister() override final {
+ return SystemZ::R7D;
+ };
+
+ int getStackPointerRegister() override final { return SystemZ::R4D; };
+
+ int getFramePointerRegister() override final { return SystemZ::R8D; };
+
+ const MCPhysReg *
+ getCalleeSavedRegs(const MachineFunction *MF) const override final;
+
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const override final;
+
+ /// Destroys the object. Bogus destructor overriding base class destructor
+ ~SystemZXPLINK64Registers(){};
+};
+
+/// ELF calling convention specific use registers
+/// Particular to zLinux when in 64 bit mode
+class SystemZELFRegisters : public SystemZCallingConventionRegisters {
+public:
+ int getReturnFunctionAddressRegister() override final {
+ return SystemZ::R14D;
+ };
+
+ int getStackPointerRegister() override final { return SystemZ::R15D; };
+
+ int getFramePointerRegister() override final { return SystemZ::R11D; };
+
+ const MCPhysReg *
+ getCalleeSavedRegs(const MachineFunction *MF) const override final;
+
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const override final;
+
+ /// Destroys the object. Bogus destructor overriding base class destructor
+ ~SystemZELFRegisters(){};
+};
+
struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
public:
- SystemZRegisterInfo();
+ SystemZRegisterInfo(unsigned int RA);
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is currently only used by LOAD_STACK_GUARD, which requires a non-%r0
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
index a85862e..00005a6 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -31,8 +31,10 @@
// Define a register class that contains values of types TYPES and an
// associated operand called NAME. SIZE is the size and alignment
// of the registers and REGLIST is the list of individual registers.
+// If the user provides an alternate order list of registers, it will be used
+// for XPLINK. Otherwise, by default, XPLINK uses the regList ordering as well.
multiclass SystemZRegClass<string name, list<ValueType> types, int size,
- dag regList, bit allocatable = 1> {
+ dag regList, list<dag> altRegList = [regList], bit allocatable = 1> {
def AsmOperand : AsmOperandClass {
let Name = name;
let ParserMethod = "parse"#name;
@@ -41,6 +43,11 @@
let isAllocatable = allocatable in
def Bit : RegisterClass<"SystemZ", types, size, regList> {
let Size = size;
+ let AltOrders = altRegList;
+ let AltOrderSelect = [{
+ const SystemZSubtarget &S = MF.getSubtarget<SystemZSubtarget>();
+ return S.isTargetXPLINK64();
+ }];
}
def "" : RegisterOperand<!cast<RegisterClass>(name#"Bit")> {
let ParserMatchClass = !cast<AsmOperandClass>(name#"AsmOperand");
@@ -85,40 +92,58 @@
!cast<GPR64>("R"#I#"D")>;
}
-/// Allocate the callee-saved R6-R13 backwards. That way they can be saved
-/// together with R14 and R15 in one prolog instruction.
+/// zLinux: Allocate the callee-saved R6-R13 backwards. That way they can be
+/// saved together with R14 and R15 in one prolog instruction.
+/// XPLINK64: Allocate all registers in natural order
defm GR32 : SystemZRegClass<"GR32", [i32], 32,
(add (sequence "R%uL", 0, 5),
- (sequence "R%uL", 15, 6))>;
+ (sequence "R%uL", 15, 6)),
+ [(add (sequence "R%uL", 0, 15))]>;
defm GRH32 : SystemZRegClass<"GRH32", [i32], 32,
(add (sequence "R%uH", 0, 5),
- (sequence "R%uH", 15, 6))>;
+ (sequence "R%uH", 15, 6)),
+ [(add (sequence "R%uH", 0, 15))]>;
defm GR64 : SystemZRegClass<"GR64", [i64], 64,
(add (sequence "R%uD", 0, 5),
- (sequence "R%uD", 15, 6))>;
+ (sequence "R%uD", 15, 6)),
+ [(add (sequence "R%uD", 0, 15))]>;
// Combine the low and high GR32s into a single class. This can only be
// used for virtual registers if the high-word facility is available.
+/// XPLINK64: Allocate all registers in natural order
defm GRX32 : SystemZRegClass<"GRX32", [i32], 32,
(add (sequence "R%uL", 0, 5),
(sequence "R%uH", 0, 5),
R15L, R15H, R14L, R14H, R13L, R13H,
R12L, R12H, R11L, R11H, R10L, R10H,
- R9L, R9H, R8L, R8H, R7L, R7H, R6L, R6H)>;
+ R9L, R9H, R8L, R8H, R7L, R7H, R6L, R6H),
+ [(add
+ R0L, R1L, R2L, R3L, R0H, R1H, R2H, R3H,
+ R4L, R4H, R5L, R5H, R6L, R6H, R7L, R7H,
+ R8L, R8H, R9L, R9H, R10L,R10H,R11L,R11H,
+ R12L,R12H,R13L,R13H,R14L,R14H,R15L,R15H)
+ ]>;
// The architecture doesn't really have any i128 support, so model the
// register pairs as untyped instead.
+// XPLINK64: Allocate all registers in natural order
defm GR128 : SystemZRegClass<"GR128", [untyped], 128,
- (add R0Q, R2Q, R4Q, R12Q, R10Q, R8Q, R6Q, R14Q)>;
+ (add R0Q, R2Q, R4Q, R12Q, R10Q, R8Q, R6Q, R14Q),
+ [(add R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q)]>;
// Base and index registers. Everything except R0, which in an address
// context evaluates as 0.
-defm ADDR32 : SystemZRegClass<"ADDR32", [i32], 32, (sub GR32Bit, R0L)>;
-defm ADDR64 : SystemZRegClass<"ADDR64", [i64], 64, (sub GR64Bit, R0D)>;
+// XPLINK64: Allocate all registers in natural order
+defm ADDR32 : SystemZRegClass<"ADDR32", [i32], 32, (sub GR32Bit, R0L),
+ [(add (sequence "R%uL", 1, 15))]>;
+defm ADDR64 : SystemZRegClass<"ADDR64", [i64], 64, (sub GR64Bit, R0D),
+ [(add (sequence "R%uD", 1, 15))]>;
// Not used directly, but needs to exist for ADDR32 and ADDR64 subregs
// of a GR128.
-defm ADDR128 : SystemZRegClass<"ADDR128", [untyped], 128, (sub GR128Bit, R0Q)>;
+// XPLINK64: Allocate all registers in natural order
+defm ADDR128 : SystemZRegClass<"ADDR128", [untyped], 128, (sub GR128Bit, R0Q),
+ [(add R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q)]>;
// Any type register. Used for .insn directives when we don't know what the
// register types could be.
@@ -126,7 +151,8 @@
[i64, f64, v8i8, v4i16, v2i32, v2f32], 64,
(add (sequence "R%uD", 0, 15),
(sequence "F%uD", 0, 15),
- (sequence "V%u", 0, 15)), 0/*allocatable*/>;
+ (sequence "V%u", 0, 15)),
+ [], 0/*allocatable*/>;
//===----------------------------------------------------------------------===//
// Floating-point registers
@@ -310,7 +336,7 @@
def A#I : ACR32<I, "a"#I>, DwarfRegNum<[!add(I, 48)]>;
}
defm AR32 : SystemZRegClass<"AR32", [i32], 32,
- (add (sequence "A%u", 0, 15)), 0>;
+ (add (sequence "A%u", 0, 15)), [], 0>;
// Control registers.
class CREG64<bits<16> num, string n> : SystemZReg<n> {
@@ -320,5 +346,4 @@
def C#I : CREG64<I, "c"#I>, DwarfRegNum<[!add(I, 32)]>;
}
defm CR64 : SystemZRegClass<"CR64", [i64], 64,
- (add (sequence "C%u", 0, 15)), 0>;
-
+ (add (sequence "C%u", 0, 15)), [], 0>;
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index de49106..f4777b0 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -167,8 +167,8 @@
// Call
def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL(_XPLINK64)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64)?$")>;
def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index 5ea269c..f74c0d5 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -168,8 +168,8 @@
// Call
def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL(_XPLINK64)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64)?$")>;
def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
index 6a28aec..d17e58f 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -168,8 +168,8 @@
// Call
def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL(_XPLINK64)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64)?$")>;
def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
index 9a30659..0f01a42 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -146,8 +146,8 @@
// Call
def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BRASL(_XPLINK64)?$")>;
+def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64)?$")>;
def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
index f3ff1df..096a95a 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -151,8 +151,8 @@
// Call
def : InstRW<[WLat1, FXU2, VBU, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "(Call)?BRASL(_XPLINK64)?$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64)?$")>;
def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index ca5ca72..4a9ea69 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -81,11 +81,12 @@
if (IsVolatile)
return SDValue();
+ auto *CByte = dyn_cast<ConstantSDNode>(Byte);
if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
uint64_t Bytes = CSize->getZExtValue();
if (Bytes == 0)
return SDValue();
- if (auto *CByte = dyn_cast<ConstantSDNode>(Byte)) {
+ if (CByte) {
// Handle cases that can be done using at most two of
// MVI, MVHI, MVHHI and MVGHI. The latter two can only be
// used if ByteVal is all zeros or all ones; in other cases,
@@ -125,7 +126,6 @@
assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
// Handle the special case of a memset of 0, which can use XC.
- auto *CByte = dyn_cast<ConstantSDNode>(Byte);
if (CByte && CByte->getZExtValue() == 0)
return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
Chain, Dst, Dst, Bytes);
@@ -138,6 +138,18 @@
return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
Chain, DstPlus1, Dst, Bytes - 1);
}
+
+ // Variable length
+ if (CByte && CByte->getZExtValue() == 0) {
+ // Handle the special case of a variable length memset of 0 with XC.
+ SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
+ DAG.getZExtOrTrunc(Size, DL, MVT::i64),
+ DAG.getConstant(-1, DL, MVT::i64));
+ SDValue TripC = DAG.getNode(ISD::SRL, DL, MVT::i64, LenMinus1,
+ DAG.getConstant(8, DL, MVT::i64));
+ return DAG.getNode(SystemZISD::XC_LOOP, DL, MVT::Other, Chain, Dst, Dst,
+ LenMinus1, TripC);
+ }
return SDValue();
}
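The variable-length memset-of-zero path above only derives two values from the runtime Size before emitting XC_LOOP: the length minus one and the count of full 256-byte iterations. A minimal standalone sketch of that arithmetic (the Size value is illustrative; plain C++ stands in for the SelectionDAG nodes):

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Size = 600;                         // illustrative runtime length
  uint64_t LenMinus1 = Size - 1;               // 599, the ISD::ADD of -1 above
  uint64_t TripCount = LenMinus1 >> 8;         // 2 full 256-byte XC iterations
  uint64_t Remainder = (LenMinus1 & 0xff) + 1; // 88 bytes remain afterwards
  std::printf("%llu bytes = %llu * 256 + %llu\n", (unsigned long long)Size,
              (unsigned long long)TripCount, (unsigned long long)Remainder);
  return 0;
}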
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index d24e264..bfcdee2 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -45,11 +45,24 @@
HasVectorEnhancements2 = false;
HasVectorPackedDecimal = false;
HasVectorPackedDecimalEnhancement = false;
+ HasVectorPackedDecimalEnhancement2 = false;
}
return *this;
}
+SystemZCallingConventionRegisters *
+SystemZSubtarget::initializeSpecialRegisters() {
+ if (isTargetXPLINK64())
+ return new SystemZXPLINK64Registers;
+ else if (isTargetELF())
+ return new SystemZELFRegisters;
+ else {
+ llvm_unreachable("Invalid Calling Convention. Cannot initialize Special "
+ "Call Registers!");
+ }
+}
+
SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM)
@@ -61,20 +74,24 @@
HasInterlockedAccess1(false), HasMiscellaneousExtensions(false),
HasExecutionHint(false), HasLoadAndTrap(false),
HasTransactionalExecution(false), HasProcessorAssist(false),
- HasDFPZonedConversion(false), HasEnhancedDAT2(false),
- HasVector(false), HasLoadStoreOnCond2(false),
- HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false),
- HasDFPPackedConversion(false),
+ HasDFPZonedConversion(false), HasEnhancedDAT2(false), HasVector(false),
+ HasLoadStoreOnCond2(false), HasLoadAndZeroRightmostByte(false),
+ HasMessageSecurityAssist5(false), HasDFPPackedConversion(false),
HasMiscellaneousExtensions2(false), HasGuardedStorage(false),
HasMessageSecurityAssist7(false), HasMessageSecurityAssist8(false),
HasVectorEnhancements1(false), HasVectorPackedDecimal(false),
- HasInsertReferenceBitsMultiple(false),
- HasMiscellaneousExtensions3(false), HasMessageSecurityAssist9(false),
- HasVectorEnhancements2(false), HasVectorPackedDecimalEnhancement(false),
- HasEnhancedSort(false), HasDeflateConversion(false), HasSoftFloat(false),
- TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
- TLInfo(TM, *this), TSInfo(), FrameLowering() {}
+ HasInsertReferenceBitsMultiple(false), HasMiscellaneousExtensions3(false),
+ HasMessageSecurityAssist9(false), HasVectorEnhancements2(false),
+ HasVectorPackedDecimalEnhancement(false), HasEnhancedSort(false),
+ HasDeflateConversion(false), HasVectorPackedDecimalEnhancement2(false),
+ HasNNPAssist(false), HasBEAREnhancement(false),
+ HasResetDATProtection(false), HasProcessorActivityInstrumentation(false),
+ HasSoftFloat(false), TargetTriple(TT),
+ SpecialRegisters(initializeSpecialRegisters()),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
+ TSInfo(), FrameLowering() {}
+SystemZSubtarget::~SystemZSubtarget() { delete getSpecialRegisters(); }
bool SystemZSubtarget::enableSubRegLiveness() const {
return UseSubRegLiveness;
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.h
index 3841063..f6c155d 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -68,10 +68,16 @@
bool HasVectorPackedDecimalEnhancement;
bool HasEnhancedSort;
bool HasDeflateConversion;
+ bool HasVectorPackedDecimalEnhancement2;
+ bool HasNNPAssist;
+ bool HasBEAREnhancement;
+ bool HasResetDATProtection;
+ bool HasProcessorActivityInstrumentation;
bool HasSoftFloat;
private:
Triple TargetTriple;
+ SystemZCallingConventionRegisters *SpecialRegisters;
SystemZInstrInfo InstrInfo;
SystemZTargetLowering TLInfo;
SystemZSelectionDAGInfo TSInfo;
@@ -79,10 +85,19 @@
SystemZSubtarget &initializeSubtargetDependencies(StringRef CPU,
StringRef FS);
+ SystemZCallingConventionRegisters *initializeSpecialRegisters(void);
+
public:
SystemZSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM);
+ ~SystemZSubtarget();
+
+ SystemZCallingConventionRegisters *getSpecialRegisters() const {
+ assert(SpecialRegisters && "Unsupported SystemZ calling convention");
+ return SpecialRegisters;
+ }
+
const TargetFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
@@ -240,6 +255,27 @@
// Return true if the target has the deflate-conversion facility.
bool hasDeflateConversion() const { return HasDeflateConversion; }
+ // Return true if the target has the vector-packed-decimal
+ // enhancement facility 2.
+ bool hasVectorPackedDecimalEnhancement2() const {
+ return HasVectorPackedDecimalEnhancement2;
+ }
+
+ // Return true if the target has the NNP-assist facility.
+ bool hasNNPAssist() const { return HasNNPAssist; }
+
+ // Return true if the target has the BEAR-enhancement facility.
+ bool hasBEAREnhancement() const { return HasBEAREnhancement; }
+
+ // Return true if the target has the reset-DAT-protection facility.
+ bool hasResetDATProtection() const { return HasResetDATProtection; }
+
+ // Return true if the target has the processor-activity-instrumentation
+ // facility.
+ bool hasProcessorActivityInstrumentation() const {
+ return HasProcessorActivityInstrumentation;
+ }
+
// Return true if soft float should be used.
bool hasSoftFloat() const { return HasSoftFloat; }
@@ -248,6 +284,15 @@
bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const;
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+
+ // Returns TRUE if we are generating GOFF object code
+ bool isTargetGOFF() const { return TargetTriple.isOSBinFormatGOFF(); }
+
+ // Returns TRUE if we are using XPLINK64 linkage convention
+ bool isTargetXPLINK64() const { return (isTargetGOFF() && isTargetzOS()); }
+
+ // Returns TRUE if we are generating code for an s390x machine running z/OS
+ bool isTargetzOS() const { return TargetTriple.isOSzOS(); }
};
} // end namespace llvm
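The new isTargetzOS/isTargetGOFF/isTargetXPLINK64 predicates are plain triple queries. A hedged sketch of the dispatch they enable (both triples are illustrative; assumes the LLVM 13 Triple API from llvm/ADT/Triple.h):

#include "llvm/ADT/Triple.h"
#include <cstdio>

int main() {
  llvm::Triple ZOS("s390x-ibm-zos");     // illustrative z/OS triple
  llvm::Triple Linux("s390x-ibm-linux"); // illustrative zLinux triple
  // The z/OS triple is what steers codegen onto the XPLINK64/GOFF paths.
  std::printf("zos: isOSzOS=%d\n", (int)ZOS.isOSzOS());
  // The Linux triple keeps the existing ELF paths.
  std::printf("linux: isOSBinFormatELF=%d\n", (int)Linux.isOSBinFormatELF());
  return 0;
}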
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 7b78dc4..a886f9b 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -97,6 +97,15 @@
return Ret;
}
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+ if (TT.isOSzOS())
+ return std::make_unique<TargetLoweringObjectFileGOFF>();
+
+ // Note: Sometimes this is run with -triple s390x-unknown.
+ // In that case, default to ELF unless z/OS is specifically provided.
+ return std::make_unique<TargetLoweringObjectFileELF>();
+}
+
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
// Static code is suitable for use in a dynamic executable; there is no
// separate DynamicNoPIC model.
@@ -160,7 +169,7 @@
getEffectiveRelocModel(RM),
getEffectiveSystemZCodeModel(CM, getEffectiveRelocModel(RM), JIT),
OL),
- TLOF(std::make_unique<TargetLoweringObjectFileELF>()) {
+ TLOF(createTLOF(getTargetTriple())) {
initAsmInfo();
}
@@ -179,9 +188,7 @@
// FIXME: This is related to the code below to reset the target options,
// we need to know whether or not the soft float flag is set on the
// function, so we can enable it as a subtarget feature.
- bool softFloat =
- F.hasFnAttribute("use-soft-float") &&
- F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ bool softFloat = F.getFnAttribute("use-soft-float").getValueAsBool();
if (softFloat)
FS += FS.empty() ? "+soft-float" : ",+soft-float";
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index e7ac239..03c4da8 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -30,8 +30,8 @@
//
//===----------------------------------------------------------------------===//
-int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -63,10 +63,10 @@
return 4 * TTI::TCC_Basic;
}
-int SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind,
- Instruction *Inst) {
+InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -183,9 +183,10 @@
return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-int SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -246,7 +247,7 @@
// Find out if L contains a call, what the machine instruction count
// estimate is, and how many stores there are.
bool HasCall = false;
- unsigned NumStores = 0;
+ InstructionCost NumStores = 0;
for (auto &BB : L->blocks())
for (auto &I : *BB) {
if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
@@ -270,7 +271,8 @@
// The z13 processor will run out of store tags if too many stores
// are fed into it too quickly. Therefore make sure there are not
// too many stores in the resulting unrolled loop.
- unsigned const Max = (NumStores ? (12 / NumStores) : UINT_MAX);
+ unsigned const NumStoresVal = *NumStores.getValue();
+ unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);
if (HasCall) {
// Only allow full unrolling if loop has any calls.
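The 12 / NumStores cap in the surrounding code is unchanged; only the counter type becomes InstructionCost. A tiny illustration of the cap with a made-up store count:

#include <climits>
#include <cstdio>

int main() {
  unsigned NumStores = 5;                                 // assumed loop estimate
  unsigned Max = NumStores ? (12 / NumStores) : UINT_MAX; // z13 store-tag limit
  std::printf("max unroll count = %u\n", Max);            // prints 2
  return 0;
}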
@@ -323,12 +325,18 @@
return 0;
}
-unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
- if (!Vector)
- return 64;
- if (ST->hasVector())
- return 128;
- return 0;
+TypeSize
+SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(64);
+ case TargetTransformInfo::RGK_FixedWidthVector:
+ return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(0);
+ }
+
+ llvm_unreachable("Unsupported register kind");
}
unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
@@ -372,10 +380,10 @@
return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}
-int SystemZTTIImpl::getArithmeticInstrCost(
+InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
- TTI::OperandValueKind Op1Info,
- TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
@@ -487,8 +495,10 @@
if (DivRemConstPow2)
return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
- if (DivRemConst)
- return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args);
+ if (DivRemConst) {
+ SmallVector<Type *> Tys(Args.size(), Ty);
+ return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys);
+ }
if ((SignedDivRem || UnsignedDivRem) && VF > 4)
// Temporary hack: disable high vectorization factors with integer
// division/remainder, which will get scalarized and handled with
@@ -509,9 +519,11 @@
return NumVectors;
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
- unsigned ScalarCost =
+ InstructionCost ScalarCost =
getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
- unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(VTy, Args);
+ SmallVector<Type *> Tys(Args.size(), Ty);
+ InstructionCost Cost =
+ (VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys);
// FIXME: VF 2 for these FP operations are currently just as
// expensive as for VF 4.
if (VF == 2)
@@ -528,7 +540,9 @@
// There is no native support for FRem.
if (Opcode == Instruction::FRem) {
- unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args);
+ SmallVector<Type *> Tys(Args.size(), Ty);
+ InstructionCost Cost =
+ (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys);
// FIXME: VF 2 for float is currently just as expensive as for VF 4.
if (VF == 2 && ScalarBits == 32)
Cost *= 2;
@@ -541,8 +555,11 @@
Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}
-int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
- int Index, VectorType *SubTp) {
+InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
+ VectorType *Tp,
+ ArrayRef<int> Mask, int Index,
+ VectorType *SubTp) {
+ Kind = improveShuffleKindFromMask(Kind, Mask);
if (ST->hasVector()) {
unsigned NumVectors = getNumVectorRegs(Tp);
@@ -575,7 +592,7 @@
}
}
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
// Return the log2 difference of the element sizes of the two vector types.
@@ -700,13 +717,14 @@
return Cost;
}
-int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
// FIXME: Can the logic below also be used for these cost kinds?
if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
- int BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
+ auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
return BaseCost == 0 ? BaseCost : 1;
}
@@ -743,8 +761,13 @@
}
}
else if (ST->hasVector()) {
+ // Vector to scalar cast.
auto *SrcVecTy = cast<FixedVectorType>(Src);
- auto *DstVecTy = cast<FixedVectorType>(Dst);
+ auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
+ if (!DstVecTy) {
+ // TODO: tune vector-to-scalar cast.
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
+ }
unsigned VF = SrcVecTy->getNumElements();
unsigned NumDstVectors = getNumVectorRegs(Dst);
unsigned NumSrcVectors = getNumVectorRegs(Src);
@@ -789,9 +812,9 @@
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values. Base implementation does not
// realize float->int gets scalarized.
- unsigned ScalarCost = getCastInstrCost(
+ InstructionCost ScalarCost = getCastInstrCost(
Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
- unsigned TotCost = VF * ScalarCost;
+ InstructionCost TotCost = VF * ScalarCost;
bool NeedsInserts = true, NeedsExtracts = true;
// FP128 registers do not get inserted or extracted.
if (DstScalarBits == 128 &&
@@ -846,10 +869,11 @@
return ExtCost;
}
-int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy, CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (CostKind != TTI::TCK_RecipThroughput)
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
@@ -930,8 +954,8 @@
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
}
-int SystemZTTIImpl::
-getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) {
// vlvgp will insert two grs into a vector register, so only count half the
// number of instructions.
if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
@@ -1039,10 +1063,11 @@
return false;
}
-int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
- MaybeAlign Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
assert(!Src->isVoidTy() && "Invalid type");
// TODO: Handle other cost kinds.
@@ -1109,7 +1134,7 @@
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
-int SystemZTTIImpl::getInterleavedMemoryOpCost(
+InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) {
@@ -1120,9 +1145,6 @@
assert(isa<VectorType>(VecTy) &&
"Expect a vector type for interleaved memory op");
- // Return the ceiling of dividing A by B.
- auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
-
unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
unsigned VF = NumElts / Factor;
@@ -1149,7 +1171,7 @@
// requires one operation, except that vperm can handle two input
// registers first time for each dst vector.
unsigned NumSrcVecs = ValueVecs[Index].count();
- unsigned NumDstVecs = ceil(VF * getScalarSizeInBits(VecTy), 128U);
+ unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
}
@@ -1173,9 +1195,11 @@
return -1;
}
-int SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind) {
- int Cost = getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
+InstructionCost
+SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ InstructionCost Cost =
+ getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
if (Cost != -1)
return Cost;
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
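
The SystemZ hunks above repeatedly compute a "scalarize it" cost: one scalar operation per vector lane plus the insert/extract overhead now obtained from getScalarizationOverhead(VTy, Args, Tys). The following stand-in sketch (names invented, not part of the patch) shows that cost shape in isolation.

#include <cstdint>

// Illustration only: VF scalar operations plus the overhead of extracting
// the vector operands and re-inserting the scalar results. ScalarCost and
// InsertExtractOverhead stand in for getArithmeticInstrCost(...) and
// getScalarizationOverhead(...).
uint64_t scalarizedVectorCost(unsigned VF, uint64_t ScalarCost,
                              uint64_t InsertExtractOverhead) {
  return static_cast<uint64_t>(VF) * ScalarCost + InsertExtractOverhead;
}
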
diff --git a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index c97e099..51cf557 100644
--- a/src/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -38,13 +38,16 @@
unsigned getInliningThresholdMultiplier() { return 3; }
- int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
+ InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind,
- Instruction *Inst = nullptr);
- int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind);
+ InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
+ InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
@@ -62,7 +65,7 @@
/// @{
unsigned getNumberOfRegisters(unsigned ClassID) const;
- unsigned getRegisterBitWidth(bool Vector) const;
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
unsigned getCacheLineSize() const override { return 256; }
unsigned getPrefetchDistance() const override { return 4500; }
@@ -78,7 +81,7 @@
bool supportsEfficientVectorElementLoadStore() { return true; }
bool enableInterleavedAccessVectorization() { return true; }
- int getArithmeticInstrCost(
+ InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
@@ -87,33 +90,37 @@
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
- VectorType *SubTp);
+ InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ ArrayRef<int> Mask, int Index,
+ VectorType *SubTp);
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
const Instruction *I);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
- int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index);
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue);
- int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace, TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
+ InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- int getInterleavedMemoryOpCost(
+ InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
- int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind);
+ InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
/// @}
};
diff --git a/src/llvm-project/llvm/lib/Target/TargetLoweringObjectFile.cpp b/src/llvm-project/llvm/lib/Target/TargetLoweringObjectFile.cpp
index 81af4ee..7954f0f 100644
--- a/src/llvm-project/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/src/llvm-project/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -44,7 +44,7 @@
// `Initialize` can be called more than once.
delete Mang;
Mang = new Mangler();
- InitMCObjectFileInfo(TM.getTargetTriple(), TM.isPositionIndependent(), ctx,
+ initMCObjectFileInfo(ctx, TM.isPositionIndependent(),
TM.getCodeModel() == CodeModel::Large);
// Reset various EH DWARF encodings.
@@ -217,8 +217,14 @@
// Handle thread-local data first.
if (GVar->isThreadLocal()) {
- if (isSuitableForBSS(GVar) && !TM.Options.NoZerosInBSS)
+ if (isSuitableForBSS(GVar) && !TM.Options.NoZerosInBSS) {
+ // Zero-initialized TLS variables with local linkage always get classified
+ // as ThreadBSSLocal.
+ if (GVar->hasLocalLinkage()) {
+ return SectionKind::getThreadBSSLocal();
+ }
return SectionKind::getThreadBSS();
+ }
return SectionKind::getThreadData();
}
@@ -290,7 +296,8 @@
// consideration when it tries to merge entries in the section.
Reloc::Model ReloModel = TM.getRelocationModel();
if (ReloModel == Reloc::Static || ReloModel == Reloc::ROPI ||
- ReloModel == Reloc::RWPI || ReloModel == Reloc::ROPI_RWPI)
+ ReloModel == Reloc::RWPI || ReloModel == Reloc::ROPI_RWPI ||
+ !C->needsDynamicRelocation())
return SectionKind::getReadOnly();
// Otherwise, the dynamic linker needs to fix it up, put it in the
@@ -380,6 +387,11 @@
return nullptr;
}
+MCSection *TargetLoweringObjectFile::getUniqueSectionForFunction(
+ const Function &F, const TargetMachine &TM) const {
+ return nullptr;
+}
+
/// getTTypeGlobalReference - Return an MCExpr to use for a
/// reference to the specified global variable from exception
/// handling information.
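
The thread-local hunk above adds a ThreadBSSLocal classification for zero-initialized TLS globals with local linkage. A condensed restatement with stand-in types follows, purely as a reading aid; "ZeroInitOK" folds together the isSuitableForBSS() and !TM.Options.NoZerosInBSS checks from the real code.

enum class TLSKind { ThreadBSSLocal, ThreadBSS, ThreadData };

// Zero-initialized TLS goes to .tbss, and to the "local" flavour when the
// symbol has local (internal/private) linkage; everything else is .tdata.
TLSKind classifyThreadLocal(bool ZeroInitOK, bool HasLocalLinkage) {
  if (ZeroInitOK)
    return HasLocalLinkage ? TLSKind::ThreadBSSLocal : TLSKind::ThreadBSS;
  return TLSKind::ThreadData;
}
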
diff --git a/src/llvm-project/llvm/lib/Target/TargetMachine.cpp b/src/llvm-project/llvm/lib/Target/TargetMachine.cpp
index 2aee0e5..0a655a8 100644
--- a/src/llvm-project/llvm/lib/Target/TargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/TargetMachine.cpp
@@ -56,7 +56,7 @@
void TargetMachine::resetTargetOptions(const Function &F) const {
#define RESET_OPTION(X, Y) \
do { \
- Options.X = (F.getFnAttribute(Y).getValueAsString() == "true"); \
+ Options.X = F.getFnAttribute(Y).getValueAsBool(); \
} while (0)
RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
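
The RESET_OPTION hunk above switches from comparing the attribute's string value against "true" to calling getValueAsBool(). The toy stand-in below mirrors that behaviour for a string-valued attribute; it is an illustration, not the LLVM Attribute class.

#include <string>

struct ToyAttribute {
  std::string Value;  // e.g. "true" or "false"
  // Stand-in for Attribute::getValueAsBool() on a string attribute.
  bool getValueAsBool() const { return Value == "true"; }
};

// RESET_OPTION(UnsafeFPMath, "unsafe-fp-math") now effectively does:
//   Options.UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool();
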
diff --git a/src/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.cpp b/src/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.cpp
index 29f5afb..ec911e5 100644
--- a/src/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.cpp
@@ -12,7 +12,6 @@
#include "VETargetStreamer.h"
#include "VEInstPrinter.h"
-#include "llvm/Support/FormattedStream.h"
using namespace llvm;
diff --git a/src/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.h b/src/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.h
index 6f6a0d4..1704e0b 100644
--- a/src/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.h
+++ b/src/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.h
@@ -11,6 +11,7 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/FormattedStream.h"
namespace llvm {
class VETargetStreamer : public MCTargetStreamer {
diff --git a/src/llvm-project/llvm/lib/Target/VE/VEFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/VE/VEFrameLowering.cpp
index 9e97d0e..1ae3a2c 100644
--- a/src/llvm-project/llvm/lib/Target/VE/VEFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/VE/VEFrameLowering.cpp
@@ -313,18 +313,13 @@
const VEInstrInfo &TII = *STI.getInstrInfo();
const VERegisterInfo &RegInfo = *STI.getRegisterInfo();
MachineBasicBlock::iterator MBBI = MBB.begin();
- bool NeedsStackRealignment = RegInfo.needsStackRealignment(MF);
+ bool NeedsStackRealignment = RegInfo.shouldRealignStack(MF);
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
DebugLoc DL;
- // FIXME: unfortunately, returning false from canRealignStack
- // actually just causes needsStackRealignment to return false,
- // rather than reporting an error, as would be sensible. This is
- // poor, but fixing that bogosity is going to be a large project.
- // For now, just see if it's lied, and report an error here.
- if (!NeedsStackRealignment && MFI.getMaxAlign() > getStackAlign())
+ if (NeedsStackRealignment && !RegInfo.canRealignStack(MF))
report_fatal_error("Function \"" + Twine(MF.getName()) +
"\" required "
"stack re-alignment, but LLVM couldn't handle it "
@@ -428,7 +423,7 @@
const MachineFrameInfo &MFI = MF.getFrameInfo();
return MF.getTarget().Options.DisableFramePointerElim(MF) ||
- RegInfo->needsStackRealignment(MF) || MFI.hasVarSizedObjects() ||
+ RegInfo->hasStackRealignment(MF) || MFI.hasVarSizedObjects() ||
MFI.isFrameAddressTaken();
}
@@ -436,7 +431,7 @@
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *TRI = STI.getRegisterInfo();
- return MFI.hasVarSizedObjects() && TRI->needsStackRealignment(MF);
+ return MFI.hasVarSizedObjects() && TRI->hasStackRealignment(MF);
}
bool VEFrameLowering::hasGOT(const MachineFunction &MF) const {
@@ -461,7 +456,7 @@
return StackOffset::getFixed(FrameOffset +
MF.getFrameInfo().getStackSize());
}
- if (RegInfo->needsStackRealignment(MF) && !isFixed) {
+ if (RegInfo->hasStackRealignment(MF) && !isFixed) {
// If data on stack requires realignment, frame indices are based on a %sp
// or %s17 (bp) register. If there is a variable sized object, bp is used.
if (hasBP(MF))
diff --git a/src/llvm-project/llvm/lib/Target/VE/VEISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/VE/VEISelDAGToDAG.cpp
index 761baa7..e2608e8 100644
--- a/src/llvm-project/llvm/lib/Target/VE/VEISelDAGToDAG.cpp
+++ b/src/llvm-project/llvm/lib/Target/VE/VEISelDAGToDAG.cpp
@@ -180,7 +180,7 @@
// %dest, #FI, %reg, offset
// In the eliminateFrameIndex, above MI is converted to the following.
// %dest, %fp, %reg, fi_offset + offset
- if (dyn_cast<FrameIndexSDNode>(RHS))
+ if (isa<FrameIndexSDNode>(RHS))
std::swap(LHS, RHS);
if (matchADDRri(RHS, Index, Offset)) {
@@ -220,9 +220,8 @@
bool VEDAGToDAGISel::selectADDRzii(SDValue Addr, SDValue &Base, SDValue &Index,
SDValue &Offset) {
- if (dyn_cast<FrameIndexSDNode>(Addr)) {
+ if (isa<FrameIndexSDNode>(Addr))
return false;
- }
if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
Addr.getOpcode() == ISD::TargetGlobalAddress ||
Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
@@ -252,7 +251,7 @@
bool VEDAGToDAGISel::selectADDRzi(SDValue Addr, SDValue &Base,
SDValue &Offset) {
- if (dyn_cast<FrameIndexSDNode>(Addr))
+ if (isa<FrameIndexSDNode>(Addr))
return false;
if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
Addr.getOpcode() == ISD::TargetGlobalAddress ||
@@ -271,7 +270,7 @@
}
bool VEDAGToDAGISel::matchADDRrr(SDValue Addr, SDValue &Base, SDValue &Index) {
- if (dyn_cast<FrameIndexSDNode>(Addr))
+ if (isa<FrameIndexSDNode>(Addr))
return false;
if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
Addr.getOpcode() == ISD::TargetGlobalAddress ||
diff --git a/src/llvm-project/llvm/lib/Target/VE/VEISelLowering.cpp b/src/llvm-project/llvm/lib/Target/VE/VEISelLowering.cpp
index d377f8e..b297e0f 100644
--- a/src/llvm-project/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
@@ -837,7 +838,7 @@
/// alignment error (trap) on the target machine.
bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AddrSpace,
- unsigned Align,
+ Align A,
MachineMemOperand::Flags,
bool *Fast) const {
if (Fast) {
@@ -997,7 +998,7 @@
// The mappings for emitLeading/TrailingFence for VE is designed by following
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
-Instruction *VETargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+Instruction *VETargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
switch (Ord) {
@@ -1018,7 +1019,7 @@
llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}
-Instruction *VETargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+Instruction *VETargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
switch (Ord) {
@@ -2743,6 +2744,7 @@
// Special treatment for packed V64 types.
assert(VT == MVT::v512i32 || VT == MVT::v512f32);
+ (void)VT;
// Example of codes:
// %packed_v = extractelt %vr, %idx / 2
// %v = %packed_v >> (%idx % 2 * 32)
@@ -2787,6 +2789,7 @@
// Special treatment for packed V64 types.
assert(VT == MVT::v512i32 || VT == MVT::v512f32);
+ (void)VT;
// The v512i32 and v512f32 start from upper bits (0..31). This "upper
// bits" required `val << 32` from C implementation's point of view.
//
diff --git a/src/llvm-project/llvm/lib/Target/VE/VEISelLowering.h b/src/llvm-project/llvm/lib/Target/VE/VEISelLowering.h
index a6e1bf3..b4ce890 100644
--- a/src/llvm-project/llvm/lib/Target/VE/VEISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/VE/VEISelLowering.h
@@ -92,12 +92,15 @@
// VE uses release consistency, so need fence for each atomics.
return true;
}
- Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+ ISD::NodeType getExtendForAtomicOps() const override {
+ return ISD::ANY_EXTEND;
+ }
/// Custom Lower {
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -183,7 +186,7 @@
bool ForCodeSize) const override;
/// Returns true if the target allows unaligned memory accesses of the
/// specified type.
- bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align A,
MachineMemOperand::Flags Flags,
bool *Fast) const override;
diff --git a/src/llvm-project/llvm/lib/Target/VE/VEInstrInfo.td b/src/llvm-project/llvm/lib/Target/VE/VEInstrInfo.td
index b6862cf..2f77daa 100644
--- a/src/llvm-project/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/VE/VEInstrInfo.td
@@ -793,7 +793,7 @@
let Constraints = "$dest = $sd", DisableEncoding = "$sd",
mayStore=1, mayLoad = 1, hasSideEffects = 0 in
multiclass RRCAStgm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty,
- Operand immOp, Operand MEM, Operand ADDR,
+ Operand immOp, Operand MEM, ComplexPattern ADDR,
SDPatternOperator OpNode = null_frag> {
def r : RRM<opc, (outs RC:$dest), (ins MEM:$addr, RC:$sy, RC:$sd),
!strconcat(opcStr, " $dest, $addr, $sy"),
@@ -1719,10 +1719,10 @@
// extload, sextload and zextload stuff
multiclass EXT64m<SDPatternOperator from,
- SDPatternOperator torri,
- SDPatternOperator torii,
- SDPatternOperator tozri,
- SDPatternOperator tozii> {
+ RM torri,
+ RM torii,
+ RM tozri,
+ RM tozii> {
def : Pat<(i64 (from ADDRrri:$addr)),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), (torri MEMrri:$addr),
sub_i32)>;
@@ -1748,10 +1748,10 @@
// anyextload
multiclass EXT32m<SDPatternOperator from,
- SDPatternOperator torri,
- SDPatternOperator torii,
- SDPatternOperator tozri,
- SDPatternOperator tozii> {
+ RM torri,
+ RM torii,
+ RM tozri,
+ RM tozii> {
def : Pat<(from ADDRrri:$addr), (torri MEMrri:$addr)>;
def : Pat<(from ADDRrii:$addr), (torii MEMrii:$addr)>;
def : Pat<(from ADDRzri:$addr), (tozri MEMzri:$addr)>;
@@ -1762,10 +1762,10 @@
// truncstore
multiclass TRUNC64m<SDPatternOperator from,
- SDPatternOperator torri,
- SDPatternOperator torii,
- SDPatternOperator tozri,
- SDPatternOperator tozii> {
+ RM torri,
+ RM torii,
+ RM tozri,
+ RM tozii> {
def : Pat<(from i64:$src, ADDRrri:$addr),
(torri MEMrri:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
def : Pat<(from i64:$src, ADDRrii:$addr),
@@ -1781,8 +1781,8 @@
// Atomic loads
multiclass ATMLDm<SDPatternOperator from,
- SDPatternOperator torri, SDPatternOperator torii,
- SDPatternOperator tozri, SDPatternOperator tozii> {
+ RM torri, RM torii,
+ RM tozri, RM tozii> {
def : Pat<(from ADDRrri:$addr), (torri MEMrri:$addr)>;
def : Pat<(from ADDRrii:$addr), (torii MEMrii:$addr)>;
def : Pat<(from ADDRzri:$addr), (tozri MEMzri:$addr)>;
@@ -1794,9 +1794,9 @@
defm : ATMLDm<atomic_load_64, LDrri, LDrii, LDzri, LDzii>;
// Optimized atomic loads with sext
-multiclass SXATMLDm<SDPatternOperator from, Operand TY,
- SDPatternOperator torri, SDPatternOperator torii,
- SDPatternOperator tozri, SDPatternOperator tozii> {
+multiclass SXATMLDm<SDPatternOperator from, ValueType TY,
+ RM torri, RM torii,
+ RM tozri, RM tozii> {
def : Pat<(i64 (sext_inreg (i64 (anyext (from ADDRrri:$addr))), TY)),
(i2l (torri MEMrri:$addr))>;
def : Pat<(i64 (sext_inreg (i64 (anyext (from ADDRrii:$addr))), TY)),
@@ -1807,8 +1807,8 @@
(i2l (tozii MEMzii:$addr))>;
}
multiclass SXATMLD32m<SDPatternOperator from,
- SDPatternOperator torri, SDPatternOperator torii,
- SDPatternOperator tozri, SDPatternOperator tozii> {
+ RM torri, RM torii,
+ RM tozri, RM tozii> {
def : Pat<(i64 (sext (from ADDRrri:$addr))),
(i2l (torri MEMrri:$addr))>;
def : Pat<(i64 (sext (from ADDRrii:$addr))),
@@ -1824,9 +1824,9 @@
defm : SXATMLD32m<atomic_load_32, LDLSXrri, LDLSXrii, LDLSXzri, LDLSXzii>;
// Optimized atomic loads with zext
-multiclass ZXATMLDm<SDPatternOperator from, Operand VAL,
- SDPatternOperator torri, SDPatternOperator torii,
- SDPatternOperator tozri, SDPatternOperator tozii> {
+multiclass ZXATMLDm<SDPatternOperator from, int VAL,
+ RM torri, RM torii,
+ RM tozri, RM tozii> {
def : Pat<(i64 (and (anyext (from ADDRrri:$addr)), VAL)),
(i2l (torri MEMrri:$addr))>;
def : Pat<(i64 (and (anyext (from ADDRrii:$addr)), VAL)),
@@ -1836,9 +1836,9 @@
def : Pat<(i64 (and (anyext (from ADDRzii:$addr)), VAL)),
(i2l (tozii MEMzii:$addr))>;
}
-multiclass ZXATMLD32m<SDPatternOperator from, Operand VAL,
- SDPatternOperator torri, SDPatternOperator torii,
- SDPatternOperator tozri, SDPatternOperator tozii> {
+multiclass ZXATMLD32m<SDPatternOperator from, int VAL,
+ RM torri, RM torii,
+ RM tozri, RM tozii> {
def : Pat<(i64 (zext (from ADDRrri:$addr))),
(i2l (torri MEMrri:$addr))>;
def : Pat<(i64 (zext (from ADDRrii:$addr))),
@@ -1857,8 +1857,8 @@
// Atomic stores
multiclass ATMSTm<SDPatternOperator from, ValueType ty,
- SDPatternOperator torri, SDPatternOperator torii,
- SDPatternOperator tozri, SDPatternOperator tozii> {
+ RM torri, RM torii,
+ RM tozri, RM tozii> {
def : Pat<(from ADDRrri:$addr, ty:$src), (torri MEMrri:$addr, $src)>;
def : Pat<(from ADDRrii:$addr, ty:$src), (torii MEMrii:$addr, $src)>;
def : Pat<(from ADDRzri:$addr, ty:$src), (tozri MEMzri:$addr, $src)>;
@@ -1872,10 +1872,10 @@
// Optimized atomic stores with truncate
multiclass TRATMSTm<SDPatternOperator from,
ValueType ty,
- SDPatternOperator torri,
- SDPatternOperator torii,
- SDPatternOperator tozri,
- SDPatternOperator tozii> {
+ RM torri,
+ RM torii,
+ RM tozri,
+ RM tozii> {
def : Pat<(from ADDRrri:$addr, (i32 (trunc i64:$src))),
(torri MEMrri:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
def : Pat<(from ADDRrii:$addr, (i32 (trunc i64:$src))),
@@ -1929,10 +1929,10 @@
// brcc
// integer brcc
-multiclass BRCCIm<ValueType ty, SDPatternOperator BrOpNode1,
- SDPatternOperator BrOpNode2,
- SDPatternOperator CmpOpNode1,
- SDPatternOperator CmpOpNode2> {
+multiclass BRCCIm<ValueType ty, CF BrOpNode1,
+ CF BrOpNode2,
+ RR CmpOpNode1,
+ RR CmpOpNode2> {
def : Pat<(brcc CCSIOp:$cond, ty:$l, simm7:$r, bb:$addr),
(BrOpNode2 (icond2ccSwap $cond), (LO7 $r), $l, bb:$addr)>;
def : Pat<(brcc CCSIOp:$cond, ty:$l, ty:$r, bb:$addr),
@@ -1947,8 +1947,7 @@
defm : BRCCIm<i64, BRCFLrr, BRCFLir, CMPULrr, CMPULir>;
// floating point brcc
-multiclass BRCCFm<ValueType ty, SDPatternOperator BrOpNode1,
- SDPatternOperator BrOpNode2> {
+multiclass BRCCFm<ValueType ty, CF BrOpNode1, CF BrOpNode2> {
def : Pat<(brcc cond:$cond, ty:$l, simm7fp:$r, bb:$addr),
(BrOpNode2 (fcond2ccSwap $cond), (LO7FP $r), $l, bb:$addr)>;
def : Pat<(brcc cond:$cond, ty:$l, ty:$r, bb:$addr),
diff --git a/src/llvm-project/llvm/lib/Target/VE/VEInstrPatternsVec.td b/src/llvm-project/llvm/lib/Target/VE/VEInstrPatternsVec.td
index 0084876..dc3c913 100644
--- a/src/llvm-project/llvm/lib/Target/VE/VEInstrPatternsVec.td
+++ b/src/llvm-project/llvm/lib/Target/VE/VEInstrPatternsVec.td
@@ -16,7 +16,7 @@
//===----------------------------------------------------------------------===//
multiclass vbrd_elem32<ValueType v32, ValueType s32, SDPatternOperator ImmOp,
- SDNodeXForm ImmCast, SDNodeXForm SuperRegCast> {
+ SDNodeXForm ImmCast, OutPatFrag SuperRegCast> {
// VBRDil
def : Pat<(v32 (vec_broadcast (s32 ImmOp:$sy), i32:$vl)),
(VBRDil (ImmCast $sy), i32:$vl)>;
@@ -38,8 +38,8 @@
}
multiclass extract_insert_elem32<ValueType v32, ValueType s32,
- SDNodeXForm SubRegCast,
- SDNodeXForm SuperRegCast> {
+ OutPatFrag SubRegCast,
+ OutPatFrag SuperRegCast> {
// LVSvi
def: Pat<(s32 (extractelt v32:$vec, uimm7:$idx)),
(SubRegCast (LVSvi v32:$vec, (ULO7 $idx)))>;
@@ -73,7 +73,7 @@
multiclass patterns_elem32<ValueType v32, ValueType s32,
SDPatternOperator ImmOp, SDNodeXForm ImmCast,
- SDNodeXForm SubRegCast, SDNodeXForm SuperRegCast> {
+ OutPatFrag SubRegCast, OutPatFrag SuperRegCast> {
defm : vbrd_elem32<v32, s32, ImmOp, ImmCast, SuperRegCast>;
defm : extract_insert_elem32<v32, s32, SubRegCast, SuperRegCast>;
}
diff --git a/src/llvm-project/llvm/lib/Target/VE/VETargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/VE/VETargetTransformInfo.h
index 68af665..0242fa1 100644
--- a/src/llvm-project/llvm/lib/Target/VE/VETargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/VE/VETargetTransformInfo.h
@@ -50,18 +50,42 @@
return 64;
}
- unsigned getRegisterBitWidth(bool Vector) const {
- if (Vector) {
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(64);
+ case TargetTransformInfo::RGK_FixedWidthVector:
// TODO report vregs once vector isel is stable.
- return 0;
+ return TypeSize::getFixed(0);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(0);
}
- return 64;
+
+ llvm_unreachable("Unsupported register kind");
+ }
+
+ /// \returns How the target needs this vector-predicated operation to be
+ /// transformed.
+ TargetTransformInfo::VPLegalization
+ getVPLegalizationStrategy(const VPIntrinsic &PI) const {
+ using VPLegalization = TargetTransformInfo::VPLegalization;
+ return VPLegalization(VPLegalization::Legal, VPLegalization::Legal);
}
unsigned getMinVectorRegisterBitWidth() const {
// TODO report vregs once vector isel is stable.
return 0;
}
+
+ bool shouldBuildRelLookupTables() const {
+ // NEC nld doesn't support relative lookup tables. It shows following
+ // errors. So, we disable it at the moment.
+ // /opt/nec/ve/bin/nld: src/CMakeFiles/cxxabi_shared.dir/cxa_demangle.cpp
+ // .o(.rodata+0x17b4): reloc against `.L.str.376': error 2
+ // /opt/nec/ve/bin/nld: final link failed: Nonrepresentable section on
+ // output
+ return false;
+ }
};
} // namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/CMakeLists.txt
index 7f66fb3..717c98f 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/CMakeLists.txt
@@ -1,10 +1,12 @@
add_llvm_component_library(LLVMWebAssemblyAsmParser
WebAssemblyAsmParser.cpp
+ WebAssemblyAsmTypeCheck.cpp
LINK_COMPONENTS
MC
MCParser
WebAssemblyInfo
+ WebAssemblyUtils
Support
ADD_TO_COMPONENT
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 60ac324..eb1dd87 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -13,9 +13,12 @@
///
//===----------------------------------------------------------------------===//
+#include "AsmParser/WebAssemblyAsmTypeCheck.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
#include "TargetInfo/WebAssemblyTargetInfo.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -29,6 +32,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -125,10 +129,19 @@
llvm_unreachable("Should be integer immediate or symbol!");
}
- void addFPImmOperands(MCInst &Inst, unsigned N) const {
+ void addFPImmf32Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
if (Kind == Float)
- Inst.addOperand(MCOperand::createFPImm(Flt.Val));
+ Inst.addOperand(
+ MCOperand::createSFPImm(bit_cast<uint32_t>(float(Flt.Val))));
+ else
+ llvm_unreachable("Should be float immediate!");
+ }
+
+ void addFPImmf64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (Kind == Float)
+ Inst.addOperand(MCOperand::createDFPImm(bit_cast<uint64_t>(Flt.Val)));
else
llvm_unreachable("Should be float immediate!");
}
@@ -160,11 +173,13 @@
}
};
+// Perhaps this should go somewhere common.
+static wasm::WasmLimits DefaultLimits() {
+ return {wasm::WASM_LIMITS_FLAG_NONE, 0, 0};
+}
+
static MCSymbolWasm *GetOrCreateFunctionTableSymbol(MCContext &Ctx,
const StringRef &Name) {
- // FIXME: Duplicates functionality from
- // MC/WasmObjectWriter::recordRelocation, as well as WebAssemblyCodegen's
- // WebAssembly:getOrCreateFunctionTableSymbol.
MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name));
if (Sym) {
if (!Sym->isFunctionTable())
@@ -194,7 +209,6 @@
// guarantee that correct order.
enum ParserState {
FileStart,
- Label,
FunctionStart,
FunctionLocals,
Instructions,
@@ -208,23 +222,50 @@
Block,
Loop,
Try,
+ CatchAll,
If,
Else,
Undefined,
};
- std::vector<NestingType> NestingStack;
+ struct Nested {
+ NestingType NT;
+ wasm::WasmSignature Sig;
+ };
+ std::vector<Nested> NestingStack;
- // We track this to see if a .functype following a label is the same,
- // as this is how we recognize the start of a function.
- MCSymbol *LastLabel = nullptr;
+ MCSymbolWasm *DefaultFunctionTable = nullptr;
MCSymbol *LastFunctionLabel = nullptr;
+ bool is64;
+
+ WebAssemblyAsmTypeCheck TC;
+ // Don't type check if -no-type-check was set.
+ bool SkipTypeCheck;
+
public:
WebAssemblyAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
: MCTargetAsmParser(Options, STI, MII), Parser(Parser),
- Lexer(Parser.getLexer()) {
+ Lexer(Parser.getLexer()),
+ is64(STI.getTargetTriple().isArch64Bit()),
+ TC(Parser, MII, is64), SkipTypeCheck(Options.MCNoTypeCheck) {
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ // Don't type check if this is inline asm, since that is a naked sequence of
+ // instructions without a function/locals decl.
+ auto &SM = Parser.getSourceManager();
+ auto BufferName =
+ SM.getBufferInfo(SM.getMainFileID()).Buffer->getBufferIdentifier();
+ if (BufferName == "<inline asm>")
+ SkipTypeCheck = true;
+ }
+
+ void Initialize(MCAsmParser &Parser) override {
+ MCAsmParserExtension::Initialize(Parser);
+
+ DefaultFunctionTable = GetOrCreateFunctionTableSymbol(
+ getContext(), "__indirect_function_table");
+ if (!STI->checkFeatures("+reference-types"))
+ DefaultFunctionTable->setOmitFromLinkingSection();
}
#define GET_ASSEMBLER_HEADER
@@ -268,7 +309,9 @@
case Loop:
return {"loop", "end_loop"};
case Try:
- return {"try", "end_try"};
+ return {"try", "end_try/delegate"};
+ case CatchAll:
+ return {"catch_all", "end_try"};
case If:
return {"if", "end_if"};
case Else:
@@ -278,15 +321,16 @@
}
}
- void push(NestingType NT) { NestingStack.push_back(NT); }
+ void push(NestingType NT) { NestingStack.push_back({NT, wasm::WasmSignature()}); }
bool pop(StringRef Ins, NestingType NT1, NestingType NT2 = Undefined) {
if (NestingStack.empty())
return error(Twine("End of block construct with no start: ") + Ins);
auto Top = NestingStack.back();
- if (Top != NT1 && Top != NT2)
+ if (Top.NT != NT1 && Top.NT != NT2)
return error(Twine("Block construct type mismatch, expected: ") +
- nestingString(Top).second + ", instead got: " + Ins);
+ nestingString(Top.NT).second + ", instead got: " + Ins);
+ TC.setLastSig(Top.Sig);
NestingStack.pop_back();
return false;
}
@@ -295,7 +339,7 @@
auto Err = !NestingStack.empty();
while (!NestingStack.empty()) {
error(Twine("Unmatched block construct(s) at function end: ") +
- nestingString(NestingStack.back()).first);
+ nestingString(NestingStack.back().NT).first);
NestingStack.pop_back();
}
return Err;
@@ -325,45 +369,9 @@
return Name;
}
- Optional<wasm::ValType> parseType(const StringRef &Type) {
- // FIXME: can't use StringSwitch because wasm::ValType doesn't have a
- // "invalid" value.
- if (Type == "i32")
- return wasm::ValType::I32;
- if (Type == "i64")
- return wasm::ValType::I64;
- if (Type == "f32")
- return wasm::ValType::F32;
- if (Type == "f64")
- return wasm::ValType::F64;
- if (Type == "v128" || Type == "i8x16" || Type == "i16x8" ||
- Type == "i32x4" || Type == "i64x2" || Type == "f32x4" ||
- Type == "f64x2")
- return wasm::ValType::V128;
- if (Type == "funcref")
- return wasm::ValType::FUNCREF;
- if (Type == "externref")
- return wasm::ValType::EXTERNREF;
- return Optional<wasm::ValType>();
- }
-
- WebAssembly::BlockType parseBlockType(StringRef ID) {
- // Multivalue block types are handled separately in parseSignature
- return StringSwitch<WebAssembly::BlockType>(ID)
- .Case("i32", WebAssembly::BlockType::I32)
- .Case("i64", WebAssembly::BlockType::I64)
- .Case("f32", WebAssembly::BlockType::F32)
- .Case("f64", WebAssembly::BlockType::F64)
- .Case("v128", WebAssembly::BlockType::V128)
- .Case("funcref", WebAssembly::BlockType::Funcref)
- .Case("externref", WebAssembly::BlockType::Externref)
- .Case("void", WebAssembly::BlockType::Void)
- .Default(WebAssembly::BlockType::Invalid);
- }
-
bool parseRegTypeList(SmallVectorImpl<wasm::ValType> &Types) {
while (Lexer.is(AsmToken::Identifier)) {
- auto Type = parseType(Lexer.getTok().getString());
+ auto Type = WebAssembly::parseType(Lexer.getTok().getString());
if (!Type)
return error("unknown type: ", Lexer.getTok());
Types.push_back(Type.getValue());
@@ -405,9 +413,9 @@
auto &Flt = Lexer.getTok();
auto S = Flt.getString();
double Val;
- if (S.compare_lower("infinity") == 0) {
+ if (S.compare_insensitive("infinity") == 0) {
Val = std::numeric_limits<double>::infinity();
- } else if (S.compare_lower("nan") == 0) {
+ } else if (S.compare_insensitive("nan") == 0) {
Val = std::numeric_limits<double>::quiet_NaN();
} else {
return true;
@@ -458,20 +466,76 @@
return false;
}
- WebAssembly::HeapType parseHeapType(StringRef Id) {
- return StringSwitch<WebAssembly::HeapType>(Id)
- .Case("extern", WebAssembly::HeapType::Externref)
- .Case("func", WebAssembly::HeapType::Funcref)
- .Default(WebAssembly::HeapType::Invalid);
- }
-
void addBlockTypeOperand(OperandVector &Operands, SMLoc NameLoc,
WebAssembly::BlockType BT) {
+ if (BT != WebAssembly::BlockType::Void) {
+ wasm::WasmSignature Sig({static_cast<wasm::ValType>(BT)}, {});
+ TC.setLastSig(Sig);
+ NestingStack.back().Sig = Sig;
+ }
Operands.push_back(std::make_unique<WebAssemblyOperand>(
WebAssemblyOperand::Integer, NameLoc, NameLoc,
WebAssemblyOperand::IntOp{static_cast<int64_t>(BT)}));
}
+ bool parseLimits(wasm::WasmLimits *Limits) {
+ auto Tok = Lexer.getTok();
+ if (!Tok.is(AsmToken::Integer))
+ return error("Expected integer constant, instead got: ", Tok);
+ int64_t Val = Tok.getIntVal();
+ assert(Val >= 0);
+ Limits->Minimum = Val;
+ Parser.Lex();
+
+ if (isNext(AsmToken::Comma)) {
+ Limits->Flags |= wasm::WASM_LIMITS_FLAG_HAS_MAX;
+ auto Tok = Lexer.getTok();
+ if (!Tok.is(AsmToken::Integer))
+ return error("Expected integer constant, instead got: ", Tok);
+ int64_t Val = Tok.getIntVal();
+ assert(Val >= 0);
+ Limits->Maximum = Val;
+ Parser.Lex();
+ }
+ return false;
+ }
+
+ bool parseFunctionTableOperand(std::unique_ptr<WebAssemblyOperand> *Op) {
+ if (STI->checkFeatures("+reference-types")) {
+ // If the reference-types feature is enabled, there is an explicit table
+ // operand. To allow the same assembly to be compiled with or without
+ // reference types, we allow the operand to be omitted, in which case we
+ // default to __indirect_function_table.
+ auto &Tok = Lexer.getTok();
+ if (Tok.is(AsmToken::Identifier)) {
+ auto *Sym =
+ GetOrCreateFunctionTableSymbol(getContext(), Tok.getString());
+ const auto *Val = MCSymbolRefExpr::create(Sym, getContext());
+ *Op = std::make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Symbol, Tok.getLoc(), Tok.getEndLoc(),
+ WebAssemblyOperand::SymOp{Val});
+ Parser.Lex();
+ return expect(AsmToken::Comma, ",");
+ } else {
+ const auto *Val =
+ MCSymbolRefExpr::create(DefaultFunctionTable, getContext());
+ *Op = std::make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Symbol, SMLoc(), SMLoc(),
+ WebAssemblyOperand::SymOp{Val});
+ return false;
+ }
+ } else {
+ // For the MVP there is at most one table whose number is 0, but we can't
+ // write a table symbol or issue relocations. Instead we just ensure the
+ // table is live and write a zero.
+ getStreamer().emitSymbolAttribute(DefaultFunctionTable, MCSA_NoDeadStrip);
+ *Op = std::make_unique<WebAssemblyOperand>(WebAssemblyOperand::Integer,
+ SMLoc(), SMLoc(),
+ WebAssemblyOperand::IntOp{0});
+ return false;
+ }
+ }
+
bool ParseInstruction(ParseInstructionInfo & /*Info*/, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override {
// Note: Name does NOT point into the sourcecode, but to a local, so
@@ -508,6 +572,7 @@
bool ExpectBlockType = false;
bool ExpectFuncType = false;
bool ExpectHeapType = false;
+ std::unique_ptr<WebAssemblyOperand> FunctionTable;
if (Name == "block") {
push(Block);
ExpectBlockType = true;
@@ -528,10 +593,17 @@
if (pop(Name, Try))
return true;
push(Try);
+ } else if (Name == "catch_all") {
+ if (pop(Name, Try))
+ return true;
+ push(CatchAll);
} else if (Name == "end_if") {
if (pop(Name, If, Else))
return true;
} else if (Name == "end_try") {
+ if (pop(Name, Try, CatchAll))
+ return true;
+ } else if (Name == "delegate") {
if (pop(Name, Try))
return true;
} else if (Name == "end_loop") {
@@ -546,16 +618,12 @@
if (pop(Name, Function) || ensureEmptyNestingStack())
return true;
} else if (Name == "call_indirect" || Name == "return_call_indirect") {
+ // These instructions have differing operand orders in the text format vs
+ // the binary formats. The MC instructions follow the binary format, so
+ // here we stash away the operand and append it later.
+ if (parseFunctionTableOperand(&FunctionTable))
+ return true;
ExpectFuncType = true;
- // Ensure that the object file has a __indirect_function_table import, as
- // we call_indirect against it.
- auto &Ctx = getStreamer().getContext();
- MCSymbolWasm *Sym =
- GetOrCreateFunctionTableSymbol(Ctx, "__indirect_function_table");
- // Until call_indirect emits TABLE_NUMBER relocs against this symbol, mark
- // it as NO_STRIP so as to ensure that the indirect function table makes
- // it to linked output.
- Sym->setNoStrip();
} else if (Name == "ref.null") {
ExpectHeapType = true;
}
@@ -571,7 +639,10 @@
return true;
// Got signature as block type, don't need more
ExpectBlockType = false;
- auto &Ctx = getStreamer().getContext();
+ TC.setLastSig(*Signature.get());
+ if (ExpectBlockType)
+ NestingStack.back().Sig = *Signature.get();
+ auto &Ctx = getContext();
// The "true" here will cause this to be a nameless symbol.
MCSymbol *Sym = Ctx.createTempSymbol("typeindex", true);
auto *WasmSym = cast<MCSymbolWasm>(Sym);
@@ -594,13 +665,13 @@
auto &Id = Lexer.getTok();
if (ExpectBlockType) {
// Assume this identifier is a block_type.
- auto BT = parseBlockType(Id.getString());
+ auto BT = WebAssembly::parseBlockType(Id.getString());
if (BT == WebAssembly::BlockType::Invalid)
return error("Unknown block type: ", Id);
addBlockTypeOperand(Operands, NameLoc, BT);
Parser.Lex();
} else if (ExpectHeapType) {
- auto HeapType = parseHeapType(Id.getString());
+ auto HeapType = WebAssembly::parseHeapType(Id.getString());
if (HeapType == WebAssembly::HeapType::Invalid) {
return error("Expected a heap type: ", Id);
}
@@ -674,15 +745,12 @@
// Support blocks with no operands as default to void.
addBlockTypeOperand(Operands, NameLoc, WebAssembly::BlockType::Void);
}
+ if (FunctionTable)
+ Operands.push_back(std::move(FunctionTable));
Parser.Lex();
return false;
}
- void onLabelParsed(MCSymbol *Symbol) override {
- LastLabel = Symbol;
- CurrentState = Label;
- }
-
bool parseSignature(wasm::WasmSignature *Signature) {
if (expect(AsmToken::LParen, "("))
return true;
@@ -740,7 +808,7 @@
auto TypeName = expectIdent();
if (TypeName.empty())
return true;
- auto Type = parseType(TypeName);
+ auto Type = WebAssembly::parseType(TypeName);
if (!Type)
return error("Unknown type in .globaltype directive: ", TypeTok);
// Optional mutable modifier. Default to mutable for historical reasons.
@@ -767,24 +835,31 @@
}
if (DirectiveID.getString() == ".tabletype") {
+ // .tabletype SYM, ELEMTYPE[, MINSIZE[, MAXSIZE]]
auto SymName = expectIdent();
if (SymName.empty())
return true;
if (expect(AsmToken::Comma, ","))
return true;
- auto TypeTok = Lexer.getTok();
- auto TypeName = expectIdent();
- if (TypeName.empty())
+
+ auto ElemTypeTok = Lexer.getTok();
+ auto ElemTypeName = expectIdent();
+ if (ElemTypeName.empty())
return true;
- auto Type = parseType(TypeName);
- if (!Type)
- return error("Unknown type in .tabletype directive: ", TypeTok);
+ Optional<wasm::ValType> ElemType = WebAssembly::parseType(ElemTypeName);
+ if (!ElemType)
+ return error("Unknown type in .tabletype directive: ", ElemTypeTok);
+
+ wasm::WasmLimits Limits = DefaultLimits();
+ if (isNext(AsmToken::Comma) && parseLimits(&Limits))
+ return true;
// Now that we have the name and table type, we can actually create the
// symbol
auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE);
- WasmSym->setTableType(Type.getValue());
+ wasm::WasmTableType Type = {uint8_t(ElemType.getValue()), Limits};
+ WasmSym->setTableType(Type);
TOut.emitTableType(WasmSym);
return expect(AsmToken::EndOfStatement, "EOL");
}
@@ -799,17 +874,18 @@
if (SymName.empty())
return true;
auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
- if (CurrentState == Label && WasmSym == LastLabel) {
+ if (WasmSym->isDefined()) {
// This .functype indicates a start of a function.
if (ensureEmptyNestingStack())
return true;
CurrentState = FunctionStart;
- LastFunctionLabel = LastLabel;
+ LastFunctionLabel = WasmSym;
push(Function);
}
auto Signature = std::make_unique<wasm::WasmSignature>();
if (parseSignature(Signature.get()))
return true;
+ TC.funcDecl(*Signature);
WasmSym->setSignature(Signature.get());
addSignature(std::move(Signature));
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
@@ -854,7 +930,7 @@
TOut.emitImportName(WasmSym, ImportName);
}
- if (DirectiveID.getString() == ".eventtype") {
+ if (DirectiveID.getString() == ".tagtype") {
auto SymName = expectIdent();
if (SymName.empty())
return true;
@@ -864,19 +940,20 @@
return true;
WasmSym->setSignature(Signature.get());
addSignature(std::move(Signature));
- WasmSym->setType(wasm::WASM_SYMBOL_TYPE_EVENT);
- TOut.emitEventType(WasmSym);
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TAG);
+ TOut.emitTagType(WasmSym);
// TODO: backend also calls TOut.emitIndIdx, but that is not implemented.
return expect(AsmToken::EndOfStatement, "EOL");
}
if (DirectiveID.getString() == ".local") {
if (CurrentState != FunctionStart)
- return error(".local directive should follow the start of a function",
+ return error(".local directive should follow the start of a function: ",
Lexer.getTok());
SmallVector<wasm::ValType, 4> Locals;
if (parseRegTypeList(Locals))
return true;
+ TC.localDecl(Locals);
TOut.emitLocal(Locals);
CurrentState = FunctionLocals;
return expect(AsmToken::EndOfStatement, "EOL");
@@ -941,7 +1018,7 @@
if (Op0.getImm() == -1)
Op0.setImm(Align);
}
- if (getSTI().getTargetTriple().isArch64Bit()) {
+ if (is64) {
// Upgrade 32-bit loads/stores to 64-bit. These mostly differ by having
// an offset64 arg instead of offset32, but to the assembler matcher
// they're both immediates so don't get selected for.
@@ -951,9 +1028,11 @@
Inst.setOpcode(Opc64);
}
}
+ if (!SkipTypeCheck && TC.typeCheck(IDLoc, Inst))
+ return true;
Out.emitInstruction(Inst, getSTI());
if (CurrentState == EndFunction) {
- onEndOfFunction();
+ onEndOfFunction(IDLoc);
} else {
CurrentState = Instructions;
}
@@ -990,6 +1069,20 @@
}
void doBeforeLabelEmit(MCSymbol *Symbol) override {
+ // Code below only applies to labels in text sections.
+ auto CWS = cast<MCSectionWasm>(getStreamer().getCurrentSection().first);
+ if (!CWS || !CWS->getKind().isText())
+ return;
+
+ auto WasmSym = cast<MCSymbolWasm>(Symbol);
+ // Unlike other targets, we don't allow data in text sections (labels
+ // declared with .type @object).
+ if (WasmSym->getType() == wasm::WASM_SYMBOL_TYPE_DATA) {
+ Parser.Error(Parser.getTok().getLoc(),
+ "Wasm doesn\'t support data symbols in text sections");
+ return;
+ }
+
// Start a new section for the next function automatically, since our
// object writer expects each function to have its own section. This way
// The user can't forget this "convention".
@@ -997,14 +1090,10 @@
if (SymName.startswith(".L"))
return; // Local Symbol.
- // Only create a new text section if we're already in one.
// TODO: If the user explicitly creates a new function section, we ignore
// its name when we create this one. It would be nice to honor their
// choice, while still ensuring that we create one if they forget.
// (that requires coordination with WasmAsmParser::parseSectionDirective)
- auto CWS = cast<MCSectionWasm>(getStreamer().getCurrentSection().first);
- if (!CWS || !CWS->getKind().isText())
- return;
auto SecName = ".text." + SymName;
auto *Group = CWS->getGroup();
@@ -1013,9 +1102,9 @@
// for importing comdat functions. But there's no way to specify that in
// assembly currently.
if (Group)
- cast<MCSymbolWasm>(Symbol)->setComdat(true);
+ WasmSym->setComdat(true);
auto *WS =
- getContext().getWasmSection(SecName, SectionKind::getText(), Group,
+ getContext().getWasmSection(SecName, SectionKind::getText(), 0, Group,
MCContext::GenericSectionID, nullptr);
getStreamer().SwitchSection(WS);
// Also generate DWARF for this section if requested.
@@ -1023,7 +1112,9 @@
getContext().addGenDwarfSection(WS);
}
- void onEndOfFunction() {
+ void onEndOfFunction(SMLoc ErrorLoc) {
+ TC.endOfFunction(ErrorLoc);
+
// Automatically output a .size directive, so it becomes optional for the
// user.
if (!LastFunctionLabel) return;
@@ -1050,3 +1141,14 @@
#define GET_SUBTARGET_FEATURE_NAME
#define GET_MATCHER_IMPLEMENTATION
#include "WebAssemblyGenAsmMatcher.inc"
+
+StringRef GetMnemonic(unsigned Opc) {
+ // FIXME: linear search!
+ for (auto &ME : MatchTable0) {
+ if (ME.Opcode == Opc) {
+ return ME.getMnemonic();
+ }
+ }
+ assert(false && "mnemonic not found");
+ return StringRef();
+}
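
The .tabletype handling added above accepts optional limits after the element type, per the ".tabletype SYM, ELEMTYPE[, MINSIZE[, MAXSIZE]]" comment in the hunk. The list below sketches the accepted shapes; the symbol name and sizes are illustrative only.

// Directive forms the updated parser accepts (illustrative values).
const char *TableTypeExamples[] = {
    ".tabletype tbl, funcref",        // limits default to {flags=0, min=0, max=0}
    ".tabletype tbl, funcref, 4",     // explicit minimum, no maximum
    ".tabletype tbl, funcref, 4, 16", // explicit minimum and maximum
};
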
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
new file mode 100644
index 0000000..2f9245a7
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
@@ -0,0 +1,263 @@
+//==- WebAssemblyAsmTypeCheck.cpp - Assembler for WebAssembly -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file is part of the WebAssembly Assembler.
+///
+/// It contains code to type check instructions from a parsed .s file.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AsmParser/WebAssemblyAsmTypeCheck.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
+#include "TargetInfo/WebAssemblyTargetInfo.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
+#include "Utils/WebAssemblyUtilities.h"
+#include "WebAssembly.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-asm-parser"
+
+extern StringRef GetMnemonic(unsigned Opc);
+
+namespace llvm {
+
+WebAssemblyAsmTypeCheck::WebAssemblyAsmTypeCheck(MCAsmParser &Parser,
+ const MCInstrInfo &MII, bool is64)
+ : Parser(Parser), MII(MII), is64(is64) {
+}
+
+void WebAssemblyAsmTypeCheck::funcDecl(const wasm::WasmSignature &Sig) {
+ LocalTypes.assign(Sig.Params.begin(), Sig.Params.end());
+ ReturnTypes.assign(Sig.Returns.begin(), Sig.Returns.end());
+}
+
+void WebAssemblyAsmTypeCheck::localDecl(const SmallVector<wasm::ValType, 4> &Locals) {
+ LocalTypes.insert(LocalTypes.end(), Locals.begin(), Locals.end());
+}
+
+void WebAssemblyAsmTypeCheck::dumpTypeStack(Twine Msg) {
+ LLVM_DEBUG({
+ std::string s;
+ for (auto VT : Stack) {
+ s += WebAssembly::typeToString(VT);
+ s += " ";
+ }
+ dbgs() << Msg << s << '\n';
+ });
+}
+
+bool WebAssemblyAsmTypeCheck::typeError(SMLoc ErrorLoc, const Twine &Msg) {
+ // Once you get one type error in a function, it will likely trigger more
+ // which are mostly not helpful.
+ if (TypeErrorThisFunction)
+ return true;
+ TypeErrorThisFunction = true;
+ dumpTypeStack("current stack: ");
+ return Parser.Error(ErrorLoc, Msg);
+}
+
+bool WebAssemblyAsmTypeCheck::popType(SMLoc ErrorLoc,
+ Optional<wasm::ValType> EVT) {
+ if (Stack.empty()) {
+ return typeError(ErrorLoc,
+ EVT.hasValue()
+ ? StringRef("empty stack while popping ") +
+ WebAssembly::typeToString(EVT.getValue())
+ : StringRef(
+ "empty stack while popping value"));
+ }
+ auto PVT = Stack.back();
+ Stack.pop_back();
+ if (EVT.hasValue() && EVT.getValue() != PVT) {
+ return typeError(
+ ErrorLoc, StringRef("popped ") + WebAssembly::typeToString(PVT) +
+ ", expected " +
+ WebAssembly::typeToString(EVT.getValue()));
+ }
+ return false;
+}
+
+bool WebAssemblyAsmTypeCheck::getLocal(SMLoc ErrorLoc, const MCInst &Inst,
+ wasm::ValType &Type) {
+ auto Local = static_cast<size_t>(Inst.getOperand(0).getImm());
+ if (Local >= LocalTypes.size())
+ return typeError(ErrorLoc, StringRef("no local type specified for index ") +
+ std::to_string(Local));
+ Type = LocalTypes[Local];
+ return false;
+}
+
+bool WebAssemblyAsmTypeCheck::checkEnd(SMLoc ErrorLoc) {
+ if (LastSig.Returns.size() > Stack.size())
+ return typeError(ErrorLoc, "end: insufficient values on the type stack");
+ for (size_t i = 0; i < LastSig.Returns.size(); i++) {
+ auto EVT = LastSig.Returns[i];
+ auto PVT = Stack[Stack.size() - LastSig.Returns.size() + i];
+ if (PVT != EVT)
+ return typeError(
+ ErrorLoc, StringRef("end got ") + WebAssembly::typeToString(PVT) +
+ ", expected " + WebAssembly::typeToString(EVT));
+ }
+ return false;
+}
+
+bool WebAssemblyAsmTypeCheck::checkSig(SMLoc ErrorLoc,
+ const wasm::WasmSignature& Sig) {
+ for (auto VT : llvm::reverse(Sig.Params))
+ if (popType(ErrorLoc, VT)) return true;
+ Stack.insert(Stack.end(), Sig.Returns.begin(), Sig.Returns.end());
+ return false;
+}
+
+bool WebAssemblyAsmTypeCheck::getSymRef(SMLoc ErrorLoc, const MCInst &Inst,
+ const MCSymbolRefExpr *&SymRef) {
+ auto Op = Inst.getOperand(0);
+ if (!Op.isExpr())
+ return typeError(ErrorLoc, StringRef("expected expression operand"));
+ SymRef = dyn_cast<MCSymbolRefExpr>(Op.getExpr());
+ if (!SymRef)
+ return typeError(ErrorLoc, StringRef("expected symbol operand"));
+ return false;
+}
+
+bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCInst &Inst,
+ wasm::ValType &Type) {
+ const MCSymbolRefExpr *SymRef;
+ if (getSymRef(ErrorLoc, Inst, SymRef))
+ return true;
+ auto WasmSym = cast<MCSymbolWasm>(&SymRef->getSymbol());
+ switch (WasmSym->getType().getValueOr(wasm::WASM_SYMBOL_TYPE_DATA)) {
+ case wasm::WASM_SYMBOL_TYPE_GLOBAL:
+ Type = static_cast<wasm::ValType>(WasmSym->getGlobalType().Type);
+ break;
+ case wasm::WASM_SYMBOL_TYPE_FUNCTION:
+ case wasm::WASM_SYMBOL_TYPE_DATA:
+ if (SymRef->getKind() == MCSymbolRefExpr::VK_GOT) {
+ Type = is64 ? wasm::ValType::I64 : wasm::ValType::I32;
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ default:
+ return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() +
+ " missing .globaltype");
+ }
+ return false;
+}
+
+void WebAssemblyAsmTypeCheck::endOfFunction(SMLoc ErrorLoc) {
+ // Check the return types.
+ for (auto RVT : llvm::reverse(ReturnTypes)) {
+ popType(ErrorLoc, RVT);
+ }
+ if (!Stack.empty()) {
+ typeError(ErrorLoc,
+ std::to_string(Stack.size()) + " superfluous return values");
+ }
+ // Reset the type checker state.
+ Clear();
+}
+
+bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst) {
+ auto Opc = Inst.getOpcode();
+ auto Name = GetMnemonic(Opc);
+ dumpTypeStack("typechecking " + Name + ": ");
+ wasm::ValType Type;
+ if (Name == "local.get") {
+ if (getLocal(ErrorLoc, Inst, Type))
+ return true;
+ Stack.push_back(Type);
+ } else if (Name == "local.set") {
+ if (getLocal(ErrorLoc, Inst, Type))
+ return true;
+ if (popType(ErrorLoc, Type))
+ return true;
+ } else if (Name == "local.tee") {
+ if (getLocal(ErrorLoc, Inst, Type))
+ return true;
+ if (popType(ErrorLoc, Type))
+ return true;
+ Stack.push_back(Type);
+ } else if (Name == "global.get") {
+ if (getGlobal(ErrorLoc, Inst, Type))
+ return true;
+ Stack.push_back(Type);
+ } else if (Name == "global.set") {
+ if (getGlobal(ErrorLoc, Inst, Type))
+ return true;
+ if (popType(ErrorLoc, Type))
+ return true;
+ } else if (Name == "drop") {
+ if (popType(ErrorLoc, {}))
+ return true;
+ } else if (Name == "end_block" || Name == "end_loop" || Name == "end_if" ||
+ Name == "else") {
+ if (checkEnd(ErrorLoc))
+ return true;
+ } else if (Name == "call_indirect" || Name == "return_call_indirect") {
+ // Function value.
+ if (popType(ErrorLoc, wasm::ValType::I32)) return true;
+ if (checkSig(ErrorLoc, LastSig)) return true;
+ } else if (Name == "call" || Name == "return_call") {
+ const MCSymbolRefExpr *SymRef;
+ if (getSymRef(ErrorLoc, Inst, SymRef))
+ return true;
+ auto WasmSym = cast<MCSymbolWasm>(&SymRef->getSymbol());
+ auto Sig = WasmSym->getSignature();
+ if (!Sig || WasmSym->getType() != wasm::WASM_SYMBOL_TYPE_FUNCTION)
+ return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() +
+ " missing .functype");
+ if (checkSig(ErrorLoc, *Sig)) return true;
+ } else if (Name == "ref.null") {
+ auto VT = static_cast<wasm::ValType>(Inst.getOperand(0).getImm());
+ Stack.push_back(VT);
+ } else {
+ // The current instruction is a stack instruction which doesn't have
+ // explicit operands that indicate push/pop types, so we get those from
+ // the register version of the same instruction.
+ auto RegOpc = WebAssembly::getRegisterOpcode(Opc);
+ assert(RegOpc != -1 && "Failed to get register version of MC instruction");
+ const auto &II = MII.get(RegOpc);
+ // First pop all the uses off the stack and check them.
+ for (unsigned I = II.getNumOperands(); I > II.getNumDefs(); I--) {
+ const auto &Op = II.OpInfo[I - 1];
+ if (Op.OperandType == MCOI::OPERAND_REGISTER) {
+ auto VT = WebAssembly::regClassToValType(Op.RegClass);
+ if (popType(ErrorLoc, VT))
+ return true;
+ }
+ }
+ // Now push all the defs onto the stack.
+ for (unsigned I = 0; I < II.getNumDefs(); I++) {
+ const auto &Op = II.OpInfo[I];
+ assert(Op.OperandType == MCOI::OPERAND_REGISTER && "Register expected");
+ auto VT = WebAssembly::regClassToValType(Op.RegClass);
+ Stack.push_back(VT);
+ }
+ }
+ return false;
+}
+
+} // end namespace llvm
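
The generic branch of typeCheck above derives the pop/push types from the register form of the instruction and simulates the value stack: uses are popped in reverse and defs are pushed. A standalone C++ sketch of that discipline follows; SimpleOp and its fields are illustrative stand-ins for the MCInstrDesc operand info, not LLVM types.

  // Standalone sketch of stack-based type checking in the spirit of the
  // generic path of WebAssemblyAsmTypeCheck::typeCheck. Not LLVM code.
  #include <cstdio>
  #include <string>
  #include <vector>

  enum class ValType { I32, I64, F32, F64 };

  struct SimpleOp {
    std::string Name;
    std::vector<ValType> Uses; // popped from the stack, last use popped first
    std::vector<ValType> Defs; // pushed onto the stack
  };

  static const char *toString(ValType T) {
    switch (T) {
    case ValType::I32: return "i32";
    case ValType::I64: return "i64";
    case ValType::F32: return "f32";
    case ValType::F64: return "f64";
    }
    return "?";
  }

  // Returns true on error, mirroring the convention used above.
  static bool typeCheck(std::vector<ValType> &Stack, const SimpleOp &Op) {
    for (auto It = Op.Uses.rbegin(); It != Op.Uses.rend(); ++It) {
      if (Stack.empty()) {
        std::printf("%s: empty stack while popping %s\n", Op.Name.c_str(),
                    toString(*It));
        return true;
      }
      ValType Got = Stack.back();
      Stack.pop_back();
      if (Got != *It) {
        std::printf("%s: popped %s, expected %s\n", Op.Name.c_str(),
                    toString(Got), toString(*It));
        return true;
      }
    }
    for (ValType T : Op.Defs)
      Stack.push_back(T);
    return false;
  }

  int main() {
    std::vector<ValType> Stack;
    SimpleOp Const{"i32.const", {}, {ValType::I32}};
    SimpleOp Add{"i32.add", {ValType::I32, ValType::I32}, {ValType::I32}};
    typeCheck(Stack, Const);
    typeCheck(Stack, Const);
    if (!typeCheck(Stack, Add))
      std::printf("ok, stack depth = %zu\n", Stack.size());
    return 0;
  }
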
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h b/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h
new file mode 100644
index 0000000..a15a69b
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h
@@ -0,0 +1,66 @@
+//==- WebAssemblyAsmTypeCheck.h - Assembler for WebAssembly -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file is part of the WebAssembly Assembler.
+///
+/// It contains the type checker used to validate the instructions produced by
+/// the parser against the value stack.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_ASMPARSER_TYPECHECK_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_ASMPARSER_TYPECHECK_H
+
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/MC/MCSymbol.h"
+
+namespace llvm {
+
+class WebAssemblyAsmTypeCheck final {
+ MCAsmParser &Parser;
+ const MCInstrInfo &MII;
+
+ SmallVector<wasm::ValType, 8> Stack;
+ SmallVector<wasm::ValType, 16> LocalTypes;
+ SmallVector<wasm::ValType, 4> ReturnTypes;
+ wasm::WasmSignature LastSig;
+ bool TypeErrorThisFunction = false;
+ bool is64;
+
+ void Clear() {
+ Stack.clear();
+ LocalTypes.clear();
+ ReturnTypes.clear();
+ TypeErrorThisFunction = false;
+ }
+
+ void dumpTypeStack(Twine Msg);
+ bool typeError(SMLoc ErrorLoc, const Twine &Msg);
+ bool popType(SMLoc ErrorLoc, Optional<wasm::ValType> EVT);
+ bool getLocal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type);
+ bool checkEnd(SMLoc ErrorLoc);
+ bool checkSig(SMLoc ErrorLoc, const wasm::WasmSignature &Sig);
+ bool getSymRef(SMLoc ErrorLoc, const MCInst &Inst,
+ const MCSymbolRefExpr *&SymRef);
+ bool getGlobal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type);
+
+public:
+ WebAssemblyAsmTypeCheck(MCAsmParser &Parser, const MCInstrInfo &MII, bool is64);
+
+ void funcDecl(const wasm::WasmSignature &Sig);
+ void localDecl(const SmallVector<wasm::ValType, 4> &Locals);
+ void setLastSig(const wasm::WasmSignature &Sig) { LastSig = Sig; }
+ void endOfFunction(SMLoc ErrorLoc);
+ bool typeCheck(SMLoc ErrorLoc, const MCInst &Inst);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_WEBASSEMBLY_ASMPARSER_TYPECHECK_H
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/WebAssembly/CMakeLists.txt
index 87920e7..5ce8df8 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/CMakeLists.txt
@@ -38,6 +38,8 @@
WebAssemblyLowerGlobalDtors.cpp
WebAssemblyMachineFunctionInfo.cpp
WebAssemblyMCInstLower.cpp
+ WebAssemblyMCLowerPrePass.cpp
+ WebAssemblyNullifyDebugValueLists.cpp
WebAssemblyOptimizeLiveIntervals.cpp
WebAssemblyOptimizeReturned.cpp
WebAssemblyPeephole.cpp
@@ -56,7 +58,6 @@
WebAssemblyTargetMachine.cpp
WebAssemblyTargetObjectFile.cpp
WebAssemblyTargetTransformInfo.cpp
- WebAssemblyUtilities.cpp
DEPENDS
intrinsics_gen
@@ -75,6 +76,7 @@
TransformUtils
WebAssemblyDesc
WebAssemblyInfo
+ WebAssemblyUtils
ADD_TO_COMPONENT
WebAssembly
@@ -84,3 +86,4 @@
add_subdirectory(Disassembler)
add_subdirectory(MCTargetDesc)
add_subdirectory(TargetInfo)
+add_subdirectory(Utils)
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt
index 1782f57..bb18b4d 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt
@@ -5,6 +5,7 @@
WebAssemblyDesc
MCDisassembler
WebAssemblyInfo
+ WebAssemblyUtils
Support
MC
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 1b7cc09..6770ccc 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -14,9 +14,8 @@
///
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/WebAssemblyInstPrinter.h"
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "TargetInfo/WebAssemblyTargetInfo.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
@@ -114,7 +113,8 @@
Bytes.data() + Size);
Size += sizeof(T);
if (std::is_floating_point<T>::value) {
- MI.addOperand(MCOperand::createFPImm(static_cast<double>(Val)));
+ MI.addOperand(
+ MCOperand::createDFPImm(bit_cast<uint64_t>(static_cast<double>(Val))));
} else {
MI.addOperand(MCOperand::createImm(static_cast<int64_t>(Val)));
}
@@ -203,7 +203,7 @@
case WebAssembly::OPERAND_OFFSET64:
case WebAssembly::OPERAND_P2ALIGN:
case WebAssembly::OPERAND_TYPEINDEX:
- case WebAssembly::OPERAND_EVENT:
+ case WebAssembly::OPERAND_TAG:
case MCOI::OPERAND_IMMEDIATE: {
if (!parseLEBImmediate(MI, Size, Bytes, false))
return MCDisassembler::Fail;
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
index 6a68e7d..ac7d492 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
@@ -11,6 +11,7 @@
MC
Support
WebAssemblyInfo
+ WebAssemblyUtils
ADD_TO_COMPONENT
WebAssembly
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
index fb8b0c3..2967aaa 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
@@ -13,9 +13,10 @@
#include "MCTargetDesc/WebAssemblyInstPrinter.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
-#include "WebAssemblyUtilities.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -48,8 +49,35 @@
StringRef Annot,
const MCSubtargetInfo &STI,
raw_ostream &OS) {
- // Print the instruction (this uses the AsmStrings from the .td files).
- printInstruction(MI, Address, OS);
+ switch (MI->getOpcode()) {
+ case WebAssembly::CALL_INDIRECT_S:
+ case WebAssembly::RET_CALL_INDIRECT_S: {
+ // A special case for call_indirect (and ret_call_indirect), if the table
+ // operand is a symbol: the order of the type and table operands is inverted
+    // in the text format relative to the binary format. Otherwise, if the table
+    // operand isn't a symbol, then we have an MVP compilation unit, and the
+ // table shouldn't appear in the output.
+ OS << "\t";
+ OS << getMnemonic(MI).first;
+ OS << " ";
+
+ assert(MI->getNumOperands() == 2);
+ const unsigned TypeOperand = 0;
+ const unsigned TableOperand = 1;
+ if (MI->getOperand(TableOperand).isExpr()) {
+ printOperand(MI, TableOperand, OS);
+ OS << ", ";
+ } else {
+ assert(MI->getOperand(TableOperand).getImm() == 0);
+ }
+ printOperand(MI, TypeOperand, OS);
+ break;
+ }
+ default:
+ // Print the instruction (this uses the AsmStrings from the .td files).
+ printInstruction(MI, Address, OS);
+ break;
+ }
// Print any additional variadic operands.
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
@@ -68,7 +96,7 @@
for (auto I = Start, E = MI->getNumOperands(); I < E; ++I) {
if (MI->getOpcode() == WebAssembly::CALL_INDIRECT &&
I - Start == NumVariadicDefs) {
- // Skip type and flags arguments when printing for tests
+ // Skip type and table arguments when printing for tests.
++I;
continue;
}
@@ -104,7 +132,8 @@
case WebAssembly::TRY:
case WebAssembly::TRY_S:
ControlFlowStack.push_back(std::make_pair(ControlFlowCounter, false));
- EHPadStack.push_back(ControlFlowCounter++);
+ TryStack.push_back(ControlFlowCounter++);
+ EHInstStack.push_back(TRY);
return;
case WebAssembly::END_LOOP:
@@ -128,11 +157,12 @@
case WebAssembly::END_TRY:
case WebAssembly::END_TRY_S:
- if (ControlFlowStack.empty()) {
+ if (ControlFlowStack.empty() || EHInstStack.empty()) {
printAnnotation(OS, "End marker mismatch!");
} else {
printAnnotation(
OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
+ EHInstStack.pop_back();
}
return;
@@ -140,10 +170,24 @@
case WebAssembly::CATCH_S:
case WebAssembly::CATCH_ALL:
case WebAssembly::CATCH_ALL_S:
- if (EHPadStack.empty()) {
+    // There can be multiple catch instructions for one try instruction, so
+    // we print the label only for the first 'catch'.
+ if (EHInstStack.empty()) {
printAnnotation(OS, "try-catch mismatch!");
- } else {
- printAnnotation(OS, "catch" + utostr(EHPadStack.pop_back_val()) + ':');
+ } else if (EHInstStack.back() == CATCH_ALL) {
+ printAnnotation(OS, "catch/catch_all cannot occur after catch_all");
+ } else if (EHInstStack.back() == TRY) {
+ if (TryStack.empty()) {
+ printAnnotation(OS, "try-catch mismatch!");
+ } else {
+ printAnnotation(OS, "catch" + utostr(TryStack.pop_back_val()) + ':');
+ }
+ EHInstStack.pop_back();
+ if (Opc == WebAssembly::CATCH || Opc == WebAssembly::CATCH_S) {
+ EHInstStack.push_back(CATCH);
+ } else {
+ EHInstStack.push_back(CATCH_ALL);
+ }
}
return;
@@ -151,10 +195,39 @@
case WebAssembly::RETHROW_S:
// 'rethrow' rethrows to the nearest enclosing catch scope, if any. If
// there's no enclosing catch scope, it throws up to the caller.
- if (EHPadStack.empty()) {
+ if (TryStack.empty()) {
printAnnotation(OS, "to caller");
} else {
- printAnnotation(OS, "down to catch" + utostr(EHPadStack.back()));
+ printAnnotation(OS, "down to catch" + utostr(TryStack.back()));
+ }
+ return;
+
+ case WebAssembly::DELEGATE:
+ case WebAssembly::DELEGATE_S:
+ if (ControlFlowStack.empty() || TryStack.empty() || EHInstStack.empty()) {
+ printAnnotation(OS, "try-delegate mismatch!");
+ } else {
+ // 'delegate' is
+      // 1. A marker for the end-of-block label
+ // 2. A destination for throwing instructions
+ // 3. An instruction that itself rethrows to another 'catch'
+ assert(ControlFlowStack.back().first == TryStack.back());
+ std::string Label = "label/catch" +
+ utostr(ControlFlowStack.pop_back_val().first) +
+ ": ";
+ TryStack.pop_back();
+ EHInstStack.pop_back();
+ uint64_t Depth = MI->getOperand(0).getImm();
+ if (Depth >= ControlFlowStack.size()) {
+ Label += "to caller";
+ } else {
+ const auto &Pair = ControlFlowStack.rbegin()[Depth];
+ if (Pair.second)
+ printAnnotation(OS, "delegate cannot target a loop");
+ else
+ Label += "down to catch" + utostr(Pair.first);
+ }
+ printAnnotation(OS, Label);
}
return;
}
@@ -235,17 +308,10 @@
O << '=';
} else if (Op.isImm()) {
O << Op.getImm();
- } else if (Op.isFPImm()) {
- const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- const MCOperandInfo &Info = Desc.OpInfo[OpNo];
- if (Info.OperandType == WebAssembly::OPERAND_F32IMM) {
- // TODO: MC converts all floating point immediate operands to double.
- // This is fine for numeric values, but may cause NaNs to change bits.
- O << ::toString(APFloat(float(Op.getFPImm())));
- } else {
- assert(Info.OperandType == WebAssembly::OPERAND_F64IMM);
- O << ::toString(APFloat(Op.getFPImm()));
- }
+ } else if (Op.isSFPImm()) {
+ O << ::toString(APFloat(APFloat::IEEEsingle(), APInt(32, Op.getSFPImm())));
+ } else if (Op.isDFPImm()) {
+ O << ::toString(APFloat(APFloat::IEEEdouble(), APInt(64, Op.getDFPImm())));
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
// call_indirect instructions have a TYPEINDEX operand that we print
@@ -323,52 +389,3 @@
O << "unsupported_heap_type_operand";
}
}
-
-// We have various enums representing a subset of these types, use this
-// function to convert any of them to text.
-const char *WebAssembly::anyTypeToString(unsigned Ty) {
- switch (Ty) {
- case wasm::WASM_TYPE_I32:
- return "i32";
- case wasm::WASM_TYPE_I64:
- return "i64";
- case wasm::WASM_TYPE_F32:
- return "f32";
- case wasm::WASM_TYPE_F64:
- return "f64";
- case wasm::WASM_TYPE_V128:
- return "v128";
- case wasm::WASM_TYPE_FUNCREF:
- return "funcref";
- case wasm::WASM_TYPE_EXTERNREF:
- return "externref";
- case wasm::WASM_TYPE_FUNC:
- return "func";
- case wasm::WASM_TYPE_NORESULT:
- return "void";
- default:
- return "invalid_type";
- }
-}
-
-const char *WebAssembly::typeToString(wasm::ValType Ty) {
- return anyTypeToString(static_cast<unsigned>(Ty));
-}
-
-std::string WebAssembly::typeListToString(ArrayRef<wasm::ValType> List) {
- std::string S;
- for (auto &Ty : List) {
- if (&Ty != &List[0]) S += ", ";
- S += WebAssembly::typeToString(Ty);
- }
- return S;
-}
-
-std::string WebAssembly::signatureToString(const wasm::WasmSignature *Sig) {
- std::string S("(");
- S += typeListToString(Sig->Params);
- S += ") -> (";
- S += typeListToString(Sig->Returns);
- S += ")";
- return S;
-}
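
The printer now reads float immediates back from their raw bit patterns (isSFPImm/isDFPImm) instead of a shared double, which addresses the NaN-bits caveat in the removed TODO. The following standalone sketch, assuming only standard C++, shows the underlying issue: on typical hardware a float->double->float round trip quiets a signaling NaN and so changes its bits, while carrying the raw 32-bit pattern keeps them.

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  static uint32_t bitsOf(float F) {
    uint32_t U;
    std::memcpy(&U, &F, sizeof(U));
    return U;
  }

  static float floatFromBits(uint32_t U) {
    float F;
    std::memcpy(&F, &U, sizeof(F));
    return F;
  }

  int main() {
    // An f32 signaling-NaN pattern with a distinctive payload.
    const uint32_t Original = 0x7f800001u;
    float F = floatFromBits(Original);

    // Old scheme: the operand is stored as a double and narrowed on emission.
    float RoundTripped = static_cast<float>(static_cast<double>(F));

    std::printf("original bits:              0x%08x\n", Original);
    std::printf("after float->double->float: 0x%08x\n", bitsOf(RoundTripped));
    // Carrying the raw 32-bit pattern (as SFPImm does) keeps the bits unchanged.
    return 0;
  }
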
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
index 2ed6d56..7d980c7 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
@@ -26,7 +26,10 @@
class WebAssemblyInstPrinter final : public MCInstPrinter {
uint64_t ControlFlowCounter = 0;
SmallVector<std::pair<uint64_t, bool>, 4> ControlFlowStack;
- SmallVector<uint64_t, 4> EHPadStack;
+ SmallVector<uint64_t, 4> TryStack;
+
+ enum EHInstKind { TRY, CATCH, CATCH_ALL };
+ SmallVector<EHInstKind, 4> EHInstStack;
public:
WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
@@ -53,16 +56,6 @@
static const char *getRegisterName(unsigned RegNo);
};
-namespace WebAssembly {
-
-const char *typeToString(wasm::ValType Ty);
-const char *anyTypeToString(unsigned Ty);
-
-std::string typeListToString(ArrayRef<wasm::ValType> List);
-std::string signatureToString(const wasm::WasmSignature *Sig);
-
-} // end namespace WebAssembly
-
} // end namespace llvm
#endif
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index 55bf5d14..4961c2e 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -130,19 +130,12 @@
encodeULEB128(uint64_t(MO.getImm()), OS);
}
- } else if (MO.isFPImm()) {
- const MCOperandInfo &Info = Desc.OpInfo[I];
- if (Info.OperandType == WebAssembly::OPERAND_F32IMM) {
- // TODO: MC converts all floating point immediate operands to double.
- // This is fine for numeric values, but may cause NaNs to change bits.
- auto F = float(MO.getFPImm());
- support::endian::write<float>(OS, F, support::little);
- } else {
- assert(Info.OperandType == WebAssembly::OPERAND_F64IMM);
- double D = MO.getFPImm();
- support::endian::write<double>(OS, D, support::little);
- }
-
+ } else if (MO.isSFPImm()) {
+ uint32_t F = MO.getSFPImm();
+ support::endian::write<uint32_t>(OS, F, support::little);
+ } else if (MO.isDFPImm()) {
+ uint64_t D = MO.getDFPImm();
+ support::endian::write<uint64_t>(OS, D, support::little);
} else if (MO.isExpr()) {
const MCOperandInfo &Info = Desc.OpInfo[I];
llvm::MCFixupKind FixupKind;
@@ -161,7 +154,7 @@
case WebAssembly::OPERAND_SIGNATURE:
case WebAssembly::OPERAND_TYPEINDEX:
case WebAssembly::OPERAND_GLOBAL:
- case WebAssembly::OPERAND_EVENT:
+ case WebAssembly::OPERAND_TAG:
FixupKind = MCFixupKind(WebAssembly::fixup_uleb128_i32);
break;
case WebAssembly::OPERAND_OFFSET64:
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 064e613..31cccb2 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -129,29 +129,3 @@
TargetRegistry::RegisterNullTargetStreamer(*T, createNullTargetStreamer);
}
}
-
-wasm::ValType WebAssembly::toValType(const MVT &Ty) {
- switch (Ty.SimpleTy) {
- case MVT::i32:
- return wasm::ValType::I32;
- case MVT::i64:
- return wasm::ValType::I64;
- case MVT::f32:
- return wasm::ValType::F32;
- case MVT::f64:
- return wasm::ValType::F64;
- case MVT::v16i8:
- case MVT::v8i16:
- case MVT::v4i32:
- case MVT::v2i64:
- case MVT::v4f32:
- case MVT::v2f64:
- return wasm::ValType::V128;
- case MVT::funcref:
- return wasm::ValType::FUNCREF;
- case MVT::externref:
- return wasm::ValType::EXTERNREF;
- default:
- llvm_unreachable("unexpected type");
- }
-}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 3508ec0..99defb4 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -72,8 +72,8 @@
OPERAND_SIGNATURE,
/// type signature immediate for call_indirect.
OPERAND_TYPEINDEX,
- /// Event index.
- OPERAND_EVENT,
+ /// Tag index.
+ OPERAND_TAG,
/// A list of branch targets for br_list.
OPERAND_BRLIST,
/// 32-bit unsigned table number.
@@ -129,38 +129,10 @@
namespace llvm {
namespace WebAssembly {
-/// Used as immediate MachineOperands for block signatures
-enum class BlockType : unsigned {
- Invalid = 0x00,
- Void = 0x40,
- I32 = unsigned(wasm::ValType::I32),
- I64 = unsigned(wasm::ValType::I64),
- F32 = unsigned(wasm::ValType::F32),
- F64 = unsigned(wasm::ValType::F64),
- V128 = unsigned(wasm::ValType::V128),
- Externref = unsigned(wasm::ValType::EXTERNREF),
- Funcref = unsigned(wasm::ValType::FUNCREF),
- // Multivalue blocks (and other non-void blocks) are only emitted when the
- // blocks will never be exited and are at the ends of functions (see
- // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made
- // to pop values off the stack, so the exact multivalue signature can always
- // be inferred from the return type of the parent function in MCInstLower.
- Multivalue = 0xffff,
-};
-
-/// Used as immediate MachineOperands for heap types, e.g. for ref.null.
-enum class HeapType : unsigned {
- Invalid = 0x00,
- Externref = unsigned(wasm::ValType::EXTERNREF),
- Funcref = unsigned(wasm::ValType::FUNCREF),
-};
-
/// Instruction opcodes emitted via means other than CodeGen.
static const unsigned Nop = 0x01;
static const unsigned End = 0x0b;
-wasm::ValType toValType(const MVT &Ty);
-
/// Return the default p2align value for a load or store with the given opcode.
inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
switch (Opc) {
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 652d7a0..397b9b0 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -13,8 +13,8 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
-#include "MCTargetDesc/WebAssemblyInstPrinter.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -79,14 +79,21 @@
void WebAssemblyTargetAsmStreamer::emitTableType(const MCSymbolWasm *Sym) {
assert(Sym->isTable());
+ const wasm::WasmTableType &Type = Sym->getTableType();
OS << "\t.tabletype\t" << Sym->getName() << ", "
- << WebAssembly::typeToString(Sym->getTableType());
+ << WebAssembly::typeToString(static_cast<wasm::ValType>(Type.ElemType));
+ bool HasMaximum = Type.Limits.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX;
+ if (Type.Limits.Minimum != 0 || HasMaximum) {
+ OS << ", " << Type.Limits.Minimum;
+ if (HasMaximum)
+ OS << ", " << Type.Limits.Maximum;
+ }
OS << '\n';
}
-void WebAssemblyTargetAsmStreamer::emitEventType(const MCSymbolWasm *Sym) {
- assert(Sym->isEvent());
- OS << "\t.eventtype\t" << Sym->getName() << " ";
+void WebAssemblyTargetAsmStreamer::emitTagType(const MCSymbolWasm *Sym) {
+ assert(Sym->isTag());
+ OS << "\t.tagtype\t" << Sym->getName() << " ";
OS << WebAssembly::typeListToString(Sym->getSignature()->Params);
OS << "\n";
}
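
emitTableType above prints the table limits only when they carry information: a non-zero minimum, or a maximum signalled by WASM_LIMITS_FLAG_HAS_MAX. A standalone sketch of that formatting decision with stand-in types; the 0x1 value for the "has max" flag mirrors the wasm binary format and is an assumption here.

  #include <cstdint>
  #include <cstdio>
  #include <string>

  // Stand-in for wasm::WASM_LIMITS_FLAG_HAS_MAX.
  constexpr uint32_t LIMITS_FLAG_HAS_MAX = 0x1;

  struct Limits {
    uint32_t Flags;
    uint64_t Minimum;
    uint64_t Maximum;
  };

  static std::string formatTableType(const std::string &ElemType,
                                     const Limits &L) {
    std::string Out = ElemType;
    bool HasMaximum = (L.Flags & LIMITS_FLAG_HAS_MAX) != 0;
    if (L.Minimum != 0 || HasMaximum) {
      Out += ", " + std::to_string(L.Minimum);
      if (HasMaximum)
        Out += ", " + std::to_string(L.Maximum);
    }
    return Out;
  }

  int main() {
    std::printf("%s\n", formatTableType("funcref", {0, 0, 0}).c_str());
    std::printf("%s\n", formatTableType("funcref", {0, 2, 0}).c_str());
    std::printf("%s\n", formatTableType("externref",
                                        {LIMITS_FLAG_HAS_MAX, 1, 1}).c_str());
    return 0;
  }
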
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index 75c9fb4..c0ad63c 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -22,6 +22,7 @@
namespace llvm {
class MCSymbolWasm;
+class formatted_raw_ostream;
/// WebAssembly-specific streamer interface, to implement support
/// WebAssembly-specific assembly directives.
@@ -41,8 +42,8 @@
virtual void emitGlobalType(const MCSymbolWasm *Sym) = 0;
/// .tabletype
virtual void emitTableType(const MCSymbolWasm *Sym) = 0;
- /// .eventtype
- virtual void emitEventType(const MCSymbolWasm *Sym) = 0;
+ /// .tagtype
+ virtual void emitTagType(const MCSymbolWasm *Sym) = 0;
/// .import_module
virtual void emitImportModule(const MCSymbolWasm *Sym,
StringRef ImportModule) = 0;
@@ -70,7 +71,7 @@
void emitIndIdx(const MCExpr *Value) override;
void emitGlobalType(const MCSymbolWasm *Sym) override;
void emitTableType(const MCSymbolWasm *Sym) override;
- void emitEventType(const MCSymbolWasm *Sym) override;
+ void emitTagType(const MCSymbolWasm *Sym) override;
void emitImportModule(const MCSymbolWasm *Sym, StringRef ImportModule) override;
void emitImportName(const MCSymbolWasm *Sym, StringRef ImportName) override;
void emitExportName(const MCSymbolWasm *Sym, StringRef ExportName) override;
@@ -87,7 +88,7 @@
void emitIndIdx(const MCExpr *Value) override;
void emitGlobalType(const MCSymbolWasm *Sym) override {}
void emitTableType(const MCSymbolWasm *Sym) override {}
- void emitEventType(const MCSymbolWasm *Sym) override {}
+ void emitTagType(const MCSymbolWasm *Sym) override {}
void emitImportModule(const MCSymbolWasm *Sym,
StringRef ImportModule) override {}
void emitImportName(const MCSymbolWasm *Sym,
@@ -108,7 +109,7 @@
void emitIndIdx(const MCExpr *) override {}
void emitGlobalType(const MCSymbolWasm *) override {}
void emitTableType(const MCSymbolWasm *) override {}
- void emitEventType(const MCSymbolWasm *) override {}
+ void emitTagType(const MCSymbolWasm *) override {}
void emitImportModule(const MCSymbolWasm *, StringRef) override {}
void emitImportName(const MCSymbolWasm *, StringRef) override {}
void emitExportName(const MCSymbolWasm *, StringRef) override {}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index aa7e231..f67fab9 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -34,8 +34,9 @@
explicit WebAssemblyWasmObjectWriter(bool Is64Bit, bool IsEmscripten);
private:
- unsigned getRelocType(const MCValue &Target,
- const MCFixup &Fixup) const override;
+ unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
+ const MCSectionWasm &FixupSection,
+ bool IsLocRel) const override;
};
} // end anonymous namespace
@@ -43,7 +44,7 @@
bool IsEmscripten)
: MCWasmObjectTargetWriter(Is64Bit, IsEmscripten) {}
-static const MCSection *getFixupSection(const MCExpr *Expr) {
+static const MCSection *getTargetSection(const MCExpr *Expr) {
if (auto SyExp = dyn_cast<MCSymbolRefExpr>(Expr)) {
if (SyExp->getSymbol().isInSection())
return &SyExp->getSymbol().getSection();
@@ -51,19 +52,20 @@
}
if (auto BinOp = dyn_cast<MCBinaryExpr>(Expr)) {
- auto SectionLHS = getFixupSection(BinOp->getLHS());
- auto SectionRHS = getFixupSection(BinOp->getRHS());
+ auto SectionLHS = getTargetSection(BinOp->getLHS());
+ auto SectionRHS = getTargetSection(BinOp->getRHS());
return SectionLHS == SectionRHS ? nullptr : SectionLHS;
}
if (auto UnOp = dyn_cast<MCUnaryExpr>(Expr))
- return getFixupSection(UnOp->getSubExpr());
+ return getTargetSection(UnOp->getSubExpr());
return nullptr;
}
-unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
- const MCFixup &Fixup) const {
+unsigned WebAssemblyWasmObjectWriter::getRelocType(
+ const MCValue &Target, const MCFixup &Fixup,
+ const MCSectionWasm &FixupSection, bool IsLocRel) const {
const MCSymbolRefExpr *RefA = Target.getSymA();
assert(RefA);
auto& SymA = cast<MCSymbolWasm>(RefA->getSymbol());
@@ -75,9 +77,11 @@
return wasm::R_WASM_GLOBAL_INDEX_LEB;
case MCSymbolRefExpr::VK_WASM_TBREL:
assert(SymA.isFunction());
- return wasm::R_WASM_TABLE_INDEX_REL_SLEB;
+ return is64Bit() ? wasm::R_WASM_TABLE_INDEX_REL_SLEB64
+ : wasm::R_WASM_TABLE_INDEX_REL_SLEB;
case MCSymbolRefExpr::VK_WASM_TLSREL:
- return wasm::R_WASM_MEMORY_ADDR_TLS_SLEB;
+ return is64Bit() ? wasm::R_WASM_MEMORY_ADDR_TLS_SLEB64
+ : wasm::R_WASM_MEMORY_ADDR_TLS_SLEB;
case MCSymbolRefExpr::VK_WASM_MBREL:
assert(SymA.isData());
return is64Bit() ? wasm::R_WASM_MEMORY_ADDR_REL_SLEB64
@@ -102,8 +106,8 @@
return wasm::R_WASM_GLOBAL_INDEX_LEB;
if (SymA.isFunction())
return wasm::R_WASM_FUNCTION_INDEX_LEB;
- if (SymA.isEvent())
- return wasm::R_WASM_EVENT_INDEX_LEB;
+ if (SymA.isTag())
+ return wasm::R_WASM_TAG_INDEX_LEB;
if (SymA.isTable())
return wasm::R_WASM_TABLE_NUMBER_LEB;
return wasm::R_WASM_MEMORY_ADDR_LEB;
@@ -111,25 +115,33 @@
assert(SymA.isData());
return wasm::R_WASM_MEMORY_ADDR_LEB64;
case FK_Data_4:
- if (SymA.isFunction())
+ if (SymA.isFunction()) {
+ if (FixupSection.getKind().isMetadata())
+ return wasm::R_WASM_FUNCTION_OFFSET_I32;
+ assert(FixupSection.isWasmData());
return wasm::R_WASM_TABLE_INDEX_I32;
+ }
if (SymA.isGlobal())
return wasm::R_WASM_GLOBAL_INDEX_I32;
if (auto Section = static_cast<const MCSectionWasm *>(
- getFixupSection(Fixup.getValue()))) {
+ getTargetSection(Fixup.getValue()))) {
if (Section->getKind().isText())
return wasm::R_WASM_FUNCTION_OFFSET_I32;
else if (!Section->isWasmData())
return wasm::R_WASM_SECTION_OFFSET_I32;
}
- return wasm::R_WASM_MEMORY_ADDR_I32;
+ return IsLocRel ? wasm::R_WASM_MEMORY_ADDR_LOCREL_I32
+ : wasm::R_WASM_MEMORY_ADDR_I32;
case FK_Data_8:
- if (SymA.isFunction())
+ if (SymA.isFunction()) {
+ if (FixupSection.getKind().isMetadata())
+ return wasm::R_WASM_FUNCTION_OFFSET_I64;
return wasm::R_WASM_TABLE_INDEX_I64;
+ }
if (SymA.isGlobal())
llvm_unreachable("unimplemented R_WASM_GLOBAL_INDEX_I64");
if (auto Section = static_cast<const MCSectionWasm *>(
- getFixupSection(Fixup.getValue()))) {
+ getTargetSection(Fixup.getValue()))) {
if (Section->getKind().isText())
return wasm::R_WASM_FUNCTION_OFFSET_I64;
else if (!Section->isWasmData())
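
getRelocType now folds is64Bit(), the fixup section kind, and IsLocRel into the relocation choice, so one fixup kind can map to several R_WASM_* codes. Below is a simplified standalone slice of the FK_Data_4 selection; it omits the global and section-offset cases, and the struct and helper are illustrative only (the enumerator names are the wasm relocation names quoted in the diff).

  #include <cstdio>

  enum RelocType {
    R_WASM_MEMORY_ADDR_I32,
    R_WASM_MEMORY_ADDR_LOCREL_I32,
    R_WASM_FUNCTION_OFFSET_I32,
    R_WASM_TABLE_INDEX_I32,
  };

  struct FixupInfo {
    bool TargetIsFunction;
    bool FixupSectionIsMetadata; // e.g. debug-info sections
    bool IsLocRel;               // section-relative reference
  };

  static RelocType pickDataReloc(const FixupInfo &F) {
    if (F.TargetIsFunction)
      return F.FixupSectionIsMetadata ? R_WASM_FUNCTION_OFFSET_I32
                                      : R_WASM_TABLE_INDEX_I32;
    return F.IsLocRel ? R_WASM_MEMORY_ADDR_LOCREL_I32 : R_WASM_MEMORY_ADDR_I32;
  }

  int main() {
    std::printf("%d\n", pickDataReloc({false, false, true})); // LOCREL variant
    std::printf("%d\n", pickDataReloc({true, true, false}));  // function offset
    return 0;
  }
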
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/README.txt b/src/llvm-project/llvm/lib/Target/WebAssembly/README.txt
index ef3f5aa..934a3ba 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/README.txt
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/README.txt
@@ -1,19 +1,23 @@
//===-- README.txt - Notes for WebAssembly code gen -----------------------===//
-This WebAssembly backend is presently under development.
+The object format emitted by the WebAssembly backend is documented in:
-The most notable feature which is not yet stable is the ".o" file format.
-".o" file support is needed for many common ways of using LLVM, such as
-using it through "clang -c", so this backend is not yet considered widely
-usable. However, this backend is usable within some language toolchain
-packages:
+ * https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
-Emscripten provides a C/C++ compilation environment that includes standard
-libraries, tools, and packaging for producing WebAssembly applications that
-can run in browsers and other environments. For more information, see the
-Emscripten documentation in general, and this page in particular:
+The C ABI is described in:
- * https://github.com/kripken/emscripten/wiki/New-WebAssembly-Backend
+ * https://github.com/WebAssembly/tool-conventions/blob/master/BasicCABI.md
+
+For more information on WebAssembly itself, see the home page:
+
+ * https://webassembly.github.io/
+
+Emscripten provides a C/C++ compilation environment based on clang, which
+includes standard libraries, tools, and packaging for producing WebAssembly
+applications that can run in browsers and other environments.
+
+wasi-sdk provides a more minimal C/C++ SDK based on clang, LLVM, and a
+musl-based libc, for producing WebAssembly applications that use the WASI ABI.
Rust provides WebAssembly support integrated into Cargo. There are two
main options:
@@ -25,38 +29,11 @@
For more information, see:
* https://www.hellorust.com/
-
-This backend does not yet support debug info. Full DWARF support needs a
-design for how DWARF should be represented in WebAssembly. Sourcemap support
-has an existing design and some corresponding browser implementations, so it
-just needs implementing in LLVM.
-
-Work-in-progress documentation for the ".o" file format is here:
-
- * https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
-
-A corresponding linker implementation is also under development:
-
- * https://lld.llvm.org/WebAssembly.html
-
-For more information on WebAssembly itself, see the home page:
- * https://webassembly.github.io/
-
The following documents contain some information on the semantics and binary
encoding of WebAssembly itself:
* https://github.com/WebAssembly/design/blob/master/Semantics.md
* https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md
-The backend is built, tested and archived on the following waterfall:
- https://wasm-stat.us
-
-The backend's bringup is done in part by using the GCC torture test suite, since
-it doesn't require C library support. Current known failures are in
-known_gcc_test_failures.txt, all other tests should pass. The waterfall will
-turn red if not. Once most of these pass, further testing will use LLVM's own
-test suite. The tests can be run locally using:
- https://github.com/WebAssembly/waterfall/blob/master/src/compile_torture_tests.py
-
Some notes on ways that the generated code could be improved follow:
//===---------------------------------------------------------------------===//
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h b/src/llvm-project/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
index be7a632..741cc00 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
@@ -24,6 +24,7 @@
namespace WebAssembly {
int getStackOpcode(unsigned short Opcode);
+int getRegisterOpcode(unsigned short Opcode);
int getWasm64Opcode(unsigned short Opcode);
} // namespace WebAssembly
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/CMakeLists.txt
new file mode 100644
index 0000000..3fc1732
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/CMakeLists.txt
@@ -0,0 +1,13 @@
+add_llvm_component_library(LLVMWebAssemblyUtils
+ WebAssemblyUtilities.cpp
+ WebAssemblyTypeUtilities.cpp
+
+ LINK_COMPONENTS
+ CodeGen
+ Core
+ MC
+ Support
+
+ ADD_TO_COMPONENT
+ WebAssembly
+ )
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
new file mode 100644
index 0000000..6f81431
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
@@ -0,0 +1,176 @@
+//===-- WebAssemblyTypeUtilities.cpp - WebAssembly Type Utility Functions -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements several utility functions for WebAssembly type parsing.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyTypeUtilities.h"
+#include "llvm/ADT/StringSwitch.h"
+
+// Get register classes enum.
+#define GET_REGINFO_ENUM
+#include "WebAssemblyGenRegisterInfo.inc"
+
+using namespace llvm;
+
+Optional<wasm::ValType> WebAssembly::parseType(StringRef Type) {
+  // FIXME: can't use StringSwitch because wasm::ValType doesn't have an
+ // "invalid" value.
+ if (Type == "i32")
+ return wasm::ValType::I32;
+ if (Type == "i64")
+ return wasm::ValType::I64;
+ if (Type == "f32")
+ return wasm::ValType::F32;
+ if (Type == "f64")
+ return wasm::ValType::F64;
+ if (Type == "v128" || Type == "i8x16" || Type == "i16x8" || Type == "i32x4" ||
+ Type == "i64x2" || Type == "f32x4" || Type == "f64x2")
+ return wasm::ValType::V128;
+ if (Type == "funcref")
+ return wasm::ValType::FUNCREF;
+ if (Type == "externref")
+ return wasm::ValType::EXTERNREF;
+ return Optional<wasm::ValType>();
+}
+
+WebAssembly::HeapType WebAssembly::parseHeapType(StringRef Type) {
+ return StringSwitch<WebAssembly::HeapType>(Type)
+ .Case("extern", WebAssembly::HeapType::Externref)
+ .Case("func", WebAssembly::HeapType::Funcref)
+ .Default(WebAssembly::HeapType::Invalid);
+}
+
+WebAssembly::BlockType WebAssembly::parseBlockType(StringRef Type) {
+ // Multivalue block types are handled separately in parseSignature
+ return StringSwitch<WebAssembly::BlockType>(Type)
+ .Case("i32", WebAssembly::BlockType::I32)
+ .Case("i64", WebAssembly::BlockType::I64)
+ .Case("f32", WebAssembly::BlockType::F32)
+ .Case("f64", WebAssembly::BlockType::F64)
+ .Case("v128", WebAssembly::BlockType::V128)
+ .Case("funcref", WebAssembly::BlockType::Funcref)
+ .Case("externref", WebAssembly::BlockType::Externref)
+ .Case("void", WebAssembly::BlockType::Void)
+ .Default(WebAssembly::BlockType::Invalid);
+}
+
+MVT WebAssembly::parseMVT(StringRef Type) {
+ return StringSwitch<MVT>(Type)
+ .Case("i32", MVT::i32)
+ .Case("i64", MVT::i64)
+ .Case("f32", MVT::f32)
+ .Case("f64", MVT::f64)
+ .Case("v16i8", MVT::v16i8)
+ .Case("v8i16", MVT::v8i16)
+ .Case("v4i32", MVT::v4i32)
+ .Case("v2i64", MVT::v2i64)
+ .Case("funcref", MVT::funcref)
+ .Case("externref", MVT::externref)
+ .Default(MVT::INVALID_SIMPLE_VALUE_TYPE);
+}
+
+// We have various enums representing a subset of these types, use this
+// function to convert any of them to text.
+const char *WebAssembly::anyTypeToString(unsigned Type) {
+ switch (Type) {
+ case wasm::WASM_TYPE_I32:
+ return "i32";
+ case wasm::WASM_TYPE_I64:
+ return "i64";
+ case wasm::WASM_TYPE_F32:
+ return "f32";
+ case wasm::WASM_TYPE_F64:
+ return "f64";
+ case wasm::WASM_TYPE_V128:
+ return "v128";
+ case wasm::WASM_TYPE_FUNCREF:
+ return "funcref";
+ case wasm::WASM_TYPE_EXTERNREF:
+ return "externref";
+ case wasm::WASM_TYPE_FUNC:
+ return "func";
+ case wasm::WASM_TYPE_NORESULT:
+ return "void";
+ default:
+ return "invalid_type";
+ }
+}
+
+const char *WebAssembly::typeToString(wasm::ValType Type) {
+ return anyTypeToString(static_cast<unsigned>(Type));
+}
+
+std::string WebAssembly::typeListToString(ArrayRef<wasm::ValType> List) {
+ std::string S;
+ for (const auto &Type : List) {
+ if (&Type != &List[0])
+ S += ", ";
+ S += WebAssembly::typeToString(Type);
+ }
+ return S;
+}
+
+std::string WebAssembly::signatureToString(const wasm::WasmSignature *Sig) {
+ std::string S("(");
+ S += typeListToString(Sig->Params);
+ S += ") -> (";
+ S += typeListToString(Sig->Returns);
+ S += ")";
+ return S;
+}
+
+wasm::ValType WebAssembly::toValType(MVT Type) {
+ switch (Type.SimpleTy) {
+ case MVT::i32:
+ return wasm::ValType::I32;
+ case MVT::i64:
+ return wasm::ValType::I64;
+ case MVT::f32:
+ return wasm::ValType::F32;
+ case MVT::f64:
+ return wasm::ValType::F64;
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return wasm::ValType::V128;
+ case MVT::funcref:
+ return wasm::ValType::FUNCREF;
+ case MVT::externref:
+ return wasm::ValType::EXTERNREF;
+ default:
+ llvm_unreachable("unexpected type");
+ }
+}
+
+wasm::ValType WebAssembly::regClassToValType(unsigned RC) {
+ switch (RC) {
+ case WebAssembly::I32RegClassID:
+ return wasm::ValType::I32;
+ case WebAssembly::I64RegClassID:
+ return wasm::ValType::I64;
+ case WebAssembly::F32RegClassID:
+ return wasm::ValType::F32;
+ case WebAssembly::F64RegClassID:
+ return wasm::ValType::F64;
+ case WebAssembly::V128RegClassID:
+ return wasm::ValType::V128;
+ case WebAssembly::FUNCREFRegClassID:
+ return wasm::ValType::FUNCREF;
+ case WebAssembly::EXTERNREFRegClassID:
+ return wasm::ValType::EXTERNREF;
+ default:
+ llvm_unreachable("unexpected type");
+ }
+}
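
signatureToString above renders a signature as "(params) -> (results)" with comma-separated type lists. A standalone sketch of the same rendering using plain std::string/std::vector; only the scalar types are covered for brevity.

  #include <cstdio>
  #include <string>
  #include <vector>

  enum class ValType { I32, I64, F32, F64 };

  static const char *typeToString(ValType T) {
    switch (T) {
    case ValType::I32: return "i32";
    case ValType::I64: return "i64";
    case ValType::F32: return "f32";
    case ValType::F64: return "f64";
    }
    return "invalid_type";
  }

  static std::string typeListToString(const std::vector<ValType> &List) {
    std::string S;
    for (size_t I = 0; I < List.size(); ++I) {
      if (I != 0)
        S += ", ";
      S += typeToString(List[I]);
    }
    return S;
  }

  static std::string signatureToString(const std::vector<ValType> &Params,
                                       const std::vector<ValType> &Returns) {
    return "(" + typeListToString(Params) + ") -> (" +
           typeListToString(Returns) + ")";
  }

  int main() {
    // E.g. a function taking two i32s and returning one i64.
    std::printf("%s\n", signatureToString({ValType::I32, ValType::I32},
                                          {ValType::I64}).c_str());
    return 0;
  }
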
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h b/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
new file mode 100644
index 0000000..8d757df
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
@@ -0,0 +1,82 @@
+//===-- WebAssemblyTypeUtilities - WebAssembly Type Utilities---*- C++ -*-====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declaration of the WebAssembly-specific type parsing
+/// utility functions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_UTILS_WEBASSEMBLYTYPEUTILITIES_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_UTILS_WEBASSEMBLYTYPEUTILITIES_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/Support/MachineValueType.h"
+
+namespace llvm {
+namespace WebAssembly {
+
+/// Used as immediate MachineOperands for block signatures
+enum class BlockType : unsigned {
+ Invalid = 0x00,
+ Void = 0x40,
+ I32 = unsigned(wasm::ValType::I32),
+ I64 = unsigned(wasm::ValType::I64),
+ F32 = unsigned(wasm::ValType::F32),
+ F64 = unsigned(wasm::ValType::F64),
+ V128 = unsigned(wasm::ValType::V128),
+ Externref = unsigned(wasm::ValType::EXTERNREF),
+ Funcref = unsigned(wasm::ValType::FUNCREF),
+ // Multivalue blocks (and other non-void blocks) are only emitted when the
+ // blocks will never be exited and are at the ends of functions (see
+ // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made
+ // to pop values off the stack, so the exact multivalue signature can always
+ // be inferred from the return type of the parent function in MCInstLower.
+ Multivalue = 0xffff,
+};
+
+/// Used as immediate MachineOperands for heap types, e.g. for ref.null.
+enum class HeapType : unsigned {
+ Invalid = 0x00,
+ Externref = unsigned(wasm::ValType::EXTERNREF),
+ Funcref = unsigned(wasm::ValType::FUNCREF),
+};
+
+// Convert StringRef to ValType / HeapType / BlockType / MVT
+
+Optional<wasm::ValType> parseType(StringRef Type);
+HeapType parseHeapType(StringRef Type);
+BlockType parseBlockType(StringRef Type);
+MVT parseMVT(StringRef Type);
+
+// Convert ValType or a list/signature of ValTypes to a string.
+
+// Convert an unsigned integer, which may be one of the wasm::ValType enum
+// values, to its type name string. If the input is not a valid wasm::ValType,
+// returns "invalid_type".
+const char *anyTypeToString(unsigned Type);
+const char *typeToString(wasm::ValType Type);
+// Convert a list of ValTypes into a string in the format of
+// "type0, type1, ... typeN"
+std::string typeListToString(ArrayRef<wasm::ValType> List);
+// Convert a wasm signature into a string in the format of
+// "(params) -> (results)", where params and results are each a comma-separated
+// ValType list.
+std::string signatureToString(const wasm::WasmSignature *Sig);
+
+// Convert an MVT into its corresponding wasm ValType.
+wasm::ValType toValType(MVT Type);
+
+// Convert a register class to a wasm ValType.
+wasm::ValType regClassToValType(unsigned RC);
+
+} // end namespace WebAssembly
+} // end namespace llvm
+
+#endif
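
The BlockType enum above reuses the wasm value-type encodings, with 0x40 for void and 0xffff as the multivalue sentinel. A standalone sketch that classifies such an immediate; the 0x7f used in main is i32's binary encoding, and anything unrecognized is treated as a value-type code here purely for brevity.

  #include <cstdio>

  enum class BlockKind { Void, SingleValue, Multivalue, Invalid };

  static BlockKind classifyBlockType(unsigned Imm) {
    switch (Imm) {
    case 0x00:   return BlockKind::Invalid;
    case 0x40:   return BlockKind::Void;
    case 0xffff: return BlockKind::Multivalue;
    default:     return BlockKind::SingleValue; // one of the ValType codes
    }
  }

  int main() {
    std::printf("%d %d %d\n", (int)classifyBlockType(0x40),
                (int)classifyBlockType(0x7f), // i32's encoding
                (int)classifyBlockType(0xffff));
    return 0;
  }
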
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp
similarity index 75%
rename from src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
rename to src/llvm-project/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp
index f8fb57d..3da80f4 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp
@@ -18,7 +18,6 @@
#include "llvm/MC/MCContext.h"
using namespace llvm;
-const char *const WebAssembly::ClangCallTerminateFn = "__clang_call_terminate";
const char *const WebAssembly::CxaBeginCatchFn = "__cxa_begin_catch";
const char *const WebAssembly::CxaRethrowFn = "__cxa_rethrow";
const char *const WebAssembly::StdTerminateFn = "_ZSt9terminatev";
@@ -73,7 +72,7 @@
return false;
// These functions never throw
if (F->getName() == CxaBeginCatchFn || F->getName() == PersonalityWrapperFn ||
- F->getName() == ClangCallTerminateFn || F->getName() == StdTerminateFn)
+ F->getName() == StdTerminateFn)
return false;
// TODO Can we exclude call instructions that are marked as 'nounwind' in the
@@ -92,17 +91,15 @@
case WebAssembly::CALL_INDIRECT_S:
case WebAssembly::RET_CALL_INDIRECT:
case WebAssembly::RET_CALL_INDIRECT_S:
- return MI.getOperand(MI.getNumOperands() - 1);
+ return MI.getOperand(MI.getNumExplicitOperands() - 1);
default:
llvm_unreachable("Not a call instruction");
}
}
-MCSymbolWasm *
-WebAssembly::getOrCreateFunctionTableSymbol(MCContext &Ctx,
- const StringRef &Name) {
- // FIXME: Duplicates functionality from
- // MC/WasmObjectWriter::recordRelocation.
+MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol(
+ MCContext &Ctx, const WebAssemblySubtarget *Subtarget) {
+ StringRef Name = "__indirect_function_table";
MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name));
if (Sym) {
if (!Sym->isFunctionTable())
@@ -113,6 +110,34 @@
// The default function table is synthesized by the linker.
Sym->setUndefined();
}
+ // MVP object files can't have symtab entries for tables.
+ if (!(Subtarget && Subtarget->hasReferenceTypes()))
+ Sym->setOmitFromLinkingSection();
+ return Sym;
+}
+
+MCSymbolWasm *WebAssembly::getOrCreateFuncrefCallTableSymbol(
+ MCContext &Ctx, const WebAssemblySubtarget *Subtarget) {
+ StringRef Name = "__funcref_call_table";
+ MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name));
+ if (Sym) {
+ if (!Sym->isFunctionTable())
+ Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table");
+ } else {
+ Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name));
+
+    // Setting Weak ensures only one table is left after linking when multiple
+ // modules define the table.
+ Sym->setWeak(true);
+
+ wasm::WasmLimits Limits = {0, 1, 1};
+ wasm::WasmTableType TableType = {wasm::WASM_TYPE_FUNCREF, Limits};
+ Sym->setType(wasm::WASM_SYMBOL_TYPE_TABLE);
+ Sym->setTableType(TableType);
+ }
+ // MVP object files can't have symtab entries for tables.
+ if (!(Subtarget && Subtarget->hasReferenceTypes()))
+ Sym->setOmitFromLinkingSection();
return Sym;
}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h b/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
new file mode 100644
index 0000000..673dc952
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
@@ -0,0 +1,85 @@
+//===-- WebAssemblyUtilities - WebAssembly Utility Functions ---*- C++ -*-====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declaration of the WebAssembly-specific
+/// utility functions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_UTILS_WEBASSEMBLYUTILITIES_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_UTILS_WEBASSEMBLYUTILITIES_H
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineInstr;
+class MachineOperand;
+class MCContext;
+class MCSymbolWasm;
+class StringRef;
+class WebAssemblyFunctionInfo;
+class WebAssemblySubtarget;
+
+namespace WebAssembly {
+
+enum WasmAddressSpace : unsigned {
+ // Default address space, for pointers to linear memory (stack, heap, data).
+ WASM_ADDRESS_SPACE_DEFAULT = 0,
+ // A non-integral address space for pointers to named objects outside of
+ // linear memory: WebAssembly globals or WebAssembly locals. Loads and stores
+ // to these pointers are lowered to global.get / global.set or local.get /
+ // local.set, as appropriate.
+ WASM_ADDRESS_SPACE_WASM_VAR = 1
+};
+
+inline bool isDefaultAddressSpace(unsigned AS) {
+ return AS == WASM_ADDRESS_SPACE_DEFAULT;
+}
+inline bool isWasmVarAddressSpace(unsigned AS) {
+ return AS == WASM_ADDRESS_SPACE_WASM_VAR;
+}
+inline bool isValidAddressSpace(unsigned AS) {
+ return isDefaultAddressSpace(AS) || isWasmVarAddressSpace(AS);
+}
+
+bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
+bool mayThrow(const MachineInstr &MI);
+
+// Exception-related function names
+extern const char *const ClangCallTerminateFn;
+extern const char *const CxaBeginCatchFn;
+extern const char *const CxaRethrowFn;
+extern const char *const StdTerminateFn;
+extern const char *const PersonalityWrapperFn;
+
+/// Returns the operand number of a callee, assuming the argument is a call
+/// instruction.
+const MachineOperand &getCalleeOp(const MachineInstr &MI);
+
+/// Returns the __indirect_function_table, for use in call_indirect and in
+/// function bitcasts.
+MCSymbolWasm *
+getOrCreateFunctionTableSymbol(MCContext &Ctx,
+ const WebAssemblySubtarget *Subtarget);
+
+/// Returns the __funcref_call_table, for use in funcref calls when lowered to
+/// table.set + call_indirect.
+MCSymbolWasm *
+getOrCreateFuncrefCallTableSymbol(MCContext &Ctx,
+ const WebAssemblySubtarget *Subtarget);
+
+/// Find a catch instruction from an EH pad. Returns null if no catch
+/// instruction found or the catch is in an invalid location.
+MachineInstr *findCatch(MachineBasicBlock *EHPad);
+
+} // end namespace WebAssembly
+
+} // end namespace llvm
+
+#endif
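
The WasmAddressSpace enum above is what lets loads and stores to wasm globals and locals (address space 1) be lowered to get/set instructions instead of linear-memory accesses. A standalone sketch of the predicates and the kind of dispatch they enable; the lowering strings are illustrative text only.

  #include <cstdio>

  enum WasmAddressSpace : unsigned {
    WASM_ADDRESS_SPACE_DEFAULT = 0,
    WASM_ADDRESS_SPACE_WASM_VAR = 1,
  };

  inline bool isDefaultAddressSpace(unsigned AS) {
    return AS == WASM_ADDRESS_SPACE_DEFAULT;
  }
  inline bool isWasmVarAddressSpace(unsigned AS) {
    return AS == WASM_ADDRESS_SPACE_WASM_VAR;
  }

  static const char *loadLoweringFor(unsigned AS) {
    if (isWasmVarAddressSpace(AS))
      return "global.get / local.get";
    if (isDefaultAddressSpace(AS))
      return "linear-memory load";
    return "invalid address space";
  }

  int main() {
    std::printf("AS0: %s\nAS1: %s\n", loadLoweringFor(0), loadLoweringFor(1));
    return 0;
  }
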
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssembly.h b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssembly.h
index 9ce02f7..9eb960d 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssembly.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssembly.h
@@ -25,7 +25,8 @@
class FunctionPass;
// LLVM IR passes.
-ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(bool DoEH, bool DoSjLj);
+ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(bool EnableEH,
+ bool EnableSjLj);
ModulePass *createWebAssemblyLowerGlobalDtors();
ModulePass *createWebAssemblyAddMissingPrototypes();
ModulePass *createWebAssemblyFixFunctionBitcasts();
@@ -39,6 +40,7 @@
// Late passes.
FunctionPass *createWebAssemblyReplacePhysRegs();
+FunctionPass *createWebAssemblyNullifyDebugValueLists();
FunctionPass *createWebAssemblyPrepareForLiveIntervals();
FunctionPass *createWebAssemblyOptimizeLiveIntervals();
FunctionPass *createWebAssemblyMemIntrinsicResults();
@@ -54,6 +56,7 @@
FunctionPass *createWebAssemblyRegNumbering();
FunctionPass *createWebAssemblyDebugFixup();
FunctionPass *createWebAssemblyPeephole();
+FunctionPass *createWebAssemblyMCLowerPrePass();
// PassRegistry initialization declarations.
void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &);
@@ -64,6 +67,7 @@
void initializeWebAssemblyArgumentMovePass(PassRegistry &);
void initializeWebAssemblySetP2AlignOperandsPass(PassRegistry &);
void initializeWebAssemblyReplacePhysRegsPass(PassRegistry &);
+void initializeWebAssemblyNullifyDebugValueListsPass(PassRegistry &);
void initializeWebAssemblyPrepareForLiveIntervalsPass(PassRegistry &);
void initializeWebAssemblyOptimizeLiveIntervalsPass(PassRegistry &);
void initializeWebAssemblyMemIntrinsicResultsPass(PassRegistry &);
@@ -80,6 +84,7 @@
void initializeWebAssemblyRegNumberingPass(PassRegistry &);
void initializeWebAssemblyDebugFixupPass(PassRegistry &);
void initializeWebAssemblyPeepholePass(PassRegistry &);
+void initializeWebAssemblyMCLowerPrePassPass(PassRegistry &);
namespace WebAssembly {
enum TargetIndex {
@@ -87,10 +92,14 @@
TI_LOCAL,
// Followed by an absolute global index (ULEB). DEPRECATED.
TI_GLOBAL_FIXED,
+ // Followed by the index from the bottom of the Wasm stack.
TI_OPERAND_STACK,
// Followed by a compilation unit relative global index (uint32_t)
// that will have an associated relocation.
- TI_GLOBAL_RELOC
+ TI_GLOBAL_RELOC,
+ // Like TI_LOCAL, but indicates an indirect value (e.g. byval arg
+ // passed by pointer).
+ TI_LOCAL_INDIRECT
};
} // end namespace WebAssembly
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
index 02f5cc6..4089b04 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
@@ -26,10 +26,10 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
-#include "WebAssemblyUtilities.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 7f1c4bb..56829eb 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -14,14 +14,16 @@
//===----------------------------------------------------------------------===//
#include "WebAssemblyAsmPrinter.h"
-#include "MCTargetDesc/WebAssemblyInstPrinter.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
#include "TargetInfo/WebAssemblyTargetInfo.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyMCInstLower.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblyRegisterInfo.h"
+#include "WebAssemblyRuntimeLibcallSignatures.h"
#include "WebAssemblyTargetMachine.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
@@ -169,14 +171,126 @@
return WasmSym;
}
-void WebAssemblyAsmPrinter::emitEndOfAsmFile(Module &M) {
+void WebAssemblyAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
+ if (!WebAssembly::isWasmVarAddressSpace(GV->getAddressSpace())) {
+ AsmPrinter::emitGlobalVariable(GV);
+ return;
+ }
+
+ assert(!GV->isThreadLocal());
+
+ MCSymbolWasm *Sym = cast<MCSymbolWasm>(getSymbol(GV));
+
+ if (!Sym->getType()) {
+ const WebAssemblyTargetLowering &TLI = *Subtarget->getTargetLowering();
+ SmallVector<EVT, 1> VTs;
+ ComputeValueVTs(TLI, GV->getParent()->getDataLayout(), GV->getValueType(),
+ VTs);
+ if (VTs.size() != 1 ||
+ TLI.getNumRegisters(GV->getParent()->getContext(), VTs[0]) != 1)
+ report_fatal_error("Aggregate globals not yet implemented");
+ MVT VT = TLI.getRegisterType(GV->getParent()->getContext(), VTs[0]);
+ bool Mutable = true;
+ wasm::ValType Type = WebAssembly::toValType(VT);
+ Sym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+ Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), Mutable});
+ }
+
+ emitVisibility(Sym, GV->getVisibility(), !GV->isDeclaration());
+ if (GV->hasInitializer()) {
+ assert(getSymbolPreferLocal(*GV) == Sym);
+ emitLinkage(GV, Sym);
+ getTargetStreamer()->emitGlobalType(Sym);
+ OutStreamer->emitLabel(Sym);
+ // TODO: Actually emit the initializer value. Otherwise the global has the
+ // default value for its type (0, ref.null, etc).
+ OutStreamer->AddBlankLine();
+ }
+}
+
+MCSymbol *WebAssemblyAsmPrinter::getOrCreateWasmSymbol(StringRef Name) {
+ auto *WasmSym = cast<MCSymbolWasm>(GetExternalSymbolSymbol(Name));
+
+ // May be called multiple times, so early out.
+ if (WasmSym->getType().hasValue())
+ return WasmSym;
+
+ const WebAssemblySubtarget &Subtarget = getSubtarget();
+
+ // Except for certain known symbols, all symbols used by CodeGen are
+ // functions. It's OK to hardcode knowledge of specific symbols here; this
+ // method is precisely there for fetching the signatures of known
+ // Clang-provided symbols.
+ if (Name == "__stack_pointer" || Name == "__tls_base" ||
+ Name == "__memory_base" || Name == "__table_base" ||
+ Name == "__tls_size" || Name == "__tls_align") {
+ bool Mutable =
+ Name == "__stack_pointer" || Name == "__tls_base";
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+ WasmSym->setGlobalType(wasm::WasmGlobalType{
+ uint8_t(Subtarget.hasAddr64() ? wasm::WASM_TYPE_I64
+ : wasm::WASM_TYPE_I32),
+ Mutable});
+ return WasmSym;
+ }
+
+ SmallVector<wasm::ValType, 4> Returns;
+ SmallVector<wasm::ValType, 4> Params;
+ if (Name == "__cpp_exception") {
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TAG);
+ // We can't confirm its signature index for now because there can be
+ // imported exceptions. Set it to be 0 for now.
+ WasmSym->setTagType(
+ {wasm::WASM_TAG_ATTRIBUTE_EXCEPTION, /* SigIndex */ 0});
+ // We may have multiple C++ compilation units to be linked together, each of
+ // which defines the exception symbol. To resolve them, we declare them as
+ // weak.
+ WasmSym->setWeak(true);
+ WasmSym->setExternal(true);
+
+ // All C++ exceptions are assumed to have a single i32 (for wasm32) or i64
+    // (for wasm64) param type and void return type. The reason is that all C++
+ // exception values are pointers, and to share the type section with
+ // functions, exceptions are assumed to have void return type.
+ Params.push_back(Subtarget.hasAddr64() ? wasm::ValType::I64
+ : wasm::ValType::I32);
+ } else { // Function symbols
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+ getLibcallSignature(Subtarget, Name, Returns, Params);
+ }
+ auto Signature = std::make_unique<wasm::WasmSignature>(std::move(Returns),
+ std::move(Params));
+ WasmSym->setSignature(Signature.get());
+ addSignature(std::move(Signature));
+
+ return WasmSym;
+}
+
+void WebAssemblyAsmPrinter::emitExternalDecls(const Module &M) {
+ if (signaturesEmitted)
+ return;
+ signaturesEmitted = true;
+
+ // Normally symbols for globals get discovered as the MI gets lowered,
+ // but we need to know about them ahead of time.
+ MachineModuleInfoWasm &MMIW = MMI->getObjFileInfo<MachineModuleInfoWasm>();
+ for (const auto &Name : MMIW.MachineSymbolsUsed) {
+ getOrCreateWasmSymbol(Name.getKey());
+ }
+
for (auto &It : OutContext.getSymbols()) {
- // Emit a .globaltype and .eventtype declaration.
+ // Emit .globaltype, .tagtype, or .tabletype declarations.
auto Sym = cast<MCSymbolWasm>(It.getValue());
- if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_GLOBAL)
- getTargetStreamer()->emitGlobalType(Sym);
- else if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_EVENT)
- getTargetStreamer()->emitEventType(Sym);
+ if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_GLOBAL) {
+ // .globaltype already handled by emitGlobalVariable for defined
+ // variables; here we make sure the types of external wasm globals get
+ // written to the file.
+ if (Sym->isUndefined())
+ getTargetStreamer()->emitGlobalType(Sym);
+ } else if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_TAG)
+ getTargetStreamer()->emitTagType(Sym);
+ else if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_TABLE)
+ getTargetStreamer()->emitTableType(Sym);
}
DenseSet<MCSymbol *> InvokeSymbols;
@@ -241,14 +355,33 @@
getTargetStreamer()->emitExportName(Sym, Name);
}
}
+}
+
+void WebAssemblyAsmPrinter::emitEndOfAsmFile(Module &M) {
+ emitExternalDecls(M);
+
+ // When a function's address is taken, a TABLE_INDEX relocation is emitted
+ // against the function symbol at the use site. However the relocation
+ // doesn't explicitly refer to the table. In the future we may want to
+ // define a new kind of reloc against both the function and the table, so
+ // that the linker can see that the function symbol keeps the table alive,
+ // but for now manually mark the table as live.
+ for (const auto &F : M) {
+ if (!F.isIntrinsic() && F.hasAddressTaken()) {
+ MCSymbolWasm *FunctionTable =
+ WebAssembly::getOrCreateFunctionTableSymbol(OutContext, Subtarget);
+ OutStreamer->emitSymbolAttribute(FunctionTable, MCSA_NoDeadStrip);
+ break;
+ }
+ }
for (const auto &G : M.globals()) {
- if (!G.hasInitializer() && G.hasExternalLinkage()) {
- if (G.getValueType()->isSized()) {
- uint16_t Size = M.getDataLayout().getTypeAllocSize(G.getValueType());
- OutStreamer->emitELFSize(getSymbol(&G),
- MCConstantExpr::create(Size, OutContext));
- }
+ if (!G.hasInitializer() && G.hasExternalLinkage() &&
+ !WebAssembly::isWasmVarAddressSpace(G.getAddressSpace()) &&
+ G.getValueType()->isSized()) {
+ uint16_t Size = M.getDataLayout().getTypeAllocSize(G.getValueType());
+ OutStreamer->emitELFSize(getSymbol(&G),
+ MCConstantExpr::create(Size, OutContext));
}
}
@@ -392,6 +525,17 @@
// Nothing to do; jump tables are incorporated into the instruction stream.
}
+void WebAssemblyAsmPrinter::emitLinkage(const GlobalValue *GV, MCSymbol *Sym)
+ const {
+ AsmPrinter::emitLinkage(GV, Sym);
+ // This gets called before the function label and type are emitted.
+ // We use it to emit signatures of external functions.
+ // FIXME casts!
+ const_cast<WebAssemblyAsmPrinter *>(this)
+ ->emitExternalDecls(*MMI->getModule());
+}
+
void WebAssemblyAsmPrinter::emitFunctionBodyStart() {
const Function &F = MF->getFunction();
SmallVector<MVT, 1> ResultVTs;
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
index 7a6a3247..6b2f200 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
@@ -25,6 +25,7 @@
// TODO: Do the uniquing of Signatures here instead of ObjectFileWriter?
std::vector<std::unique_ptr<wasm::WasmSignature>> Signatures;
std::vector<std::unique_ptr<std::string>> Names;
+ bool signaturesEmitted = false;
StringRef storeName(StringRef Name) {
std::unique_ptr<std::string> N = std::make_unique<std::string>(Name);
@@ -65,8 +66,10 @@
void emitEndOfAsmFile(Module &M) override;
void EmitProducerInfo(Module &M);
void EmitTargetFeatures(Module &M);
+ void emitGlobalVariable(const GlobalVariable *GV) override;
void emitJumpTableInfo() override;
void emitConstantPool() override;
+ void emitLinkage(const GlobalValue *, MCSymbol *) const override;
void emitFunctionBodyStart() override;
void emitInstruction(const MachineInstr *MI) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -80,6 +83,8 @@
MCSymbolWasm *getMCSymbolForFunction(const Function *F, bool EnableEmEH,
wasm::WasmSignature *Sig,
bool &InvokeDetected);
+ MCSymbol *getOrCreateWasmSymbol(StringRef Name);
+ void emitExternalDecls(const Module &M);
};
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
index eb3e9b9..2257d15 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -17,11 +17,11 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyExceptionInfo.h"
#include "WebAssemblySortRegion.h"
#include "WebAssemblySubtarget.h"
-#include "WebAssemblyUtilities.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -218,6 +219,7 @@
CompareBlockNumbersBackwards>
Ready;
+ const auto *EHInfo = MF.getWasmEHFuncInfo();
SortRegionInfo SRI(MLI, WEI);
SmallVector<Entry, 4> Entries;
for (MachineBasicBlock *MBB = &MF.front();;) {
@@ -245,8 +247,34 @@
if (SuccL->getHeader() == Succ && SuccL->contains(MBB))
continue;
// Decrement the predecessor count. If it's now zero, it's ready.
- if (--NumPredsLeft[Succ->getNumber()] == 0)
+ if (--NumPredsLeft[Succ->getNumber()] == 0) {
+ // When we are in a SortRegion, we allow sorting of not only BBs that
+ // belong to the current (innermost) region but also BBs that are
+ // dominated by the current region header. But we should not do this for
+ // exceptions because there can be cases in which, for example:
+ // EHPad A's unwind destination (where the exception lands when it is
+ // not caught by EHPad A) is EHPad B, so EHPad B does not belong to the
+ // exception dominated by EHPad A. But EHPad B is dominated by EHPad A,
+ // so EHPad B can be sorted within EHPad A's exception. This is
+ // incorrect because we may end up delegating/rethrowing to an inner
+ // scope in CFGStackify. So here we make sure those unwind destinations
+ // are deferred until their unwind source's exception is sorted.
+ if (EHInfo && EHInfo->hasUnwindSrcs(Succ)) {
+ SmallPtrSet<MachineBasicBlock *, 4> UnwindSrcs =
+ EHInfo->getUnwindSrcs(Succ);
+ bool IsDeferred = false;
+ for (Entry &E : Entries) {
+ if (UnwindSrcs.count(E.TheRegion->getHeader())) {
+ E.Deferred.push_back(Succ);
+ IsDeferred = true;
+ break;
+ }
+ }
+ if (IsDeferred)
+ continue;
+ }
Preferred.push(Succ);
+ }
}
// Determine the block to follow MBB. First try to find a preferred block,
// to preserve the original block order when possible.
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index a8e0c3e..59d69e4 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -21,16 +21,18 @@
///
//===----------------------------------------------------------------------===//
+#include "Utils/WebAssemblyTypeUtilities.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyExceptionInfo.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySortRegion.h"
#include "WebAssemblySubtarget.h"
-#include "WebAssemblyUtilities.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -38,7 +40,8 @@
#define DEBUG_TYPE "wasm-cfg-stackify"
-STATISTIC(NumUnwindMismatches, "Number of EH pad unwind mismatches found");
+STATISTIC(NumCallUnwindMismatches, "Number of call unwind mismatches found");
+STATISTIC(NumCatchUnwindMismatches, "Number of catch unwind mismatches found");
namespace {
class WebAssemblyCFGStackify final : public MachineFunctionPass {
@@ -68,24 +71,42 @@
void placeBlockMarker(MachineBasicBlock &MBB);
void placeLoopMarker(MachineBasicBlock &MBB);
void placeTryMarker(MachineBasicBlock &MBB);
+
+ // Exception handling related functions
+ bool fixCallUnwindMismatches(MachineFunction &MF);
+ bool fixCatchUnwindMismatches(MachineFunction &MF);
+ void addTryDelegate(MachineInstr *RangeBegin, MachineInstr *RangeEnd,
+ MachineBasicBlock *DelegateDest);
+ void recalculateScopeTops(MachineFunction &MF);
void removeUnnecessaryInstrs(MachineFunction &MF);
- bool fixUnwindMismatches(MachineFunction &MF);
+
+ // Wrap-up
+ using EndMarkerInfo =
+ std::pair<const MachineBasicBlock *, const MachineInstr *>;
+ unsigned getBranchDepth(const SmallVectorImpl<EndMarkerInfo> &Stack,
+ const MachineBasicBlock *MBB);
+ unsigned getDelegateDepth(const SmallVectorImpl<EndMarkerInfo> &Stack,
+ const MachineBasicBlock *MBB);
+ unsigned
+ getRethrowDepth(const SmallVectorImpl<EndMarkerInfo> &Stack,
+ const SmallVectorImpl<const MachineBasicBlock *> &EHPadStack);
void rewriteDepthImmediates(MachineFunction &MF);
void fixEndsAtEndOfFunction(MachineFunction &MF);
+ void cleanupFunctionData(MachineFunction &MF);
- // For each BLOCK|LOOP|TRY, the corresponding END_(BLOCK|LOOP|TRY).
+ // For each BLOCK|LOOP|TRY, the corresponding END_(BLOCK|LOOP|TRY) or DELEGATE
+ // (in case of TRY).
DenseMap<const MachineInstr *, MachineInstr *> BeginToEnd;
- // For each END_(BLOCK|LOOP|TRY), the corresponding BLOCK|LOOP|TRY.
+ // For each END_(BLOCK|LOOP|TRY) or DELEGATE, the corresponding
+ // BLOCK|LOOP|TRY.
DenseMap<const MachineInstr *, MachineInstr *> EndToBegin;
// <TRY marker, EH pad> map
DenseMap<const MachineInstr *, MachineBasicBlock *> TryToEHPad;
// <EH pad, TRY marker> map
DenseMap<const MachineBasicBlock *, MachineInstr *> EHPadToTry;
- // There can be an appendix block at the end of each function, shared for:
- // - creating a correct signature for fallthrough returns
- // - target for rethrows that need to unwind to the caller, but are trapped
- // inside another try/catch
+ // We need an appendix block to place 'end_loop' or 'end_try' marker when the
+ // loop / exception bottom block is the last block in a function
MachineBasicBlock *AppendixBB = nullptr;
MachineBasicBlock *getAppendixBlock(MachineFunction &MF) {
if (!AppendixBB) {
@@ -97,6 +118,19 @@
return AppendixBB;
}
+ // Before running rewriteDepthImmediates function, 'delegate' has a BB as its
+ // destination operand. getFakeCallerBlock() returns a fake BB that will be
+ // used for the operand when 'delegate' needs to rethrow to the caller. This
+ // will be rewritten as an immediate value that is the number of block depths
+ // + 1 in rewriteDepthImmediates, and this fake BB will be removed at the end
+ // of the pass.
+ MachineBasicBlock *FakeCallerBB = nullptr;
+ MachineBasicBlock *getFakeCallerBlock(MachineFunction &MF) {
+ if (!FakeCallerBB)
+ FakeCallerBB = MF.CreateMachineBasicBlock();
+ return FakeCallerBB;
+ }
+
// Helper functions to register / unregister scope information created by
// marker instructions.
void registerScope(MachineInstr *Begin, MachineInstr *End);
@@ -189,6 +223,7 @@
EndToBegin[End] = Begin;
}
+// When 'End' is not an 'end_try' but a 'delegate', EHPad is nullptr.
void WebAssemblyCFGStackify::registerTryScope(MachineInstr *Begin,
MachineInstr *End,
MachineBasicBlock *EHPad) {
@@ -675,6 +710,8 @@
while (Cont->isEHPad()) {
MachineInstr *Try = EHPadToTry[Cont];
MachineInstr *EndTry = BeginToEnd[Try];
+ // We started from an EH pad, so the end marker cannot be a delegate
+ assert(EndTry->getOpcode() != WebAssembly::DELEGATE);
Cont = EndTry->getParent();
}
@@ -719,8 +756,10 @@
for (auto &MI : MBB) {
if (MI.getOpcode() != WebAssembly::TRY)
continue;
-
MachineInstr *Try = &MI, *EndTry = BeginToEnd[Try];
+ if (EndTry->getOpcode() == WebAssembly::DELEGATE)
+ continue;
+
MachineBasicBlock *TryBB = Try->getParent();
MachineBasicBlock *Cont = EndTry->getParent();
int64_t RetType = Try->getOperand(0).getImm();
@@ -763,12 +802,8 @@
// When MBB is split into MBB and Split, we should unstackify defs in MBB that
// have their uses in Split.
-// FIXME This function will be used when fixing unwind mismatches, but the old
-// version of that function was removed for the moment and the new version has
-// not yet been added. So 'LLVM_ATTRIBUTE_UNUSED' is added to suppress the
-// warning. Remove the attribute after the new functionality is added.
-LLVM_ATTRIBUTE_UNUSED static void
-unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB, MachineBasicBlock &Split) {
+static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB,
+ MachineBasicBlock &Split) {
MachineFunction &MF = *MBB.getParent();
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
@@ -826,22 +861,622 @@
}
}
-bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
- // TODO Implement this
- return false;
+// Wrap the given range of instruction with try-delegate. RangeBegin and
+// RangeEnd are inclusive.
+void WebAssemblyCFGStackify::addTryDelegate(MachineInstr *RangeBegin,
+ MachineInstr *RangeEnd,
+ MachineBasicBlock *DelegateDest) {
+ auto *BeginBB = RangeBegin->getParent();
+ auto *EndBB = RangeEnd->getParent();
+ MachineFunction &MF = *BeginBB->getParent();
+ const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+ // Local expression tree before the first call of this range should go
+ // after the nested TRY.
+ SmallPtrSet<const MachineInstr *, 4> AfterSet;
+ AfterSet.insert(RangeBegin);
+ for (auto I = MachineBasicBlock::iterator(RangeBegin), E = BeginBB->begin();
+ I != E; --I) {
+ if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition())
+ continue;
+ if (WebAssembly::isChild(*std::prev(I), MFI))
+ AfterSet.insert(&*std::prev(I));
+ else
+ break;
+ }
+
+ // Create the nested try instruction.
+ auto TryPos = getLatestInsertPos(
+ BeginBB, SmallPtrSet<const MachineInstr *, 4>(), AfterSet);
+ MachineInstr *Try = BuildMI(*BeginBB, TryPos, RangeBegin->getDebugLoc(),
+ TII.get(WebAssembly::TRY))
+ .addImm(int64_t(WebAssembly::BlockType::Void));
+
+ // Create a BB to insert the 'delegate' instruction.
+ MachineBasicBlock *DelegateBB = MF.CreateMachineBasicBlock();
+ // If the destination of 'delegate' is not the caller, add the destination to
+ // the BB's successors.
+ if (DelegateDest != FakeCallerBB)
+ DelegateBB->addSuccessor(DelegateDest);
+
+ auto SplitPos = std::next(RangeEnd->getIterator());
+ if (SplitPos == EndBB->end()) {
+ // If the range's end instruction is at the end of the BB, insert the new
+ // delegate BB after the current BB.
+ MF.insert(std::next(EndBB->getIterator()), DelegateBB);
+ EndBB->addSuccessor(DelegateBB);
+
+ } else {
+ // When the split pos is in the middle of a BB, we split the BB into two and
+ // put the 'delegate' BB in between. We normally create a split BB and make
+ // it a successor of the original BB (PostSplit == true), but in case the BB
+ // is an EH pad and the split pos is before 'catch', we should preserve the
+ // BB's property, including that it is an EH pad, in the later part of the
+ // BB, where 'catch' is. In this case we set PostSplit to false.
+ bool PostSplit = true;
+ if (EndBB->isEHPad()) {
+ for (auto I = MachineBasicBlock::iterator(SplitPos), E = EndBB->end();
+ I != E; ++I) {
+ if (WebAssembly::isCatch(I->getOpcode())) {
+ PostSplit = false;
+ break;
+ }
+ }
+ }
+
+ MachineBasicBlock *PreBB = nullptr, *PostBB = nullptr;
+ if (PostSplit) {
+ // If the range's end instruction is in the middle of the BB, we split the
+ // BB into two and insert the delegate BB in between.
+ // - Before:
+ // bb:
+ // range_end
+ // other_insts
+ //
+ // - After:
+ // pre_bb: (previous 'bb')
+ // range_end
+ // delegate_bb: (new)
+ // delegate
+ // post_bb: (new)
+ // other_insts
+ PreBB = EndBB;
+ PostBB = MF.CreateMachineBasicBlock();
+ MF.insert(std::next(PreBB->getIterator()), PostBB);
+ MF.insert(std::next(PreBB->getIterator()), DelegateBB);
+ PostBB->splice(PostBB->end(), PreBB, SplitPos, PreBB->end());
+ PostBB->transferSuccessors(PreBB);
+ } else {
+ // - Before:
+ // ehpad:
+ // range_end
+ // catch
+ // ...
+ //
+ // - After:
+ // pre_bb: (new)
+ // range_end
+ // delegate_bb: (new)
+ // delegate
+ // post_bb: (previous 'ehpad')
+ // catch
+ // ...
+ assert(EndBB->isEHPad());
+ PreBB = MF.CreateMachineBasicBlock();
+ PostBB = EndBB;
+ MF.insert(PostBB->getIterator(), PreBB);
+ MF.insert(PostBB->getIterator(), DelegateBB);
+ PreBB->splice(PreBB->end(), PostBB, PostBB->begin(), SplitPos);
+ // We don't need to transfer predecessors of the EH pad to 'PreBB',
+ // because an EH pad's predecessors are all through unwind edges and they
+ // should still unwind to the EH pad, not PreBB.
+ }
+ unstackifyVRegsUsedInSplitBB(*PreBB, *PostBB);
+ PreBB->addSuccessor(DelegateBB);
+ PreBB->addSuccessor(PostBB);
+ }
+
+ // Add 'delegate' instruction in the delegate BB created above.
+ MachineInstr *Delegate = BuildMI(DelegateBB, RangeEnd->getDebugLoc(),
+ TII.get(WebAssembly::DELEGATE))
+ .addMBB(DelegateDest);
+ registerTryScope(Try, Delegate, nullptr);
}
-static unsigned
-getDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack,
- const MachineBasicBlock *MBB) {
- unsigned Depth = 0;
- for (auto X : reverse(Stack)) {
- if (X == MBB)
- break;
- ++Depth;
+bool WebAssemblyCFGStackify::fixCallUnwindMismatches(MachineFunction &MF) {
+ // Linearizing the control flow by placing TRY / END_TRY markers can create
+ // mismatches in unwind destinations for throwing instructions, such as calls.
+ //
+ // We use the 'delegate' instruction to fix the unwind mismatches. 'delegate'
+ // instruction delegates an exception to an outer 'catch'. It can target not
+ // only 'catch' but all block-like structures including another 'delegate',
+ // but with slightly different semantics than branches. When it targets a
+ // 'catch', it will delegate the exception to that catch. It is being
+ // discussed how to define the semantics when 'delegate''s target is a non-try
+ // block: it will either be a validation failure or it will target the next
+ // outer try-catch. But anyway our LLVM backend currently does not generate
+ // such code. The example below illustrates where the 'delegate' instruction
+ // in the middle will delegate the exception to, depending on the value of N.
+ // try
+ // try
+ // block
+ // try
+ // try
+ // call @foo
+ // delegate N ;; Where will this delegate to?
+ // catch ;; N == 0
+ // end
+ // end ;; N == 1 (invalid; will not be generated)
+ // delegate ;; N == 2
+ // catch ;; N == 3
+ // end
+ // ;; N == 4 (to caller)
+
+ // 1. When an instruction may throw, but the EH pad it will unwind to can be
+ // different from the original CFG.
+ //
+ // Example: we have the following CFG:
+ // bb0:
+ // call @foo ; if it throws, unwind to bb2
+ // bb1:
+ // call @bar ; if it throws, unwind to bb3
+ // bb2 (ehpad):
+ // catch
+ // ...
+ // bb3 (ehpad)
+ // catch
+ // ...
+ //
+ // And the CFG is sorted in this order. Then after placing TRY markers, it
+ // will look like: (BB markers are omitted)
+ // try
+ // try
+ // call @foo
+ // call @bar ;; if it throws, unwind to bb3
+ // catch ;; ehpad (bb2)
+ // ...
+ // end_try
+ // catch ;; ehpad (bb3)
+ // ...
+ // end_try
+ //
+ // Now if bar() throws, it is going to end up in bb2, not bb3, where it
+ // is supposed to end up. We solve this problem by wrapping the mismatching
+ // call with an inner try-delegate that rethrows the exception to the right
+ // 'catch'.
+ //
+ // try
+ // try
+ // call @foo
+ // try ;; (new)
+ // call @bar
+ // delegate 1 (bb3) ;; (new)
+ // catch ;; ehpad (bb2)
+ // ...
+ // end_try
+ // catch ;; ehpad (bb3)
+ // ...
+ // end_try
+ //
+ // ---
+ // 2. The same as 1, but in this case an instruction unwinds to a caller
+ // function and not another EH pad.
+ //
+ // Example: we have the following CFG:
+ // bb0:
+ // call @foo ; if it throws, unwind to bb2
+ // bb1:
+ // call @bar ; if it throws, unwind to caller
+ // bb2 (ehpad):
+ // catch
+ // ...
+ //
+ // And the CFG is sorted in this order. Then after placing TRY markers, it
+ // will look like:
+ // try
+ // call @foo
+ // call @bar ;; if it throws, unwind to caller
+ // catch ;; ehpad (bb2)
+ // ...
+ // end_try
+ //
+ // Now if bar() throws, it is going to end up in bb2, when it is supposed to
+ // throw up to the caller. We solve this problem in the same way, but in this
+ // case 'delegate's immediate argument is the number of block depths + 1,
+ // which means it rethrows to the caller.
+ // try
+ // call @foo
+ // try ;; (new)
+ // call @bar
+ // delegate 1 (caller) ;; (new)
+ // catch ;; ehpad (bb2)
+ // ...
+ // end_try
+ //
+ // Before rewriteDepthImmediates, delegate's argument is a BB. In case of the
+ // caller, it will take a fake BB generated by getFakeCallerBlock(), which
+ // will be converted to a correct immediate argument later.
+ //
+ // In case there are multiple calls in a BB that may throw to the caller, they
+ // can be wrapped together in one nested try-delegate scope. (In 1, this
+ // couldn't happen, because a may-throwing instruction there had an unwind
+ // destination, i.e., it was an invoke before, and there could be only one
+ // invoke within a BB.)
+
+ SmallVector<const MachineBasicBlock *, 8> EHPadStack;
+ // Range of instructions to be wrapped in a new nested try/catch. A range
+ // exists in a single BB and does not span multiple BBs.
+ using TryRange = std::pair<MachineInstr *, MachineInstr *>;
+ // In original CFG, <unwind destination BB, a vector of try ranges>
+ DenseMap<MachineBasicBlock *, SmallVector<TryRange, 4>> UnwindDestToTryRanges;
+
+ // Gather possibly throwing calls (i.e., previously invokes) whose current
+ // unwind destination is not the same as the original CFG. (Case 1)
+
+ for (auto &MBB : reverse(MF)) {
+ bool SeenThrowableInstInBB = false;
+ for (auto &MI : reverse(MBB)) {
+ if (MI.getOpcode() == WebAssembly::TRY)
+ EHPadStack.pop_back();
+ else if (WebAssembly::isCatch(MI.getOpcode()))
+ EHPadStack.push_back(MI.getParent());
+
+ // In this loop we only gather calls that have an EH pad to unwind to. So
+ // there will be at most 1 such call (= invoke) in a BB, so after we've
+ // seen one, we can skip the rest of the BB. Also if MBB has no EH pad
+ // successor or MI does not throw, this is not an invoke.
+ if (SeenThrowableInstInBB || !MBB.hasEHPadSuccessor() ||
+ !WebAssembly::mayThrow(MI))
+ continue;
+ SeenThrowableInstInBB = true;
+
+ // If the EH pad on the stack top is where this instruction should unwind
+ // next, we're good.
+ MachineBasicBlock *UnwindDest = getFakeCallerBlock(MF);
+ for (auto *Succ : MBB.successors()) {
+ // Even though semantically a BB can have multiple successors in case an
+ // exception is not caught by a catchpad, in our backend implementation
+ // it is guaranteed that a BB can have at most one EH pad successor. For
+ // details, refer to comments in findWasmUnwindDestinations function in
+ // SelectionDAGBuilder.cpp.
+ if (Succ->isEHPad()) {
+ UnwindDest = Succ;
+ break;
+ }
+ }
+ if (EHPadStack.back() == UnwindDest)
+ continue;
+
+ // Include EH_LABELs in the range before and after the invoke
+ MachineInstr *RangeBegin = &MI, *RangeEnd = &MI;
+ if (RangeBegin->getIterator() != MBB.begin() &&
+ std::prev(RangeBegin->getIterator())->isEHLabel())
+ RangeBegin = &*std::prev(RangeBegin->getIterator());
+ if (std::next(RangeEnd->getIterator()) != MBB.end() &&
+ std::next(RangeEnd->getIterator())->isEHLabel())
+ RangeEnd = &*std::next(RangeEnd->getIterator());
+
+ // If not, record the range.
+ UnwindDestToTryRanges[UnwindDest].push_back(
+ TryRange(RangeBegin, RangeEnd));
+ LLVM_DEBUG(dbgs() << "- Call unwind mismatch: MBB = " << MBB.getName()
+ << "\nCall = " << MI
+ << "\nOriginal dest = " << UnwindDest->getName()
+ << " Current dest = " << EHPadStack.back()->getName()
+ << "\n\n");
+ }
}
- assert(Depth < Stack.size() && "Branch destination should be in scope");
- return Depth;
+
+ assert(EHPadStack.empty());
+
+ // Gather possibly throwing calls that are supposed to unwind up to the caller
+ // if they throw, but currently unwind to an incorrect destination. Unlike the
+ // loop above, there can be multiple calls within a BB that unwind to the
+ // caller, which we should group together in a range. (Case 2)
+
+ MachineInstr *RangeBegin = nullptr, *RangeEnd = nullptr; // inclusive
+
+ // Record the range.
+ auto RecordCallerMismatchRange = [&](const MachineBasicBlock *CurrentDest) {
+ UnwindDestToTryRanges[getFakeCallerBlock(MF)].push_back(
+ TryRange(RangeBegin, RangeEnd));
+ LLVM_DEBUG(dbgs() << "- Call unwind mismatch: MBB = "
+ << RangeBegin->getParent()->getName()
+ << "\nRange begin = " << *RangeBegin
+ << "Range end = " << *RangeEnd
+ << "\nOriginal dest = caller Current dest = "
+ << CurrentDest->getName() << "\n\n");
+ RangeBegin = RangeEnd = nullptr; // Reset range pointers
+ };
+
+ for (auto &MBB : reverse(MF)) {
+ bool SeenThrowableInstInBB = false;
+ for (auto &MI : reverse(MBB)) {
+ bool MayThrow = WebAssembly::mayThrow(MI);
+
+ // If MBB has an EH pad successor and this is the last instruction that
+ // may throw, this instruction unwinds to the EH pad and not to the
+ // caller.
+ if (MBB.hasEHPadSuccessor() && MayThrow && !SeenThrowableInstInBB)
+ SeenThrowableInstInBB = true;
+
+ // We wrap up the current range when we see a marker even if we haven't
+ // finished a BB.
+ else if (RangeEnd && WebAssembly::isMarker(MI.getOpcode()))
+ RecordCallerMismatchRange(EHPadStack.back());
+
+ // If EHPadStack is empty, that means it correctly unwinds to the caller
+ // if it throws, so we're good. If MI does not throw, we're good too.
+ else if (EHPadStack.empty() || !MayThrow) {
+ }
+
+ // We found an instruction that unwinds to the caller but currently has an
+ // incorrect unwind destination. Create a new range or increment the
+ // currently existing range.
+ else {
+ if (!RangeEnd)
+ RangeBegin = RangeEnd = &MI;
+ else
+ RangeBegin = &MI;
+ }
+
+ // Update EHPadStack.
+ if (MI.getOpcode() == WebAssembly::TRY)
+ EHPadStack.pop_back();
+ else if (WebAssembly::isCatch(MI.getOpcode()))
+ EHPadStack.push_back(MI.getParent());
+ }
+
+ if (RangeEnd)
+ RecordCallerMismatchRange(EHPadStack.back());
+ }
+
+ assert(EHPadStack.empty());
+
+ // We don't have any unwind destination mismatches to resolve.
+ if (UnwindDestToTryRanges.empty())
+ return false;
+
+ // Now we fix the mismatches by wrapping calls with inner try-delegates.
+ for (auto &P : UnwindDestToTryRanges) {
+ NumCallUnwindMismatches += P.second.size();
+ MachineBasicBlock *UnwindDest = P.first;
+ auto &TryRanges = P.second;
+
+ for (auto Range : TryRanges) {
+ MachineInstr *RangeBegin = nullptr, *RangeEnd = nullptr;
+ std::tie(RangeBegin, RangeEnd) = Range;
+ auto *MBB = RangeBegin->getParent();
+
+ // If this BB has an EH pad successor, i.e., ends with an 'invoke', now we
+ // are going to wrap the invoke with try-delegate, making the 'delegate'
+ // BB the new successor instead, so remove the EH pad successor here. The
+ // BB may not have an EH pad successor if calls in this BB throw to the
+ // caller.
+ MachineBasicBlock *EHPad = nullptr;
+ for (auto *Succ : MBB->successors()) {
+ if (Succ->isEHPad()) {
+ EHPad = Succ;
+ break;
+ }
+ }
+ if (EHPad)
+ MBB->removeSuccessor(EHPad);
+
+ addTryDelegate(RangeBegin, RangeEnd, UnwindDest);
+ }
+ }
+
+ return true;
+}
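To make the 'delegate N' numbering in the comment at the top of this function concrete, here is a minimal standalone sketch. It is a toy model only (strings standing in for enclosing constructs, listed outermost-first), not the pass's real representation:

#include <cassert>
#include <string>
#include <vector>

// EnclosingScopes lists the constructs around the delegate's own try,
// outermost first; N counts outward from the innermost one.
std::string delegateTarget(const std::vector<std::string> &EnclosingScopes,
                           unsigned N) {
  if (N >= EnclosingScopes.size())
    return "caller"; // One past the outermost scope rethrows to the caller.
  return EnclosingScopes[EnclosingScopes.size() - 1 - N];
}

int main() {
  // Matches the N == 0..4 example in the comment above.
  std::vector<std::string> Scopes = {"outer try-catch (N == 3)",
                                     "try-delegate (N == 2)",
                                     "block (N == 1, invalid)",
                                     "inner try-catch (N == 0)"};
  assert(delegateTarget(Scopes, 0) == "inner try-catch (N == 0)");
  assert(delegateTarget(Scopes, 2) == "try-delegate (N == 2)");
  assert(delegateTarget(Scopes, 4) == "caller");
  return 0;
}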
+
+bool WebAssemblyCFGStackify::fixCatchUnwindMismatches(MachineFunction &MF) {
+ // There is another kind of unwind destination mismatches besides call unwind
+ // mismatches, which we will call "catch unwind mismatches". See this example
+ // after the marker placement:
+ // try
+ // try
+ // call @foo
+ // catch __cpp_exception ;; ehpad A (next unwind dest: caller)
+ // ...
+ // end_try
+ // catch_all ;; ehpad B
+ // ...
+ // end_try
+ //
+ // 'call @foo's unwind destination is the ehpad A. But suppose 'call @foo'
+ // throws a foreign exception that is not caught by ehpad A, and its next
+ // destination should be the caller. But after control flow linearization,
+ // another EH pad can be placed in between (e.g. ehpad B here), making the
+ // next unwind destination incorrect. In this case, the foreign exception
+ // will instead go to ehpad B and will be caught there instead. In this
+ // example the correct next unwind destination is the caller, but it can be
+ // another outer catch in other cases.
+ //
+ // There is no specific 'call' or 'throw' instruction to wrap with a
+ // try-delegate, so we wrap the whole try-catch-end with a try-delegate and
+ // make it rethrow to the right destination, as in the example below:
+ // try
+ // try ;; (new)
+ // try
+ // call @foo
+ // catch __cpp_exception ;; ehpad A (next unwind dest: caller)
+ // ...
+ // end_try
+ // delegate 1 (caller) ;; (new)
+ // catch_all ;; ehpad B
+ // ...
+ // end_try
+
+ const auto *EHInfo = MF.getWasmEHFuncInfo();
+ SmallVector<const MachineBasicBlock *, 8> EHPadStack;
+ // For EH pads that have catch unwind mismatches, a map of <EH pad, its
+ // correct unwind destination>.
+ DenseMap<MachineBasicBlock *, MachineBasicBlock *> EHPadToUnwindDest;
+
+ for (auto &MBB : reverse(MF)) {
+ for (auto &MI : reverse(MBB)) {
+ if (MI.getOpcode() == WebAssembly::TRY)
+ EHPadStack.pop_back();
+ else if (MI.getOpcode() == WebAssembly::DELEGATE)
+ EHPadStack.push_back(&MBB);
+ else if (WebAssembly::isCatch(MI.getOpcode())) {
+ auto *EHPad = &MBB;
+
+ // catch_all always catches an exception, so we don't need to do
+ // anything
+ if (MI.getOpcode() == WebAssembly::CATCH_ALL) {
+ }
+
+ // This can happen when the unwind dest was removed during the
+ // optimization, e.g. because it was unreachable.
+ else if (EHPadStack.empty() && EHInfo->hasUnwindDest(EHPad)) {
+ LLVM_DEBUG(dbgs() << "EHPad " << EHPad->getName()
+ << "'s unwind destination does not exist anymore"
+ << "\n\n");
+ }
+
+ // The EHPad's next unwind destination is the caller, but we incorrectly
+ // unwind to another EH pad.
+ else if (!EHPadStack.empty() && !EHInfo->hasUnwindDest(EHPad)) {
+ EHPadToUnwindDest[EHPad] = getFakeCallerBlock(MF);
+ LLVM_DEBUG(dbgs()
+ << "- Catch unwind mismatch:\nEHPad = " << EHPad->getName()
+ << " Original dest = caller Current dest = "
+ << EHPadStack.back()->getName() << "\n\n");
+ }
+
+ // The EHPad's next unwind destination is an EH pad, whereas we
+ // incorrectly unwind to another EH pad.
+ else if (!EHPadStack.empty() && EHInfo->hasUnwindDest(EHPad)) {
+ auto *UnwindDest = EHInfo->getUnwindDest(EHPad);
+ if (EHPadStack.back() != UnwindDest) {
+ EHPadToUnwindDest[EHPad] = UnwindDest;
+ LLVM_DEBUG(dbgs() << "- Catch unwind mismatch:\nEHPad = "
+ << EHPad->getName() << " Original dest = "
+ << UnwindDest->getName() << " Current dest = "
+ << EHPadStack.back()->getName() << "\n\n");
+ }
+ }
+
+ EHPadStack.push_back(EHPad);
+ }
+ }
+ }
+
+ assert(EHPadStack.empty());
+ if (EHPadToUnwindDest.empty())
+ return false;
+ NumCatchUnwindMismatches += EHPadToUnwindDest.size();
+ SmallPtrSet<MachineBasicBlock *, 4> NewEndTryBBs;
+
+ for (auto &P : EHPadToUnwindDest) {
+ MachineBasicBlock *EHPad = P.first;
+ MachineBasicBlock *UnwindDest = P.second;
+ MachineInstr *Try = EHPadToTry[EHPad];
+ MachineInstr *EndTry = BeginToEnd[Try];
+ addTryDelegate(Try, EndTry, UnwindDest);
+ NewEndTryBBs.insert(EndTry->getParent());
+ }
+
+ // Adding a try-delegate wrapping an existing try-catch-end can make existing
+ // branch destination BBs invalid. For example,
+ //
+ // - Before:
+ // bb0:
+ // block
+ // br bb3
+ // bb1:
+ // try
+ // ...
+ // bb2: (ehpad)
+ // catch
+ // bb3:
+ // end_try
+ // end_block ;; 'br bb3' targets here
+ //
+ // Suppose this try-catch-end has a catch unwind mismatch, so we need to wrap
+ // this with a try-delegate. Then this becomes:
+ //
+ // - After:
+ // bb0:
+ // block
+ // br bb3 ;; invalid destination!
+ // bb1:
+ // try ;; (new instruction)
+ // try
+ // ...
+ // bb2: (ehpad)
+ // catch
+ // bb3:
+ // end_try ;; 'br bb3' still incorrectly targets here!
+ // delegate_bb: ;; (new BB)
+ // delegate ;; (new instruction)
+ // split_bb: ;; (new BB)
+ // end_block
+ //
+ // Now 'br bb3' incorrectly branches to an inner scope.
+ //
+ // As we can see in this case, when branches target a BB that has both
+ // 'end_try' and 'end_block' and the BB is split to insert a 'delegate', we
+ // have to remap existing branch destinations so that they target not the
+ // 'end_try' BB but the new 'end_block' BB. There can be multiple 'delegate's
+ // in between, so we try to find the next BB with 'end_block' instruction. In
+ // this example, the 'br bb3' instruction should be remapped to 'br split_bb'.
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (MI.isTerminator()) {
+ for (auto &MO : MI.operands()) {
+ if (MO.isMBB() && NewEndTryBBs.count(MO.getMBB())) {
+ auto *BrDest = MO.getMBB();
+ bool FoundEndBlock = false;
+ for (; std::next(BrDest->getIterator()) != MF.end();
+ BrDest = BrDest->getNextNode()) {
+ for (const auto &MI : *BrDest) {
+ if (MI.getOpcode() == WebAssembly::END_BLOCK) {
+ FoundEndBlock = true;
+ break;
+ }
+ }
+ if (FoundEndBlock)
+ break;
+ }
+ assert(FoundEndBlock);
+ MO.setMBB(BrDest);
+ }
+ }
+ }
+ }
+ }
+
+ return true;
+}
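A compact standalone model of the branch-retargeting step at the end of this function; ToyBlock and retargetPastDelegates are illustrative names, not the pass's types:

#include <cassert>
#include <string>
#include <vector>

struct ToyBlock {
  std::string Name;
  std::vector<std::string> Ops;
};

// Scan forward from the old destination until a block containing 'end_block'
// is found; a branch that used to target the old 'end_try' block is retargeted
// there instead.
size_t retargetPastDelegates(const std::vector<ToyBlock> &Blocks,
                             size_t OldDest) {
  for (size_t I = OldDest; I < Blocks.size(); ++I)
    for (const auto &Op : Blocks[I].Ops)
      if (Op == "end_block")
        return I;
  return OldDest; // Not expected; the real pass asserts this cannot happen.
}

int main() {
  // Mirrors the example above: 'br bb3' should now target split_bb, which
  // holds the end_block, instead of the end_try block bb3.
  std::vector<ToyBlock> Blocks = {{"bb3", {"end_try"}},
                                  {"delegate_bb", {"delegate"}},
                                  {"split_bb", {"end_block"}}};
  assert(retargetPastDelegates(Blocks, 0) == 2);
  return 0;
}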
+
+void WebAssemblyCFGStackify::recalculateScopeTops(MachineFunction &MF) {
+ // Renumber BBs and recalculate ScopeTop info because new BBs might have been
+ // created and inserted during fixing unwind mismatches.
+ MF.RenumberBlocks();
+ ScopeTops.clear();
+ ScopeTops.resize(MF.getNumBlockIDs());
+ for (auto &MBB : reverse(MF)) {
+ for (auto &MI : reverse(MBB)) {
+ if (ScopeTops[MBB.getNumber()])
+ break;
+ switch (MI.getOpcode()) {
+ case WebAssembly::END_BLOCK:
+ case WebAssembly::END_LOOP:
+ case WebAssembly::END_TRY:
+ case WebAssembly::DELEGATE:
+ updateScopeTops(EndToBegin[&MI]->getParent(), &MBB);
+ break;
+ case WebAssembly::CATCH:
+ case WebAssembly::CATCH_ALL:
+ updateScopeTops(EHPadToTry[&MBB]->getParent(), &MBB);
+ break;
+ }
+ }
+ }
}
/// In normal assembly languages, when the end of a function is unreachable,
@@ -889,6 +1524,7 @@
}
case WebAssembly::END_BLOCK:
case WebAssembly::END_LOOP:
+ case WebAssembly::DELEGATE:
EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType));
continue;
default:
@@ -937,37 +1573,144 @@
}
// Fix mismatches in unwind destinations induced by linearizing the code.
if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
- MF.getFunction().hasPersonalityFn())
- fixUnwindMismatches(MF);
+ MF.getFunction().hasPersonalityFn()) {
+ bool Changed = fixCallUnwindMismatches(MF);
+ Changed |= fixCatchUnwindMismatches(MF);
+ if (Changed)
+ recalculateScopeTops(MF);
+ }
+}
+
+unsigned WebAssemblyCFGStackify::getBranchDepth(
+ const SmallVectorImpl<EndMarkerInfo> &Stack, const MachineBasicBlock *MBB) {
+ unsigned Depth = 0;
+ for (auto X : reverse(Stack)) {
+ if (X.first == MBB)
+ break;
+ ++Depth;
+ }
+ assert(Depth < Stack.size() && "Branch destination should be in scope");
+ return Depth;
+}
+
+unsigned WebAssemblyCFGStackify::getDelegateDepth(
+ const SmallVectorImpl<EndMarkerInfo> &Stack, const MachineBasicBlock *MBB) {
+ if (MBB == FakeCallerBB)
+ return Stack.size();
+ // Delegate's destination is either a catch or another delegate BB. When the
+ // destination is another delegate, we can compute the argument in the same
+ // way as branches, because the target delegate BB only contains the single
+ // delegate instruction.
+ if (!MBB->isEHPad()) // Target is a delegate BB
+ return getBranchDepth(Stack, MBB);
+
+ // When the delegate's destination is a catch BB, we need to use its
+ // corresponding try's end_try BB because Stack contains each marker's end BB.
+ // Also we need to check if the end marker instruction matches, because a
+ // single BB can contain multiple end markers, like this:
+ // bb:
+ // END_BLOCK
+ // END_TRY
+ // END_BLOCK
+ // END_TRY
+ // ...
+ //
+ // In the case of branches, getting the immediate that targets any of these is
+ // fine, but delegate has to exactly target the correct try.
+ unsigned Depth = 0;
+ const MachineInstr *EndTry = BeginToEnd[EHPadToTry[MBB]];
+ for (auto X : reverse(Stack)) {
+ if (X.first == EndTry->getParent() && X.second == EndTry)
+ break;
+ ++Depth;
+ }
+ assert(Depth < Stack.size() && "Delegate destination should be in scope");
+ return Depth;
+}
+
+unsigned WebAssemblyCFGStackify::getRethrowDepth(
+ const SmallVectorImpl<EndMarkerInfo> &Stack,
+ const SmallVectorImpl<const MachineBasicBlock *> &EHPadStack) {
+ unsigned Depth = 0;
+ // In our current implementation, rethrows always rethrow the exception caught
+ // by the innermost enclosing catch. This means while traversing Stack in the
+ // reverse direction, when we encounter END_TRY, we should check if the
+ // END_TRY corresponds to the current innermost EH pad. For example:
+ // try
+ // ...
+ // catch ;; (a)
+ // try
+ // rethrow 1 ;; (b)
+ // catch ;; (c)
+ // rethrow 0 ;; (d)
+ // end ;; (e)
+ // end ;; (f)
+ //
+ // When we are at 'rethrow' (d), while reversely traversing Stack the first
+ // 'end' we encounter is the 'end' (e), which corresponds to the 'catch' (c).
+ // And 'rethrow' (d) rethrows the exception caught by 'catch' (c), so we stop
+ // there and the depth should be 0. But when we are at 'rethrow' (b), it
+ // rethrows the exception caught by 'catch' (a), so when traversing Stack
+ // reversely, we should skip the 'end' (e) and choose 'end' (f), which
+ // corresponds to 'catch' (a).
+ for (auto X : reverse(Stack)) {
+ const MachineInstr *End = X.second;
+ if (End->getOpcode() == WebAssembly::END_TRY) {
+ auto *EHPad = TryToEHPad[EndToBegin[End]];
+ if (EHPadStack.back() == EHPad)
+ break;
+ }
+ ++Depth;
+ }
+ assert(Depth < Stack.size() && "Rethrow destination should be in scope");
+ return Depth;
}
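The (a)-(f) example in the comment above can be replayed with a tiny standalone model; EndMarker and rethrowDepth are toy stand-ins for the EndMarkerInfo stack and the EH pad tracking, under the assumption that the innermost end marker sits last in the vector:

#include <cassert>
#include <string>
#include <vector>

struct EndMarker {
  bool IsEndTry;
  std::string CatchLabel; // which 'catch' this end_try closes (empty otherwise)
};

// Count outward until we reach the end_try whose catch is the innermost
// enclosing catch; other end markers (including other end_trys) are skipped.
unsigned rethrowDepth(const std::vector<EndMarker> &Stack,
                      const std::string &InnermostCatch) {
  unsigned Depth = 0;
  for (auto It = Stack.rbegin(); It != Stack.rend(); ++It, ++Depth)
    if (It->IsEndTry && It->CatchLabel == InnermostCatch)
      return Depth;
  assert(false && "Rethrow destination should be in scope");
  return Depth;
}

int main() {
  // Stack as seen at both rethrows: end (f) closes catch (a), end (e) closes
  // catch (c), with (e) innermost.
  std::vector<EndMarker> Stack = {{true, "a"}, {true, "c"}};
  assert(rethrowDepth(Stack, "c") == 0); // 'rethrow 0' at (d), inside catch (c)
  assert(rethrowDepth(Stack, "a") == 1); // 'rethrow 1' at (b), inside catch (a)
  return 0;
}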
void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
// Now rewrite references to basic blocks to be depth immediates.
- SmallVector<const MachineBasicBlock *, 8> Stack;
+ SmallVector<EndMarkerInfo, 8> Stack;
+ SmallVector<const MachineBasicBlock *, 8> EHPadStack;
for (auto &MBB : reverse(MF)) {
for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I) {
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
case WebAssembly::BLOCK:
case WebAssembly::TRY:
- assert(ScopeTops[Stack.back()->getNumber()]->getNumber() <=
+ assert(ScopeTops[Stack.back().first->getNumber()]->getNumber() <=
MBB.getNumber() &&
"Block/try marker should be balanced");
Stack.pop_back();
break;
case WebAssembly::LOOP:
- assert(Stack.back() == &MBB && "Loop top should be balanced");
+ assert(Stack.back().first == &MBB && "Loop top should be balanced");
Stack.pop_back();
break;
case WebAssembly::END_BLOCK:
- case WebAssembly::END_TRY:
- Stack.push_back(&MBB);
+ Stack.push_back(std::make_pair(&MBB, &MI));
break;
+ case WebAssembly::END_TRY: {
+ // We handle DELEGATE in the default level, because DELEGATE has
+ // immediate operands to rewrite.
+ Stack.push_back(std::make_pair(&MBB, &MI));
+ auto *EHPad = TryToEHPad[EndToBegin[&MI]];
+ EHPadStack.push_back(EHPad);
+ break;
+ }
+
case WebAssembly::END_LOOP:
- Stack.push_back(EndToBegin[&MI]->getParent());
+ Stack.push_back(std::make_pair(EndToBegin[&MI]->getParent(), &MI));
+ break;
+
+ case WebAssembly::CATCH:
+ case WebAssembly::CATCH_ALL:
+ EHPadStack.pop_back();
+ break;
+
+ case WebAssembly::RETHROW:
+ MI.getOperand(0).setImm(getRethrowDepth(Stack, EHPadStack));
break;
default:
@@ -977,11 +1720,20 @@
while (MI.getNumOperands() > 0)
MI.RemoveOperand(MI.getNumOperands() - 1);
for (auto MO : Ops) {
- if (MO.isMBB())
- MO = MachineOperand::CreateImm(getDepth(Stack, MO.getMBB()));
+ if (MO.isMBB()) {
+ if (MI.getOpcode() == WebAssembly::DELEGATE)
+ MO = MachineOperand::CreateImm(
+ getDelegateDepth(Stack, MO.getMBB()));
+ else
+ MO = MachineOperand::CreateImm(
+ getBranchDepth(Stack, MO.getMBB()));
+ }
MI.addOperand(MF, MO);
}
}
+
+ if (MI.getOpcode() == WebAssembly::DELEGATE)
+ Stack.push_back(std::make_pair(&MBB, &MI));
break;
}
}
@@ -989,13 +1741,18 @@
assert(Stack.empty() && "Control flow should be balanced");
}
+void WebAssemblyCFGStackify::cleanupFunctionData(MachineFunction &MF) {
+ if (FakeCallerBB)
+ MF.DeleteMachineBasicBlock(FakeCallerBB);
+ AppendixBB = FakeCallerBB = nullptr;
+}
+
void WebAssemblyCFGStackify::releaseMemory() {
ScopeTops.clear();
BeginToEnd.clear();
EndToBegin.clear();
TryToEHPad.clear();
EHPadToTry.clear();
- AppendixBB = nullptr;
}
bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
@@ -1031,6 +1788,8 @@
.isOSBinFormatELF())
appendEndToFunction(MF, TII);
+ cleanupFunctionData(MF);
+
MF.getInfo<WebAssemblyFunctionInfo>()->setCFGStackified();
return true;
}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp
index 655e30a..e06e359 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp
@@ -15,10 +15,10 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
-#include "WebAssemblyUtilities.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -111,14 +111,17 @@
Stack.pop_back();
assert(Prev.Reg == MO.getReg() &&
"WebAssemblyDebugFixup: Pop: Register not matched!");
- if (Prev.DebugValue) {
+ // We should not put a DBG_VALUE after a terminator; debug ranges
+ // are terminated at the end of a BB anyway.
+ if (Prev.DebugValue && !MI.isTerminator()) {
// This stackified reg is a variable that started life at
// Prev.DebugValue, so now that we're popping it we must insert
// a $noreg DBG_VALUE for the variable to end it, right after
// the current instruction.
BuildMI(*Prev.DebugValue->getParent(), std::next(MII),
- Prev.DebugValue->getDebugLoc(), TII->get(WebAssembly::DBG_VALUE), false,
- Register(), Prev.DebugValue->getOperand(2).getMetadata(),
+ Prev.DebugValue->getDebugLoc(),
+ TII->get(WebAssembly::DBG_VALUE), false, Register(),
+ Prev.DebugValue->getOperand(2).getMetadata(),
Prev.DebugValue->getOperand(3).getMetadata());
}
}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp
index 78191ae..55be64a 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp
@@ -24,13 +24,14 @@
// the whole BB, not just contiguous DBG_VALUEs.
if (!Instr->getOperand(0).isReg())
return;
+ CurrentReg = Instr->getOperand(0).getReg();
MachineBasicBlock::iterator DI = *Instr;
++DI;
for (MachineBasicBlock::iterator DE = Instr->getParent()->end(); DI != DE;
++DI) {
if (DI->isDebugValue() &&
- DI->getDebugOperandForReg(Instr->getOperand(0).getReg()))
+ DI->hasDebugOperandForReg(Instr->getOperand(0).getReg()))
DbgValues.push_back(&*DI);
}
}
@@ -43,7 +44,9 @@
void WebAssemblyDebugValueManager::updateReg(unsigned Reg) {
for (auto *DBI : DbgValues)
- DBI->getDebugOperand(0).setReg(Reg);
+ for (auto &MO : DBI->getDebugOperandsForReg(CurrentReg))
+ MO.setReg(Reg);
+ CurrentReg = Reg;
}
void WebAssemblyDebugValueManager::clone(MachineInstr *Insert,
@@ -52,14 +55,18 @@
MachineFunction *MF = MBB->getParent();
for (MachineInstr *DBI : reverse(DbgValues)) {
MachineInstr *Clone = MF->CloneMachineInstr(DBI);
- Clone->getDebugOperand(0).setReg(NewReg);
+ for (auto &MO : Clone->getDebugOperandsForReg(CurrentReg))
+ MO.setReg(NewReg);
MBB->insert(Insert, Clone);
}
}
void WebAssemblyDebugValueManager::replaceWithLocal(unsigned LocalId) {
for (auto *DBI : DbgValues) {
- MachineOperand &Op = DBI->getDebugOperand(0);
- Op.ChangeToTargetIndex(llvm::WebAssembly::TI_LOCAL, LocalId);
+ auto IndexType = DBI->isIndirectDebugValue()
+ ? llvm::WebAssembly::TI_LOCAL_INDIRECT
+ : llvm::WebAssembly::TI_LOCAL;
+ for (auto &MO : DBI->getDebugOperandsForReg(CurrentReg))
+ MO.ChangeToTargetIndex(IndexType, LocalId);
}
}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h
index 7eae3cb..c2dd569 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h
@@ -23,6 +23,7 @@
class WebAssemblyDebugValueManager {
SmallVector<MachineInstr *, 2> DbgValues;
+ unsigned CurrentReg;
public:
WebAssemblyDebugValueManager(MachineInstr *Instr);
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
index c75de7a..b949812 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
@@ -13,11 +13,14 @@
#include "WebAssemblyExceptionInfo.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
-#include "WebAssemblyUtilities.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineDominanceFrontier.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -37,14 +40,43 @@
"********** Function: "
<< MF.getName() << '\n');
releaseMemory();
+ if (MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() !=
+ ExceptionHandling::Wasm ||
+ !MF.getFunction().hasPersonalityFn())
+ return false;
auto &MDT = getAnalysis<MachineDominatorTree>();
auto &MDF = getAnalysis<MachineDominanceFrontier>();
- recalculate(MDT, MDF);
+ recalculate(MF, MDT, MDF);
+ LLVM_DEBUG(dump());
+ return false;
+}
+
+// Check if Dst is reachable from Src using BFS. Search only within BBs
+// dominated by Header.
+static bool isReachableAmongDominated(const MachineBasicBlock *Src,
+ const MachineBasicBlock *Dst,
+ const MachineBasicBlock *Header,
+ const MachineDominatorTree &MDT) {
+ assert(MDT.dominates(Header, Dst));
+ SmallVector<const MachineBasicBlock *, 8> WL;
+ SmallPtrSet<const MachineBasicBlock *, 8> Visited;
+ WL.push_back(Src);
+
+ while (!WL.empty()) {
+ const auto *MBB = WL.pop_back_val();
+ if (MBB == Dst)
+ return true;
+ Visited.insert(MBB);
+ for (auto *Succ : MBB->successors())
+ if (!Visited.count(Succ) && MDT.dominates(Header, Succ))
+ WL.push_back(Succ);
+ }
return false;
}
void WebAssemblyExceptionInfo::recalculate(
- MachineDominatorTree &MDT, const MachineDominanceFrontier &MDF) {
+ MachineFunction &MF, MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF) {
// Postorder traversal of the dominator tree.
SmallVector<std::unique_ptr<WebAssemblyException>, 8> Exceptions;
for (auto DomNode : post_order(&MDT)) {
@@ -56,12 +88,160 @@
Exceptions.push_back(std::move(WE));
}
- // Add BBs to exceptions
+ // WasmEHFuncInfo contains a map of <catchpad, its next unwind destination>,
+ // which means, if an exception is not caught by the catchpad, it should end
+ // up in the next unwind destination stored in this data structure. (It is
+ // written as catchswitch's 'unwind' destination in ll files.) The below is an
+ // intuitive example of their relationship in C++ code:
+ // try {
+ // try {
+ // } catch (int) { // catchpad
+ // ... // this catch (int) { ... } is grouped as an exception
+ // }
+ // } catch (...) { // next unwind destination
+ // }
+ // (The example uses try-catches for illustration purposes, but the unwind
+ // destination can also be a cleanuppad generated by destructor calls.) So the
+ // unwind destination is outside of the catchpad's exception.
+ //
+ // We group exceptions in this analysis simply by including all BBs dominated
+ // by an EH pad. But in case the EH pad's unwind destination does not have any
+ // children outside of the exception, that unwind destination ends up also
+ // being dominated by the EH pad and included in the exception, which is not
+ // semantically correct, because it unwinds/rethrows into an inner scope.
+ //
+ // Here we extract those unwind destinations from their (incorrect) parent
+ // exception. Note that the unwind destinations may not be an immediate
+ // children of the parent exception, so we have to traverse the parent chain.
+ //
+ // We should traverse BBs in the preorder of the dominator tree, because
+ // otherwise the result can be incorrect. For example, when there are three
+ // exceptions A, B, and C and A > B > C (> is subexception relationship here),
+ // and A's unwind destination is B and B's is C. When we visit B before A, we
+ // end up extracting C only out of B but not out of A.
+ const auto *EHInfo = MF.getWasmEHFuncInfo();
+ SmallVector<std::pair<WebAssemblyException *, WebAssemblyException *>>
+ UnwindWEVec;
+ for (auto *DomNode : depth_first(&MDT)) {
+ MachineBasicBlock *EHPad = DomNode->getBlock();
+ if (!EHPad->isEHPad())
+ continue;
+ if (!EHInfo->hasUnwindDest(EHPad))
+ continue;
+ auto *UnwindDest = EHInfo->getUnwindDest(EHPad);
+ auto *SrcWE = getExceptionFor(EHPad);
+ auto *DstWE = getExceptionFor(UnwindDest);
+ if (SrcWE->contains(DstWE)) {
+ UnwindWEVec.push_back(std::make_pair(SrcWE, DstWE));
+ LLVM_DEBUG(dbgs() << "Unwind destination ExceptionInfo fix:\n "
+ << DstWE->getEHPad()->getNumber() << "."
+ << DstWE->getEHPad()->getName()
+ << "'s exception is taken out of "
+ << SrcWE->getEHPad()->getNumber() << "."
+ << SrcWE->getEHPad()->getName() << "'s exception\n");
+ DstWE->setParentException(SrcWE->getParentException());
+ }
+ }
+
+ // After fixing subexception relationship between unwind destinations above,
+ // there can still be remaining discrepancies.
+ //
+ // For example, suppose Exception A is dominated by EHPad A and Exception B is
+ // dominated by EHPad B. EHPad A's unwind destination is EHPad B, but because
+ // EHPad B is dominated by EHPad A, the initial grouping makes Exception B a
+ // subexception of Exception A, and we fix it by taking Exception B out of
+ // Exception A above. But there can still be remaining BBs within Exception A
+ // that are reachable from Exception B. These BBs semantically don't belong
+ // to Exception A and were not a part of this 'catch' clause or cleanup code
+ // in the original code, but they just happened to be grouped within Exception
+ // A because they were dominated by EHPad A. We fix this case by taking those
+ // BBs out of the incorrect exception and all its subexceptions that it
+ // belongs to.
+ //
+ // 1. First, we take out remaining incorrect subexceptions. This part is
+ // easier, because we haven't added BBs to exceptions yet, we only need to
+ // change parent exception pointer.
+ for (auto *DomNode : depth_first(&MDT)) {
+ MachineBasicBlock *EHPad = DomNode->getBlock();
+ if (!EHPad->isEHPad())
+ continue;
+ auto *WE = getExceptionFor(EHPad);
+
+ // For each source EHPad -> unwind destination EHPad
+ for (auto &P : UnwindWEVec) {
+ auto *SrcWE = P.first;
+ auto *DstWE = P.second;
+ // If WE (the current EH pad's exception) is still contained in SrcWE but
+ // reachable from DstWE that was taken out of SrcWE above, we have to take
+ // out WE out of SrcWE too.
+ if (WE != SrcWE && SrcWE->contains(WE) && !DstWE->contains(WE) &&
+ isReachableAmongDominated(DstWE->getEHPad(), EHPad, SrcWE->getEHPad(),
+ MDT)) {
+ LLVM_DEBUG(dbgs() << "Remaining reachable ExceptionInfo fix:\n "
+ << WE->getEHPad()->getNumber() << "."
+ << WE->getEHPad()->getName()
+ << "'s exception is taken out of "
+ << SrcWE->getEHPad()->getNumber() << "."
+ << SrcWE->getEHPad()->getName() << "'s exception\n");
+ WE->setParentException(SrcWE->getParentException());
+ }
+ }
+ }
+
+ // Add BBs to exceptions' block set. This is preparation for taking out the
+ // remaining incorrect BBs from exceptions, because we need to iterate over BBs
+ // for each exception.
+ for (auto *DomNode : post_order(&MDT)) {
+ MachineBasicBlock *MBB = DomNode->getBlock();
+ WebAssemblyException *WE = getExceptionFor(MBB);
+ for (; WE; WE = WE->getParentException())
+ WE->addToBlocksSet(MBB);
+ }
+
+ // 2. We take the remaining individual BBs out. Now that we have added BBs to
+ // each exception's BlockSet, when we take a BB out of an exception, we need
+ // to fix those sets too.
+ for (auto &P : UnwindWEVec) {
+ auto *SrcWE = P.first;
+ auto *DstWE = P.second;
+
+ for (auto *MBB : SrcWE->getBlocksSet()) {
+ if (MBB->isEHPad()) {
+ assert(!isReachableAmongDominated(DstWE->getEHPad(), MBB,
+ SrcWE->getEHPad(), MDT) &&
+ "We already handled EH pads above");
+ continue;
+ }
+ if (isReachableAmongDominated(DstWE->getEHPad(), MBB, SrcWE->getEHPad(),
+ MDT)) {
+ LLVM_DEBUG(dbgs() << "Remainder BB: " << MBB->getNumber() << "."
+ << MBB->getName() << " is\n");
+ WebAssemblyException *InnerWE = getExceptionFor(MBB);
+ while (InnerWE != SrcWE) {
+ LLVM_DEBUG(dbgs()
+ << " removed from " << InnerWE->getEHPad()->getNumber()
+ << "." << InnerWE->getEHPad()->getName()
+ << "'s exception\n");
+ InnerWE->removeFromBlocksSet(MBB);
+ InnerWE = InnerWE->getParentException();
+ }
+ SrcWE->removeFromBlocksSet(MBB);
+ LLVM_DEBUG(dbgs() << " removed from " << SrcWE->getEHPad()->getNumber()
+ << "." << SrcWE->getEHPad()->getName()
+ << "'s exception\n");
+ changeExceptionFor(MBB, SrcWE->getParentException());
+ if (SrcWE->getParentException())
+ SrcWE->getParentException()->addToBlocksSet(MBB);
+ }
+ }
+ }
+
+ // Add BBs to exceptions' block vector
for (auto DomNode : post_order(&MDT)) {
MachineBasicBlock *MBB = DomNode->getBlock();
WebAssemblyException *WE = getExceptionFor(MBB);
for (; WE; WE = WE->getParentException())
- WE->addBlock(MBB);
+ WE->addToBlocksVector(MBB);
}
SmallVector<WebAssemblyException*, 8> ExceptionPointers;
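The part-2 remainder fix in the hunk above (removing a block from every exception on its chain up to the offending one, then reparenting it) can be sketched standalone as follows; ToyException and moveBlockOut are illustrative names, not the real WebAssemblyException API:

#include <cassert>
#include <set>
#include <string>

struct ToyException {
  std::string Name;
  ToyException *Parent = nullptr;
  std::set<std::string> Blocks;
};

// Remove MBB from every exception on the chain from its innermost exception up
// to and including the offending SrcWE, then hand it to SrcWE's parent.
void moveBlockOut(const std::string &MBB, ToyException *InnerWE,
                  ToyException *SrcWE) {
  for (ToyException *WE = InnerWE; WE != SrcWE; WE = WE->Parent)
    WE->Blocks.erase(MBB);
  SrcWE->Blocks.erase(MBB);
  if (SrcWE->Parent)
    SrcWE->Parent->Blocks.insert(MBB);
}

int main() {
  ToyException Outer{"outer"}, Src{"src", &Outer}, Inner{"inner", &Src};
  for (auto *E : {&Outer, &Src, &Inner})
    E->Blocks.insert("bb5");
  moveBlockOut("bb5", &Inner, &Src);
  assert(!Inner.Blocks.count("bb5") && !Src.Blocks.count("bb5"));
  assert(Outer.Blocks.count("bb5"));
  return 0;
}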
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
index 50151ec..e330188 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
@@ -45,7 +45,7 @@
WebAssemblyException *ParentException = nullptr;
std::vector<std::unique_ptr<WebAssemblyException>> SubExceptions;
std::vector<MachineBasicBlock *> Blocks;
- SmallPtrSet<const MachineBasicBlock *, 8> BlockSet;
+ SmallPtrSet<MachineBasicBlock *, 8> BlockSet;
public:
WebAssemblyException(MachineBasicBlock *EHPad) : EHPad(EHPad) {}
@@ -68,6 +68,9 @@
return BlockSet.count(MBB);
}
+ void addToBlocksSet(MachineBasicBlock *MBB) { BlockSet.insert(MBB); }
+ void removeFromBlocksSet(MachineBasicBlock *MBB) { BlockSet.erase(MBB); }
+ void addToBlocksVector(MachineBasicBlock *MBB) { Blocks.push_back(MBB); }
void addBlock(MachineBasicBlock *MBB) {
Blocks.push_back(MBB);
BlockSet.insert(MBB);
@@ -81,8 +84,10 @@
}
unsigned getNumBlocks() const { return Blocks.size(); }
std::vector<MachineBasicBlock *> &getBlocksVector() { return Blocks; }
+ SmallPtrSetImpl<MachineBasicBlock *> &getBlocksSet() { return BlockSet; }
- const std::vector<std::unique_ptr<WebAssemblyException>> &getSubExceptions() const {
+ const std::vector<std::unique_ptr<WebAssemblyException>> &
+ getSubExceptions() const {
return SubExceptions;
}
std::vector<std::unique_ptr<WebAssemblyException>> &getSubExceptions() {
@@ -137,7 +142,7 @@
bool runOnMachineFunction(MachineFunction &) override;
void releaseMemory() override;
- void recalculate(MachineDominatorTree &MDT,
+ void recalculate(MachineFunction &MF, MachineDominatorTree &MDT,
const MachineDominanceFrontier &MDF);
void getAnalysisUsage(AnalysisUsage &AU) const override;
@@ -149,7 +154,8 @@
return BBMap.lookup(MBB);
}
- void changeExceptionFor(MachineBasicBlock *MBB, WebAssemblyException *WE) {
+ void changeExceptionFor(const MachineBasicBlock *MBB,
+ WebAssemblyException *WE) {
if (!WE) {
BBMap.erase(MBB);
return;
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index ac94e9e..4a0738d 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -16,11 +16,11 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyDebugValueManager.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
-#include "WebAssemblyUtilities.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -239,8 +239,10 @@
Changed = true;
}
- // Start assigning local numbers after the last parameter.
+ // Start assigning local numbers after the last parameter and after any
+ // already-assigned locals.
unsigned CurLocal = static_cast<unsigned>(MFI.getParams().size());
+ CurLocal += static_cast<unsigned>(MFI.getLocals().size());
// Precompute the set of registers that are unused, so that we can insert
// drops to their defs.
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 4eb4275..171d59a 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -16,11 +16,11 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyTargetMachine.h"
-#include "WebAssemblyUtilities.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -130,9 +130,12 @@
case MVT::i64:
case MVT::f32:
case MVT::f64:
+ return VT;
case MVT::funcref:
case MVT::externref:
- return VT;
+ if (Subtarget->hasReferenceTypes())
+ return VT;
+ break;
case MVT::f16:
return MVT::f32;
case MVT::v16i8:
@@ -866,18 +869,32 @@
if (IsDirect) {
MIB.addGlobalAddress(Func);
} else {
- // Add placeholders for the type index and immediate flags
+ // Placeholder for the type index.
MIB.addImm(0);
- MIB.addImm(0);
-
- // Ensure that the object file has a __indirect_function_table import, as we
- // call_indirect against it.
- MCSymbolWasm *Sym = WebAssembly::getOrCreateFunctionTableSymbol(
- MF->getMMI().getContext(), "__indirect_function_table");
- // Until call_indirect emits TABLE_NUMBER relocs against this symbol, mark
- // it as NO_STRIP so as to ensure that the indirect function table makes it
- // to linked output.
- Sym->setNoStrip();
+ // The table into which this call_indirect indexes.
+ MCSymbolWasm *Table = WebAssembly::getOrCreateFunctionTableSymbol(
+ MF->getMMI().getContext(), Subtarget);
+ if (Subtarget->hasReferenceTypes()) {
+ MIB.addSym(Table);
+ } else {
+ // Otherwise for the MVP there is at most one table whose number is 0, but
+ // we can't write a table symbol or issue relocations. Instead we just
+ // ensure the table is live.
+ Table->setNoStrip();
+ MIB.addImm(0);
+ }
+ // See if we must truncate the function pointer.
+ // CALL_INDIRECT takes an i32, but in wasm64 we represent function pointers
+ // as 64-bit for uniformity with other pointer types.
+ // See also: WebAssemblyISelLowering.cpp: LowerCallResults
+ if (Subtarget->hasAddr64()) {
+ auto Wrap = BuildMI(*FuncInfo.MBB, std::prev(FuncInfo.InsertPt), DbgLoc,
+ TII.get(WebAssembly::I32_WRAP_I64));
+ unsigned Reg32 = createResultReg(&WebAssembly::I32RegClass);
+ Wrap.addReg(Reg32, RegState::Define);
+ Wrap.addReg(CalleeReg);
+ CalleeReg = Reg32;
+ }
}
for (unsigned ArgReg : Args)
@@ -1153,7 +1170,7 @@
}
Register Reg = fastEmit_ISD_BITCAST_r(VT.getSimpleVT(), RetVT.getSimpleVT(),
- In, I->getOperand(0)->hasOneUse());
+ In);
if (!Reg)
return false;
MachineBasicBlock::iterator Iter = FuncInfo.InsertPt;
@@ -1168,6 +1185,8 @@
const auto *Load = cast<LoadInst>(I);
if (Load->isAtomic())
return false;
+ if (!WebAssembly::isDefaultAddressSpace(Load->getPointerAddressSpace()))
+ return false;
if (!Subtarget->hasSIMD128() && Load->getType()->isVectorTy())
return false;
@@ -1226,6 +1245,8 @@
const auto *Store = cast<StoreInst>(I);
if (Store->isAtomic())
return false;
+ if (!WebAssembly::isDefaultAddressSpace(Store->getPointerAddressSpace()))
+ return false;
if (!Subtarget->hasSIMD128() &&
Store->getValueOperand()->getType()->isVectorTy())
return false;
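
The FastISel change in the hunk above also wraps 64-bit callees down to 32 bits: call_indirect always takes an i32 table index, while wasm64 models function pointers as 64-bit values, so the callee register is narrowed with I32_WRAP_I64 before the call. A minimal standalone sketch of that wrap (plain C++, not LLVM code; the helper name is illustrative only):

```cpp
#include <cstdint>
#include <cstdio>

// Models I32_WRAP_I64: keep only the low 32 bits of a wasm64 function pointer
// so it can serve as the i32 table index that call_indirect expects.
static uint32_t wrapFunctionPointer(uint64_t FuncPtr64) {
  return static_cast<uint32_t>(FuncPtr64);
}

int main() {
  uint64_t Callee = 7; // entry 7 in the indirect function table
  std::printf("call_indirect index: %u\n", wrapFunctionPointer(Callee));
  return 0;
}
```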
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index 9566993..c45f7d7 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -19,17 +19,19 @@
#include "WebAssemblyFrameLowering.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyInstrInfo.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyTargetMachine.h"
-#include "WebAssemblyUtilities.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -39,6 +41,52 @@
// TODO: wasm64
// TODO: Emit TargetOpcode::CFI_INSTRUCTION instructions
+// In an ideal world, when objects are added to the MachineFrameInfo by
+// FunctionLoweringInfo::set, we could somehow hook into target-specific code to
+// ensure they are assigned the right stack ID. However, there isn't a hook that
+// runs between then and DAG building time, so instead we hoist stack
+// objects lazily when they are first used, and comprehensively after the DAG is
+// built via the PreprocessISelDAG hook, called by
+// SelectionDAGISel::runOnMachineFunction. We have to do it in two places
+// because we want to do it while building the selection DAG for uses of alloca,
+// but not all alloca instructions are used, so we have to follow up afterwards.
+Optional<unsigned>
+WebAssemblyFrameLowering::getLocalForStackObject(MachineFunction &MF,
+ int FrameIndex) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // If already hoisted to a local, done.
+ if (MFI.getStackID(FrameIndex) == TargetStackID::WasmLocal)
+ return static_cast<unsigned>(MFI.getObjectOffset(FrameIndex));
+
+ // If not allocated in the object address space, this object will be in
+ // linear memory.
+ const AllocaInst *AI = MFI.getObjectAllocation(FrameIndex);
+ if (!AI ||
+ !WebAssembly::isWasmVarAddressSpace(AI->getType()->getAddressSpace()))
+ return None;
+
+ // Otherwise, allocate this object in the named value stack, outside of linear
+ // memory.
+ SmallVector<EVT, 4> ValueVTs;
+ const WebAssemblyTargetLowering &TLI =
+ *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
+ WebAssemblyFunctionInfo *FuncInfo = MF.getInfo<WebAssemblyFunctionInfo>();
+ ComputeValueVTs(TLI, MF.getDataLayout(), AI->getAllocatedType(), ValueVTs);
+ MFI.setStackID(FrameIndex, TargetStackID::WasmLocal);
+ // Abuse SP offset to record the index of the first local in the object.
+ unsigned Local = FuncInfo->getParams().size() + FuncInfo->getLocals().size();
+ MFI.setObjectOffset(FrameIndex, Local);
+ // Allocate WebAssembly locals for each non-aggregate component of the
+ // allocation.
+ for (EVT ValueVT : ValueVTs)
+ FuncInfo->addLocal(ValueVT.getSimpleVT());
+ // Abuse object size to record number of WebAssembly locals allocated to
+ // this object.
+ MFI.setObjectSize(FrameIndex, ValueVTs.size());
+ return static_cast<unsigned>(Local);
+}
+
/// We need a base pointer in the case of having items on the stack that
/// require stricter alignment than the stack pointer itself. Because we need
/// to shift the stack pointer by some unknown amount to force the alignment,
@@ -46,7 +94,7 @@
bool WebAssemblyFrameLowering::hasBP(const MachineFunction &MF) const {
const auto *RegInfo =
MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
- return RegInfo->needsStackRealignment(MF);
+ return RegInfo->hasStackRealignment(MF);
}
/// Return true if the specified function should have a dedicated frame pointer
@@ -314,6 +362,16 @@
writeSPToGlobal(SPReg, MF, MBB, InsertPt, DL);
}
+bool WebAssemblyFrameLowering::isSupportedStackID(
+ TargetStackID::Value ID) const {
+ // Use the Object stack for WebAssembly locals which can only be accessed
+ // by name, not via an address in linear memory.
+ if (ID == TargetStackID::WasmLocal)
+ return true;
+
+ return TargetFrameLowering::isSupportedStackID(ID);
+}
+
TargetFrameLowering::DwarfFrameBase
WebAssemblyFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
DwarfFrameBase Loc;
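
The getLocalForStackObject hook added above records, for each hoisted frame object, which WebAssembly locals back it by reusing the object's offset as the index of the first local and its size as the local count. A standalone sketch of that bookkeeping (a simplified model, not the LLVM data structures; names here are illustrative only):

```cpp
#include <cstdio>
#include <vector>

// One wasm local is allocated per non-aggregate component of the alloca.
struct HoistedObject {
  unsigned FirstLocal; // stands in for the reused object offset
  unsigned NumLocals;  // stands in for the reused object size
};

static HoistedObject hoistToLocals(unsigned NumParams,
                                   std::vector<int> &Locals,
                                   unsigned NumComponents) {
  HoistedObject Obj;
  // Locals are numbered after the parameters and any already-assigned locals.
  Obj.FirstLocal = NumParams + static_cast<unsigned>(Locals.size());
  for (unsigned I = 0; I < NumComponents; ++I)
    Locals.push_back(0); // allocate one wasm local per component
  Obj.NumLocals = NumComponents;
  return Obj;
}

int main() {
  std::vector<int> Locals = {0};                 // one local already assigned
  HoistedObject O = hoistToLocals(2, Locals, 3); // 2 params, 3 components
  std::printf("object occupies locals [%u, %u)\n", O.FirstLocal,
              O.FirstLocal + O.NumLocals);
  return 0;
}
```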
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
index e16f639..d30a3fa 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -43,6 +43,7 @@
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool isSupportedStackID(TargetStackID::Value ID) const override;
DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const override;
bool needsPrologForEH(const MachineFunction &MF) const;
@@ -53,6 +54,11 @@
MachineBasicBlock::iterator &InsertStore,
const DebugLoc &DL) const;
+ // Returns the index of the WebAssembly local to which the stack object
+ // FrameIndex in MF should be allocated, or None.
+ static Optional<unsigned> getLocalForStackObject(MachineFunction &MF,
+ int FrameIndex);
+
static unsigned getSPReg(const MachineFunction &MF);
static unsigned getFPReg(const MachineFunction &MF);
static unsigned getOpcConst(const MachineFunction &MF);
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
index 33fd2ae..21519d6 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -17,9 +17,11 @@
HANDLE_NODETYPE(RET_CALL)
HANDLE_NODETYPE(RETURN)
HANDLE_NODETYPE(ARGUMENT)
+HANDLE_NODETYPE(LOCAL_GET)
+HANDLE_NODETYPE(LOCAL_SET)
// A wrapper node for TargetExternalSymbol, TargetGlobalAddress, and MCSymbol
HANDLE_NODETYPE(Wrapper)
-// A special wapper used in PIC code for __memory_base/__table_base relcative
+// A special wrapper used in PIC code for __memory_base/__table_base relative
// access.
HANDLE_NODETYPE(WrapperPIC)
HANDLE_NODETYPE(BR_IF)
@@ -35,8 +37,10 @@
HANDLE_NODETYPE(EXTEND_HIGH_U)
HANDLE_NODETYPE(CONVERT_LOW_S)
HANDLE_NODETYPE(CONVERT_LOW_U)
+HANDLE_NODETYPE(PROMOTE_LOW)
HANDLE_NODETYPE(TRUNC_SAT_ZERO_S)
HANDLE_NODETYPE(TRUNC_SAT_ZERO_U)
+HANDLE_NODETYPE(DEMOTE_ZERO)
HANDLE_NODETYPE(THROW)
HANDLE_NODETYPE(CATCH)
HANDLE_NODETYPE(MEMORY_COPY)
@@ -44,3 +48,6 @@
// Memory intrinsics
HANDLE_MEM_NODETYPE(LOAD_SPLAT)
+HANDLE_MEM_NODETYPE(GLOBAL_GET)
+HANDLE_MEM_NODETYPE(GLOBAL_SET)
+HANDLE_MEM_NODETYPE(TABLE_SET)
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index b9154b0..f4bae59 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -13,7 +13,9 @@
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssembly.h"
+#include "WebAssemblyISelLowering.h"
#include "WebAssemblyTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h" // To access function attributes.
@@ -46,20 +48,44 @@
return "WebAssembly Instruction Selection";
}
+ void checkForInvalidNodes(const Function &F) {
+ // Check for uses of ptrtoint and inttoptr on reference types and report a
+ // fatal error if any are found.
+ for (const BasicBlock &BB : F) {
+ for (const Instruction &I : BB) {
+ if (const PtrToIntInst *PTI = dyn_cast<const PtrToIntInst>(&I)) {
+ const Value *V = PTI->getPointerOperand();
+ if (WebAssemblyTargetLowering::isFuncrefType(V->getType()) ||
+ WebAssemblyTargetLowering::isExternrefType(V->getType()))
+ report_fatal_error("ptrtoint not allowed on reference types");
+ } else if (const IntToPtrInst *ITP = dyn_cast<const IntToPtrInst>(&I)) {
+ if (WebAssemblyTargetLowering::isFuncrefType(ITP->getDestTy()) ||
+ WebAssemblyTargetLowering::isExternrefType(ITP->getDestTy()))
+ report_fatal_error("inttoptr not allowed on reference types");
+ }
+ }
+ }
+ }
+
bool runOnMachineFunction(MachineFunction &MF) override {
LLVM_DEBUG(dbgs() << "********** ISelDAGToDAG **********\n"
"********** Function: "
<< MF.getName() << '\n');
+ checkForInvalidNodes(MF.getFunction());
+
Subtarget = &MF.getSubtarget<WebAssemblySubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
+ void PreprocessISelDAG() override;
+
void Select(SDNode *Node) override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
+ bool SelectExternRefAddr(const SDValue &Addr, const SDValue &Base);
// Include the pieces autogenerated from the target description.
#include "WebAssemblyGenDAGISel.inc"
@@ -69,6 +95,18 @@
};
} // end anonymous namespace
+void WebAssemblyDAGToDAGISel::PreprocessISelDAG() {
+ // Stack objects that should be allocated to locals are hoisted to WebAssembly
+ // locals when they are first used. However, for those without uses, we hoist
+ // them here. It would be nice if there were some hook to do this when they
+ // are added to the MachineFrameInfo, but that's not the case right now.
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ for (int Idx = 0; Idx < FrameInfo.getObjectIndexEnd(); Idx++)
+ WebAssemblyFrameLowering::getLocalForStackObject(*MF, Idx);
+
+ SelectionDAGISel::PreprocessISelDAG();
+}
+
void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
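
The checkForInvalidNodes pre-pass added above rejects ptrtoint and inttoptr involving reference types, since externref and funcref values are opaque and have no integer representation in WebAssembly. A standalone sketch of the same rule (plain C++ model, not the LLVM IR classes; the enum and helper names are illustrative only):

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>

enum class TypeKind { Integer, LinearMemoryPtr, ExternRef, FuncRef };

static bool isReferenceType(TypeKind K) {
  return K == TypeKind::ExternRef || K == TypeKind::FuncRef;
}

// Mirrors the rule enforced by checkForInvalidNodes: casting a reference to an
// integer (ptrtoint) or an integer to a reference (inttoptr) is a fatal error.
static void checkCast(const std::string &Op, TypeKind From, TypeKind To) {
  if ((Op == "ptrtoint" && isReferenceType(From)) ||
      (Op == "inttoptr" && isReferenceType(To)))
    throw std::runtime_error(Op + " not allowed on reference types");
}

int main() {
  checkCast("ptrtoint", TypeKind::LinearMemoryPtr, TypeKind::Integer); // OK
  try {
    checkCast("inttoptr", TypeKind::Integer, TypeKind::FuncRef); // rejected
  } catch (const std::runtime_error &E) {
    std::printf("%s\n", E.what());
  }
  return 0;
}
```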
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 2974217..62c53c0 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -13,17 +13,18 @@
#include "WebAssemblyISelLowering.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyTargetMachine.h"
-#include "WebAssemblyUtilities.h"
-#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
@@ -66,9 +67,33 @@
addRegisterClass(MVT::v2i64, &WebAssembly::V128RegClass);
addRegisterClass(MVT::v2f64, &WebAssembly::V128RegClass);
}
+ if (Subtarget->hasReferenceTypes()) {
+ addRegisterClass(MVT::externref, &WebAssembly::EXTERNREFRegClass);
+ addRegisterClass(MVT::funcref, &WebAssembly::FUNCREFRegClass);
+ }
// Compute derived properties from the register classes.
computeRegisterProperties(Subtarget->getRegisterInfo());
+ // Transform loads and stores to pointers in address space 1 to loads and
+ // stores to WebAssembly global variables, outside linear memory.
+ for (auto T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64}) {
+ setOperationAction(ISD::LOAD, T, Custom);
+ setOperationAction(ISD::STORE, T, Custom);
+ }
+ if (Subtarget->hasSIMD128()) {
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64,
+ MVT::v2f64}) {
+ setOperationAction(ISD::LOAD, T, Custom);
+ setOperationAction(ISD::STORE, T, Custom);
+ }
+ }
+ if (Subtarget->hasReferenceTypes()) {
+ for (auto T : {MVT::externref, MVT::funcref}) {
+ setOperationAction(ISD::LOAD, T, Custom);
+ setOperationAction(ISD::STORE, T, Custom);
+ }
+ }
+
setOperationAction(ISD::GlobalAddress, MVTPtr, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVTPtr, Custom);
setOperationAction(ISD::ExternalSymbol, MVTPtr, Custom);
@@ -135,11 +160,18 @@
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
- // Combine {s,u}int_to_fp of extract_vectors into conversion ops
+ // Combine int_to_fp or fp_extend of extract_vectors and vice versa into
+ // conversion ops
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::FP_EXTEND);
+ setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
- // Combine concat of {s,u}int_to_fp_sat to i32x4.trunc_sat_f64x2_zero_{s,u}
+ // Combine fp_to_{s,u}int_sat or fp_round of concat_vectors or vice versa
+ // into conversion ops
+ setTargetDAGCombine(ISD::FP_TO_SINT_SAT);
+ setTargetDAGCombine(ISD::FP_TO_UINT_SAT);
+ setTargetDAGCombine(ISD::FP_ROUND);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
// Support saturating add for i8x16 and i16x8
@@ -148,7 +180,7 @@
setOperationAction(Op, T, Legal);
// Support integer abs
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64})
setOperationAction(ISD::ABS, T, Legal);
// Custom lower BUILD_VECTORs to minimize number of replace_lanes
@@ -191,6 +223,9 @@
for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
setOperationAction(Op, T, Legal);
+ // And we have popcnt for i8x16
+ setOperationAction(ISD::CTPOP, MVT::v16i8, Legal);
+
// Expand float operations supported for scalars but not SIMD
for (auto Op : {ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
ISD::FEXP, ISD::FEXP2, ISD::FRINT})
@@ -320,6 +355,26 @@
return AtomicExpansionKind::CmpXChg;
}
+bool WebAssemblyTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
+ // Implementation copied from X86TargetLowering.
+ unsigned Opc = VecOp.getOpcode();
+
+ // Assume target opcodes can't be scalarized.
+ // TODO - do we have any exceptions?
+ if (Opc >= ISD::BUILTIN_OP_END)
+ return false;
+
+ // If the vector op is not supported, try to convert to scalar.
+ EVT VecVT = VecOp.getValueType();
+ if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
+ return true;
+
+ // If the vector op is supported, but the scalar op is not, the transform may
+ // not be worthwhile.
+ EVT ScalarVT = VecVT.getScalarType();
+ return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
+}
+
FastISel *WebAssemblyTargetLowering::createFastISel(
FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) const {
return WebAssembly::createFastISel(FuncInfo, LibInfo);
@@ -441,9 +496,10 @@
return DoneMBB;
}
-static MachineBasicBlock *LowerCallResults(MachineInstr &CallResults,
- DebugLoc DL, MachineBasicBlock *BB,
- const TargetInstrInfo &TII) {
+static MachineBasicBlock *
+LowerCallResults(MachineInstr &CallResults, DebugLoc DL, MachineBasicBlock *BB,
+ const WebAssemblySubtarget *Subtarget,
+ const TargetInstrInfo &TII) {
MachineInstr &CallParams = *CallResults.getPrevNode();
assert(CallParams.getOpcode() == WebAssembly::CALL_PARAMS);
assert(CallResults.getOpcode() == WebAssembly::CALL_RESULTS ||
@@ -452,6 +508,16 @@
bool IsIndirect = CallParams.getOperand(0).isReg();
bool IsRetCall = CallResults.getOpcode() == WebAssembly::RET_CALL_RESULTS;
+ bool IsFuncrefCall = false;
+ if (IsIndirect) {
+ Register Reg = CallParams.getOperand(0).getReg();
+ const MachineFunction *MF = BB->getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterClass *TRC = MRI.getRegClass(Reg);
+ IsFuncrefCall = (TRC == &WebAssembly::FUNCREFRegClass);
+ assert(!IsFuncrefCall || Subtarget->hasReferenceTypes());
+ }
+
unsigned CallOp;
if (IsIndirect && IsRetCall) {
CallOp = WebAssembly::RET_CALL_INDIRECT;
@@ -470,6 +536,7 @@
// See if we must truncate the function pointer.
// CALL_INDIRECT takes an i32, but in wasm64 we represent function pointers
// as 64-bit for uniformity with other pointer types.
+ // See also: WebAssemblyFastISel::selectCall
if (IsIndirect && MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()) {
Register Reg32 =
MF.getRegInfo().createVirtualRegister(&WebAssembly::I32RegClass);
@@ -490,19 +557,24 @@
for (auto Def : CallResults.defs())
MIB.add(Def);
- // Add placeholders for the type index and immediate flags
if (IsIndirect) {
+ // Placeholder for the type index.
MIB.addImm(0);
- MIB.addImm(0);
-
- // Ensure that the object file has a __indirect_function_table import, as we
- // call_indirect against it.
- MCSymbolWasm *Sym = WebAssembly::getOrCreateFunctionTableSymbol(
- MF.getContext(), "__indirect_function_table");
- // Until call_indirect emits TABLE_NUMBER relocs against this symbol, mark
- // it as NO_STRIP so as to ensure that the indirect function table makes it
- // to linked output.
- Sym->setNoStrip();
+ // The table into which this call_indirect indexes.
+ MCSymbolWasm *Table = IsFuncrefCall
+ ? WebAssembly::getOrCreateFuncrefCallTableSymbol(
+ MF.getContext(), Subtarget)
+ : WebAssembly::getOrCreateFunctionTableSymbol(
+ MF.getContext(), Subtarget);
+ if (Subtarget->hasReferenceTypes()) {
+ MIB.addSym(Table);
+ } else {
+ // For the MVP there is at most one table whose number is 0, but we can't
+ // write a table symbol or issue relocations. Instead we just ensure the
+ // table is live and write a zero.
+ Table->setNoStrip();
+ MIB.addImm(0);
+ }
}
for (auto Use : CallParams.uses())
@@ -512,6 +584,39 @@
CallParams.eraseFromParent();
CallResults.eraseFromParent();
+ // If this is a funcref call, to avoid hidden GC roots, we need to clear the
+ // table slot with ref.null upon call_indirect return.
+ //
+ // This generates the following code, which comes right after a call_indirect
+ // of a funcref:
+ //
+ // i32.const 0
+ // ref.null func
+ // table.set __funcref_call_table
+ if (IsIndirect && IsFuncrefCall) {
+ MCSymbolWasm *Table = WebAssembly::getOrCreateFuncrefCallTableSymbol(
+ MF.getContext(), Subtarget);
+ Register RegZero =
+ MF.getRegInfo().createVirtualRegister(&WebAssembly::I32RegClass);
+ MachineInstr *Const0 =
+ BuildMI(MF, DL, TII.get(WebAssembly::CONST_I32), RegZero).addImm(0);
+ BB->insertAfter(MIB.getInstr()->getIterator(), Const0);
+
+ Register RegFuncref =
+ MF.getRegInfo().createVirtualRegister(&WebAssembly::FUNCREFRegClass);
+ MachineInstr *RefNull =
+ BuildMI(MF, DL, TII.get(WebAssembly::REF_NULL_FUNCREF), RegFuncref)
+ .addImm(static_cast<int32_t>(WebAssembly::HeapType::Funcref));
+ BB->insertAfter(Const0->getIterator(), RefNull);
+
+ MachineInstr *TableSet =
+ BuildMI(MF, DL, TII.get(WebAssembly::TABLE_SET_FUNCREF))
+ .addSym(Table)
+ .addReg(RegZero)
+ .addReg(RegFuncref);
+ BB->insertAfter(RefNull->getIterator(), TableSet);
+ }
+
return BB;
}
@@ -549,7 +654,7 @@
WebAssembly::I64_TRUNC_U_F64);
case WebAssembly::CALL_RESULTS:
case WebAssembly::RET_CALL_RESULTS:
- return LowerCallResults(MI, DL, BB, TII);
+ return LowerCallResults(MI, DL, BB, Subtarget, TII);
}
}
@@ -637,7 +742,7 @@
}
bool WebAssemblyTargetLowering::allowsMisalignedMemoryAccesses(
- EVT /*VT*/, unsigned /*AddrSpace*/, unsigned /*Align*/,
+ EVT /*VT*/, unsigned /*AddrSpace*/, Align /*Align*/,
MachineMemOperand::Flags /*Flags*/, bool *Fast) const {
// WebAssembly supports unaligned accesses, though it should be declared
// with the p2align attribute on loads and stores which do so, and there
@@ -713,65 +818,6 @@
Info.align = Align(8);
Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad;
return true;
- case Intrinsic::wasm_load32_zero:
- case Intrinsic::wasm_load64_zero:
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = Intrinsic == Intrinsic::wasm_load32_zero ? MVT::i32 : MVT::i64;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.align = Info.memVT == MVT::i32 ? Align(4) : Align(8);
- Info.flags = MachineMemOperand::MOLoad;
- return true;
- case Intrinsic::wasm_load8_lane:
- case Intrinsic::wasm_load16_lane:
- case Intrinsic::wasm_load32_lane:
- case Intrinsic::wasm_load64_lane:
- case Intrinsic::wasm_store8_lane:
- case Intrinsic::wasm_store16_lane:
- case Intrinsic::wasm_store32_lane:
- case Intrinsic::wasm_store64_lane: {
- MVT MemVT;
- Align MemAlign;
- switch (Intrinsic) {
- case Intrinsic::wasm_load8_lane:
- case Intrinsic::wasm_store8_lane:
- MemVT = MVT::i8;
- MemAlign = Align(1);
- break;
- case Intrinsic::wasm_load16_lane:
- case Intrinsic::wasm_store16_lane:
- MemVT = MVT::i16;
- MemAlign = Align(2);
- break;
- case Intrinsic::wasm_load32_lane:
- case Intrinsic::wasm_store32_lane:
- MemVT = MVT::i32;
- MemAlign = Align(4);
- break;
- case Intrinsic::wasm_load64_lane:
- case Intrinsic::wasm_store64_lane:
- MemVT = MVT::i64;
- MemAlign = Align(8);
- break;
- default:
- llvm_unreachable("unexpected intrinsic");
- }
- if (Intrinsic == Intrinsic::wasm_load8_lane ||
- Intrinsic == Intrinsic::wasm_load16_lane ||
- Intrinsic == Intrinsic::wasm_load32_lane ||
- Intrinsic == Intrinsic::wasm_load64_lane) {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.flags = MachineMemOperand::MOLoad;
- } else {
- Info.opc = ISD::INTRINSIC_VOID;
- Info.flags = MachineMemOperand::MOStore;
- }
- Info.ptrVal = I.getArgOperand(0);
- Info.memVT = MemVT;
- Info.offset = 0;
- Info.align = MemAlign;
- return true;
- }
default:
return false;
}
@@ -1040,6 +1086,33 @@
InTys.push_back(In.VT);
}
+ // Lastly, if this is a call to a funcref, we need to add a table.set
+ // instruction to the chain and transform the call.
+ if (CLI.CB && isFuncrefType(CLI.CB->getCalledOperand()->getType())) {
+ // In the absence of the function references proposal, where a funcref call
+ // would be lowered to call_ref, we use reference types: we generate a
+ // table.set that stores the funcref in a special table used solely for this
+ // purpose, followed by a call_indirect. Here we just generate the table.set
+ // and return its SDValue so that LowerCall can finalize the lowering by
+ // generating the call_indirect.
+ SDValue Chain = Ops[0];
+
+ MCSymbolWasm *Table = WebAssembly::getOrCreateFuncrefCallTableSymbol(
+ MF.getContext(), Subtarget);
+ SDValue Sym = DAG.getMCSymbol(Table, PtrVT);
+ SDValue TableSlot = DAG.getConstant(0, DL, MVT::i32);
+ SDValue TableSetOps[] = {Chain, Sym, TableSlot, Callee};
+ SDValue TableSet = DAG.getMemIntrinsicNode(
+ WebAssemblyISD::TABLE_SET, DL, DAG.getVTList(MVT::Other), TableSetOps,
+ MVT::funcref,
+ // Machine Mem Operand args
+ MachinePointerInfo(WasmAddressSpace::FUNCREF),
+ CLI.CB->getCalledOperand()->getPointerAlignment(DAG.getDataLayout()),
+ MachineMemOperand::MOStore);
+
+ Ops[0] = TableSet; // The new chain is the TableSet itself
+ }
+
if (CLI.IsTailCall) {
// ret_calls do not return values to the current frame
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
@@ -1248,9 +1321,107 @@
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT:
return LowerFP_TO_INT_SAT(Op, DAG);
+ case ISD::LOAD:
+ return LowerLoad(Op, DAG);
+ case ISD::STORE:
+ return LowerStore(Op, DAG);
}
}
+static bool IsWebAssemblyGlobal(SDValue Op) {
+ if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
+ return WebAssembly::isWasmVarAddressSpace(GA->getAddressSpace());
+
+ return false;
+}
+
+static Optional<unsigned> IsWebAssemblyLocal(SDValue Op, SelectionDAG &DAG) {
+ const FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op);
+ if (!FI)
+ return None;
+
+ auto &MF = DAG.getMachineFunction();
+ return WebAssemblyFrameLowering::getLocalForStackObject(MF, FI->getIndex());
+}
+
+bool WebAssemblyTargetLowering::isFuncrefType(const Type *Ty) {
+ return isa<PointerType>(Ty) &&
+ Ty->getPointerAddressSpace() == WasmAddressSpace::FUNCREF;
+}
+
+bool WebAssemblyTargetLowering::isExternrefType(const Type *Ty) {
+ return isa<PointerType>(Ty) &&
+ Ty->getPointerAddressSpace() == WasmAddressSpace::EXTERNREF;
+}
+
+SDValue WebAssemblyTargetLowering::LowerStore(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
+ const SDValue &Value = SN->getValue();
+ const SDValue &Base = SN->getBasePtr();
+ const SDValue &Offset = SN->getOffset();
+
+ if (IsWebAssemblyGlobal(Base)) {
+ if (!Offset->isUndef())
+ report_fatal_error("unexpected offset when storing to webassembly global",
+ false);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {SN->getChain(), Value, Base};
+ return DAG.getMemIntrinsicNode(WebAssemblyISD::GLOBAL_SET, DL, Tys, Ops,
+ SN->getMemoryVT(), SN->getMemOperand());
+ }
+
+ if (Optional<unsigned> Local = IsWebAssemblyLocal(Base, DAG)) {
+ if (!Offset->isUndef())
+ report_fatal_error("unexpected offset when storing to webassembly local",
+ false);
+
+ SDValue Idx = DAG.getTargetConstant(*Local, Base, MVT::i32);
+ SDVTList Tys = DAG.getVTList(MVT::Other); // The chain.
+ SDValue Ops[] = {SN->getChain(), Idx, Value};
+ return DAG.getNode(WebAssemblyISD::LOCAL_SET, DL, Tys, Ops);
+ }
+
+ return Op;
+}
+
+SDValue WebAssemblyTargetLowering::LowerLoad(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
+ const SDValue &Base = LN->getBasePtr();
+ const SDValue &Offset = LN->getOffset();
+
+ if (IsWebAssemblyGlobal(Base)) {
+ if (!Offset->isUndef())
+ report_fatal_error(
+ "unexpected offset when loading from webassembly global", false);
+
+ SDVTList Tys = DAG.getVTList(LN->getValueType(0), MVT::Other);
+ SDValue Ops[] = {LN->getChain(), Base};
+ return DAG.getMemIntrinsicNode(WebAssemblyISD::GLOBAL_GET, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ }
+
+ if (Optional<unsigned> Local = IsWebAssemblyLocal(Base, DAG)) {
+ if (!Offset->isUndef())
+ report_fatal_error(
+ "unexpected offset when loading from webassembly local", false);
+
+ SDValue Idx = DAG.getTargetConstant(*Local, Base, MVT::i32);
+ EVT LocalVT = LN->getValueType(0);
+ SDValue LocalGet = DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, LocalVT,
+ {LN->getChain(), Idx});
+ SDValue Result = DAG.getMergeValues({LocalGet, LN->getChain()}, DL);
+ assert(Result->getNumValues() == 2 && "Loads must carry a chain!");
+ return Result;
+ }
+
+ return Op;
+}
+
SDValue WebAssemblyTargetLowering::LowerCopyToReg(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(2);
@@ -1369,8 +1540,8 @@
EVT VT = Op.getValueType();
assert(GA->getTargetFlags() == 0 &&
"Unexpected target flags on generic GlobalAddressSDNode");
- if (GA->getAddressSpace() != 0)
- fail(DL, DAG, "WebAssembly only expects the 0 address space");
+ if (!WebAssembly::isValidAddressSpace(GA->getAddressSpace()))
+ fail(DL, DAG, "Invalid address space for WebAssembly target");
unsigned OperandFlags = 0;
if (isPositionIndependent()) {
@@ -1968,12 +2139,13 @@
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT ResT = Op.getValueType();
- uint64_t Width = Op.getConstantOperandVal(1);
+ EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
- if ((ResT == MVT::i32 || ResT == MVT::i64) && (Width == 32 || Width == 64))
+ if ((ResT == MVT::i32 || ResT == MVT::i64) &&
+ (SatVT == MVT::i32 || SatVT == MVT::i64))
return Op;
- if (ResT == MVT::v4i32 && Width == 32)
+ if (ResT == MVT::v4i32 && SatVT == MVT::i32)
return Op;
return SDValue();
@@ -2058,78 +2230,232 @@
performVectorConvertLowCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
auto &DAG = DCI.DAG;
- assert(N->getOpcode() == ISD::SINT_TO_FP ||
- N->getOpcode() == ISD::UINT_TO_FP);
- // Combine ({s,u}int_to_fp (extract_subvector ... 0)) to an
- // f64x2.convert_low_i32x4_{s,u} SDNode.
+ EVT ResVT = N->getValueType(0);
+ if (ResVT != MVT::v2f64)
+ return SDValue();
+
+ auto GetWasmConversionOp = [](unsigned Op) {
+ switch (Op) {
+ case ISD::SINT_TO_FP:
+ return WebAssemblyISD::CONVERT_LOW_S;
+ case ISD::UINT_TO_FP:
+ return WebAssemblyISD::CONVERT_LOW_U;
+ case ISD::FP_EXTEND:
+ return WebAssemblyISD::PROMOTE_LOW;
+ }
+ llvm_unreachable("unexpected op");
+ };
+
+ if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ // Combine this:
+ //
+ // (v2f64 (extract_subvector
+ // (v4f64 ({s,u}int_to_fp (v4i32 $x))), 0))
+ //
+ // into (f64x2.convert_low_i32x4_{s,u} $x).
+ //
+ // Or this:
+ //
+ // (v2f64 (extract_subvector
+ // (v4f64 (fp_extend (v4f32 $x))), 0))
+ //
+ // into (f64x2.promote_low_f32x4 $x).
+ auto Conversion = N->getOperand(0);
+ auto ConversionOp = Conversion.getOpcode();
+ MVT ExpectedSourceType;
+ switch (ConversionOp) {
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ ExpectedSourceType = MVT::v4i32;
+ break;
+ case ISD::FP_EXTEND:
+ ExpectedSourceType = MVT::v4f32;
+ break;
+ default:
+ return SDValue();
+ }
+
+ if (Conversion.getValueType() != MVT::v4f64)
+ return SDValue();
+
+ auto Source = Conversion.getOperand(0);
+ if (Source.getValueType() != ExpectedSourceType)
+ return SDValue();
+
+ auto IndexNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (IndexNode == nullptr || IndexNode->getZExtValue() != 0)
+ return SDValue();
+
+ auto Op = GetWasmConversionOp(ConversionOp);
+ return DAG.getNode(Op, SDLoc(N), ResVT, Source);
+ }
+
+ // Combine this:
+ //
+ // (v2f64 ({s,u}int_to_fp
+ // (v2i32 (extract_subvector (v4i32 $x), 0))))
+ //
+ // into (f64x2.convert_low_i32x4_{s,u} $x).
+ //
+ // Or this:
+ //
+ // (v2f64 (fp_extend
+ // (v2f32 (extract_subvector (v4f32 $x), 0))))
+ //
+ // into (f64x2.promote_low_f32x4 $x).
+ auto ConversionOp = N->getOpcode();
+ MVT ExpectedExtractType;
+ MVT ExpectedSourceType;
+ switch (ConversionOp) {
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ ExpectedExtractType = MVT::v2i32;
+ ExpectedSourceType = MVT::v4i32;
+ break;
+ case ISD::FP_EXTEND:
+ ExpectedExtractType = MVT::v2f32;
+ ExpectedSourceType = MVT::v4f32;
+ break;
+ default:
+ llvm_unreachable("unexpected opcode");
+ }
+
auto Extract = N->getOperand(0);
if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return SDValue();
+
+ if (Extract.getValueType() != ExpectedExtractType)
+ return SDValue();
+
auto Source = Extract.getOperand(0);
- if (Source.getValueType() != MVT::v4i32)
+ if (Source.getValueType() != ExpectedSourceType)
return SDValue();
+
auto *IndexNode = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
- if (IndexNode == nullptr)
- return SDValue();
- auto Index = IndexNode->getZExtValue();
-
- // The types must be correct.
- EVT ResVT = N->getValueType(0);
- if (ResVT != MVT::v2f64 || Extract.getValueType() != MVT::v2i32)
+ if (IndexNode == nullptr || IndexNode->getZExtValue() != 0)
return SDValue();
- // The extracted vector must be the low half.
- if (Index != 0)
- return SDValue();
-
- unsigned Op = N->getOpcode() == ISD::SINT_TO_FP
- ? WebAssemblyISD::CONVERT_LOW_S
- : WebAssemblyISD::CONVERT_LOW_U;
-
+ unsigned Op = GetWasmConversionOp(ConversionOp);
return DAG.getNode(Op, SDLoc(N), ResVT, Source);
}
static SDValue
-performVectorTruncSatLowCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+performVectorTruncZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
auto &DAG = DCI.DAG;
- assert(N->getOpcode() == ISD::CONCAT_VECTORS);
+
+ auto GetWasmConversionOp = [](unsigned Op) {
+ switch (Op) {
+ case ISD::FP_TO_SINT_SAT:
+ return WebAssemblyISD::TRUNC_SAT_ZERO_S;
+ case ISD::FP_TO_UINT_SAT:
+ return WebAssemblyISD::TRUNC_SAT_ZERO_U;
+ case ISD::FP_ROUND:
+ return WebAssemblyISD::DEMOTE_ZERO;
+ }
+ llvm_unreachable("unexpected op");
+ };
+
+ auto IsZeroSplat = [](SDValue SplatVal) {
+ auto *Splat = dyn_cast<BuildVectorSDNode>(SplatVal.getNode());
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ return Splat &&
+ Splat->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+ HasAnyUndefs) &&
+ SplatValue == 0;
+ };
+
+ if (N->getOpcode() == ISD::CONCAT_VECTORS) {
+ // Combine this:
+ //
+ // (concat_vectors (v2i32 (fp_to_{s,u}int_sat $x, 32)), (v2i32 (splat 0)))
+ //
+ // into (i32x4.trunc_sat_f64x2_zero_{s,u} $x).
+ //
+ // Or this:
+ //
+ // (concat_vectors (v2f32 (fp_round (v2f64 $x))), (v2f32 (splat 0)))
+ //
+ // into (f32x4.demote_zero_f64x2 $x).
+ EVT ResVT;
+ EVT ExpectedConversionType;
+ auto Conversion = N->getOperand(0);
+ auto ConversionOp = Conversion.getOpcode();
+ switch (ConversionOp) {
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ ResVT = MVT::v4i32;
+ ExpectedConversionType = MVT::v2i32;
+ break;
+ case ISD::FP_ROUND:
+ ResVT = MVT::v4f32;
+ ExpectedConversionType = MVT::v2f32;
+ break;
+ default:
+ return SDValue();
+ }
+
+ if (N->getValueType(0) != ResVT)
+ return SDValue();
+
+ if (Conversion.getValueType() != ExpectedConversionType)
+ return SDValue();
+
+ auto Source = Conversion.getOperand(0);
+ if (Source.getValueType() != MVT::v2f64)
+ return SDValue();
+
+ if (!IsZeroSplat(N->getOperand(1)) ||
+ N->getOperand(1).getValueType() != ExpectedConversionType)
+ return SDValue();
+
+ unsigned Op = GetWasmConversionOp(ConversionOp);
+ return DAG.getNode(Op, SDLoc(N), ResVT, Source);
+ }
// Combine this:
//
- // (concat_vectors (v2i32 (fp_to_{s,u}int_sat $x, 32)), (v2i32 (splat 0)))
+ // (fp_to_{s,u}int_sat (concat_vectors $x, (v2f64 (splat 0))), 32)
//
// into (i32x4.trunc_sat_f64x2_zero_{s,u} $x).
- EVT ResVT = N->getValueType(0);
- if (ResVT != MVT::v4i32)
+ //
+ // Or this:
+ //
+ // (v4f32 (fp_round (concat_vectors $x, (v2f64 (splat 0)))))
+ //
+ // into (f32x4.demote_zero_f64x2 $x).
+ EVT ResVT;
+ auto ConversionOp = N->getOpcode();
+ switch (ConversionOp) {
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ ResVT = MVT::v4i32;
+ break;
+ case ISD::FP_ROUND:
+ ResVT = MVT::v4f32;
+ break;
+ default:
+ llvm_unreachable("unexpected op");
+ }
+
+ if (N->getValueType(0) != ResVT)
return SDValue();
- auto FPToInt = N->getOperand(0);
- auto FPToIntOp = FPToInt.getOpcode();
- if (FPToIntOp != ISD::FP_TO_SINT_SAT && FPToIntOp != ISD::FP_TO_UINT_SAT)
- return SDValue();
- if (FPToInt.getConstantOperandVal(1) != 32)
+ auto Concat = N->getOperand(0);
+ if (Concat.getValueType() != MVT::v4f64)
return SDValue();
- auto Source = FPToInt.getOperand(0);
+ auto Source = Concat.getOperand(0);
if (Source.getValueType() != MVT::v2f64)
return SDValue();
- auto *Splat = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
- APInt SplatValue, SplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- if (!Splat || !Splat->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
- HasAnyUndefs))
- return SDValue();
- if (SplatValue != 0)
+ if (!IsZeroSplat(Concat.getOperand(1)) ||
+ Concat.getOperand(1).getValueType() != MVT::v2f64)
return SDValue();
- unsigned Op = FPToIntOp == ISD::FP_TO_SINT_SAT
- ? WebAssemblyISD::TRUNC_SAT_ZERO_S
- : WebAssemblyISD::TRUNC_SAT_ZERO_U;
-
+ unsigned Op = GetWasmConversionOp(ConversionOp);
return DAG.getNode(Op, SDLoc(N), ResVT, Source);
}
@@ -2146,8 +2472,13 @@
return performVectorExtendCombine(N, DCI);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
+ case ISD::FP_EXTEND:
+ case ISD::EXTRACT_SUBVECTOR:
return performVectorConvertLowCombine(N, DCI);
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ case ISD::FP_ROUND:
case ISD::CONCAT_VECTORS:
- return performVectorTruncSatLowCombine(N, DCI);
+ return performVectorTruncZeroCombine(N, DCI);
}
}
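
The performVectorTruncZeroCombine above matches a saturating f64x2-to-i32 truncation concatenated with a zero splat, because that is exactly what i32x4.trunc_sat_f64x2_zero_{s,u} produces: the two converted lanes followed by two zero lanes. A scalar sketch of those semantics (standalone C++ model, not DAG code; saturation follows the Wasm rules for i32.trunc_sat_f64_s):

```cpp
#include <array>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

// Scalar model of i32.trunc_sat_f64_s: NaN becomes 0, out-of-range values
// saturate to INT32_MIN/INT32_MAX, everything else truncates toward zero.
static int32_t truncSatS(double X) {
  if (std::isnan(X))
    return 0;
  if (X <= static_cast<double>(std::numeric_limits<int32_t>::min()))
    return std::numeric_limits<int32_t>::min();
  if (X >= static_cast<double>(std::numeric_limits<int32_t>::max()))
    return std::numeric_limits<int32_t>::max();
  return static_cast<int32_t>(X);
}

// Model of i32x4.trunc_sat_f64x2_zero_s: convert the two f64 lanes and fill
// the upper two result lanes with zero, which is the splat the combine checks.
static std::array<int32_t, 4> truncSatZeroS(std::array<double, 2> V) {
  return {truncSatS(V[0]), truncSatS(V[1]), 0, 0};
}

int main() {
  auto R = truncSatZeroS({1.9, -3.7e20});
  std::printf("%d %d %d %d\n", R[0], R[1], R[2], R[3]);
  return 0;
}
```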
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 10aca77..5d813fe 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -45,12 +45,43 @@
WebAssemblyTargetLowering(const TargetMachine &TM,
const WebAssemblySubtarget &STI);
+ enum WasmAddressSpace : unsigned {
+ // WebAssembly uses the following address spaces:
+ // AS 0 : is the default address space for values in linear memory
+ DEFAULT = 0,
+ // AS 1 : is a non-integral address space for global variables
+ GLOBAL = 1,
+ // AS 10 : is a non-integral address space for externref values
+ EXTERNREF = 10,
+ // AS 20 : is a non-integral address space for funcref values
+ FUNCREF = 20,
+ };
+
+ MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const override {
+ if (AS == WasmAddressSpace::EXTERNREF)
+ return MVT::externref;
+ if (AS == WasmAddressSpace::FUNCREF)
+ return MVT::funcref;
+ return TargetLowering::getPointerTy(DL, AS);
+ }
+ MVT getPointerMemTy(const DataLayout &DL, uint32_t AS = 0) const override {
+ if (AS == WasmAddressSpace::EXTERNREF)
+ return MVT::externref;
+ if (AS == WasmAddressSpace::FUNCREF)
+ return MVT::funcref;
+ return TargetLowering::getPointerMemTy(DL, AS);
+ }
+
+ static bool isFuncrefType(const Type *Ty);
+ static bool isExternrefType(const Type *Ty);
+
private:
/// Keep a pointer to the WebAssemblySubtarget around so that we can make the
/// right decision when generating code for different targets.
const WebAssemblySubtarget *Subtarget;
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+ bool shouldScalarizeBinop(SDValue VecOp) const override;
FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) const override;
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
@@ -66,7 +97,7 @@
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I = nullptr) const override;
- bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, unsigned Align,
+ bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, Align Alignment,
MachineMemOperand::Flags Flags,
bool *Fast) const override;
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
@@ -120,6 +151,8 @@
SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const;
// Custom DAG combine hooks
SDValue
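
The WasmAddressSpace enum and the getPointerTy/getPointerMemTy overrides added above make pointers in the externref and funcref address spaces lower to reference types instead of integer pointers. A standalone sketch of that mapping (simplified; the real hooks return MVTs, and the helper names here are illustrative only):

```cpp
#include <cstdio>

// Mirrors the address-space numbering introduced in WebAssemblyISelLowering.h.
enum WasmAddressSpace : unsigned {
  DEFAULT = 0,   // linear memory
  GLOBAL = 1,    // wasm global variables
  EXTERNREF = 10,
  FUNCREF = 20,
};

// Simplified stand-in for MVT: what a pointer in each address space lowers to.
enum class PointerKind { IntegerPointer, ExternRef, FuncRef };

static PointerKind pointerKindFor(unsigned AS) {
  if (AS == EXTERNREF)
    return PointerKind::ExternRef; // MVT::externref in the real hook
  if (AS == FUNCREF)
    return PointerKind::FuncRef;   // MVT::funcref in the real hook
  return PointerKind::IntegerPointer; // falls back to the default pointer type
}

int main() {
  std::printf("AS0 -> %d, AS10 -> %d, AS20 -> %d\n",
              static_cast<int>(pointerKindFor(DEFAULT)),
              static_cast<int>(pointerKindFor(EXTERNREF)),
              static_cast<int>(pointerKindFor(FUNCREF)));
  return 0;
}
```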
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index 22103b0..1ee6ae1 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -259,26 +259,20 @@
// therefore don't have the extension type field. So instead of matching that,
// we match the patterns that the type legalizer expands them to.
-// We directly match zext patterns and select the zext atomic loads.
-// i32 (zext (i8 (atomic_load_8))) gets legalized to
-// i32 (and (i32 (atomic_load_8)), 255)
-// These can be selected to a single zero-extending atomic load instruction.
-def zext_aload_8_32 :
- PatFrag<(ops node:$addr), (and (i32 (atomic_load_8 node:$addr)), 255)>;
-def zext_aload_16_32 :
- PatFrag<(ops node:$addr), (and (i32 (atomic_load_16 node:$addr)), 65535)>;
// Unlike regular loads, extension to i64 is handled differently than i32.
// i64 (zext (i8 (atomic_load_8))) gets legalized to
// i64 (and (i64 (anyext (i32 (atomic_load_8)))), 255)
+// Extension to i32 is elided by SelectionDAG as our atomic loads are
+// zero-extending.
def zext_aload_8_64 :
PatFrag<(ops node:$addr),
- (and (i64 (anyext (i32 (atomic_load_8 node:$addr)))), 255)>;
+ (i64 (zext (i32 (atomic_load_8 node:$addr))))>;
def zext_aload_16_64 :
PatFrag<(ops node:$addr),
- (and (i64 (anyext (i32 (atomic_load_16 node:$addr)))), 65535)>;
+ (i64 (zext (i32 (atomic_load_16 node:$addr))))>;
def zext_aload_32_64 :
PatFrag<(ops node:$addr),
- (zext (i32 (atomic_load node:$addr)))>;
+ (i64 (zext (i32 (atomic_load_32 node:$addr))))>;
// We don't have single sext atomic load instructions. So for sext loads, we
// match bare subword loads (for 32-bit results) and anyext loads (for 64-bit
@@ -290,8 +284,6 @@
PatFrag<(ops node:$addr), (anyext (i32 (atomic_load_16 node:$addr)))>;
// Select zero-extending loads with no constant offset.
-defm : LoadPatNoOffset<i32, zext_aload_8_32, "ATOMIC_LOAD8_U_I32">;
-defm : LoadPatNoOffset<i32, zext_aload_16_32, "ATOMIC_LOAD16_U_I32">;
defm : LoadPatNoOffset<i64, zext_aload_8_64, "ATOMIC_LOAD8_U_I64">;
defm : LoadPatNoOffset<i64, zext_aload_16_64, "ATOMIC_LOAD16_U_I64">;
defm : LoadPatNoOffset<i64, zext_aload_32_64, "ATOMIC_LOAD32_U_I64">;
@@ -304,10 +296,6 @@
// 32->64 sext load gets selected as i32.atomic.load, i64.extend_i32_s
// Zero-extending loads with constant offset
-defm : LoadPatImmOff<i32, zext_aload_8_32, regPlusImm, "ATOMIC_LOAD8_U_I32">;
-defm : LoadPatImmOff<i32, zext_aload_16_32, regPlusImm, "ATOMIC_LOAD16_U_I32">;
-defm : LoadPatImmOff<i32, zext_aload_8_32, or_is_add, "ATOMIC_LOAD8_U_I32">;
-defm : LoadPatImmOff<i32, zext_aload_16_32, or_is_add, "ATOMIC_LOAD16_U_I32">;
defm : LoadPatImmOff<i64, zext_aload_8_64, regPlusImm, "ATOMIC_LOAD8_U_I64">;
defm : LoadPatImmOff<i64, zext_aload_16_64, regPlusImm, "ATOMIC_LOAD16_U_I64">;
defm : LoadPatImmOff<i64, zext_aload_32_64, regPlusImm, "ATOMIC_LOAD32_U_I64">;
@@ -327,8 +315,6 @@
// No 32->64 patterns, just use i32.atomic.load and i64.extend_s/i64
// Extending loads with just a constant offset
-defm : LoadPatOffsetOnly<i32, zext_aload_8_32, "ATOMIC_LOAD8_U_I32">;
-defm : LoadPatOffsetOnly<i32, zext_aload_16_32, "ATOMIC_LOAD16_U_I32">;
defm : LoadPatOffsetOnly<i64, zext_aload_8_64, "ATOMIC_LOAD8_U_I64">;
defm : LoadPatOffsetOnly<i64, zext_aload_16_64, "ATOMIC_LOAD16_U_I64">;
defm : LoadPatOffsetOnly<i64, zext_aload_32_64, "ATOMIC_LOAD32_U_I64">;
@@ -337,8 +323,6 @@
defm : LoadPatOffsetOnly<i64, sext_aload_8_64, "ATOMIC_LOAD8_U_I64">;
defm : LoadPatOffsetOnly<i64, sext_aload_16_64, "ATOMIC_LOAD16_U_I64">;
-defm : LoadPatGlobalAddrOffOnly<i32, zext_aload_8_32, "ATOMIC_LOAD8_U_I32">;
-defm : LoadPatGlobalAddrOffOnly<i32, zext_aload_16_32, "ATOMIC_LOAD16_U_I32">;
defm : LoadPatGlobalAddrOffOnly<i64, zext_aload_8_64, "ATOMIC_LOAD8_U_I64">;
defm : LoadPatGlobalAddrOffOnly<i64, zext_aload_16_64, "ATOMIC_LOAD16_U_I64">;
defm : LoadPatGlobalAddrOffOnly<i64, zext_aload_32_64, "ATOMIC_LOAD32_U_I64">;
@@ -607,7 +591,7 @@
Requires<[HasAddr64, HasAtomics]>;
}
-multiclass BinRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> {
+multiclass BinRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$val)),
(!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>,
Requires<[HasAddr32, HasAtomics, IsNotPIC]>;
@@ -651,22 +635,13 @@
// These are combined patterns of truncating store patterns and zero-extending
// load patterns above.
class zext_bin_rmw_8_32<PatFrag kind> :
- PatFrag<(ops node:$addr, node:$val),
- (and (i32 (kind node:$addr, node:$val)), 255)>;
-class zext_bin_rmw_16_32<PatFrag kind> :
- PatFrag<(ops node:$addr, node:$val),
- (and (i32 (kind node:$addr, node:$val)), 65535)>;
+ PatFrag<(ops node:$addr, node:$val), (i32 (kind node:$addr, node:$val))>;
+class zext_bin_rmw_16_32<PatFrag kind> : zext_bin_rmw_8_32<kind>;
class zext_bin_rmw_8_64<PatFrag kind> :
PatFrag<(ops node:$addr, node:$val),
- (and (i64 (anyext (i32 (kind node:$addr,
- (i32 (trunc (i64 node:$val))))))), 255)>;
-class zext_bin_rmw_16_64<PatFrag kind> :
- PatFrag<(ops node:$addr, node:$val),
- (and (i64 (anyext (i32 (kind node:$addr,
- (i32 (trunc (i64 node:$val))))))), 65535)>;
-class zext_bin_rmw_32_64<PatFrag kind> :
- PatFrag<(ops node:$addr, node:$val),
(zext (i32 (kind node:$addr, (i32 (trunc (i64 node:$val))))))>;
+class zext_bin_rmw_16_64<PatFrag kind> : zext_bin_rmw_8_64<kind>;
+class zext_bin_rmw_32_64<PatFrag kind> : zext_bin_rmw_8_64<kind>;
// Truncating & sign-extending binary RMW patterns.
// These are combined patterns of truncating store patterns and sign-extending
@@ -685,7 +660,7 @@
// Patterns for various addressing modes for truncating-extending binary RMWs.
multiclass BinRMWTruncExtPattern<
PatFrag rmw_8, PatFrag rmw_16, PatFrag rmw_32, PatFrag rmw_64,
- NI inst8_32, NI inst16_32, NI inst8_64, NI inst16_64, NI inst32_64> {
+ string inst8_32, string inst16_32, string inst8_64, string inst16_64, string inst32_64> {
// Truncating-extending binary RMWs with no constant offset
defm : BinRMWPatNoOffset<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
defm : BinRMWPatNoOffset<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
@@ -887,10 +862,8 @@
// additional nodes such as anyext or assertzext depending on operand types.
class zext_ter_rmw_8_32<PatFrag kind> :
PatFrag<(ops node:$addr, node:$exp, node:$new),
- (and (i32 (kind node:$addr, node:$exp, node:$new)), 255)>;
-class zext_ter_rmw_16_32<PatFrag kind> :
- PatFrag<(ops node:$addr, node:$exp, node:$new),
- (and (i32 (kind node:$addr, node:$exp, node:$new)), 65535)>;
+ (i32 (kind node:$addr, node:$exp, node:$new))>;
+class zext_ter_rmw_16_32<PatFrag kind> : zext_ter_rmw_8_32<kind>;
class zext_ter_rmw_8_64<PatFrag kind> :
PatFrag<(ops node:$addr, node:$exp, node:$new),
(zext (i32 (assertzext (i32 (kind node:$addr,
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index b997c1c..6a123f8 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -48,6 +48,9 @@
I<(outs), (ins variable_ops), (outs), (ins), [],
"return_call_results", "return_call_results", -1>;
+// Note that instructions with variable_ops have custom printers in
+// WebAssemblyInstPrinter.cpp.
+
let variadicOpsAreDefs = 1 in
defm CALL :
I<(outs), (ins function32_op:$callee, variable_ops),
@@ -56,9 +59,12 @@
let variadicOpsAreDefs = 1 in
defm CALL_INDIRECT :
- I<(outs), (ins TypeIndex:$type, i32imm:$flags, variable_ops),
- (outs), (ins TypeIndex:$type, i32imm:$flags), [],
- "call_indirect", "call_indirect\t$type", 0x11>;
+ I<(outs),
+ (ins TypeIndex:$type, table32_op:$table, variable_ops),
+ (outs),
+ (ins TypeIndex:$type, table32_op:$table),
+ [],
+ "call_indirect", "call_indirect\t$type, $table", 0x11>;
let isReturn = 1, isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in
defm RET_CALL :
@@ -69,9 +75,9 @@
let isReturn = 1 in
defm RET_CALL_INDIRECT :
- I<(outs), (ins TypeIndex:$type, i32imm:$flags, variable_ops),
- (outs), (ins TypeIndex:$type, i32imm:$flags), [],
- "return_call_indirect\t", "return_call_indirect\t$type",
+ I<(outs), (ins TypeIndex:$type, table32_op:$table, variable_ops),
+ (outs), (ins TypeIndex:$type, table32_op:$table), [],
+ "return_call_indirect\t", "return_call_indirect\t$type, $table",
0x13>,
Requires<[HasTailCall]>;
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 702560b..437b07b 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -30,6 +30,8 @@
(BR_IF bb_op:$dst, I32:$cond)>;
def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
(BR_UNLESS bb_op:$dst, I32:$cond)>;
+def : Pat<(brcond (i32 (xor bool_node:$cond, (i32 1))), bb:$dst),
+ (BR_UNLESS bb_op:$dst, I32:$cond)>;
// A list of branch targets enclosed in {} and separated by comma.
// Used by br_table only.
@@ -127,14 +129,14 @@
// Throwing an exception: throw / rethrow
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
-defm THROW : I<(outs), (ins event_op:$tag, variable_ops),
- (outs), (ins event_op:$tag),
+defm THROW : I<(outs), (ins tag_op:$tag, variable_ops),
+ (outs), (ins tag_op:$tag),
[(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag))],
"throw \t$tag", "throw \t$tag", 0x08>;
defm RETHROW : NRI<(outs), (ins i32imm:$depth), [], "rethrow \t$depth", 0x09>;
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
-// For C++ support, we only rethrow the latest exception, thus always setting
-// the depth to 0.
+// The depth argument will be computed in CFGStackify. We set it to 0 here for
+// now.
def : Pat<(int_wasm_rethrow), (RETHROW 0)>;
// Region within which an exception is caught: try / end_try
@@ -147,15 +149,19 @@
let hasCtrlDep = 1, hasSideEffects = 1 in {
// Currently 'catch' can only extract an i32, which is sufficient for C++
// support, but according to the spec 'catch' can extract any number of values
-// based on the event type.
-defm CATCH : I<(outs I32:$dst), (ins event_op:$tag),
- (outs), (ins event_op:$tag),
+// based on the tag type.
+defm CATCH : I<(outs I32:$dst), (ins tag_op:$tag),
+ (outs), (ins tag_op:$tag),
[(set I32:$dst,
(WebAssemblycatch (WebAssemblywrapper texternalsym:$tag)))],
"catch \t$dst, $tag", "catch \t$tag", 0x07>;
-defm CATCH_ALL : NRI<(outs), (ins), [], "catch_all", 0x05>;
+defm CATCH_ALL : NRI<(outs), (ins), [], "catch_all", 0x19>;
}
+// Delegating an exception: delegate
+let isTerminator = 1, hasCtrlDep = 1, hasSideEffects = 1 in
+defm DELEGATE : NRI<(outs), (ins bb_op:$dst), [], "delegate \t $dst", 0x18>;
+
// Pseudo instructions: cleanupret / catchret
let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
isPseudo = 1, isEHScopeReturn = 1 in {
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td
index 68ef43f..262d5f6 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -97,14 +97,14 @@
Requires<[HasNontrappingFPToInt]>;
// Support the explicitly saturating operations as well.
-def : Pat<(fp_to_sint_sat F32:$src, (i32 32)), (I32_TRUNC_S_SAT_F32 F32:$src)>;
-def : Pat<(fp_to_uint_sat F32:$src, (i32 32)), (I32_TRUNC_U_SAT_F32 F32:$src)>;
-def : Pat<(fp_to_sint_sat F64:$src, (i32 32)), (I32_TRUNC_S_SAT_F64 F64:$src)>;
-def : Pat<(fp_to_uint_sat F64:$src, (i32 32)), (I32_TRUNC_U_SAT_F64 F64:$src)>;
-def : Pat<(fp_to_sint_sat F32:$src, (i32 64)), (I64_TRUNC_S_SAT_F32 F32:$src)>;
-def : Pat<(fp_to_uint_sat F32:$src, (i32 64)), (I64_TRUNC_U_SAT_F32 F32:$src)>;
-def : Pat<(fp_to_sint_sat F64:$src, (i32 64)), (I64_TRUNC_S_SAT_F64 F64:$src)>;
-def : Pat<(fp_to_uint_sat F64:$src, (i32 64)), (I64_TRUNC_U_SAT_F64 F64:$src)>;
+def : Pat<(fp_to_sint_sat F32:$src, i32), (I32_TRUNC_S_SAT_F32 F32:$src)>;
+def : Pat<(fp_to_uint_sat F32:$src, i32), (I32_TRUNC_U_SAT_F32 F32:$src)>;
+def : Pat<(fp_to_sint_sat F64:$src, i32), (I32_TRUNC_S_SAT_F64 F64:$src)>;
+def : Pat<(fp_to_uint_sat F64:$src, i32), (I32_TRUNC_U_SAT_F64 F64:$src)>;
+def : Pat<(fp_to_sint_sat F32:$src, i64), (I64_TRUNC_S_SAT_F32 F32:$src)>;
+def : Pat<(fp_to_uint_sat F32:$src, i64), (I64_TRUNC_U_SAT_F32 F32:$src)>;
+def : Pat<(fp_to_sint_sat F64:$src, i64), (I64_TRUNC_S_SAT_F64 F64:$src)>;
+def : Pat<(fp_to_uint_sat F64:$src, i64), (I64_TRUNC_U_SAT_F64 F64:$src)>;
// Conversion from floating point to integer pseudo-instructions which don't
// trap on overflow or invalid.
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index 0a4289c..4dc0c9a 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -15,7 +15,7 @@
// We instantiate 2 of these for every actual instruction (register based
// and stack based), see below.
class WebAssemblyInst<bits<32> inst, string asmstr, string stack, string is64>
- : StackRel, Wasm64Rel, Instruction {
+ : StackRel, RegisterRel, Wasm64Rel, Instruction {
bits<32> Inst = inst; // Instruction encoding.
string StackBased = stack;
string BaseName = NAME;
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index db2ad05..5484c0d 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -14,6 +14,7 @@
#include "WebAssemblyInstrInfo.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
@@ -210,6 +211,12 @@
{WebAssembly::TI_LOCAL, "wasm-local"},
{WebAssembly::TI_GLOBAL_FIXED, "wasm-global-fixed"},
{WebAssembly::TI_OPERAND_STACK, "wasm-operand-stack"},
- {WebAssembly::TI_GLOBAL_RELOC, "wasm-global-reloc"}};
+ {WebAssembly::TI_GLOBAL_RELOC, "wasm-global-reloc"},
+ {WebAssembly::TI_LOCAL_INDIRECT, "wasm-local-indirect"}};
return makeArrayRef(TargetIndices);
}
+
+const MachineOperand &
+WebAssemblyInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
+ return WebAssembly::getCalleeOp(MI);
+}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
index 5762fd9..f45a379 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -68,6 +68,8 @@
ArrayRef<std::pair<int, const char *>>
getSerializableTargetIndices() const override;
+
+ const MachineOperand &getCalleeOperand(const MachineInstr &MI) const override;
};
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 49d0614..1574806 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -72,6 +72,8 @@
SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyBrTable : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>;
+def SDT_WebAssemblyLocalGet : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>;
+def SDT_WebAssemblyLocalSet : SDTypeProfile<0, 2, [SDTCisVT<0, i32>]>;
def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>;
def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
@@ -79,6 +81,8 @@
SDTCisPtrTy<0>]>;
def SDT_WebAssemblyThrow : SDTypeProfile<0, -1, []>;
def SDT_WebAssemblyCatch : SDTypeProfile<1, 1, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyGlobalGet : SDTypeProfile<1, 1, [SDTCisPtrTy<1>]>;
+def SDT_WebAssemblyGlobalSet : SDTypeProfile<0, 2, [SDTCisPtrTy<1>]>;
//===----------------------------------------------------------------------===//
// WebAssembly-specific DAG Nodes.
@@ -106,6 +110,18 @@
[SDNPHasChain, SDNPVariadic]>;
def WebAssemblycatch : SDNode<"WebAssemblyISD::CATCH", SDT_WebAssemblyCatch,
[SDNPHasChain, SDNPSideEffect]>;
+def WebAssemblyglobal_get :
+ SDNode<"WebAssemblyISD::GLOBAL_GET", SDT_WebAssemblyGlobalGet,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def WebAssemblyglobal_set :
+ SDNode<"WebAssemblyISD::GLOBAL_SET", SDT_WebAssemblyGlobalSet,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def WebAssemblylocal_get :
+ SDNode<"WebAssemblyISD::LOCAL_GET", SDT_WebAssemblyLocalGet,
+ [SDNPHasChain, SDNPMayLoad]>;
+def WebAssemblylocal_set :
+ SDNode<"WebAssemblyISD::LOCAL_SET", SDT_WebAssemblyLocalSet,
+ [SDNPHasChain, SDNPMayStore]>;
//===----------------------------------------------------------------------===//
// WebAssembly-specific Operands.
@@ -113,13 +129,13 @@
// Default Operand has AsmOperandClass "Imm" which is for integers (and
// symbols), so specialize one for floats:
-def FPImmAsmOperand : AsmOperandClass {
- let Name = "FPImm";
+class FPImmAsmOperand<ValueType ty> : AsmOperandClass {
+ let Name = "FPImm" # ty;
let PredicateMethod = "isFPImm";
}
class FPOperand<ValueType ty> : Operand<ty> {
- AsmOperandClass ParserMatchClass = FPImmAsmOperand;
+ AsmOperandClass ParserMatchClass = FPImmAsmOperand<ty>;
}
let OperandNamespace = "WebAssembly" in {
@@ -130,8 +146,14 @@
let OperandType = "OPERAND_LOCAL" in
def local_op : Operand<i32>;
-let OperandType = "OPERAND_GLOBAL" in
-def global_op : Operand<i32>;
+let OperandType = "OPERAND_GLOBAL" in {
+ // The operand to global instructions is always a 32-bit index.
+ def global_op32 : Operand<i32>;
+ // In PIC mode however, we temporarily represent this index as an external
+ // symbol, which to LLVM is a pointer, so in wasm64 mode it is easiest to
+ // pretend we use a 64-bit index for it.
+ def global_op64 : Operand<i64>;
+}
let OperandType = "OPERAND_I32IMM" in
def i32imm_op : Operand<i32>;
@@ -174,8 +196,8 @@
let PrintMethod = "printWebAssemblyP2AlignOperand";
}
-let OperandType = "OPERAND_EVENT" in
-def event_op : Operand<i32>;
+let OperandType = "OPERAND_TAG" in
+def tag_op : Operand<i32>;
} // OperandType = "OPERAND_P2ALIGN"
@@ -194,6 +216,11 @@
} // OperandNamespace = "WebAssembly"
+// TODO: Find more places to use this.
+def bool_node : PatLeaf<(i32 I32:$cond), [{
+ return CurDAG->computeKnownBits(SDValue(N, 0)).countMinLeadingZeros() == 31;
+}]>;
+
//===----------------------------------------------------------------------===//
// WebAssembly Register to Stack instruction mapping
//===----------------------------------------------------------------------===//
@@ -208,6 +235,19 @@
}
//===----------------------------------------------------------------------===//
+// WebAssembly Stack to Register instruction mapping
+//===----------------------------------------------------------------------===//
+
+class RegisterRel;
+def getRegisterOpcode : InstrMapping {
+ let FilterClass = "RegisterRel";
+ let RowFields = ["BaseName"];
+ let ColFields = ["StackBased"];
+ let KeyCol = ["true"];
+ let ValueCols = [["false"]];
+}
+
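
This mapping is the inverse of the register-to-stack mapping above: TableGen emits a lookup table keyed on BaseName so a stack-form opcode can be translated back to its register-form twin. A hypothetical use of the generated accessor, by analogy with the existing getStackOpcode accessor; treat the exact name and signature as assumptions, not something shown in this patch:

    // Assumed generated accessor (analogous to WebAssembly::getStackOpcode):
    // returns the register-form opcode, or -1 if the instruction has no
    // register/stack pair.
    static unsigned regifyOpcode(unsigned Opc) {
      int RegOpc = llvm::WebAssembly::getRegisterOpcode(Opc);
      return RegOpc == -1 ? Opc : static_cast<unsigned>(RegOpc);
    }
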
+//===----------------------------------------------------------------------===//
// WebAssembly 32 to 64-bit instruction mapping
//===----------------------------------------------------------------------===//
@@ -230,12 +270,12 @@
// Additional instructions.
//===----------------------------------------------------------------------===//
-multiclass ARGUMENT<WebAssemblyRegClass reg, ValueType vt> {
+multiclass ARGUMENT<WebAssemblyRegClass rc, ValueType vt> {
let hasSideEffects = 1, isCodeGenOnly = 1, Defs = []<Register>,
Uses = [ARGUMENTS] in
defm ARGUMENT_#vt :
- I<(outs reg:$res), (ins i32imm:$argno), (outs), (ins i32imm:$argno),
- [(set (vt reg:$res), (WebAssemblyargument timm:$argno))]>;
+ I<(outs rc:$res), (ins i32imm:$argno), (outs), (ins i32imm:$argno),
+ [(set (vt rc:$res), (WebAssemblyargument timm:$argno))]>;
}
defm "": ARGUMENT<I32, i32>;
defm "": ARGUMENT<I64, i64>;
@@ -246,7 +286,7 @@
// local.get and local.set are not generated by instruction selection; they
// are implied by virtual register uses and defs.
-multiclass LOCAL<WebAssemblyRegClass vt> {
+multiclass LOCAL<WebAssemblyRegClass rc, Operand global_op> {
let hasSideEffects = 0 in {
// COPY is not an actual instruction in wasm, but since we allow local.get and
// local.set to be implicit during most of codegen, we can have a COPY which
@@ -254,21 +294,21 @@
// and local.set. COPYs are eliminated (and replaced with
// local.get/local.set) in the ExplicitLocals pass.
let isAsCheapAsAMove = 1, isCodeGenOnly = 1 in
- defm COPY_#vt : I<(outs vt:$res), (ins vt:$src), (outs), (ins), [],
+ defm COPY_#rc : I<(outs rc:$res), (ins rc:$src), (outs), (ins), [],
"local.copy\t$res, $src", "local.copy">;
// TEE is similar to COPY, but writes two copies of its result. Typically
// this would be used to stackify one result and write the other result to a
// local.
let isAsCheapAsAMove = 1, isCodeGenOnly = 1 in
- defm TEE_#vt : I<(outs vt:$res, vt:$also), (ins vt:$src), (outs), (ins), [],
+ defm TEE_#rc : I<(outs rc:$res, rc:$also), (ins rc:$src), (outs), (ins), [],
"local.tee\t$res, $also, $src", "local.tee">;
// This is the actual local.get instruction in wasm. These are made explicit
// by the ExplicitLocals pass. It has mayLoad because it reads from a wasm
// local, which is a side effect not otherwise modeled in LLVM.
let mayLoad = 1, isAsCheapAsAMove = 1 in
- defm LOCAL_GET_#vt : I<(outs vt:$res), (ins local_op:$local),
+ defm LOCAL_GET_#rc : I<(outs rc:$res), (ins local_op:$local),
(outs), (ins local_op:$local), [],
"local.get\t$res, $local", "local.get\t$local", 0x20>;
@@ -276,7 +316,7 @@
// by the ExplicitLocals pass. It has mayStore because it writes to a wasm
// local, which is a side effect not otherwise modeled in LLVM.
let mayStore = 1, isAsCheapAsAMove = 1 in
- defm LOCAL_SET_#vt : I<(outs), (ins local_op:$local, vt:$src),
+ defm LOCAL_SET_#rc : I<(outs), (ins local_op:$local, rc:$src),
(outs), (ins local_op:$local), [],
"local.set\t$local, $src", "local.set\t$local", 0x21>;
@@ -284,36 +324,48 @@
// LOCAL_TEEs by the ExplicitLocals pass. It has mayStore for the same reason
// as LOCAL_SET.
let mayStore = 1, isAsCheapAsAMove = 1 in
- defm LOCAL_TEE_#vt : I<(outs vt:$res), (ins local_op:$local, vt:$src),
+ defm LOCAL_TEE_#rc : I<(outs rc:$res), (ins local_op:$local, rc:$src),
(outs), (ins local_op:$local), [],
"local.tee\t$res, $local, $src", "local.tee\t$local",
0x22>;
// Unused values must be dropped in some contexts.
- defm DROP_#vt : I<(outs), (ins vt:$src), (outs), (ins), [],
+ defm DROP_#rc : I<(outs), (ins rc:$src), (outs), (ins), [],
"drop\t$src", "drop", 0x1a>;
let mayLoad = 1 in
- defm GLOBAL_GET_#vt : I<(outs vt:$res), (ins global_op:$local),
- (outs), (ins global_op:$local), [],
- "global.get\t$res, $local", "global.get\t$local",
+ defm GLOBAL_GET_#rc : I<(outs rc:$res), (ins global_op:$addr),
+ (outs), (ins global_op:$addr), [],
+ "global.get\t$res, $addr", "global.get\t$addr",
0x23>;
let mayStore = 1 in
- defm GLOBAL_SET_#vt : I<(outs), (ins global_op:$local, vt:$src),
- (outs), (ins global_op:$local), [],
- "global.set\t$local, $src", "global.set\t$local",
+ defm GLOBAL_SET_#rc : I<(outs), (ins global_op:$addr, rc:$src),
+ (outs), (ins global_op:$addr), [],
+ "global.set\t$addr, $src", "global.set\t$addr",
0x24>;
-} // hasSideEffects = 0
+ } // hasSideEffects = 0
+ foreach vt = rc.RegTypes in {
+ def : Pat<(vt (WebAssemblyglobal_get
+ (WebAssemblywrapper tglobaladdr:$addr))),
+ (!cast<NI>("GLOBAL_GET_" # rc) tglobaladdr:$addr)>;
+ def : Pat<(WebAssemblyglobal_set
+ vt:$src, (WebAssemblywrapper tglobaladdr:$addr)),
+ (!cast<NI>("GLOBAL_SET_" # rc) tglobaladdr:$addr, vt:$src)>;
+ def : Pat<(vt (WebAssemblylocal_get (i32 timm:$local))),
+ (!cast<NI>("LOCAL_GET_" # rc) timm:$local)>;
+ def : Pat<(WebAssemblylocal_set timm:$local, vt:$src),
+ (!cast<NI>("LOCAL_SET_" # rc) timm:$local, vt:$src)>;
+ }
}
-defm "" : LOCAL<I32>;
-defm "" : LOCAL<I64>;
-defm "" : LOCAL<F32>;
-defm "" : LOCAL<F64>;
-defm "" : LOCAL<V128>, Requires<[HasSIMD128]>;
-defm "" : LOCAL<FUNCREF>, Requires<[HasReferenceTypes]>;
-defm "" : LOCAL<EXTERNREF>, Requires<[HasReferenceTypes]>;
+defm "" : LOCAL<I32, global_op32>;
+defm "" : LOCAL<I64, global_op64>; // 64-bit only needed for pointers.
+defm "" : LOCAL<F32, global_op32>;
+defm "" : LOCAL<F64, global_op32>;
+defm "" : LOCAL<V128, global_op32>, Requires<[HasSIMD128]>;
+defm "" : LOCAL<FUNCREF, global_op32>, Requires<[HasReferenceTypes]>;
+defm "" : LOCAL<EXTERNREF, global_op32>, Requires<[HasReferenceTypes]>;
let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
defm CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm),
@@ -341,6 +393,8 @@
def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
(GLOBAL_GET_I32 tglobaladdr:$addr)>, Requires<[IsPIC, HasAddr32]>;
+def : Pat<(i64 (WebAssemblywrapper tglobaladdr:$addr)),
+ (GLOBAL_GET_I64 tglobaladdr:$addr)>, Requires<[IsPIC, HasAddr64]>;
def : Pat<(i32 (WebAssemblywrapperPIC tglobaladdr:$addr)),
(CONST_I32 tglobaladdr:$addr)>, Requires<[IsPIC, HasAddr32]>;
@@ -354,6 +408,8 @@
def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
(GLOBAL_GET_I32 texternalsym:$addr)>, Requires<[IsPIC, HasAddr32]>;
+def : Pat<(i64 (WebAssemblywrapper texternalsym:$addr)),
+ (GLOBAL_GET_I64 texternalsym:$addr)>, Requires<[IsPIC, HasAddr64]>;
def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
(CONST_I32 texternalsym:$addr)>, Requires<[IsNotPIC, HasAddr32]>;
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index 18250cf..7a0c524 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -93,6 +93,14 @@
[(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
"i64.eqz \t$dst, $src", "i64.eqz", 0x50>;
+// Optimize away an explicit mask on a shift count.
+def : Pat<(shl I32:$lhs, (and I32:$rhs, 31)), (SHL_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(sra I32:$lhs, (and I32:$rhs, 31)), (SHR_S_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(srl I32:$lhs, (and I32:$rhs, 31)), (SHR_U_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(shl I64:$lhs, (and I64:$rhs, 63)), (SHL_I64 I64:$lhs, I64:$rhs)>;
+def : Pat<(sra I64:$lhs, (and I64:$rhs, 63)), (SHR_S_I64 I64:$lhs, I64:$rhs)>;
+def : Pat<(srl I64:$lhs, (and I64:$rhs, 63)), (SHR_U_I64 I64:$lhs, I64:$rhs)>;
+
// Optimize away an explicit mask on a rotate count.
def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>;
def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>;
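
The new shift patterns mirror the rotate patterns directly below them: wasm's shift instructions already take the count modulo the operand width, so the explicit mask that frontends emit to avoid undefined behaviour in the source language is redundant. A minimal sketch of the source shape that now selects to a bare shift (illustration only, not from the patch):

    #include <cstdint>

    // C shifts by a count >= the bit width are undefined, so frontends mask
    // the count; i32.shl / i64.shl interpret the count modulo 32 / 64 anyway,
    // which is what lets the patterns drop the 'and'.
    uint32_t shl32(uint32_t x, uint32_t n) { return x << (n & 31); }
    uint64_t shl64(uint64_t x, uint64_t n) { return x << (n & 63); }
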
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index 48b9344..82f5e98 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -67,7 +67,7 @@
defm LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b, []>;
// Select loads with no constant offset.
-multiclass LoadPatNoOffset<ValueType ty, PatFrag kind, string inst> {
+multiclass LoadPatNoOffset<ValueType ty, SDPatternOperator kind, string inst> {
def : Pat<(ty (kind I32:$addr)), (!cast<NI>(inst # "_A32") 0, 0, I32:$addr)>,
Requires<[HasAddr32]>;
def : Pat<(ty (kind (i64 I64:$addr))), (!cast<NI>(inst # "_A64") 0, 0, I64:$addr)>,
@@ -82,7 +82,7 @@
// Select loads with a constant offset.
// Pattern with address + immediate offset
-multiclass LoadPatImmOff<ValueType ty, PatFrag kind, PatFrag operand,
+multiclass LoadPatImmOff<ValueType ty, SDPatternOperator kind, PatFrag operand,
string inst> {
def : Pat<(ty (kind (operand I32:$addr, imm:$off))),
(!cast<NI>(inst # "_A32") 0, imm:$off, I32:$addr)>,
@@ -102,7 +102,7 @@
defm : LoadPatImmOff<f64, load, or_is_add, "LOAD_F64">;
// Select loads with just a constant offset.
-multiclass LoadPatOffsetOnly<ValueType ty, PatFrag kind, string inst> {
+multiclass LoadPatOffsetOnly<ValueType ty, SDPatternOperator kind, string inst> {
def : Pat<(ty (kind imm:$off)),
(!cast<NI>(inst # "_A32") 0, imm:$off, (CONST_I32 0))>,
Requires<[HasAddr32]>;
@@ -116,7 +116,7 @@
defm : LoadPatOffsetOnly<f32, load, "LOAD_F32">;
defm : LoadPatOffsetOnly<f64, load, "LOAD_F64">;
-multiclass LoadPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
+multiclass LoadPatGlobalAddrOffOnly<ValueType ty, SDPatternOperator kind, string inst> {
def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off))),
(!cast<NI>(inst # "_A32") 0, tglobaladdr:$off, (CONST_I32 0))>,
Requires<[IsNotPIC, HasAddr32]>;
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
index 7f324fc..ef9bd35 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
@@ -11,29 +11,29 @@
///
//===----------------------------------------------------------------------===//
-multiclass REF_I<WebAssemblyRegClass reg, ValueType vt> {
- defm REF_NULL_#reg : I<(outs reg:$res), (ins HeapType:$heaptype),
- (outs), (ins HeapType:$heaptype),
- [],
- "ref.null\t$res, $heaptype",
- "ref.null\t$heaptype",
- 0xd0>,
- Requires<[HasReferenceTypes]>;
- defm SELECT_#reg: I<(outs reg:$dst), (ins reg:$lhs, reg:$rhs, I32:$cond),
- (outs), (ins),
- [(set reg:$dst,
- (select I32:$cond, reg:$lhs, reg:$rhs))],
- vt#".select\t$dst, $lhs, $rhs, $cond",
- vt#".select", 0x1b>,
- Requires<[HasReferenceTypes]>;
+multiclass REF_I<WebAssemblyRegClass rc, ValueType vt> {
+ defm REF_NULL_#rc : I<(outs rc:$res), (ins HeapType:$heaptype),
+ (outs), (ins HeapType:$heaptype),
+ [],
+ "ref.null\t$res, $heaptype",
+ "ref.null\t$heaptype",
+ 0xd0>,
+ Requires<[HasReferenceTypes]>;
+ defm SELECT_#rc: I<(outs rc:$dst), (ins rc:$lhs, rc:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set rc:$dst,
+ (select I32:$cond, rc:$lhs, rc:$rhs))],
+ vt#".select\t$dst, $lhs, $rhs, $cond",
+ vt#".select", 0x1b>,
+ Requires<[HasReferenceTypes]>;
}
defm "" : REF_I<FUNCREF, funcref>;
defm "" : REF_I<EXTERNREF, externref>;
-foreach reg = [FUNCREF, EXTERNREF] in {
-def : Pat<(select (i32 (setne I32:$cond, 0)), reg:$lhs, reg:$rhs),
- (!cast<Instruction>("SELECT_"#reg) reg:$lhs, reg:$rhs, I32:$cond)>;
-def : Pat<(select (i32 (seteq I32:$cond, 0)), reg:$lhs, reg:$rhs),
- (!cast<Instruction>("SELECT_"#reg) reg:$rhs, reg:$lhs, I32:$cond)>;
+foreach rc = [FUNCREF, EXTERNREF] in {
+def : Pat<(select (i32 (setne I32:$cond, 0)), rc:$lhs, rc:$rhs),
+ (!cast<Instruction>("SELECT_"#rc) rc:$lhs, rc:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (seteq I32:$cond, 0)), rc:$lhs, rc:$rhs),
+ (!cast<Instruction>("SELECT_"#rc) rc:$rhs, rc:$lhs, I32:$cond)>;
}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index efcdf036..6429b46 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -264,19 +264,19 @@
} // mayLoad = 1, UseNamedOperandTable = 1
}
-// TODO: Also support v4f32 and v2f64 once the instructions are merged
-// to the proposal
defm "" : SIMDLoadZero<I32x4, 0x5c>;
defm "" : SIMDLoadZero<I64x2, 0x5d>;
+// TODO: f32x4 and f64x2 as well
foreach vec = [I32x4, I64x2] in {
-defvar loadpat = !cast<Intrinsic>("int_wasm_load"#vec.lane_bits#"_zero");
-defvar inst = "LOAD_ZERO_"#vec;
-defm : LoadPatNoOffset<vec.vt, loadpat, inst>;
-defm : LoadPatImmOff<vec.vt, loadpat, regPlusImm, inst>;
-defm : LoadPatImmOff<vec.vt, loadpat, or_is_add, inst>;
-defm : LoadPatOffsetOnly<vec.vt, loadpat, inst>;
-defm : LoadPatGlobalAddrOffOnly<vec.vt, loadpat, inst>;
+ defvar inst = "LOAD_ZERO_"#vec;
+ defvar pat = PatFrag<(ops node:$ptr),
+ (vector_insert (vec.splat (vec.lane_vt 0)), (vec.lane_vt (load $ptr)), 0)>;
+ defm : LoadPatNoOffset<vec.vt, pat, inst>;
+ defm : LoadPatImmOff<vec.vt, pat, regPlusImm, inst>;
+ defm : LoadPatImmOff<vec.vt, pat, or_is_add, inst>;
+ defm : LoadPatOffsetOnly<vec.vt, pat, inst>;
+ defm : LoadPatGlobalAddrOffOnly<vec.vt, pat, inst>;
}
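
The intrinsic-based selection is replaced by a PatFrag that recognizes the generic IR shape directly: a scalar load inserted into lane 0 of an all-zeros vector. A scalar model of what LOAD_ZERO now matches (illustrative only, not from the patch):

    #include <cstdint>

    // Equivalent of v128.load32_zero: lane 0 is loaded from memory, the
    // remaining lanes are zero. The PatFrag above matches the corresponding
    // (vector_insert (splat 0), (load $ptr), 0) DAG shape.
    void load32_zero(const uint32_t *p, uint32_t out[4]) {
      out[0] = *p;
      out[1] = out[2] = out[3] = 0;
    }
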
// Load lane
@@ -300,15 +300,13 @@
} // mayLoad = 1, UseNamedOperandTable = 1
}
-// TODO: Also support v4f32 and v2f64 once the instructions are merged
-// to the proposal
defm "" : SIMDLoadLane<I8x16, 0x54>;
defm "" : SIMDLoadLane<I16x8, 0x55>;
defm "" : SIMDLoadLane<I32x4, 0x56>;
defm "" : SIMDLoadLane<I64x2, 0x57>;
// Select loads with no constant offset.
-multiclass LoadLanePatNoOffset<Vec vec, PatFrag kind> {
+multiclass LoadLanePatNoOffset<Vec vec, SDPatternOperator kind> {
defvar load_lane_a32 = !cast<NI>("LOAD_LANE_"#vec#"_A32");
defvar load_lane_a64 = !cast<NI>("LOAD_LANE_"#vec#"_A64");
def : Pat<(vec.vt (kind (i32 I32:$addr),
@@ -321,10 +319,24 @@
Requires<[HasAddr64]>;
}
-defm : LoadLanePatNoOffset<I8x16, int_wasm_load8_lane>;
-defm : LoadLanePatNoOffset<I16x8, int_wasm_load16_lane>;
-defm : LoadLanePatNoOffset<I32x4, int_wasm_load32_lane>;
-defm : LoadLanePatNoOffset<I64x2, int_wasm_load64_lane>;
+def load8_lane :
+ PatFrag<(ops node:$ptr, node:$vec, node:$idx),
+ (vector_insert $vec, (i32 (extloadi8 $ptr)), $idx)>;
+def load16_lane :
+ PatFrag<(ops node:$ptr, node:$vec, node:$idx),
+ (vector_insert $vec, (i32 (extloadi16 $ptr)), $idx)>;
+def load32_lane :
+ PatFrag<(ops node:$ptr, node:$vec, node:$idx),
+ (vector_insert $vec, (i32 (load $ptr)), $idx)>;
+def load64_lane :
+ PatFrag<(ops node:$ptr, node:$vec, node:$idx),
+ (vector_insert $vec, (i64 (load $ptr)), $idx)>;
+// TODO: floating point lanes as well
+
+defm : LoadLanePatNoOffset<I8x16, load8_lane>;
+defm : LoadLanePatNoOffset<I16x8, load16_lane>;
+defm : LoadLanePatNoOffset<I32x4, load32_lane>;
+defm : LoadLanePatNoOffset<I64x2, load64_lane>;
// TODO: Also support the other load patterns for load_lane once the instructions
// are merged to the proposal.
@@ -373,15 +385,13 @@
} // mayStore = 1, UseNamedOperandTable = 1
}
-// TODO: Also support v4f32 and v2f64 once the instructions are merged
-// to the proposal
defm "" : SIMDStoreLane<I8x16, 0x58>;
defm "" : SIMDStoreLane<I16x8, 0x59>;
defm "" : SIMDStoreLane<I32x4, 0x5a>;
defm "" : SIMDStoreLane<I64x2, 0x5b>;
// Select stores with no constant offset.
-multiclass StoreLanePatNoOffset<Vec vec, PatFrag kind> {
+multiclass StoreLanePatNoOffset<Vec vec, SDPatternOperator kind> {
def : Pat<(kind (i32 I32:$addr), (vec.vt V128:$vec), (i32 vec.lane_idx:$idx)),
(!cast<NI>("STORE_LANE_"#vec#"_A32") 0, 0, imm:$idx, $addr, $vec)>,
Requires<[HasAddr32]>;
@@ -390,13 +400,26 @@
Requires<[HasAddr64]>;
}
-defm : StoreLanePatNoOffset<I8x16, int_wasm_store8_lane>;
-defm : StoreLanePatNoOffset<I16x8, int_wasm_store16_lane>;
-defm : StoreLanePatNoOffset<I32x4, int_wasm_store32_lane>;
-defm : StoreLanePatNoOffset<I64x2, int_wasm_store64_lane>;
+def store8_lane :
+ PatFrag<(ops node:$ptr, node:$vec, node:$idx),
+ (truncstorei8 (i32 (vector_extract $vec, $idx)), $ptr)>;
+def store16_lane :
+ PatFrag<(ops node:$ptr, node:$vec, node:$idx),
+ (truncstorei16 (i32 (vector_extract $vec, $idx)), $ptr)>;
+def store32_lane :
+ PatFrag<(ops node:$ptr, node:$vec, node:$idx),
+ (store (i32 (vector_extract $vec, $idx)), $ptr)>;
+def store64_lane :
+ PatFrag<(ops node:$ptr, node:$vec, node:$idx),
+ (store (i64 (vector_extract $vec, $idx)), $ptr)>;
+// TODO: floating point lanes as well
-// TODO: Also support the other store patterns for store_lane once the
-// instructions are merged to the proposal.
+let AddedComplexity = 1 in {
+defm : StoreLanePatNoOffset<I8x16, store8_lane>;
+defm : StoreLanePatNoOffset<I16x8, store16_lane>;
+defm : StoreLanePatNoOffset<I32x4, store32_lane>;
+defm : StoreLanePatNoOffset<I64x2, store64_lane>;
+}
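
As with the load-lane patterns above, the store-lane intrinsics are replaced by PatFrags over generic vector_extract plus (trunc)store nodes, so ordinary IR now selects to v128.storeN_lane without going through a target intrinsic. A scalar model of the store32_lane shape (illustrative only):

    #include <cstdint>

    // Equivalent of v128.store32_lane with lane index 2: extract one lane and
    // store it. The store32_lane PatFrag matches the corresponding
    // (store (vector_extract $vec, $idx), $ptr) DAG shape.
    void store32_lane2(uint32_t *p, const uint32_t vec[4]) {
      *p = vec[2];
    }
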
//===----------------------------------------------------------------------===//
// Constructing SIMD values
@@ -702,7 +725,7 @@
// Bitwise operations
//===----------------------------------------------------------------------===//
-multiclass SIMDBinary<Vec vec, SDNode node, string name, bits<32> simdop> {
+multiclass SIMDBinary<Vec vec, SDPatternOperator node, string name, bits<32> simdop> {
defm _#vec : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
(outs), (ins),
[(set (vec.vt V128:$dst),
@@ -711,7 +734,8 @@
vec.prefix#"."#name, simdop>;
}
-multiclass SIMDBitwise<SDNode node, string name, bits<32> simdop, bit commutable = false> {
+multiclass SIMDBitwise<SDPatternOperator node, string name, bits<32> simdop,
+ bit commutable = false> {
let isCommutable = commutable in
defm "" : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
(outs), (ins), [],
@@ -721,7 +745,7 @@
(!cast<NI>(NAME) $lhs, $rhs)>;
}
-multiclass SIMDUnary<Vec vec, SDNode node, string name, bits<32> simdop> {
+multiclass SIMDUnary<Vec vec, SDPatternOperator node, string name, bits<32> simdop> {
defm _#vec : SIMD_I<(outs V128:$dst), (ins V128:$v), (outs), (ins),
[(set (vec.vt V128:$dst),
(vec.vt (node (vec.vt V128:$v))))],
@@ -792,27 +816,13 @@
// Integer unary arithmetic
//===----------------------------------------------------------------------===//
-multiclass SIMDUnaryInt<SDNode node, string name, bits<32> baseInst> {
+multiclass SIMDUnaryInt<SDPatternOperator node, string name, bits<32> baseInst> {
defm "" : SIMDUnary<I8x16, node, name, baseInst>;
defm "" : SIMDUnary<I16x8, node, name, !add(baseInst, 32)>;
defm "" : SIMDUnary<I32x4, node, name, !add(baseInst, 64)>;
defm "" : SIMDUnary<I64x2, node, name, !add(baseInst, 96)>;
}
-multiclass SIMDReduceVec<Vec vec, SDNode op, string name, bits<32> simdop> {
- defm _#vec : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins),
- [(set I32:$dst, (i32 (op (vec.vt V128:$vec))))],
- vec.prefix#"."#name#"\t$dst, $vec", vec.prefix#"."#name,
- simdop>;
-}
-
-multiclass SIMDReduce<SDNode op, string name, bits<32> baseInst> {
- defm "" : SIMDReduceVec<I8x16, op, name, baseInst>;
- defm "" : SIMDReduceVec<I16x8, op, name, !add(baseInst, 32)>;
- defm "" : SIMDReduceVec<I32x4, op, name, !add(baseInst, 64)>;
- defm "" : SIMDReduceVec<I64x2, op, name, !add(baseInst, 96)>;
-}
-
// Integer vector negation
def ivneg : PatFrag<(ops node:$in), (sub immAllZerosV, $in)>;
@@ -823,7 +833,7 @@
defm NEG : SIMDUnaryInt<ivneg, "neg", 97>;
// Population count: popcnt
-defm POPCNT : SIMDUnary<I8x16, int_wasm_popcnt, "popcnt", 0x62>;
+defm POPCNT : SIMDUnary<I8x16, ctpop, "popcnt", 0x62>;
// Any lane true: any_true
defm ANYTRUE : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins), [],
@@ -911,27 +921,56 @@
defm SHR_S : SIMDShiftInt<wasm_shr_s, "shr_s", 108>;
defm SHR_U : SIMDShiftInt<wasm_shr_u, "shr_u", 109>;
+// Optimize away an explicit mask on a shift count.
+def : Pat<(wasm_shl (v16i8 V128:$lhs), (and I32:$rhs, 7)),
+ (SHL_I8x16 V128:$lhs, I32:$rhs)>;
+def : Pat<(wasm_shr_s (v16i8 V128:$lhs), (and I32:$rhs, 7)),
+ (SHR_S_I8x16 V128:$lhs, I32:$rhs)>;
+def : Pat<(wasm_shr_u (v16i8 V128:$lhs), (and I32:$rhs, 7)),
+ (SHR_U_I8x16 V128:$lhs, I32:$rhs)>;
+
+def : Pat<(wasm_shl (v8i16 V128:$lhs), (and I32:$rhs, 15)),
+ (SHL_I16x8 V128:$lhs, I32:$rhs)>;
+def : Pat<(wasm_shr_s (v8i16 V128:$lhs), (and I32:$rhs, 15)),
+ (SHR_S_I16x8 V128:$lhs, I32:$rhs)>;
+def : Pat<(wasm_shr_u (v8i16 V128:$lhs), (and I32:$rhs, 15)),
+ (SHR_U_I16x8 V128:$lhs, I32:$rhs)>;
+
+def : Pat<(wasm_shl (v4i32 V128:$lhs), (and I32:$rhs, 31)),
+ (SHL_I32x4 V128:$lhs, I32:$rhs)>;
+def : Pat<(wasm_shr_s (v4i32 V128:$lhs), (and I32:$rhs, 31)),
+ (SHR_S_I32x4 V128:$lhs, I32:$rhs)>;
+def : Pat<(wasm_shr_u (v4i32 V128:$lhs), (and I32:$rhs, 31)),
+ (SHR_U_I32x4 V128:$lhs, I32:$rhs)>;
+
+def : Pat<(wasm_shl (v2i64 V128:$lhs), (trunc (and I64:$rhs, 63))),
+ (SHL_I64x2 V128:$lhs, (I32_WRAP_I64 I64:$rhs))>;
+def : Pat<(wasm_shr_s (v2i64 V128:$lhs), (trunc (and I64:$rhs, 63))),
+ (SHR_S_I64x2 V128:$lhs, (I32_WRAP_I64 I64:$rhs))>;
+def : Pat<(wasm_shr_u (v2i64 V128:$lhs), (trunc (and I64:$rhs, 63))),
+ (SHR_U_I64x2 V128:$lhs, (I32_WRAP_I64 I64:$rhs))>;
+
//===----------------------------------------------------------------------===//
// Integer binary arithmetic
//===----------------------------------------------------------------------===//
-multiclass SIMDBinaryIntNoI8x16<SDNode node, string name, bits<32> baseInst> {
+multiclass SIMDBinaryIntNoI8x16<SDPatternOperator node, string name, bits<32> baseInst> {
defm "" : SIMDBinary<I16x8, node, name, !add(baseInst, 32)>;
defm "" : SIMDBinary<I32x4, node, name, !add(baseInst, 64)>;
defm "" : SIMDBinary<I64x2, node, name, !add(baseInst, 96)>;
}
-multiclass SIMDBinaryIntSmall<SDNode node, string name, bits<32> baseInst> {
+multiclass SIMDBinaryIntSmall<SDPatternOperator node, string name, bits<32> baseInst> {
defm "" : SIMDBinary<I8x16, node, name, baseInst>;
defm "" : SIMDBinary<I16x8, node, name, !add(baseInst, 32)>;
}
-multiclass SIMDBinaryIntNoI64x2<SDNode node, string name, bits<32> baseInst> {
+multiclass SIMDBinaryIntNoI64x2<SDPatternOperator node, string name, bits<32> baseInst> {
defm "" : SIMDBinaryIntSmall<node, name, baseInst>;
defm "" : SIMDBinary<I32x4, node, name, !add(baseInst, 64)>;
}
-multiclass SIMDBinaryInt<SDNode node, string name, bits<32> baseInst> {
+multiclass SIMDBinaryInt<SDPatternOperator node, string name, bits<32> baseInst> {
defm "" : SIMDBinaryIntNoI64x2<node, name, baseInst>;
defm "" : SIMDBinary<I64x2, node, name, !add(baseInst, 96)>;
}
@@ -989,7 +1028,14 @@
186>;
// Extending multiplication: extmul_{low,high}_P, extmul_high
-multiclass SIMDExtBinary<Vec vec, SDNode node, string name, bits<32> simdop> {
+def extend_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
+def extend_low_s : SDNode<"WebAssemblyISD::EXTEND_LOW_S", extend_t>;
+def extend_high_s : SDNode<"WebAssemblyISD::EXTEND_HIGH_S", extend_t>;
+def extend_low_u : SDNode<"WebAssemblyISD::EXTEND_LOW_U", extend_t>;
+def extend_high_u : SDNode<"WebAssemblyISD::EXTEND_HIGH_U", extend_t>;
+
+multiclass SIMDExtBinary<Vec vec, SDPatternOperator node, string name,
+ bits<32> simdop> {
defm _#vec : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
(outs), (ins),
[(set (vec.vt V128:$dst), (node
@@ -998,32 +1044,41 @@
vec.prefix#"."#name, simdop>;
}
-defm EXTMUL_LOW_S :
- SIMDExtBinary<I16x8, int_wasm_extmul_low_signed, "extmul_low_i8x16_s", 0x9c>;
-defm EXTMUL_HIGH_S :
- SIMDExtBinary<I16x8, int_wasm_extmul_high_signed, "extmul_high_i8x16_s", 0x9d>;
-defm EXTMUL_LOW_U :
- SIMDExtBinary<I16x8, int_wasm_extmul_low_unsigned, "extmul_low_i8x16_u", 0x9e>;
-defm EXTMUL_HIGH_U :
- SIMDExtBinary<I16x8, int_wasm_extmul_high_unsigned, "extmul_high_i8x16_u", 0x9f>;
+class ExtMulPat<SDNode extend> :
+ PatFrag<(ops node:$lhs, node:$rhs),
+ (mul (extend $lhs), (extend $rhs))> {}
+
+def extmul_low_s : ExtMulPat<extend_low_s>;
+def extmul_high_s : ExtMulPat<extend_high_s>;
+def extmul_low_u : ExtMulPat<extend_low_u>;
+def extmul_high_u : ExtMulPat<extend_high_u>;
defm EXTMUL_LOW_S :
- SIMDExtBinary<I32x4, int_wasm_extmul_low_signed, "extmul_low_i16x8_s", 0xbc>;
+ SIMDExtBinary<I16x8, extmul_low_s, "extmul_low_i8x16_s", 0x9c>;
defm EXTMUL_HIGH_S :
- SIMDExtBinary<I32x4, int_wasm_extmul_high_signed, "extmul_high_i16x8_s", 0xbd>;
+ SIMDExtBinary<I16x8, extmul_high_s, "extmul_high_i8x16_s", 0x9d>;
defm EXTMUL_LOW_U :
- SIMDExtBinary<I32x4, int_wasm_extmul_low_unsigned, "extmul_low_i16x8_u", 0xbe>;
+ SIMDExtBinary<I16x8, extmul_low_u, "extmul_low_i8x16_u", 0x9e>;
defm EXTMUL_HIGH_U :
- SIMDExtBinary<I32x4, int_wasm_extmul_high_unsigned, "extmul_high_i16x8_u", 0xbf>;
+ SIMDExtBinary<I16x8, extmul_high_u, "extmul_high_i8x16_u", 0x9f>;
defm EXTMUL_LOW_S :
- SIMDExtBinary<I64x2, int_wasm_extmul_low_signed, "extmul_low_i32x4_s", 0xdc>;
+ SIMDExtBinary<I32x4, extmul_low_s, "extmul_low_i16x8_s", 0xbc>;
defm EXTMUL_HIGH_S :
- SIMDExtBinary<I64x2, int_wasm_extmul_high_signed, "extmul_high_i32x4_s", 0xdd>;
+ SIMDExtBinary<I32x4, extmul_high_s, "extmul_high_i16x8_s", 0xbd>;
defm EXTMUL_LOW_U :
- SIMDExtBinary<I64x2, int_wasm_extmul_low_unsigned, "extmul_low_i32x4_u", 0xde>;
+ SIMDExtBinary<I32x4, extmul_low_u, "extmul_low_i16x8_u", 0xbe>;
defm EXTMUL_HIGH_U :
- SIMDExtBinary<I64x2, int_wasm_extmul_high_unsigned, "extmul_high_i32x4_u", 0xdf>;
+ SIMDExtBinary<I32x4, extmul_high_u, "extmul_high_i16x8_u", 0xbf>;
+
+defm EXTMUL_LOW_S :
+ SIMDExtBinary<I64x2, extmul_low_s, "extmul_low_i32x4_s", 0xdc>;
+defm EXTMUL_HIGH_S :
+ SIMDExtBinary<I64x2, extmul_high_s, "extmul_high_i32x4_s", 0xdd>;
+defm EXTMUL_LOW_U :
+ SIMDExtBinary<I64x2, extmul_low_u, "extmul_low_i32x4_u", 0xde>;
+defm EXTMUL_HIGH_U :
+ SIMDExtBinary<I64x2, extmul_high_u, "extmul_high_i32x4_u", 0xdf>;
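
The extmul instructions are likewise moved off intrinsics: ExtMulPat matches a multiply whose operands are both extended halves of a narrower vector, built on the extend_low/high nodes defined just above. A per-lane model of i16x8.extmul_low_i8x16_s (illustration only, not from the patch):

    #include <cstdint>

    // Per-lane model of i16x8.extmul_low_i8x16_s: sign-extend the low eight
    // i8 lanes of each operand to i16 and multiply. ExtMulPat matches the
    // corresponding (mul (extend_low_s a), (extend_low_s b)) DAG shape.
    void extmul_low_i8x16_s(const int8_t a[16], const int8_t b[16],
                            int16_t out[8]) {
      for (int i = 0; i < 8; ++i)
        out[i] = static_cast<int16_t>(int16_t(a[i]) * int16_t(b[i]));
    }
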
//===----------------------------------------------------------------------===//
// Floating-point unary arithmetic
@@ -1057,7 +1112,7 @@
// Floating-point binary arithmetic
//===----------------------------------------------------------------------===//
-multiclass SIMDBinaryFP<SDNode node, string name, bits<32> baseInst> {
+multiclass SIMDBinaryFP<SDPatternOperator node, string name, bits<32> baseInst> {
defm "" : SIMDBinary<F32x4, node, name, baseInst>;
defm "" : SIMDBinary<F64x2, node, name, !add(baseInst, 12)>;
}
@@ -1083,16 +1138,38 @@
defm MAX : SIMDBinaryFP<fmaximum, "max", 233>;
// Pseudo-minimum: pmin
-defm PMIN : SIMDBinaryFP<int_wasm_pmin, "pmin", 234>;
+def pmin : PatFrag<(ops node:$lhs, node:$rhs),
+ (vselect (setolt $rhs, $lhs), $rhs, $lhs)>;
+defm PMIN : SIMDBinaryFP<pmin, "pmin", 234>;
// Pseudo-maximum: pmax
-defm PMAX : SIMDBinaryFP<int_wasm_pmax, "pmax", 235>;
+def pmax : PatFrag<(ops node:$lhs, node:$rhs),
+ (vselect (setolt $lhs, $rhs), $rhs, $lhs)>;
+defm PMAX : SIMDBinaryFP<pmax, "pmax", 235>;
+
+// Also match the pmin/pmax cases where the operands are int vectors (but the
+// comparison is still a floating point comparison). This can happen when using
+// the wasm_simd128.h intrinsics because v128_t is an integer vector.
+foreach vec = [F32x4, F64x2] in {
+defvar pmin = !cast<NI>("PMIN_"#vec);
+defvar pmax = !cast<NI>("PMAX_"#vec);
+def : Pat<(vec.int_vt (vselect
+ (setolt (vec.vt (bitconvert V128:$rhs)),
+ (vec.vt (bitconvert V128:$lhs))),
+ V128:$rhs, V128:$lhs)),
+ (pmin $lhs, $rhs)>;
+def : Pat<(vec.int_vt (vselect
+ (setolt (vec.vt (bitconvert V128:$lhs)),
+ (vec.vt (bitconvert V128:$rhs))),
+ V128:$rhs, V128:$lhs)),
+ (pmax $lhs, $rhs)>;
+}
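
pmin and pmax are also selected from generic nodes now: a vselect on an ordered less-than with the exact operand order the instructions require. A scalar model showing why the operand order matters (illustrative only):

    // Scalar model of the PatFrags above: pmin returns rhs when rhs < lhs,
    // otherwise lhs; pmax returns rhs when lhs < rhs, otherwise lhs. With a
    // NaN operand the comparison is false and lhs is returned, which is why
    // these are "pseudo" min/max rather than IEEE fmin/fmax.
    float pmin(float lhs, float rhs) { return rhs < lhs ? rhs : lhs; }
    float pmax(float lhs, float rhs) { return lhs < rhs ? rhs : lhs; }
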
//===----------------------------------------------------------------------===//
// Conversions
//===----------------------------------------------------------------------===//
-multiclass SIMDConvert<Vec vec, Vec arg, SDNode op, string name,
+multiclass SIMDConvert<Vec vec, Vec arg, SDPatternOperator op, string name,
bits<32> simdop> {
defm op#_#vec :
SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins),
@@ -1105,8 +1182,8 @@
defm "" : SIMDConvert<I32x4, F32x4, fp_to_uint, "trunc_sat_f32x4_u", 249>;
// Support the saturating variety as well.
-def trunc_s_sat32 : PatFrag<(ops node:$x), (fp_to_sint_sat $x, (i32 32))>;
-def trunc_u_sat32 : PatFrag<(ops node:$x), (fp_to_uint_sat $x, (i32 32))>;
+def trunc_s_sat32 : PatFrag<(ops node:$x), (fp_to_sint_sat $x, i32)>;
+def trunc_u_sat32 : PatFrag<(ops node:$x), (fp_to_uint_sat $x, i32)>;
def : Pat<(v4i32 (trunc_s_sat32 (v4f32 V128:$src))), (fp_to_sint_I32x4 $src)>;
def : Pat<(v4i32 (trunc_u_sat32 (v4f32 V128:$src))), (fp_to_uint_I32x4 $src)>;
@@ -1130,12 +1207,6 @@
defm "" : SIMDConvert<F64x2, I32x4, convert_low_u, "convert_low_i32x4_u", 0xff>;
// Extending operations
-def extend_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
-def extend_low_s : SDNode<"WebAssemblyISD::EXTEND_LOW_S", extend_t>;
-def extend_high_s : SDNode<"WebAssemblyISD::EXTEND_HIGH_S", extend_t>;
-def extend_low_u : SDNode<"WebAssemblyISD::EXTEND_LOW_U", extend_t>;
-def extend_high_u : SDNode<"WebAssemblyISD::EXTEND_HIGH_U", extend_t>;
-
// TODO: refactor this to be uniform for i64x2 if the numbering is not changed.
multiclass SIMDExtend<Vec vec, bits<32> baseInst> {
defm "" : SIMDConvert<vec, vec.split, extend_low_s,
@@ -1173,12 +1244,15 @@
// Use narrowing operations for truncating stores. Since the narrowing
// operations are saturating instead of truncating, we need to mask
// the stored values first.
-// TODO: Use consts instead of splats
def store_v8i8_trunc_v8i16 :
OutPatFrag<(ops node:$val),
(EXTRACT_LANE_I64x2
(NARROW_U_I8x16
- (AND (SPLAT_I32x4 (CONST_I32 0x00ff00ff)), node:$val),
+ (AND
+ (CONST_V128_I16x8
+ 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+ 0x00ff, 0x00ff, 0x00ff, 0x00ff),
+ node:$val),
$val), // Unused input
0)>;
@@ -1186,7 +1260,10 @@
OutPatFrag<(ops node:$val),
(EXTRACT_LANE_I64x2
(NARROW_U_I16x8
- (AND (SPLAT_I32x4 (CONST_I32 0x0000ffff)), node:$val),
+ (AND
+ (CONST_V128_I32x4
+ 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff),
+ node:$val),
$val), // Unused input
0)>;
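
A scalar check of the equivalence the comment above relies on (illustrative only): once each lane is masked, the saturating narrow behaves exactly like a truncation.

    #include <cstdint>

    // i8x16.narrow_i16x8_u saturates each i16 lane to the unsigned 8-bit range
    // [0, 255] (negative lanes become 0, large lanes become 255). After masking
    // with 0x00ff a lane is always in range, so the narrow degenerates to a
    // plain truncation, which is exactly what the truncating store needs.
    uint8_t narrow_u_lane(int16_t x) {
      if (x < 0) return 0;
      if (x > 0xff) return 0xff;
      return static_cast<uint8_t>(x);
    }
    uint8_t truncating_store_lane(int16_t lane) {
      return narrow_u_lane(lane & 0x00ff); // == uint8_t(lane)
    }
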
@@ -1247,14 +1324,10 @@
// Bitcasts are nops
// Matching bitcast t1 to t1 causes strange errors, so avoid repeating types
-foreach t1 = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in
-foreach t2 = !foldl(
- []<ValueType>, [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- acc, cur, !if(!eq(!cast<string>(t1), !cast<string>(cur)),
- acc, !listconcat(acc, [cur])
- )
-) in
-def : Pat<(t1 (bitconvert (t2 V128:$v))), (t1 V128:$v)>;
+foreach t1 = AllVecs in
+foreach t2 = AllVecs in
+if !ne(t1, t2) then
+def : Pat<(t1.vt (bitconvert (t2.vt V128:$v))), (t1.vt V128:$v)>;
// Extended pairwise addition
defm "" : SIMDConvert<I16x8, I8x16, int_wasm_extadd_pairwise_signed,
@@ -1266,11 +1339,15 @@
defm "" : SIMDConvert<I32x4, I16x8, int_wasm_extadd_pairwise_unsigned,
"extadd_pairwise_i16x8_u", 0x7f>;
-// Prototype f64x2 conversions
-defm "" : SIMDConvert<F32x4, F64x2, int_wasm_demote_zero,
+// f64x2 <-> f32x4 conversions
+def demote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
+def demote_zero : SDNode<"WebAssemblyISD::DEMOTE_ZERO", demote_t>;
+defm "" : SIMDConvert<F32x4, F64x2, demote_zero,
"demote_zero_f64x2", 0x5e>;
-defm "" : SIMDConvert<F64x2, F32x4, int_wasm_promote_low,
- "promote_low_f32x4", 0x5f>;
+
+def promote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
+def promote_low : SDNode<"WebAssemblyISD::PROMOTE_LOW", promote_t>;
+defm "" : SIMDConvert<F64x2, F32x4, promote_low, "promote_low_f32x4", 0x5f>;
//===----------------------------------------------------------------------===//
// Saturating Rounding Q-Format Multiplication
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
index 97638c3..2348bb1 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
@@ -11,33 +11,34 @@
/// Instructions that handle tables
//===----------------------------------------------------------------------===//
-
multiclass TABLE<WebAssemblyRegClass rt> {
- defm TABLE_GET_#rt : I<(outs rt:$res), (ins table32_op:$table),
+ let mayLoad = 1 in
+ defm TABLE_GET_#rt : I<(outs rt:$res), (ins table32_op:$table, I32:$i),
(outs), (ins table32_op:$table),
[],
- "table.get\t$res, $table",
+ "table.get\t$res, $table, $i",
"table.get\t$table",
0x25>;
- defm TABLE_SET_#rt : I<(outs), (ins table32_op:$table, rt:$val, I32:$i),
+ let mayStore = 1 in
+ defm TABLE_SET_#rt : I<(outs), (ins table32_op:$table, I32:$i, rt:$val),
(outs), (ins table32_op:$table),
[],
- "table.set\t$table, $val, $i",
+ "table.set\t$table, $i, $val",
"table.set\t$table",
0x26>;
- defm TABLE_GROW_#rt : I<(outs I32:$sz), (ins table32_op:$table, I32:$n, rt:$val),
+ defm TABLE_GROW_#rt : I<(outs I32:$sz), (ins table32_op:$table, rt:$val, I32:$n),
(outs), (ins table32_op:$table),
[],
- "table.grow\t$sz, $table, $n, $val",
+ "table.grow\t$sz, $table, $val, $n",
"table.grow\t$table",
0xfc0f>;
- defm TABLE_FILL_#rt : I<(outs), (ins table32_op:$table, I32:$n, rt:$val, I32:$i),
+ defm TABLE_FILL_#rt : I<(outs), (ins table32_op:$table, I32:$i, rt:$val, I32:$n),
(outs), (ins table32_op:$table),
[],
- "table.fill\t$table, $n, $val, $i",
+ "table.fill\t$table, $i, $val, $n",
"table.fill\t$table",
0xfc11>;
@@ -46,6 +47,17 @@
defm "" : TABLE<FUNCREF>, Requires<[HasReferenceTypes]>;
defm "" : TABLE<EXTERNREF>, Requires<[HasReferenceTypes]>;
+def wasm_table_set_t : SDTypeProfile<0, 3, []>;
+def wasm_table_set : SDNode<"WebAssemblyISD::TABLE_SET", wasm_table_set_t,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def : Pat<(wasm_table_set i32:$table, i32:$idx, funcref:$r),
+ (TABLE_SET_FUNCREF i32:$table, i32:$idx, funcref:$r)>,
+ Requires<[HasReferenceTypes]>;
+def : Pat<(wasm_table_set i32:$table, i32:$idx, externref:$r),
+ (TABLE_SET_EXTERNREF i32:$table, i32:$idx, externref:$r)>,
+ Requires<[HasReferenceTypes]>;
+
defm TABLE_SIZE : I<(outs I32:$sz), (ins table32_op:$table),
(outs), (ins table32_op:$table),
[],
@@ -55,10 +67,10 @@
Requires<[HasReferenceTypes]>;
-defm TABLE_COPY : I<(outs), (ins table32_op:$table1, table32_op:$table2, I32:$n, I32:$s, I32:$d),
+defm TABLE_COPY : I<(outs), (ins table32_op:$table1, table32_op:$table2, I32:$d, I32:$s, I32:$n),
(outs), (ins table32_op:$table1, table32_op:$table2),
[],
- "table.copy\t$table1, $table2, $n, $s, $d",
+ "table.copy\t$table1, $table2, $d, $s, $n",
"table.copy\t$table1, $table2",
0xfc0e>,
Requires<[HasReferenceTypes]>;
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
index e07dae6..309fcaf 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -12,9 +12,9 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblySubtarget.h"
-#include "WebAssemblyUtilities.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/WasmEHFuncInfo.h"
@@ -38,7 +38,6 @@
bool addCatchAlls(MachineFunction &MF);
bool replaceFuncletReturns(MachineFunction &MF);
bool removeUnnecessaryUnreachables(MachineFunction &MF);
- bool ensureSingleBBTermPads(MachineFunction &MF);
bool restoreStackPointer(MachineFunction &MF);
MachineBasicBlock *getMatchingEHPad(MachineInstr *MI);
@@ -128,7 +127,6 @@
Changed |= hoistCatches(MF);
Changed |= addCatchAlls(MF);
Changed |= replaceFuncletReturns(MF);
- Changed |= ensureSingleBBTermPads(MF);
}
Changed |= removeUnnecessaryUnreachables(MF);
if (MF.getFunction().hasPersonalityFn())
@@ -212,11 +210,12 @@
while (InsertPos != MBB.end() && InsertPos->isEHLabel())
InsertPos++;
// This runs after hoistCatches(), so we assume that if there is a catch,
- // that should be the non-EH label first instruction in an EH pad.
+ // that should be the first non-EH-label instruction in an EH pad.
if (InsertPos == MBB.end() ||
!WebAssembly::isCatch(InsertPos->getOpcode())) {
Changed = true;
- BuildMI(MBB, InsertPos, InsertPos->getDebugLoc(),
+ BuildMI(MBB, InsertPos,
+ InsertPos == MBB.end() ? DebugLoc() : InsertPos->getDebugLoc(),
TII.get(WebAssembly::CATCH_ALL));
}
}
@@ -287,80 +286,6 @@
return Changed;
}
-// Clang-generated terminate pads are an single-BB EH pad in the form of
-// termpad:
-// %exn = catch $__cpp_exception
-// call @__clang_call_terminate(%exn)
-// unreachable
-// (There can be local.set and local.gets before the call if we didn't run
-// RegStackify)
-// But code transformations can change or add more control flow, so the call to
-// __clang_call_terminate() function may not be in the original EH pad anymore.
-// This ensures every terminate pad is a single BB in the form illustrated
-// above.
-//
-// This is preparation work for the HandleEHTerminatePads pass later, which
-// duplicates terminate pads both for 'catch' and 'catch_all'. Refer to
-// WebAssemblyHandleEHTerminatePads.cpp for details.
-bool WebAssemblyLateEHPrepare::ensureSingleBBTermPads(MachineFunction &MF) {
- const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
-
- // Find calls to __clang_call_terminate()
- SmallVector<MachineInstr *, 8> ClangCallTerminateCalls;
- SmallPtrSet<MachineBasicBlock *, 8> TermPads;
- for (auto &MBB : MF) {
- for (auto &MI : MBB) {
- if (MI.isCall()) {
- const MachineOperand &CalleeOp = MI.getOperand(0);
- if (CalleeOp.isGlobal() && CalleeOp.getGlobal()->getName() ==
- WebAssembly::ClangCallTerminateFn) {
- MachineBasicBlock *EHPad = getMatchingEHPad(&MI);
- assert(EHPad && "No matching EH pad for __clang_call_terminate");
- // In case a __clang_call_terminate call is duplicated during code
- // transformation so one terminate pad contains multiple
- // __clang_call_terminate calls, we only count one of them
- if (TermPads.insert(EHPad).second)
- ClangCallTerminateCalls.push_back(&MI);
- }
- }
- }
- }
-
- bool Changed = false;
- for (auto *Call : ClangCallTerminateCalls) {
- MachineBasicBlock *EHPad = getMatchingEHPad(Call);
- assert(EHPad && "No matching EH pad for __clang_call_terminate");
-
- // If it is already the form we want, skip it
- if (Call->getParent() == EHPad &&
- Call->getNextNode()->getOpcode() == WebAssembly::UNREACHABLE)
- continue;
-
- // In case the __clang_call_terminate() call is not in its matching EH pad,
- // move the call to the end of EH pad and add an unreachable instruction
- // after that. Delete all successors and their children if any, because here
- // the program terminates.
- Changed = true;
- // This runs after hoistCatches(), so catch instruction should be at the top
- MachineInstr *Catch = WebAssembly::findCatch(EHPad);
- assert(Catch && "EH pad does not have a catch instruction");
- // Takes the result register of the catch instruction as argument. There may
- // have been some other local.set/local.gets in between, but at this point
- // we don't care.
- Call->getOperand(1).setReg(Catch->getOperand(0).getReg());
- auto InsertPos = std::next(MachineBasicBlock::iterator(Catch));
- EHPad->insert(InsertPos, Call->removeFromParent());
- BuildMI(*EHPad, InsertPos, Call->getDebugLoc(),
- TII.get(WebAssembly::UNREACHABLE));
- EHPad->erase(InsertPos, EHPad->end());
- SmallVector<MachineBasicBlock *, 8> Succs(EHPad->successors());
- for (auto *Succ : Succs)
- EHPad->removeSuccessor(Succ);
- eraseDeadBBsAndChildren(Succs);
- }
- return Changed;
-}
-
// After the stack is unwound due to a thrown exception, the __stack_pointer
// global can point to an invalid address. This inserts instructions that
// restore __stack_pointer global.
@@ -383,10 +308,13 @@
// with leaf functions, and we don't restore __stack_pointer in leaf
// functions anyway.
auto InsertPos = MBB.begin();
- if (InsertPos->isEHLabel()) // EH pad starts with an EH label
- ++InsertPos;
- if (WebAssembly::isCatch(InsertPos->getOpcode()))
- ++InsertPos;
+ // Skip EH_LABELs in the beginning of an EH pad if present.
+ while (InsertPos != MBB.end() && InsertPos->isEHLabel())
+ InsertPos++;
+ assert(InsertPos != MBB.end() &&
+ WebAssembly::isCatch(InsertPos->getOpcode()) &&
+ "catch/catch_all should be present in every EH pad at this point");
+ ++InsertPos; // Skip the catch instruction
FrameLowering->writeSPToGlobal(FrameLowering->getSPReg(MF), MF, MBB,
InsertPos, MBB.begin()->getDebugLoc());
}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index ff6404c..599829a 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -13,14 +13,7 @@
///
/// To handle exceptions and setjmp/longjmps, this scheme relies on JavaScript's
/// try and catch syntax and relevant exception-related libraries implemented
-/// in JavaScript glue code that will be produced by Emscripten. This is similar
-/// to the current Emscripten asm.js exception handling in fastcomp. For
-/// fastcomp's EH / SjLj scheme, see these files in fastcomp LLVM branch:
-/// (Location: https://github.com/kripken/emscripten-fastcomp)
-/// lib/Target/JSBackend/NaCl/LowerEmExceptionsPass.cpp
-/// lib/Target/JSBackend/NaCl/LowerEmSetjmp.cpp
-/// lib/Target/JSBackend/JSBackend.cpp
-/// lib/Target/JSBackend/CallHandlers.h
+/// in JavaScript glue code that will be produced by Emscripten.
///
/// * Exception handling
/// This pass lowers invokes and landingpads into library functions in JS glue
@@ -50,25 +43,21 @@
/// In detail, this pass does following things:
///
/// 1) Assumes the existence of global variables: __THREW__, __threwValue
-/// __THREW__ and __threwValue will be set in invoke wrappers
-/// in JS glue code. For what invoke wrappers are, refer to 3). These
-/// variables are used for both exceptions and setjmp/longjmps.
+/// __THREW__ and __threwValue are defined in compiler-rt in Emscripten.
+/// These variables are used for both exceptions and setjmp/longjmps.
/// __THREW__ indicates whether an exception or a longjmp occurred or not. 0
/// means nothing occurred, 1 means an exception occurred, and other numbers
-/// mean a longjmp occurred. In the case of longjmp, __threwValue variable
+/// mean a longjmp occurred. In the case of longjmp, the __THREW__ variable
/// indicates the setjmp buffer that the longjmp corresponds to.
+/// __threwValue is 0 for exceptions, and the second argument to longjmp in
+/// the case of a longjmp.
///
/// * Exception handling
///
/// 2) We assume the existence of setThrew and setTempRet0/getTempRet0 functions
-/// at link time.
-/// The global variables in 1) will exist in wasm address space,
-/// but their values should be set in JS code, so these functions
-/// as interfaces to JS glue code. These functions are equivalent to the
-/// following JS functions, which actually exist in asm.js version of JS
-/// library.
+/// at link time. setThrew exists in Emscripten's compiler-rt:
///
-/// function setThrew(threw, value) {
+/// void setThrew(uintptr_t threw, int value) {
/// if (__THREW__ == 0) {
/// __THREW__ = threw;
/// __threwValue = value;
@@ -76,7 +65,6 @@
/// }
//
/// setTempRet0 is called from __cxa_find_matching_catch() in JS glue code.
-///
/// In exception handling, getTempRet0 indicates the type of an exception
/// caught, and in setjmp/longjmp, it means the second argument to longjmp
/// function.
@@ -105,7 +93,7 @@
/// Module["dynCall_vi"](index,a1); // This calls original callee
/// } catch(e) {
/// if (typeof e !== 'number' && e !== 'longjmp') throw e;
-/// asm["setThrew"](1, 0); // setThrew is called here
+/// _setThrew(1, 0); // setThrew is called here
/// }
/// }
/// If an exception is thrown, __THREW__ will be set to true in a wrapper,
@@ -149,8 +137,8 @@
/// setjmpTableSize = 4;
/// setjmpTable = (int *) malloc(40);
/// setjmpTable[0] = 0;
-/// setjmpTable and setjmpTableSize are used in saveSetjmp() function in JS
-/// code.
+/// setjmpTable and setjmpTableSize are used to call the saveSetjmp() function
+/// in Emscripten's compiler-rt.
///
/// 3) Lower
/// setjmp(buf)
@@ -160,11 +148,11 @@
/// For each dynamic setjmp call, setjmpTable stores its ID (a number which
/// is incrementally assigned from 0) and its label (a unique number that
/// represents each callsite of setjmp). When we need more entries in
-/// setjmpTable, it is reallocated in saveSetjmp() in JS code and it will
-/// return the new table address, and assign the new table size in
-/// setTempRet0(). saveSetjmp also stores the setjmp's ID into the buffer
-/// buf. A BB with setjmp is split into two after setjmp call in order to
-/// make the post-setjmp BB the possible destination of longjmp BB.
+/// setjmpTable, it is reallocated in saveSetjmp() in Emscripten's
+/// compiler-rt and it will return the new table address, and assign the new
+/// table size in setTempRet0(). saveSetjmp also stores the setjmp's ID into
+/// the buffer buf. A BB with setjmp is split into two after setjmp call in
+/// order to make the post-setjmp BB the possible destination of longjmp BB.
///
///
/// 4) Lower every call that might longjmp into
@@ -172,12 +160,13 @@
/// call @__invoke_SIG(func, arg1, arg2)
/// %__THREW__.val = __THREW__;
/// __THREW__ = 0;
-/// if (%__THREW__.val != 0 & __threwValue != 0) {
+/// %__threwValue.val = __threwValue;
+/// if (%__THREW__.val != 0 & %__threwValue.val != 0) {
/// %label = testSetjmp(mem[%__THREW__.val], setjmpTable,
/// setjmpTableSize);
/// if (%label == 0)
-/// emscripten_longjmp(%__THREW__.val, __threwValue);
-/// setTempRet0(__threwValue);
+/// emscripten_longjmp(%__THREW__.val, %__threwValue.val);
+/// setTempRet0(%__threwValue.val);
/// } else {
/// %label = -1;
/// }
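
The pseudocode above calls testSetjmp to translate the longjmp'd jmp_buf back into a label inside the current function. A hypothetical sketch of that compiler-rt helper, matching the description in this comment block; the entry layout and exact signature are assumptions, not taken from this patch:

    #include <cstdint>

    // Assumed layout: saveSetjmp records one {setjmp id, label} entry per
    // setjmp callsite, terminated by an id of 0.
    struct SetjmpEntry { uintptr_t id; uint32_t label; };

    // Return the label of the setjmp callsite whose id was stored in the
    // longjmp'd buffer, or 0 if this function contains no matching setjmp
    // (in which case the longjmp is re-raised to the caller).
    uint32_t testSetjmp(uintptr_t id, const SetjmpEntry *table, uint32_t size) {
      for (uint32_t i = 0; i < size && table[i].id != 0; ++i)
        if (table[i].id == id)
          return table[i].label;
      return 0;
    }
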
@@ -191,7 +180,7 @@
/// testSetjmp examines setjmpTable to see if there is a matching setjmp
/// call. After calling an invoke wrapper, if a longjmp occurred, __THREW__
/// will be the address of matching jmp_buf buffer and __threwValue be the
-/// second argument to longjmp. mem[__THREW__.val] is a setjmp ID that is
+/// second argument to longjmp. mem[%__THREW__.val] is a setjmp ID that is
/// stored in saveSetjmp. testSetjmp returns a setjmp label, a unique ID to
/// each setjmp callsite. Label 0 means this longjmp buffer does not
/// correspond to one of the setjmp callsites in this function, so in this
@@ -227,6 +216,7 @@
class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
bool EnableEH; // Enable exception handling
bool EnableSjLj; // Enable setjmp/longjmp handling
+ bool DoSjLj; // Whether we actually perform setjmp/longjmp handling
GlobalVariable *ThrewGV = nullptr;
GlobalVariable *ThrewValueGV = nullptr;
@@ -245,6 +235,8 @@
StringMap<Function *> InvokeWrappers;
// Set of allowed function names for exception handling
std::set<std::string> EHAllowlistSet;
+  // Functions that contain calls to setjmp
+ SmallPtrSet<Function *, 8> SetjmpUsers;
StringRef getPassName() const override {
return "WebAssembly Lower Emscripten Exceptions";
@@ -263,6 +255,10 @@
bool areAllExceptionsAllowed() const { return EHAllowlistSet.empty(); }
bool canLongjmp(Module &M, const Value *Callee) const;
bool isEmAsmCall(Module &M, const Value *Callee) const;
+ bool supportsException(const Function *F) const {
+ return EnableEH && (areAllExceptionsAllowed() ||
+ EHAllowlistSet.count(std::string(F->getName())));
+ }
void rebuildSSA(Function &F);
@@ -298,7 +294,7 @@
return false;
StringRef Name = F->getName();
// leave setjmp and longjmp (mostly) alone, we process them properly later
- if (Name == "setjmp" || Name == "longjmp")
+ if (Name == "setjmp" || Name == "longjmp" || Name == "emscripten_longjmp")
return false;
return !F->doesNotThrow();
}
@@ -306,13 +302,12 @@
return true;
}
-// Get a global variable with the given name. If it doesn't exist declare it,
-// which will generate an import and asssumes that it will exist at link time.
-static GlobalVariable *getGlobalVariableI32(Module &M, IRBuilder<> &IRB,
- WebAssemblyTargetMachine &TM,
- const char *Name) {
- auto Int32Ty = IRB.getInt32Ty();
- auto *GV = dyn_cast<GlobalVariable>(M.getOrInsertGlobal(Name, Int32Ty));
+// Get a global variable with the given name. If it doesn't exist declare it,
+// which will generate an import and assume that it will exist at link time.
+static GlobalVariable *getGlobalVariable(Module &M, Type *Ty,
+ WebAssemblyTargetMachine &TM,
+ const char *Name) {
+ auto *GV = dyn_cast<GlobalVariable>(M.getOrInsertGlobal(Name, Ty));
if (!GV)
report_fatal_error(Twine("unable to create global: ") + Name);
@@ -368,6 +363,28 @@
return F;
}
+// Returns an integer type for the target architecture's address space.
+// i32 for wasm32 and i64 for wasm64.
+static Type *getAddrIntType(Module *M) {
+ IRBuilder<> IRB(M->getContext());
+ return IRB.getIntNTy(M->getDataLayout().getPointerSizeInBits());
+}
+
+// Returns an integer pointer type for the target architecture's address space.
+// i32* for wasm32 and i64* for wasm64.
+static Type *getAddrPtrType(Module *M) {
+ return Type::getIntNPtrTy(M->getContext(),
+ M->getDataLayout().getPointerSizeInBits());
+}
+
+// Returns an integer whose type is the integer type for the target's address
+// space. Returns (i32 C) for wasm32 and (i64 C) for wasm64, where C is the
+// integer.
+static Value *getAddrSizeInt(Module *M, uint64_t C) {
+ IRBuilder<> IRB(M->getContext());
+ return IRB.getIntN(M->getDataLayout().getPointerSizeInBits(), C);
+}
+
// Returns __cxa_find_matching_catch_N function, where N = NumClauses + 2.
// This is because a landingpad instruction contains two more arguments, a
// personality function and a cleanup bit, and __cxa_find_matching_catch_N
@@ -395,7 +412,8 @@
// Returns %__THREW__.val, which indicates whether an exception is thrown (or
// whether longjmp occurred), for future use.
Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) {
- LLVMContext &C = CI->getModule()->getContext();
+ Module *M = CI->getModule();
+ LLVMContext &C = M->getContext();
// If we are calling a function that is noreturn, we must remove that
// attribute. The code we insert here does expect it to return, after we
@@ -411,7 +429,7 @@
// Pre-invoke
// __THREW__ = 0;
- IRB.CreateStore(IRB.getInt32(0), ThrewGV);
+ IRB.CreateStore(getAddrSizeInt(M, 0), ThrewGV);
// Invoke function wrapper in JavaScript
SmallVector<Value *, 16> Args;
@@ -459,8 +477,8 @@
// Post-invoke
// %__THREW__.val = __THREW__; __THREW__ = 0;
Value *Threw =
- IRB.CreateLoad(IRB.getInt32Ty(), ThrewGV, ThrewGV->getName() + ".val");
- IRB.CreateStore(IRB.getInt32(0), ThrewGV);
+ IRB.CreateLoad(getAddrIntType(M), ThrewGV, ThrewGV->getName() + ".val");
+ IRB.CreateStore(getAddrSizeInt(M, 0), ThrewGV);
return Threw;
}
@@ -505,7 +523,7 @@
if (CalleeName == "setjmp" || CalleeName == "malloc" || CalleeName == "free")
return false;
- // There are functions in JS glue code
+ // There are functions in Emscripten's JS glue code or compiler-rt
if (CalleeName == "__resumeException" || CalleeName == "llvm_eh_typeid_for" ||
CalleeName == "saveSetjmp" || CalleeName == "testSetjmp" ||
CalleeName == "getTempRet0" || CalleeName == "setTempRet0")
@@ -538,11 +556,12 @@
// Generate testSetjmp function call sequence with preamble and postamble.
// The code this generates is equivalent to the following JavaScript code:
-// if (%__THREW__.val != 0 & threwValue != 0) {
+// %__threwValue.val = __threwValue;
+// if (%__THREW__.val != 0 & %__threwValue.val != 0) {
// %label = _testSetjmp(mem[%__THREW__.val], setjmpTable, setjmpTableSize);
// if (%label == 0)
-// emscripten_longjmp(%__THREW__.val, threwValue);
-// setTempRet0(threwValue);
+// emscripten_longjmp(%__THREW__.val, %__threwValue.val);
+// setTempRet0(%__threwValue.val);
// } else {
// %label = -1;
// }
@@ -555,16 +574,17 @@
Value *SetjmpTableSize, Value *&Label, Value *&LongjmpResult,
BasicBlock *&EndBB) {
Function *F = BB->getParent();
- LLVMContext &C = BB->getModule()->getContext();
+ Module *M = F->getParent();
+ LLVMContext &C = M->getContext();
IRBuilder<> IRB(C);
IRB.SetCurrentDebugLocation(DL);
- // if (%__THREW__.val != 0 & threwValue != 0)
+ // if (%__THREW__.val != 0 & %__threwValue.val != 0)
IRB.SetInsertPoint(BB);
BasicBlock *ThenBB1 = BasicBlock::Create(C, "if.then1", F);
BasicBlock *ElseBB1 = BasicBlock::Create(C, "if.else1", F);
BasicBlock *EndBB1 = BasicBlock::Create(C, "if.end", F);
- Value *ThrewCmp = IRB.CreateICmpNE(Threw, IRB.getInt32(0));
+ Value *ThrewCmp = IRB.CreateICmpNE(Threw, getAddrSizeInt(M, 0));
Value *ThrewValue = IRB.CreateLoad(IRB.getInt32Ty(), ThrewValueGV,
ThrewValueGV->getName() + ".val");
Value *ThrewValueCmp = IRB.CreateICmpNE(ThrewValue, IRB.getInt32(0));
@@ -576,21 +596,21 @@
IRB.SetInsertPoint(ThenBB1);
BasicBlock *ThenBB2 = BasicBlock::Create(C, "if.then2", F);
BasicBlock *EndBB2 = BasicBlock::Create(C, "if.end2", F);
- Value *ThrewInt = IRB.CreateIntToPtr(Threw, Type::getInt32PtrTy(C),
- Threw->getName() + ".i32p");
- Value *LoadedThrew = IRB.CreateLoad(IRB.getInt32Ty(), ThrewInt,
- ThrewInt->getName() + ".loaded");
+ Value *ThrewPtr =
+ IRB.CreateIntToPtr(Threw, getAddrPtrType(M), Threw->getName() + ".p");
+ Value *LoadedThrew = IRB.CreateLoad(getAddrIntType(M), ThrewPtr,
+ ThrewPtr->getName() + ".loaded");
Value *ThenLabel = IRB.CreateCall(
TestSetjmpF, {LoadedThrew, SetjmpTable, SetjmpTableSize}, "label");
Value *Cmp2 = IRB.CreateICmpEQ(ThenLabel, IRB.getInt32(0));
IRB.CreateCondBr(Cmp2, ThenBB2, EndBB2);
- // emscripten_longjmp(%__THREW__.val, threwValue);
+ // emscripten_longjmp(%__THREW__.val, %__threwValue.val);
IRB.SetInsertPoint(ThenBB2);
IRB.CreateCall(EmLongjmpF, {Threw, ThrewValue});
IRB.CreateUnreachable();
- // setTempRet0(threwValue);
+ // setTempRet0(%__threwValue.val);
IRB.SetInsertPoint(EndBB2);
IRB.CreateCall(SetTempRet0Func, ThrewValue);
IRB.CreateBr(EndBB1);
@@ -636,11 +656,12 @@
}
// Replace uses of longjmp with emscripten_longjmp. emscripten_longjmp takes
-// arguments of type {i32, i32} and longjmp takes {jmp_buf*, i32}, so we need a
-// ptrtoint instruction here to make the type match. jmp_buf* will eventually be
-// lowered to i32 in the wasm backend.
+// arguments of type {i32, i32} (wasm32) / {i64, i32} (wasm64) and longjmp takes
+// {jmp_buf*, i32}, so we need a ptrtoint instruction here to make the type
+// match. jmp_buf* will eventually be lowered to i32 (wasm32) / i64 (wasm64)
+// in the wasm backend.
static void replaceLongjmpWithEmscriptenLongjmp(Function *LongjmpF,
Function *EmLongjmpF) {
+ Module *M = LongjmpF->getParent();
SmallVector<CallInst *, 8> ToErase;
LLVMContext &C = LongjmpF->getParent()->getContext();
IRBuilder<> IRB(C);
@@ -652,7 +673,7 @@
if (CI && CI->getCalledFunction() == LongjmpF) {
IRB.SetInsertPoint(CI);
Value *Jmpbuf =
- IRB.CreatePtrToInt(CI->getArgOperand(0), IRB.getInt32Ty(), "jmpbuf");
+ IRB.CreatePtrToInt(CI->getArgOperand(0), getAddrIntType(M), "jmpbuf");
IRB.CreateCall(EmLongjmpF, {Jmpbuf, CI->getArgOperand(1)});
ToErase.push_back(CI);
}
@@ -679,21 +700,21 @@
Function *LongjmpF = M.getFunction("longjmp");
bool SetjmpUsed = SetjmpF && !SetjmpF->use_empty();
bool LongjmpUsed = LongjmpF && !LongjmpF->use_empty();
- bool DoSjLj = EnableSjLj && (SetjmpUsed || LongjmpUsed);
-
- if ((EnableEH || DoSjLj) &&
- Triple(M.getTargetTriple()).getArch() == Triple::wasm64)
- report_fatal_error("Emscripten EH/SjLj is not supported with wasm64 yet");
+ DoSjLj = EnableSjLj && (SetjmpUsed || LongjmpUsed);
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
assert(TPC && "Expected a TargetPassConfig");
auto &TM = TPC->getTM<WebAssemblyTargetMachine>();
+ if (EnableEH && TM.Options.ExceptionModel == ExceptionHandling::Wasm)
+ report_fatal_error("-exception-model=wasm not allowed with "
+ "-enable-emscripten-cxx-exceptions");
+
// Declare (or get) global variables __THREW__, __threwValue, and
// getTempRet0/setTempRet0 function which are used in common for both
// exception handling and setjmp/longjmp handling
- ThrewGV = getGlobalVariableI32(M, IRB, TM, "__THREW__");
- ThrewValueGV = getGlobalVariableI32(M, IRB, TM, "__threwValue");
+ ThrewGV = getGlobalVariable(M, getAddrIntType(&M), TM, "__THREW__");
+ ThrewValueGV = getGlobalVariable(M, IRB.getInt32Ty(), TM, "__threwValue");
GetTempRet0Func = getEmscriptenFunction(
FunctionType::get(IRB.getInt32Ty(), false), "getTempRet0", &M);
SetTempRet0Func = getEmscriptenFunction(
@@ -704,7 +725,7 @@
bool Changed = false;
- // Exception handling
+ // Function registration for exception handling
if (EnableEH) {
// Register __resumeException function
FunctionType *ResumeFTy =
@@ -715,26 +736,15 @@
FunctionType *EHTypeIDTy =
FunctionType::get(IRB.getInt32Ty(), IRB.getInt8PtrTy(), false);
EHTypeIDF = getEmscriptenFunction(EHTypeIDTy, "llvm_eh_typeid_for", &M);
-
- for (Function &F : M) {
- if (F.isDeclaration())
- continue;
- Changed |= runEHOnFunction(F);
- }
}
- // Setjmp/longjmp handling
+ // Function registration and data pre-gathering for setjmp/longjmp handling
if (DoSjLj) {
- Changed = true; // We have setjmp or longjmp somewhere
-
// Register emscripten_longjmp function
FunctionType *FTy = FunctionType::get(
- IRB.getVoidTy(), {IRB.getInt32Ty(), IRB.getInt32Ty()}, false);
+ IRB.getVoidTy(), {getAddrIntType(&M), IRB.getInt32Ty()}, false);
EmLongjmpF = getEmscriptenFunction(FTy, "emscripten_longjmp", &M);
- if (LongjmpF)
- replaceLongjmpWithEmscriptenLongjmp(LongjmpF, EmLongjmpF);
-
if (SetjmpF) {
// Register saveSetjmp function
FunctionType *SetjmpFTy = SetjmpF->getFunctionType();
@@ -747,19 +757,37 @@
// Register testSetjmp function
FTy = FunctionType::get(
IRB.getInt32Ty(),
- {IRB.getInt32Ty(), Type::getInt32PtrTy(C), IRB.getInt32Ty()}, false);
+ {getAddrIntType(&M), Type::getInt32PtrTy(C), IRB.getInt32Ty()},
+ false);
TestSetjmpF = getEmscriptenFunction(FTy, "testSetjmp", &M);
- // Only traverse functions that uses setjmp in order not to insert
- // unnecessary prep / cleanup code in every function
- SmallPtrSet<Function *, 8> SetjmpUsers;
+ // Precompute setjmp users
for (User *U : SetjmpF->users()) {
auto *UI = cast<Instruction>(U);
SetjmpUsers.insert(UI->getFunction());
}
+ }
+ }
+
+ // Exception handling transformation
+ if (EnableEH) {
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+ Changed |= runEHOnFunction(F);
+ }
+ }
+
+ // Setjmp/longjmp handling transformation
+ if (DoSjLj) {
+ Changed = true; // We have setjmp or longjmp somewhere
+ if (LongjmpF)
+ replaceLongjmpWithEmscriptenLongjmp(LongjmpF, EmLongjmpF);
+ // Only traverse functions that use setjmp in order not to insert
+ // unnecessary prep / cleanup code in every function
+ if (SetjmpF)
for (Function *F : SetjmpUsers)
runSjLjOnFunction(*F);
- }
}
if (!Changed) {
@@ -787,8 +815,6 @@
bool Changed = false;
SmallVector<Instruction *, 64> ToErase;
SmallPtrSet<LandingPadInst *, 32> LandingPads;
- bool AllowExceptions = areAllExceptionsAllowed() ||
- EHAllowlistSet.count(std::string(F.getName()));
for (BasicBlock &BB : F) {
auto *II = dyn_cast<InvokeInst>(BB.getTerminator());
@@ -798,14 +824,53 @@
LandingPads.insert(II->getLandingPadInst());
IRB.SetInsertPoint(II);
- bool NeedInvoke = AllowExceptions && canThrow(II->getCalledOperand());
+ const Value *Callee = II->getCalledOperand();
+ bool NeedInvoke = supportsException(&F) && canThrow(Callee);
if (NeedInvoke) {
// Wrap invoke with invoke wrapper and generate preamble/postamble
Value *Threw = wrapInvoke(II);
ToErase.push_back(II);
+ // If setjmp/longjmp handling is enabled, the thrown value may be a longjmp
+ // rather than an exception. If the current function contains calls to
+ // setjmp, it will be appropriately handled in runSjLjOnFunction. But even
+ // if the function does not contain setjmp calls, we shouldn't silently
+ // ignore longjmps; we should rethrow them so they can be correctly
+ // handled somewhere up the call chain where the setjmp is.
+ // __THREW__'s value is 0 when nothing happened, 1 when an exception is
+ // thrown, other values when longjmp is thrown.
+ //
+ // if (%__THREW__.val == 0 || %__THREW__.val == 1)
+ // goto %tail
+ // else
+ // goto %longjmp.rethrow
+ //
+ // longjmp.rethrow: ;; This is longjmp. Rethrow it
+ // %__threwValue.val = __threwValue
+ // emscripten_longjmp(%__THREW__.val, %__threwValue.val);
+ //
+ // tail: ;; Nothing happened or an exception is thrown
+ // ... Continue exception handling ...
+ if (DoSjLj && !SetjmpUsers.count(&F) && canLongjmp(M, Callee)) {
+ BasicBlock *Tail = BasicBlock::Create(C, "tail", &F);
+ BasicBlock *RethrowBB = BasicBlock::Create(C, "longjmp.rethrow", &F);
+ Value *CmpEqOne =
+ IRB.CreateICmpEQ(Threw, getAddrSizeInt(&M, 1), "cmp.eq.one");
+ Value *CmpEqZero =
+ IRB.CreateICmpEQ(Threw, getAddrSizeInt(&M, 0), "cmp.eq.zero");
+ Value *Or = IRB.CreateOr(CmpEqZero, CmpEqOne, "or");
+ IRB.CreateCondBr(Or, Tail, RethrowBB);
+ IRB.SetInsertPoint(RethrowBB);
+ Value *ThrewValue = IRB.CreateLoad(IRB.getInt32Ty(), ThrewValueGV,
+ ThrewValueGV->getName() + ".val");
+ IRB.CreateCall(EmLongjmpF, {Threw, ThrewValue});
+
+ IRB.CreateUnreachable();
+ IRB.SetInsertPoint(Tail);
+ }
+
// Insert a branch based on __THREW__ variable
- Value *Cmp = IRB.CreateICmpEQ(Threw, IRB.getInt32(1), "cmp");
+ Value *Cmp = IRB.CreateICmpEQ(Threw, getAddrSizeInt(&M, 1), "cmp");
IRB.CreateCondBr(Cmp, II->getUnwindDest(), II->getNormalDest());
} else {
@@ -1064,12 +1129,15 @@
// __THREW__ = 0;
for (auto I = std::next(BasicBlock::iterator(ThrewLI)), IE = BB->end();
I != IE; ++I) {
- if (auto *SI = dyn_cast<StoreInst>(I))
- if (auto *GV = dyn_cast<GlobalVariable>(SI->getPointerOperand()))
- if (GV == ThrewGV && SI->getValueOperand() == IRB.getInt32(0)) {
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (auto *GV = dyn_cast<GlobalVariable>(SI->getPointerOperand())) {
+ if (GV == ThrewGV &&
+ SI->getValueOperand() == getAddrSizeInt(&M, 0)) {
ThrewResetSI = SI;
break;
}
+ }
+ }
}
assert(Threw && ThrewLI && "Cannot find __THREW__ load after invoke");
assert(ThrewResetSI && "Cannot find __THREW__ store after invoke");
@@ -1080,6 +1148,46 @@
Threw = wrapInvoke(CI);
ToErase.push_back(CI);
Tail = SplitBlock(BB, CI->getNextNode());
+
+ // If exception handling is enabled, the thrown value may be an exception
+ // rather than a longjmp, in which case we shouldn't silently ignore
+ // exceptions; we should rethrow them.
+ // __THREW__'s value is 0 when nothing happened, 1 when an exception is
+ // thrown, other values when longjmp is thrown.
+ //
+ // if (%__THREW__.val == 1)
+ // goto %eh.rethrow
+ // else
+ // goto %normal
+ //
+ // eh.rethrow: ;; Rethrow exception
+ // %exn = call @__cxa_find_matching_catch_2() ;; Retrieve thrown ptr
+ // __resumeException(%exn)
+ //
+ // normal:
+ // <-- Insertion point. Will insert sjlj handling code from here
+ // goto %tail
+ //
+ // tail:
+ // ...
+ if (supportsException(&F) && canThrow(Callee)) {
+ IRB.SetInsertPoint(CI);
+ // We will add a new conditional branch. So remove the branch created
+ // when we split the BB
+ ToErase.push_back(BB->getTerminator());
+ BasicBlock *NormalBB = BasicBlock::Create(C, "normal", &F);
+ BasicBlock *RethrowBB = BasicBlock::Create(C, "eh.rethrow", &F);
+ Value *CmpEqOne =
+ IRB.CreateICmpEQ(Threw, getAddrSizeInt(&M, 1), "cmp.eq.one");
+ IRB.CreateCondBr(CmpEqOne, RethrowBB, NormalBB);
+ IRB.SetInsertPoint(RethrowBB);
+ CallInst *Exn = IRB.CreateCall(getFindMatchingCatch(M, 0), {}, "exn");
+ IRB.CreateCall(ResumeF, {Exn});
+ IRB.CreateUnreachable();
+ IRB.SetInsertPoint(NormalBB);
+ IRB.CreateBr(Tail);
+ BB = NormalBB; // New insertion point to insert testSetjmp()
+ }
}
// We need to replace the terminator in Tail - SplitBlock makes BB go
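The hunks above rely on the __THREW__ convention spelled out in their comments: 0 means nothing happened, 1 means a C++ exception was thrown, and any other value is a pending longjmp (the address loaded from the setjmp table). A minimal standalone sketch of that three-way classification, assuming only the convention stated in those comments (the helper name is illustrative, not part of the patch):

#include <cstdint>

enum class ThrewKind { None, Exception, Longjmp };

// Classify a value read from __THREW__ per the convention described above.
static ThrewKind classifyThrew(uint64_t ThrewVal) {
  if (ThrewVal == 0)
    return ThrewKind::None;      // nothing happened
  if (ThrewVal == 1)
    return ThrewKind::Exception; // a C++ exception was thrown
  return ThrewKind::Longjmp;     // any other value: a longjmp is pending
}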
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 86d59ef..ec2380a 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -13,11 +13,11 @@
//===----------------------------------------------------------------------===//
#include "WebAssemblyMCInstLower.h"
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "TargetInfo/WebAssemblyTargetInfo.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssemblyAsmPrinter.h"
#include "WebAssemblyMachineFunctionInfo.h"
-#include "WebAssemblyRuntimeLibcallSignatures.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Constants.h"
@@ -46,8 +46,28 @@
MCSymbol *
WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
const GlobalValue *Global = MO.getGlobal();
- if (!isa<Function>(Global))
- return cast<MCSymbolWasm>(Printer.getSymbol(Global));
+ if (!isa<Function>(Global)) {
+ auto *WasmSym = cast<MCSymbolWasm>(Printer.getSymbol(Global));
+ // If the symbol doesn't have an explicit WasmSymbolType yet and the
+ // GlobalValue is actually a WebAssembly global, then ensure the symbol is a
+ // WASM_SYMBOL_TYPE_GLOBAL.
+ if (WebAssembly::isWasmVarAddressSpace(Global->getAddressSpace()) &&
+ !WasmSym->getType()) {
+ const MachineFunction &MF = *MO.getParent()->getParent()->getParent();
+ const TargetMachine &TM = MF.getTarget();
+ const Function &CurrentFunc = MF.getFunction();
+ SmallVector<MVT, 1> VTs;
+ computeLegalValueVTs(CurrentFunc, TM, Global->getValueType(), VTs);
+ if (VTs.size() != 1)
+ report_fatal_error("Aggregate globals not yet implemented");
+
+ bool Mutable = true;
+ wasm::ValType Type = WebAssembly::toValType(VTs[0]);
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+ WasmSym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), Mutable});
+ }
+ return WasmSym;
+ }
const auto *FuncTy = cast<FunctionType>(Global->getValueType());
const MachineFunction &MF = *MO.getParent()->getParent()->getParent();
@@ -71,58 +91,7 @@
MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
const MachineOperand &MO) const {
- const char *Name = MO.getSymbolName();
- auto *WasmSym = cast<MCSymbolWasm>(Printer.GetExternalSymbolSymbol(Name));
- const WebAssemblySubtarget &Subtarget = Printer.getSubtarget();
-
- // Except for certain known symbols, all symbols used by CodeGen are
- // functions. It's OK to hardcode knowledge of specific symbols here; this
- // method is precisely there for fetching the signatures of known
- // Clang-provided symbols.
- if (strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0 ||
- strcmp(Name, "__memory_base") == 0 || strcmp(Name, "__table_base") == 0 ||
- strcmp(Name, "__tls_size") == 0 || strcmp(Name, "__tls_align") == 0) {
- bool Mutable =
- strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0;
- WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
- WasmSym->setGlobalType(wasm::WasmGlobalType{
- uint8_t(Subtarget.hasAddr64() && strcmp(Name, "__table_base") != 0
- ? wasm::WASM_TYPE_I64
- : wasm::WASM_TYPE_I32),
- Mutable});
- return WasmSym;
- }
-
- SmallVector<wasm::ValType, 4> Returns;
- SmallVector<wasm::ValType, 4> Params;
- if (strcmp(Name, "__cpp_exception") == 0) {
- WasmSym->setType(wasm::WASM_SYMBOL_TYPE_EVENT);
- // We can't confirm its signature index for now because there can be
- // imported exceptions. Set it to be 0 for now.
- WasmSym->setEventType(
- {wasm::WASM_EVENT_ATTRIBUTE_EXCEPTION, /* SigIndex */ 0});
- // We may have multiple C++ compilation units to be linked together, each of
- // which defines the exception symbol. To resolve them, we declare them as
- // weak.
- WasmSym->setWeak(true);
- WasmSym->setExternal(true);
-
- // All C++ exceptions are assumed to have a single i32 (for wasm32) or i64
- // (for wasm64) param type and void return type. The reaon is, all C++
- // exception values are pointers, and to share the type section with
- // functions, exceptions are assumed to have void return type.
- Params.push_back(Subtarget.hasAddr64() ? wasm::ValType::I64
- : wasm::ValType::I32);
- } else { // Function symbols
- WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
- getLibcallSignature(Subtarget, Name, Returns, Params);
- }
- auto Signature =
- std::make_unique<wasm::WasmSignature>(std::move(Returns), std::move(Params));
- WasmSym->setSignature(Signature.get());
- Printer.addSignature(std::move(Signature));
-
- return WasmSym;
+ return Printer.getOrCreateWasmSymbol(MO.getSymbolName());
}
MCOperand WebAssemblyMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
@@ -159,8 +128,10 @@
report_fatal_error("Function addresses with offsets not supported");
if (WasmSym->isGlobal())
report_fatal_error("Global indexes with offsets not supported");
- if (WasmSym->isEvent())
- report_fatal_error("Event indexes with offsets not supported");
+ if (WasmSym->isTag())
+ report_fatal_error("Tag indexes with offsets not supported");
+ if (WasmSym->isTable())
+ report_fatal_error("Table indexes with offsets not supported");
Expr = MCBinaryExpr::createAdd(
Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
@@ -196,6 +167,10 @@
return wasm::ValType::F64;
if (RC == &WebAssembly::V128RegClass)
return wasm::ValType::V128;
+ if (RC == &WebAssembly::EXTERNREFRegClass)
+ return wasm::ValType::EXTERNREF;
+ if (RC == &WebAssembly::FUNCREFRegClass)
+ return wasm::ValType::FUNCREF;
llvm_unreachable("Unexpected register class");
}
@@ -285,13 +260,13 @@
break;
}
case MachineOperand::MO_FPImmediate: {
- // TODO: MC converts all floating point immediate operands to double.
- // This is fine for numeric values, but may cause NaNs to change bits.
const ConstantFP *Imm = MO.getFPImm();
+ const uint64_t BitPattern =
+ Imm->getValueAPF().bitcastToAPInt().getZExtValue();
if (Imm->getType()->isFloatTy())
- MCOp = MCOperand::createFPImm(Imm->getValueAPF().convertToFloat());
+ MCOp = MCOperand::createSFPImm(static_cast<uint32_t>(BitPattern));
else if (Imm->getType()->isDoubleTy())
- MCOp = MCOperand::createFPImm(Imm->getValueAPF().convertToDouble());
+ MCOp = MCOperand::createDFPImm(BitPattern);
else
llvm_unreachable("unknown floating point immediate type");
break;
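The MO_FPImmediate hunk above stops converting the APFloat to a host float/double and instead emits the raw IEEE-754 bit pattern via createSFPImm/createDFPImm, so NaN payload bits survive lowering. A small sketch, independent of the patch, of why the bit pattern is the safer representation (the helper names are illustrative):

#include <cstdint>
#include <cstring>

// Reinterpret a float's storage as its IEEE-754 bit pattern; memcpy avoids
// undefined behavior from pointer-based type punning.
static uint32_t floatBits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

// Ordinary values survive a float -> double -> float round trip, but a NaN
// (a signaling NaN in particular) may come back with different payload bits,
// which is exactly what storing the pattern itself avoids.
static bool bitsSurviveRoundTrip(float F) {
  double D = static_cast<double>(F);
  return floatBits(static_cast<float>(D)) == floatBits(F);
}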
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMCLowerPrePass.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMCLowerPrePass.cpp
new file mode 100644
index 0000000..3daffd1
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMCLowerPrePass.cpp
@@ -0,0 +1,86 @@
+//===-- WebAssemblyMCLowerPrePass.cpp - Prepare for MC lower --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Some information in MC lowering / asm printing gets generated as
+/// instructions get emitted, but may be necessary at the start, such as for
+/// .globaltype declarations. This pass collects this information.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyUtilities.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-mclower-prepass"
+
+namespace {
+class WebAssemblyMCLowerPrePass final : public MachineFunctionPass {
+ StringRef getPassName() const override {
+ return "WebAssembly MC Lower Pre Pass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyMCLowerPrePass() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyMCLowerPrePass::ID = 0;
+INITIALIZE_PASS(
+ WebAssemblyMCLowerPrePass, DEBUG_TYPE,
+ "Collects information ahead of time for MC lowering",
+ false, false)
+
+FunctionPass *llvm::createWebAssemblyMCLowerPrePass() {
+ return new WebAssemblyMCLowerPrePass();
+}
+
+bool WebAssemblyMCLowerPrePass::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** MC Lower Pre Pass **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ MachineModuleInfo &MMI = MF.getMMI();
+ MachineModuleInfoWasm &MMIW = MMI.getObjFileInfo<MachineModuleInfoWasm>();
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto &MI : MBB) {
+ // FIXME: what else should be filtered out beyond these?
+ if (MI.isDebugInstr() || MI.isInlineAsm())
+ continue;
+ for (MachineOperand &MO : MI.uses()) {
+ if (MO.isSymbol()) {
+ MMIW.MachineSymbolsUsed.insert(MO.getSymbolName());
+ }
+ }
+ }
+ }
+
+ return true;
+}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index adee2f0..00b1132 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -13,9 +13,12 @@
//===----------------------------------------------------------------------===//
#include "WebAssemblyMachineFunctionInfo.h"
+#include "MCTargetDesc/WebAssemblyInstPrinter.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
#include "WebAssemblyISelLowering.h"
#include "WebAssemblySubtarget.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -104,7 +107,32 @@
yaml::WebAssemblyFunctionInfo::WebAssemblyFunctionInfo(
const llvm::WebAssemblyFunctionInfo &MFI)
- : CFGStackified(MFI.isCFGStackified()) {}
+ : CFGStackified(MFI.isCFGStackified()) {
+ auto *EHInfo = MFI.getWasmEHFuncInfo();
+ const llvm::MachineFunction &MF = MFI.getMachineFunction();
+
+ for (auto VT : MFI.getParams())
+ Params.push_back(EVT(VT).getEVTString());
+ for (auto VT : MFI.getResults())
+ Results.push_back(EVT(VT).getEVTString());
+
+ // MFI.getWasmEHFuncInfo() is non-null only for functions with the
+ // personality function.
+ if (EHInfo) {
+ // SrcToUnwindDest can contain stale mappings when BBs are removed by
+ // optimizations, for example because they are unreachable. We should not
+ // include their info.
+ SmallPtrSet<const MachineBasicBlock *, 16> MBBs;
+ for (const auto &MBB : MF)
+ MBBs.insert(&MBB);
+ for (auto KV : EHInfo->SrcToUnwindDest) {
+ auto *SrcBB = KV.first.get<MachineBasicBlock *>();
+ auto *DestBB = KV.second.get<MachineBasicBlock *>();
+ if (MBBs.count(SrcBB) && MBBs.count(DestBB))
+ SrcToUnwindDest[SrcBB->getNumber()] = DestBB->getNumber();
+ }
+ }
+}
void yaml::WebAssemblyFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
MappingTraits<WebAssemblyFunctionInfo>::mapping(YamlIO, *this);
@@ -113,4 +141,13 @@
void WebAssemblyFunctionInfo::initializeBaseYamlFields(
const yaml::WebAssemblyFunctionInfo &YamlMFI) {
CFGStackified = YamlMFI.CFGStackified;
+ for (auto VT : YamlMFI.Params)
+ addParam(WebAssembly::parseMVT(VT.Value));
+ for (auto VT : YamlMFI.Results)
+ addResult(WebAssembly::parseMVT(VT.Value));
+ if (WasmEHInfo) {
+ for (auto KV : YamlMFI.SrcToUnwindDest)
+ WasmEHInfo->setUnwindDest(MF.getBlockNumbered(KV.first),
+ MF.getBlockNumbered(KV.second));
+ }
}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index ca164fd..3fa2d0c 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -16,13 +16,14 @@
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
-#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCSymbolWasm.h"
namespace llvm {
+struct WasmEHFuncInfo;
+
namespace yaml {
struct WebAssemblyFunctionInfo;
}
@@ -30,6 +31,8 @@
/// This class is derived from MachineFunctionInfo and contains private
/// WebAssembly-specific information for each MachineFunction.
class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
+ const MachineFunction &MF;
+
std::vector<MVT> Params;
std::vector<MVT> Results;
std::vector<MVT> Locals;
@@ -63,9 +66,16 @@
// Function properties.
bool CFGStackified = false;
+ // Catchpad unwind destination info for wasm EH.
+ WasmEHFuncInfo *WasmEHInfo = nullptr;
+
public:
- explicit WebAssemblyFunctionInfo(MachineFunction &MF) {}
+ explicit WebAssemblyFunctionInfo(MachineFunction &MF)
+ : MF(MF), WasmEHInfo(MF.getWasmEHFuncInfo()) {}
~WebAssemblyFunctionInfo() override;
+
+ const MachineFunction &getMachineFunction() const { return MF; }
+
void initializeBaseYamlFields(const yaml::WebAssemblyFunctionInfo &YamlMFI);
void addParam(MVT VT) { Params.push_back(VT); }
@@ -151,6 +161,9 @@
bool isCFGStackified() const { return CFGStackified; }
void setCFGStackified(bool Value = true) { CFGStackified = Value; }
+
+ WasmEHFuncInfo *getWasmEHFuncInfo() const { return WasmEHInfo; }
+ void setWasmEHFuncInfo(WasmEHFuncInfo *Info) { WasmEHInfo = Info; }
};
void computeLegalValueVTs(const Function &F, const TargetMachine &TM, Type *Ty,
@@ -172,8 +185,15 @@
namespace yaml {
+using BBNumberMap = DenseMap<int, int>;
+
struct WebAssemblyFunctionInfo final : public yaml::MachineFunctionInfo {
+ std::vector<FlowStringValue> Params;
+ std::vector<FlowStringValue> Results;
bool CFGStackified = false;
+ // The same as WasmEHFuncInfo's SrcToUnwindDest, but stored as a mapping of
+ // BB numbers
+ BBNumberMap SrcToUnwindDest;
WebAssemblyFunctionInfo() = default;
WebAssemblyFunctionInfo(const llvm::WebAssemblyFunctionInfo &MFI);
@@ -184,7 +204,23 @@
template <> struct MappingTraits<WebAssemblyFunctionInfo> {
static void mapping(IO &YamlIO, WebAssemblyFunctionInfo &MFI) {
+ YamlIO.mapOptional("params", MFI.Params, std::vector<FlowStringValue>());
+ YamlIO.mapOptional("results", MFI.Results, std::vector<FlowStringValue>());
YamlIO.mapOptional("isCFGStackified", MFI.CFGStackified, false);
+ YamlIO.mapOptional("wasmEHFuncInfo", MFI.SrcToUnwindDest);
+ }
+};
+
+template <> struct CustomMappingTraits<BBNumberMap> {
+ static void inputOne(IO &YamlIO, StringRef Key,
+ BBNumberMap &SrcToUnwindDest) {
+ YamlIO.mapRequired(Key.str().c_str(),
+ SrcToUnwindDest[std::atoi(Key.str().c_str())]);
+ }
+
+ static void output(IO &YamlIO, BBNumberMap &SrcToUnwindDest) {
+ for (auto KV : SrcToUnwindDest)
+ YamlIO.mapRequired(std::to_string(KV.first).c_str(), KV.second);
}
};
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyNullifyDebugValueLists.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyNullifyDebugValueLists.cpp
new file mode 100644
index 0000000..62fa089
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyNullifyDebugValueLists.cpp
@@ -0,0 +1,68 @@
+//=== WebAssemblyNullifyDebugValueLists.cpp - Nullify DBG_VALUE_LISTs ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Nullify DBG_VALUE_LIST instructions as a temporary measure before we
+/// implement DBG_VALUE_LIST handling in WebAssemblyDebugValueManager.
+/// See https://bugs.llvm.org/show_bug.cgi?id=50361.
+/// TODO Correctly handle DBG_VALUE_LISTs
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "WebAssemblySubtarget.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-nullify-dbg-value-lists"
+
+namespace {
+class WebAssemblyNullifyDebugValueLists final : public MachineFunctionPass {
+ StringRef getPassName() const override {
+ return "WebAssembly Nullify DBG_VALUE_LISTs";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyNullifyDebugValueLists() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyNullifyDebugValueLists::ID = 0;
+INITIALIZE_PASS(WebAssemblyNullifyDebugValueLists, DEBUG_TYPE,
+ "WebAssembly Nullify DBG_VALUE_LISTs", false, false)
+
+FunctionPass *llvm::createWebAssemblyNullifyDebugValueLists() {
+ return new WebAssemblyNullifyDebugValueLists();
+}
+
+bool WebAssemblyNullifyDebugValueLists::runOnMachineFunction(
+ MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** Nullify DBG_VALUE_LISTs **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ SmallVector<MachineInstr *, 2> DbgValueLists;
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ if (MI.getOpcode() == TargetOpcode::DBG_VALUE_LIST)
+ DbgValueLists.push_back(&MI);
+
+ // Our backend, including WebAssemblyDebugValueManager, currently cannot
+ // handle DBG_VALUE_LISTs correctly. So this converts DBG_VALUE_LISTs to
+ // "DBG_VALUE $noreg", which will appear as "optimized out".
+ for (auto *DVL : DbgValueLists) {
+ BuildMI(*DVL->getParent(), DVL, DVL->getDebugLoc(),
+ TII.get(TargetOpcode::DBG_VALUE), false, Register(),
+ DVL->getOperand(0).getMetadata(), DVL->getOperand(1).getMetadata());
+ DVL->eraseFromParent();
+ }
+
+ return !DbgValueLists.empty();
+}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
index 799b938..ed5f7cc 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -19,10 +19,10 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
-#include "WebAssemblyUtilities.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index b655014..c73b8a2 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -13,10 +13,10 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
-#include "WebAssemblyUtilities.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index d474b9a..d6adc2f 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -20,11 +20,11 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_*
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyDebugValueManager.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
-#include "WebAssemblyUtilities.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -121,14 +121,9 @@
Type::getDoubleTy(MF.getFunction().getContext())));
MI->addOperand(MachineOperand::CreateFPImm(Val));
} else if (RegClass == &WebAssembly::V128RegClass) {
- // TODO: Replace this with v128.const 0 once that is supported in V8
- Register TempReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- MI->setDesc(TII->get(WebAssembly::SPLAT_I32x4));
- MI->addOperand(MachineOperand::CreateReg(TempReg, false));
- MachineInstr *Const = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(WebAssembly::CONST_I32), TempReg)
- .addImm(0);
- LIS.InsertMachineInstrInMaps(*Const);
+ MI->setDesc(TII->get(WebAssembly::CONST_V128_I64x2));
+ MI->addOperand(MachineOperand::CreateImm(0));
+ MI->addOperand(MachineOperand::CreateImm(0));
} else {
llvm_unreachable("Unexpected reg class");
}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 6456026..388c0f9 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -889,14 +889,15 @@
// TODO: If the RTLIB::Libcall-taking flavor of GetSignature remains unused
// other than here, just roll its logic into this version.
void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
- const char *Name,
+ StringRef Name,
SmallVectorImpl<wasm::ValType> &Rets,
SmallVectorImpl<wasm::ValType> &Params) {
auto &Map = LibcallNameMap->Map;
auto Val = Map.find(Name);
#ifndef NDEBUG
if (Val == Map.end()) {
- auto message = std::string("unexpected runtime library name: ") + Name;
+ auto message = std::string("unexpected runtime library name: ") +
+ std::string(Name);
llvm_unreachable(message.c_str());
}
#endif
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
index 6ae8aaa..f7a94aa 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
@@ -28,7 +28,7 @@
SmallVectorImpl<wasm::ValType> &Params);
extern void getLibcallSignature(const WebAssemblySubtarget &Subtarget,
- const char *Name,
+ StringRef Name,
SmallVectorImpl<wasm::ValType> &Rets,
SmallVectorImpl<wasm::ValType> &Params);
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 135055a..746a759 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -77,6 +77,7 @@
initializeWebAssemblyMemIntrinsicResultsPass(PR);
initializeWebAssemblyRegStackifyPass(PR);
initializeWebAssemblyRegColoringPass(PR);
+ initializeWebAssemblyNullifyDebugValueListsPass(PR);
initializeWebAssemblyFixIrreducibleControlFlowPass(PR);
initializeWebAssemblyLateEHPreparePass(PR);
initializeWebAssemblyExceptionInfoPass(PR);
@@ -87,6 +88,7 @@
initializeWebAssemblyRegNumberingPass(PR);
initializeWebAssemblyDebugFixupPass(PR);
initializeWebAssemblyPeepholePass(PR);
+ initializeWebAssemblyMCLowerPrePassPass(PR);
}
//===----------------------------------------------------------------------===//
@@ -118,11 +120,17 @@
const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
const TargetOptions &Options, Optional<Reloc::Model> RM,
Optional<CodeModel::Model> CM, CodeGenOpt::Level OL, bool JIT)
- : LLVMTargetMachine(T,
- TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128"
- : "e-m:e-p:32:32-i64:64-n32:64-S128",
- TT, CPU, FS, Options, getEffectiveRelocModel(RM, TT),
- getEffectiveCodeModel(CM, CodeModel::Large), OL),
+ : LLVMTargetMachine(
+ T,
+ TT.isArch64Bit()
+ ? (TT.isOSEmscripten()
+ ? "e-m:e-p:64:64-i64:64-f128:64-n32:64-S128-ni:1:10:20"
+ : "e-m:e-p:64:64-i64:64-n32:64-S128-ni:1:10:20")
+ : (TT.isOSEmscripten()
+ ? "e-m:e-p:32:32-i64:64-f128:64-n32:64-S128-ni:1:10:20"
+ : "e-m:e-p:32:32-i64:64-n32:64-S128-ni:1:10:20"),
+ TT, CPU, FS, Options, getEffectiveRelocModel(RM, TT),
+ getEffectiveCodeModel(CM, CodeModel::Large), OL),
TLOF(new WebAssemblyTargetObjectFile()) {
// WebAssembly type-checks instructions, but a noreturn function with a return
// type that doesn't match the context will cause a check failure. So we lower
@@ -441,6 +449,9 @@
void WebAssemblyPassConfig::addPreEmitPass() {
TargetPassConfig::addPreEmitPass();
+ // Nullify DBG_VALUE_LISTs that we cannot handle.
+ addPass(createWebAssemblyNullifyDebugValueLists());
+
// Eliminate multiple-entry loops.
addPass(createWebAssemblyFixIrreducibleControlFlow());
@@ -502,6 +513,9 @@
// Fix debug_values whose defs have been stackified.
if (!WasmDisableExplicitLocals)
addPass(createWebAssemblyDebugFixup());
+
+ // Collect information to prepare for MC lowering / asm printing.
+ addPass(createWebAssemblyMCLowerPrePass());
}
yaml::MachineFunctionInfo *
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index be1cfba..d9bc7c6 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -36,22 +36,30 @@
return Result;
}
-unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) const {
- if (Vector && getST()->hasSIMD128())
- return 128;
+TypeSize WebAssemblyTTIImpl::getRegisterBitWidth(
+ TargetTransformInfo::RegisterKind K) const {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(64);
+ case TargetTransformInfo::RGK_FixedWidthVector:
+ return TypeSize::getFixed(getST()->hasSIMD128() ? 128 : 64);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(0);
+ }
- return 64;
+ llvm_unreachable("Unsupported register kind");
}
-unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
+InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
- TTI::OperandValueKind Opd1Info,
- TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
- unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
- Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
+ InstructionCost Cost =
+ BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
+ Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
switch (Opcode) {
@@ -74,9 +82,11 @@
return Cost;
}
-unsigned WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) {
- unsigned Cost = BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index);
+InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode,
+ Type *Val,
+ unsigned Index) {
+ InstructionCost Cost =
+ BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index);
// SIMD128's insert/extract currently only take constant indices.
if (Index == -1u)
@@ -102,3 +112,29 @@
return (CallerBits & CalleeBits) == CalleeBits;
}
+
+void WebAssemblyTTIImpl::getUnrollingPreferences(
+ Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) const {
+ // Scan the loop: don't unroll loops with calls. This is a standard approach
+ // for most (all?) targets.
+ for (BasicBlock *BB : L->blocks())
+ for (Instruction &I : *BB)
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ if (const Function *F = cast<CallBase>(I).getCalledFunction())
+ if (isLoweredToCall(F))
+ return;
+
+ // The chosen threshold is within the range of 'LoopMicroOpBufferSize' of
+ // the various microarchitectures that use the BasicTTI implementation and
+ // has been selected through heuristics across multiple cores and runtimes.
+ UP.Partial = UP.Runtime = UP.UpperBound = true;
+ UP.PartialThreshold = 30;
+
+ // Avoid unrolling when optimizing for size.
+ UP.OptSizeThreshold = 0;
+ UP.PartialOptSizeThreshold = 0;
+
+ // Set the number of instructions optimized away when a "back edge"
+ // becomes a "fall through" to the default value of 2.
+ UP.BEInsns = 2;
+}
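To illustrate the unrolling heuristic added above, here is a hypothetical pair of source loops (plain C++, not part of the patch): the call-free body is a candidate for partial/runtime unrolling under the PartialThreshold of 30, while the body containing a real call makes the scan return early and the loop is left untouched.

#include <cstdio>

// No calls in the body: eligible for partial/runtime unrolling.
int sumSquares(const int *A, int N) {
  int S = 0;
  for (int I = 0; I < N; ++I)
    S += A[I] * A[I];
  return S;
}

// printf lowers to a real call, so the scan above bails out and this loop
// keeps its original structure.
int sumLogged(const int *A, int N) {
  int S = 0;
  for (int I = 0; I < N; ++I) {
    std::printf("%d\n", A[I]);
    S += A[I];
  }
  return S;
}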
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 41e358c..1a33bd2 100644
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -48,14 +48,17 @@
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP) const;
+
/// @}
/// \name Vector TTI Implementations
/// @{
unsigned getNumberOfRegisters(unsigned ClassID) const;
- unsigned getRegisterBitWidth(bool Vector) const;
- unsigned getArithmeticInstrCost(
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
+ InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
@@ -64,7 +67,8 @@
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index);
/// @}
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h b/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
deleted file mode 100644
index 41ad786..0000000
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
+++ /dev/null
@@ -1,57 +0,0 @@
-//===-- WebAssemblyUtilities - WebAssembly Utility Functions ---*- C++ -*-====//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file contains the declaration of the WebAssembly-specific
-/// utility functions.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H
-#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H
-
-namespace llvm {
-
-class MachineBasicBlock;
-class MachineInstr;
-class MachineOperand;
-class MCContext;
-class MCSymbolWasm;
-class StringRef;
-class WebAssemblyFunctionInfo;
-
-namespace WebAssembly {
-
-bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
-bool mayThrow(const MachineInstr &MI);
-
-// Exception-related function names
-extern const char *const ClangCallTerminateFn;
-extern const char *const CxaBeginCatchFn;
-extern const char *const CxaRethrowFn;
-extern const char *const StdTerminateFn;
-extern const char *const PersonalityWrapperFn;
-
-/// Returns the operand number of a callee, assuming the argument is a call
-/// instruction.
-const MachineOperand &getCalleeOp(const MachineInstr &MI);
-
-/// Returns the operand number of a callee, assuming the argument is a call
-/// instruction.
-MCSymbolWasm *getOrCreateFunctionTableSymbol(MCContext &Ctx,
- const StringRef &Name);
-
-/// Find a catch instruction from an EH pad. Returns null if no catch
-/// instruction found or the catch is in an invalid location.
-MachineInstr *findCatch(MachineBasicBlock *EHPad);
-
-} // end namespace WebAssembly
-
-} // end namespace llvm
-
-#endif
diff --git a/src/llvm-project/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt b/src/llvm-project/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
deleted file mode 100644
index c9f7574..0000000
--- a/src/llvm-project/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ /dev/null
@@ -1,119 +0,0 @@
-# Tests which are known to fail from the GCC torture test suite.
-
-# Syntax: Each line has a single test to be marked as a 'known failure' (or
-# 'exclusion'. Known failures are expected to fail, and will cause an error if
-# they pass. (Known failures that do not run at all will not cause an
-# error). The format is
-# <name> <attributes> # comment
-
-# blockaddress without an indirectbr still can't be supported
-20071220-1.c O2 # Relocation against a BB address
-20071220-2.c
-990208-1.c
-label13.C O0
-label13a.C O0
-label3.C
-
-# WebAssembly hasn't implemented (will never?) __builtin_return_address
-20010122-1.c
-20030323-1.c
-20030811-1.c
-pr17377.c
-
-# Error: invalid output constraint '=t' in asm.
-990413-2.c
-
-# Error: __builtin_setjmp / __builtin_longjmp is not supported for the current target.
-built-in-setjmp.c
-pr60003.c
-
-# Error in the program / unsupported by Clang.
-20000822-1.c
-20010209-1.c
-20010605-1.c
-20030501-1.c
-20040520-1.c
-20061220-1.c
-20090219-1.c
-920415-1.c
-920428-2.c
-920501-7.c
-920612-2.c
-920721-4.c
-921017-1.c
-921215-1.c
-931002-1.c
-comp-goto-2.c
-nest-align-1.c
-nest-stdar-1.c
-nestfunc-1.c
-nestfunc-2.c
-nestfunc-3.c
-nestfunc-5.c
-nestfunc-6.c
-nestfunc-7.c
-pr22061-3.c
-pr22061-4.c
-pr24135.c
-pr51447.c
-20020412-1.c
-20040308-1.c
-20040423-1.c
-20041218-2.c
-20070919-1.c
-align-nest.c
-pr41935.c
-920302-1.c
-920501-3.c
-920728-1.c
-pr28865.c
-attr-alias-1.C
-attr-alias-2.C
-attr-ifunc-1.C
-attr-ifunc-2.C
-attr-ifunc-3.C
-attr-ifunc-4.C
-complit12.C
-va-arg-pack-1.C
-va-arg-pack-len-1.C
-builtin-line1.C
-devirt-6.C # bad main signature
-devirt-13.C # bad main signature
-devirt-14.C # bad main signature
-devirt-21.C # bad main signature
-devirt-23.C # bad main signature
-lifetime2.C # violates C++ DR1696
-
-# WASI doesn't have stdjmp.h yet
-pr56982.c
-
-# WASI doesn't have pthread.h yet
-thread_local3.C
-thread_local3g.C
-thread_local4.C
-thread_local4g.C
-thread_local5.C
-thread_local5g.C
-
-# Untriaged C++ failures
-spec5.C
-addr1.C
-ef_test.C
-member2.C
-new39.C
-new40.C
-nrv8.C
-offsetof9.C
-opaque-1.C
-pr19650.C
-pr37146-1.C
-pr46149.C
-pr59470.C
-rtti2.C
-self1.C
-type-generic-1.C
-vbase8-10.C
-vbase8-21.C
-vbase8-22.C
-vbase8-4.C
-vector1.C
diff --git a/src/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/src/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 9d9a201..67ca67d 100644
--- a/src/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1784,21 +1784,21 @@
if (Name.compare(Name.lower()) && Name.compare(Name.upper()) &&
!getParser().isParsingMasm())
return false;
- if (Name.equals_lower("not")) {
+ if (Name.equals_insensitive("not")) {
SM.onNot();
- } else if (Name.equals_lower("or")) {
+ } else if (Name.equals_insensitive("or")) {
SM.onOr();
- } else if (Name.equals_lower("shl")) {
+ } else if (Name.equals_insensitive("shl")) {
SM.onLShift();
- } else if (Name.equals_lower("shr")) {
+ } else if (Name.equals_insensitive("shr")) {
SM.onRShift();
- } else if (Name.equals_lower("xor")) {
+ } else if (Name.equals_insensitive("xor")) {
SM.onXor();
- } else if (Name.equals_lower("and")) {
+ } else if (Name.equals_insensitive("and")) {
SM.onAnd();
- } else if (Name.equals_lower("mod")) {
+ } else if (Name.equals_insensitive("mod")) {
SM.onMod();
- } else if (Name.equals_lower("offset")) {
+ } else if (Name.equals_insensitive("offset")) {
SMLoc OffsetLoc = getTok().getLoc();
const MCExpr *Val = nullptr;
StringRef ID;
@@ -1814,24 +1814,24 @@
} else {
return false;
}
- if (!Name.equals_lower("offset"))
+ if (!Name.equals_insensitive("offset"))
End = consumeToken();
return true;
}
bool X86AsmParser::ParseMasmNamedOperator(StringRef Name,
IntelExprStateMachine &SM,
bool &ParseError, SMLoc &End) {
- if (Name.equals_lower("eq")) {
+ if (Name.equals_insensitive("eq")) {
SM.onEq();
- } else if (Name.equals_lower("ne")) {
+ } else if (Name.equals_insensitive("ne")) {
SM.onNE();
- } else if (Name.equals_lower("lt")) {
+ } else if (Name.equals_insensitive("lt")) {
SM.onLT();
- } else if (Name.equals_lower("le")) {
+ } else if (Name.equals_insensitive("le")) {
SM.onLE();
- } else if (Name.equals_lower("gt")) {
+ } else if (Name.equals_insensitive("gt")) {
SM.onGT();
- } else if (Name.equals_lower("ge")) {
+ } else if (Name.equals_insensitive("ge")) {
SM.onGE();
} else {
return false;
@@ -1933,7 +1933,7 @@
if (Parser.isParsingMasm()) {
const AsmToken &NextTok = getLexer().peekTok();
if (NextTok.is(AsmToken::Identifier) &&
- NextTok.getIdentifier().equals_lower("ptr")) {
+ NextTok.getIdentifier().equals_insensitive("ptr")) {
AsmTypeInfo Info;
if (Parser.lookUpType(Identifier, Info))
return Error(Tok.getLoc(), "unknown type");
@@ -2594,9 +2594,9 @@
End, Size, SM.getSymName(),
SM.getIdentifierInfo(), Operands);
- // When parsing x64 MS-style assembly, all memory operands default to
- // RIP-relative when interpreted as non-absolute references.
- if (Parser.isParsingMasm() && is64BitMode()) {
+ // When parsing x64 MS-style assembly, all non-absolute references to a named
+ // variable default to RIP-relative.
+ if (Parser.isParsingMasm() && is64BitMode() && SM.getElementSize() > 0) {
Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
BaseReg, IndexReg, Scale, Start,
End, Size,
@@ -3068,13 +3068,13 @@
}
// Parse MASM style pseudo prefixes.
if (isParsingMSInlineAsm()) {
- if (Name.equals_lower("vex"))
+ if (Name.equals_insensitive("vex"))
ForcedVEXEncoding = VEXEncoding_VEX;
- else if (Name.equals_lower("vex2"))
+ else if (Name.equals_insensitive("vex2"))
ForcedVEXEncoding = VEXEncoding_VEX2;
- else if (Name.equals_lower("vex3"))
+ else if (Name.equals_insensitive("vex3"))
ForcedVEXEncoding = VEXEncoding_VEX3;
- else if (Name.equals_lower("evex"))
+ else if (Name.equals_insensitive("evex"))
ForcedVEXEncoding = VEXEncoding_EVEX;
if (ForcedVEXEncoding != VEXEncoding_Default) {
@@ -3101,11 +3101,12 @@
// Hack to skip "short" following Jcc.
if (isParsingIntelSyntax() &&
(PatchedName == "jmp" || PatchedName == "jc" || PatchedName == "jnc" ||
- PatchedName == "jcxz" || PatchedName == "jexcz" ||
+ PatchedName == "jcxz" || PatchedName == "jecxz" ||
(PatchedName.startswith("j") &&
ParseConditionCode(PatchedName.substr(1)) != X86::COND_INVALID))) {
StringRef NextTok = Parser.getTok().getString();
- if (NextTok == "short") {
+ if (Parser.isParsingMasm() ? NextTok.equals_insensitive("short")
+ : NextTok == "short") {
SMLoc NameEndLoc =
NameLoc.getFromPointer(NameLoc.getPointer() + Name.size());
// Eat the short keyword.
@@ -4648,19 +4649,19 @@
else if (IDVal == ".cv_fpo_endproc")
return parseDirectiveFPOEndProc(DirectiveID.getLoc());
else if (IDVal == ".seh_pushreg" ||
- (Parser.isParsingMasm() && IDVal.equals_lower(".pushreg")))
+ (Parser.isParsingMasm() && IDVal.equals_insensitive(".pushreg")))
return parseDirectiveSEHPushReg(DirectiveID.getLoc());
else if (IDVal == ".seh_setframe" ||
- (Parser.isParsingMasm() && IDVal.equals_lower(".setframe")))
+ (Parser.isParsingMasm() && IDVal.equals_insensitive(".setframe")))
return parseDirectiveSEHSetFrame(DirectiveID.getLoc());
else if (IDVal == ".seh_savereg" ||
- (Parser.isParsingMasm() && IDVal.equals_lower(".savereg")))
+ (Parser.isParsingMasm() && IDVal.equals_insensitive(".savereg")))
return parseDirectiveSEHSaveReg(DirectiveID.getLoc());
else if (IDVal == ".seh_savexmm" ||
- (Parser.isParsingMasm() && IDVal.equals_lower(".savexmm128")))
+ (Parser.isParsingMasm() && IDVal.equals_insensitive(".savexmm128")))
return parseDirectiveSEHSaveXMM(DirectiveID.getLoc());
else if (IDVal == ".seh_pushframe" ||
- (Parser.isParsingMasm() && IDVal.equals_lower(".pushframe")))
+ (Parser.isParsingMasm() && IDVal.equals_insensitive(".pushframe")))
return parseDirectiveSEHPushFrame(DirectiveID.getLoc());
return true;
@@ -4776,31 +4777,27 @@
return true;
if (!isUIntN(32, ParamsSize))
return Parser.TokError("parameters size out of range");
- if (Parser.parseEOL("unexpected tokens"))
- return addErrorSuffix(" in '.cv_fpo_proc' directive");
+ if (parseEOL())
+ return true;
MCSymbol *ProcSym = getContext().getOrCreateSymbol(ProcName);
return getTargetStreamer().emitFPOProc(ProcSym, ParamsSize, L);
}
// .cv_fpo_setframe ebp
bool X86AsmParser::parseDirectiveFPOSetFrame(SMLoc L) {
- MCAsmParser &Parser = getParser();
unsigned Reg;
SMLoc DummyLoc;
- if (ParseRegister(Reg, DummyLoc, DummyLoc) ||
- Parser.parseEOL("unexpected tokens"))
- return addErrorSuffix(" in '.cv_fpo_setframe' directive");
+ if (ParseRegister(Reg, DummyLoc, DummyLoc) || parseEOL())
+ return true;
return getTargetStreamer().emitFPOSetFrame(Reg, L);
}
// .cv_fpo_pushreg ebx
bool X86AsmParser::parseDirectiveFPOPushReg(SMLoc L) {
- MCAsmParser &Parser = getParser();
unsigned Reg;
SMLoc DummyLoc;
- if (ParseRegister(Reg, DummyLoc, DummyLoc) ||
- Parser.parseEOL("unexpected tokens"))
- return addErrorSuffix(" in '.cv_fpo_pushreg' directive");
+ if (ParseRegister(Reg, DummyLoc, DummyLoc) || parseEOL())
+ return true;
return getTargetStreamer().emitFPOPushReg(Reg, L);
}
@@ -4808,9 +4805,8 @@
bool X86AsmParser::parseDirectiveFPOStackAlloc(SMLoc L) {
MCAsmParser &Parser = getParser();
int64_t Offset;
- if (Parser.parseIntToken(Offset, "expected offset") ||
- Parser.parseEOL("unexpected tokens"))
- return addErrorSuffix(" in '.cv_fpo_stackalloc' directive");
+ if (Parser.parseIntToken(Offset, "expected offset") || parseEOL())
+ return true;
return getTargetStreamer().emitFPOStackAlloc(Offset, L);
}
@@ -4818,25 +4814,24 @@
bool X86AsmParser::parseDirectiveFPOStackAlign(SMLoc L) {
MCAsmParser &Parser = getParser();
int64_t Offset;
- if (Parser.parseIntToken(Offset, "expected offset") ||
- Parser.parseEOL("unexpected tokens"))
- return addErrorSuffix(" in '.cv_fpo_stackalign' directive");
+ if (Parser.parseIntToken(Offset, "expected offset") || parseEOL())
+ return true;
return getTargetStreamer().emitFPOStackAlign(Offset, L);
}
// .cv_fpo_endprologue
bool X86AsmParser::parseDirectiveFPOEndPrologue(SMLoc L) {
MCAsmParser &Parser = getParser();
- if (Parser.parseEOL("unexpected tokens"))
- return addErrorSuffix(" in '.cv_fpo_endprologue' directive");
+ if (Parser.parseEOL())
+ return true;
return getTargetStreamer().emitFPOEndPrologue(L);
}
// .cv_fpo_endproc
bool X86AsmParser::parseDirectiveFPOEndProc(SMLoc L) {
MCAsmParser &Parser = getParser();
- if (Parser.parseEOL("unexpected tokens"))
- return addErrorSuffix(" in '.cv_fpo_endproc' directive");
+ if (Parser.parseEOL())
+ return true;
return getTargetStreamer().emitFPOEndProc(L);
}
diff --git a/src/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h b/src/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h
index e323353..2bc6492 100644
--- a/src/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/src/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -78,7 +78,8 @@
};
X86Operand(KindTy K, SMLoc Start, SMLoc End)
- : Kind(K), StartLoc(Start), EndLoc(End), CallOperand(false) {}
+ : Kind(K), StartLoc(Start), EndLoc(End), OpDecl(nullptr),
+ AddressOf(false), CallOperand(false) {}
StringRef getSymName() override { return SymName; }
void *getOpDecl() override { return OpDecl; }
diff --git a/src/llvm-project/llvm/lib/Target/X86/CMakeLists.txt b/src/llvm-project/llvm/lib/Target/X86/CMakeLists.txt
index 5a50026..a2816f6 100644
--- a/src/llvm-project/llvm/lib/Target/X86/CMakeLists.txt
+++ b/src/llvm-project/llvm/lib/Target/X86/CMakeLists.txt
@@ -32,8 +32,12 @@
X86CmovConversion.cpp
X86DomainReassignment.cpp
X86DiscriminateMemOps.cpp
+ X86LowerTileCopy.cpp
X86LowerAMXType.cpp
+ X86PreAMXConfig.cpp
+ X86LowerAMXIntrinsics.cpp
X86TileConfig.cpp
+ X86FastTileConfig.cpp
X86PreTileConfig.cpp
X86ExpandPseudo.cpp
X86FastISel.cpp
@@ -91,6 +95,7 @@
SelectionDAG
Support
Target
+ TransformUtils
X86Desc
X86Info
GlobalISel
diff --git a/src/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/src/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 4e6d8e8..82581eb 100644
--- a/src/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -1119,6 +1119,8 @@
switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
case VEX_PREFIX_66:
attrMask |= ATTR_OPSIZE;
+ if (insn->hasAdSize)
+ attrMask |= ATTR_ADSIZE;
break;
case VEX_PREFIX_F3:
attrMask |= ATTR_XS;
@@ -1175,6 +1177,8 @@
case 0x66:
if (insn->mode != MODE_16BIT)
attrMask |= ATTR_OPSIZE;
+ if (insn->hasAdSize)
+ attrMask |= ATTR_ADSIZE;
break;
case 0x67:
attrMask |= ATTR_ADSIZE;
diff --git a/src/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h b/src/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h
index 56738e9..e60497c 100644
--- a/src/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h
+++ b/src/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h
@@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-///
+/// \file
/// Description: ImmutableGraph is a fast DAG implementation that cannot be
/// modified, except by creating a new ImmutableGraph. ImmutableGraph is
/// implemented as two arrays: one containing nodes, and one containing edges.
diff --git a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 95012a1..83f3614 100644
--- a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -680,12 +680,21 @@
#define ELF_RELOC(X, Y) .Case(#X, Y)
#include "llvm/BinaryFormat/ELFRelocs/x86_64.def"
#undef ELF_RELOC
+ .Case("BFD_RELOC_NONE", ELF::R_X86_64_NONE)
+ .Case("BFD_RELOC_8", ELF::R_X86_64_8)
+ .Case("BFD_RELOC_16", ELF::R_X86_64_16)
+ .Case("BFD_RELOC_32", ELF::R_X86_64_32)
+ .Case("BFD_RELOC_64", ELF::R_X86_64_64)
.Default(-1u);
} else {
Type = llvm::StringSwitch<unsigned>(Name)
#define ELF_RELOC(X, Y) .Case(#X, Y)
#include "llvm/BinaryFormat/ELFRelocs/i386.def"
#undef ELF_RELOC
+ .Case("BFD_RELOC_NONE", ELF::R_386_NONE)
+ .Case("BFD_RELOC_8", ELF::R_386_8)
+ .Case("BFD_RELOC_16", ELF::R_386_16)
+ .Case("BFD_RELOC_32", ELF::R_386_32)
.Default(-1u);
}
if (Type == -1u)
@@ -1073,6 +1082,8 @@
}
unsigned X86AsmBackend::getMaximumNopSize() const {
+ if (STI.hasFeature(X86::Mode16Bit))
+ return 4;
if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit))
return 1;
if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP])
@@ -1091,29 +1102,44 @@
/// bytes.
/// \return - true on success, false on failure
bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
- static const char Nops[10][11] = {
- // nop
- "\x90",
- // xchg %ax,%ax
- "\x66\x90",
- // nopl (%[re]ax)
- "\x0f\x1f\x00",
- // nopl 0(%[re]ax)
- "\x0f\x1f\x40\x00",
- // nopl 0(%[re]ax,%[re]ax,1)
- "\x0f\x1f\x44\x00\x00",
- // nopw 0(%[re]ax,%[re]ax,1)
- "\x66\x0f\x1f\x44\x00\x00",
- // nopl 0L(%[re]ax)
- "\x0f\x1f\x80\x00\x00\x00\x00",
- // nopl 0L(%[re]ax,%[re]ax,1)
- "\x0f\x1f\x84\x00\x00\x00\x00\x00",
- // nopw 0L(%[re]ax,%[re]ax,1)
- "\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
- // nopw %cs:0L(%[re]ax,%[re]ax,1)
- "\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00",
+ static const char Nops32Bit[10][11] = {
+ // nop
+ "\x90",
+ // xchg %ax,%ax
+ "\x66\x90",
+ // nopl (%[re]ax)
+ "\x0f\x1f\x00",
+ // nopl 0(%[re]ax)
+ "\x0f\x1f\x40\x00",
+ // nopl 0(%[re]ax,%[re]ax,1)
+ "\x0f\x1f\x44\x00\x00",
+ // nopw 0(%[re]ax,%[re]ax,1)
+ "\x66\x0f\x1f\x44\x00\x00",
+ // nopl 0L(%[re]ax)
+ "\x0f\x1f\x80\x00\x00\x00\x00",
+ // nopl 0L(%[re]ax,%[re]ax,1)
+ "\x0f\x1f\x84\x00\x00\x00\x00\x00",
+ // nopw 0L(%[re]ax,%[re]ax,1)
+ "\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
+ // nopw %cs:0L(%[re]ax,%[re]ax,1)
+ "\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00",
};
+ // 16-bit mode uses different nop patterns than 32-bit.
+ static const char Nops16Bit[4][11] = {
+ // nop
+ "\x90",
+ // xchg %eax,%eax
+ "\x66\x90",
+ // lea 0(%si),%si
+ "\x8d\x74\x00",
+ // lea 0w(%si),%si
+ "\x8d\xb4\x00\x00",
+ };
+
+ const char(*Nops)[11] =
+ STI.getFeatureBits()[X86::Mode16Bit] ? Nops16Bit : Nops32Bit;
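+  // Note: getMaximumNopSize() returns 4 in 16-bit mode, so the emission code
+  // below should never index past the four entries of Nops16Bit.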
+
uint64_t MaxNopLength = (uint64_t)getMaximumNopSize();
// Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining
@@ -1428,6 +1454,7 @@
unsigned StackAdjust = 0;
unsigned StackSize = 0;
unsigned NumDefCFAOffsets = 0;
+ int MinAbsOffset = std::numeric_limits<int>::max();
for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
const MCCFIInstruction &Inst = Instrs[i];
@@ -1456,6 +1483,7 @@
memset(SavedRegs, 0, sizeof(SavedRegs));
StackAdjust = 0;
SavedRegIdx = 0;
+ MinAbsOffset = std::numeric_limits<int>::max();
InstrOffset += MoveInstrSize;
break;
}
@@ -1499,6 +1527,7 @@
unsigned Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true);
SavedRegs[SavedRegIdx++] = Reg;
StackAdjust += OffsetSize;
+ MinAbsOffset = std::min(MinAbsOffset, abs(Inst.getOffset()));
InstrOffset += PushInstrSize(Reg);
break;
}
@@ -1512,6 +1541,11 @@
// Offset was too big for a compact unwind encoding.
return CU::UNWIND_MODE_DWARF;
+ // We don't attempt to track a real StackAdjust, so if the saved registers
+ // aren't adjacent to rbp we can't cope.
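+    // With a frame pointer, the first callee-saved push is expected directly
+    // below the saved %rbp and the return address, i.e. at CFA - 3 * OffsetSize.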
+ if (SavedRegIdx != 0 && MinAbsOffset != 3 * (int)OffsetSize)
+ return CU::UNWIND_MODE_DWARF;
+
// Get the encoding of the saved registers when we have a frame pointer.
uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame();
if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
@@ -1597,7 +1631,7 @@
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
- if (TheTriple.getEnvironment() == Triple::GNUX32)
+ if (TheTriple.isX32())
return new ELFX86_X32AsmBackend(T, OSABI, STI);
return new ELFX86_64AsmBackend(T, OSABI, STI);
}
diff --git a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 4db1bfc..58e233d 100644
--- a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -115,6 +115,7 @@
Cmp,
// AND
And,
+  // FIXME: Zen 3 supports branch fusion for OR/XOR.
// ADD, SUB
AddSub,
// INC, DEC
@@ -183,6 +184,7 @@
case X86::AND8rr:
case X86::AND8rr_REV:
return FirstMacroFusionInstKind::And;
+  // FIXME: Zen 3 supports branch fusion for OR/XOR.
// CMP
case X86::CMP16i16:
case X86::CMP16mr:
diff --git a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index c294da6..b535135 100644
--- a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -81,7 +81,7 @@
X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
bool is64Bit = T.getArch() == Triple::x86_64;
- bool isX32 = T.getEnvironment() == Triple::GNUX32;
+ bool isX32 = T.isX32();
// For ELF, x86-64 pointer size depends on the ABI.
// For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64
@@ -144,7 +144,10 @@
DollarIsPC = true;
SeparatorString = "\n";
CommentString = ";";
- AllowSymbolAtNameStart = true;
+ AllowAdditionalComments = false;
+ AllowQuestionAtStartOfIdentifier = true;
+ AllowDollarAtStartOfIdentifier = true;
+ AllowAtAtStartOfIdentifier = true;
}
void X86MCAsmInfoGNUCOFF::anchor() { }
diff --git a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 260253a..714d2d8 100644
--- a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1231,8 +1231,7 @@
// FIXME: The caller of determineREXPrefix slaps this prefix onto
// anything that returns non-zero.
REX |= 0x40; // REX fixed encoding prefix
- } else if (MO.isExpr() &&
- STI.getTargetTriple().getEnvironment() == Triple::GNUX32) {
+ } else if (MO.isExpr() && STI.getTargetTriple().isX32()) {
// GOTTPOFF and TLSDESC relocations require a REX prefix to allow
// linker optimizations: even if the instructions we see may not require
// any prefix, they may be replaced by instructions that do. This is
diff --git a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 5cf8d77..12dc053 100644
--- a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -332,7 +332,7 @@
MAI = new X86ELFMCAsmInfo(TheTriple);
} else if (TheTriple.isWindowsMSVCEnvironment() ||
TheTriple.isWindowsCoreCLREnvironment()) {
- if (Options.getAssemblyLanguage().equals_lower("masm"))
+ if (Options.getAssemblyLanguage().equals_insensitive("masm"))
MAI = new X86MCAsmInfoMicrosoftMASM(TheTriple);
else
MAI = new X86MCAsmInfoMicrosoft(TheTriple);
diff --git a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index b98e58d..dff9b31 100644
--- a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -537,6 +537,7 @@
uint32_t Offset = Target.getConstant();
if (IsPCRel)
Offset += 1 << Log2Size;
+
// Try to record the scattered relocation if needed. Fall back to non
// scattered if necessary (see comments in recordScatteredRelocation()
// for details).
@@ -558,6 +559,8 @@
// find a case where they are actually emitted.
Type = MachO::GENERIC_RELOC_VANILLA;
} else {
+ assert(A && "Unknown symbol data");
+
// Resolve constant variables.
if (A->isVariable()) {
int64_t Res;
diff --git a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 760239f..075e85f 100644
--- a/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -42,19 +42,26 @@
const MCFixup &Fixup,
bool IsCrossSection,
const MCAsmBackend &MAB) const {
+ const bool Is64Bit = getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64;
unsigned FixupKind = Fixup.getKind();
if (IsCrossSection) {
- if (FixupKind != FK_Data_4 && FixupKind != llvm::X86::reloc_signed_4byte) {
+ // IMAGE_REL_AMD64_REL64 does not exist. We treat FK_Data_8 as FK_PCRel_4 so
+ // that .quad a-b can lower to IMAGE_REL_AMD64_REL32. This allows generic
+ // instrumentation to not bother with the COFF limitation. A negative value
+ // needs attention.
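+    // For example, a cross-section `.quad .Lfoo - .Lbar` (hypothetical labels)
+    // emitted by generic instrumentation is encoded as IMAGE_REL_AMD64_REL32 here.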
+ if (FixupKind == FK_Data_4 || FixupKind == llvm::X86::reloc_signed_4byte ||
+ (FixupKind == FK_Data_8 && Is64Bit)) {
+ FixupKind = FK_PCRel_4;
+ } else {
Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression");
return COFF::IMAGE_REL_AMD64_ADDR32;
}
- FixupKind = FK_PCRel_4;
}
MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
- if (getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64) {
+ if (Is64Bit) {
switch (FixupKind) {
case FK_PCRel_4:
case X86::reloc_riprel_4byte:
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86.h b/src/llvm-project/llvm/lib/Target/X86/X86.h
index e17b9ba..eba5b6c 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86.h
+++ b/src/llvm-project/llvm/lib/Target/X86/X86.h
@@ -76,10 +76,18 @@
/// Return a pass that expands WinAlloca pseudo-instructions.
FunctionPass *createX86WinAllocaExpander();
+/// Return a pass that configures the tile registers.
FunctionPass *createX86TileConfigPass();
+/// Return a pass that configures the tile registers after fast register allocation.
+FunctionPass *createX86FastTileConfigPass();
+
+/// Return a pass that inserts the pseudo tile config instruction.
FunctionPass *createX86PreTileConfigPass();
+/// Return a pass that lowers the tile copy instruction.
+FunctionPass *createX86LowerTileCopyPass();
+
/// Return a pass that inserts int3 at the end of the function if it ends with a
/// CALL instruction. The pass does the same for each funclet as well. This
/// ensures that the open interval of function start and end PCs contains all
@@ -167,8 +175,12 @@
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
void initializeX86PreTileConfigPass(PassRegistry &);
+void initializeX86FastTileConfigPass(PassRegistry &);
void initializeX86TileConfigPass(PassRegistry &);
void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
+void initializeX86PreAMXConfigPassPass(PassRegistry &);
+void initializeX86LowerTileCopyPass(PassRegistry &);
+void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &);
namespace X86AS {
enum : unsigned {
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86.td b/src/llvm-project/llvm/lib/Target/X86/X86.td
index c492d68..53bbd93 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86.td
@@ -323,10 +323,15 @@
"platform configuration instruction">;
// On recent X86 (port bound) processors, it's preferable to combine to a single shuffle
// using a variable mask over multiple fixed shuffles.
-def FeatureFastVariableShuffle
- : SubtargetFeature<"fast-variable-shuffle",
- "HasFastVariableShuffle",
- "true", "Shuffles with variable masks are fast">;
+def FeatureFastVariableCrossLaneShuffle
+ : SubtargetFeature<"fast-variable-crosslane-shuffle",
+ "HasFastVariableCrossLaneShuffle",
+ "true", "Cross-lane shuffles with variable masks are fast">;
+def FeatureFastVariablePerLaneShuffle
+ : SubtargetFeature<"fast-variable-perlane-shuffle",
+ "HasFastVariablePerLaneShuffle",
+ "true", "Per-lane shuffles with variable masks are fast">;
+
// On some X86 processors, a vzeroupper instruction should be inserted after
// using ymm/zmm registers before executing code that may use SSE instructions.
def FeatureInsertVZEROUPPER
@@ -514,6 +519,10 @@
"fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
"Prefer a left/right vector logical shift pair over a shift+and pair">;
+def FeatureFastMOVBE
+ : SubtargetFeature<"fast-movbe", "HasFastMOVBE", "true",
+ "Prefer a movbe over a single-use load + bswap / single-use bswap + store">;
+
def FeatureUseGLMDivSqrtCosts
: SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
"Use Goldmont specific floating point div/sqrt costs">;
@@ -555,6 +564,7 @@
include "X86ScheduleSLM.td"
include "X86ScheduleZnver1.td"
include "X86ScheduleZnver2.td"
+include "X86ScheduleZnver3.td"
include "X86ScheduleBdVer2.td"
include "X86ScheduleBtVer2.td"
include "X86SchedSkylakeClient.td"
@@ -634,7 +644,8 @@
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate,
FeatureFast15ByteNOP,
- FeatureFastVariableShuffle,
+ FeatureFastVariableCrossLaneShuffle,
+ FeatureFastVariablePerLaneShuffle,
FeaturePOPCNTFalseDeps,
FeatureLZCNTFalseDeps,
FeatureInsertVZEROUPPER];
@@ -653,8 +664,7 @@
list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES,
FeatureXSAVEC,
FeatureXSAVES,
- FeatureCLFLUSHOPT,
- FeatureSGX];
+ FeatureCLFLUSHOPT];
list<SubtargetFeature> SKLTuning = [FeatureHasFastGather,
FeatureMacroFusion,
FeatureSlow3OpsLEA,
@@ -663,7 +673,8 @@
FeatureFastVectorFSQRT,
FeatureFastSHLDRotate,
FeatureFast15ByteNOP,
- FeatureFastVariableShuffle,
+ FeatureFastVariableCrossLaneShuffle,
+ FeatureFastVariablePerLaneShuffle,
FeaturePOPCNTFalseDeps,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> SKLFeatures =
@@ -689,7 +700,8 @@
FeatureFastVectorFSQRT,
FeatureFastSHLDRotate,
FeatureFast15ByteNOP,
- FeatureFastVariableShuffle,
+ FeatureFastVariableCrossLaneShuffle,
+ FeatureFastVariablePerLaneShuffle,
FeaturePrefer256Bit,
FeaturePOPCNTFalseDeps,
FeatureInsertVZEROUPPER];
@@ -726,7 +738,8 @@
FeatureFastVectorFSQRT,
FeatureFastSHLDRotate,
FeatureFast15ByteNOP,
- FeatureFastVariableShuffle,
+ FeatureFastVariableCrossLaneShuffle,
+ FeatureFastVariablePerLaneShuffle,
FeaturePrefer256Bit,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> CNLFeatures =
@@ -740,7 +753,6 @@
FeatureVPCLMULQDQ,
FeatureVPOPCNTDQ,
FeatureGFNI,
- FeatureCLWB,
FeatureRDPID,
FeatureFSRM];
list<SubtargetFeature> ICLTuning = CNLTuning;
@@ -749,13 +761,15 @@
// Icelake Server
list<SubtargetFeature> ICXAdditionalFeatures = [FeaturePCONFIG,
+ FeatureCLWB,
FeatureWBNOINVD];
list<SubtargetFeature> ICXTuning = CNLTuning;
list<SubtargetFeature> ICXFeatures =
!listconcat(ICLFeatures, ICXAdditionalFeatures);
- //Tigerlake
+ // Tigerlake
list<SubtargetFeature> TGLAdditionalFeatures = [FeatureVP2INTERSECT,
+ FeatureCLWB,
FeatureMOVDIRI,
FeatureMOVDIR64B,
FeatureSHSTK];
@@ -763,7 +777,7 @@
list<SubtargetFeature> TGLFeatures =
!listconcat(ICLFeatures, TGLAdditionalFeatures );
- //Sapphirerapids
+ // Sapphirerapids
list<SubtargetFeature> SPRAdditionalFeatures = [FeatureAMXTILE,
FeatureAMXINT8,
FeatureAMXBF16,
@@ -784,17 +798,6 @@
list<SubtargetFeature> SPRFeatures =
!listconcat(ICXFeatures, SPRAdditionalFeatures);
- // Alderlake
- list<SubtargetFeature> ADLAdditionalFeatures = [FeatureAVXVNNI,
- FeatureCLDEMOTE,
- FeatureHRESET,
- FeaturePTWRITE,
- FeatureSERIALIZE,
- FeatureWAITPKG];
- list<SubtargetFeature> ADLTuning = SKLTuning;
- list<SubtargetFeature> ADLFeatures =
- !listconcat(SKLFeatures, ADLAdditionalFeatures);
-
// Atom
list<SubtargetFeature> AtomFeatures = [FeatureX87,
FeatureCMPXCHG8B,
@@ -830,6 +833,7 @@
FeatureSlowDivide64,
FeatureSlowPMULLD,
FeatureFast7ByteNOP,
+ FeatureFastMOVBE,
FeaturePOPCNTFalseDeps,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> SLMFeatures =
@@ -849,6 +853,7 @@
FeatureSlowTwoMemOps,
FeatureSlowLEA,
FeatureSlowIncDec,
+ FeatureFastMOVBE,
FeaturePOPCNTFalseDeps,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> GLMFeatures =
@@ -856,12 +861,12 @@
// Goldmont Plus
list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE,
- FeatureRDPID,
- FeatureSGX];
+ FeatureRDPID];
list<SubtargetFeature> GLPTuning = [FeatureUseGLMDivSqrtCosts,
FeatureSlowTwoMemOps,
FeatureSlowLEA,
FeatureSlowIncDec,
+ FeatureFastMOVBE,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> GLPFeatures =
!listconcat(GLMFeatures, GLPAdditionalFeatures);
@@ -873,6 +878,31 @@
list<SubtargetFeature> TRMFeatures =
!listconcat(GLPFeatures, TRMAdditionalFeatures);
+ // Alderlake
+ list<SubtargetFeature> ADLAdditionalFeatures = [FeatureSERIALIZE,
+ FeaturePCONFIG,
+ FeatureSHSTK,
+ FeatureWIDEKL,
+ FeatureINVPCID,
+ FeatureADX,
+ FeatureFMA,
+ FeatureVAES,
+ FeatureVPCLMULQDQ,
+ FeatureF16C,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureLZCNT,
+ FeatureAVXVNNI,
+ FeaturePKU,
+ FeatureHRESET,
+ FeatureCLDEMOTE,
+ FeatureMOVDIRI,
+ FeatureMOVDIR64B,
+ FeatureWAITPKG];
+ list<SubtargetFeature> ADLTuning = SKLTuning;
+ list<SubtargetFeature> ADLFeatures =
+ !listconcat(TRMFeatures, ADLAdditionalFeatures);
+
// Knights Landing
list<SubtargetFeature> KNLFeatures = [FeatureX87,
FeatureCMPXCHG8B,
@@ -910,6 +940,7 @@
FeatureSlowTwoMemOps,
FeaturePreferMaskRegisters,
FeatureHasFastGather,
+ FeatureFastMOVBE,
FeatureSlowPMADDWD];
// TODO Add AVX5124FMAPS/AVX5124VNNIW features
list<SubtargetFeature> KNMFeatures =
@@ -969,6 +1000,7 @@
FeatureFast15ByteNOP,
FeatureFastScalarShiftMasks,
FeatureFastVectorShiftMasks,
+ FeatureFastMOVBE,
FeatureSlowSHLD];
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1003,7 +1035,9 @@
FeatureTBM,
FeatureFMA,
FeatureFastBEXTR];
- list<SubtargetFeature> BdVer2Tuning = BdVer1Tuning;
+ list<SubtargetFeature> BdVer2AdditionalTuning = [FeatureFastMOVBE];
+ list<SubtargetFeature> BdVer2Tuning =
+ !listconcat(BdVer1Tuning, BdVer2AdditionalTuning);
list<SubtargetFeature> BdVer2Features =
!listconcat(BdVer1Features, BdVer2AdditionalFeatures);
@@ -1063,6 +1097,7 @@
FeatureFast15ByteNOP,
FeatureBranchFusion,
FeatureFastScalarShiftMasks,
+ FeatureFastMOVBE,
FeatureSlowSHLD,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
@@ -1076,7 +1111,11 @@
FeaturePKU,
FeatureVAES,
FeatureVPCLMULQDQ];
- list<SubtargetFeature> ZN3Tuning = ZNTuning;
+ list<SubtargetFeature> ZN3AdditionalTuning =
+ [FeatureMacroFusion,
+ FeatureFastVariablePerLaneShuffle];
+ list<SubtargetFeature> ZN3Tuning =
+ !listconcat(ZNTuning, ZN3AdditionalTuning);
list<SubtargetFeature> ZN3Features =
!listconcat(ZN2Features, ZN3AdditionalFeatures);
}
@@ -1291,6 +1330,8 @@
ProcessorFeatures.CNLFeatures, ProcessorFeatures.CNLTuning>;
def : ProcModel<"icelake-client", SkylakeServerModel,
ProcessorFeatures.ICLFeatures, ProcessorFeatures.ICLTuning>;
+def : ProcModel<"rocketlake", SkylakeServerModel,
+ ProcessorFeatures.ICLFeatures, ProcessorFeatures.ICLTuning>;
def : ProcModel<"icelake-server", SkylakeServerModel,
ProcessorFeatures.ICXFeatures, ProcessorFeatures.ICXTuning>;
def : ProcModel<"tigerlake", SkylakeServerModel,
@@ -1365,7 +1406,7 @@
ProcessorFeatures.ZNTuning>;
def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features,
ProcessorFeatures.ZN2Tuning>;
-def : ProcModel<"znver3", Znver2Model, ProcessorFeatures.ZN3Features,
+def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features,
ProcessorFeatures.ZN3Tuning>;
def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA],
@@ -1407,7 +1448,7 @@
def : ProcModel<"x86-64-v3", HaswellModel, ProcessorFeatures.X86_64V3Features,
ProcessorFeatures.HSWTuning>;
// Close to the AVX-512 level implemented by Xeon Scalable Processors.
-def : ProcModel<"x86-64-v4", HaswellModel, ProcessorFeatures.X86_64V4Features,
+def : ProcModel<"x86-64-v4", SkylakeServerModel, ProcessorFeatures.X86_64V4Features,
ProcessorFeatures.SKXTuning>;
//===----------------------------------------------------------------------===//
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp b/src/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 2d434bd..a276453 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -643,7 +643,7 @@
OutStreamer->SwitchSection(Nt);
// Emitting note header.
- int WordSize = TT.isArch64Bit() ? 8 : 4;
+ const int WordSize = TT.isArch64Bit() && !TT.isX32() ? 8 : 4;
emitAlignment(WordSize == 4 ? Align(4) : Align(8));
OutStreamer->emitIntValue(4, 4 /*size*/); // data size for "GNU\0"
OutStreamer->emitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size
@@ -683,8 +683,13 @@
Feat00Flags |= 1;
}
- if (M.getModuleFlag("cfguard"))
+ if (M.getModuleFlag("cfguard")) {
Feat00Flags |= 0x800; // Object is CFG-aware.
+ }
+
+ if (M.getModuleFlag("ehcontguard")) {
+ Feat00Flags |= 0x4000; // Object also has EHCont.
+ }
OutStreamer->emitSymbolAttribute(S, MCSA_Global);
OutStreamer->emitAssignment(
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/src/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index fdc65ac..b6a37f0 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -529,10 +529,9 @@
int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;
- AliasResult AAResult =
- AA->alias(MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
- MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
- return AAResult != NoAlias;
+ return !AA->isNoAlias(
+ MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
+ MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
}
void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/src/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index fae4e68..1fa559d 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -499,7 +499,7 @@
MachineBasicBlock &MBB = *(FrameSetup->getParent());
TII->setFrameAdjustment(*FrameSetup, Context.ExpectedDist);
- DebugLoc DL = FrameSetup->getDebugLoc();
+ const DebugLoc &DL = FrameSetup->getDebugLoc();
bool Is64Bit = STI->is64Bit();
// Now, iterate through the vector in reverse order, and replace the store to
// stack with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp b/src/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp
index 53f5756..c8bffb4 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp
@@ -50,61 +50,47 @@
X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
: CallLowering(&TLI) {}
-bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL,
- MachineRegisterInfo &MRI,
- SplitArgTy PerformArgSplit) const {
- const X86TargetLowering &TLI = *getTLI<X86TargetLowering>();
- LLVMContext &Context = OrigArg.Ty->getContext();
-
- SmallVector<EVT, 4> SplitVTs;
- SmallVector<uint64_t, 4> Offsets;
- ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
- assert(OrigArg.Regs.size() == 1 && "Can't handle multple regs yet");
-
- if (OrigArg.Ty->isVoidTy())
- return true;
-
- EVT VT = SplitVTs[0];
- unsigned NumParts = TLI.getNumRegisters(Context, VT);
-
- if (NumParts == 1) {
- // replace the original type ( pointer -> GPR ).
- SplitArgs.emplace_back(OrigArg.Regs[0], VT.getTypeForEVT(Context),
- OrigArg.Flags, OrigArg.IsFixed);
- return true;
- }
-
- SmallVector<Register, 8> SplitRegs;
-
- EVT PartVT = TLI.getRegisterType(Context, VT);
- Type *PartTy = PartVT.getTypeForEVT(Context);
-
- for (unsigned i = 0; i < NumParts; ++i) {
- ArgInfo Info =
- ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)),
- PartTy, OrigArg.Flags};
- SplitArgs.push_back(Info);
- SplitRegs.push_back(Info.Regs[0]);
- }
-
- PerformArgSplit(SplitRegs);
- return true;
-}
-
namespace {
+struct X86OutgoingValueAssigner : public CallLowering::OutgoingValueAssigner {
+private:
+ uint64_t StackSize = 0;
+ unsigned NumXMMRegs = 0;
+
+public:
+ uint64_t getStackSize() { return StackSize; }
+ unsigned getNumXmmRegs() { return NumXMMRegs; }
+
+ X86OutgoingValueAssigner(CCAssignFn *AssignFn_)
+ : CallLowering::OutgoingValueAssigner(AssignFn_) {}
+
+ bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
+ CCState &State) override {
+ bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
+ StackSize = State.getNextStackOffset();
+
+ static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2,
+ X86::XMM3, X86::XMM4, X86::XMM5,
+ X86::XMM6, X86::XMM7};
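+    // For SysV vararg calls the caller reports in %al how many of these XMM
+    // registers were used; lowerCall reads that count via getNumXmmRegs().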
+ if (!Info.IsFixed)
+ NumXMMRegs = State.getFirstUnallocated(XMMArgRegs);
+
+ return Res;
+ }
+};
+
struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler {
X86OutgoingValueHandler(MachineIRBuilder &MIRBuilder,
- MachineRegisterInfo &MRI, MachineInstrBuilder &MIB,
- CCAssignFn *AssignFn)
- : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
+ MachineRegisterInfo &MRI, MachineInstrBuilder &MIB)
+ : OutgoingValueHandler(MIRBuilder, MRI), MIB(MIB),
DL(MIRBuilder.getMF().getDataLayout()),
STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
- MachinePointerInfo &MPO) override {
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0));
LLT SType = LLT::scalar(DL.getPointerSizeInBits(0));
auto SPReg =
@@ -121,65 +107,24 @@
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
-
- Register ExtReg;
- // If we are copying the value to a physical register with the
- // size larger than the size of the value itself - build AnyExt
- // to the size of the register first and only then do the copy.
- // The example of that would be copying from s32 to xmm0, for which
- // case ValVT == LocVT == MVT::f32. If LocSize and ValSize are not equal
- // we expect normal extendRegister mechanism to work.
- unsigned PhysRegSize =
- MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI);
- unsigned ValSize = VA.getValVT().getSizeInBits();
- unsigned LocSize = VA.getLocVT().getSizeInBits();
- if (PhysRegSize > ValSize && LocSize == ValSize) {
- assert((PhysRegSize == 128 || PhysRegSize == 80) &&
- "We expect that to be 128 bit");
- ExtReg =
- MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg).getReg(0);
- } else
- ExtReg = extendRegister(ValVReg, VA);
-
+ Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
- void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
Register ExtReg = extendRegister(ValVReg, VA);
- auto *MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore,
- VA.getLocVT().getStoreSize(),
+ auto *MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, MemTy,
inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
- bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
- CCState &State) override {
- bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
- StackSize = State.getNextStackOffset();
-
- static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2,
- X86::XMM3, X86::XMM4, X86::XMM5,
- X86::XMM6, X86::XMM7};
- if (!Info.IsFixed)
- NumXMMRegs = State.getFirstUnallocated(XMMArgRegs);
-
- return Res;
- }
-
- uint64_t getStackSize() { return StackSize; }
- uint64_t getNumXmmRegs() { return NumXMMRegs; }
-
protected:
MachineInstrBuilder &MIB;
- uint64_t StackSize = 0;
const DataLayout &DL;
const X86Subtarget &STI;
- unsigned NumXMMRegs = 0;
};
} // end anonymous namespace
@@ -196,27 +141,18 @@
const Function &F = MF.getFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
const DataLayout &DL = MF.getDataLayout();
- LLVMContext &Ctx = Val->getType()->getContext();
- const X86TargetLowering &TLI = *getTLI<X86TargetLowering>();
- SmallVector<EVT, 4> SplitEVTs;
- ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
- assert(VRegs.size() == SplitEVTs.size() &&
- "For each split Type there should be exactly one VReg.");
+ ArgInfo OrigRetInfo(VRegs, Val->getType(), 0);
+ setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
- SmallVector<ArgInfo, 8> SplitArgs;
- for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
- ArgInfo CurArgInfo = ArgInfo{VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx)};
- setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
- if (!splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI,
- [&](ArrayRef<Register> Regs) {
- MIRBuilder.buildUnmerge(Regs, VRegs[i]);
- }))
- return false;
- }
+ SmallVector<ArgInfo, 4> SplitRetInfos;
+ splitToValueTypes(OrigRetInfo, SplitRetInfos, DL, F.getCallingConv());
- X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86);
- if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ X86OutgoingValueAssigner Assigner(RetCC_X86);
+ X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB);
+ if (!determineAndHandleAssignments(Handler, Assigner, SplitRetInfos,
+ MIRBuilder, F.getCallingConv(),
+ F.isVarArg()))
return false;
}
@@ -228,14 +164,20 @@
struct X86IncomingValueHandler : public CallLowering::IncomingValueHandler {
X86IncomingValueHandler(MachineIRBuilder &MIRBuilder,
- MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
- : IncomingValueHandler(MIRBuilder, MRI, AssignFn),
+ MachineRegisterInfo &MRI)
+ : IncomingValueHandler(MIRBuilder, MRI),
DL(MIRBuilder.getMF().getDataLayout()) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
- MachinePointerInfo &MPO) override {
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
auto &MFI = MIRBuilder.getMF().getFrameInfo();
- int FI = MFI.CreateFixedObject(Size, Offset, true);
+
+ // Byval is assumed to be writable memory, but other stack passed arguments
+ // are not.
+ const bool IsImmutable = !Flags.isByVal();
+
+ int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
return MIRBuilder
@@ -243,11 +185,11 @@
.getReg(0);
}
- void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
auto *MMO = MF.getMachineMemOperand(
- MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
+ MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemTy,
inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
@@ -255,36 +197,7 @@
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
-
- switch (VA.getLocInfo()) {
- default: {
- // If we are copying the value from a physical register with the
- // size larger than the size of the value itself - build the copy
- // of the phys reg first and then build the truncation of that copy.
- // The example of that would be copying from xmm0 to s32, for which
- // case ValVT == LocVT == MVT::f32. If LocSize and ValSize are not equal
- // we expect this to be handled in SExt/ZExt/AExt case.
- unsigned PhysRegSize =
- MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI);
- unsigned ValSize = VA.getValVT().getSizeInBits();
- unsigned LocSize = VA.getLocVT().getSizeInBits();
- if (PhysRegSize > ValSize && LocSize == ValSize) {
- auto Copy = MIRBuilder.buildCopy(LLT::scalar(PhysRegSize), PhysReg);
- MIRBuilder.buildTrunc(ValVReg, Copy);
- return;
- }
-
- MIRBuilder.buildCopy(ValVReg, PhysReg);
- break;
- }
- case CCValAssign::LocInfo::SExt:
- case CCValAssign::LocInfo::ZExt:
- case CCValAssign::LocInfo::AExt: {
- auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
- MIRBuilder.buildTrunc(ValVReg, Copy);
- break;
- }
- }
+ IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
}
/// How the physical register gets marked varies between formal
@@ -297,9 +210,8 @@
};
struct FormalArgHandler : public X86IncomingValueHandler {
- FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- CCAssignFn *AssignFn)
- : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+ : X86IncomingValueHandler(MIRBuilder, MRI) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIRBuilder.getMRI()->addLiveIn(PhysReg);
@@ -309,8 +221,8 @@
struct CallReturnHandler : public X86IncomingValueHandler {
CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- CCAssignFn *AssignFn, MachineInstrBuilder &MIB)
- : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+ MachineInstrBuilder &MIB)
+ : X86IncomingValueHandler(MIRBuilder, MRI), MIB(MIB) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIB.addDef(PhysReg, RegState::Implicit);
@@ -349,13 +261,9 @@
Arg.hasAttribute(Attribute::Nest) || VRegs[Idx].size() > 1)
return false;
- ArgInfo OrigArg(VRegs[Idx], Arg.getType());
+ ArgInfo OrigArg(VRegs[Idx], Arg.getType(), Idx);
setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);
- if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<Register> Regs) {
- MIRBuilder.buildMerge(VRegs[Idx][0], Regs);
- }))
- return false;
+ splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv());
Idx++;
}
@@ -363,8 +271,10 @@
if (!MBB.empty())
MIRBuilder.setInstr(*MBB.begin());
- FormalArgHandler Handler(MIRBuilder, MRI, CC_X86);
- if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ X86OutgoingValueAssigner Assigner(CC_X86);
+ FormalArgHandler Handler(MIRBuilder, MRI);
+ if (!determineAndHandleAssignments(Handler, Assigner, SplitArgs, MIRBuilder,
+ F.getCallingConv(), F.isVarArg()))
return false;
// Move back to the end of the basic block.
@@ -412,15 +322,13 @@
if (OrigArg.Regs.size() > 1)
return false;
- if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<Register> Regs) {
- MIRBuilder.buildUnmerge(Regs, OrigArg.Regs[0]);
- }))
- return false;
+ splitToValueTypes(OrigArg, SplitArgs, DL, Info.CallConv);
}
// Do the actual argument marshalling.
- X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, CC_X86);
- if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ X86OutgoingValueAssigner Assigner(CC_X86);
+ X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB);
+ if (!determineAndHandleAssignments(Handler, Assigner, SplitArgs, MIRBuilder,
+ Info.CallConv, Info.IsVarArg))
return false;
bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed;
@@ -435,7 +343,7 @@
MIRBuilder.buildInstr(X86::MOV8ri)
.addDef(X86::AL)
- .addImm(Handler.getNumXmmRegs());
+ .addImm(Assigner.getNumXmmRegs());
MIB.addUse(X86::AL, RegState::Implicit);
}
@@ -462,27 +370,25 @@
SplitArgs.clear();
SmallVector<Register, 8> NewRegs;
- if (!splitToValueTypes(Info.OrigRet, SplitArgs, DL, MRI,
- [&](ArrayRef<Register> Regs) {
- NewRegs.assign(Regs.begin(), Regs.end());
- }))
- return false;
+ splitToValueTypes(Info.OrigRet, SplitArgs, DL, Info.CallConv);
- CallReturnHandler Handler(MIRBuilder, MRI, RetCC_X86, MIB);
- if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ X86OutgoingValueAssigner Assigner(RetCC_X86);
+ CallReturnHandler Handler(MIRBuilder, MRI, MIB);
+ if (!determineAndHandleAssignments(Handler, Assigner, SplitArgs, MIRBuilder,
+ Info.CallConv, Info.IsVarArg))
return false;
if (!NewRegs.empty())
MIRBuilder.buildMerge(Info.OrigRet.Regs[0], NewRegs);
}
- CallSeqStart.addImm(Handler.getStackSize())
+ CallSeqStart.addImm(Assigner.getStackSize())
.addImm(0 /* see getFrameTotalSize */)
.addImm(0 /* see getFrameAdjustment */);
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
MIRBuilder.buildInstr(AdjStackUp)
- .addImm(Handler.getStackSize())
+ .addImm(Assigner.getStackSize())
.addImm(0 /* NumBytesForCalleeToPop */);
return true;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86CallLowering.h b/src/llvm-project/llvm/lib/Target/X86/X86CallLowering.h
index 9390122..ac5b92b 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86CallLowering.h
+++ b/src/llvm-project/llvm/lib/Target/X86/X86CallLowering.h
@@ -38,15 +38,6 @@
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;
-
-private:
- /// A function of this type is used to perform value split action.
- using SplitArgTy = std::function<void(ArrayRef<Register>)>;
-
- bool splitToValueTypes(const ArgInfo &OrigArgInfo,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, MachineRegisterInfo &MRI,
- SplitArgTy SplitArg) const;
};
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86CallingConv.td b/src/llvm-project/llvm/lib/Target/X86/X86CallingConv.td
index 3735fab..98883bb 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86CallingConv.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86CallingConv.td
@@ -462,6 +462,7 @@
// Handle Swift calls.
CCIfCC<"CallingConv::Swift", CCDelegateTo<RetCC_X86_64_Swift>>,
+ CCIfCC<"CallingConv::SwiftTail", CCDelegateTo<RetCC_X86_64_Swift>>,
// Handle explicit CC selection
CCIfCC<"CallingConv::Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
@@ -517,9 +518,15 @@
// A SwiftError is passed in R12.
CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
- // For Swift Calling Convention, pass sret in %rax.
+ // Pass SwiftAsync in an otherwise callee saved register so that calls to
+ // normal functions don't need to save it somewhere.
+ CCIfSwiftAsync<CCIfType<[i64], CCAssignToReg<[R14]>>>,
+
+ // For Swift Calling Conventions, pass sret in %rax.
CCIfCC<"CallingConv::Swift",
CCIfSRet<CCIfType<[i64], CCAssignToReg<[RAX]>>>>,
+ CCIfCC<"CallingConv::SwiftTail",
+ CCIfSRet<CCIfType<[i64], CCAssignToReg<[RAX]>>>>,
// Pointers are always passed in full 64-bit registers.
CCIfPtr<CCCustom<"CC_X86_64_Pointer">>,
@@ -617,6 +624,13 @@
// A SwiftError is passed in R12.
CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R13]>>>,
+
+ // Pass SwiftAsync in an otherwise callee saved register so that calls to
+ // normal functions don't need to save it somewhere.
+ CCIfSwiftAsync<CCIfType<[i64], CCAssignToReg<[R14]>>>,
+
// The 'CFGuardTarget' parameter, if any, is passed in RAX.
CCIfCFGuardTarget<CCAssignToReg<[RAX]>>,
@@ -848,6 +862,10 @@
// The 'nest' parameter, if any, is passed in ECX.
CCIfNest<CCAssignToReg<[ECX]>>,
+ // On swifttailcc pass swiftself in ECX.
+ CCIfCC<"CallingConv::SwiftTail",
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[ECX]>>>>,
+
// The first 3 integer arguments, if marked 'inreg' and if the call is not
// a vararg call, are passed in integer registers.
CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
@@ -1077,6 +1095,7 @@
def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>;
def CSR_64_SwiftError : CalleeSavedRegs<(sub CSR_64, R12)>;
+def CSR_64_SwiftTail : CalleeSavedRegs<(sub CSR_64, R13, R14)>;
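+// swifttailcc passes swiftself in R13 and swiftasync in R14, so these are not
+// treated as callee saved here.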
def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>;
def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
@@ -1087,6 +1106,7 @@
(sequence "XMM%u", 6, 15))>;
def CSR_Win64_SwiftError : CalleeSavedRegs<(sub CSR_Win64, R12)>;
+def CSR_Win64_SwiftTail : CalleeSavedRegs<(sub CSR_Win64, R13, R14)>;
// The function used by Darwin to obtain the address of a thread-local variable
// uses rdi to pass a single parameter and rax for the return value. All other
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp b/src/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp
index a2de0dc..05349a7 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp
@@ -115,6 +115,7 @@
MachineRegisterInfo *MRI = nullptr;
const TargetInstrInfo *TII = nullptr;
const TargetRegisterInfo *TRI = nullptr;
+ MachineLoopInfo *MLI = nullptr;
TargetSchedModel TSchedModel;
/// List of consecutive CMOV instructions.
@@ -165,7 +166,7 @@
<< "**********\n");
bool Changed = false;
- MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ MLI = &getAnalysis<MachineLoopInfo>();
const TargetSubtargetInfo &STI = MF.getSubtarget();
MRI = &MF.getRegInfo();
TII = STI.getInstrInfo();
@@ -221,7 +222,7 @@
//===--------------------------------------------------------------------===//
// Build up the loops in pre-order.
- SmallVector<MachineLoop *, 4> Loops(MLI.begin(), MLI.end());
+ SmallVector<MachineLoop *, 4> Loops(MLI->begin(), MLI->end());
// Note that we need to check size on each iteration as we accumulate child
// loops.
for (int i = 0; i < (int)Loops.size(); ++i)
@@ -848,6 +849,12 @@
// Now remove the CMOV(s).
MBB->erase(MIItBegin, MIItEnd);
+
+ // Add new basic blocks to MachineLoopInfo.
+ if (MachineLoop *L = MLI->getLoopFor(MBB)) {
+ L->addBasicBlockToLoop(FalseMBB, MLI->getBase());
+ L->addBasicBlockToLoop(SinkMBB, MLI->getBase());
+ }
}
INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp b/src/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp
index 97f843f..c7a013a 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -151,24 +151,6 @@
(void)NewOpc;
unsigned Opc = MI.getOpcode();
switch (Opc) {
- case X86::VPDPBUSDSZ256m:
- case X86::VPDPBUSDSZ256r:
- case X86::VPDPBUSDSZ128m:
- case X86::VPDPBUSDSZ128r:
- case X86::VPDPBUSDZ256m:
- case X86::VPDPBUSDZ256r:
- case X86::VPDPBUSDZ128m:
- case X86::VPDPBUSDZ128r:
- case X86::VPDPWSSDSZ256m:
- case X86::VPDPWSSDSZ256r:
- case X86::VPDPWSSDSZ128m:
- case X86::VPDPWSSDSZ128r:
- case X86::VPDPWSSDZ256m:
- case X86::VPDPWSSDZ256r:
- case X86::VPDPWSSDZ128m:
- case X86::VPDPWSSDZ128r:
- // These can only VEX convert if AVXVNNI is enabled.
- return ST->hasAVXVNNI();
case X86::VALIGNDZ128rri:
case X86::VALIGNDZ128rmi:
case X86::VALIGNQZ128rri:
@@ -280,6 +262,9 @@
if (usesExtendedRegister(MI))
return false;
+ if (!CheckVEXInstPredicate(MI, ST))
+ return false;
+
if (!performCustomAdjustments(MI, NewOpc, ST))
return false;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/src/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 15af0fb..4add8d3 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved.
#include "llvm/IR/GlobalValue.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
#define DEBUG_TYPE "x86-pseudo"
@@ -61,9 +62,22 @@
private:
void ExpandICallBranchFunnel(MachineBasicBlock *MBB,
MachineBasicBlock::iterator MBBI);
-
+ void expandCALL_RVMARKER(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
bool ExpandMBB(MachineBasicBlock &MBB);
+
+  /// This function expands pseudos which affect control flow.
+  /// It is done in a separate pass to simplify block navigation in the main
+  /// pass (calling ExpandMBB).
+ bool ExpandPseudosWhichAffectControlFlow(MachineFunction &MF);
+
+  /// Expand X86::VASTART_SAVE_XMM_REGS into a set of XMM copying instructions,
+  /// placed into a separate block guarded by a check of the %al register (for
+  /// the SysV ABI).
+ void ExpandVastartSaveXmmRegs(
+ MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator VAStartPseudoInstr) const;
};
char X86ExpandPseudo::ID = 0;
@@ -82,7 +96,7 @@
++InsPt;
std::vector<std::pair<MachineBasicBlock *, unsigned>> TargetMBBs;
- DebugLoc DL = JTInst->getDebugLoc();
+ const DebugLoc &DL = JTInst->getDebugLoc();
MachineOperand Selector = JTInst->getOperand(0);
const GlobalValue *CombinedGlobal = JTInst->getOperand(1).getGlobal();
@@ -173,6 +187,77 @@
JTMBB->erase(JTInst);
}
+void X86ExpandPseudo::expandCALL_RVMARKER(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+  // Expand the CALL_RVMARKER pseudo to a call instruction, followed by the
+  // special "movq %rax, %rdi" marker.
+  // TODO: Mark the sequence as a bundle, to avoid passes moving other code
+  // in between.
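+  // The emitted sequence is roughly (illustrative, with a placeholder callee):
+  //   callq _foo
+  //   movq  %rax, %rdi
+  //   callq _objc_retainAutoreleasedReturnValue   (or the unsafeClaim variant)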
+ MachineInstr &MI = *MBBI;
+
+ MachineInstr *OriginalCall;
+ assert((MI.getOperand(1).isGlobal() || MI.getOperand(1).isReg()) &&
+ "invalid operand for regular call");
+ unsigned Opc = -1;
+ if (MI.getOpcode() == X86::CALL64m_RVMARKER)
+ Opc = X86::CALL64m;
+ else if (MI.getOpcode() == X86::CALL64r_RVMARKER)
+ Opc = X86::CALL64r;
+ else if (MI.getOpcode() == X86::CALL64pcrel32_RVMARKER)
+ Opc = X86::CALL64pcrel32;
+ else
+ llvm_unreachable("unexpected opcode");
+
+ OriginalCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr();
+ unsigned OpStart = 1;
+ bool RAXImplicitDead = false;
+ for (; OpStart < MI.getNumOperands(); ++OpStart) {
+ MachineOperand &Op = MI.getOperand(OpStart);
+ // RAX may be 'implicit dead', if there are no other users of the return
+ // value. We introduce a new use, so change it to 'implicit def'.
+ if (Op.isReg() && Op.isImplicit() && Op.isDead() &&
+ TRI->regsOverlap(Op.getReg(), X86::RAX)) {
+ Op.setIsDead(false);
+ Op.setIsDef(true);
+ RAXImplicitDead = true;
+ }
+ OriginalCall->addOperand(Op);
+ }
+
+ // Emit marker "movq %rax, %rdi". %rdi is not callee-saved, so it cannot be
+ // live across the earlier call. The call to the ObjC runtime function returns
+ // the first argument, so the value of %rax is unchanged after the ObjC
+ // runtime call.
+ auto *Marker = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(X86::MOV64rr))
+ .addReg(X86::RDI, RegState::Define)
+ .addReg(X86::RAX)
+ .getInstr();
+ if (MI.shouldUpdateCallSiteInfo())
+ MBB.getParent()->moveCallSiteInfo(&MI, Marker);
+
+ // Emit call to ObjC runtime.
+ unsigned RuntimeCallType = MI.getOperand(0).getImm();
+ assert(RuntimeCallType <= 1 && "objc runtime call type must be 0 or 1");
+ Module *M = MBB.getParent()->getFunction().getParent();
+ auto &Context = M->getContext();
+ auto *I8PtrTy = PointerType::get(IntegerType::get(Context, 8), 0);
+ FunctionCallee Fn = M->getOrInsertFunction(
+ RuntimeCallType == 0 ? "objc_retainAutoreleasedReturnValue"
+ : "objc_unsafeClaimAutoreleasedReturnValue",
+ FunctionType::get(I8PtrTy, {I8PtrTy}, false));
+ const uint32_t *RegMask =
+ TRI->getCallPreservedMask(*MBB.getParent(), CallingConv::C);
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(X86::CALL64pcrel32))
+ .addGlobalAddress(cast<GlobalValue>(Fn.getCallee()), 0, 0)
+ .addRegMask(RegMask)
+ .addReg(X86::RAX,
+ RegState::Implicit |
+ (RAXImplicitDead ? (RegState::Dead | RegState::Define)
+ : RegState::Define))
+ .getInstr();
+ MI.eraseFromParent();
+}
+
/// If \p MBBI is a pseudo instruction, this method expands
/// it to the corresponding (sequence of) actual instruction(s).
/// \returns true if \p MBBI has been expanded.
@@ -180,7 +265,7 @@
MachineBasicBlock::iterator MBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
- DebugLoc DL = MBBI->getDebugLoc();
+ const DebugLoc &DL = MBBI->getDebugLoc();
switch (Opcode) {
default:
return false;
@@ -303,8 +388,12 @@
int64_t StackAdj = MBBI->getOperand(0).getImm();
X86FL->emitSPUpdate(MBB, MBBI, DL, StackAdj, true);
// Replace pseudo with machine iret
- BuildMI(MBB, MBBI, DL,
- TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32));
+ unsigned RetOp = STI->is64Bit() ? X86::IRET64 : X86::IRET32;
+ // Use UIRET if UINTR is present (except for building kernel)
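+      // (user-level interrupt handlers return with UIRET rather than IRET).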
+ if (STI->is64Bit() && STI->hasUINTR() &&
+ MBB.getParent()->getTarget().getCodeModel() != CodeModel::Kernel)
+ RetOp = X86::UIRET;
+ BuildMI(MBB, MBBI, DL, TII->get(RetOp));
MBB.erase(MBBI);
return true;
}
@@ -461,49 +550,170 @@
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
- case X86::PLDTILECFG: {
- MI.RemoveOperand(0);
+ case X86::PLDTILECFGV: {
MI.setDesc(TII->get(X86::LDTILECFG));
return true;
}
- case X86::PSTTILECFG: {
- MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
- MI.setDesc(TII->get(X86::STTILECFG));
- return true;
- }
- case X86::PTILELOADDV: {
- MI.RemoveOperand(8); // Remove $tmmcfg
+ case X86::PTILELOADDV:
+ case X86::PTILELOADDT1V: {
for (unsigned i = 2; i > 0; --i)
MI.RemoveOperand(i);
- MI.setDesc(TII->get(X86::TILELOADD));
+ unsigned Opc =
+ Opcode == X86::PTILELOADDV ? X86::TILELOADD : X86::TILELOADDT1;
+ MI.setDesc(TII->get(Opc));
return true;
}
- case X86::PTDPBSSDV: {
- MI.RemoveOperand(7); // Remove $tmmcfg
+ case X86::PTDPBSSDV:
+ case X86::PTDPBSUDV:
+ case X86::PTDPBUSDV:
+ case X86::PTDPBUUDV:
+ case X86::PTDPBF16PSV: {
MI.untieRegOperand(4);
for (unsigned i = 3; i > 0; --i)
MI.RemoveOperand(i);
- MI.setDesc(TII->get(X86::TDPBSSD));
+ unsigned Opc;
+ switch (Opcode) {
+ case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break;
+ case X86::PTDPBSUDV: Opc = X86::TDPBSUD; break;
+ case X86::PTDPBUSDV: Opc = X86::TDPBUSD; break;
+ case X86::PTDPBUUDV: Opc = X86::TDPBUUD; break;
+ case X86::PTDPBF16PSV: Opc = X86::TDPBF16PS; break;
+ default: llvm_unreachable("Impossible Opcode!");
+ }
+ MI.setDesc(TII->get(Opc));
MI.tieOperands(0, 1);
return true;
}
case X86::PTILESTOREDV: {
- MI.RemoveOperand(8); // Remove $tmmcfg
for (int i = 1; i >= 0; --i)
MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TILESTORED));
return true;
}
case X86::PTILEZEROV: {
- for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
+ for (int i = 2; i > 0; --i) // Remove row, col
MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TILEZERO));
return true;
}
+ case X86::CALL64pcrel32_RVMARKER:
+ case X86::CALL64r_RVMARKER:
+ case X86::CALL64m_RVMARKER:
+ expandCALL_RVMARKER(MBB, MBBI);
+ return true;
}
llvm_unreachable("Previous switch has a fallthrough?");
}
+// This function creates an additional block for storing the varargs guarded
+// registers. It adds a check of %al to the entry block, to skip
+// GuardedRegsBlk if the XMM registers should not be stored.
+//
+// EntryBlk[VAStartPseudoInstr] EntryBlk
+// | | .
+// | | .
+// | | GuardedRegsBlk
+// | => | .
+// | | .
+// | TailBlk
+// | |
+// | |
+//
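+//
+// For non-Win64 calling conventions the guard inserted at the end of EntryBlk
+// is roughly:
+//   testb %al, %al
+//   je    TailBlk      ; skip GuardedRegsBlk when no vector args were passed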
+void X86ExpandPseudo::ExpandVastartSaveXmmRegs(
+ MachineBasicBlock *EntryBlk,
+ MachineBasicBlock::iterator VAStartPseudoInstr) const {
+ assert(VAStartPseudoInstr->getOpcode() == X86::VASTART_SAVE_XMM_REGS);
+
+ MachineFunction *Func = EntryBlk->getParent();
+ const TargetInstrInfo *TII = STI->getInstrInfo();
+ const DebugLoc &DL = VAStartPseudoInstr->getDebugLoc();
+ Register CountReg = VAStartPseudoInstr->getOperand(0).getReg();
+
+ // Calculate liveins for newly created blocks.
+ LivePhysRegs LiveRegs(*STI->getRegisterInfo());
+ SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
+
+ LiveRegs.addLiveIns(*EntryBlk);
+ for (MachineInstr &MI : EntryBlk->instrs()) {
+ if (MI.getOpcode() == VAStartPseudoInstr->getOpcode())
+ break;
+
+ LiveRegs.stepForward(MI, Clobbers);
+ }
+
+ // Create the new basic blocks. One block contains all the XMM stores,
+ // and another block is the final destination regardless of whether any
+ // stores were performed.
+ const BasicBlock *LLVMBlk = EntryBlk->getBasicBlock();
+ MachineFunction::iterator EntryBlkIter = ++EntryBlk->getIterator();
+ MachineBasicBlock *GuardedRegsBlk = Func->CreateMachineBasicBlock(LLVMBlk);
+ MachineBasicBlock *TailBlk = Func->CreateMachineBasicBlock(LLVMBlk);
+ Func->insert(EntryBlkIter, GuardedRegsBlk);
+ Func->insert(EntryBlkIter, TailBlk);
+
+ // Transfer the remainder of EntryBlk and its successor edges to TailBlk.
+ TailBlk->splice(TailBlk->begin(), EntryBlk,
+ std::next(MachineBasicBlock::iterator(VAStartPseudoInstr)),
+ EntryBlk->end());
+ TailBlk->transferSuccessorsAndUpdatePHIs(EntryBlk);
+
+ int64_t FrameIndex = VAStartPseudoInstr->getOperand(1).getImm();
+ Register BaseReg;
+ uint64_t FrameOffset =
+ X86FL->getFrameIndexReference(*Func, FrameIndex, BaseReg).getFixed();
+ uint64_t VarArgsRegsOffset = VAStartPseudoInstr->getOperand(2).getImm();
+
+ // TODO: add support for YMM and ZMM here.
+ unsigned MOVOpc = STI->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+
+ // In the XMM save block, save all the XMM argument registers.
+ for (int64_t OpndIdx = 3, RegIdx = 0;
+ OpndIdx < VAStartPseudoInstr->getNumOperands() - 1;
+ OpndIdx++, RegIdx++) {
+
+ int64_t Offset = FrameOffset + VarArgsRegsOffset + RegIdx * 16;
+
+ MachineMemOperand *MMO = Func->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*Func, FrameIndex, Offset),
+ MachineMemOperand::MOStore,
+ /*Size=*/16, Align(16));
+
+ BuildMI(GuardedRegsBlk, DL, TII->get(MOVOpc))
+ .addReg(BaseReg)
+ .addImm(/*Scale=*/1)
+ .addReg(/*IndexReg=*/0)
+ .addImm(/*Disp=*/Offset)
+ .addReg(/*Segment=*/0)
+ .addReg(VAStartPseudoInstr->getOperand(OpndIdx).getReg())
+ .addMemOperand(MMO);
+ assert(Register::isPhysicalRegister(
+ VAStartPseudoInstr->getOperand(OpndIdx).getReg()));
+ }
+
+ // The original block will now fall through to the GuardedRegsBlk.
+ EntryBlk->addSuccessor(GuardedRegsBlk);
+ // The GuardedRegsBlk will fall through to the TailBlk.
+ GuardedRegsBlk->addSuccessor(TailBlk);
+
+ if (!STI->isCallingConvWin64(Func->getFunction().getCallingConv())) {
+ // If %al is 0, branch around the XMM save block.
+ BuildMI(EntryBlk, DL, TII->get(X86::TEST8rr))
+ .addReg(CountReg)
+ .addReg(CountReg);
+ BuildMI(EntryBlk, DL, TII->get(X86::JCC_1))
+ .addMBB(TailBlk)
+ .addImm(X86::COND_E);
+ EntryBlk->addSuccessor(TailBlk);
+ }
+
+ // Add liveins to the created block.
+ addLiveIns(*GuardedRegsBlk, LiveRegs);
+ addLiveIns(*TailBlk, LiveRegs);
+
+ // Delete the pseudo.
+ VAStartPseudoInstr->eraseFromParent();
+}
+
/// Expand all pseudo instructions contained in \p MBB.
/// \returns true if any expansion occurred for \p MBB.
bool X86ExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
@@ -520,6 +730,20 @@
return Modified;
}
+bool X86ExpandPseudo::ExpandPseudosWhichAffectControlFlow(MachineFunction &MF) {
+  // Currently the only pseudo which affects control flow is
+  // X86::VASTART_SAVE_XMM_REGS, which is located in the entry block,
+  // so we do not need to evaluate other blocks.
+ for (MachineInstr &Instr : MF.front().instrs()) {
+ if (Instr.getOpcode() == X86::VASTART_SAVE_XMM_REGS) {
+ ExpandVastartSaveXmmRegs(&(MF.front()), Instr);
+ return true;
+ }
+ }
+
+ return false;
+}
+
bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
STI = &static_cast<const X86Subtarget &>(MF.getSubtarget());
TII = STI->getInstrInfo();
@@ -527,7 +751,8 @@
X86FI = MF.getInfo<X86MachineFunctionInfo>();
X86FL = STI->getFrameLowering();
- bool Modified = false;
+ bool Modified = ExpandPseudosWhichAffectControlFlow(MF);
+
for (MachineBasicBlock &MBB : MF)
Modified |= ExpandMBB(MBB);
return Modified;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp b/src/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
index a1a16a1..bb95ed3 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
@@ -89,8 +89,7 @@
bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
MachineMemOperand *MMO = nullptr, bool Aligned = false);
- bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
- X86AddressMode &AM,
+ bool X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
MachineMemOperand *MMO = nullptr, bool Aligned = false);
bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
@@ -176,9 +175,7 @@
unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
- bool Op0IsKill, unsigned Op1, bool Op1IsKill,
- unsigned Op2, bool Op2IsKill, unsigned Op3,
- bool Op3IsKill);
+ unsigned Op1, unsigned Op2, unsigned Op3);
};
} // end anonymous namespace.
@@ -487,8 +484,7 @@
/// type VT. The address is either pre-computed, consisting of a base ptr, Ptr,
/// and a displacement offset, or a GlobalAddress,
/// i.e. V. Return true if it is possible.
-bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
- X86AddressMode &AM,
+bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
MachineMemOperand *MMO, bool Aligned) {
bool HasSSE1 = Subtarget->hasSSE1();
bool HasSSE2 = Subtarget->hasSSE2();
@@ -508,7 +504,7 @@
Register AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::AND8ri), AndResult)
- .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
+ .addReg(ValReg).addImm(1);
ValReg = AndResult;
LLVM_FALLTHROUGH; // handle i1 as i8.
}
@@ -654,7 +650,7 @@
ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1);
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc);
- addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
+ addFullAddress(MIB, AM).addReg(ValReg);
if (MMO)
MIB->addMemOperand(*FuncInfo.MF, MMO);
@@ -702,8 +698,7 @@
if (ValReg == 0)
return false;
- bool ValKill = hasTrivialKill(Val);
- return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
+ return X86FastEmitStore(VT, ValReg, AM, MMO, Aligned);
}
/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
@@ -712,8 +707,7 @@
bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
unsigned Src, EVT SrcVT,
unsigned &ResultReg) {
- unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
- Src, /*TODO: Kill=*/false);
+ unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src);
if (RR == 0)
return false;
@@ -945,7 +939,7 @@
(S == 1 || S == 2 || S == 4 || S == 8)) {
// Scaled-index addressing.
Scale = S;
- IndexReg = getRegForGEPIndex(Op).first;
+ IndexReg = getRegForGEPIndex(Op);
if (IndexReg == 0)
return false;
break;
@@ -1189,6 +1183,7 @@
if (CC != CallingConv::C &&
CC != CallingConv::Fast &&
CC != CallingConv::Tail &&
+ CC != CallingConv::SwiftTail &&
CC != CallingConv::X86_FastCall &&
CC != CallingConv::X86_StdCall &&
CC != CallingConv::X86_ThisCall &&
@@ -1203,7 +1198,7 @@
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. Fastisel doesn't know how to do that.
if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
- CC == CallingConv::Tail)
+ CC == CallingConv::Tail || CC == CallingConv::SwiftTail)
return false;
// Let SDISel handle vararg functions.
@@ -1262,14 +1257,13 @@
if (Outs[0].Flags.isSExt())
return false;
// TODO
- SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*Op0IsKill=*/false);
+ SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg);
SrcVT = MVT::i8;
}
unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
ISD::SIGN_EXTEND;
// TODO
- SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg,
- /*Op0IsKill=*/false);
+ SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg);
}
// Make the copy.
@@ -1292,7 +1286,8 @@
// the sret argument into %rax/%eax (depending on ABI) for the return.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
- if (F.hasStructRetAttr() && CC != CallingConv::Swift) {
+ if (F.hasStructRetAttr() && CC != CallingConv::Swift &&
+ CC != CallingConv::SwiftTail) {
Register Reg = X86MFInfo->getSRetReturnReg();
assert(Reg &&
"SRetReturnReg should have been set in LowerFormalArguments()!");
@@ -1454,6 +1449,10 @@
if (!isTypeLegal(I->getOperand(0)->getType(), VT))
return false;
+  // The code below only works for scalars.
+ if (VT.isVector())
+ return false;
+
// Try to optimize or fold the cmp.
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
unsigned ResultReg = 0;
@@ -1463,8 +1462,7 @@
ResultReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
ResultReg);
- ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg,
- /*Op0IsKill=*/true, X86::sub_8bit);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, X86::sub_8bit);
if (!ResultReg)
return false;
break;
@@ -1554,7 +1552,7 @@
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
if (SrcVT == MVT::i1) {
// Set the high bits to zero.
- ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+ ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg);
SrcVT = MVT::i8;
if (ResultReg == 0)
@@ -1587,11 +1585,10 @@
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8),
Result32).addReg(ResultReg);
- ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32,
- /*Op0IsKill=*/true, X86::sub_16bit);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, X86::sub_16bit);
} else if (DstVT != MVT::i8) {
ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
- ResultReg, /*Op0IsKill=*/true);
+ ResultReg);
if (ResultReg == 0)
return false;
}
@@ -1613,8 +1610,7 @@
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
if (SrcVT == MVT::i1) {
// Set the high bits to zero.
- Register ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg,
- /*TODO: Kill=*/false);
+ Register ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg);
if (ZExtReg == 0)
return false;
@@ -1633,11 +1629,10 @@
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8),
Result32).addReg(ResultReg);
- ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32,
- /*Op0IsKill=*/true, X86::sub_16bit);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, X86::sub_16bit);
} else if (DstVT != MVT::i8) {
ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND,
- ResultReg, /*Op0IsKill=*/true);
+ ResultReg);
if (ResultReg == 0)
return false;
}
@@ -1789,8 +1784,7 @@
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), OpReg)
.addReg(KOpReg);
- OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Op0IsKill=*/true,
- X86::sub_8bit);
+ OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(OpReg)
@@ -2021,7 +2015,7 @@
// Now reference the 8-bit subreg of the result.
ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
- /*Op0IsKill=*/true, X86::sub_8bit);
+ X86::sub_8bit);
}
// Copy the result out of the physreg if we haven't already.
if (!ResultReg) {
@@ -2126,7 +2120,6 @@
Register CondReg = getRegForValue(Cond);
if (CondReg == 0)
return false;
- bool CondIsKill = hasTrivialKill(Cond);
// In case OpReg is a K register, COPY to a GPR
if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
@@ -2134,12 +2127,11 @@
CondReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CondReg)
- .addReg(KCondReg, getKillRegState(CondIsKill));
- CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true,
- X86::sub_8bit);
+ .addReg(KCondReg);
+ CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
- .addReg(CondReg, getKillRegState(CondIsKill))
+ .addReg(CondReg)
.addImm(1);
}
@@ -2147,18 +2139,13 @@
const Value *RHS = I->getOperand(2);
Register RHSReg = getRegForValue(RHS);
- bool RHSIsKill = hasTrivialKill(RHS);
-
Register LHSReg = getRegForValue(LHS);
- bool LHSIsKill = hasTrivialKill(LHS);
-
if (!LHSReg || !RHSReg)
return false;
const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8);
- Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill,
- LHSReg, LHSIsKill, CC);
+ Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, LHSReg, CC);
updateValueMap(I, ResultReg);
return true;
}
@@ -2207,17 +2194,9 @@
const Value *RHS = I->getOperand(2);
Register LHSReg = getRegForValue(LHS);
- bool LHSIsKill = hasTrivialKill(LHS);
-
Register RHSReg = getRegForValue(RHS);
- bool RHSIsKill = hasTrivialKill(RHS);
-
Register CmpLHSReg = getRegForValue(CmpLHS);
- bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
-
Register CmpRHSReg = getRegForValue(CmpRHS);
- bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
-
if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
return false;
@@ -2231,8 +2210,8 @@
unsigned CmpOpcode =
(RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr;
- Register CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
- CmpRHSReg, CmpRHSIsKill, CC);
+ Register CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpRHSReg,
+ CC);
// Need an IMPLICIT_DEF for the input that is used to generate the upper
// bits of the result register since its not based on any of the inputs.
@@ -2244,9 +2223,8 @@
// LHS in the input. The mask input comes from the compare.
unsigned MovOpcode =
(RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
- unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, RHSIsKill,
- CmpReg, true, ImplicitDefReg, true,
- LHSReg, LHSIsKill);
+ unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, CmpReg,
+ ImplicitDefReg, LHSReg);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -2265,10 +2243,10 @@
unsigned BlendOpcode =
(RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
- Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
- CmpRHSReg, CmpRHSIsKill, CC);
- Register VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
- LHSReg, LHSIsKill, CmpReg, true);
+ Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpRHSReg,
+ CC);
+ Register VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, LHSReg,
+ CmpReg);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
@@ -2287,14 +2265,10 @@
}
const TargetRegisterClass *VR128 = &X86::VR128RegClass;
- Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
- CmpRHSReg, CmpRHSIsKill, CC);
- Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg,
- /*Op0IsKill=*/false, LHSReg, LHSIsKill);
- Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg,
- /*Op0IsKill=*/true, RHSReg, RHSIsKill);
- Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*Op0IsKill=*/true,
- AndReg, /*Op1IsKill=*/true);
+ Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpRHSReg, CC);
+ Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, LHSReg);
+ Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, RHSReg);
+ Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, AndReg);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
@@ -2344,7 +2318,6 @@
Register CondReg = getRegForValue(Cond);
if (CondReg == 0)
return false;
- bool CondIsKill = hasTrivialKill(Cond);
// In case OpReg is a K register, COPY to a GPR
if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
@@ -2352,12 +2325,11 @@
CondReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CondReg)
- .addReg(KCondReg, getKillRegState(CondIsKill));
- CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true,
- X86::sub_8bit);
+ .addReg(KCondReg);
+ CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
- .addReg(CondReg, getKillRegState(CondIsKill))
+ .addReg(CondReg)
.addImm(1);
}
@@ -2365,18 +2337,14 @@
const Value *RHS = I->getOperand(2);
Register LHSReg = getRegForValue(LHS);
- bool LHSIsKill = hasTrivialKill(LHS);
-
Register RHSReg = getRegForValue(RHS);
- bool RHSIsKill = hasTrivialKill(RHS);
-
if (!LHSReg || !RHSReg)
return false;
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
Register ResultReg =
- fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
+ fastEmitInst_rri(Opc, RC, RHSReg, LHSReg, CC);
updateValueMap(I, ResultReg);
return true;
}
@@ -2400,12 +2368,11 @@
Register OpReg = getRegForValue(Opnd);
if (OpReg == 0)
return false;
- bool OpIsKill = hasTrivialKill(Opnd);
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(OpReg, getKillRegState(OpIsKill));
+ .addReg(OpReg);
updateValueMap(I, ResultReg);
return true;
}
@@ -2475,8 +2442,7 @@
Register ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
- Register ResultReg =
- fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false);
+ Register ResultReg = fastEmitInst_rr(Opcode, RC, ImplicitDefReg, OpReg);
updateValueMap(I, ResultReg);
return true;
}
@@ -2573,8 +2539,7 @@
}
// Issue an extract_subreg.
- Register ResultReg = fastEmitInst_extractsubreg(MVT::i8,
- InputReg, false,
+ Register ResultReg = fastEmitInst_extractsubreg(MVT::i8, InputReg,
X86::sub_8bit);
if (!ResultReg)
return false;
@@ -2610,8 +2575,9 @@
unsigned Reg;
bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
- RV &= X86FastEmitStore(VT, Reg, /*ValIsKill=*/true, DestAM);
+ RV &= X86FastEmitStore(VT, Reg, DestAM);
assert(RV && "Failed to emit load or store??");
+ (void)RV;
unsigned Size = VT.getSizeInBits()/8;
Len -= Size;
@@ -2658,7 +2624,7 @@
// controlled by MXCSR.
unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPS2PHZ128rr
: X86::VCVTPS2PHrr;
- InputReg = fastEmitInst_ri(Opc, RC, InputReg, false, 4);
+ InputReg = fastEmitInst_ri(Opc, RC, InputReg, 4);
// Move the lower 32-bits of ResultReg to another register of class GR32.
Opc = Subtarget->hasAVX512() ? X86::VMOVPDI2DIZrr
@@ -2669,20 +2635,19 @@
// The result value is in the lower 16-bits of ResultReg.
unsigned RegIdx = X86::sub_16bit;
- ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, RegIdx);
} else {
assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
// Explicitly zero-extend the input to 32-bit.
- InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg,
- /*Op0IsKill=*/false);
+ InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg);
// The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
- InputReg, /*Op0IsKill=*/true);
+ InputReg);
unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr
: X86::VCVTPH2PSrr;
- InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Op0IsKill=*/true);
+ InputReg = fastEmitInst_r(Opc, RC, InputReg);
// The result value is in the lower 32-bits of ResultReg.
// Emit an explicit copy from register class VR128 to register class FR32.
@@ -2933,7 +2898,6 @@
Register LHSReg = getRegForValue(LHS);
if (LHSReg == 0)
return false;
- bool LHSIsKill = hasTrivialKill(LHS);
unsigned ResultReg = 0;
// Check if we have an immediate version.
@@ -2950,21 +2914,17 @@
bool IsDec = BaseOpc == ISD::SUB;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
- .addReg(LHSReg, getKillRegState(LHSIsKill));
+ .addReg(LHSReg);
} else
- ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
- CI->getZExtValue());
+ ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, CI->getZExtValue());
}
unsigned RHSReg;
- bool RHSIsKill;
if (!ResultReg) {
RHSReg = getRegForValue(RHS);
if (RHSReg == 0)
return false;
- RHSIsKill = hasTrivialKill(RHS);
- ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill);
+ ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, RHSReg);
}
// FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
@@ -2977,9 +2937,9 @@
// the X86::MUL*r instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
- .addReg(LHSReg, getKillRegState(LHSIsKill));
+ .addReg(LHSReg);
ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
- TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
+ TLI.getRegClassFor(VT), RHSReg);
} else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
static const uint16_t MULOpc[] =
{ X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
@@ -2988,13 +2948,11 @@
// X86::IMUL8r instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), X86::AL)
- .addReg(LHSReg, getKillRegState(LHSIsKill));
- ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
- RHSIsKill);
+ .addReg(LHSReg);
+ ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg);
} else
ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
- TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
- RHSReg, RHSIsKill);
+ TLI.getRegClassFor(VT), LHSReg, RHSReg);
}
if (!ResultReg)
@@ -3111,6 +3069,7 @@
Arg.hasAttribute(Attribute::InReg) ||
Arg.hasAttribute(Attribute::StructRet) ||
Arg.hasAttribute(Attribute::SwiftSelf) ||
+ Arg.hasAttribute(Attribute::SwiftAsync) ||
Arg.hasAttribute(Attribute::SwiftError) ||
Arg.hasAttribute(Attribute::Nest))
return false;
@@ -3187,7 +3146,8 @@
if (Subtarget->getTargetTriple().isOSMSVCRT())
return 0;
if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
- CC == CallingConv::HiPE || CC == CallingConv::Tail)
+ CC == CallingConv::HiPE || CC == CallingConv::Tail ||
+ CC == CallingConv::SwiftTail)
return 0;
if (CB)
@@ -3208,23 +3168,23 @@
bool &IsTailCall = CLI.IsTailCall;
bool IsVarArg = CLI.IsVarArg;
const Value *Callee = CLI.Callee;
- MCSymbol *Symbol = CLI.Symbol;
+ MCSymbol *Symbol = CLI.Symbol;
+ const auto *CB = CLI.CB;
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isCallingConvWin64(CC);
- const CallInst *CI = dyn_cast_or_null<CallInst>(CLI.CB);
- const Function *CalledFn = CI ? CI->getCalledFunction() : nullptr;
-
// Call / invoke instructions with NoCfCheck attribute require special
// handling.
- const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
- if ((CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()))
+ if (CB && CB->doesNoCfCheck())
return false;
// Functions with no_caller_saved_registers that need special handling.
- if ((CI && CI->hasFnAttr("no_caller_saved_registers")) ||
- (CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers")))
+ if ((CB && isa<CallInst>(CB) && CB->hasFnAttr("no_caller_saved_registers")))
+ return false;
+
+ // Functions with no_callee_saved_registers that need special handling.
+ if ((CB && CB->hasFnAttr("no_callee_saved_registers")))
return false;
// Functions using thunks for indirect calls need to use SDISel.
@@ -3239,6 +3199,7 @@
case CallingConv::Tail:
case CallingConv::WebKit_JS:
case CallingConv::Swift:
+ case CallingConv::SwiftTail:
case CallingConv::X86_FastCall:
case CallingConv::X86_StdCall:
case CallingConv::X86_ThisCall:
@@ -3255,7 +3216,7 @@
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. Fastisel doesn't know how to do that.
if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
- CC == CallingConv::Tail)
+ CC == CallingConv::Tail || CC == CallingConv::SwiftTail)
return false;
// Don't know how to handle Win64 varargs yet. Nothing special needed for
@@ -3305,8 +3266,7 @@
if (!isTypeLegal(PrevVal->getType(), VT))
return false;
- ResultReg =
- fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
+ ResultReg = fastEmit_ri(VT, VT, ISD::AND, ResultReg, 1);
} else {
if (!isTypeLegal(Val->getType(), VT) ||
(VT.isVector() && VT.getVectorElementType() == MVT::i1))
@@ -3374,7 +3334,7 @@
// Handle zero-extension from i1 to i8, which is common.
if (ArgVT == MVT::i1) {
// Set the high bits to zero.
- ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
+ ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg);
ArgVT = MVT::i8;
if (ArgReg == 0)
@@ -3404,8 +3364,7 @@
break;
}
case CCValAssign::BCvt: {
- ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
- /*TODO: Kill=*/false);
+ ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg);
assert(ArgReg && "Failed to emit a bitcast!");
ArgVT = VA.getLocVT();
break;
@@ -3458,8 +3417,7 @@
if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
return false;
} else {
- bool ValIsKill = hasTrivialKill(ArgVal);
- if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
+ if (!X86FastEmitStore(ArgVT, ArgReg, AM, MMO))
return false;
}
}
@@ -3723,11 +3681,9 @@
default: llvm_unreachable("Unexpected value type");
case MVT::i1:
case MVT::i8:
- return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Op0IsKill=*/true,
- X86::sub_8bit);
+ return fastEmitInst_extractsubreg(MVT::i8, SrcReg, X86::sub_8bit);
case MVT::i16:
- return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Op0IsKill=*/true,
- X86::sub_16bit);
+ return fastEmitInst_extractsubreg(MVT::i16, SrcReg, X86::sub_16bit);
case MVT::i32:
return SrcReg;
case MVT::i64: {
@@ -3886,6 +3842,31 @@
return X86MaterializeFP(CFP, VT);
else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
return X86MaterializeGV(GV, VT);
+ else if (isa<UndefValue>(C)) {
+ unsigned Opc = 0;
+ switch (VT.SimpleTy) {
+ default:
+ break;
+ case MVT::f32:
+ if (!X86ScalarSSEf32)
+ Opc = X86::LD_Fp032;
+ break;
+ case MVT::f64:
+ if (!X86ScalarSSEf64)
+ Opc = X86::LD_Fp064;
+ break;
+ case MVT::f80:
+ Opc = X86::LD_Fp080;
+ break;
+ }
+
+ if (Opc) {
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg);
+ return ResultReg;
+ }
+ }
return 0;
}
@@ -3997,10 +3978,8 @@
unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill,
- unsigned Op2, bool Op2IsKill,
- unsigned Op3, bool Op3IsKill) {
+ unsigned Op0, unsigned Op1,
+ unsigned Op2, unsigned Op3) {
const MCInstrDesc &II = TII.get(MachineInstOpcode);
Register ResultReg = createResultReg(RC);
@@ -4011,16 +3990,16 @@
if (II.getNumDefs() >= 1)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
- .addReg(Op0, getKillRegState(Op0IsKill))
- .addReg(Op1, getKillRegState(Op1IsKill))
- .addReg(Op2, getKillRegState(Op2IsKill))
- .addReg(Op3, getKillRegState(Op3IsKill));
+ .addReg(Op0)
+ .addReg(Op1)
+ .addReg(Op2)
+ .addReg(Op3);
else {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
- .addReg(Op0, getKillRegState(Op0IsKill))
- .addReg(Op1, getKillRegState(Op1IsKill))
- .addReg(Op2, getKillRegState(Op2IsKill))
- .addReg(Op3, getKillRegState(Op3IsKill));
+ .addReg(Op0)
+ .addReg(Op1)
+ .addReg(Op2)
+ .addReg(Op3);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86FastTileConfig.cpp b/src/llvm-project/llvm/lib/Target/X86/X86FastTileConfig.cpp
new file mode 100644
index 0000000..7031bd4
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/X86/X86FastTileConfig.cpp
@@ -0,0 +1,307 @@
+//===-- X86FastTileConfig.cpp - Fast Tile Register Configure---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to configure the shapes of AMX physical registers.
+/// AMX registers need to be configured before use. The ldtilecfg instruction
+/// is inserted before the fast register allocation pass, but at that point we
+/// do not yet know the shape of each physical tile register, because register
+/// allocation has not been done. This pass therefore runs after register
+/// allocation. It collects the shape information for each physical tile
+/// register and stores the shapes in the stack slot that is allocated for
+/// loading the config into the tile config register.
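+///
+/// A rough sketch of the situation this pass handles (the instruction
+/// spelling below is illustrative only):
+///
+///   PLDTILECFGV <config slot>          ; config point, shapes live in the slot
+///   ...
+///   %tmm0 = PTDPBSSDV ... %tmm1, %tmm2 ; key AMX instruction after regalloc
+///
+/// The pass reads the physical TMM registers used by the key instruction and
+/// rewrites the row/col stores into the config slot to match them.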
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "fasttileconfig"
+
+namespace {
+
+class X86FastTileConfig : public MachineFunctionPass {
+ // context
+ MachineFunction *MF = nullptr;
+ const X86Subtarget *ST = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+
+ MachineInstr *getTileConfigPoint();
+ void tileConfig();
+
+public:
+ X86FastTileConfig() : MachineFunctionPass(ID) {}
+
+ bool fastTileConfig();
+ bool isTileLoad(MachineInstr &MI);
+ bool isTileStore(MachineInstr &MI);
+ bool isAMXInstr(MachineInstr &MI);
+ void getTileStoreShape(MachineInstr &MI,
+ SmallVector<MachineOperand *> &ShapedTiles);
+
+ MachineInstr *getKeyAMXInstr(MachineInstr *MI);
+ void getTileShapesCfg(MachineInstr *MI,
+ SmallVector<MachineOperand *> &ShapedTiles);
+ void getShapeCfgInstrs(MachineInstr *MI,
+ std::map<unsigned, MachineInstr *> &RowCfgs,
+ std::map<unsigned, MachineInstr *> &ColCfgs);
+
+ /// Return the pass name.
+ StringRef getPassName() const override {
+ return "Fast Tile Register Configure";
+ }
+
+ void materializeTileCfg(MachineInstr *MI);
+
+ void rewriteTileCfg(SmallVector<MachineOperand *> &ShapedTiles,
+ std::map<unsigned, MachineInstr *> &RowCfgs,
+ std::map<unsigned, MachineInstr *> &ColCfgs);
+
+ /// Perform register allocation.
+ bool runOnMachineFunction(MachineFunction &MFunc) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86FastTileConfig::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
+ "Fast Tile Register Configure", false, false)
+INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
+ "Fast Tile Register Configure", false, false)
+
+static bool isTilePhysReg(MachineOperand &Op) {
+ if (!Op.isReg())
+ return false;
+
+ Register Reg = Op.getReg();
+ if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
+ return true;
+ return false;
+}
+
+static unsigned getTilePhysRegIdx(MachineOperand *Op) {
+ assert(isTilePhysReg(*Op) && "Tile Operand is invalid");
+ return Op->getReg() - X86::TMM0;
+}
+
+static inline void adjustRowCfg(unsigned TIdx, MachineInstr *MI) {
+ unsigned Offset = 48 + TIdx;
+ MI->getOperand(3).ChangeToImmediate(Offset);
+}
+
+static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) {
+ unsigned Offset = 16 + TIdx * 2;
+ MI->getOperand(3).ChangeToImmediate(Offset);
+}
+
+bool X86FastTileConfig::isTileLoad(MachineInstr &MI) {
+ return MI.getOpcode() == X86::PTILELOADDV ||
+ MI.getOpcode() == X86::PTILELOADDT1V;
+}
+bool X86FastTileConfig::isTileStore(MachineInstr &MI) {
+ return MI.getOpcode() == X86::PTILESTOREDV;
+}
+bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) {
+  // TODO: We may need to handle some special non-tile AMX instructions.
+ if (MI.getOpcode() == X86::PLDTILECFGV || MI.isDebugInstr())
+ return false;
+
+ for (MachineOperand &MO : MI.operands())
+ if (isTilePhysReg(MO))
+ return true;
+
+ return false;
+}
+
+MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) {
+ auto Cfg = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *KeyMI = nullptr;
+ int KeyAMXNum = 0;
+
+ for (auto II = Cfg; II != MBB->end(); II++) {
+ if (isTileLoad(*II)) {
+ KeyMI = &*II;
+ continue;
+ }
+
+ if (isTileStore(*II)) {
+ assert(KeyMI && "Key AMX Should be found before!");
+ break;
+ }
+
+ if (isAMXInstr(*II)) {
+      assert((KeyAMXNum == 0) && "Too many key AMX instructions!");
+ KeyAMXNum++;
+ KeyMI = &*II;
+ }
+ }
+ assert(KeyMI && "There must be an AMX instruction.");
+ return KeyMI;
+}
+
+// Collect the tiles in the key AMX instruction in order, uses before defs.
+void X86FastTileConfig::getTileShapesCfg(
+ MachineInstr *CfgMI, SmallVector<MachineOperand *> &ShapedTiles) {
+ MachineInstr *KeyMI = getKeyAMXInstr(CfgMI);
+
+ SmallVector<MachineOperand *> DefTiles;
+ for (MachineOperand &MO : KeyMI->operands()) {
+ if (!isTilePhysReg(MO))
+ continue;
+ if (MO.isDef())
+ DefTiles.push_back(&MO);
+ else
+ ShapedTiles.push_back(&MO);
+ }
+ ShapedTiles.append(DefTiles);
+}
+
+// We pre-configure the shapes at the locations named "amx.tmm.N.shape.row*"
+// and "amx.tmm.N.shape.col*" in the "Pre AMX Tile Config" pass.
+// The 'N' encodes the position of the tile in the key AMX intrinsic.
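+// For example, a store whose pointer value is named "amx.tmm.2.shape.row"
+// supplies the row count for the shaped tile with index 2, and the matching
+// "amx.tmm.2.shape.col" store supplies its bytes-per-row.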
+void X86FastTileConfig::getShapeCfgInstrs(
+ MachineInstr *MI, std::map<unsigned, MachineInstr *> &RowCfgs,
+ std::map<unsigned, MachineInstr *> &ColCfgs) {
+ auto Cfg = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock *MBB = MI->getParent();
+
+ for (auto II = Cfg; II != MBB->begin(); II--) {
+ if (isAMXInstr(*II) || II->isTerminator() || II->isCall())
+ break;
+ if (!II->mayStore() || !II->hasOneMemOperand())
+ continue;
+ const Value *MemPtr = II->memoperands()[0]->getValue();
+ if (!MemPtr)
+ continue;
+
+ StringRef Name = MemPtr->getName();
+ if (!Name.startswith("amx.tmm."))
+ continue;
+
+    // Get the 'N'th tile shape config in the key AMX instruction.
+ auto N = Name.find(".shape");
+ StringRef STileIdx = Name.slice(8, N);
+ unsigned Idx;
+ STileIdx.getAsInteger(10, Idx);
+
+    // And relate them to their store instructions.
+ if (Name.contains("row"))
+ RowCfgs[Idx] = &*II;
+ else if (Name.contains("col"))
+ ColCfgs[Idx] = &*II;
+ else
+ llvm_unreachable("Invalid tile shape info!");
+ }
+ assert((RowCfgs.size() == ColCfgs.size()) &&
+ "The number of tile row and col must be equal!");
+}
+
+// Here is the data format for the tile config.
+// 0 palette = 1 now.
+// 1 start_row = 0 now.
+// 2-15 reserved, must be zero
+// 16-17 tile0.colsb Tile 0 bytes per row.
+// 18-19 tile1.colsb Tile 1 bytes per row.
+// 20-21 tile2.colsb Tile 2 bytes per row.
+// ... (sequence continues)
+// 30-31 tile7.colsb Tile 7 bytes per row.
+// 32-47 reserved, must be zero
+// 48 tile0.rows Tile 0 rows.
+// 49 tile1.rows Tile 1 rows.
+// 50 tile2.rows Tile 2 rows.
+// ... (sequence continues)
+// 55 tile7.rows Tile 7 rows.
+// 56-63 reserved, must be zero
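+//
+// For example (derived from adjustRowCfg/adjustColCfg above): if a shaped tile
+// ends up in %tmm3, its bytes-per-row are stored at bytes 22-23 (16 + 3 * 2)
+// and its row count at byte 51 (48 + 3) of the config data.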
+void X86FastTileConfig::rewriteTileCfg(
+ SmallVector<MachineOperand *> &ShapedTiles,
+ std::map<unsigned, MachineInstr *> &RowCfgs,
+ std::map<unsigned, MachineInstr *> &ColCfgs) {
+ assert((RowCfgs.size() == ShapedTiles.size()) &&
+ "The number of tile shapes not equal with the number of tiles!");
+
+  // Walk the tiles in order and adjust the shape config for each one.
+ for (unsigned I = 0, E = ShapedTiles.size(); I < E; I++) {
+ MachineOperand *MO = ShapedTiles[I];
+ unsigned TmmIdx = getTilePhysRegIdx(MO);
+ if (I == TmmIdx)
+ continue;
+ adjustRowCfg(TmmIdx, RowCfgs[I]);
+ adjustColCfg(TmmIdx, ColCfgs[I]);
+ }
+}
+
+// We have already pre-configured the shapes before fast register allocation in
+// X86PreAMXConfig::preWriteTileCfg(). Now that fast register allocation is
+// done, the pre-written shapes may no longer correspond to the correct tmm
+// registers, so we need to adjust them.
+void X86FastTileConfig::materializeTileCfg(MachineInstr *CfgMI) {
+ SmallVector<MachineOperand *> ShapedTiles;
+ std::map<unsigned, MachineInstr *> RowCfgs;
+ std::map<unsigned, MachineInstr *> ColCfgs;
+
+  // Keep the tile uses and defs, in order, in ShapedTiles.
+ getTileShapesCfg(CfgMI, ShapedTiles);
+  assert(ShapedTiles.size() && "No shape config was found!");
+
+ getShapeCfgInstrs(CfgMI, RowCfgs, ColCfgs);
+
+ rewriteTileCfg(ShapedTiles, RowCfgs, ColCfgs);
+}
+
+bool X86FastTileConfig::fastTileConfig() {
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ SmallVector<MachineInstr *, 2> CFGs;
+ for (MachineInstr &MI : MBB)
+ if (MI.getOpcode() == X86::PLDTILECFGV)
+ CFGs.push_back(&MI);
+ for (auto *MI : CFGs)
+ materializeTileCfg(MI);
+ if (!CFGs.empty())
+ Changed = true;
+ }
+ return Changed;
+}
+
+bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
+ MF = &MFunc;
+ MRI = &MFunc.getRegInfo();
+ ST = &MFunc.getSubtarget<X86Subtarget>();
+ TRI = ST->getRegisterInfo();
+ TII = MFunc.getSubtarget().getInstrInfo();
+
+ return fastTileConfig();
+}
+
+FunctionPass *llvm::createX86FastTileConfigPass() {
+ return new X86FastTileConfig();
+}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/src/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index f8d822a..e1d4b4c 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -137,6 +137,8 @@
/// Machine instruction info used throughout the class.
const X86InstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+
/// Local member for function's OptForSize attribute.
bool OptForSize = false;
@@ -162,6 +164,7 @@
this->MF = &MF;
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ TRI = MF.getRegInfo().getTargetRegisterInfo();
MLI = &getAnalysis<MachineLoopInfo>();
PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
MBFI = (PSI && PSI->hasProfileSummary()) ?
@@ -303,6 +306,14 @@
MIB.setMemRefs(MI->memoperands());
+ // If it was debug tracked, record a substitution.
+ if (unsigned OldInstrNum = MI->peekDebugInstrNum()) {
+ unsigned Subreg = TRI->getSubRegIndex(MIB->getOperand(0).getReg(),
+ MI->getOperand(0).getReg());
+ unsigned NewInstrNum = MIB->getDebugInstrNum(*MF);
+ MF->makeDebugValueSubstitution({OldInstrNum, 0}, {NewInstrNum, 0}, Subreg);
+ }
+
return MIB;
}
@@ -366,6 +377,13 @@
MIB.setMemRefs(MI->memoperands());
+ if (unsigned OldInstrNum = MI->peekDebugInstrNum()) {
+ unsigned Subreg = TRI->getSubRegIndex(MIB->getOperand(0).getReg(),
+ MI->getOperand(0).getReg());
+ unsigned NewInstrNum = MIB->getDebugInstrNum(*MF);
+ MF->makeDebugValueSubstitution({OldInstrNum, 0}, {NewInstrNum, 0}, Subreg);
+ }
+
return MIB;
}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp b/src/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp
index 0054d58..05cab77 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -79,6 +79,30 @@
MachineBasicBlock &MBB, bool OptIncDec,
bool UseLEAForSP) const;
+ /// Look for and transform the sequence
+ /// lea (reg1, reg2), reg3
+ /// sub reg3, reg4
+ /// to
+ /// sub reg1, reg4
+ /// sub reg2, reg4
+ /// It can also optimize the sequence lea/add similarly.
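+  /// For example, the analogous lea/add form (register names are only
+  /// illustrative):
+  ///   lea (reg1, reg2), reg3
+  ///   add reg3, reg4
+  /// becomes
+  ///   add reg1, reg4
+  ///   add reg2, reg4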
+ bool optLEAALU(MachineBasicBlock::iterator &I, MachineBasicBlock &MBB) const;
+
+ /// Step forwards in MBB, looking for an ADD/SUB instruction which uses
+ /// the dest register of LEA instruction I.
+ MachineBasicBlock::iterator searchALUInst(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB) const;
+
+ /// Check instructions between LeaI and AluI (exclusively).
+ /// Set BaseIndexDef to true if base or index register from LeaI is defined.
+ /// Set AluDestRef to true if the dest register of AluI is used or defined.
+ /// *KilledBase is set to the killed base register usage.
+ /// *KilledIndex is set to the killed index register usage.
+ void checkRegUsage(MachineBasicBlock::iterator &LeaI,
+ MachineBasicBlock::iterator &AluI, bool &BaseIndexDef,
+ bool &AluDestRef, MachineOperand **KilledBase,
+ MachineOperand **KilledIndex) const;
+
/// Determine if an instruction references a machine register
/// and, if so, whether it reads or writes the register.
RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I);
@@ -338,6 +362,18 @@
}
}
+static inline unsigned getSUBrrFromLEA(unsigned LEAOpcode) {
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ return X86::SUB32rr;
+ case X86::LEA64r:
+ return X86::SUB64rr;
+ }
+}
+
static inline unsigned getADDriFromLEA(unsigned LEAOpcode,
const MachineOperand &Offset) {
bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm());
@@ -364,6 +400,162 @@
}
}
+MachineBasicBlock::iterator
+FixupLEAPass::searchALUInst(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB) const {
+ const int InstrDistanceThreshold = 5;
+ int InstrDistance = 1;
+ MachineBasicBlock::iterator CurInst = std::next(I);
+
+ unsigned LEAOpcode = I->getOpcode();
+ unsigned AddOpcode = getADDrrFromLEA(LEAOpcode);
+ unsigned SubOpcode = getSUBrrFromLEA(LEAOpcode);
+ Register DestReg = I->getOperand(0).getReg();
+
+ while (CurInst != MBB.end()) {
+ if (CurInst->isCall() || CurInst->isInlineAsm())
+ break;
+ if (InstrDistance > InstrDistanceThreshold)
+ break;
+
+ // Check if the lea dest register is used in an add/sub instruction only.
+ for (unsigned I = 0, E = CurInst->getNumOperands(); I != E; ++I) {
+ MachineOperand &Opnd = CurInst->getOperand(I);
+ if (Opnd.isReg()) {
+ if (Opnd.getReg() == DestReg) {
+ if (Opnd.isDef() || !Opnd.isKill())
+ return MachineBasicBlock::iterator();
+
+ unsigned AluOpcode = CurInst->getOpcode();
+ if (AluOpcode != AddOpcode && AluOpcode != SubOpcode)
+ return MachineBasicBlock::iterator();
+
+ MachineOperand &Opnd2 = CurInst->getOperand(3 - I);
+ MachineOperand AluDest = CurInst->getOperand(0);
+ if (Opnd2.getReg() != AluDest.getReg())
+ return MachineBasicBlock::iterator();
+
+ // X - (Y + Z) may generate different flags than (X - Y) - Z when
+ // there is overflow. So we can't change the alu instruction if the
+ // flags register is live.
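+          // For example (using 8-bit values for brevity): with X=200, Y=150,
+          // Z=150, the combined form computes 200 - 44 and leaves CF clear,
+          // while the split form computes 50 - 150 last and leaves CF set,
+          // even though both produce the same result.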
+ if (!CurInst->registerDefIsDead(X86::EFLAGS, TRI))
+ return MachineBasicBlock::iterator();
+
+ return CurInst;
+ }
+ if (TRI->regsOverlap(DestReg, Opnd.getReg()))
+ return MachineBasicBlock::iterator();
+ }
+ }
+
+ InstrDistance++;
+ ++CurInst;
+ }
+ return MachineBasicBlock::iterator();
+}
+
+void FixupLEAPass::checkRegUsage(MachineBasicBlock::iterator &LeaI,
+ MachineBasicBlock::iterator &AluI,
+ bool &BaseIndexDef, bool &AluDestRef,
+ MachineOperand **KilledBase,
+ MachineOperand **KilledIndex) const {
+ BaseIndexDef = AluDestRef = false;
+ *KilledBase = *KilledIndex = nullptr;
+ Register BaseReg = LeaI->getOperand(1 + X86::AddrBaseReg).getReg();
+ Register IndexReg = LeaI->getOperand(1 + X86::AddrIndexReg).getReg();
+ Register AluDestReg = AluI->getOperand(0).getReg();
+
+ MachineBasicBlock::iterator CurInst = std::next(LeaI);
+ while (CurInst != AluI) {
+ for (unsigned I = 0, E = CurInst->getNumOperands(); I != E; ++I) {
+ MachineOperand &Opnd = CurInst->getOperand(I);
+ if (!Opnd.isReg())
+ continue;
+ Register Reg = Opnd.getReg();
+ if (TRI->regsOverlap(Reg, AluDestReg))
+ AluDestRef = true;
+ if (TRI->regsOverlap(Reg, BaseReg)) {
+ if (Opnd.isDef())
+ BaseIndexDef = true;
+ else if (Opnd.isKill())
+ *KilledBase = &Opnd;
+ }
+ if (TRI->regsOverlap(Reg, IndexReg)) {
+ if (Opnd.isDef())
+ BaseIndexDef = true;
+ else if (Opnd.isKill())
+ *KilledIndex = &Opnd;
+ }
+ }
+ ++CurInst;
+ }
+}
+
+bool FixupLEAPass::optLEAALU(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB) const {
+ // Look for an add/sub instruction which uses the result of lea.
+ MachineBasicBlock::iterator AluI = searchALUInst(I, MBB);
+ if (AluI == MachineBasicBlock::iterator())
+ return false;
+
+  // Check whether there is any related register usage between the lea and alu.
+ bool BaseIndexDef, AluDestRef;
+ MachineOperand *KilledBase, *KilledIndex;
+ checkRegUsage(I, AluI, BaseIndexDef, AluDestRef, &KilledBase, &KilledIndex);
+
+ MachineBasicBlock::iterator InsertPos = AluI;
+ if (BaseIndexDef) {
+ if (AluDestRef)
+ return false;
+ InsertPos = I;
+ KilledBase = KilledIndex = nullptr;
+ }
+
+  // Check whether any of the registers are the same.
+ Register AluDestReg = AluI->getOperand(0).getReg();
+ Register BaseReg = I->getOperand(1 + X86::AddrBaseReg).getReg();
+ Register IndexReg = I->getOperand(1 + X86::AddrIndexReg).getReg();
+ if (I->getOpcode() == X86::LEA64_32r) {
+ BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit);
+ IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit);
+ }
+ if (AluDestReg == IndexReg) {
+ if (BaseReg == IndexReg)
+ return false;
+ std::swap(BaseReg, IndexReg);
+ std::swap(KilledBase, KilledIndex);
+ }
+ if (BaseReg == IndexReg)
+ KilledBase = nullptr;
+
+ // Now it's safe to change instructions.
+ MachineInstr *NewMI1, *NewMI2;
+ unsigned NewOpcode = AluI->getOpcode();
+ NewMI1 = BuildMI(MBB, InsertPos, AluI->getDebugLoc(), TII->get(NewOpcode),
+ AluDestReg)
+ .addReg(AluDestReg, RegState::Kill)
+ .addReg(BaseReg, KilledBase ? RegState::Kill : 0);
+ NewMI1->addRegisterDead(X86::EFLAGS, TRI);
+ NewMI2 = BuildMI(MBB, InsertPos, AluI->getDebugLoc(), TII->get(NewOpcode),
+ AluDestReg)
+ .addReg(AluDestReg, RegState::Kill)
+ .addReg(IndexReg, KilledIndex ? RegState::Kill : 0);
+ NewMI2->addRegisterDead(X86::EFLAGS, TRI);
+
+ // Clear the old Kill flags.
+ if (KilledBase)
+ KilledBase->setIsKill(false);
+ if (KilledIndex)
+ KilledIndex->setIsKill(false);
+
+ MBB.getParent()->substituteDebugValuesForInst(*AluI, *NewMI1, 1);
+ MBB.getParent()->substituteDebugValuesForInst(*AluI, *NewMI2, 1);
+ MBB.erase(I);
+ MBB.erase(AluI);
+ I = NewMI1;
+ return true;
+}
+
bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I,
MachineBasicBlock &MBB, bool OptIncDec,
bool UseLEAForSP) const {
@@ -398,6 +590,7 @@
MachineInstr *NewMI = nullptr;
+ // Case 1.
// Look for lea(%reg1, %reg2), %reg1 or lea(%reg2, %reg1), %reg1
// which can be turned into add %reg2, %reg1
if (BaseReg != 0 && IndexReg != 0 && Disp.getImm() == 0 &&
@@ -417,6 +610,7 @@
.addReg(BaseReg).addReg(IndexReg);
}
} else if (DestReg == BaseReg && IndexReg == 0) {
+ // Case 2.
// This is an LEA with only a base register and a displacement,
// We can use ADDri or INC/DEC.
@@ -447,6 +641,12 @@
.addReg(BaseReg).addImm(Disp.getImm());
}
}
+ } else if (BaseReg != 0 && IndexReg != 0 && Disp.getImm() == 0) {
+ // Case 3.
+ // Look for and transform the sequence
+ // lea (reg1, reg2), reg3
+ // sub reg3, reg4
+ return optLEAALU(I, MBB);
} else
return false;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/src/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index d43fd80..2d9886e 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -99,35 +99,35 @@
Register promoteCondToReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator TestPos,
- DebugLoc TestLoc, X86::CondCode Cond);
- std::pair<unsigned, bool>
- getCondOrInverseInReg(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
- X86::CondCode Cond, CondRegArray &CondRegs);
+ const DebugLoc &TestLoc, X86::CondCode Cond);
+ std::pair<unsigned, bool> getCondOrInverseInReg(
+ MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+ const DebugLoc &TestLoc, X86::CondCode Cond, CondRegArray &CondRegs);
void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
- DebugLoc Loc, unsigned Reg);
+ const DebugLoc &Loc, unsigned Reg);
void rewriteArithmetic(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
- MachineInstr &MI, MachineOperand &FlagUse,
- CondRegArray &CondRegs);
+ MachineBasicBlock::iterator TestPos,
+ const DebugLoc &TestLoc, MachineInstr &MI,
+ MachineOperand &FlagUse, CondRegArray &CondRegs);
void rewriteCMov(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ MachineBasicBlock::iterator TestPos, const DebugLoc &TestLoc,
MachineInstr &CMovI, MachineOperand &FlagUse,
CondRegArray &CondRegs);
void rewriteFCMov(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
- MachineInstr &CMovI, MachineOperand &FlagUse,
- CondRegArray &CondRegs);
+ MachineBasicBlock::iterator TestPos,
+ const DebugLoc &TestLoc, MachineInstr &CMovI,
+ MachineOperand &FlagUse, CondRegArray &CondRegs);
void rewriteCondJmp(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
- MachineInstr &JmpI, CondRegArray &CondRegs);
+ MachineBasicBlock::iterator TestPos,
+ const DebugLoc &TestLoc, MachineInstr &JmpI,
+ CondRegArray &CondRegs);
void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
MachineInstr &CopyDefI);
void rewriteSetCC(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
- MachineInstr &SetCCI, MachineOperand &FlagUse,
- CondRegArray &CondRegs);
+ MachineBasicBlock::iterator TestPos,
+ const DebugLoc &TestLoc, MachineInstr &SetCCI,
+ MachineOperand &FlagUse, CondRegArray &CondRegs);
};
} // end anonymous namespace
@@ -755,7 +755,7 @@
Register X86FlagsCopyLoweringPass::promoteCondToReg(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
- DebugLoc TestLoc, X86::CondCode Cond) {
+ const DebugLoc &TestLoc, X86::CondCode Cond) {
Register Reg = MRI->createVirtualRegister(PromoteRC);
auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
TII->get(X86::SETCCr), Reg).addImm(Cond);
@@ -767,7 +767,7 @@
std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
- DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) {
+ const DebugLoc &TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) {
unsigned &CondReg = CondRegs[Cond];
unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)];
if (!CondReg && !InvCondReg)
@@ -781,7 +781,7 @@
void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Pos,
- DebugLoc Loc, unsigned Reg) {
+ const DebugLoc &Loc, unsigned Reg) {
auto TestI =
BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg);
(void)TestI;
@@ -791,7 +791,7 @@
void X86FlagsCopyLoweringPass::rewriteArithmetic(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
- DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
+ const DebugLoc &TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
CondRegArray &CondRegs) {
// Arithmetic is either reading CF or OF. Figure out which condition we need
// to preserve in a register.
@@ -845,7 +845,7 @@
void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
MachineBasicBlock::iterator TestPos,
- DebugLoc TestLoc,
+ const DebugLoc &TestLoc,
MachineInstr &CMovI,
MachineOperand &FlagUse,
CondRegArray &CondRegs) {
@@ -871,7 +871,7 @@
void X86FlagsCopyLoweringPass::rewriteFCMov(MachineBasicBlock &TestMBB,
MachineBasicBlock::iterator TestPos,
- DebugLoc TestLoc,
+ const DebugLoc &TestLoc,
MachineInstr &CMovI,
MachineOperand &FlagUse,
CondRegArray &CondRegs) {
@@ -916,7 +916,7 @@
void X86FlagsCopyLoweringPass::rewriteCondJmp(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
- DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
+ const DebugLoc &TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
// First get the register containing this specific condition.
X86::CondCode Cond = X86::getCondFromBranch(JmpI);
unsigned CondReg;
@@ -947,7 +947,7 @@
void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
MachineBasicBlock::iterator TestPos,
- DebugLoc TestLoc,
+ const DebugLoc &TestLoc,
MachineInstr &SetCCI,
MachineOperand &FlagUse,
CondRegArray &CondRegs) {
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp b/src/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp
index e6ee4695..e0f30f0 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -610,7 +610,7 @@
{ \
static std::atomic<bool> TABLE##Checked(false); \
if (!TABLE##Checked.load(std::memory_order_relaxed)) { \
- assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \
+ assert(is_sorted(TABLE) && \
"All lookup tables must be sorted for efficient access!"); \
TABLE##Checked.store(true, std::memory_order_relaxed); \
} \
@@ -785,6 +785,9 @@
{ X86::UCOM_Fpr32 , X86::UCOM_Fr },
{ X86::UCOM_Fpr64 , X86::UCOM_Fr },
{ X86::UCOM_Fpr80 , X86::UCOM_Fr },
+ { X86::XAM_Fp32 , X86::XAM_F },
+ { X86::XAM_Fp64 , X86::XAM_F },
+ { X86::XAM_Fp80 , X86::XAM_F },
};
static unsigned getConcreteOpcode(unsigned Opcode) {
@@ -848,6 +851,7 @@
I->setDesc(TII->get(Opcode));
if (Opcode == X86::FCOMPP || Opcode == X86::UCOM_FPPr)
I->RemoveOperand(0);
+ MI.dropDebugNumber();
} else { // Insert an explicit pop
I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
}
@@ -979,8 +983,24 @@
MachineInstr &MI = *I;
unsigned STReturns = 0;
+ bool ClobbersFPStack = false;
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &Op = MI.getOperand(i);
+    // Check if this call clobbers the FP stack. A call either clobbers all
+    // eight FP registers or none of them (see the assertion below), so
+    // checking FP0 alone is sufficient.
+ if (Op.isRegMask()) {
+ bool ClobbersFP0 = Op.clobbersPhysReg(X86::FP0);
+#ifndef NDEBUG
+ static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers");
+ for (unsigned i = 1; i != 8; ++i)
+ assert(Op.clobbersPhysReg(X86::FP0 + i) == ClobbersFP0 &&
+ "Inconsistent FP register clobber");
+#endif
+
+ if (ClobbersFP0)
+ ClobbersFPStack = true;
+ }
+
if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
continue;
@@ -995,6 +1015,14 @@
--e;
}
+ // Most calls should have a regmask that clobbers the FP registers. If it
+ // isn't present then the register allocator didn't spill the FP registers
+ // so they are still on the stack.
+ assert((ClobbersFPStack || STReturns == 0) &&
+ "ST returns without FP stack clobber");
+ if (!ClobbersFPStack)
+ return;
+
unsigned N = countTrailingOnes(STReturns);
// FP registers used for function return must be consecutive starting at
@@ -1009,6 +1037,10 @@
for (unsigned I = 0; I < N; ++I)
pushReg(N - I - 1);
+
+ // Drop all variable values defined by this call -- we can't track them
+ // once they've been stackified.
+ I->dropDebugNumber();
}
/// If RET has an FP register use operand, pass the first one in ST(0) and
@@ -1112,6 +1144,8 @@
// Result gets pushed on the stack.
pushReg(DestReg);
+
+ MI.dropDebugNumber();
}
/// handleOneArgFP - fst <mem>, ST(0)
@@ -1165,6 +1199,8 @@
} else if (KillsSrc) { // Last use of operand?
popStackAfter(I);
}
+
+ MI.dropDebugNumber();
}
@@ -1205,6 +1241,7 @@
MI.RemoveOperand(1); // Drop the source operand.
MI.RemoveOperand(0); // Drop the destination operand.
MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+ MI.dropDebugNumber();
}
@@ -1297,7 +1334,7 @@
unsigned Op1 = getFPReg(MI.getOperand(NumOperands - 1));
bool KillsOp0 = MI.killsRegister(X86::FP0 + Op0);
bool KillsOp1 = MI.killsRegister(X86::FP0 + Op1);
- DebugLoc dl = MI.getDebugLoc();
+ const DebugLoc &dl = MI.getDebugLoc();
unsigned TOS = getStackEntry(0);
@@ -1404,6 +1441,7 @@
MI.getOperand(0).setReg(getSTReg(Op1));
MI.RemoveOperand(1);
MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+ MI.dropDebugNumber();
// If any of the operands are killed by this instruction, free them.
if (KillsOp0) freeStackSlotAfter(I, Op0);
@@ -1430,6 +1468,7 @@
MI.RemoveOperand(1);
MI.getOperand(0).setReg(getSTReg(Op1));
MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+ MI.dropDebugNumber();
// If we kill the second operand, make sure to pop it from the stack.
if (Op0 != Op1 && KillsOp1) {
@@ -1526,7 +1565,7 @@
// Scan the assembly for ST registers used, defined and clobbered. We can
// only tell clobbers from defs by looking at the asm descriptor.
- unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
+ unsigned STUses = 0, STDefs = 0, STClobbers = 0;
unsigned NumOps = 0;
SmallSet<unsigned, 1> FRegIdx;
unsigned RCID;
@@ -1559,8 +1598,6 @@
case InlineAsm::Kind_RegDef:
case InlineAsm::Kind_RegDefEarlyClobber:
STDefs |= (1u << STReg);
- if (MO.isDead())
- STDeadDefs |= (1u << STReg);
break;
case InlineAsm::Kind_Clobber:
STClobbers |= (1u << STReg);
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp b/src/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
index 15d8502..4cde797 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -70,7 +70,7 @@
X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
return hasReservedCallFrame(MF) ||
MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
- (hasFP(MF) && !TRI->needsStackRealignment(MF)) ||
+ (hasFP(MF) && !TRI->hasStackRealignment(MF)) ||
TRI->hasBasePointer(MF);
}
@@ -93,7 +93,7 @@
bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
- TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects() ||
+ TRI->hasStackRealignment(MF) || MFI.hasVarSizedObjects() ||
MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() ||
MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
@@ -409,7 +409,13 @@
return 0;
PI = MBB.erase(PI);
- if (PI != MBB.end() && PI->isCFIInstruction()) PI = MBB.erase(PI);
+ if (PI != MBB.end() && PI->isCFIInstruction()) {
+ auto CIs = MBB.getParent()->getFrameInstructions();
+ MCCFIInstruction CI = CIs[PI->getOperand(0).getCFIIndex()];
+ if (CI.getOperation() == MCCFIInstruction::OpDefCfaOffset ||
+ CI.getOperation() == MCCFIInstruction::OpAdjustCfaOffset)
+ PI = MBB.erase(PI);
+ }
if (!doMergeWithPrevious)
MBBI = skipDebugInstructionsForward(PI, MBB.end());
@@ -534,7 +540,7 @@
uint64_t ProbeChunk = StackProbeSize * 8;
uint64_t MaxAlign =
- TRI->needsStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0;
+ TRI->hasStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0;
// Synthesize a loop or unroll it, depending on the number of iterations.
// BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bits left
@@ -1356,6 +1362,14 @@
STI.getTargetLowering()->hasStackProbeSymbol(MF);
unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);
+ if (HasFP && X86FI->hasSwiftAsyncContext()) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::BTS64ri8),
+ MachineFramePtr)
+ .addUse(MachineFramePtr)
+ .addImm(60)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
// Re-align the stack on 64-bit if the x86-interrupt calling convention is
// used and an error code was pushed, since the x86-64 ABI requires a 16-byte
// stack alignment.
@@ -1371,7 +1385,7 @@
// pointer, calls, or dynamic alloca then we do not need to adjust the
// stack pointer (we fit in the Red Zone). We also check that we don't
// push and pop from the stack.
- if (has128ByteRedZone(MF) && !TRI->needsStackRealignment(MF) &&
+ if (has128ByteRedZone(MF) && !TRI->hasStackRealignment(MF) &&
!MFI.hasVarSizedObjects() && // No dynamic alloca.
!MFI.adjustsStack() && // No calls.
!EmitStackProbeCall && // No stack probes.
@@ -1440,7 +1454,7 @@
NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
// Callee-saved registers are pushed on stack before the stack is realigned.
- if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
+ if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)
NumBytes = alignTo(NumBytes, MaxAlign);
// Save EBP/RBP into the appropriate stack slot.
@@ -1468,29 +1482,74 @@
.setMIFlag(MachineInstr::FrameSetup);
}
- if (!IsWin64Prologue && !IsFunclet) {
- // Update EBP with the new base value.
- BuildMI(MBB, MBBI, DL,
- TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
- FramePtr)
- .addReg(StackPtr)
- .setMIFlag(MachineInstr::FrameSetup);
+ if (!IsFunclet) {
+ if (X86FI->hasSwiftAsyncContext()) {
+ const auto &Attrs = MF.getFunction().getAttributes();
- if (NeedsDwarfCFI) {
- // Mark effective beginning of when frame pointer becomes valid.
- // Define the current CFA to use the EBP/RBP register.
- unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
- BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister(
- nullptr, DwarfFramePtr));
+ // Before we update the live frame pointer we have to ensure there's a
+ // valid (or null) asynchronous context in its slot just before FP in
+ // the frame record, so store it now.
+ if (Attrs.hasAttrSomewhere(Attribute::SwiftAsync)) {
+ // We have an initial context in r14, store it just before the frame
+ // pointer.
+ MBB.addLiveIn(X86::R14);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+ .addReg(X86::R14)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ // No initial context, store null so that there's no pointer that
+ // could be misused.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64i8))
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+ .addImm(X86::R14)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr)
+ .addUse(X86::RSP)
+ .addImm(1)
+ .addUse(X86::NoRegister)
+ .addImm(8)
+ .addUse(X86::NoRegister)
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64ri8), X86::RSP)
+ .addUse(X86::RSP)
+ .addImm(8)
+ .setMIFlag(MachineInstr::FrameSetup);
}
- if (NeedsWinFPO) {
- // .cv_fpo_setframe $FramePtr
- HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
- .addImm(FramePtr)
- .addImm(0)
- .setMIFlag(MachineInstr::FrameSetup);
+ if (!IsWin64Prologue && !IsFunclet) {
+ // Update EBP with the new base value.
+ if (!X86FI->hasSwiftAsyncContext())
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
+ FramePtr)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (NeedsDwarfCFI) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ // Define the current CFA to use the EBP/RBP register.
+ unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ BuildCFI(
+ MBB, MBBI, DL,
+ MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
+ }
+
+ if (NeedsWinFPO) {
+ // .cv_fpo_setframe $FramePtr
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+ .addImm(FramePtr)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
}
}
} else {
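
Under my reading of this hunk, the PUSH/LEA/SUB sequence above produces a frame record of the shape sketched below; the reader helper is hypothetical and only illustrates where the async context ends up relative to the frame pointer:

#include <cstdint>

// [FP + 8] return address
// [FP + 0] saved RBP (tagged with bit 60 by the earlier BTS)
// [FP - 8] Swift async context from r14, or null
// plus one extra 8-byte slot (the SUB $8, %rsp) to keep 16-byte alignment.
inline void *swiftAsyncContextOf(uint64_t FP) {
  return *reinterpret_cast<void **>(FP - 8);
}
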
@@ -1501,7 +1560,7 @@
// Update the offset adjustment, which is mainly used by codeview to translate
// from ESP to VFRAME relative local variable offsets.
if (!IsFunclet) {
- if (HasFP && TRI->needsStackRealignment(MF))
+ if (HasFP && TRI->hasStackRealignment(MF))
MFI.setOffsetAdjustment(-NumBytes);
else
MFI.setOffsetAdjustment(-StackSize);
@@ -1545,7 +1604,7 @@
// Realign stack after we pushed callee-saved registers (so that we'll be
// able to calculate their offsets from the frame pointer).
// Don't do this for Win64, it needs to realign the stack after the prologue.
- if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) {
+ if (!IsWin64Prologue && !IsFunclet && TRI->hasStackRealignment(MF)) {
assert(HasFP && "There should be a frame pointer if stack is realigned.");
BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
@@ -1573,7 +1632,7 @@
// increments is necessary to ensure that the guard pages used by the OS
// virtual memory manager are allocated in correct sequence.
uint64_t AlignedNumBytes = NumBytes;
- if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
+ if (IsWin64Prologue && !IsFunclet && TRI->hasStackRealignment(MF))
AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) {
assert(!X86FI->getUsesRedZone() &&
@@ -1768,7 +1827,7 @@
// Realign stack after we spilled callee-saved registers (so that we'll be
// able to calculate their offsets from the frame pointer).
// Win64 requires aligning the stack after the prologue.
- if (IsWin64Prologue && TRI->needsStackRealignment(MF)) {
+ if (IsWin64Prologue && TRI->hasStackRealignment(MF)) {
assert(HasFP && "There should be a frame pointer if stack is realigned.");
BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign);
}
@@ -1969,7 +2028,7 @@
// Callee-saved registers were pushed on stack before the stack was
// realigned.
- if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
+ if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)
NumBytes = alignTo(FrameSize, MaxAlign);
} else {
NumBytes = StackSize - CSSize;
@@ -1979,10 +2038,26 @@
// AfterPop is the position to insert .cfi_restore.
MachineBasicBlock::iterator AfterPop = MBBI;
if (HasFP) {
+ if (X86FI->hasSwiftAsyncContext()) {
+ // Discard the context.
+ int Offset = 16 + mergeSPUpdates(MBB, MBBI, true);
+ emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/true);
+ }
// Pop EBP.
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
MachineFramePtr)
.setMIFlag(MachineInstr::FrameDestroy);
+
+ // We need to reset FP to its untagged state on return. Bit 60 is currently
+ // used to show the presence of an extended frame.
+ if (X86FI->hasSwiftAsyncContext()) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::BTR64ri8),
+ MachineFramePtr)
+ .addUse(MachineFramePtr)
+ .addImm(60)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
+
if (NeedsDwarfCFI) {
unsigned DwarfStackPtr =
TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
@@ -2007,7 +2082,9 @@
if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
- (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)))
+ (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
+ (Opc != X86::BTR64ri8 || !PI->getFlag(MachineInstr::FrameDestroy)) &&
+ (Opc != X86::ADD64ri8 || !PI->getFlag(MachineInstr::FrameDestroy)))
break;
FirstCSPop = PI;
}
@@ -2031,14 +2108,17 @@
// slot before popping them off! Same applies for the case, when stack was
// realigned. Don't do this if this was a funclet epilogue, since the funclets
// will not do realignment or dynamic stack allocation.
- if ((TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) &&
+ if (((TRI->hasStackRealignment(MF)) || MFI.hasVarSizedObjects()) &&
!IsFunclet) {
- if (TRI->needsStackRealignment(MF))
+ if (TRI->hasStackRealignment(MF))
MBBI = FirstCSPop;
unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);
uint64_t LEAAmount =
IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize;
+ if (X86FI->hasSwiftAsyncContext())
+ LEAAmount -= 16;
+
// There are only two legal forms of epilogue:
// - add SEHAllocationSize, %rsp
// - lea SEHAllocationSize(%FramePtr), %rsp
@@ -2114,8 +2194,12 @@
// Emit tilerelease for AMX kernel.
const MachineRegisterInfo &MRI = MF.getRegInfo();
- if (!MRI.reg_nodbg_empty(X86::TMMCFG))
- BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
+ const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
+ for (unsigned I = 0; I < RC->getNumRegs(); I++)
+ if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) {
+ BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
+ break;
+ }
}
StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
@@ -2129,7 +2213,7 @@
// have dynamic allocas in addition to dynamic realignment.
if (TRI->hasBasePointer(MF))
FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getBaseRegister();
- else if (TRI->needsStackRealignment(MF))
+ else if (TRI->hasStackRealignment(MF))
FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getStackRegister();
else
FrameReg = TRI->getFrameRegister(MF);
@@ -2188,7 +2272,7 @@
assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
return StackOffset::getFixed(Offset + StackSize);
}
- } else if (TRI->needsStackRealignment(MF)) {
+ } else if (TRI->hasStackRealignment(MF)) {
if (FI < 0) {
// Skip the saved EBP.
return StackOffset::getFixed(Offset + SlotSize + FPDelta);
@@ -2279,7 +2363,7 @@
// answer we give is relative to the SP after the prologue, and not the
// SP in the middle of the function.
- if (MFI.isFixedObjectIndex(FI) && TRI->needsStackRealignment(MF) &&
+ if (MFI.isFixedObjectIndex(FI) && TRI->hasStackRealignment(MF) &&
!STI.isTargetWin64())
return getFrameIndexReference(MF, FI, FrameReg);
@@ -2363,6 +2447,14 @@
SpillSlotOffset -= SlotSize;
MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+ // The async context lives directly before the frame pointer, and we
+ // allocate a second slot to preserve stack alignment.
+ if (X86FI->hasSwiftAsyncContext()) {
+ SpillSlotOffset -= SlotSize;
+ MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+ SpillSlotOffset -= SlotSize;
+ }
+
// Since emitPrologue and emitEpilogue will handle spilling and restoring of
// the frame register, we can delete it from CSI list and not have to worry
// about avoiding it later.
@@ -2504,7 +2596,7 @@
assert(!isAsynchronousEHPersonality(classifyEHPersonality(
MBB.getParent()->getFunction().getPersonalityFn())) &&
"SEH should not use CATCHRET");
- DebugLoc DL = CatchRet->getDebugLoc();
+ const DebugLoc &DL = CatchRet->getDebugLoc();
MachineBasicBlock *CatchRetTarget = CatchRet->getOperand(0).getMBB();
// Fill EAX/RAX with the address of the target block.
@@ -3156,7 +3248,7 @@
bool reserveCallFrame = hasReservedCallFrame(MF);
unsigned Opcode = I->getOpcode();
bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
- DebugLoc DL = I->getDebugLoc();
+ DebugLoc DL = I->getDebugLoc(); // copy DebugLoc as I will be erased.
uint64_t Amount = TII.getFrameSize(*I);
uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0;
I = MBB.erase(I);
@@ -3263,7 +3355,11 @@
bool X86FrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
assert(MBB.getParent() && "Block is not attached to a function!");
const MachineFunction &MF = *MBB.getParent();
- return !TRI->needsStackRealignment(MF) || !MBB.isLiveIn(X86::EFLAGS);
+ if (!MBB.isLiveIn(X86::EFLAGS))
+ return true;
+
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ return !TRI->hasStackRealignment(MF) && !X86FI->hasSwiftAsyncContext();
}
bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
@@ -3276,6 +3372,12 @@
if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock())
return false;
+ // The Swift async context epilogue has a BTR instruction that clobbers parts of
+ // EFLAGS.
+ const MachineFunction &MF = *MBB.getParent();
+ if (MF.getInfo<X86MachineFunctionInfo>()->hasSwiftAsyncContext())
+ return !flagsNeedToBePreservedBeforeTheTerminators(MBB);
+
if (canUseLEAForSPInEpilogue(*MBB.getParent()))
return true;
@@ -3513,7 +3615,7 @@
}
// Flip it if we're accessing off of the FP.
- if (!TRI->needsStackRealignment(MF) && hasFP(MF))
+ if (!TRI->hasStackRealignment(MF) && hasFP(MF))
std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h b/src/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h
index 322aa6f..6309b8a 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/src/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h
@@ -65,8 +65,7 @@
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL,
- bool IsPrologue) const override;
+ const DebugLoc &DL, bool IsPrologue) const;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/src/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 1df9a0d..e9c7ba4 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -30,7 +30,8 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
-#include <stdint.h>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "x86-isel"
@@ -3844,23 +3845,42 @@
if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
SDValue Add0 = ShiftAmt->getOperand(0);
SDValue Add1 = ShiftAmt->getOperand(1);
+ auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
+ auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
// If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
// to avoid the ADD/SUB.
- if (isa<ConstantSDNode>(Add1) &&
- cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
+ if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
NewShiftAmt = Add0;
- // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
- // generate a NEG instead of a SUB of a constant.
- } else if (ShiftAmt->getOpcode() == ISD::SUB &&
- isa<ConstantSDNode>(Add0) &&
- cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
- cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
+ // If we are shifting by N-X where N == 0 mod Size, then just shift by -X
+ // to generate a NEG instead of a SUB of a constant.
+ } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
+ Add0C->getZExtValue() != 0) {
+ EVT SubVT = ShiftAmt.getValueType();
+ SDValue X;
+ if (Add0C->getZExtValue() % Size == 0)
+ X = Add1;
+ else if (ShiftAmt.hasOneUse() && Size == 64 &&
+ Add0C->getZExtValue() % 32 == 0) {
+ // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
+ // This is mainly beneficial if we already compute (x+n*32).
+ if (Add1.getOpcode() == ISD::TRUNCATE) {
+ Add1 = Add1.getOperand(0);
+ SubVT = Add1.getValueType();
+ }
+ if (Add0.getValueType() != SubVT) {
+ Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
+ insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
+ }
+
+ X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
+ insertDAGNode(*CurDAG, OrigShiftAmt, X);
+ } else
+ return false;
// Insert a negate op.
// TODO: This isn't guaranteed to replace the sub if there is a logic cone
// that uses it that's not a shift.
- EVT SubVT = ShiftAmt.getValueType();
SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
- SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
+ SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
NewShiftAmt = Neg;
// Insert these operands into a valid topological order so they can
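
The rewritten SUB handling above turns a 64-bit shift by (n*32 - x) into a shift by -(x + n*32), which is only valid because x86 shifts reduce the amount modulo the operand size. A standalone self-check of that identity (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // (n*32 - x) and -(x + n*32) differ by n*64, so they agree modulo 64.
  for (uint64_t x = 0; x < 64; ++x)
    for (uint64_t n = 1; n < 4; ++n)
      assert(((n * 32 - x) & 63) == ((0 - (x + n * 32)) & 63));
  return 0;
}
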
@@ -4597,17 +4617,19 @@
ReplaceNode(Node, Res);
return;
}
- case Intrinsic::x86_tileloadd64_internal: {
+ case Intrinsic::x86_tileloadd64_internal:
+ case Intrinsic::x86_tileloaddt164_internal: {
if (!Subtarget->hasAMXTILE())
break;
- unsigned Opc = X86::PTILELOADDV;
+ unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
+ ? X86::PTILELOADDV
+ : X86::PTILELOADDT1V;
// _tile_loadd_internal(row, col, buf, STRIDE)
SDValue Base = Node->getOperand(4);
SDValue Scale = getI8Imm(1, dl);
SDValue Index = Node->getOperand(5);
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
- SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
SDValue Chain = Node->getOperand(0);
MachineSDNode *CNode;
SDValue Ops[] = {Node->getOperand(2),
@@ -4617,43 +4639,11 @@
Index,
Disp,
Segment,
- CFG,
Chain};
CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
ReplaceNode(Node, CNode);
return;
}
- case Intrinsic::x86_tdpbssd_internal: {
- if (!Subtarget->hasAMXTILE())
- break;
- SDValue Chain = Node->getOperand(0);
- unsigned Opc = X86::PTDPBSSDV;
- SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
- SDValue Ops[] = {Node->getOperand(2),
- Node->getOperand(3),
- Node->getOperand(4),
- Node->getOperand(5),
- Node->getOperand(6),
- Node->getOperand(7),
- CFG,
- Chain};
- MachineSDNode *CNode =
- CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
- ReplaceNode(Node, CNode);
- return;
- }
- case Intrinsic::x86_tilezero_internal: {
- if (!Subtarget->hasAMXTILE())
- break;
- unsigned Opc = X86::PTILEZEROV;
- SDValue Chain = Node->getOperand(0);
- SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
- SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
- MachineSDNode *CNode =
- CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
- ReplaceNode(Node, CNode);
- return;
- }
}
break;
}
@@ -4719,7 +4709,6 @@
SDValue Index = Node->getOperand(5);
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
- SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
SDValue Chain = Node->getOperand(0);
MachineSDNode *CNode;
SDValue Ops[] = {Node->getOperand(2),
@@ -4730,7 +4719,6 @@
Disp,
Segment,
Node->getOperand(6),
- CFG,
Chain};
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
ReplaceNode(Node, CNode);
@@ -4771,7 +4759,8 @@
}
break;
}
- case ISD::BRIND: {
+ case ISD::BRIND:
+ case X86ISD::NT_BRIND: {
if (Subtarget->isTargetNaCl())
// NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
// leave the instruction alone.
@@ -4783,7 +4772,7 @@
SDValue Target = Node->getOperand(1);
assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
- SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
+ SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
Node->getOperand(0), ZextTarget);
ReplaceNode(Node, Brind.getNode());
SelectCode(ZextTarget.getNode());
@@ -5381,24 +5370,20 @@
break;
}
- SDValue Cmp;
SDValue Chain =
IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
+ SDValue Glue;
if (IsStrictCmp) {
- SDVTList VTs = CurDAG->getVTList(MVT::i16, MVT::Other);
- Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
- Chain = Cmp.getValue(1);
+ SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+ Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
+ Glue = Chain.getValue(1);
} else {
- Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i16, N0, N1), 0);
+ Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
}
// Move FPSW to AX.
- SDValue FPSW = CurDAG->getCopyToReg(Chain, dl, X86::FPSW, Cmp, SDValue());
- Chain = FPSW;
SDValue FNSTSW =
- SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, FPSW,
- FPSW.getValue(1)),
- 0);
+ SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
// Extract upper 8-bits of AX.
SDValue Extract =
@@ -5461,6 +5446,9 @@
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C) break;
uint64_t Mask = C->getZExtValue();
+ // We may have looked through a truncate so mask off any bits that
+ // shouldn't be part of the compare.
+ Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
// Check if we can replace AND+IMM64 with a shift. This is possible for
// masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
@@ -5550,11 +5538,9 @@
if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
if (!LoadN->isSimple()) {
unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
- if (MOpc == X86::TEST8mi && NumVolBits != 8)
- break;
- else if (MOpc == X86::TEST16mi && NumVolBits != 16)
- break;
- else if (MOpc == X86::TEST32mi && NumVolBits != 32)
+ if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
+ (MOpc == X86::TEST16mi && NumVolBits != 16) ||
+ (MOpc == X86::TEST32mi && NumVolBits != 32))
break;
}
}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp b/src/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1e2407c..3a64b34 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
@@ -49,6 +50,7 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -358,14 +360,23 @@
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FREM , MVT::f128 , Expand);
- setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
+ setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
+ setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
+ }
// Promote the i8 variants and force them on up to i32 which has a shorter
// encoding.
setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
- if (!Subtarget.hasBMI()) {
- setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
+
+ if (Subtarget.hasBMI()) {
+ // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
+ // is enabled.
+ setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
+ } else {
+ setOperationAction(ISD::CTTZ, MVT::i16, Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
@@ -931,6 +942,10 @@
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+
+ setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
+ setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
+
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
@@ -998,7 +1013,9 @@
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
@@ -1233,6 +1250,7 @@
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
@@ -1331,6 +1349,9 @@
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
+ setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
+ setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
+
setOperationAction(ISD::ABS, MVT::v4i64, Custom);
setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
@@ -1467,16 +1488,8 @@
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
- for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
- setOperationAction(ISD::ADD, VT, Custom);
- setOperationAction(ISD::SUB, VT, Custom);
- setOperationAction(ISD::MUL, VT, Custom);
- setOperationAction(ISD::UADDSAT, VT, Custom);
- setOperationAction(ISD::SADDSAT, VT, Custom);
- setOperationAction(ISD::USUBSAT, VT, Custom);
- setOperationAction(ISD::SSUBSAT, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Expand);
- }
+ for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
+ setOperationAction(ISD::VSELECT, VT, Expand);
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::SETCC, VT, Custom);
@@ -1635,6 +1648,9 @@
setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
+ setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
+ setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
+
setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
@@ -1762,7 +1778,6 @@
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
@@ -1853,15 +1868,7 @@
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
- setOperationAction(ISD::ADD, VT, Custom);
- setOperationAction(ISD::SUB, VT, Custom);
- setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
- setOperationAction(ISD::UADDSAT, VT, Custom);
- setOperationAction(ISD::SADDSAT, VT, Custom);
- setOperationAction(ISD::USUBSAT, VT, Custom);
- setOperationAction(ISD::SSUBSAT, VT, Custom);
-
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -2102,7 +2109,7 @@
!Subtarget.hasBWI())
return TypeSplitVector;
- if (VT.getVectorNumElements() != 1 &&
+ if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
VT.getVectorElementType() != MVT::i1)
return TypeWidenVector;
@@ -2214,8 +2221,6 @@
return MVT::i8;
if (Subtarget.hasAVX512()) {
- const unsigned NumElts = VT.getVectorNumElements();
-
// Figure out what this type will be legalized to.
EVT LegalVT = VT;
while (getTypeAction(Context, LegalVT) != TypeLegal)
@@ -2223,7 +2228,7 @@
// If we got a 512-bit vector then we'll definitely have a vXi1 compare.
if (LegalVT.getSimpleVT().is512BitVector())
- return EVT::getVectorVT(Context, MVT::i1, NumElts);
+ return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
// If we legalized to less than a 512-bit vector, then we will use a vXi1
@@ -2231,7 +2236,7 @@
// vXi16/vXi8.
MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
- return EVT::getVectorVT(Context, MVT::i1, NumElts);
+ return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
}
}
@@ -2336,13 +2341,13 @@
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
if (VT == MVT::f32)
return X86ScalarSSEf32;
- else if (VT == MVT::f64)
+ if (VT == MVT::f64)
return X86ScalarSSEf64;
return true;
}
bool X86TargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
+ EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Fast) {
switch (VT.getSizeInBits()) {
@@ -2366,7 +2371,7 @@
// well use a regular unaligned vector load.
// We don't have any NT loads pre-SSE41.
if (!!(Flags & MachineMemOperand::MOLoad))
- return (Align < 16 || !Subtarget.hasSSE41());
+ return (Alignment < 16 || !Subtarget.hasSSE41());
return false;
}
// Misaligned accesses of any size are always allowed.
@@ -2403,8 +2408,8 @@
ParamRegs = M->getNumberRegisterParameters();
// Mark the first N int arguments as having reg
- for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
- Type *T = Args[Idx].Ty;
+ for (auto &Arg : Args) {
+ Type *T = Arg.Ty;
if (T->isIntOrPtrTy())
if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
unsigned numRegs = 1;
@@ -2413,7 +2418,7 @@
if (ParamRegs < numRegs)
return;
ParamRegs -= numRegs;
- Args[Idx].IsInReg = true;
+ Arg.IsInReg = true;
}
}
}
@@ -2491,14 +2496,14 @@
(TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}
-static Constant* SegmentOffset(IRBuilder<> &IRB,
- unsigned Offset, unsigned AddressSpace) {
+static Constant* SegmentOffset(IRBuilderBase &IRB,
+ int Offset, unsigned AddressSpace) {
return ConstantExpr::getIntToPtr(
ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
-Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
// glibc, bionic, and Fuchsia have a special slot for the stack guard in
// tcbhead_t; use it instead of the usual global variable (see
// sysdeps/{i386,x86_64}/nptl/tls.h)
@@ -2508,15 +2513,16 @@
return SegmentOffset(IRB, 0x10, getAddressSpace());
} else {
unsigned AddressSpace = getAddressSpace();
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
// In particular, some users may customize the base reg and offset.
- unsigned Offset = getTargetMachine().Options.StackProtectorGuardOffset;
+ int Offset = M->getStackProtectorGuardOffset();
// If we don't set -stack-protector-guard-offset value:
// %fs:0x28, unless we're using a Kernel code model, in which case
// it's %gs:0x28. gs:0x14 on i386.
- if (Offset == (unsigned)-1)
+ if (Offset == INT_MAX)
Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
- const auto &GuardReg = getTargetMachine().Options.StackProtectorGuardReg;
+ StringRef GuardReg = M->getStackProtectorGuardReg();
if (GuardReg == "fs")
AddressSpace = X86AS::FS;
else if (GuardReg == "gs")
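
For reference, the default guard location chosen above is a TLS slot: %fs:0x28 on x86-64 and %gs:0x14 on i386 (glibc/bionic/Fuchsia layouts). A user-level illustration of reading that slot on x86-64 Linux, assuming the glibc layout (illustrative only, not from the patch):

#include <cstdint>

inline uint64_t readStackGuard() {
  uint64_t Guard;
  // %fs:0x28 is where glibc's tcbhead_t keeps stack_guard on x86-64.
  asm("mov %%fs:0x28, %0" : "=r"(Guard));
  return Guard;
}
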
@@ -2546,12 +2552,11 @@
return;
}
- auto GuardMode = getTargetMachine().Options.StackProtectorGuard;
+ StringRef GuardMode = M.getStackProtectorGuard();
// glibc, bionic, and Fuchsia have a special slot for the stack guard.
- if ((GuardMode == llvm::StackProtectorGuards::TLS ||
- GuardMode == llvm::StackProtectorGuards::None)
- && hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
+ if ((GuardMode == "tls" || GuardMode.empty()) &&
+ hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
return;
TargetLowering::insertSSPDeclarations(M);
}
@@ -2574,7 +2579,8 @@
return TargetLowering::getSSPStackGuardCheck(M);
}
-Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+Value *
+X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
if (Subtarget.getTargetTriple().isOSContiki())
return getDefaultSafeStackPointerLocation(IRB, false);
@@ -2584,7 +2590,7 @@
if (Subtarget.isTargetAndroid()) {
// %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
// %gs:0x24 on i386
- unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
+ int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
return SegmentOffset(IRB, Offset, getAddressSpace());
}
@@ -3183,7 +3189,8 @@
static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
- CC == CallingConv::HHVM || CC == CallingConv::Tail);
+ CC == CallingConv::HHVM || CC == CallingConv::Tail ||
+ CC == CallingConv::SwiftTail);
}
/// Return true if we might ever do TCO for calls with this calling convention.
@@ -3209,7 +3216,8 @@
/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
- return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
+ return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
+ CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
}
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
@@ -3371,12 +3379,8 @@
return None;
}
- const Function &F = MF.getFunction();
- bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool isSoftFloat = Subtarget.useSoftFloat();
- assert(!(isSoftFloat && NoImplicitFloatOps) &&
- "SSE register cannot be used when SSE is disabled!");
- if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
+ if (isSoftFloat || !Subtarget.hasSSE1())
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
@@ -3448,11 +3452,6 @@
FrameInfo.CreateFixedObject(1, StackSize, true));
}
- // Figure out if XMM registers are in use.
- assert(!(Subtarget.useSoftFloat() &&
- TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) &&
- "SSE register cannot be used when SSE is disabled!");
-
// 64-bit calling conventions support varargs and register parameters, so we
// have to do extra work to spill them in the prologue.
if (is64Bit()) {
@@ -3501,9 +3500,12 @@
Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
for (MCPhysReg Reg : AvailableXmms) {
- Register XMMReg = TheMachineFunction.addLiveIn(Reg, &X86::VR128RegClass);
- LiveXMMRegs.push_back(
- DAG.getCopyFromReg(Chain, DL, XMMReg, MVT::v4f32));
+ // FastRegisterAllocator spills virtual registers at basic block
+ // boundaries. That leads to uses of XMM registers outside of the
+ // check for %al. Pass physical registers to VASTART_SAVE_XMM_REGS
+ // to avoid unnecessary spilling.
+ TheMachineFunction.getRegInfo().addLiveIn(Reg);
+ LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
}
}
@@ -3741,9 +3743,23 @@
}
for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
+ if (Ins[I].Flags.isSwiftAsync()) {
+ auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ if (Subtarget.is64Bit())
+ X86FI->setHasSwiftAsyncContext(true);
+ else {
+ int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
+ X86FI->setSwiftAsyncContextFrameIdx(FI);
+ SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
+ DAG.getFrameIndex(FI, MVT::i32),
+ MachinePointerInfo::getFixedStack(MF, FI));
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
+ }
+ }
+
// Swift calling convention does not require we copy the sret argument
// into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
- if (CallConv == CallingConv::Swift)
+ if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
continue;
// All x86 ABIs require that for returning structs by value we copy the
@@ -3900,6 +3916,7 @@
CallingConv::ID CallConv = CLI.CallConv;
bool &isTailCall = CLI.IsTailCall;
bool isVarArg = CLI.IsVarArg;
+ const auto *CB = CLI.CB;
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget.is64Bit();
@@ -3907,16 +3924,12 @@
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
- CallConv == CallingConv::Tail;
+ CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
- const auto *CI = dyn_cast_or_null<CallInst>(CLI.CB);
- const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
- bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
- (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
- const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
- bool HasNoCfCheck =
- (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
- bool IsIndirectCall = (CI && CI->isIndirectCall());
+ bool HasNCSR = (CB && isa<CallInst>(CB) &&
+ CB->hasFnAttr("no_caller_saved_registers"));
+ bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
+ bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
@@ -3924,7 +3937,8 @@
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
- if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
+ bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
+ if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
@@ -3936,13 +3950,8 @@
isTailCall = false;
}
- bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
- if (IsMustTail) {
- // Force this to be a tail call. The verifier rules are enough to ensure
- // that we can lower this successfully without moving the return address
- // around.
- isTailCall = true;
- } else if (isTailCall) {
+
+ if (isTailCall && !IsMustTail) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
@@ -3958,6 +3967,10 @@
++NumTailCalls;
}
+ if (IsMustTail && !isTailCall)
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
+
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
@@ -3987,7 +4000,9 @@
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
- if (isTailCall && !IsSibcall && !IsMustTail) {
+ if (isTailCall &&
+ shouldGuaranteeTCO(CallConv,
+ MF.getTarget().Options.GuaranteedTailCallOpt)) {
// Lower arguments at fp - stackoffset + fpdiff.
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
@@ -4338,11 +4353,19 @@
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
- // set X86_INTR calling convention because it has the same CSR mask
- // (same preserved registers).
- const uint32_t *Mask = RegInfo->getCallPreservedMask(
- MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
+ const uint32_t *Mask = [&]() {
+ auto AdaptedCC = CallConv;
+ // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
+ // use X86_INTR calling convention because it has the same CSR mask
+ // (same preserved registers).
+ if (HasNCSR)
+ AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
+ // If NoCalleeSavedRegisters is requested, then use GHC since it happens
+ // to use the CSR_NoRegs_RegMask.
+ if (CB && CB->hasFnAttr("no_callee_saved_registers"))
+ AdaptedCC = (CallingConv::ID)CallingConv::GHC;
+ return RegInfo->getCallPreservedMask(MF, AdaptedCC);
+ }();
assert(Mask && "Missing call preserved mask for calling convention");
// If this is an invoke in a 32-bit function using a funclet-based
@@ -4405,9 +4428,27 @@
if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
+ } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
+ // Calls with a "clang.arc.attachedcall" bundle are special. They should be
+ // expanded to the call, directly followed by a special marker sequence and
+ // a call to an ObjC library function. Use CALL_RVMARKER to do that.
+ assert(!isTailCall &&
+ "tail calls cannot be marked with clang.arc.attachedcall");
+ assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
+
+ // Add target constant to select ObjC runtime call just before the call
+ // target. RuntimeCallType == 0 selects objc_retainAutoreleasedReturnValue,
+ // RuntimeCallType == 1 selects objc_unsafeClaimAutoreleasedReturnValue when
+ // expanding the pseudo.
+ unsigned RuntimeCallType =
+ objcarc::hasAttachedCallOpBundle(CLI.CB, true) ? 0 : 1;
+ Ops.insert(Ops.begin() + 1,
+ DAG.getTargetConstant(RuntimeCallType, dl, MVT::i32));
+ Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
} else {
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
}
+
InFlag = Chain.getValue(1);
DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
@@ -4613,7 +4654,7 @@
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
- CalleeCC == CallingConv::Tail;
+ CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
@@ -4633,7 +4674,7 @@
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- if (RegInfo->needsStackRealignment(MF))
+ if (RegInfo->hasStackRealignment(MF))
return false;
// Also avoid sibcall optimization if either caller or callee uses struct
@@ -5842,9 +5883,9 @@
const SDLoc &dl, unsigned vectorWidth) {
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
- unsigned Factor = VT.getSizeInBits()/vectorWidth;
+ unsigned Factor = VT.getSizeInBits() / vectorWidth;
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
- VT.getVectorNumElements()/Factor);
+ VT.getVectorNumElements() / Factor);
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
@@ -6348,8 +6389,8 @@
// Match (xor X, -1) -> X.
// Match extract_subvector(xor X, -1) -> extract_subvector(X).
// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
-static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
- V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V);
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
+ V = peekThroughBitcasts(V);
if (V.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
return V.getOperand(0);
@@ -7226,6 +7267,14 @@
return true;
}
+// Wrapper for getTargetShuffleMask that discards the IsUnary result.
+static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
+ SmallVectorImpl<SDValue> &Ops,
+ SmallVectorImpl<int> &Mask) {
+ bool IsUnary;
+ return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
+}
+
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
@@ -7534,9 +7583,11 @@
narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
for (int i = 0; i != (int)MaskSize; ++i) {
- if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
- Mask.push_back(SM_SentinelUndef);
- else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
+ // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
+ // loops converting between OR and BLEND shuffles due to
+ // canWidenShuffleElements merging away undef elements, meaning we
+ // fail to recognise the OR as the undef element isn't known zero.
+ if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
Mask.push_back(SM_SentinelZero);
else if (Mask1[i] == SM_SentinelZero)
Mask.push_back(i);
@@ -7818,8 +7869,14 @@
}
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
- if (!Src.getSimpleValueType().isVector())
- return false;
+ if (!Src.getSimpleValueType().isVector()) {
+ if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isNullConstant(Src.getOperand(1)) ||
+ Src.getOperand(0).getValueType().getScalarType() !=
+ VT.getScalarType())
+ return false;
+ Src = Src.getOperand(0);
+ }
Ops.push_back(Src);
Mask.append(NumElts, 0);
return true;
@@ -7931,6 +7988,30 @@
KnownZero, DAG, Depth, ResolveKnownElts);
}
+// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
+static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
+ EVT MemVT, MemSDNode *Mem, unsigned Offset,
+ SelectionDAG &DAG) {
+ assert((Opcode == X86ISD::VBROADCAST_LOAD ||
+ Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
+ "Unknown broadcast load type");
+
+ // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
+ if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
+ return SDValue();
+
+ SDValue Ptr =
+ DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Mem->getChain(), Ptr};
+ SDValue BcstLd = DAG.getMemIntrinsicNode(
+ Opcode, DL, Tys, Ops, MemVT,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
+ DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
+ return BcstLd;
+}
+
/// Returns the scalar element that will make up the i'th
/// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
@@ -7960,10 +8041,8 @@
int NumElems = (int)ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 16> ShuffleOps;
- bool IsUnary;
-
if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
- ShuffleMask, IsUnary))
+ ShuffleMask))
return SDValue();
int Elt = ShuffleMask[Index];
@@ -8422,7 +8501,7 @@
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
- bool isAfterLegalize) {
+ bool IsAfterLegalize) {
if ((VT.getScalarSizeInBits() % 8) != 0)
return SDValue();
@@ -8552,7 +8631,7 @@
if (FirstLoadedElt == 0 &&
(NumLoadedElts == (int)NumElems || IsDereferenceable) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
- if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
+ if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
// Don't create 256-bit non-temporal aligned loads without AVX2 as these
@@ -8569,7 +8648,7 @@
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
- if (!isAfterLegalize && VT.isVector()) {
+ if (!IsAfterLegalize && VT.isVector()) {
unsigned NumMaskElts = VT.getVectorNumElements();
if ((NumMaskElts % NumElems) == 0) {
unsigned Scale = NumMaskElts / NumElems;
@@ -8597,7 +8676,7 @@
EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
SDValue HalfLD =
EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
- DAG, Subtarget, isAfterLegalize);
+ DAG, Subtarget, IsAfterLegalize);
if (HalfLD)
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
HalfLD, DAG.getIntPtrConstant(0, DL));
@@ -8673,7 +8752,7 @@
VT.getSizeInBits() / ScalarSize);
if (TLI.isTypeLegal(BroadcastVT)) {
if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
- RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
+ RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
SDValue Broadcast = RepeatLoad;
if (RepeatSize > ScalarSize) {
while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
@@ -8697,7 +8776,7 @@
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
- bool isAfterLegalize) {
+ bool IsAfterLegalize) {
SmallVector<SDValue, 64> Elts;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
@@ -8708,7 +8787,7 @@
}
assert(Elts.size() == VT.getVectorNumElements());
return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
- isAfterLegalize);
+ IsAfterLegalize);
}
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
@@ -9197,6 +9276,19 @@
return DstVec;
}
+LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
+ switch (Opcode) {
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB:
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ return true;
+ }
+ return false;
+}
+
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
@@ -9925,9 +10017,19 @@
// Adjust IndicesVec to match VT size.
assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
"Illegal variable permute mask size");
- if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
- IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
- NumElts * VT.getScalarSizeInBits());
+ if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
+ // Narrow/widen the indices vector to the correct size.
+ if (IndicesVec.getValueSizeInBits() > SizeInBits)
+ IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
+ NumElts * VT.getScalarSizeInBits());
+ else if (IndicesVec.getValueSizeInBits() < SizeInBits)
+ IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
+ SDLoc(IndicesVec), SizeInBits);
+ // Zero-extend the index elements within the vector.
+ if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
+ IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
+ IndicesVT, IndicesVec);
+ }
IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
// Handle SrcVec that don't match VT type.
@@ -10383,7 +10485,7 @@
// it to i32 first.
if (EltVT == MVT::i16 || EltVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
- MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
return DAG.getBitcast(VT, Item);
@@ -10466,7 +10568,7 @@
SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
DAG.getBuildVector(NarrowVT, dl, Ops));
// Broadcast from v2i64/v2f64 and cast to final VT.
- MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
+ MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
NewBV));
}
@@ -10475,7 +10577,7 @@
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.getSizeInBits() > 128) {
- MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
+ MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
// Build both the lower and upper subvector.
SDValue Lower =
@@ -10928,10 +11030,10 @@
// This entry crosses lanes, so there is no way to model this shuffle.
return false;
- // Ok, handle the in-lane shuffles by detecting if and when they repeat.
- // Adjust second vector indices to start at LaneSize instead of Size.
- int LocalM =
- Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
+ // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
+ // later vector indices to start at multiples of LaneSize instead of Size.
+ int LaneM = Mask[i] / Size;
+ int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
// This is the first non-undef entry in this slot of a 128-bit lane.
RepeatedMask[i % LaneSize] = LocalM;
@@ -11718,7 +11820,7 @@
// TODO: Add support for matching multiple PACKSS/PACKUS stages.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
unsigned &PackOpcode, ArrayRef<int> TargetMask,
- SelectionDAG &DAG,
+ const SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned MaxStages = 1) {
unsigned NumElts = VT.getVectorNumElements();
@@ -11729,23 +11831,34 @@
auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
unsigned NumSrcBits = PackVT.getScalarSizeInBits();
unsigned NumPackedBits = NumSrcBits - BitSize;
- SDValue VV1 = DAG.getBitcast(PackVT, N1);
- SDValue VV2 = DAG.getBitcast(PackVT, N2);
+ N1 = peekThroughBitcasts(N1);
+ N2 = peekThroughBitcasts(N2);
+ unsigned NumBits1 = N1.getScalarValueSizeInBits();
+ unsigned NumBits2 = N2.getScalarValueSizeInBits();
+ bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
+ bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
+ if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
+ (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
+ return false;
if (Subtarget.hasSSE41() || BitSize == 8) {
APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
- if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
- (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
- V1 = VV1;
- V2 = VV2;
+ if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
+ (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
+ V1 = N1;
+ V2 = N2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKUS;
return true;
}
}
- if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
- (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
- V1 = VV1;
- V2 = VV2;
+ bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
+ bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
+ if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
+ DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
+ (N2.isUndef() || IsZero2 || IsAllOnes2 ||
+ DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
+ V1 = N1;
+ V2 = N2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKSS;
return true;
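
The relaxed MatchPACK checks above still hinge on the packing being lossless: PACKUS needs the packed-away upper bits to be zero, PACKSS needs them to be sign copies. A scalar model of the 16-to-8-bit case (a sketch, not the DAG code):

#include <cstdint>

// PACKUSWB lane: signed 16-bit saturated to unsigned 8-bit [0, 255].
inline uint8_t packus16to8(int16_t V) {
  return V < 0 ? 0 : V > 255 ? 255 : static_cast<uint8_t>(V);
}

// PACKSSWB lane: signed 16-bit saturated to signed 8-bit [-128, 127].
inline int8_t packss16to8(int16_t V) {
  return V < -128 ? -128 : V > 127 ? 127 : static_cast<int8_t>(V);
}
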
@@ -13703,9 +13816,15 @@
V = extract128BitVector(V, ExtractIdx, DAG, DL);
}
- if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
- V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
- DAG.getBitcast(MVT::f64, V));
+ // On AVX we can use VBROADCAST directly for scalar sources.
+ if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
+ V = DAG.getBitcast(MVT::f64, V);
+ if (Subtarget.hasAVX()) {
+ V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
+ return DAG.getBitcast(VT, V);
+ }
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
+ }
// If this is a scalar, do the broadcast on this type and bitcast.
if (!V.getValueType().isVector()) {
@@ -15878,9 +15997,9 @@
if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
return V;
- // If that doesn't work and we have fast variable shuffle,
+ // If that doesn't work and we have fast variable cross-lane shuffle,
// attempt 32-bit sublanes (vpermd).
- if (!Subtarget.hasFastVariableShuffle())
+ if (!Subtarget.hasFastVariableCrossLaneShuffle())
return SDValue();
return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
@@ -15959,9 +16078,33 @@
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
- if (Subtarget.hasAVX2() && V2.isUndef())
- return SDValue();
+ if (V2.isUndef()) {
+ // Attempt to match VBROADCAST*128 subvector broadcast load.
+ bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
+ bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
+ if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
+ MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
+ auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
+ if (!Ld->isNonTemporal()) {
+ MVT MemVT = VT.getHalfNumVectorElementsVT();
+ unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
+ TypeSize::Fixed(Ofs), DL);
+ SDValue Ops[] = {Ld->getChain(), Ptr};
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+ }
+
+ // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
+ if (Subtarget.hasAVX2())
+ return SDValue();
+ }
bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
@@ -16386,7 +16529,7 @@
if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
!is128BitUnpackShuffleMask(HalfMask) &&
(!isSingleSHUFPSMask(HalfMask) ||
- Subtarget.hasFastVariableShuffle()))
+ Subtarget.hasFastVariableCrossLaneShuffle()))
return SDValue();
// If this is a unary shuffle (assume that the 2nd operand is
// canonicalized to undef), then we can use vpermpd. Otherwise, we
@@ -18353,7 +18496,7 @@
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
- // elements wider than 64 bits, but it might be interesting to form i128
+ // elements wider than 64 bits. It does not seem beneficial to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
@@ -18805,6 +18948,7 @@
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
if (EltVT == MVT::i1)
return InsertBitToMaskVector(Op, DAG, Subtarget);
@@ -18813,9 +18957,37 @@
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
-
auto *N2C = dyn_cast<ConstantSDNode>(N2);
- if (!N2C || N2C->getAPIntValue().uge(NumElts))
+
+ if (!N2C) {
+ // Variable insertion indices, usually we're better off spilling to stack,
+ // but AVX512 can use a variable compare+select by comparing against all
+ // possible vector indices, and FP insertion has less gpr->simd traffic.
+ if (!(Subtarget.hasBWI() ||
+ (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
+ (Subtarget.hasSSE41() && VT.isFloatingPoint())))
+ return SDValue();
+
+ MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
+ MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
+ if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
+ return SDValue();
+
+ SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
+ SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
+ SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
+
+ SmallVector<SDValue, 16> RawIndices;
+ for (unsigned I = 0; I != NumElts; ++I)
+ RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
+ SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
+
+ // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
+ return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
+ ISD::CondCode::SETEQ);
+ }
+
+ if (N2C->getAPIntValue().uge(NumElts))
return SDValue();
uint64_t IdxVal = N2C->getZExtValue();
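A minimal standalone sketch of the variable-index lowering added above (plain C++, no LLVM types; the 8-lane width and helper names are only for illustration): every lane compares the splatted index against its own lane number and selects the splatted element on a match, which is exactly the scalar insert-element semantics.

// Sketch: "inselt V, Elt, Idx" lowered as
//   select (splat(Idx) == {0,1,2,...}) ? splat(Elt) : V
// written with scalar loops so the equivalence is easy to see.
#include <array>
#include <cassert>
#include <cstddef>

constexpr std::size_t NumElts = 8;
using Vec = std::array<int, NumElts>;

// Reference semantics of INSERT_VECTOR_ELT with a variable index.
static Vec insertRef(Vec V, int Elt, std::size_t Idx) {
  V[Idx] = Elt;
  return V;
}

// The compare+select form used by the lowering.
static Vec insertSelect(const Vec &V, int Elt, std::size_t Idx) {
  Vec R;
  for (std::size_t I = 0; I != NumElts; ++I)
    R[I] = (Idx == I) ? Elt : V[I]; // splat(Idx) == {0,1,...} ? splat(Elt) : V
  return R;
}

int main() {
  Vec V{10, 11, 12, 13, 14, 15, 16, 17};
  for (std::size_t Idx = 0; Idx != NumElts; ++Idx)
    assert(insertRef(V, -1, Idx) == insertSelect(V, -1, Idx));
  return 0;
}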
@@ -18826,7 +18998,7 @@
// a blend shuffle with a rematerializable vector than a costly integer
// insertion.
if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
- 16 <= EltVT.getSizeInBits()) {
+ (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
BlendMask.push_back(i == IdxVal ? i + NumElts : i);
@@ -18856,7 +19028,7 @@
SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired chunk.
- unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
+ unsigned NumEltsIn128 = 128 / EltSizeInBits;
assert(isPowerOf2_32(NumEltsIn128));
// Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
@@ -18881,7 +19053,7 @@
// it to i32 first.
if (EltVT == MVT::i16 || EltVT == MVT::i8) {
N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
- MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
return DAG.getBitcast(VT, N1);
@@ -19505,50 +19677,9 @@
/// and take a 2 x i32 value to shift plus a shift amount.
/// TODO: Can this be moved to general expansion code?
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getNumOperands() == 3 && "Not a double-shift!");
- MVT VT = Op.getSimpleValueType();
- unsigned VTBits = VT.getSizeInBits();
- SDLoc dl(Op);
- bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
- SDValue ShOpLo = Op.getOperand(0);
- SDValue ShOpHi = Op.getOperand(1);
- SDValue ShAmt = Op.getOperand(2);
- // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
- // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away
- // during isel.
- SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
- DAG.getConstant(VTBits - 1, dl, MVT::i8));
- SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
- DAG.getConstant(VTBits - 1, dl, MVT::i8))
- : DAG.getConstant(0, dl, VT);
-
- SDValue Tmp2, Tmp3;
- if (Op.getOpcode() == ISD::SHL_PARTS) {
- Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
- Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
- } else {
- Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
- Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
- }
-
- // If the shift amount is larger or equal than the width of a part we can't
- // rely on the results of shld/shrd. Insert a test and select the appropriate
- // values for large shift amounts.
- SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i8));
- SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
- DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
-
- SDValue Hi, Lo;
- if (Op.getOpcode() == ISD::SHL_PARTS) {
- Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
- Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
- } else {
- Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
- Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
- }
-
- return DAG.getMergeValues({ Lo, Hi }, dl);
+ SDValue Lo, Hi;
+ DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
+ return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
}
static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
@@ -21148,6 +21279,44 @@
llvm_unreachable("All 256->128 cases should have been handled above!");
}
+// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
+// behaves on out of range inputs to generate optimized conversions.
+static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT SrcVT = Src.getSimpleValueType();
+ unsigned DstBits = VT.getScalarSizeInBits();
+ assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
+
+ // Calculate the converted result for values in the range 0 to
+ // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
+ SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
+ SDValue Big =
+ DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
+ DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
+ DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
+
+ // The "CVTTP2SI" instruction conveniently sets the sign bit if
+ // and only if the value was out of range. So we can use that
+  // as our indicator that we should use "Big" instead of "Small".
+ //
+ // Use "Small" if "IsOverflown" has all bits cleared
+ // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
+
+ // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
+ // use the slightly slower blendv select instead.
+ if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
+ SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
+ return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
+ }
+
+ SDValue IsOverflown =
+ DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
+ DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
+ return DAG.getNode(ISD::OR, dl, VT, Small,
+ DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
+}
+
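A minimal standalone sketch of the Small/Big selection implemented by expandFP_TO_UINT_SSE above (plain C++, one lane; the cvtt helper is an assumption of this sketch that models the documented cvttps2dq behaviour of returning 0x80000000 for out-of-range or NaN inputs, it is not code from the patch):

// Standalone sketch of the Small/Big selection for one lane.
// cvtt models cvttps2dq: truncate toward zero, and return 0x80000000
// ("integer indefinite") for out-of-range or NaN inputs.
#include <cassert>
#include <cstdint>

static int32_t cvtt(float X) {
  if (!(X >= -2147483648.0f && X < 2147483648.0f))
    return INT32_MIN;
  return static_cast<int32_t>(X);
}

static uint32_t fpToUint32(float X) {
  int32_t Small = cvtt(X);               // correct for X < 2^31
  int32_t Big = cvtt(X - 2147483648.0f); // correct for 2^31 <= X < 2^32
  // Small's sign bit is set iff it overflowed; splat it across the lane
  // (arithmetic right shift of a negative value, as VSRAI does).
  int32_t IsOverflown = Small >> 31;
  return static_cast<uint32_t>(Small | (Big & IsOverflown));
}

int main() {
  assert(fpToUint32(0.0f) == 0u);
  assert(fpToUint32(1234.5f) == 1234u);
  assert(fpToUint32(2147483520.0f) == 2147483520u); // largest float < 2^31
  assert(fpToUint32(2147483648.0f) == 2147483648u); // 2^31, takes the Big path
  assert(fpToUint32(4294967040.0f) == 4294967040u); // largest float < 2^32
  return 0;
}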
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
@@ -21207,10 +21376,10 @@
// Widen vXi32 fp_to_uint with avx512f to 512-bit source.
if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
- (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
+ (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
+ Subtarget.useAVX512Regs()) {
assert(!IsSigned && "Expected unsigned conversion!");
- assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
- "Unexpected features!");
+ assert(!Subtarget.hasVLX() && "Unexpected features!");
MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
// Need to concat with zero vector for strict fp to avoid spurious
@@ -21240,9 +21409,9 @@
// Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
- (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
- assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
- !Subtarget.hasVLX() && "Unexpected features!");
+ (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
+ Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
+ assert(!Subtarget.hasVLX() && "Unexpected features!");
MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
@@ -21269,7 +21438,7 @@
return Res;
}
- if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
+ if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
if (!Subtarget.hasVLX()) {
// Non-strict nodes without VLX can we widened to v4f32->v4i64 by type
// legalizer and then widened again by vector op legalization.
@@ -21284,9 +21453,7 @@
SDValue Chain = Tmp.getValue(1);
Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
DAG.getIntPtrConstant(0, dl));
- if (IsStrict)
- return DAG.getMergeValues({Tmp, Chain}, dl);
- return Tmp;
+ return DAG.getMergeValues({Tmp, Chain}, dl);
}
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
@@ -21301,6 +21468,15 @@
return DAG.getNode(Opc, dl, VT, Tmp);
}
+ // Generate optimized instructions for pre AVX512 unsigned conversions from
+ // vXf32 to vXi32.
+ if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
+ (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
+ (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
+ assert(!IsSigned && "Expected unsigned conversion!");
+ return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
+ }
+
return SDValue();
}
@@ -21313,6 +21489,39 @@
if (Subtarget.hasAVX512())
return Op;
+ // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
+ // behaves on out of range inputs to generate optimized conversions.
+ if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
+ (VT == MVT::i64 && Subtarget.is64Bit()))) {
+ unsigned DstBits = VT.getScalarSizeInBits();
+ APInt UIntLimit = APInt::getSignMask(DstBits);
+ SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
+ DAG.getConstant(UIntLimit, dl, VT));
+ MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
+
+ // Calculate the converted result for values in the range:
+ // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
+ // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
+ SDValue Small =
+ DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
+ SDValue Big = DAG.getNode(
+ X86ISD::CVTTS2SI, dl, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
+ DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
+
+ // The "CVTTS2SI" instruction conveniently sets the sign bit if
+ // and only if the value was out of range. So we can use that
+    // as our indicator that we should use "Big" instead of "Small".
+ //
+ // Use "Small" if "IsOverflown" has all bits cleared
+ // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
+ SDValue IsOverflown = DAG.getNode(
+ ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
+ return DAG.getNode(ISD::OR, dl, VT, Small,
+ DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
+ }
+
// Use default expansion for i64.
if (VT == MVT::i64)
return SDValue();
@@ -21475,7 +21684,8 @@
if (!isScalarFPTypeInSSEReg(SrcVT))
return SDValue();
- unsigned SatWidth = Node->getConstantOperandVal(1);
+ EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ unsigned SatWidth = SatVT.getScalarSizeInBits();
unsigned DstWidth = DstVT.getScalarSizeInBits();
unsigned TmpWidth = TmpVT.getScalarSizeInBits();
assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
@@ -21871,7 +22081,8 @@
// And if it is bigger, shrink it first.
if (Sign.getSimpleValueType().bitsGT(VT))
- Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
+ Sign =
+ DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
@@ -22015,12 +22226,9 @@
SrcMask->push_back(SrcOpMap[SrcOp]);
} else {
// Quit if not all elements are used.
- for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
- E = SrcOpMap.end();
- I != E; ++I) {
- if (!I->second.isAllOnesValue())
+ for (const auto &I : SrcOpMap)
+ if (!I.second.isAllOnesValue())
return false;
- }
}
return true;
@@ -22681,23 +22889,21 @@
/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
/// concatenate the result back.
-static SDValue splitIntVSETCC(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
+ ISD::CondCode Cond, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ assert(VT.isInteger() && VT == LHS.getValueType() &&
+ VT == RHS.getValueType() && "Unsupported VTs!");
- assert(Op.getOpcode() == ISD::SETCC && "Unsupported operation");
- assert(Op.getOperand(0).getValueType().isInteger() &&
- VT == Op.getOperand(0).getValueType() && "Unsupported VTs!");
-
- SDLoc dl(Op);
- SDValue CC = Op.getOperand(2);
+ SDValue CC = DAG.getCondCode(Cond);
// Extract the LHS Lo/Hi vectors
SDValue LHS1, LHS2;
- std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
+ std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
// Extract the RHS Lo/Hi vectors
SDValue RHS1, RHS2;
- std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
+ std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
EVT LoVT, HiVT;
@@ -23047,17 +23253,18 @@
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return splitIntVSETCC(Op, DAG);
+ return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
if (VT == MVT::v32i16 || VT == MVT::v64i8) {
assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
- return splitIntVSETCC(Op, DAG);
+ return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
}
- // If this is a SETNE against the signed minimum value, change it to SETGT.
- // If this is a SETNE against the signed maximum value, change it to SETLT.
- // which will be swapped to SETGT.
- // Otherwise we use PCMPEQ+invert.
+ // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
+ // not-of-PCMPEQ:
+ // X != INT_MIN --> X >s INT_MIN
+ // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
+ // +X != 0 --> +X >s 0
APInt ConstValue;
if (Cond == ISD::SETNE &&
ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
@@ -23065,6 +23272,8 @@
Cond = ISD::SETGT;
else if (ConstValue.isMaxSignedValue())
Cond = ISD::SETLT;
+ else if (ConstValue.isNullValue() && DAG.SignBitIsZero(Op0))
+ Cond = ISD::SETGT;
}
// If both operands are known non-negative, then an unsigned compare is the
@@ -23404,6 +23613,33 @@
}
if (Op0.getSimpleValueType().isInteger()) {
+ // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
+    // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
+    // this may translate to fewer uops depending on the uarch implementation. The
+ // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
+ // canonicalize to that CondCode.
+ // NOTE: Only do this if incrementing the constant doesn't increase the bit
+ // encoding size - so it must either already be a i8 or i32 immediate, or it
+    // encoding size - so it must either already be an i8 or i32 immediate, or it
+ // constant materializations.
+ // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
+ if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
+ const APInt &Op1Val = Op1C->getAPIntValue();
+ if (!Op1Val.isNullValue()) {
+ // Ensure the constant+1 doesn't overflow.
+ if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
+ (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
+ APInt Op1ValPlusOne = Op1Val + 1;
+ if (Op1ValPlusOne.isSignedIntN(32) &&
+ (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
+ Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
+ CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
+ : ISD::CondCode::SETUGE;
+ }
+ }
+ }
+ }
+
SDValue X86CC;
SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
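A minimal standalone check of the rewrite described in the comment block above (plain C++; the ranges and the 127/128 example are chosen only for illustration): "x > C" and "x >= C+1" agree for both signed and unsigned compares, and incrementing 127 is exactly the case where an imm8 would grow to an imm32.

// Standalone check of the SETGT/SETUGT -> SETGE/SETUGE rewrite:
//   x >s C  ==>  x >=s C+1   (valid unless C is the maximum signed value)
//   x >u C  ==>  x >=u C+1   (valid unless C is the maximum unsigned value)
#include <cassert>
#include <cstdint>

static bool fitsI8(int64_t V) { return V >= -128 && V <= 127; }

int main() {
  // Signed identity, checked exhaustively over a small range.
  for (int X = -300; X <= 300; ++X)
    for (int C = -200; C <= 200; ++C)
      assert((X > C) == (X >= C + 1));

  // Unsigned identity.
  for (uint32_t X = 0; X <= 600; ++X)
    for (uint32_t C = 0; C <= 400; ++C)
      assert((X > C) == (X >= C + 1));

  // The encoding guard from the comment: 127 -> 128 would turn an imm8
  // compare into an imm32 compare, so that case is intentionally skipped.
  assert(fitsI8(127) && !fitsI8(128));
  return 0;
}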
@@ -24996,6 +25232,9 @@
MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
+ // Propagate flags from original node to transformed node(s).
+ SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
+
if (IntrData) {
switch(IntrData->Type) {
case INTR_TYPE_1OP: {
@@ -25815,7 +26054,7 @@
if (RegInfo->hasBasePointer(MF))
Reg = RegInfo->getBaseRegister();
else { // Handles the SP or FP case.
- bool CantUseFP = RegInfo->needsStackRealignment(MF);
+ bool CantUseFP = RegInfo->hasStackRealignment(MF);
if (CantUseFP)
Reg = RegInfo->getPtrSizedStackRegister(MF);
else
@@ -25823,7 +26062,27 @@
}
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
-
+ case Intrinsic::swift_async_context_addr: {
+ auto &MF = DAG.getMachineFunction();
+ auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ if (Subtarget.is64Bit()) {
+ MF.getFrameInfo().setFrameAddressIsTaken(true);
+ X86FI->setHasSwiftAsyncContext(true);
+ return SDValue(
+ DAG.getMachineNode(
+ X86::SUB64ri8, dl, MVT::i64,
+ DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
+ DAG.getTargetConstant(8, dl, MVT::i32)),
+ 0);
+ } else {
+      // 32-bit targets have no special extended frame, so create or reuse an
+      // existing stack slot.
+ if (!X86FI->getSwiftAsyncContextFrameIdx())
+ X86FI->setSwiftAsyncContextFrameIdx(
+ MF.getFrameInfo().CreateStackObject(4, Align(4), false));
+ return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
+ }
+ }
case Intrinsic::x86_avx512_vp2intersect_q_512:
case Intrinsic::x86_avx512_vp2intersect_q_256:
case Intrinsic::x86_avx512_vp2intersect_q_128:
@@ -26632,7 +26891,8 @@
Register X86TargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Funclet personalities don't use selectors (the runtime does the selection).
- assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
+ if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
+ return X86::NoRegister;
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}
@@ -26805,6 +27065,7 @@
case CallingConv::X86_ThisCall:
case CallingConv::Fast:
case CallingConv::Tail:
+ case CallingConv::SwiftTail:
// Pass 'nest' parameter in EAX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::EAX;
@@ -26910,6 +27171,118 @@
return DAG.getMergeValues({RetVal, Chain}, DL);
}
+SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDLoc DL(Op);
+ SDValue Chain = Op.getNode()->getOperand(0);
+
+  // The FP control word can be set only from data in memory, so we need to
+  // allocate stack space to save/load the FP control word.
+ int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
+ SDValue StackSlot =
+ DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
+
+ // Store FP control word into memory.
+ SDValue Ops[] = {Chain, StackSlot};
+ Chain = DAG.getMemIntrinsicNode(
+ X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
+
+ // Load FP Control Word from stack slot and clear RM field (bits 11:10).
+ SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
+ Chain = CWD.getValue(1);
+ CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
+ DAG.getConstant(0xf3ff, DL, MVT::i16));
+
+ // Calculate new rounding mode.
+ SDValue NewRM = Op.getNode()->getOperand(1);
+ SDValue RMBits;
+ if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
+ uint64_t RM = CVal->getZExtValue();
+ int FieldVal;
+ switch (static_cast<RoundingMode>(RM)) {
+ case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
+ case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
+ case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
+ case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
+ default:
+ llvm_unreachable("rounding mode is not supported by X86 hardware");
+ }
+ RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
+ } else {
+ // Need to convert argument into bits of control word:
+ // 0 Round to 0 -> 11
+ // 1 Round to nearest -> 00
+ // 2 Round to +inf -> 10
+ // 3 Round to -inf -> 01
+    // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
+ // To make the conversion, put all these values into a value 0xc9 and shift
+ // it left depending on the rounding mode:
+ // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
+ // (0xc9 << 6) & 0xc00 = X86::rmToNearest
+ // ...
+ // (0xc9 << (2 * NewRM + 4)) & 0xc00
+ SDValue ShiftValue =
+ DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+ DAG.getNode(ISD::ADD, DL, MVT::i32,
+ DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
+ DAG.getConstant(1, DL, MVT::i8)),
+ DAG.getConstant(4, DL, MVT::i32)));
+ SDValue Shifted =
+ DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
+ ShiftValue);
+ RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
+ DAG.getConstant(0xc00, DL, MVT::i16));
+ }
+
+ // Update rounding mode bits and store the new FP Control Word into stack.
+ CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
+ Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
+
+ // Load FP control word from the slot.
+ SDValue OpsLD[] = {Chain, StackSlot};
+ MachineMemOperand *MMOL =
+ MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
+ Chain = DAG.getMemIntrinsicNode(
+ X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
+
+ // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
+ // same way but in bits 14:13.
+ if (Subtarget.hasSSE1()) {
+ // Store MXCSR into memory.
+ Chain = DAG.getNode(
+ ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
+ DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
+ StackSlot);
+
+ // Load MXCSR from stack slot and clear RM field (bits 14:13).
+ SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
+ Chain = CWD.getValue(1);
+ CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
+ DAG.getConstant(0xffff9fff, DL, MVT::i32));
+
+ // Shift X87 RM bits from 11:10 to 14:13.
+ RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
+ RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
+ DAG.getConstant(3, DL, MVT::i8));
+
+ // Update rounding mode bits and store the new FP Control Word into stack.
+ CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
+ Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
+
+ // Load MXCSR from the slot.
+ Chain = DAG.getNode(
+ ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
+ DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
+ StackSlot);
+ }
+
+ return Chain;
+}
+
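A minimal standalone check of the "(0xc9 << (2 * NewRM + 4)) & 0xc00" packing trick used in LowerSET_ROUNDING above (plain C++; the expected values are spelled out numerically instead of using the X86::rm* enumerators):

// Standalone check of the rounding-mode packing trick:
//   RC field (bits 11:10) = (0xc9 << (2 * NewRM + 4)) & 0xc00
// FLT_ROUNDS argument: 0 = toward zero, 1 = to nearest, 2 = +inf, 3 = -inf.
// x87 RC encoding:     nearest = 0b00, down = 0b01, up = 0b10, zero = 0b11.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Expected[4] = {
      0xC00, // 0: toward zero -> RC = 11
      0x000, // 1: to nearest  -> RC = 00
      0x800, // 2: toward +inf -> RC = 10
      0x400, // 3: toward -inf -> RC = 01
  };
  for (uint32_t NewRM = 0; NewRM != 4; ++NewRM) {
    uint32_t RMBits = (0xC9u << (2 * NewRM + 4)) & 0xC00u;
    assert(RMBits == Expected[NewRM]);
  }
  return 0;
}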
/// Lower a vector CTLZ using native supported vector CTLZ instruction.
//
// i8/i16 vector implemented using dword LZCNT vector instruction
@@ -27116,10 +27489,6 @@
if (VT == MVT::i16 || VT == MVT::i32)
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
- if (VT.getScalarType() == MVT::i1)
- return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
- Op.getOperand(0), Op.getOperand(1));
-
if (VT == MVT::v32i16 || VT == MVT::v64i8)
return splitVectorIntBinary(Op, DAG);
@@ -27136,20 +27505,6 @@
unsigned Opcode = Op.getOpcode();
SDLoc DL(Op);
- if (VT.getScalarType() == MVT::i1) {
- switch (Opcode) {
- default: llvm_unreachable("Expected saturated arithmetic opcode");
- case ISD::UADDSAT:
- case ISD::SADDSAT:
- // *addsat i1 X, Y --> X | Y
- return DAG.getNode(ISD::OR, DL, VT, X, Y);
- case ISD::USUBSAT:
- case ISD::SSUBSAT:
- // *subsat i1 X, Y --> X & ~Y
- return DAG.getNode(ISD::AND, DL, VT, X, DAG.getNOT(DL, Y, VT));
- }
- }
-
if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
(VT.is256BitVector() && !Subtarget.hasInt256())) {
assert(Op.getSimpleValueType().isInteger() &&
@@ -27233,9 +27588,6 @@
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- if (VT.getScalarType() == MVT::i1)
- return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
-
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntBinary(Op, DAG);
@@ -27273,7 +27625,7 @@
SDValue BLo, BHi;
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
- // If the LHS is a constant, manually unpackl/unpackh.
+ // If the RHS is a constant, manually unpackl/unpackh.
SmallVector<SDValue, 16> LoOps, HiOps;
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
@@ -27376,6 +27728,94 @@
return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}
+static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
+ MVT VT, bool IsSigned,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG,
+ SDValue *Low = nullptr) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
+ // to a vXi16 type. Do the multiplies, shift the results and pack the half
+ // lane results back together.
+
+ // We'll take different approaches for signed and unsigned.
+  // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
+  // and use pmullw to calculate the full 16-bit product.
+  // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
+ // shift them left into the upper byte of each word. This allows us to use
+ // pmulhw to calculate the full 16-bit product. This trick means we don't
+ // need to sign extend the bytes to use pmullw.
+
+ MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+
+ SDValue ALo, AHi;
+ if (IsSigned) {
+ ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
+ AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
+ } else {
+ ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
+ AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
+ }
+
+ SDValue BLo, BHi;
+ if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
+ // If the RHS is a constant, manually unpackl/unpackh and extend.
+ SmallVector<SDValue, 16> LoOps, HiOps;
+ for (unsigned i = 0; i != NumElts; i += 16) {
+ for (unsigned j = 0; j != 8; ++j) {
+ SDValue LoOp = B.getOperand(i + j);
+ SDValue HiOp = B.getOperand(i + j + 8);
+
+ if (IsSigned) {
+ LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
+ HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
+ LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
+ DAG.getConstant(8, dl, MVT::i16));
+ HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
+ DAG.getConstant(8, dl, MVT::i16));
+ } else {
+ LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
+ HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
+ }
+
+ LoOps.push_back(LoOp);
+ HiOps.push_back(HiOp);
+ }
+ }
+
+ BLo = DAG.getBuildVector(ExVT, dl, LoOps);
+ BHi = DAG.getBuildVector(ExVT, dl, HiOps);
+ } else if (IsSigned) {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
+ BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
+ } else {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
+ BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
+ }
+
+  // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
+ // pack back to vXi8.
+ unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
+ SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
+ SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
+
+ if (Low) {
+ // Mask the lower bits and pack the results to rejoin the halves.
+ SDValue Mask = DAG.getConstant(255, dl, ExVT);
+ SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
+ SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
+ *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
+ }
+
+ RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
+ RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
+
+ // Bitcast back to VT and then pack all the even elements from Lo and Hi.
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+}
+
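A minimal standalone check of the identity behind the signed path of LowervXi8MulWithUNPCK above (plain C++, one lane): placing each byte in the upper byte of a 16-bit word and taking the pmulhw-style high half of the 16x16->32 product yields the full 16-bit product of the two bytes, so no real sign extension is needed.

// Standalone check of the signed vXi8 multiply trick: put each byte into the
// upper byte of an i16 word, take the high 16 bits of the 16x16->32 product
// (what pmulhw computes), and compare against the plain 16-bit product.
#include <cassert>
#include <cstdint>

// One lane of pmulhw: high 16 bits of the signed 32-bit product. Assumes an
// arithmetic right shift of negative values (guaranteed since C++20 and what
// mainstream compilers do).
static int16_t pmulhw(int16_t A, int16_t B) {
  return static_cast<int16_t>((static_cast<int32_t>(A) * B) >> 16);
}

int main() {
  for (int A = -128; A <= 127; ++A) {
    for (int B = -128; B <= 127; ++B) {
      int16_t WideA = static_cast<int16_t>(A * 256); // byte in the upper half
      int16_t WideB = static_cast<int16_t>(B * 256);
      assert(pmulhw(WideA, WideB) == static_cast<int16_t>(A * B));
    }
  }
  return 0;
}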
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -27467,11 +27907,11 @@
// With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
// and then ashr/lshr the upper bits down to the lower bits before multiply.
- unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
+ unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
@@ -27479,92 +27919,139 @@
return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
}
- // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
- // to a vXi16 type. Do the multiplies, shift the results and pack the half
- // lane results back together.
+ return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
+}
- MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+// Custom lowering for SMULO/UMULO.
+static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
- static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1};
+ // Scalars defer to LowerXALUO.
+ if (!VT.isVector())
+ return LowerXALUO(Op, DAG);
- // Extract the lo parts and zero/sign extend to i16.
- // Only use SSE4.1 instructions for signed v16i8 where using unpack requires
- // shifts to sign extend. Using unpack for unsigned only requires an xor to
- // create zeros and a copy due to tied registers contraints pre-avx. But using
- // zero_extend_vector_inreg would require an additional pshufd for the high
- // part.
+ SDLoc dl(Op);
+ bool IsSigned = Op->getOpcode() == ISD::SMULO;
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+ EVT OvfVT = Op->getValueType(1);
- SDValue ALo, AHi;
- if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
- ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
+ if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
+ (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
+ // Extract the LHS Lo/Hi vectors
+ SDValue LHSLo, LHSHi;
+ std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
- AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
- AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
- } else if (IsSigned) {
- ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
- AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
+ // Extract the RHS Lo/Hi vectors
+ SDValue RHSLo, RHSHi;
+ std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
- ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
- AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
- } else {
- ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
- DAG.getConstant(0, dl, VT)));
- AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
- DAG.getConstant(0, dl, VT)));
+ EVT LoOvfVT, HiOvfVT;
+ std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
+ SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
+ SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
+
+ // Issue the split operations.
+ SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
+ SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
+
+ // Join the separate data results and the overflow results.
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
+ Hi.getValue(1));
+
+ return DAG.getMergeValues({Res, Ovf}, dl);
}
- SDValue BLo, BHi;
- if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
- // If the LHS is a constant, manually unpackl/unpackh and extend.
- SmallVector<SDValue, 16> LoOps, HiOps;
- for (unsigned i = 0; i != NumElts; i += 16) {
- for (unsigned j = 0; j != 8; ++j) {
- SDValue LoOp = B.getOperand(i + j);
- SDValue HiOp = B.getOperand(i + j + 8);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetccVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
- if (IsSigned) {
- LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
- HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
- } else {
- LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
- HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
+ if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
+ (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
+ unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
+ SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
+
+ SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
+
+ SDValue Ovf;
+ if (IsSigned) {
+ SDValue High, LowSign;
+ if (OvfVT.getVectorElementType() == MVT::i1 &&
+ (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
+        // Rather than truncating, try to do the compare on vXi16 or vXi32.
+ // Shift the high down filling with sign bits.
+ High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
+ // Fill all 16 bits with the sign bit from the low.
+ LowSign =
+ getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
+ LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
+ 15, DAG);
+ SetccVT = OvfVT;
+ if (!Subtarget.hasBWI()) {
+ // We can't do a vXi16 compare so sign extend to v16i32.
+ High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
+ LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
}
-
- LoOps.push_back(LoOp);
- HiOps.push_back(HiOp);
+ } else {
+ // Otherwise do the compare at vXi8.
+ High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
+ High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
+ LowSign =
+ DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
}
+
+ Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
+ } else {
+ SDValue High =
+ getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
+ if (OvfVT.getVectorElementType() == MVT::i1 &&
+ (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
+        // Rather than truncating, try to do the compare on vXi16 or vXi32.
+ SetccVT = OvfVT;
+ if (!Subtarget.hasBWI()) {
+ // We can't do a vXi16 compare so sign extend to v16i32.
+ High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
+ }
+ } else {
+ // Otherwise do the compare at vXi8.
+ High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
+ }
+
+ Ovf =
+ DAG.getSetCC(dl, SetccVT, High,
+ DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
}
- BLo = DAG.getBuildVector(ExVT, dl, LoOps);
- BHi = DAG.getBuildVector(ExVT, dl, HiOps);
- } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
- BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
+ Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
- BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
- BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
- } else if (IsSigned) {
- BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
- BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
-
- BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
- BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
- } else {
- BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
- DAG.getConstant(0, dl, VT)));
- BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
- DAG.getConstant(0, dl, VT)));
+ return DAG.getMergeValues({Low, Ovf}, dl);
}
- // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
- // pack back to vXi8.
- SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
- SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
- RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
- RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
+ SDValue Low;
+ SDValue High =
+ LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
- // Bitcast back to VT and then pack all the even elements from Lo and Hi.
- return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ SDValue Ovf;
+ if (IsSigned) {
+ // SMULO overflows if the high bits don't match the sign of the low.
+ SDValue LowSign =
+ DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
+ Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
+ } else {
+ // UMULO overflows if the high bits are non-zero.
+ Ovf =
+ DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
+ }
+
+ Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
+
+ return DAG.getMergeValues({Low, Ovf}, dl);
}
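A minimal standalone check of the two overflow tests used in LowerMULO above (plain C++, exhaustive over all 8-bit operand pairs): SMULO overflows exactly when the high byte differs from the sign-splat of the low byte, and UMULO exactly when the high byte is non-zero.

// Standalone check of the vXi8 SMULO/UMULO overflow conditions.
#include <cassert>
#include <cstdint>

int main() {
  // Signed: overflow iff the high byte of the 16-bit product differs from the
  // sign-splat of the low byte.
  for (int A = -128; A <= 127; ++A) {
    for (int B = -128; B <= 127; ++B) {
      uint16_t Bits = static_cast<uint16_t>(A * B);
      int8_t Low = static_cast<int8_t>(Bits & 0xFF);
      int8_t High = static_cast<int8_t>(Bits >> 8);
      int8_t LowSign = Low < 0 ? -1 : 0; // SRA by 7 on the low byte
      bool Overflow = (High != LowSign);
      assert(Overflow == (A * B < -128 || A * B > 127));
    }
  }
  // Unsigned: overflow iff the high byte of the 16-bit product is non-zero.
  for (unsigned A = 0; A <= 255; ++A) {
    for (unsigned B = 0; B <= 255; ++B) {
      uint16_t Prod = static_cast<uint16_t>(A * B);
      bool Overflow = (Prod >> 8) != 0;
      assert(Overflow == (A * B > 255));
    }
  }
  return 0;
}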
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
@@ -27777,8 +28264,8 @@
ShiftAmt, DAG);
SRL = DAG.getBitcast(VT, SRL);
// Zero out the leftmost bits.
- return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
+ APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
+ return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
}
if (Op.getOpcode() == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
@@ -28574,8 +29061,18 @@
}
// ISD::ROT* uses modulo rotate amounts.
- Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
- DAG.getConstant(EltSizeInBits - 1, DL, VT));
+ if (SDValue BaseRotAmt = DAG.getSplatValue(Amt)) {
+    // If the amount is a splat, perform the modulo BEFORE the splat;
+    // this helps LowerScalarVariableShift to remove the splat later.
+ Amt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BaseRotAmt);
+ Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
+ DAG.getConstant(EltSizeInBits - 1, DL, VT));
+ Amt = DAG.getVectorShuffle(VT, DL, Amt, DAG.getUNDEF(VT),
+ SmallVector<int>(NumElts, 0));
+ } else {
+ Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
+ DAG.getConstant(EltSizeInBits - 1, DL, VT));
+ }
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
@@ -28771,9 +29268,8 @@
Builder.CreateCall(MFence, {});
// Finally we can emit the atomic load.
- LoadInst *Loaded =
- Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
- Align(AI->getType()->getPrimitiveSizeInBits()));
+ LoadInst *Loaded = Builder.CreateAlignedLoad(
+ AI->getType(), AI->getPointerOperand(), AI->getAlign());
Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
@@ -29410,7 +29906,7 @@
// during codegen and then dropped. Note that we expect (but don't assume),
// that orderings other than seq_cst and acq_rel have been canonicalized to
// a store or load.
- if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+ if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
AN->getSyncScopeID() == SyncScope::System) {
// Prefer a locked operation against a stack location to minimize cache
// traffic. This assumes that stack locations are very likely to be
@@ -29443,7 +29939,8 @@
SDLoc dl(Node);
EVT VT = Node->getMemoryVT();
- bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
+ bool IsSeqCst =
+ Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
// If this store is not sequentially consistent and the type is legal
@@ -30027,6 +30524,7 @@
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+ case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
case ISD::CTTZ:
@@ -30042,9 +30540,9 @@
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
- case ISD::USUBO:
+ case ISD::USUBO: return LowerXALUO(Op, DAG);
case ISD::SMULO:
- case ISD::UMULO: return LowerXALUO(Op, DAG);
+ case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
case ISD::SADDO_CARRY:
@@ -30405,12 +30903,19 @@
} else
Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
- // Preserve what we know about the size of the original result. Except
- // when the result is v2i32 since we can't widen the assert.
- if (PromoteVT != MVT::v2i32)
- Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
- dl, PromoteVT, Res,
- DAG.getValueType(VT.getVectorElementType()));
+ // Preserve what we know about the size of the original result. If the
+ // result is v2i32, we have to manually widen the assert.
+ if (PromoteVT == MVT::v2i32)
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
+ DAG.getUNDEF(MVT::v2i32));
+
+ Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
+ Res.getValueType(), Res,
+ DAG.getValueType(VT.getVectorElementType()));
+
+ if (PromoteVT == MVT::v2i32)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
+ DAG.getIntPtrConstant(0, dl));
// Truncate back to the original width.
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
@@ -30430,12 +30935,19 @@
if (VT == MVT::v2i32) {
- assert((IsSigned || Subtarget.hasAVX512()) &&
- "Can only handle signed conversion without AVX512");
+ assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
+ "Strict unsigned conversion requires AVX512");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
if (Src.getValueType() == MVT::v2f64) {
+ if (!IsSigned && !Subtarget.hasAVX512()) {
+ SDValue Res =
+ expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
+ Results.push_back(Res);
+ return;
+ }
+
unsigned Opc;
if (IsStrict)
Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
@@ -30981,6 +31493,7 @@
NODE_NAME_CASE(FLD)
NODE_NAME_CASE(FST)
NODE_NAME_CASE(CALL)
+ NODE_NAME_CASE(CALL_RVMARKER)
NODE_NAME_CASE(BT)
NODE_NAME_CASE(CMP)
NODE_NAME_CASE(FCMP)
@@ -31047,6 +31560,7 @@
NODE_NAME_CASE(EH_RETURN)
NODE_NAME_CASE(TC_RETURN)
NODE_NAME_CASE(FNSTCW16m)
+ NODE_NAME_CASE(FLDCW16m)
NODE_NAME_CASE(LCMPXCHG_DAG)
NODE_NAME_CASE(LCMPXCHG8_DAG)
NODE_NAME_CASE(LCMPXCHG16_DAG)
@@ -32040,81 +32554,6 @@
return endMBB;
}
-MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
- MachineInstr &MI, MachineBasicBlock *MBB) const {
- // Emit code to save XMM registers to the stack. The ABI says that the
- // number of registers to save is given in %al, so it's theoretically
- // possible to do an indirect jump trick to avoid saving all of them,
- // however this code takes a simpler approach and just executes all
- // of the stores if %al is non-zero. It's less code, and it's probably
- // easier on the hardware branch predictor, and stores aren't all that
- // expensive anyway.
-
- // Create the new basic blocks. One block contains all the XMM stores,
- // and one block is the final destination regardless of whether any
- // stores were performed.
- const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction *F = MBB->getParent();
- MachineFunction::iterator MBBIter = ++MBB->getIterator();
- MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(MBBIter, XMMSaveMBB);
- F->insert(MBBIter, EndMBB);
-
- // Transfer the remainder of MBB and its successor edges to EndMBB.
- EndMBB->splice(EndMBB->begin(), MBB,
- std::next(MachineBasicBlock::iterator(MI)), MBB->end());
- EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
-
- // The original block will now fall through to the XMM save block.
- MBB->addSuccessor(XMMSaveMBB);
- // The XMMSaveMBB will fall through to the end block.
- XMMSaveMBB->addSuccessor(EndMBB);
-
- // Now add the instructions.
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- const DebugLoc &DL = MI.getDebugLoc();
-
- Register CountReg = MI.getOperand(0).getReg();
- int RegSaveFrameIndex = MI.getOperand(1).getImm();
- int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
-
- if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
- // If %al is 0, branch around the XMM save block.
- BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
- BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
- MBB->addSuccessor(EndMBB);
- }
-
- // Make sure the last operand is EFLAGS, which gets clobbered by the branch
- // that was just emitted, but clearly shouldn't be "saved".
- assert((MI.getNumOperands() <= 3 ||
- !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
- MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
- "Expected last argument to be EFLAGS");
- unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
- // In the XMM save block, save all the XMM argument registers.
- for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
- int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
- MachineMemOperand *MMO = F->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
- MachineMemOperand::MOStore,
- /*Size=*/16, Align(16));
- BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
- .addFrameIndex(RegSaveFrameIndex)
- .addImm(/*Scale=*/1)
- .addReg(/*IndexReg=*/0)
- .addImm(/*Disp=*/Offset)
- .addReg(/*Segment=*/0)
- .addReg(MI.getOperand(i).getReg())
- .addMemOperand(MMO);
- }
-
- MI.eraseFromParent(); // The pseudo instruction is gone now.
-
- return EndMBB;
-}
-
// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
@@ -32178,7 +32617,7 @@
MachineBasicBlock *SinkMBB) {
MachineFunction *MF = TrueMBB->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
- DebugLoc DL = MIItBegin->getDebugLoc();
+ const DebugLoc &DL = MIItBegin->getDebugLoc();
X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
@@ -32230,7 +32669,7 @@
MachineInstr &SecondCascadedCMOV,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = FirstCMOV.getDebugLoc();
+ const DebugLoc &DL = FirstCMOV.getDebugLoc();
// We lower cascaded CMOVs such as
//
@@ -33948,9 +34387,6 @@
case X86::XBEGIN:
return emitXBegin(MI, BB, Subtarget.getInstrInfo());
- case X86::VASTART_SAVE_XMM_REGS:
- return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
-
case X86::VAARG_64:
case X86::VAARG_X32:
return EmitVAARGWithCustomInserter(MI, BB);
@@ -34377,6 +34813,14 @@
Known = Known.trunc(BitWidth);
break;
}
+ case X86ISD::VBROADCAST: {
+ SDValue Src = Op.getOperand(0);
+ if (!Src.getSimpleValueType().isVector()) {
+ Known = DAG.computeKnownBits(Src, Depth + 1);
+ return;
+ }
+ break;
+ }
case X86ISD::ANDNP: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -34404,6 +34848,16 @@
Known.Zero.setBitsFrom(16);
break;
}
+ case X86ISD::PMULUDQ: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ Known = Known.trunc(BitWidth / 2).zext(BitWidth);
+ Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
+ Known = KnownBits::mul(Known, Known2);
+ break;
+ }
case X86ISD::CMOV: {
Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
// If we don't know any bits, early out.
@@ -34513,11 +34967,9 @@
// Handle target shuffles.
// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
if (isTargetShuffle(Opc)) {
- bool IsUnary;
SmallVector<int, 64> Mask;
SmallVector<SDValue, 2> Ops;
- if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
- IsUnary)) {
+ if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
unsigned NumOps = Ops.size();
unsigned NumElts = VT.getVectorNumElements();
if (Mask.size() == NumElts) {
@@ -34532,7 +34984,8 @@
// of the shuffle result.
Known.resetAll();
break;
- } else if (M == SM_SentinelZero) {
+ }
+ if (M == SM_SentinelZero) {
Known.One.clearAllBits();
continue;
}
@@ -34602,6 +35055,13 @@
return 1;
}
+ case X86ISD::VBROADCAST: {
+ SDValue Src = Op.getOperand(0);
+ if (!Src.getSimpleValueType().isVector())
+ return DAG.ComputeNumSignBits(Src, Depth + 1);
+ break;
+ }
+
case X86ISD::VSHLI: {
SDValue Src = Op.getOperand(0);
const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
@@ -34623,6 +35083,13 @@
return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
}
+ case X86ISD::FSETCC:
+ // cmpss/cmpsd return zero/all-bits result values in the bottom element.
+ if (VT == MVT::f32 || VT == MVT::f64 ||
+ ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
+ return VTBits;
+ break;
+
case X86ISD::PCMPGT:
case X86ISD::PCMPEQ:
case X86ISD::CMPP:
@@ -34651,11 +35118,9 @@
// Handle target shuffles.
// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
if (isTargetShuffle(Opcode)) {
- bool IsUnary;
SmallVector<int, 64> Mask;
SmallVector<SDValue, 2> Ops;
- if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
- IsUnary)) {
+ if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
unsigned NumOps = Ops.size();
unsigned NumElts = VT.getVectorNumElements();
if (Mask.size() == NumElts) {
@@ -35271,7 +35736,8 @@
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
- bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+ bool HasVariableMask, bool AllowVariableCrossLaneMask,
+ bool AllowVariablePerLaneMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget);
/// Combine an arbitrary chain of shuffles into a single instruction if
@@ -35286,7 +35752,9 @@
static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask,
- bool AllowVariableMask, SelectionDAG &DAG,
+ bool AllowVariableCrossLaneMask,
+ bool AllowVariablePerLaneMask,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
@@ -35349,6 +35817,20 @@
return CanonicalizeShuffleInput(RootVT, V1);
}
+ // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
+ // etc. can be simplified.
+ if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
+ SmallVector<int> ScaledMask, IdentityMask;
+ unsigned NumElts = VT1.getVectorNumElements();
+ if (BaseMask.size() <= NumElts &&
+ scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
+ for (unsigned i = 0; i != NumElts; ++i)
+ IdentityMask.push_back(i);
+ if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
+ return CanonicalizeShuffleInput(RootVT, V1);
+ }
+ }
+
// Handle 128/256-bit lane shuffles of 512-bit vectors.
if (RootVT.is512BitVector() &&
(NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
@@ -35436,6 +35918,17 @@
DL, 256);
}
+ // If we're splatting the low subvector, an insert-subvector 'concat'
+ // pattern is quicker than VPERM2X128.
+ // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
+ if (BaseMask[0] == 0 && BaseMask[1] == 0 && !Subtarget.hasAVX2()) {
+ if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+ return SDValue(); // Nothing to do!
+ Res = CanonicalizeShuffleInput(RootVT, V1);
+ Res = extractSubVector(Res, 0, DAG, DL, 128);
+ return concatSubVectors(Res, Res, DAG, DL);
+ }
+
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
@@ -35718,17 +36211,24 @@
return SDValue();
// Depth threshold above which we can efficiently use variable mask shuffles.
- int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
- AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
+ int VariableCrossLaneShuffleDepth =
+ Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
+ int VariablePerLaneShuffleDepth =
+ Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
+ AllowVariableCrossLaneMask &=
+ (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
+ AllowVariablePerLaneMask &=
+ (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
// VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
// higher depth before combining them.
- bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
+ bool AllowBWIVPERMV3 =
+ (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
bool MaskContainsZeros = isAnyZero(Mask);
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
- if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) {
+ if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
if (Subtarget.hasAVX2() &&
(MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
@@ -35753,7 +36253,7 @@
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
// vector as the second source (non-VLX will pad to 512-bit shuffles).
- if (UnaryShuffle && AllowVariableMask &&
+ if (UnaryShuffle && AllowVariableCrossLaneMask &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
@@ -35776,13 +36276,14 @@
// If that failed and either input is extracted then try to combine as a
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
- Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
- DAG, Subtarget))
+ Inputs, Root, BaseMask, Depth, HasVariableMask,
+ AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
+ Subtarget))
return WideShuffle;
// If we have a dual input lane-crossing shuffle then lower to VPERMV3,
// (non-VLX will pad to 512-bit shuffles).
- if (AllowVariableMask && !MaskContainsZeros &&
+ if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
@@ -35802,7 +36303,7 @@
// See if we can combine a single input shuffle with zeros to a bit-mask,
// which is much simpler than any shuffle.
- if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
+ if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
@@ -35830,7 +36331,7 @@
// If we have a single input shuffle with different shuffle patterns in the
// the 128-bit lanes use the variable mask to VPERMILPS.
// TODO Combine other mask types at higher depths.
- if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
+ if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
SmallVector<SDValue, 16> VPermIdx;
@@ -35847,7 +36348,7 @@
// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
// to VPERMIL2PD/VPERMIL2PS.
- if (AllowVariableMask && Subtarget.hasXOP() &&
+ if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
MaskVT == MVT::v8f32)) {
// VPERMIL2 Operation.
@@ -35885,7 +36386,7 @@
// Intel's manuals suggest only using PSHUFB if doing so replacing 5
// instructions, but in practice PSHUFB tends to be *very* fast so we're
// more aggressive.
- if (UnaryShuffle && AllowVariableMask &&
+ if (UnaryShuffle && AllowVariablePerLaneMask &&
((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
@@ -35916,7 +36417,8 @@
// With XOP, if we have a 128-bit binary input shuffle we can always combine
// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
// slower than PSHUFB on targets that support both.
- if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
+ if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
+ Subtarget.hasXOP()) {
// VPPERM Mask Operation
// Bits[4:0] - Byte Index (0 - 31)
// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
@@ -35947,13 +36449,13 @@
// If that failed and either input is extracted then try to combine as a
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
- Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
- DAG, Subtarget))
+ Inputs, Root, BaseMask, Depth, HasVariableMask,
+ AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
return WideShuffle;
// If we have a dual input shuffle then lower to VPERMV3,
// (non-VLX will pad to 512-bit shuffles)
- if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
+ if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
@@ -35961,9 +36463,11 @@
MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
MaskVT == MVT::v16i32)) ||
(Subtarget.hasBWI() && AllowBWIVPERMV3 &&
- (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
+ MaskVT == MVT::v32i16)) ||
(Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
- (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
+ (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
+ MaskVT == MVT::v64i8)))) {
V1 = CanonicalizeShuffleInput(MaskVT, V1);
V2 = CanonicalizeShuffleInput(MaskVT, V2);
Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
@@ -35984,7 +36488,8 @@
// extract_subvector(shuffle(x,y,m2),0)
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
- bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+ bool HasVariableMask, bool AllowVariableCrossLaneMask,
+ bool AllowVariablePerLaneMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NumMaskElts = BaseMask.size();
unsigned NumInputs = Inputs.size();
@@ -36072,9 +36577,10 @@
// Attempt to combine wider chain.
// TODO: Can we use a better Root?
SDValue WideRoot = WideInputs[0];
- if (SDValue WideShuffle = combineX86ShuffleChain(
- WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
- AllowVariableMask, DAG, Subtarget)) {
+ if (SDValue WideShuffle =
+ combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
+ HasVariableMask, AllowVariableCrossLaneMask,
+ AllowVariablePerLaneMask, DAG, Subtarget)) {
WideShuffle =
extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
return DAG.getBitcast(RootVT, WideShuffle);
@@ -36110,39 +36616,86 @@
if (!isHoriz && !isPack)
return SDValue();
+ // Do all ops have a single use?
+ bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
+ return Op.hasOneUse() &&
+ peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
+ });
+
int NumElts = VT0.getVectorNumElements();
int NumLanes = VT0.getSizeInBits() / 128;
int NumEltsPerLane = NumElts / NumLanes;
int NumHalfEltsPerLane = NumEltsPerLane / 2;
+ MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
+ unsigned EltSizeInBits = RootSizeInBits / Mask.size();
- // See if we can remove the shuffle by resorting the HOP chain so that
- // the HOP args are pre-shuffled.
- // TODO: Generalize to any sized/depth chain.
- // TODO: Add support for PACKSS/PACKUS.
- if (isHoriz && NumEltsPerLane == 4 && VT0.is128BitVector() &&
- shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget)) {
- SmallVector<int> ScaledMask;
- if (scaleShuffleElements(Mask, 4, ScaledMask)) {
- // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
- auto GetHOpSrc = [&](int M) {
- if (M == SM_SentinelUndef)
- return DAG.getUNDEF(VT0);
- if (M == SM_SentinelZero)
- return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
- SDValue Src0 = BC[M / NumElts];
- SDValue Src1 = Src0.getOperand((M % 4) >= 2);
- if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
- return Src1.getOperand(M % 2);
- return SDValue();
- };
- SDValue M0 = GetHOpSrc(ScaledMask[0]);
- SDValue M1 = GetHOpSrc(ScaledMask[1]);
- SDValue M2 = GetHOpSrc(ScaledMask[2]);
- SDValue M3 = GetHOpSrc(ScaledMask[3]);
- if (M0 && M1 && M2 && M3) {
- SDValue LHS = DAG.getNode(Opcode0, DL, VT0, M0, M1);
- SDValue RHS = DAG.getNode(Opcode0, DL, VT0, M2, M3);
- return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
+ if (NumEltsPerLane >= 4 &&
+ (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
+ SmallVector<int> LaneMask, ScaledMask;
+ if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
+ scaleShuffleElements(LaneMask, 4, ScaledMask)) {
+ // See if we can remove the shuffle by resorting the HOP chain so that
+ // the HOP args are pre-shuffled.
+ // TODO: Generalize to any sized/depth chain.
+ // TODO: Add support for PACKSS/PACKUS.
+ if (isHoriz) {
+ // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
+ auto GetHOpSrc = [&](int M) {
+ if (M == SM_SentinelUndef)
+ return DAG.getUNDEF(VT0);
+ if (M == SM_SentinelZero)
+ return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
+ SDValue Src0 = BC[M / 4];
+ SDValue Src1 = Src0.getOperand((M % 4) >= 2);
+ if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
+ return Src1.getOperand(M % 2);
+ return SDValue();
+ };
+ SDValue M0 = GetHOpSrc(ScaledMask[0]);
+ SDValue M1 = GetHOpSrc(ScaledMask[1]);
+ SDValue M2 = GetHOpSrc(ScaledMask[2]);
+ SDValue M3 = GetHOpSrc(ScaledMask[3]);
+ if (M0 && M1 && M2 && M3) {
+ SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
+ SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
+ return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
+ }
+ }
+ // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
+ if (Ops.size() >= 2) {
+ SDValue LHS, RHS;
+ auto GetHOpSrc = [&](int M, int &OutM) {
+ // TODO: Support SM_SentinelZero
+ if (M < 0)
+ return M == SM_SentinelUndef;
+ SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
+ if (!LHS || LHS == Src) {
+ LHS = Src;
+ OutM = (M % 2);
+ return true;
+ }
+ if (!RHS || RHS == Src) {
+ RHS = Src;
+ OutM = (M % 2) + 2;
+ return true;
+ }
+ return false;
+ };
+ int PostMask[4] = {-1, -1, -1, -1};
+ if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
+ GetHOpSrc(ScaledMask[1], PostMask[1]) &&
+ GetHOpSrc(ScaledMask[2], PostMask[2]) &&
+ GetHOpSrc(ScaledMask[3], PostMask[3])) {
+ LHS = DAG.getBitcast(SrcVT, LHS);
+ RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
+ SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
+ // Use SHUFPS for the permute so this will work on SSE3 targets,
+ // shuffle combining and domain handling will simplify this later on.
+ MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
+ Res = DAG.getBitcast(ShuffleVT, Res);
+ return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
+ getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
+ }
}
}
}
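
(Aside, not part of the patch: the hunk above re-sorts horizontal-op operands so the surrounding shuffle becomes a cheap in-lane permute. A minimal standalone sketch of the identity it relies on, using hypothetical hadd/shuffle scalar models of HADDPS rather than any LLVM API:)

#include <array>
#include <cassert>

using V4 = std::array<float, 4>;

// Scalar model of a 128-bit horizontal add: pairwise sums of A, then of B.
static V4 hadd(const V4 &A, const V4 &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

// Two-input shuffle: indices 0-3 pick from A, 4-7 pick from B.
static V4 shuffle(const V4 &A, const V4 &B, std::array<int, 4> M) {
  V4 R;
  for (int I = 0; I != 4; ++I)
    R[I] = M[I] < 4 ? A[M[I]] : B[M[I] - 4];
  return R;
}

int main() {
  V4 X{1, 2, 3, 4}, Y{5, 6, 7, 8}, Z{9, 10, 11, 12}, W{13, 14, 15, 16};
  // A shuffle of hadd(X,Y) and hadd(Z,W) that interleaves the X and Z halves
  // equals an in-lane permute of hadd(X,Z) - so the shuffle can be folded
  // into the source operands of the horizontal op.
  V4 LHS = shuffle(hadd(X, Y), hadd(Z, W), {0, 4, 1, 5});
  V4 H = hadd(X, Z);
  V4 RHS = shuffle(H, H, {0, 2, 1, 3});
  assert(LHS == RHS);
  return 0;
}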
@@ -36198,19 +36751,18 @@
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
// single instruction. Attempt to match a v2X64 repeating shuffle pattern that
// represents the LHS/RHS inputs for the lower/upper halves.
- unsigned EltSizeInBits = RootSizeInBits / Mask.size();
SmallVector<int, 16> TargetMask128, WideMask128;
if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
scaleShuffleElements(TargetMask128, 2, WideMask128)) {
assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
bool SingleOp = (Ops.size() == 1);
- if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
+ if (isPack || OneUseOps ||
+ shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
Lo = Lo.getOperand(WideMask128[0] & 1);
Hi = Hi.getOperand(WideMask128[1] & 1);
if (SingleOp) {
- MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
SDValue Undef = DAG.getUNDEF(SrcVT);
SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
@@ -36356,8 +36908,9 @@
static SDValue combineX86ShufflesRecursively(
ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
- unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask,
- SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
+ bool AllowVariablePerLaneMask, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
assert(RootMask.size() > 0 &&
(RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
"Illegal shuffle root mask");
@@ -36555,12 +37108,15 @@
// Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(Ops, Mask);
- // Handle the all undef/zero cases early.
+ // Handle the all undef/zero/ones cases early.
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
return DAG.getUNDEF(Root.getValueType());
if (all_of(Mask, [](int Idx) { return Idx < 0; }))
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
SDLoc(Root));
+ if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
+ none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+ return getOnesVector(Root.getValueType(), DAG, SDLoc(Root));
assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= IsOpVariableMask;
@@ -36584,13 +37140,17 @@
SmallVector<int, 64> ResolvedMask = Mask;
if (EmptyRoot)
resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
- bool AllowVar = false;
+ bool AllowCrossLaneVar = false;
+ bool AllowPerLaneVar = false;
if (Ops[i].getNode()->hasOneUse() ||
- SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
- AllowVar = AllowVariableMask;
+ SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
+ AllowCrossLaneVar = AllowVariableCrossLaneMask;
+ AllowPerLaneVar = AllowVariablePerLaneMask;
+ }
if (SDValue Res = combineX86ShufflesRecursively(
Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
- HasVariableMask, AllowVar, DAG, Subtarget))
+ HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
+ Subtarget))
return Res;
}
}
@@ -36600,6 +37160,18 @@
Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
return Cst;
+ // If constant folding failed and we only have constants, then we have
+ // multiple uses by a single non-variable shuffle; just bail.
+ if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
+ APInt UndefElts;
+ SmallVector<APInt> RawBits;
+ unsigned EltSizeInBits = RootSizeInBits / Mask.size();
+ return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
+ RawBits);
+ })) {
+ return SDValue();
+ }
+
// Canonicalize the combined shuffle mask chain with horizontal ops.
// NOTE: This will update the Ops and Mask.
if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
@@ -36641,23 +37213,25 @@
// Finally, try to combine into a single shuffle instruction.
return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
- AllowVariableMask, DAG, Subtarget);
+ AllowVariableCrossLaneMask,
+ AllowVariablePerLaneMask, DAG, Subtarget);
}
// If that failed and any input is extracted then try to combine as a
// shuffle with the larger type.
- return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
- HasVariableMask, AllowVariableMask,
- DAG, Subtarget);
+ return combineX86ShuffleChainWithExtract(
+ Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
+ AllowVariablePerLaneMask, DAG, Subtarget);
}
/// Helper entry wrapper to combineX86ShufflesRecursively.
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
- X86::MaxShuffleCombineDepth,
- /*HasVarMask*/ false,
- /*AllowVarMask*/ true, DAG, Subtarget);
+ return combineX86ShufflesRecursively(
+ {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
+ /*HasVarMask*/ false,
+ /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
+ Subtarget);
}
/// Get the PSHUF-style mask from PSHUF node.
@@ -36668,9 +37242,8 @@
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
SmallVector<SDValue, 2> Ops;
- bool IsUnary;
bool HaveMask =
- getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
+ getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
(void)HaveMask;
assert(HaveMask);
@@ -36889,6 +37462,134 @@
return SDValue();
}
+// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
+static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
+ const SDLoc &DL) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT ShuffleVT = N.getValueType();
+
+ auto IsMergeableWithShuffle = [](SDValue Op) {
+ // AllZeros/AllOnes constants are freely shuffled and will peek through
+ // bitcasts. Other constant build vectors do not peek through bitcasts. Only
+ // merge with target shuffles if it has one use so shuffle combining is
+ // likely to kick in.
+ return ISD::isBuildVectorAllOnes(Op.getNode()) ||
+ ISD::isBuildVectorAllZeros(Op.getNode()) ||
+ ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
+ ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
+ (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
+ };
+ auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
+ // Ensure we only shuffle whole vector src elements, unless it's a logical
+ // binop where we can more aggressively move shuffles from dst to src.
+ return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
+ (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
+ };
+
+ unsigned Opc = N.getOpcode();
+ switch (Opc) {
+ // Unary and Unary+Permute Shuffles.
+ case X86ISD::PSHUFB: {
+ // Don't merge PSHUFB if it contains zero'd elements.
+ SmallVector<int> Mask;
+ SmallVector<SDValue> Ops;
+ if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
+ Mask))
+ break;
+ LLVM_FALLTHROUGH;
+ }
+ case X86ISD::VBROADCAST:
+ case X86ISD::MOVDDUP:
+ case X86ISD::PSHUFD:
+ case X86ISD::VPERMI:
+ case X86ISD::VPERMILPI: {
+ if (N.getOperand(0).getValueType() == ShuffleVT &&
+ N->isOnlyUserOf(N.getOperand(0).getNode())) {
+ SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
+ unsigned SrcOpcode = N0.getOpcode();
+ if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
+ SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
+ SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
+ if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
+ SDValue LHS, RHS;
+ Op00 = DAG.getBitcast(ShuffleVT, Op00);
+ Op01 = DAG.getBitcast(ShuffleVT, Op01);
+ if (N.getNumOperands() == 2) {
+ LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
+ RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
+ } else {
+ LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
+ RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
+ }
+ EVT OpVT = N0.getValueType();
+ return DAG.getBitcast(ShuffleVT,
+ DAG.getNode(SrcOpcode, DL, OpVT,
+ DAG.getBitcast(OpVT, LHS),
+ DAG.getBitcast(OpVT, RHS)));
+ }
+ }
+ }
+ break;
+ }
+ // Binary and Binary+Permute Shuffles.
+ case X86ISD::INSERTPS: {
+ // Don't merge INSERTPS if it contains zero'd elements.
+ unsigned InsertPSMask = N.getConstantOperandVal(2);
+ unsigned ZeroMask = InsertPSMask & 0xF;
+ if (ZeroMask != 0)
+ break;
+ LLVM_FALLTHROUGH;
+ }
+ case X86ISD::MOVSD:
+ case X86ISD::MOVSS:
+ case X86ISD::BLENDI:
+ case X86ISD::SHUFP:
+ case X86ISD::UNPCKH:
+ case X86ISD::UNPCKL: {
+ if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
+ N->isOnlyUserOf(N.getOperand(1).getNode())) {
+ SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
+ SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
+ unsigned SrcOpcode = N0.getOpcode();
+ if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
+ IsSafeToMoveShuffle(N0, SrcOpcode) &&
+ IsSafeToMoveShuffle(N1, SrcOpcode)) {
+ SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
+ SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
+ SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
+ SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
+ // Ensure the total number of shuffles doesn't increase by folding this
+ // shuffle through to the source ops.
+ if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
+ (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
+ ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
+ (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
+ SDValue LHS, RHS;
+ Op00 = DAG.getBitcast(ShuffleVT, Op00);
+ Op10 = DAG.getBitcast(ShuffleVT, Op10);
+ Op01 = DAG.getBitcast(ShuffleVT, Op01);
+ Op11 = DAG.getBitcast(ShuffleVT, Op11);
+ if (N.getNumOperands() == 3) {
+ LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
+ RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
+ } else {
+ LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
+ RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
+ }
+ EVT OpVT = N0.getValueType();
+ return DAG.getBitcast(ShuffleVT,
+ DAG.getNode(SrcOpcode, DL, OpVT,
+ DAG.getBitcast(OpVT, LHS),
+ DAG.getBitcast(OpVT, RHS)));
+ }
+ }
+ }
+ break;
+ }
+ }
+ return SDValue();
+}
+
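
(Aside, not part of the patch: canonicalizeShuffleWithBinOps above moves a shuffle from the result of a lane-wise binop onto its operands. A minimal sketch of why that is value-preserving; the shuffle/vxor helpers below are hypothetical scalar models, not LLVM code:)

#include <array>
#include <cassert>

using V4 = std::array<int, 4>;

// Unary shuffle: permute the elements of A by mask M.
static V4 shuffle(const V4 &A, std::array<int, 4> M) {
  return {A[M[0]], A[M[1]], A[M[2]], A[M[3]]};
}

// Any lane-wise binary op works; XOR is used as the example.
static V4 vxor(const V4 &A, const V4 &B) {
  return {A[0] ^ B[0], A[1] ^ B[1], A[2] ^ B[2], A[3] ^ B[3]};
}

int main() {
  V4 X{1, 2, 3, 4}, Y{8, 16, 32, 64};
  std::array<int, 4> M{3, 1, 2, 0};
  // SHUFFLE(BINOP(X,Y)) == BINOP(SHUFFLE(X),SHUFFLE(Y)) for lane-wise binops,
  // so the shuffle can be pushed towards operands that fold away
  // (constants or other one-use target shuffles).
  assert(shuffle(vxor(X, Y), M) == vxor(shuffle(X, M), shuffle(Y, M)));
  return 0;
}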
/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
SelectionDAG &DAG,
@@ -36908,12 +37609,11 @@
switch (SrcOpc0) {
case X86ISD::MOVDDUP: {
- SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
- SDValue RHS =
- DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0));
+ SDValue LHS = Src0.getOperand(0);
+ SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
SDValue Res =
- DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2));
- Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res));
+ DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
+ Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
return DAG.getBitcast(VT, Res);
}
case X86ISD::VPERMILPI:
@@ -36929,13 +37629,11 @@
case X86ISD::VSRAI:
case X86ISD::PSHUFD:
if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
- SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
- SDValue RHS =
- DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0));
- SDValue Res =
- DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2));
- Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res),
- Src0.getOperand(1));
+ SDValue LHS = Src0.getOperand(0);
+ SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
+ SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
+ V.getOperand(2));
+ Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
return DAG.getBitcast(VT, Res);
}
break;
@@ -36956,29 +37654,8 @@
if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
return R;
- // Canonicalize UNARYSHUFFLE(XOR(X,-1) -> XOR(UNARYSHUFFLE(X),-1) to
- // help expose the 'NOT' pattern further up the DAG.
- // TODO: This might be beneficial for any binop with a 'splattable' operand.
- switch (Opcode) {
- case X86ISD::MOVDDUP:
- case X86ISD::PSHUFD: {
- SDValue Src = N.getOperand(0);
- if (Src.hasOneUse() && Src.getValueType() == VT) {
- if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) {
- Not = DAG.getBitcast(VT, Not);
- Not = Opcode == X86ISD::MOVDDUP
- ? DAG.getNode(Opcode, DL, VT, Not)
- : DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));
- EVT IntVT = Not.getValueType().changeTypeToInteger();
- SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
- Not = DAG.getBitcast(IntVT, Not);
- Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
- return DAG.getBitcast(VT, Not);
- }
- }
- break;
- }
- }
+ if (SDValue R = canonicalizeShuffleWithBinOps(N, DAG, DL))
+ return R;
// Handle specific target shuffles.
switch (Opcode) {
@@ -37017,7 +37694,8 @@
if (SDValue Res = combineX86ShufflesRecursively(
{BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
X86::MaxShuffleCombineDepth,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
+ /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
}
@@ -37026,7 +37704,9 @@
// 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
if (Src.getOpcode() == ISD::BITCAST &&
SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
- DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
+ DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
+ FixedVectorType::isValidElementType(
+ BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
VT.getVectorNumElements());
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
@@ -37343,7 +38023,7 @@
return Res;
// Fold vperm2x128 subvector shuffle with an inner concat pattern.
- // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
+ // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
auto FindSubVector128 = [&](unsigned Idx) {
if (Idx > 3)
return SDValue();
@@ -37809,120 +38489,6 @@
return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
}
-/// Eliminate a redundant shuffle of a horizontal math op.
-static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
- // TODO: Can we use getTargetShuffleInputs instead?
- unsigned Opcode = N->getOpcode();
- if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
- if (Opcode != X86ISD::UNPCKL && Opcode != X86ISD::UNPCKH)
- if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
- return SDValue();
-
- // For a broadcast, peek through an extract element of index 0 to find the
- // horizontal op: broadcast (ext_vec_elt HOp, 0)
- EVT VT = N->getValueType(0);
- if (Opcode == X86ISD::VBROADCAST) {
- SDValue SrcOp = N->getOperand(0);
- if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- SrcOp.getValueType() == MVT::f64 &&
- SrcOp.getOperand(0).getValueType() == VT &&
- isNullConstant(SrcOp.getOperand(1)))
- N = SrcOp.getNode();
- }
-
- SDValue HOp = N->getOperand(0);
- if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
- HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
- return SDValue();
-
- // unpcklo(hop(x,y),hop(z,w)) -> permute(hop(x,z)).
- // unpckhi(hop(x,y),hop(z,w)) -> permute(hop(y,w)).
- // Don't fold if hop(x,y) == hop(z,w).
- if (Opcode == X86ISD::UNPCKL || Opcode == X86ISD::UNPCKH) {
- SDValue HOp2 = N->getOperand(1);
- if (HOp.getOpcode() != HOp2.getOpcode() || VT.getScalarSizeInBits() != 32)
- return SDValue();
- if (HOp == HOp2)
- return SDValue();
- SDLoc DL(HOp);
- unsigned LoHi = Opcode == X86ISD::UNPCKL ? 0 : 1;
- SDValue Res = DAG.getNode(HOp.getOpcode(), DL, VT, HOp.getOperand(LoHi),
- HOp2.getOperand(LoHi));
- // Use SHUFPS for the permute so this will work on SSE3 targets, shuffle
- // combining and domain handling will simplify this later on.
- EVT ShuffleVT = VT.changeVectorElementType(MVT::f32);
- Res = DAG.getBitcast(ShuffleVT, Res);
- Res = DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
- getV4X86ShuffleImm8ForMask({0, 2, 1, 3}, DL, DAG));
- return DAG.getBitcast(VT, Res);
- }
-
- // 128-bit horizontal math instructions are defined to operate on adjacent
- // lanes of each operand as:
- // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
- // ...similarly for v2f64 and v8i16.
- if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
- HOp.getOperand(0) != HOp.getOperand(1))
- return SDValue();
-
- // The shuffle that we are eliminating may have allowed the horizontal op to
- // have an undemanded (undefined) operand. Duplicate the other (defined)
- // operand to ensure that the results are defined across all lanes without the
- // shuffle.
- auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {
- SDValue X;
- if (HorizOp.getOperand(0).isUndef()) {
- assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op");
- X = HorizOp.getOperand(1);
- } else if (HorizOp.getOperand(1).isUndef()) {
- assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op");
- X = HorizOp.getOperand(0);
- } else {
- return HorizOp;
- }
- return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),
- HorizOp.getValueType(), X, X);
- };
-
- // When the operands of a horizontal math op are identical, the low half of
- // the result is the same as the high half. If a target shuffle is also
- // replicating low and high halves (and without changing the type/length of
- // the vector), we don't need the shuffle.
- if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
- if (Opcode == X86ISD::VBROADCAST && !VT.is128BitVector())
- return SDValue();
- if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
- // movddup (hadd X, X) --> hadd X, X
- // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
- assert((HOp.getValueType() == MVT::v2f64 ||
- HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op");
- return updateHOp(HOp, DAG);
- }
- return SDValue();
- }
-
- // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
-
- // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
- // but this should be tied to whatever horizontal op matching and shuffle
- // canonicalization are producing.
- if (HOp.getValueSizeInBits() == 128 &&
- (isShuffleEquivalent(Mask, {0, 0}) ||
- isShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
- isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
- return updateHOp(HOp, DAG);
-
- if (HOp.getValueSizeInBits() == 256 &&
- (isShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
- isShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
- isShuffleEquivalent(
- Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
- return updateHOp(HOp, DAG);
-
- return SDValue();
-}
-
/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
/// low half of each source vector and does not set any high half elements in
/// the destination vector, narrow the shuffle to half its original size.
@@ -37968,45 +38534,13 @@
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.isTypeLegal(VT)) {
+ if (TLI.isTypeLegal(VT))
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
- if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
- return HAddSub;
-
- // Merge shuffles through binops if its likely we'll be able to merge it
- // with other shuffles (as long as they aren't splats).
- // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d)))
- // TODO: We might be able to move this to DAGCombiner::visitVECTOR_SHUFFLE.
- if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N)) {
- unsigned SrcOpcode = N->getOperand(0).getOpcode();
- if (SrcOpcode == N->getOperand(1).getOpcode() && TLI.isBinOp(SrcOpcode) &&
- N->isOnlyUserOf(N->getOperand(0).getNode()) &&
- N->isOnlyUserOf(N->getOperand(1).getNode())) {
- SDValue Op00 = N->getOperand(0).getOperand(0);
- SDValue Op10 = N->getOperand(1).getOperand(0);
- SDValue Op01 = N->getOperand(0).getOperand(1);
- SDValue Op11 = N->getOperand(1).getOperand(1);
- auto *SVN00 = dyn_cast<ShuffleVectorSDNode>(Op00);
- auto *SVN10 = dyn_cast<ShuffleVectorSDNode>(Op10);
- auto *SVN01 = dyn_cast<ShuffleVectorSDNode>(Op01);
- auto *SVN11 = dyn_cast<ShuffleVectorSDNode>(Op11);
- if (((SVN00 && !SVN00->isSplat()) || (SVN10 && !SVN10->isSplat())) &&
- ((SVN01 && !SVN01->isSplat()) || (SVN11 && !SVN11->isSplat()))) {
- SDLoc DL(N);
- ArrayRef<int> Mask = SVN->getMask();
- SDValue LHS = DAG.getVectorShuffle(VT, DL, Op00, Op10, Mask);
- SDValue RHS = DAG.getVectorShuffle(VT, DL, Op01, Op11, Mask);
- return DAG.getNode(SrcOpcode, DL, VT, LHS, RHS);
- }
- }
- }
- }
-
// Attempt to combine into a vector load/broadcast.
- if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG,
- Subtarget, true))
+ if (SDValue LD = combineToConsecutiveLoads(
+ VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
return LD;
// For AVX2, we sometimes want to combine
@@ -38276,14 +38810,17 @@
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
- APInt SrcUndef, SrcZero;
- if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,
+ APInt LHSUndef, LHSZero;
+ if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
Depth + 1))
return true;
- if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,
+ APInt RHSUndef, RHSZero;
+ if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
Depth + 1))
return true;
+ // TODO - pass on known zero/undef.
+
// Aggressively peek through ops to get at the demanded elts.
// TODO - we should do this for all target/faux shuffles ops.
if (!DemandedElts.isAllOnesValue()) {
@@ -38304,17 +38841,37 @@
case X86ISD::HSUB:
case X86ISD::FHADD:
case X86ISD::FHSUB: {
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+
APInt DemandedLHS, DemandedRHS;
getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt LHSUndef, LHSZero;
- if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
- LHSZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
+ Depth + 1))
return true;
APInt RHSUndef, RHSZero;
- if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
- RHSZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
+ Depth + 1))
return true;
+
+ // TODO - pass on known zero/undef.
+
+ // Aggressively peek through ops to get at the demanded elts.
+ // TODO: Handle repeated operands.
+ if (N0 != N1 && !DemandedElts.isAllOnesValue()) {
+ SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
+ TLO.DAG, Depth + 1);
+ SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
+ TLO.DAG, Depth + 1);
+ if (NewN0 || NewN1) {
+ NewN0 = NewN0 ? NewN0 : N0;
+ NewN1 = NewN1 ? NewN1 : N1;
+ return TLO.CombineTo(Op,
+ TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
+ }
+ }
break;
}
case X86ISD::VTRUNC:
@@ -38511,6 +39068,22 @@
}
break;
}
+ case X86ISD::VPERM2X128: {
+ // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
+ SDLoc DL(Op);
+ unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
+ if (LoMask & 0x8)
+ return TLO.CombineTo(
+ Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
+ unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
+ unsigned SrcIdx = (LoMask & 0x2) >> 1;
+ SDValue ExtOp =
+ extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
// Zero upper elements.
case X86ISD::VZEXT_MOVL:
// Target unary shuffles by immediate:
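
(Aside, not part of the patch: the VPERM2X128 case added above only keeps the low 128 bits, so the immediate's low nibble fully determines the result - bit 3 zeroes the lane, bit 1 picks the source operand, bit 0 picks its half. A scalar sketch of that decoding, assuming the usual VPERM2F128 immediate semantics; lowLane is a hypothetical helper:)

#include <array>
#include <cassert>

using V4 = std::array<double, 4>; // one 256-bit vector as two 128-bit halves
using V2 = std::array<double, 2>; // one 128-bit lane

// Modeled decoding of the low 128-bit lane of VPERM2F128(A, B, Imm).
static V2 lowLane(const V4 &A, const V4 &B, unsigned Imm) {
  if (Imm & 0x8)
    return {0.0, 0.0};                 // zero mask bit wins
  const V4 &Src = (Imm & 0x2) ? B : A; // which source operand
  unsigned Half = (Imm & 0x1) * 2;     // which 128-bit half of it
  return {Src[Half], Src[Half + 1]};
}

int main() {
  V4 A{1, 2, 3, 4}, B{5, 6, 7, 8};
  assert((lowLane(A, B, 0x1) == V2{3, 4})); // high half of the first source
  assert((lowLane(A, B, 0x2) == V2{5, 6})); // low half of the second source
  assert((lowLane(A, B, 0x8) == V2{0, 0})); // zeroed lane
  return 0;
}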
@@ -38635,7 +39208,8 @@
SDValue NewShuffle = combineX86ShufflesRecursively(
{Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
/*HasVarMask*/ false,
- /*AllowVarMask*/ true, TLO.DAG, Subtarget);
+ /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
+ Subtarget);
if (NewShuffle)
return TLO.CombineTo(Op, NewShuffle);
}
@@ -38906,6 +39480,29 @@
}
// TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
break;
+ case X86ISD::VBROADCAST: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ APInt DemandedElts = APInt::getOneBitSet(
+ SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
+ if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
+ // If we don't need the upper bits, attempt to narrow the broadcast source.
+ // Don't attempt this on AVX512 as it might affect broadcast folding.
+ // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
+ if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
+ OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2)) {
+ MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
+ SDValue NewSrc =
+ TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
+ MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
+ SDValue NewBcst =
+ TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
+ return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
+ }
+ break;
+ }
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// iff we only need the sign bit then we can use R directly.
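
(Aside, not part of the patch: the VBROADCAST case above narrows a 64-bit integer broadcast to a 32-bit one when only the low half of each element is demanded. A scalar model of why the demanded bits are unchanged; little-endian element layout is assumed:)

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  // v2i64 broadcast of X vs. v4i32 broadcast of truncate(X): if a user only
  // demands the low 32 bits of each 64-bit element, those bits sit at the
  // same byte offsets in both layouts.
  uint64_t X = 0x0123456789abcdefULL;
  std::array<uint64_t, 2> Wide{X, X};
  std::array<uint32_t, 4> Narrow{static_cast<uint32_t>(X), static_cast<uint32_t>(X),
                                 static_cast<uint32_t>(X), static_cast<uint32_t>(X)};
  for (int I = 0; I != 2; ++I)
    assert(static_cast<uint32_t>(Wide[I]) == Narrow[2 * I]);
  return 0;
}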
@@ -39127,17 +39724,22 @@
Op, DemandedBits, DemandedElts, DAG, Depth);
}
-// Helper to peek through bitops/setcc to determine size of source vector.
+// Helper to peek through bitops/trunc/setcc to determine size of source vector.
// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
-static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
+static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
+ bool AllowTruncate) {
switch (Src.getOpcode()) {
+ case ISD::TRUNCATE:
+ if (!AllowTruncate)
+ return false;
+ LLVM_FALLTHROUGH;
case ISD::SETCC:
return Src.getOperand(0).getValueSizeInBits() == Size;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
- return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
- checkBitcastSrcVectorSize(Src.getOperand(1), Size);
+ return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
+ checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
}
return false;
}
@@ -39192,6 +39794,7 @@
SDValue Src, const SDLoc &DL) {
switch (Src.getOpcode()) {
case ISD::SETCC:
+ case ISD::TRUNCATE:
return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
case ISD::AND:
case ISD::XOR:
@@ -39275,7 +39878,8 @@
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
- if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
+ if (Subtarget.hasAVX() &&
+ checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
SExtVT = MVT::v4i64;
PropagateSExt = true;
}
@@ -39287,8 +39891,8 @@
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
- if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
- checkBitcastSrcVectorSize(Src, 512))) {
+ if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
+ checkBitcastSrcVectorSize(Src, 512, true))) {
SExtVT = MVT::v8i32;
PropagateSExt = true;
}
@@ -39313,7 +39917,7 @@
break;
}
// Split if this is a <64 x i8> comparison result.
- if (checkBitcastSrcVectorSize(Src, 512)) {
+ if (checkBitcastSrcVectorSize(Src, 512, false)) {
SExtVT = MVT::v64i8;
break;
}
@@ -39566,6 +40170,7 @@
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SrcVT = N0.getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
@@ -39623,8 +40228,7 @@
// If we're bitcasting from iX to vXi1, see if the integer originally
// began as a vXi1 and whether we can remove the bitcast entirely.
if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
- SrcVT.isScalarInteger() &&
- DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
if (SDValue V =
combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
return V;
@@ -39812,8 +40416,11 @@
default: return SDValue();
}
+ // Check if we have a bitcast from another integer type as well.
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
- (Subtarget.hasSSE2() && VT == MVT::f64)))
+ (Subtarget.hasSSE2() && VT == MVT::f64) ||
+ (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
+ TLI.isTypeLegal(VT))))
return SDValue();
SDValue LogicOp0 = N0.getOperand(0);
@@ -39822,17 +40429,21 @@
// bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
- LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
+ LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
+ LogicOp0.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
- return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
+ unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
+ return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
}
// bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
- LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
+ LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
+ LogicOp1.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
- return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
+ unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
+ return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
}
return SDValue();
@@ -40277,6 +40888,38 @@
Idx);
}
+ // We can only legally extract other elements from 128-bit vectors and in
+ // certain circumstances, depending on SSE-level.
+ // TODO: Investigate float/double extraction if it will be just stored.
+ auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
+ unsigned Idx) {
+ EVT VecSVT = VecVT.getScalarType();
+ if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
+ (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
+ VecSVT == MVT::i64)) {
+ unsigned EltSizeInBits = VecSVT.getSizeInBits();
+ unsigned NumEltsPerLane = 128 / EltSizeInBits;
+ unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
+ unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
+ VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
+ Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
+ Idx &= (NumEltsPerLane - 1);
+ }
+ if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
+ ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
+ DAG.getBitcast(VecVT, Vec),
+ DAG.getIntPtrConstant(Idx, dl));
+ }
+ if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
+ (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
+ unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
+ return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
+ DAG.getTargetConstant(Idx, dl, MVT::i8));
+ }
+ return SDValue();
+ };
+
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
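
(Aside, not part of the patch: the GetLegalExtract helper added above reduces an extract from a 256/512-bit vector to an extract from the 128-bit lane that contains the element. A minimal index-arithmetic sketch of that split, in plain C++:)

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  // Extract element Idx of a v8i32 (256-bit) by first taking its 128-bit lane
  // and then indexing with Idx modulo the elements per lane.
  std::array<uint32_t, 8> V{10, 11, 12, 13, 14, 15, 16, 17};
  constexpr unsigned NumEltsPerLane = 128 / 32;
  for (unsigned Idx = 0; Idx != 8; ++Idx) {
    unsigned Lane = Idx / NumEltsPerLane;         // which 128-bit subvector
    unsigned InLane = Idx & (NumEltsPerLane - 1); // index within that lane
    std::array<uint32_t, 4> Sub{V[Lane * 4 + 0], V[Lane * 4 + 1],
                                V[Lane * 4 + 2], V[Lane * 4 + 3]};
    assert(Sub[InLane] == V[Idx]);
  }
  return 0;
}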
@@ -40298,7 +40941,7 @@
Mask = std::move(ScaledMask);
} else if ((Mask.size() % NumSrcElts) == 0) {
// Simplify Mask based on demanded element.
- int ExtractIdx = (int)N->getConstantOperandVal(1);
+ int ExtractIdx = (int)IdxC.getZExtValue();
int Scale = Mask.size() / NumSrcElts;
int Lo = Scale * ExtractIdx;
int Hi = Scale * (ExtractIdx + 1);
@@ -40310,49 +40953,41 @@
while (Mask.size() > NumSrcElts &&
canWidenShuffleElements(Mask, WidenedMask))
Mask = std::move(WidenedMask);
- // TODO - investigate support for wider shuffle masks with known upper
- // undef/zero elements for implicit zero-extension.
}
}
- // Check if narrowing/widening failed.
- if (Mask.size() != NumSrcElts)
- return SDValue();
-
- int SrcIdx = Mask[IdxC.getZExtValue()];
+ // If narrowing/widening failed, see if we can extract+zero-extend.
+ int ExtractIdx;
+ EVT ExtractVT;
+ if (Mask.size() == NumSrcElts) {
+ ExtractIdx = Mask[IdxC.getZExtValue()];
+ ExtractVT = SrcVT;
+ } else {
+ unsigned Scale = Mask.size() / NumSrcElts;
+ if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
+ return SDValue();
+ unsigned ScaledIdx = Scale * IdxC.getZExtValue();
+ if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
+ return SDValue();
+ ExtractIdx = Mask[ScaledIdx];
+ EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
+ ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
+ assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
+ "Failed to widen vector type");
+ }
// If the shuffle source element is undef/zero then we can just accept it.
- if (SrcIdx == SM_SentinelUndef)
+ if (ExtractIdx == SM_SentinelUndef)
return DAG.getUNDEF(VT);
- if (SrcIdx == SM_SentinelZero)
+ if (ExtractIdx == SM_SentinelZero)
return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
: DAG.getConstant(0, dl, VT);
- SDValue SrcOp = Ops[SrcIdx / Mask.size()];
- SrcIdx = SrcIdx % Mask.size();
-
- // We can only extract other elements from 128-bit vectors and in certain
- // circumstances, depending on SSE-level.
- // TODO: Investigate using extract_subvector for larger vectors.
- // TODO: Investigate float/double extraction if it will be just stored.
- if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
- ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
- assert(SrcSVT == VT && "Unexpected extraction type");
- SrcOp = DAG.getBitcast(SrcVT, SrcOp);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
- DAG.getIntPtrConstant(SrcIdx, dl));
- }
-
- if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
- (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
- assert(VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type");
- unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
- SrcOp = DAG.getBitcast(SrcVT, SrcOp);
- SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
- DAG.getTargetConstant(SrcIdx, dl, MVT::i8));
- return DAG.getZExtOrTrunc(ExtOp, dl, VT);
- }
+ SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
+ ExtractIdx = ExtractIdx % Mask.size();
+ if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
+ return DAG.getZExtOrTrunc(V, dl, VT);
return SDValue();
}
@@ -41152,14 +41787,11 @@
LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
LHS.hasOneUse() && RHS.hasOneUse()) {
MVT SimpleVT = VT.getSimpleVT();
- bool LHSUnary, RHSUnary;
SmallVector<SDValue, 1> LHSOps, RHSOps;
SmallVector<int, 64> LHSMask, RHSMask, CondMask;
if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
- getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask,
- LHSUnary) &&
- getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask,
- RHSUnary)) {
+ getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
+ getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
int NumElts = VT.getVectorNumElements();
for (int i = 0; i != NumElts; ++i) {
if (CondMask[i] < NumElts)
@@ -41384,8 +42016,7 @@
if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
: RHS.getOperand(0).getValueType();
- unsigned NumSrcElts = SrcVT.getVectorNumElements();
- EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
+ EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
VT.getSizeInBits());
RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
@@ -41401,41 +42032,74 @@
if (SDValue V = combineSelectOfTwoConstants(N, DAG))
return V;
- // Canonicalize min/max:
- // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
- // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
- // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
- // the need for an extra compare against zero. e.g.
- // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0
- // subl %esi, %edi
- // testl %edi, %edi
- // movl $0, %eax
- // cmovgl %edi, %eax
- // =>
- // xorl %eax, %eax
- // subl %esi, $edi
- // cmovsl %eax, %edi
- //
- // We can also canonicalize
- // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
- // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
- // This allows the use of a test instruction for the compare.
if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
- Cond.hasOneUse() &&
- LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
+ Cond.hasOneUse()) {
+ EVT CondVT = Cond.getValueType();
+ SDValue Cond0 = Cond.getOperand(0);
+ SDValue Cond1 = Cond.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
- (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
- ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
- Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
- Cond.getOperand(0), Cond.getOperand(1), NewCC);
- return DAG.getSelect(DL, VT, Cond, LHS, RHS);
+
+ // Canonicalize min/max:
+ // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
+ // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
+ // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
+ // the need for an extra compare against zero. e.g.
+ // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
+ // subl %esi, %edi
+ // testl %edi, %edi
+ // movl $0, %eax
+ // cmovgl %edi, %eax
+ // =>
+ // xorl %eax, %eax
+ // subl %esi, %edi
+ // cmovsl %eax, %edi
+ //
+ // We can also canonicalize
+ // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
+ // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
+ // This allows the use of a test instruction for the compare.
+ if (LHS == Cond0 && RHS == Cond1) {
+ if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
+ (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
+ ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
+ Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
+ return DAG.getSelect(DL, VT, Cond, LHS, RHS);
+ }
+ if (CC == ISD::SETUGT && isOneConstant(RHS)) {
+ ISD::CondCode NewCC = ISD::SETUGE;
+ Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
+ return DAG.getSelect(DL, VT, Cond, LHS, RHS);
+ }
}
- if (CC == ISD::SETUGT && isOneConstant(RHS)) {
- ISD::CondCode NewCC = ISD::SETUGE;
- Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
- Cond.getOperand(0), Cond.getOperand(1), NewCC);
- return DAG.getSelect(DL, VT, Cond, LHS, RHS);
+
+ // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
+ // fold eq + gt/lt nested selects into ge/le selects
+ // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
+ // --> (select (cmpuge Cond0, Cond1), LHS, Y)
+ // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
+ // --> (select (cmpsle Cond0, Cond1), LHS, Y)
+ // .. etc ..
+ if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
+ RHS.getOperand(0).getOpcode() == ISD::SETCC) {
+ SDValue InnerSetCC = RHS.getOperand(0);
+ ISD::CondCode InnerCC =
+ cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
+ if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
+ Cond0 == InnerSetCC.getOperand(0) &&
+ Cond1 == InnerSetCC.getOperand(1)) {
+ ISD::CondCode NewCC;
+ switch (CC == ISD::SETEQ ? InnerCC : CC) {
+ case ISD::SETGT: NewCC = ISD::SETGE; break;
+ case ISD::SETLT: NewCC = ISD::SETLE; break;
+ case ISD::SETUGT: NewCC = ISD::SETUGE; break;
+ case ISD::SETULT: NewCC = ISD::SETULE; break;
+ default: NewCC = ISD::SETCC_INVALID; break;
+ }
+ if (NewCC != ISD::SETCC_INVALID) {
+ Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
+ return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
+ }
+ }
}
}
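
(Aside, not part of the patch: a quick scalar check of the nested-select fold added above - a select on eq wrapped around a select on gt collapses to a single ge select. Plain C++ with hypothetical helper names, exhaustive over a small range:)

#include <cassert>

// select(a == b, x, select(a > b, x, y)) is the same as select(a >= b, x, y);
// the lt/le and unsigned cases in the hunk follow the same argument.
static int nestedSelect(int A, int B, int X, int Y) {
  return (A == B) ? X : ((A > B) ? X : Y);
}
static int foldedSelect(int A, int B, int X, int Y) {
  return (A >= B) ? X : Y;
}

int main() {
  for (int A = -3; A <= 3; ++A)
    for (int B = -3; B <= 3; ++B)
      assert(nestedSelect(A, B, 7, 9) == foldedSelect(A, B, 7, 9));
  return 0;
}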
@@ -41598,6 +42262,7 @@
SDValue CmpLHS = Cmp.getOperand(0);
SDValue CmpRHS = Cmp.getOperand(1);
+ EVT CmpVT = CmpLHS.getValueType();
if (!CmpLHS.hasOneUse())
return SDValue();
@@ -41620,21 +42285,46 @@
return SDValue();
APInt Comparison = CmpRHSC->getAPIntValue();
+ APInt NegAddend = -Addend;
+
+ // See if we can adjust the CC to make the comparison match the negated
+ // addend.
+ if (Comparison != NegAddend) {
+ APInt IncComparison = Comparison + 1;
+ if (IncComparison == NegAddend) {
+ if (CC == X86::COND_A && !Comparison.isMaxValue()) {
+ Comparison = IncComparison;
+ CC = X86::COND_AE;
+ } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
+ Comparison = IncComparison;
+ CC = X86::COND_L;
+ }
+ }
+ APInt DecComparison = Comparison - 1;
+ if (DecComparison == NegAddend) {
+ if (CC == X86::COND_AE && !Comparison.isMinValue()) {
+ Comparison = DecComparison;
+ CC = X86::COND_A;
+ } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
+ Comparison = DecComparison;
+ CC = X86::COND_LE;
+ }
+ }
+ }
// If the addend is the negation of the comparison value, then we can do
// a full comparison by emitting the atomic arithmetic as a locked sub.
- if (Comparison == -Addend) {
+ if (Comparison == NegAddend) {
// The CC is fine, but we need to rewrite the LHS of the comparison as an
// atomic sub.
auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
auto AtomicSub = DAG.getAtomic(
- ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
+ ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
/*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
- /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
+ /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
AN->getMemOperand());
auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
- DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
- DAG.getUNDEF(CmpLHS.getValueType()));
+ DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
return LockOp;
}
@@ -41656,8 +42346,7 @@
return SDValue();
SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
- DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
- DAG.getUNDEF(CmpLHS.getValueType()));
+ DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
return LockOp;
}
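
(Aside, not part of the patch: the condition-code adjustment added above relies on the usual off-by-one equivalences between strict and non-strict compares. A small exhaustive check over 8-bit unsigned values; no LLVM types involved:)

#include <cassert>

int main() {
  // COND_A vs COND_AE: (x > c) == (x >= c + 1) unless c is the maximum value.
  // COND_AE vs COND_A: (x >= c) == (x > c - 1) unless c is zero.
  // The signed COND_L/COND_LE adjustments follow the same pattern.
  for (unsigned C = 0; C <= 255; ++C)
    for (unsigned X = 0; X <= 255; ++X) {
      if (C != 255)
        assert((X > C) == (X >= C + 1));
      if (C != 0)
        assert((X >= C) == (X > C - 1));
    }
  return 0;
}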
@@ -42111,7 +42800,7 @@
}
// PMOVMSKB(PACKSSBW(LO(X), HI(X)))
// -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
- if (CmpBits == 16 && Subtarget.hasInt256() &&
+ if (CmpBits >= 16 && Subtarget.hasInt256() &&
VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
@@ -42135,7 +42824,7 @@
// MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
SmallVector<int, 32> ShuffleMask;
SmallVector<SDValue, 2> ShuffleInputs;
- if (NumElts == CmpBits &&
+ if (NumElts <= CmpBits &&
getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
ShuffleMask, DAG) &&
ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
@@ -43097,113 +43786,145 @@
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
- assert((X86ISD::HADD == Opcode || X86ISD::FHADD == Opcode ||
- X86ISD::HSUB == Opcode || X86ISD::FHSUB == Opcode ||
- X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
- "Unexpected hadd/hsub/pack opcode");
+ assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT SrcVT = N0.getValueType();
+ SDValue BC0 =
+ N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
+ SDValue BC1 =
+ N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
+
// Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
// to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
// truncation trees that help us avoid lane crossing shuffles.
// TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
// TODO: We don't handle vXf64 shuffles yet.
- if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- N0.getConstantOperandAPInt(1) == 0 &&
- N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() &&
- N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
- N0.getOperand(0).getValueType().is256BitVector() &&
- SrcVT.getScalarSizeInBits() <= 32) {
- // TODO - support target/faux shuffles.
- SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
- if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
+ if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
+ BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ BC0.getOperand(0) == BC1.getOperand(0) &&
+ BC0.getOperand(0).getValueType().is256BitVector() &&
+ BC0.getConstantOperandAPInt(1) == 0 &&
+ BC1.getConstantOperandAPInt(1) ==
+ BC0.getValueType().getVectorNumElements()) {
+ SmallVector<SDValue> ShuffleOps;
+ SmallVector<int> ShuffleMask, ScaledMask;
+ SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
+ if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
+ resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
// To keep the HOP LHS/RHS coherency, we must be able to scale the unary
- // shuffle to a vXi64 width - we can probably relax this in the future.
- SmallVector<int, 4> ShuffleMask;
- if (SVN->getOperand(1).isUndef() &&
- scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
- SDLoc DL(N);
+ // shuffle to a v4X64 width - we can probably relax this in the future.
+ if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
+ ShuffleOps[0].getValueType().is256BitVector() &&
+ scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
SDValue Lo, Hi;
MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
- std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
- Lo = DAG.getBitcast(N0.getValueType(), Lo);
- Hi = DAG.getBitcast(N1.getValueType(), Hi);
+ std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
+ Lo = DAG.getBitcast(SrcVT, Lo);
+ Hi = DAG.getBitcast(SrcVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
Res = DAG.getBitcast(ShufVT, Res);
- Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
return DAG.getBitcast(VT, Res);
}
}
}
- // Attempt to fold HOP(SHUFFLE(X),SHUFFLE(Y)) -> SHUFFLE(HOP(X,Y)).
- // TODO: Merge with binary shuffle folds below.
+ // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
- int PostShuffle[4] = {0, 1, 2, 3};
+ // If either/both ops are a shuffle that can scale to v2x64,
+ // then see if we can perform this as a v4x32 post shuffle.
+ SmallVector<SDValue> Ops0, Ops1;
+ SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
+ bool IsShuf0 =
+ getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
+ scaleShuffleElements(Mask0, 2, ScaledMask0) &&
+ all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
+ bool IsShuf1 =
+ getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
+ scaleShuffleElements(Mask1, 2, ScaledMask1) &&
+ all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
+ if (IsShuf0 || IsShuf1) {
+ if (!IsShuf0) {
+ Ops0.assign({BC0});
+ ScaledMask0.assign({0, 1});
+ }
+ if (!IsShuf1) {
+ Ops1.assign({BC1});
+ ScaledMask1.assign({0, 1});
+ }
- // If the op is an unary shuffle that can scale to v2x64,
- // then we can perform this as a v4x32 post shuffle.
- auto AdjustOp = [&](SDValue V, int Offset) {
- auto *SVN = dyn_cast<ShuffleVectorSDNode>(V);
- SmallVector<int, 2> ScaledMask;
- if (!SVN || !SVN->getOperand(1).isUndef() ||
- !scaleShuffleElements(SVN->getMask(), 2, ScaledMask) ||
- !N->isOnlyUserOf(V.getNode()))
- return SDValue();
- PostShuffle[Offset + 0] = ScaledMask[0] < 0 ? -1 : Offset + ScaledMask[0];
- PostShuffle[Offset + 1] = ScaledMask[1] < 0 ? -1 : Offset + ScaledMask[1];
- return SVN->getOperand(0);
- };
-
- SDValue Src0 = AdjustOp(N0, 0);
- SDValue Src1 = AdjustOp(N1, 2);
- if (Src0 || Src1) {
- Src0 = Src0 ? Src0 : N0;
- Src1 = Src1 ? Src1 : N1;
- SDLoc DL(N);
- MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
- SDValue Res = DAG.getNode(Opcode, DL, VT, Src0, Src1);
- Res = DAG.getBitcast(ShufVT, Res);
- Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
- return DAG.getBitcast(VT, Res);
+ SDValue LHS, RHS;
+ int PostShuffle[4] = {-1, -1, -1, -1};
+ auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
+ if (M < 0)
+ return true;
+ Idx = M % 2;
+ SDValue Src = Ops[M / 2];
+ if (!LHS || LHS == Src) {
+ LHS = Src;
+ return true;
+ }
+ if (!RHS || RHS == Src) {
+ Idx += 2;
+ RHS = Src;
+ return true;
+ }
+ return false;
+ };
+ if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
+ FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
+ FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
+ FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
+ LHS = DAG.getBitcast(SrcVT, LHS);
+ RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
+ SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
+ Res = DAG.getBitcast(ShufVT, Res);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
+ return DAG.getBitcast(VT, Res);
+ }
}
}
// Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
- // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
if (VT.is256BitVector() && Subtarget.hasInt256()) {
SmallVector<int> Mask0, Mask1;
SmallVector<SDValue> Ops0, Ops1;
- if (getTargetShuffleInputs(N0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
- getTargetShuffleInputs(N1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
- !Ops0.empty() && !Ops1.empty()) {
- SDValue Op00 = Ops0.front(), Op01 = Ops0.back();
- SDValue Op10 = Ops1.front(), Op11 = Ops1.back();
- SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
- if (Op00.getValueType() == SrcVT && Op01.getValueType() == SrcVT &&
- Op11.getValueType() == SrcVT && Op11.getValueType() == SrcVT &&
- scaleShuffleElements(Mask0, 2, ShuffleMask0) &&
- scaleShuffleElements(Mask1, 2, ShuffleMask1)) {
- if ((Op00 == Op11) && (Op01 == Op10)) {
- std::swap(Op10, Op11);
- ShuffleVectorSDNode::commuteMask(ShuffleMask1);
- }
- if ((Op00 == Op10) && (Op01 == Op11)) {
- SmallVector<int, 4> ShuffleMask;
- ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
- ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
- SDLoc DL(N);
- MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
- SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
- Res = DAG.getBitcast(ShufVT, Res);
- Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
- return DAG.getBitcast(VT, Res);
- }
+ SmallVector<int, 2> ScaledMask0, ScaledMask1;
+ if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
+ getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
+ !Ops0.empty() && !Ops1.empty() &&
+ all_of(Ops0,
+ [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
+ all_of(Ops1,
+ [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
+ scaleShuffleElements(Mask0, 2, ScaledMask0) &&
+ scaleShuffleElements(Mask1, 2, ScaledMask1)) {
+ SDValue Op00 = peekThroughBitcasts(Ops0.front());
+ SDValue Op10 = peekThroughBitcasts(Ops1.front());
+ SDValue Op01 = peekThroughBitcasts(Ops0.back());
+ SDValue Op11 = peekThroughBitcasts(Ops1.back());
+ if ((Op00 == Op11) && (Op01 == Op10)) {
+ std::swap(Op10, Op11);
+ ShuffleVectorSDNode::commuteMask(ScaledMask1);
+ }
+ if ((Op00 == Op10) && (Op01 == Op11)) {
+ const int Map[4] = {0, 2, 1, 3};
+ SmallVector<int, 4> ShuffleMask(
+ {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
+ Map[ScaledMask1[1]]});
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
+ SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
+ DAG.getBitcast(SrcVT, Op01));
+ Res = DAG.getBitcast(ShufVT, Res);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
+ return DAG.getBitcast(VT, Res);
}
}
}
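
Both folds above rely on the same property of the x86 horizontal ops: each 64-bit half of an input produces one element of the result, so a pre-shuffle at v2x64 granularity can be replayed as a post-shuffle on the HOP result. A standalone scalar sketch of that identity for PHADDD (v4i32); this is not part of the LLVM change, just a model with made-up helper names:

#include <array>
#include <cassert>

using V4 = std::array<int, 4>;

// Scalar model of PHADDD: each 64-bit half of an input yields one result element.
static V4 hadd(const V4 &x, const V4 &y) {
  return {x[0] + x[1], x[2] + x[3], y[0] + y[1], y[2] + y[3]};
}

static V4 shuffle(const V4 &v, const std::array<int, 4> &m) {
  return {v[m[0]], v[m[1]], v[m[2]], v[m[3]]};
}

int main() {
  V4 x = {1, 2, 3, 4}, y = {5, 6, 7, 8};
  // Swapping the 64-bit halves of X before the hop (scaled mask {1,0})...
  V4 xswap = shuffle(x, {2, 3, 0, 1});
  // ...is the same as swapping result elements 0 and 1 afterwards (PostShuffle {1,0,2,3}).
  assert(hadd(xswap, y) == shuffle(hadd(x, y), {1, 0, 2, 3}));
  return 0;
}

The 256-bit fold above follows the same idea; its {0, 2, 1, 3} map just accounts for how the two sources interleave into the per-lane HOP result.
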
@@ -43344,6 +44065,63 @@
X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
"Unexpected horizontal add/sub opcode");
+ if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
+ // For slow-hop targets, if we have a hop with a repeated operand, see if that
+ // operand already feeds another hop whose result we can reuse via a shuffle.
+ MVT VT = N->getSimpleValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if (VT.is128BitVector() && LHS == RHS) {
+ for (SDNode *User : LHS->uses()) {
+ if (User != N && User->getOpcode() == N->getOpcode()) {
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
+ if (User->getOperand(0) == LHS && !User->getOperand(1).isUndef()) {
+ return DAG.getBitcast(
+ VT,
+ DAG.getVectorShuffle(ShufVT, SDLoc(N),
+ DAG.getBitcast(ShufVT, SDValue(User, 0)),
+ DAG.getUNDEF(ShufVT), {0, 1, 0, 1}));
+ }
+ if (User->getOperand(1) == LHS && !User->getOperand(0).isUndef()) {
+ return DAG.getBitcast(
+ VT,
+ DAG.getVectorShuffle(ShufVT, SDLoc(N),
+ DAG.getBitcast(ShufVT, SDValue(User, 0)),
+ DAG.getUNDEF(ShufVT), {2, 3, 2, 3}));
+ }
+ }
+ }
+ }
+
+ // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
+ if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
+ LHS.getOpcode() == RHS.getOpcode() &&
+ LHS.getValueType() == RHS.getValueType()) {
+ SDValue LHS0 = LHS.getOperand(0);
+ SDValue RHS0 = LHS.getOperand(1);
+ SDValue LHS1 = RHS.getOperand(0);
+ SDValue RHS1 = RHS.getOperand(1);
+ if ((LHS0 == RHS0 || LHS0.isUndef() || RHS0.isUndef()) &&
+ (LHS1 == RHS1 || LHS1.isUndef() || RHS1.isUndef())) {
+ SDLoc DL(N);
+ SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
+ LHS0.isUndef() ? RHS0 : LHS0,
+ LHS1.isUndef() ? RHS1 : LHS1);
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
+ Res = DAG.getBitcast(ShufVT, Res);
+ SDValue NewLHS =
+ DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
+ getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
+ SDValue NewRHS =
+ DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
+ getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
+ DAG.ReplaceAllUsesOfValueWith(LHS, DAG.getBitcast(VT, NewLHS));
+ DAG.ReplaceAllUsesOfValueWith(RHS, DAG.getBitcast(VT, NewRHS));
+ return SDValue(N, 0);
+ }
+ }
+ }
+
// Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
return V;
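
The slow-hop rewrite above works because a single HOP'(X,Y) already contains both halves the two unary hops would compute; the {0,1,0,1} and {2,3,2,3} PSHUFD masks just duplicate the relevant half. A standalone scalar check modelling the v4i32 case (not the combine itself, helper names invented):

#include <array>
#include <cassert>

using V4 = std::array<int, 4>;

static V4 hadd(const V4 &x, const V4 &y) {
  return {x[0] + x[1], x[2] + x[3], y[0] + y[1], y[2] + y[3]};
}

static V4 pshufd(const V4 &v, const std::array<int, 4> &m) {
  return {v[m[0]], v[m[1]], v[m[2]], v[m[3]]};
}

int main() {
  V4 x = {1, 2, 3, 4}, y = {5, 6, 7, 8};
  V4 h = hadd(x, y);                              // the single shared HOP'(X,Y)
  assert(pshufd(h, {0, 1, 0, 1}) == hadd(x, x));  // stands in for HOP'(X,X)
  assert(pshufd(h, {2, 3, 2, 3}) == hadd(y, y));  // stands in for HOP'(Y,Y)
  return 0;
}
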
@@ -43400,6 +44178,10 @@
assert(N->getOperand(1).getValueType() == MVT::i8 &&
"Unexpected shift amount type");
+ // (shift undef, X) -> 0
+ if (N0.isUndef())
+ return DAG.getConstant(0, SDLoc(N), VT);
+
// Out of range logical bit shifts are guaranteed to be zero.
// Out of range arithmetic bit shifts splat the sign bit.
unsigned ShiftVal = N->getConstantOperandVal(1);
@@ -43942,8 +44724,7 @@
uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
bool ConstantsMatch = true;
for (uint64_t j = 0; j < ArrayElementCount; j++) {
- ConstantInt *Elem =
- dyn_cast<ConstantInt>(Init->getAggregateElement(j));
+ auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
ConstantsMatch = false;
break;
@@ -44159,7 +44940,8 @@
if (SDValue Shuffle = combineX86ShufflesRecursively(
{SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
X86::MaxShuffleCombineDepth,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
+ /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
N->getOperand(0).getOperand(1));
}
@@ -44758,8 +45540,7 @@
// Only do this when the result is at least 64 bits or we'll be leaving
// dangling PACKSSDW nodes.
if (SVT == MVT::i8 && InSVT == MVT::i32) {
- EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- VT.getVectorNumElements());
+ EVT MidVT = VT.changeVectorElementType(MVT::i16);
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
DAG, Subtarget);
assert(Mid && "Failed to pack!");
@@ -44778,7 +45559,8 @@
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
- Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
+ Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
+ (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
unsigned TruncOpc = 0;
SDValue SatVal;
if (auto SSatVal = detectSSatPattern(In, VT)) {
@@ -45156,8 +45938,7 @@
EVT CastVT = VT;
if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
EltVT = MVT::f64;
- CastVT =
- EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+ CastVT = VT.changeVectorElementType(EltVT);
}
SDValue Load =
@@ -45290,8 +46071,7 @@
EVT EltVT = VT.getVectorElementType();
if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
EltVT = MVT::f64;
- EVT CastVT =
- EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+ EVT CastVT = VT.changeVectorElementType(EltVT);
Value = DAG.getBitcast(CastVT, Value);
}
SDValue Extract =
@@ -45687,18 +46467,8 @@
"Unsupported vector type for horizontal add/sub");
unsigned NumElts = VT.getVectorNumElements();
- // TODO - can we make a general helper method that does all of this for us?
auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
SmallVectorImpl<int> &ShuffleMask) {
- if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (!Op.getOperand(0).isUndef())
- N0 = Op.getOperand(0);
- if (!Op.getOperand(1).isUndef())
- N1 = Op.getOperand(1);
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
- ShuffleMask.append(Mask.begin(), Mask.end());
- return;
- }
bool UseSubVector = false;
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Op.getOperand(0).getValueType().is256BitVector() &&
@@ -45706,25 +46476,25 @@
Op = Op.getOperand(0);
UseSubVector = true;
}
- bool IsUnary;
SmallVector<SDValue, 2> SrcOps;
- SmallVector<int, 16> SrcShuffleMask;
+ SmallVector<int, 16> SrcMask, ScaledMask;
SDValue BC = peekThroughBitcasts(Op);
- if (isTargetShuffle(BC.getOpcode()) &&
- getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
- SrcOps, SrcShuffleMask, IsUnary)) {
- if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
- SrcOps.size() <= 2) {
+ if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
+ !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
+ return Op.getValueSizeInBits() == BC.getValueSizeInBits();
+ })) {
+ resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
+ if (!UseSubVector && SrcOps.size() <= 2 &&
+ scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
- ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
+ ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
}
- if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
- SrcOps.size() == 1) {
- N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
- N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
- ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
- ShuffleMask.append(Mask.begin(), Mask.end());
+ if (UseSubVector && SrcOps.size() == 1 &&
+ scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
+ std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
+ ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
+ ShuffleMask.assign(Mask.begin(), Mask.end());
}
}
};
@@ -45761,6 +46531,17 @@
RMask.push_back(i);
}
+ // If we have a unary mask, ensure the other op is set to null.
+ if (isUndefOrInRange(LMask, 0, NumElts))
+ B = SDValue();
+ else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
+ A = SDValue();
+
+ if (isUndefOrInRange(RMask, 0, NumElts))
+ D = SDValue();
+ else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
+ C = SDValue();
+
// If A and B occur in reverse order in RHS, then canonicalize by commuting
// RHS operands and shuffle mask.
if (A != C) {
@@ -45851,29 +46632,64 @@
return true;
}
+// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
+static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ unsigned Opcode = N->getOpcode();
+ bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
+ SmallVector<int, 8> PostShuffleMask;
+
+ switch (Opcode) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
+ if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
+ PostShuffleMask)) {
+ SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
+ }
+ }
+ break;
+ case ISD::ADD:
+ case ISD::SUB:
+ if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ VT == MVT::v16i16 || VT == MVT::v8i32)) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
+ if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
+ PostShuffleMask)) {
+ auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
+ };
+ SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+ {LHS, RHS}, HOpBuilder);
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
+ }
+ }
+ break;
+ }
+
+ return SDValue();
+}
+
/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- EVT VT = N->getValueType(0);
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- bool IsFadd = N->getOpcode() == ISD::FADD;
- auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
- assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
-
- // Try to synthesize horizontal add/sub from adds/subs of shuffles.
- SmallVector<int, 8> PostShuffleMask;
- if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
- (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsFadd,
- PostShuffleMask)) {
- SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
- if (!PostShuffleMask.empty())
- HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
- DAG.getUNDEF(VT), PostShuffleMask);
- return HorizBinOp;
- }
-
+ if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
+ return HOp;
return SDValue();
}
@@ -46003,10 +46819,8 @@
EVT InVT = In.getValueType();
unsigned NumElems = OutVT.getVectorNumElements();
- // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
- // SSE2, and we need to take care of it specially.
- // AVX512 provides vpmovdb.
- if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
+ // AVX512 provides fast truncate ops.
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
return SDValue();
EVT OutSVT = OutVT.getVectorElementType();
@@ -46017,9 +46831,7 @@
return SDValue();
// SSSE3's pshufb results in less instructions in the cases below.
- if (Subtarget.hasSSSE3() && NumElems == 8 &&
- ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
- (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+ if (Subtarget.hasSSSE3() && NumElems == 8 && InSVT != MVT::i64)
return SDValue();
SDLoc DL(N);
@@ -46569,6 +47381,7 @@
EVT VT = Op.getValueType();
EVT SVT = VT.getScalarType();
unsigned Opc = Op.getOpcode();
+ SDNodeFlags Flags = Op.getNode()->getFlags();
switch (Opc) {
case ISD::FMA:
case X86ISD::FMSUB:
@@ -46583,6 +47396,11 @@
!isOperationLegal(ISD::FMA, VT))
break;
+ // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
+ // if it may have signed zeros.
+ if (!Flags.hasNoSignedZeros())
+ break;
+
// This is always negatible for free but we might be able to remove some
// extra operand negations as well.
SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
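
The no-signed-zeros bail-out above matters because the fold can flip the sign of a zero result: with x = +0.0, y = -1.0, z = +0.0 the pre-fold expression yields -0.0 while the folded form yields +0.0. A small standalone illustration (compile without fast-math; this is only a model, not the combine):

#include <cmath>
#include <cstdio>

int main() {
  double x = 0.0, y = -1.0, z = 0.0;
  // (fneg (fma (fneg x), y, (fneg z))) evaluates to -0.0 here...
  double before = -std::fma(-x, y, -z);
  // ...while the folded (fma x, y, z) evaluates to +0.0.
  double after = std::fma(x, y, z);
  std::printf("signbit before=%d after=%d\n", (int)std::signbit(before),
              (int)std::signbit(after));
  return 0;
}
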
@@ -46662,13 +47480,16 @@
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
// If this is SSE1 only convert to FXOR to avoid scalarization.
- if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
- N->getValueType(0) == MVT::v4i32) {
- return DAG.getBitcast(
- MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
- DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
- DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
+ return DAG.getBitcast(MVT::v4i32,
+ DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, N0),
+ DAG.getBitcast(MVT::v4f32, N1)));
}
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
@@ -46686,6 +47507,45 @@
if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
return RV;
+ // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
+ N0.getOperand(0).getValueType().isVector() &&
+ N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
+ TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
+ return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
+ N0.getOperand(0).getValueType()));
+ }
+
+ // Handle AVX512 mask widening.
+ // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
+ if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1 &&
+ N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
+ TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
+ return DAG.getNode(
+ ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
+ DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
+ N0.getOperand(2));
+ }
+
+ // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
+ // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
+ // TODO: Under what circumstances could this be performed in DAGCombine?
+ if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
+ N0.getOperand(0).getOpcode() == N->getOpcode()) {
+ SDValue TruncExtSrc = N0.getOperand(0);
+ auto *N1C = dyn_cast<ConstantSDNode>(N1);
+ auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
+ if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
+ SDLoc DL(N);
+ SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
+ SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
+ return DAG.getNode(ISD::XOR, DL, VT, LHS,
+ DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
+ }
+ }
+
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
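
The zext/truncate xor fold above is sound because xor commutes with zero-extension and truncation, so the inner constant can be re-associated with the outer one in the wider type (the truncate direction is analogous). A quick standalone check with arbitrarily chosen constants:

#include <cassert>
#include <cstdint>

int main() {
  uint8_t x = 0xA5, c1 = 0x3C;     // arbitrary example values
  uint32_t c2 = 0x00F0F00Fu;
  // xor(zext(xor(x,c1)),c2) == xor(zext(x), xor(zext(c1),c2))
  uint32_t lhs = uint32_t(uint8_t(x ^ c1)) ^ c2;
  uint32_t rhs = uint32_t(x) ^ (uint32_t(c1) ^ c2);
  assert(lhs == rhs);
  return 0;
}
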
@@ -47375,7 +48235,7 @@
Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
// Compare against the bitmask and extend the result.
- EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
+ EVT CCVT = VT.changeVectorElementType(MVT::i1);
Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
@@ -47545,6 +48405,8 @@
unsigned NewOpcode =
negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
+ // Propagate fast-math-flags to new FMA node.
+ SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
if (IsStrict) {
assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
@@ -47827,6 +48689,7 @@
}
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
const SDValue LHS = N->getOperand(0);
@@ -47846,6 +48709,61 @@
return DAG.getNode(ISD::TRUNCATE, DL, VT,
DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
}
+
+ if (OpVT.isScalarInteger()) {
+ // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
+ // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
+ auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
+ if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
+ if (N0.getOperand(0) == N1)
+ return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
+ N0.getOperand(1));
+ if (N0.getOperand(1) == N1)
+ return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
+ N0.getOperand(0));
+ }
+ return SDValue();
+ };
+ if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
+ return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
+ if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
+ return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
+
+ // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
+ // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
+ auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
+ if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
+ if (N0.getOperand(0) == N1)
+ return DAG.getNode(ISD::AND, DL, OpVT, N1,
+ DAG.getNOT(DL, N0.getOperand(1), OpVT));
+ if (N0.getOperand(1) == N1)
+ return DAG.getNode(ISD::AND, DL, OpVT, N1,
+ DAG.getNOT(DL, N0.getOperand(0), OpVT));
+ }
+ return SDValue();
+ };
+ if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
+ return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
+ if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
+ return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
+
+ // cmpeq(trunc(x),0) --> cmpeq(x,0)
+ // cmpne(trunc(x),0) --> cmpne(x,0)
+ // iff x upper bits are zero.
+ // TODO: Add support for RHS to be truncate as well?
+ if (LHS.getOpcode() == ISD::TRUNCATE &&
+ LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
+ isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
+ EVT SrcVT = LHS.getOperand(0).getValueType();
+ APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
+ OpVT.getScalarSizeInBits());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
+ TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
+ return DAG.getSetCC(DL, VT, LHS.getOperand(0),
+ DAG.getConstant(0, DL, SrcVT), CC);
+ }
+ }
}
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
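
Both scalar-integer rewrites above reduce to the same subset test: (X|Y)==X and (X&Y)==Y each hold exactly when Y has no bits outside X, i.e. when (~X & Y)==0, which is what the new compare-against-zero checks. An exhaustive 8-bit sanity check (a standalone sketch, not the combine):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned x = 0; x != 256; ++x) {
    for (unsigned y = 0; y != 256; ++y) {
      uint8_t X = x, Y = y;
      bool NoBitsOutsideX = (uint8_t(~X & Y) == 0);
      assert((uint8_t(X | Y) == X) == NoBitsOutsideX);  // cmpeq(or(X,Y),X)
      assert((uint8_t(X & Y) == Y) == NoBitsOutsideX);  // cmpeq(and(X,Y),Y)
    }
  }
  return 0;
}
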
@@ -48024,8 +48942,7 @@
if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
if (BV->isConstant() && IndexWidth > 32 &&
DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
- unsigned NumElts = Index.getValueType().getVectorNumElements();
- EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
@@ -48039,8 +48956,7 @@
IndexWidth > 32 &&
Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
- unsigned NumElts = Index.getValueType().getVectorNumElements();
- EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
@@ -48052,8 +48968,7 @@
// Make sure the index is either i32 or i64
if (IndexWidth != 32 && IndexWidth != 64) {
MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
- EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
- Index.getValueType().getVectorNumElements());
+ EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
@@ -48210,8 +49125,7 @@
// UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
- EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- InVT.getVectorNumElements());
+ EVT DstVT = InVT.changeVectorElementType(MVT::i32);
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
// UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
@@ -48253,8 +49167,7 @@
// SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
- EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- InVT.getVectorNumElements());
+ EVT DstVT = InVT.changeVectorElementType(MVT::i32);
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
@@ -48271,8 +49184,7 @@
if (NumSignBits >= (BitWidth - 31)) {
EVT TruncVT = MVT::i32;
if (InVT.isVector())
- TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
- InVT.getVectorNumElements());
+ TruncVT = InVT.changeVectorElementType(TruncVT);
SDLoc dl(N);
if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
@@ -48428,15 +49340,28 @@
}
}
- // Look for a truncate with a single use.
- if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
+ // Look for a truncate.
+ if (Op.getOpcode() != ISD::TRUNCATE)
return SDValue();
+ SDValue Trunc = Op;
Op = Op.getOperand(0);
- // Arithmetic op can only have one use.
- if (!Op.hasOneUse())
- return SDValue();
+ // See if we can compare the truncation source against zero instead, which
+ // should let us reuse the Z flag set by many ops. Only do this for an i32
+ // truncation source to prevent partial-reg compares of promoted ops.
+ EVT OpVT = Op.getValueType();
+ APInt UpperBits =
+ APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
+ if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
+ onlyZeroFlagUsed(SDValue(N, 0))) {
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, dl, OpVT));
+ }
+
+ // After this the truncate and arithmetic op must have a single use.
+ if (!Trunc.hasOneUse() || !Op.hasOneUse())
+ return SDValue();
unsigned NewOpc;
switch (Op.getOpcode()) {
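
The truncation rewrites above (here and in combineSetCC) rely on the fact that, once the bits above the narrow width are known zero, the narrow and wide values are zero for exactly the same inputs, so the Z flag result is unchanged. A trivial standalone check under that assumption:

#include <cassert>
#include <cstdint>

int main() {
  // Every value below has zero bits above the truncated i16 width, so
  // MaskedValueIsZero would hold and the two zero tests must agree.
  for (uint32_t v : {0u, 1u, 0x00FFu, 0xFFFFu}) {
    assert((uint16_t(v) == 0) == (v == 0));
  }
  return 0;
}
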
@@ -48958,10 +49883,17 @@
SDValue N01In = N01Elt.getOperand(0);
SDValue N10In = N10Elt.getOperand(0);
SDValue N11In = N11Elt.getOperand(0);
+
// First time we find an input capture it.
if (!In0) {
In0 = N00In;
In1 = N01In;
+
+ // The input vectors must be at least as wide as the output.
+ // If they are larger than the output, we extract a subvector below.
+ if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
+ In1.getValueSizeInBits() < VT.getSizeInBits())
+ return SDValue();
}
// Mul is commutative so the input vectors can be in any order.
// Canonicalize to make the compares easier.
@@ -48975,8 +49907,6 @@
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- // Shrink by adding truncate nodes and let DAGCombine fold with the
- // sources.
EVT OpVT = Ops[0].getValueType();
assert(OpVT.getScalarType() == MVT::i16 &&
"Unexpected scalar element type");
@@ -48985,38 +49915,61 @@
OpVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
+
+ // If the output is narrower than an input, extract the low part of the input
+ // vector.
+ EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements() * 2);
+ if (OutVT16.bitsLT(In0.getValueType())) {
+ In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ if (OutVT16.bitsLT(In1.getValueType())) {
+ In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
+ DAG.getIntPtrConstant(0, DL));
+ }
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
PMADDBuilder);
}
-static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+/// CMOV of constants requires materializing constant operands in registers.
+/// Try to fold those constants into an 'add' instruction to reduce instruction
+/// count. We do this with CMOV rather than the generic 'select' because there
+/// are earlier folds that may be used to turn select-of-constants into logic
+/// hacks.
+static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
+ // If an operand is zero, add-of-0 gets simplified away, so that's clearly
+ // better because we eliminate 1-2 instructions. This transform is still
+ // an improvement without zero operands because we trade 2 move constants and
+ // 1 add for 2 adds (LEA) as long as the constants can be represented as
+ // immediate asm operands (fit in 32-bits).
+ auto isSuitableCmov = [](SDValue V) {
+ if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
+ return false;
+ if (!isa<ConstantSDNode>(V.getOperand(0)) ||
+ !isa<ConstantSDNode>(V.getOperand(1)))
+ return false;
+ return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
+ (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
+ V.getConstantOperandAPInt(1).isSignedIntN(32));
+ };
+
+ // Match an appropriate CMOV as the first operand of the add.
+ SDValue Cmov = N->getOperand(0);
+ SDValue OtherOp = N->getOperand(1);
+ if (!isSuitableCmov(Cmov))
+ std::swap(Cmov, OtherOp);
+ if (!isSuitableCmov(Cmov))
+ return SDValue();
+
+ // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
EVT VT = N->getValueType(0);
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- bool IsAdd = N->getOpcode() == ISD::ADD;
- auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
- assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
-
- SmallVector<int, 8> PostShuffleMask;
- if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
- VT == MVT::v8i32) &&
- Subtarget.hasSSSE3() &&
- isHorizontalBinOp(HorizOpcode, Op0, Op1, DAG, Subtarget, IsAdd,
- PostShuffleMask)) {
- auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
- };
- SDValue HorizBinOp =
- SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
- if (!PostShuffleMask.empty())
- HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
- DAG.getUNDEF(VT), PostShuffleMask);
- return HorizBinOp;
- }
-
- return SDValue();
+ SDLoc DL(N);
+ SDValue FalseOp = Cmov.getOperand(0);
+ SDValue TrueOp = Cmov.getOperand(1);
+ FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
+ TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
+ return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
+ Cmov.getOperand(3));
}
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
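
pushAddIntoCmovOfConsts is plain distribution of the add over the two constant arms of the CMOV; the 32-bit-immediate restriction only exists so the rewritten adds can still encode their constants. A scalar sketch of the equivalence, with a plain conditional standing in for CMOV (not the DAG code):

#include <cassert>
#include <cstdint>
#include <initializer_list>

static int64_t cmov(bool cond, int64_t t, int64_t f) { return cond ? t : f; }

int main() {
  const int64_t C1 = 42, C2 = -7;   // arbitrary 32-bit-encodable constants
  for (int64_t other : {int64_t(0), int64_t(1), int64_t(-100), int64_t(1) << 20}) {
    for (bool cond : {false, true}) {
      // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
      assert(cmov(cond, C1, C2) + other == cmov(cond, other + C1, other + C2));
    }
  }
  return 0;
}
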
@@ -49026,13 +49979,16 @@
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
+ if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG))
+ return Select;
+
if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
return MAdd;
if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
return MAdd;
// Try to synthesize horizontal adds from adds of shuffles.
- if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
+ if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
return V;
// If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
@@ -49062,154 +50018,39 @@
return combineAddOrSubToADCOrSBB(N, DAG);
}
-static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- EVT VT = N->getValueType(0);
-
- if (!VT.isVector())
- return SDValue();
-
- // PSUBUS is supported, starting from SSE2.
- EVT EltVT = VT.getVectorElementType();
- if (!(Subtarget.hasSSE2() &&
- (EltVT == MVT::i8 || EltVT == MVT::i16 || VT == MVT::v8i32 ||
- VT == MVT::v8i64 || VT == MVT::v16i32)))
- return SDValue();
-
- SDValue SubusLHS, SubusRHS;
- // Try to find umax(a,b) - b or a - umin(a,b) patterns
- // they may be converted to subus(a,b).
- // TODO: Need to add IR canonicalization for this code.
- if (Op0.getOpcode() == ISD::UMAX) {
- SubusRHS = Op1;
- SDValue MaxLHS = Op0.getOperand(0);
- SDValue MaxRHS = Op0.getOperand(1);
- if (MaxLHS == Op1)
- SubusLHS = MaxRHS;
- else if (MaxRHS == Op1)
- SubusLHS = MaxLHS;
- else
- return SDValue();
- } else if (Op1.getOpcode() == ISD::UMIN) {
- SubusLHS = Op0;
- SDValue MinLHS = Op1.getOperand(0);
- SDValue MinRHS = Op1.getOperand(1);
- if (MinLHS == Op0)
- SubusRHS = MinRHS;
- else if (MinRHS == Op0)
- SubusRHS = MinLHS;
- else
- return SDValue();
- } else if (Op1.getOpcode() == ISD::TRUNCATE &&
- Op1.getOperand(0).getOpcode() == ISD::UMIN &&
- (EltVT == MVT::i8 || EltVT == MVT::i16)) {
- // Special case where the UMIN has been truncated. Try to push the truncate
- // further up. This is similar to the i32/i64 special processing.
- SubusLHS = Op0;
- SDValue MinLHS = Op1.getOperand(0).getOperand(0);
- SDValue MinRHS = Op1.getOperand(0).getOperand(1);
- EVT TruncVT = Op1.getOperand(0).getValueType();
- if (!(Subtarget.hasSSE2() &&
- (TruncVT == MVT::v8i32 || TruncVT == MVT::v8i64 ||
- TruncVT == MVT::v16i32)))
- return SDValue();
- SDValue OpToSaturate;
- if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
- MinLHS.getOperand(0) == Op0)
- OpToSaturate = MinRHS;
- else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
- MinRHS.getOperand(0) == Op0)
- OpToSaturate = MinLHS;
- else
- return SDValue();
-
- // Saturate the non-extended input and then truncate it.
- SDLoc DL(N);
- SDValue SaturationConst =
- DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
- VT.getScalarSizeInBits()),
- DL, TruncVT);
- SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
- SaturationConst);
- SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
- } else
- return SDValue();
-
- // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
- // special preprocessing in some cases.
- if (EltVT == MVT::i8 || EltVT == MVT::i16)
- return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);
-
- assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) &&
- "Unexpected VT!");
-
- // Special preprocessing case can be only applied
- // if the value was zero extended from 16 bit,
- // so we require first 16 bits to be zeros for 32 bit
- // values, or first 48 bits for 64 bit values.
- KnownBits Known = DAG.computeKnownBits(SubusLHS);
- unsigned NumZeros = Known.countMinLeadingZeros();
- if (NumZeros < (VT.getScalarSizeInBits() - 16))
- return SDValue();
-
- EVT ExtType = SubusLHS.getValueType();
- EVT ShrinkedType;
- if (VT == MVT::v8i32 || VT == MVT::v8i64)
- ShrinkedType = MVT::v8i16;
- else
- ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
-
- // If SubusLHS is zeroextended - truncate SubusRHS to it's
- // size SubusRHS = umin(0xFFF.., SubusRHS).
- SDValue SaturationConst =
- DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
- ShrinkedType.getScalarSizeInBits()),
- SDLoc(SubusLHS), ExtType);
- SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
- SaturationConst);
- SDValue NewSubusLHS =
- DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
- SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
- SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
- NewSubusLHS, NewSubusRHS);
-
- // Zero extend the result, it may be used somewhere as 32 bit,
- // if not zext and following trunc will shrink.
- return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
-}
-
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
- // X86 can't encode an immediate LHS of a sub. See if we can push the
- // negation into a preceding instruction.
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
- // If the RHS of the sub is a XOR with one use and a constant, invert the
- // immediate. Then add one to the LHS of the sub so we can turn
- // X-Y -> X+~Y+1, saving one register.
- if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
- isa<ConstantSDNode>(Op1.getOperand(1))) {
- const APInt &XorC = Op1.getConstantOperandAPInt(1);
- EVT VT = Op0.getValueType();
- SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
- Op1.getOperand(0),
- DAG.getConstant(~XorC, SDLoc(Op1), VT));
- return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
- DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
+ // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
+ auto IsNonOpaqueConstant = [&](SDValue Op) {
+ if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
+ if (auto *Cst = dyn_cast<ConstantSDNode>(C))
+ return !Cst->isOpaque();
+ return true;
}
+ return false;
+ };
+
+ // X86 can't encode an immediate LHS of a sub. See if we can push the
+ // negation into a preceding instruction. If the RHS of the sub is an XOR with
+ // one use and a constant, invert the immediate, saving one register.
+ // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
+ if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
+ IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
+ SDLoc DL(N);
+ EVT VT = Op0.getValueType();
+ SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
+ DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
+ SDValue NewAdd =
+ DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
+ return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
}
// Try to synthesize horizontal subs from subs of shuffles.
- if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
- return V;
-
- // Try to create PSUBUS if SUB's argument is max/min
- if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
+ if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
return V;
return combineAddOrSubToADCOrSBB(N, DAG);
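
The reworked sub fold above uses the two's-complement identity -(X ^ C2) == (X ^ ~C2) + 1, hence C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1). An exhaustive 8-bit check of that identity (standalone sketch, constants chosen arbitrarily):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t C1 = 0x2A, C2 = 0x5C;   // arbitrary non-opaque constants
  for (unsigned x = 0; x != 256; ++x) {
    uint8_t X = x;
    uint8_t sub_form = uint8_t(C1 - (X ^ C2));
    // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1 + 1)
    uint8_t add_form = uint8_t((X ^ uint8_t(~C2)) + uint8_t(C1 + 1));
    assert(sub_form == add_form);
  }
  return 0;
}
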
@@ -49317,12 +50158,42 @@
}
}
+ // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
+ // Only concatenations of subvector high halves, which vperm2x128 is best at.
+ // TODO: This should go in combineX86ShufflesRecursively eventually.
+ if (VT.is256BitVector() && Ops.size() == 2) {
+ SDValue Src0 = peekThroughBitcasts(Ops[0]);
+ SDValue Src1 = peekThroughBitcasts(Ops[1]);
+ if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ EVT SrcVT0 = Src0.getOperand(0).getValueType();
+ EVT SrcVT1 = Src1.getOperand(0).getValueType();
+ unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
+ unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
+ if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
+ Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
+ Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
+ return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
+ DAG.getBitcast(VT, Src0.getOperand(0)),
+ DAG.getBitcast(VT, Src1.getOperand(0)),
+ DAG.getTargetConstant(0x31, DL, MVT::i8));
+ }
+ }
+ }
+
// Repeated opcode.
// TODO - combineX86ShufflesRecursively should handle shuffle concatenation
// but it currently struggles with different vector widths.
if (llvm::all_of(Ops, [Op0](SDValue Op) {
return Op.getOpcode() == Op0.getOpcode();
})) {
+ auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
+ SmallVector<SDValue> Subs;
+ for (SDValue SubOp : SubOps)
+ Subs.push_back(SubOp.getOperand(I));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
+ };
+
unsigned NumOps = Ops.size();
switch (Op0.getOpcode()) {
case X86ISD::SHUFP: {
@@ -49331,15 +50202,9 @@
llvm::all_of(Ops, [Op0](SDValue Op) {
return Op.getOperand(2) == Op0.getOperand(2);
})) {
- SmallVector<SDValue, 2> LHS, RHS;
- for (unsigned i = 0; i != NumOps; ++i) {
- LHS.push_back(Ops[i].getOperand(0));
- RHS.push_back(Ops[i].getOperand(1));
- }
return DAG.getNode(Op0.getOpcode(), DL, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
- DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
- Op0.getOperand(2));
+ ConcatSubOperand(VT, Ops, 0),
+ ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
}
break;
}
@@ -49348,22 +50213,15 @@
case X86ISD::PSHUFD:
if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
- SmallVector<SDValue, 2> Src;
- for (unsigned i = 0; i != NumOps; ++i)
- Src.push_back(Ops[i].getOperand(0));
return DAG.getNode(Op0.getOpcode(), DL, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
- Op0.getOperand(1));
+ ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
}
LLVM_FALLTHROUGH;
case X86ISD::VPERMILPI:
// TODO - add support for vXf64/vXi64 shuffles.
if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
- SmallVector<SDValue, 2> Src;
- for (unsigned i = 0; i != NumOps; ++i)
- Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
- SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
+ SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
Op0.getOperand(1));
return DAG.getBitcast(VT, Res);
@@ -49375,11 +50233,10 @@
int NumSrcElts = OpVT.getVectorNumElements();
SmallVector<int, 64> ConcatMask;
for (unsigned i = 0; i != NumOps; ++i) {
- bool IsUnary;
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubOps;
if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
- SubMask, IsUnary))
+ SubMask))
break;
for (int M : SubMask) {
if (0 <= M) {
@@ -49402,20 +50259,34 @@
}
break;
case X86ISD::VSHLI:
- case X86ISD::VSRAI:
case X86ISD::VSRLI:
+ // Special case: SHL/SRL AVX1 V4i64 by 32 bits can lower as a shuffle.
+ // TODO: Move this to LowerScalarImmediateShift?
+ if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
+ llvm::all_of(Ops, [](SDValue Op) {
+ return Op.getConstantOperandAPInt(1) == 32;
+ })) {
+ SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
+ SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
+ if (Op0.getOpcode() == X86ISD::VSHLI) {
+ Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
+ {8, 0, 8, 2, 8, 4, 8, 6});
+ } else {
+ Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
+ {1, 8, 3, 8, 5, 8, 7, 8});
+ }
+ return DAG.getBitcast(VT, Res);
+ }
+ LLVM_FALLTHROUGH;
+ case X86ISD::VSRAI:
if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
(VT.is512BitVector() && Subtarget.useAVX512Regs() &&
(EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
llvm::all_of(Ops, [Op0](SDValue Op) {
return Op0.getOperand(1) == Op.getOperand(1);
})) {
- SmallVector<SDValue, 2> Src;
- for (unsigned i = 0; i != NumOps; ++i)
- Src.push_back(Ops[i].getOperand(0));
return DAG.getNode(Op0.getOpcode(), DL, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
- Op0.getOperand(1));
+ ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
}
break;
case X86ISD::VPERMI:
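
The AVX1 special case above works because shifting each 64-bit lane by 32 just relocates one 32-bit half and zero-fills the other, which is exactly a v8i32 shuffle against a zero vector with the masks shown ({8,0,8,2,8,4,8,6} for SHL, {1,8,3,8,5,8,7,8} for SRL). A little-endian scalar model of that equivalence (assumes an x86-like little-endian host; not the lowering code):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint64_t lanes[4] = {0x1122334455667788ull, 0x99AABBCCDDEEFF00ull,
                       0x0123456789ABCDEFull, 0xFEDCBA9876543210ull};
  uint32_t elts[8];                       // the v4i64 viewed as v8i32
  std::memcpy(elts, lanes, sizeof(lanes));
  auto pick = [&](int m) { return m == 8 ? 0u : elts[m]; };  // index 8 == zero vector
  const int ShlMask[8] = {8, 0, 8, 2, 8, 4, 8, 6};
  const int SrlMask[8] = {1, 8, 3, 8, 5, 8, 7, 8};

  for (int i = 0; i != 4; ++i) {
    uint64_t shl, srl;
    uint32_t shlpair[2] = {pick(ShlMask[2 * i]), pick(ShlMask[2 * i + 1])};
    uint32_t srlpair[2] = {pick(SrlMask[2 * i]), pick(SrlMask[2 * i + 1])};
    std::memcpy(&shl, shlpair, sizeof(shl));
    std::memcpy(&srl, srlpair, sizeof(srl));
    assert(shl == (lanes[i] << 32));      // VSHLI by 32 == shuffle vs zero
    assert(srl == (lanes[i] >> 32));      // VSRLI by 32 == shuffle vs zero
  }
  return 0;
}
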
@@ -49425,12 +50296,8 @@
llvm::all_of(Ops, [Op0](SDValue Op) {
return Op0.getOperand(1) == Op.getOperand(1);
})) {
- SmallVector<SDValue, 2> Src;
- for (unsigned i = 0; i != NumOps; ++i)
- Src.push_back(Ops[i].getOperand(0));
return DAG.getNode(Op0.getOpcode(), DL, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
- Op0.getOperand(1));
+ ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
}
break;
case ISD::AND:
@@ -49439,17 +50306,12 @@
case X86ISD::ANDNP:
// TODO: Add 256-bit support.
if (!IsSplat && VT.is512BitVector()) {
- SmallVector<SDValue, 2> LHS, RHS;
- for (unsigned i = 0; i != NumOps; ++i) {
- LHS.push_back(Ops[i].getOperand(0));
- RHS.push_back(Ops[i].getOperand(1));
- }
MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
NumOps * SrcVT.getVectorNumElements());
return DAG.getNode(Op0.getOpcode(), DL, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
- DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
+ ConcatSubOperand(SrcVT, Ops, 0),
+ ConcatSubOperand(SrcVT, Ops, 1));
}
break;
case X86ISD::HADD:
@@ -49460,17 +50322,12 @@
case X86ISD::PACKUS:
if (!IsSplat && VT.is256BitVector() &&
(VT.isFloatingPoint() || Subtarget.hasInt256())) {
- SmallVector<SDValue, 2> LHS, RHS;
- for (unsigned i = 0; i != NumOps; ++i) {
- LHS.push_back(Ops[i].getOperand(0));
- RHS.push_back(Ops[i].getOperand(1));
- }
MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
NumOps * SrcVT.getVectorNumElements());
return DAG.getNode(Op0.getOpcode(), DL, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
- DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
+ ConcatSubOperand(SrcVT, Ops, 0),
+ ConcatSubOperand(SrcVT, Ops, 1));
}
break;
case X86ISD::PALIGNR:
@@ -49480,15 +50337,9 @@
llvm::all_of(Ops, [Op0](SDValue Op) {
return Op0.getOperand(2) == Op.getOperand(2);
})) {
- SmallVector<SDValue, 2> LHS, RHS;
- for (unsigned i = 0; i != NumOps; ++i) {
- LHS.push_back(Ops[i].getOperand(0));
- RHS.push_back(Ops[i].getOperand(1));
- }
return DAG.getNode(Op0.getOpcode(), DL, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
- DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
- Op0.getOperand(2));
+ ConcatSubOperand(VT, Ops, 0),
+ ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
}
break;
}
@@ -49651,6 +50502,19 @@
return BcastLd;
}
+ // If we're splatting the lower half subvector of a full vector load into the
+ // upper half, attempt to create a subvector broadcast.
+ if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
+ Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
+ auto *VecLd = dyn_cast<LoadSDNode>(Vec);
+ auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
+ if (VecLd && SubLd &&
+ DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
+ SubVec.getValueSizeInBits() / 8, 0))
+ return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
+ SubLd, 0, DAG);
+ }
+
return SDValue();
}
@@ -49794,7 +50658,8 @@
// extract the lowest subvector instead which should allow
// SimplifyDemandedVectorElts do more simplifications.
if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
- InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
+ InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
// If we're extracting a broadcasted subvector, just use the lowest subvector.
@@ -49947,6 +50812,23 @@
Src.getOperand(0).getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
+ // See if we're broadcasting the scalar value, in which case just reuse that.
+ // Ensure the user's operand is the same SDValue (result), not just the same node.
+ if (VT.getScalarType() == Src.getValueType())
+ for (SDNode *User : Src->uses())
+ if (User->getOpcode() == X86ISD::VBROADCAST &&
+ Src == User->getOperand(0)) {
+ unsigned SizeInBits = VT.getFixedSizeInBits();
+ unsigned BroadcastSizeInBits =
+ User->getValueSizeInBits(0).getFixedSize();
+ if (BroadcastSizeInBits == SizeInBits)
+ return SDValue(User, 0);
+ if (BroadcastSizeInBits > SizeInBits)
+ return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
+ // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
+ // coverage.
+ }
+
return SDValue();
}
@@ -50021,8 +50903,7 @@
ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
? ISD::SEXTLOAD
: ISD::ZEXTLOAD;
- EVT MemVT =
- EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorNumElements());
+ EVT MemVT = VT.changeVectorElementType(SVT);
if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
SDValue Load =
DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
@@ -50366,7 +51247,7 @@
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
- case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
+ case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
case X86ISD::PACKSS:
@@ -51224,7 +52105,7 @@
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32RegClass);
- if (VT != MVT::f80)
+ if (VT != MVT::f80 && !VT.isVector())
return std::make_pair(0U, &X86::GR64RegClass);
break;
}
@@ -51235,9 +52116,10 @@
return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_ABCDRegClass);
- if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
+ if (VT == MVT::i32 || VT == MVT::f32 ||
+ (!VT.isVector() && !Subtarget.is64Bit()))
return std::make_pair(0U, &X86::GR32_ABCDRegClass);
- if (VT != MVT::f80)
+ if (VT != MVT::f80 && !VT.isVector())
return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
case 'r': // GENERAL_REGS
@@ -51246,9 +52128,10 @@
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
- if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
+ if (VT == MVT::i32 || VT == MVT::f32 ||
+ (!VT.isVector() && !Subtarget.is64Bit()))
return std::make_pair(0U, &X86::GR32RegClass);
- if (VT != MVT::f80)
+ if (VT != MVT::f80 && !VT.isVector())
return std::make_pair(0U, &X86::GR64RegClass);
break;
case 'R': // LEGACY_REGS
@@ -51256,9 +52139,10 @@
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_NOREXRegClass);
- if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
+ if (VT == MVT::i32 || VT == MVT::f32 ||
+ (!VT.isVector() && !Subtarget.is64Bit()))
return std::make_pair(0U, &X86::GR32_NOREXRegClass);
- if (VT != MVT::f80)
+ if (VT != MVT::f80 && !VT.isVector())
return std::make_pair(0U, &X86::GR64_NOREXRegClass);
break;
case 'f': // FP Stack registers.
@@ -51434,21 +52318,22 @@
}
// GCC allows "st(0)" to be called just plain "st".
- if (StringRef("{st}").equals_lower(Constraint))
+ if (StringRef("{st}").equals_insensitive(Constraint))
return std::make_pair(X86::FP0, &X86::RFP80RegClass);
}
// flags -> EFLAGS
- if (StringRef("{flags}").equals_lower(Constraint))
+ if (StringRef("{flags}").equals_insensitive(Constraint))
return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
// dirflag -> DF
// Only allow for clobber.
- if (StringRef("{dirflag}").equals_lower(Constraint) && VT == MVT::Other)
+ if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
+ VT == MVT::Other)
return std::make_pair(X86::DF, &X86::DFCCRRegClass);
// fpsr -> FPSW
- if (StringRef("{fpsr}").equals_lower(Constraint))
+ if (StringRef("{fpsr}").equals_insensitive(Constraint))
return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
return Res;
@@ -51567,9 +52452,10 @@
return Res;
}
-int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
- const AddrMode &AM, Type *Ty,
- unsigned AS) const {
+InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
+ const AddrMode &AM,
+ Type *Ty,
+ unsigned AS) const {
// Scaling factors are not free at all.
// An indexed folded instruction, i.e., inst (reg1, reg2, scale),
// will take 2 allocations in the out of order engine instead of 1
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h b/src/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h
index 76c83b7..869857b 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/src/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h
@@ -76,6 +76,10 @@
/// Same as call except it adds the NoTrack prefix.
NT_CALL,
+ // Pseudo for an ObjC call that gets emitted together with a special
+ // marker instruction.
+ CALL_RVMARKER,
+
/// X86 compare and logical compare instructions.
CMP,
FCMP,
@@ -780,9 +784,12 @@
// subvector broadcast from memory.
SUBV_BROADCAST_LOAD,
- // Store FP control world into i16 memory.
+ // Store FP control word into i16 memory.
FNSTCW16m,
+ // Load FP control word from i16 memory.
+ FLDCW16m,
+
/// This instruction implements FP_TO_SINT with the
/// integer destination in memory and a FP reg source. This corresponds
/// to the X86::FIST*m instructions and the rounding mode change stuff. It
@@ -847,6 +854,19 @@
};
} // end namespace X86ISD
+ namespace X86 {
+ /// Current rounding mode is represented in bits 11:10 of the x87 FP control
+ /// word (FPCW). These values are the same as the corresponding rounding-mode
+ /// constants used in glibc.
+ enum RoundingMode {
+ rmToNearest = 0, // FE_TONEAREST
+ rmDownward = 1 << 10, // FE_DOWNWARD
+ rmUpward = 2 << 10, // FE_UPWARD
+ rmTowardZero = 3 << 10, // FE_TOWARDZERO
+ rmMask = 3 << 10 // Bit mask selecting rounding mode
+ };
+ }
+
/// Define some predicates that are used for node matching.
namespace X86 {
/// Returns true if Elt is a constant zero or floating point constant +0.0.
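
As the comment on X86::RoundingMode notes, the encodings are meant to line up with glibc's <fenv.h> constants on x86. The standalone sketch below mirrors the enum and prints both sets of values side by side; the match is an x86/glibc assumption, and it presumes a target whose <cfenv> defines all four FE_* macros:

#include <cfenv>
#include <cstdio>

// Local mirror of the X86::RoundingMode encodings (bits 11:10 of the control word).
enum RoundingMode {
  rmToNearest  = 0,
  rmDownward   = 1 << 10,
  rmUpward     = 2 << 10,
  rmTowardZero = 3 << 10
};

int main() {
  // On x86 glibc each pair prints the same value; other C libraries may differ.
  std::printf("to-nearest : %#x vs FE_TONEAREST  %#x\n", (unsigned)rmToNearest,
              (unsigned)FE_TONEAREST);
  std::printf("downward   : %#x vs FE_DOWNWARD   %#x\n", (unsigned)rmDownward,
              (unsigned)FE_DOWNWARD);
  std::printf("upward     : %#x vs FE_UPWARD     %#x\n", (unsigned)rmUpward,
              (unsigned)FE_UPWARD);
  std::printf("toward-zero: %#x vs FE_TOWARDZERO %#x\n", (unsigned)rmTowardZero,
              (unsigned)FE_TOWARDZERO);
  return 0;
}
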
@@ -919,7 +939,7 @@
/// Returns true if the target allows unaligned memory accesses of the
/// specified type. Returns whether it is "fast" in the last argument.
- bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
MachineMemOperand::Flags Flags,
bool *Fast) const override;
@@ -1118,12 +1138,8 @@
unsigned
getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
- if (ConstraintCode == "o")
- return InlineAsm::Constraint_o;
- else if (ConstraintCode == "v")
+ if (ConstraintCode == "v")
return InlineAsm::Constraint_v;
- else if (ConstraintCode == "X")
- return InlineAsm::Constraint_X;
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
@@ -1166,8 +1182,9 @@
/// of the specified type.
/// If the AM is supported, the return value must be >= 0.
/// If the AM is not supported, it returns a negative value.
- int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
- unsigned AS) const override;
+ InstructionCost getScalingFactorCost(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
/// This is used to enable splatted operand transforms for vector shifts
/// and vector funnel shifts.
@@ -1340,7 +1357,7 @@
/// If the target has a standard location for the stack protector cookie,
/// returns the address of that location. Otherwise, returns nullptr.
- Value *getIRStackGuard(IRBuilder<> &IRB) const override;
+ Value *getIRStackGuard(IRBuilderBase &IRB) const override;
bool useLoadStackGuardNode() const override;
bool useStackGuardXorFP() const override;
@@ -1354,7 +1371,7 @@
/// Return true if the target stores SafeStack pointer at a fixed offset in
/// some non-standard address space, and populates the address space and
/// offset as appropriate.
- Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+ Value *getSafeStackPointerLocation(IRBuilderBase &IRB) const override;
std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
SDValue Chain, SDValue Pointer,
@@ -1521,6 +1538,7 @@
SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -1586,10 +1604,6 @@
EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;
/// Utility function to emit the xmm reg save portion of va_start.
- MachineBasicBlock *
- EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr,
- MachineBasicBlock *BB) const;
-
MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
MachineInstr &MI2,
MachineBasicBlock *BB) const;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/src/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index c4150ed..7d9466f 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -72,8 +72,8 @@
Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
// The pass-through vector for an x86 masked load is a zero vector.
- CallInst *NewMaskedLoad =
- IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
+ CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
+ II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
return IC.replaceInstUsesWith(II, NewMaskedLoad);
}
@@ -197,11 +197,11 @@
}
assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
- auto Vec = II.getArgOperand(0);
- auto Amt = II.getArgOperand(1);
- auto VT = cast<FixedVectorType>(Vec->getType());
- auto SVT = VT->getElementType();
- auto AmtVT = Amt->getType();
+ Value *Vec = II.getArgOperand(0);
+ Value *Amt = II.getArgOperand(1);
+ auto *VT = cast<FixedVectorType>(Vec->getType());
+ Type *SVT = VT->getElementType();
+ Type *AmtVT = Amt->getType();
unsigned VWidth = VT->getNumElements();
unsigned BitWidth = SVT->getPrimitiveSizeInBits();
@@ -249,7 +249,7 @@
}
// Simplify if count is constant vector.
- auto CDV = dyn_cast<ConstantDataVector>(Amt);
+ auto *CDV = dyn_cast<ConstantDataVector>(Amt);
if (!CDV)
return nullptr;
@@ -263,7 +263,7 @@
APInt Count(64, 0);
for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
unsigned SubEltIdx = (NumSubElts - 1) - i;
- auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
+ auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
Count <<= BitWidth;
Count |= SubElt->getValue().zextOrTrunc(64);
}
@@ -345,10 +345,10 @@
}
assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
- auto Vec = II.getArgOperand(0);
- auto Amt = II.getArgOperand(1);
- auto VT = cast<FixedVectorType>(II.getType());
- auto SVT = VT->getElementType();
+ Value *Vec = II.getArgOperand(0);
+ Value *Amt = II.getArgOperand(1);
+ auto *VT = cast<FixedVectorType>(II.getType());
+ Type *SVT = VT->getElementType();
int NumElts = VT->getNumElements();
int BitWidth = SVT->getIntegerBitWidth();
@@ -628,8 +628,8 @@
};
// See if we're dealing with constant values.
- Constant *C0 = dyn_cast<Constant>(Op0);
- ConstantInt *CI0 =
+ auto *C0 = dyn_cast<Constant>(Op0);
+ auto *CI0 =
C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
: nullptr;
@@ -761,12 +761,12 @@
}
// See if we're dealing with constant values.
- Constant *C0 = dyn_cast<Constant>(Op0);
- Constant *C1 = dyn_cast<Constant>(Op1);
- ConstantInt *CI00 =
+ auto *C0 = dyn_cast<Constant>(Op0);
+ auto *C1 = dyn_cast<Constant>(Op1);
+ auto *CI00 =
C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
: nullptr;
- ConstantInt *CI10 =
+ auto *CI10 =
C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
: nullptr;
@@ -803,7 +803,7 @@
/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
- Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
+ auto *V = dyn_cast<Constant>(II.getArgOperand(1));
if (!V)
return nullptr;
@@ -848,7 +848,7 @@
/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
- Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
+ auto *V = dyn_cast<Constant>(II.getArgOperand(1));
if (!V)
return nullptr;
@@ -1472,11 +1472,11 @@
VWidth1 == 16 && "Unexpected operand sizes");
// See if we're dealing with constant values.
- Constant *C1 = dyn_cast<Constant>(Op1);
- ConstantInt *CILength =
+ auto *C1 = dyn_cast<Constant>(Op1);
+ auto *CILength =
C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
: nullptr;
- ConstantInt *CIIndex =
+ auto *CIIndex =
C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
: nullptr;
@@ -1511,8 +1511,8 @@
"Unexpected operand size");
// See if we're dealing with constant values.
- ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
- ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
+ auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
// Attempt to simplify to a constant or shuffle vector.
if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
@@ -1537,8 +1537,8 @@
"Unexpected operand size");
// See if we're dealing with constant values.
- Constant *C1 = dyn_cast<Constant>(Op1);
- ConstantInt *CI11 =
+ auto *C1 = dyn_cast<Constant>(Op1);
+ auto *CI11 =
C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
: nullptr;
@@ -1573,8 +1573,8 @@
VWidth1 == 2 && "Unexpected operand sizes");
// See if we're dealing with constant values.
- ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
- ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
+ auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
// Attempt to simplify to a constant or shuffle vector.
if (CILength && CIIndex) {
@@ -1756,8 +1756,7 @@
if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
} else {
- auto Arg = II.getArgOperand(0);
- auto ArgType = cast<FixedVectorType>(Arg->getType());
+ auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
ArgWidth = ArgType->getNumElements();
}
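
A note on the CreateMaskedLoad hunk above: IRBuilder's masked-load builder now takes the result vector type as an explicit first argument instead of deriving it from the pointer operand, which is why II.getType() is threaded through. A minimal sketch of a caller under that signature, assuming LLVM headers of this era are on the include path (emitZeroFilledMaskedLoad is a hypothetical helper for illustration, not part of this patch):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

// Builds a masked load whose pass-through is an all-zero vector, mirroring the
// simplification in the hunk above; the result type is passed explicitly.
static llvm::Value *emitZeroFilledMaskedLoad(llvm::IRBuilder<> &B,
                                             llvm::FixedVectorType *VecTy,
                                             llvm::Value *Ptr,
                                             llvm::Value *BoolMask) {
  llvm::Value *ZeroVec = llvm::Constant::getNullValue(VecTy);
  return B.CreateMaskedLoad(VecTy, Ptr, llvm::Align(1), BoolMask, ZeroVec);
}

Older releases inferred the element type from Ptr's pointee type; with opaque pointers on the horizon the type has to be spelled out at the call site.
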
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td b/src/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td
index e4f3290..d825981 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td
@@ -48,23 +48,21 @@
VEX, T8XD;
// Pseudo instruction for RA.
- let hasSideEffects = 1, mayLoad = 1,
- Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
- def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
-
- let hasSideEffects = 1, mayStore = 1 in
- def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
-
- def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
- GR16:$src2,
- opaquemem:$src3,
- TILECFG:$cfg), []>;
+ def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src),
+ [(int_x86_ldtilecfg_internal addr:$src)]>;
+ def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3), []>;
+ def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3), []>;
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
GR16:$src2, opaquemem:$src3,
- TILE:$src4, TILECFG:$cfg), []>;
- def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
- GR16:$src2,
- TILECFG:$cfg), []>;
+ TILE:$src4), []>;
+ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in
+ def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2),
+ [(set TILE:$dst, (int_x86_tilezero_internal
+ GR16:$src1, GR16:$src2))]>;
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
@@ -101,10 +99,32 @@
}
// Pseudo instruction for RA.
- let Constraints = "$src4 = $dst" in
- def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
- GR16:$src2, GR16:$src3, TILE:$src4,
- TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
+ let Constraints = "$src4 = $dst" in {
+ def PTDPBSSDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6),
+ [(set TILE: $dst,
+ (int_x86_tdpbssd_internal GR16:$src1, GR16:$src2,
+ GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
+ def PTDPBSUDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6),
+ [(set TILE: $dst,
+ (int_x86_tdpbsud_internal GR16:$src1, GR16:$src2,
+ GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
+ def PTDPBUSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6),
+ [(set TILE: $dst,
+ (int_x86_tdpbusd_internal GR16:$src1, GR16:$src2,
+ GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
+ def PTDPBUUDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6),
+ [(set TILE: $dst,
+ (int_x86_tdpbuud_internal GR16:$src1, GR16:$src2,
+ GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
+ }
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
@@ -137,6 +157,16 @@
"tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[]>, VEX_4V, T8XS;
+ // Pseudo instruction for RA.
+ let Constraints = "$src4 = $dst" in
+ def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6),
+ [(set TILE: $dst,
+ (int_x86_tdpbf16ps_internal GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6))]>;
+
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
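
The AMX hunks above drop the TILECFG register-class plumbing: the config pseudo now matches the llvm.x86.ldtilecfg.internal intrinsic, and the tile pseudos carry their GR16 row/column shapes directly instead of a TILECFG operand. For orientation, this is roughly what the user-facing side looks like; a sketch assuming the documented <immintrin.h> AMX intrinsics and the 64-byte palette-1 tile configuration layout (none of this code is taken from the patch):

#include <immintrin.h>
#include <cstdint>

// Documented 64-byte AMX tile configuration block (palette 1).
struct alignas(64) TileConfig {
  uint8_t  palette_id;    // byte 0
  uint8_t  start_row;     // byte 1
  uint8_t  reserved[14];  // bytes 2-15
  uint16_t colsb[16];     // bytes 16-47: bytes per row for tmm0..tmm15
  uint8_t  rows[16];      // bytes 48-63: rows for tmm0..tmm15
};

// Build with an AMX-enabled target (e.g. -mamx-tile).
void zeroOneTile() {
  TileConfig Cfg = {};
  Cfg.palette_id = 1;
  Cfg.rows[0] = 16;
  Cfg.colsb[0] = 64;
  _tile_loadconfig(&Cfg);  // user-level counterpart of PLDTILECFGV
  _tile_zero(0);           // user-level counterpart of PTILEZEROV
  _tile_release();
}
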
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td b/src/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td
index 1901279..dd61d91 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2223,8 +2223,22 @@
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
}
+def X86pcmpm_imm : SDNodeXForm<setcc, [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ uint8_t SSECC = X86::getVPCMPImmForCond(CC);
+ return getI8Imm(SSECC, SDLoc(N));
+}]>;
+
+// Swapped operand version of the above.
+def X86pcmpm_imm_commute : SDNodeXForm<setcc, [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ uint8_t SSECC = X86::getVPCMPImmForCond(CC);
+ SSECC = X86::getSwappedVPCMPImm(SSECC);
+ return getI8Imm(SSECC, SDLoc(N));
+}]>;
+
multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
- PatFrag Frag_su, PatFrag CommFrag, PatFrag CommFrag_su,
+ PatFrag Frag_su,
X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Name> {
let isCommutable = 1 in
@@ -2272,25 +2286,23 @@
cond))))]>,
EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
- def : Pat<(_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
- (_.VT _.RC:$src1), cond)),
+ def : Pat<(_.KVT (Frag:$cc (_.LdFrag addr:$src2),
+ (_.VT _.RC:$src1), cond)),
(!cast<Instruction>(Name#_.ZSuffix#"rmi")
- _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
+ _.RC:$src1, addr:$src2, (X86pcmpm_imm_commute $cc))>;
def : Pat<(and _.KRCWM:$mask,
- (_.KVT (CommFrag_su:$cc (_.LdFrag addr:$src2),
- (_.VT _.RC:$src1), cond))),
+ (_.KVT (Frag_su:$cc (_.LdFrag addr:$src2),
+ (_.VT _.RC:$src1), cond))),
(!cast<Instruction>(Name#_.ZSuffix#"rmik")
_.KRCWM:$mask, _.RC:$src1, addr:$src2,
- (CommFrag.OperandTransform $cc))>;
+ (X86pcmpm_imm_commute $cc))>;
}
multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
- PatFrag Frag_su, PatFrag CommFrag,
- PatFrag CommFrag_su, X86FoldableSchedWrite sched,
+ PatFrag Frag_su, X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Name> :
- avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
- sched, _, Name> {
+ avx512_icmp_cc<opc, Suffix, Frag, Frag_su, sched, _, Name> {
def rmib : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
u8imm:$cc),
@@ -2315,65 +2327,49 @@
cond))))]>,
EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
- def : Pat<(_.KVT (CommFrag:$cc (_.BroadcastLdFrag addr:$src2),
+ def : Pat<(_.KVT (Frag:$cc (_.BroadcastLdFrag addr:$src2),
(_.VT _.RC:$src1), cond)),
(!cast<Instruction>(Name#_.ZSuffix#"rmib")
- _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
+ _.RC:$src1, addr:$src2, (X86pcmpm_imm_commute $cc))>;
def : Pat<(and _.KRCWM:$mask,
- (_.KVT (CommFrag_su:$cc (_.BroadcastLdFrag addr:$src2),
- (_.VT _.RC:$src1), cond))),
+ (_.KVT (Frag_su:$cc (_.BroadcastLdFrag addr:$src2),
+ (_.VT _.RC:$src1), cond))),
(!cast<Instruction>(Name#_.ZSuffix#"rmibk")
_.KRCWM:$mask, _.RC:$src1, addr:$src2,
- (CommFrag_su.OperandTransform $cc))>;
+ (X86pcmpm_imm_commute $cc))>;
}
multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag,
- PatFrag Frag_su, PatFrag CommFrag,
- PatFrag CommFrag_su, X86SchedWriteWidths sched,
+ PatFrag Frag_su, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ defm Z : avx512_icmp_cc<opc, Suffix, Frag, Frag_su,
sched.ZMM, VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, Frag_su,
sched.YMM, VTInfo.info256, NAME>, EVEX_V256;
- defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, Frag_su,
sched.XMM, VTInfo.info128, NAME>, EVEX_V128;
}
}
multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, PatFrag Frag,
- PatFrag Frag_su, PatFrag CommFrag,
- PatFrag CommFrag_su, X86SchedWriteWidths sched,
+ PatFrag Frag_su, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su,
sched.ZMM, VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su,
sched.YMM, VTInfo.info256, NAME>, EVEX_V256;
- defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su,
sched.XMM, VTInfo.info128, NAME>, EVEX_V128;
}
}
-def X86pcmpm_imm : SDNodeXForm<setcc, [{
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
- uint8_t SSECC = X86::getVPCMPImmForCond(CC);
- return getI8Imm(SSECC, SDLoc(N));
-}]>;
-
-// Swapped operand version of the above.
-def X86pcmpm_imm_commute : SDNodeXForm<setcc, [{
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
- uint8_t SSECC = X86::getVPCMPImmForCond(CC);
- SSECC = X86::getSwappedVPCMPImm(SSECC);
- return getI8Imm(SSECC, SDLoc(N));
-}]>;
-
def X86pcmpm : PatFrag<(ops node:$src1, node:$src2, node:$cc),
(setcc node:$src1, node:$src2, node:$cc), [{
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
@@ -2386,19 +2382,6 @@
return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC);
}], X86pcmpm_imm>;
-// Same as above, but commutes immediate. Use for load folding.
-def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
- (setcc node:$src1, node:$src2, node:$cc), [{
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
- return !ISD::isUnsignedIntSetCC(CC);
-}], X86pcmpm_imm_commute>;
-
-def X86pcmpm_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
- (setcc node:$src1, node:$src2, node:$cc), [{
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
- return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC);
-}], X86pcmpm_imm_commute>;
-
def X86pcmpum : PatFrag<(ops node:$src1, node:$src2, node:$cc),
(setcc node:$src1, node:$src2, node:$cc), [{
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
@@ -2411,53 +2394,32 @@
return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC);
}], X86pcmpm_imm>;
-// Same as above, but commutes immediate. Use for load folding.
-def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
- (setcc node:$src1, node:$src2, node:$cc), [{
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
- return ISD::isUnsignedIntSetCC(CC);
-}], X86pcmpm_imm_commute>;
-
-def X86pcmpum_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
- (setcc node:$src1, node:$src2, node:$cc), [{
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
- return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC);
-}], X86pcmpm_imm_commute>;
-
// FIXME: Is there a better scheduler class for VPCMP/VPCMPU?
defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_su,
- X86pcmpm_commute, X86pcmpm_commute_su,
SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>;
defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_su,
- X86pcmpum_commute, X86pcmpum_commute_su,
SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>;
defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_su,
- X86pcmpm_commute, X86pcmpm_commute_su,
SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_su,
- X86pcmpum_commute, X86pcmpum_commute_su,
SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_su,
- X86pcmpm_commute, X86pcmpm_commute_su,
SchedWriteVecALU, avx512vl_i32_info,
HasAVX512>, EVEX_CD8<32, CD8VF>;
defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_su,
- X86pcmpum_commute, X86pcmpum_commute_su,
SchedWriteVecALU, avx512vl_i32_info,
HasAVX512>, EVEX_CD8<32, CD8VF>;
defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_su,
- X86pcmpm_commute, X86pcmpm_commute_su,
SchedWriteVecALU, avx512vl_i64_info,
HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_su,
- X86pcmpum_commute, X86pcmpum_commute_su,
SchedWriteVecALU, avx512vl_i64_info,
HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
@@ -3141,7 +3103,7 @@
(!cast<Instruction>(InstStr#"Zrri")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
- (Frag.OperandTransform $cc)), Narrow.KRC)>;
+ (X86pcmpm_imm $cc)), Narrow.KRC)>;
def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
(Narrow.KVT (Frag_su:$cc (Narrow.VT Narrow.RC:$src1),
@@ -3151,11 +3113,10 @@
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
- (Frag_su.OperandTransform $cc)), Narrow.KRC)>;
+ (X86pcmpm_imm $cc)), Narrow.KRC)>;
}
multiclass axv512_icmp_packed_cc_rmb_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
- PatFrag CommFrag, PatFrag CommFrag_su,
string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
@@ -3165,7 +3126,7 @@
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrmib")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
- addr:$src2, (Frag.OperandTransform $cc)), Narrow.KRC)>;
+ addr:$src2, (X86pcmpm_imm $cc)), Narrow.KRC)>;
def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
(Narrow.KVT
@@ -3175,26 +3136,26 @@
(COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk")
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
- addr:$src2, (Frag_su.OperandTransform $cc)), Narrow.KRC)>;
+ addr:$src2, (X86pcmpm_imm $cc)), Narrow.KRC)>;
// Commuted with broadcast load.
-def : Pat<(Narrow.KVT (CommFrag:$cc (Narrow.BroadcastLdFrag addr:$src2),
- (Narrow.VT Narrow.RC:$src1),
- cond)),
+def : Pat<(Narrow.KVT (Frag:$cc (Narrow.BroadcastLdFrag addr:$src2),
+ (Narrow.VT Narrow.RC:$src1),
+ cond)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrmib")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
- addr:$src2, (CommFrag.OperandTransform $cc)), Narrow.KRC)>;
+ addr:$src2, (X86pcmpm_imm_commute $cc)), Narrow.KRC)>;
def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
(Narrow.KVT
- (CommFrag_su:$cc (Narrow.BroadcastLdFrag addr:$src2),
- (Narrow.VT Narrow.RC:$src1),
- cond)))),
+ (Frag_su:$cc (Narrow.BroadcastLdFrag addr:$src2),
+ (Narrow.VT Narrow.RC:$src1),
+ cond)))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk")
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
- addr:$src2, (CommFrag_su.OperandTransform $cc)), Narrow.KRC)>;
+ addr:$src2, (X86pcmpm_imm_commute $cc)), Narrow.KRC)>;
}
// Same as above, but for fp types which don't use PatFrags.
@@ -3264,17 +3225,17 @@
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v2i64x_info, v8i64_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v2i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPQ", v2i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v2i64x_info, v8i64_info>;
defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v8f32x_info, v16f32_info>;
defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v4f32x_info, v16f32_info>;
@@ -3297,7 +3258,7 @@
}
// Mask setting all 0s or 1s
-multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
+multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, SDPatternOperator Val> {
let Predicates = [HasAVX512] in
let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1,
SchedRW = [WriteZero] in
@@ -3305,7 +3266,7 @@
[(set KRC:$dst, (VT Val))]>;
}
-multiclass avx512_mask_setop_w<PatFrag Val> {
+multiclass avx512_mask_setop_w<SDPatternOperator Val> {
defm W : avx512_mask_setop<VK16, v16i1, Val>;
defm D : avx512_mask_setop<VK32, v32i1, Val>;
defm Q : avx512_mask_setop<VK64, v64i1, Val>;
@@ -3665,11 +3626,11 @@
// These patterns exist to prevent the above patterns from introducing a second
// mask inversion when one already exists.
-def : Pat<(v8i64 (vselect (xor VK8:$mask, (v8i1 immAllOnesV)),
+def : Pat<(v8i64 (vselect (v8i1 (vnot VK8:$mask)),
(v8i64 immAllZerosV),
(v8i64 VR512:$src))),
(VMOVDQA64Zrrkz VK8:$mask, VR512:$src)>;
-def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
+def : Pat<(v16i32 (vselect (v16i1 (vnot VK16:$mask)),
(v16i32 immAllZerosV),
(v16i32 VR512:$src))),
(VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>;
@@ -5300,7 +5261,7 @@
//===----------------------------------------------------------------------===//
multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode OpNode, SDNode VecNode,
+ SDPatternOperator OpNode, SDNode VecNode,
X86FoldableSchedWrite sched, bit IsCommutable> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -5390,7 +5351,7 @@
}
}
-multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode VecNode, SDNode RndNode,
X86SchedWriteSizes sched, bit IsCommutable> {
defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
@@ -6429,7 +6390,7 @@
// FMA - Fused Multiply Operations
//
-multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
@@ -6473,7 +6434,7 @@
AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
-multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _, string Suff> {
@@ -6494,7 +6455,7 @@
}
}
-multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd> {
defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
@@ -6518,7 +6479,7 @@
X86Fnmsub, X86FnmsubRnd>;
-multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
@@ -6564,7 +6525,7 @@
1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
-multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _, string Suff> {
@@ -6585,7 +6546,7 @@
}
}
-multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd > {
defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
@@ -6608,7 +6569,7 @@
defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86any_Fnmsub,
X86Fnmsub, X86FnmsubRnd>;
-multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
@@ -6656,7 +6617,7 @@
1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
-multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _, string Suff> {
@@ -6677,7 +6638,7 @@
}
}
-multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd > {
defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
@@ -6745,7 +6706,7 @@
}
multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd,
+ string OpcodeStr, SDPatternOperator OpNode, SDNode OpNodeRnd,
X86VectorVTInfo _, string SUFF> {
let ExeDomain = _.ExeDomain in {
defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix, _,
@@ -6779,7 +6740,7 @@
}
multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd> {
+ string OpcodeStr, SDPatternOperator OpNode, SDNode OpNodeRnd> {
let Predicates = [HasAVX512] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
OpNodeRnd, f32x_info, "SS">,
@@ -6795,7 +6756,7 @@
defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86any_Fnmadd, X86FnmaddRnd>;
defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86FnmsubRnd>;
-multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode MaskedOp,
+multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
SDNode RndOp, string Prefix,
string Suffix, SDNode Move,
X86VectorVTInfo _, PatLeaf ZeroFP> {
@@ -7408,7 +7369,7 @@
// Convert float/double to signed/unsigned int 32/64 with truncation
multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
- X86VectorVTInfo _DstRC, SDNode OpNode,
+ X86VectorVTInfo _DstRC, SDPatternOperator OpNode,
SDNode OpNodeInt, SDNode OpNodeSAE,
X86FoldableSchedWrite sched, string aliasStr>{
let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in {
@@ -7595,7 +7556,7 @@
//===----------------------------------------------------------------------===//
multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNode, SDNode MaskOpNode,
+ X86VectorVTInfo _Src, SDPatternOperator OpNode, SDPatternOperator MaskOpNode,
X86FoldableSchedWrite sched,
string Broadcast = _.BroadcastStr,
string Alias = "", X86MemOperand MemOp = _Src.MemOp,
@@ -7665,7 +7626,7 @@
// Conversion with rounding control (RC)
multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ X86VectorVTInfo _Src, SDPatternOperator OpNodeRnd,
X86FoldableSchedWrite sched> {
let Uses = [MXCSR] in
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -7677,8 +7638,8 @@
// Similar to avx512_vcvt_fp, but uses an extload for the memory form.
multiclass avx512_vcvt_fpextend<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNode,
- SDNode MaskOpNode,
+ X86VectorVTInfo _Src, SDPatternOperator OpNode,
+ SDNode MaskOpNode,
X86FoldableSchedWrite sched,
string Broadcast = _.BroadcastStr,
string Alias = "", X86MemOperand MemOp = _Src.MemOp,
@@ -7802,8 +7763,8 @@
// Convert Signed/Unsigned Doubleword to Double
let Uses = []<Register>, mayRaiseFPException = 0 in
-multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode MaskOpNode, SDNode OpNode128,
+multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDNode MaskOpNode, SDPatternOperator OpNode128,
SDNode MaskOpNode128,
X86SchedWriteWidths sched> {
// No rounding in this op
@@ -7828,7 +7789,7 @@
}
// Convert Signed/Unsigned Doubleword to Float
-multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in
@@ -7846,7 +7807,7 @@
}
// Convert Float to Signed/Unsigned Doubleword with truncation
-multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode,
SDNode OpNodeSAE, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
@@ -7882,7 +7843,7 @@
}
// Convert Double to Signed/Unsigned Doubleword with truncation
-multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeSAE,
X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
@@ -8028,7 +7989,7 @@
}
// Convert Double to Signed/Unsigned Quadword with truncation
-multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
@@ -8046,7 +8007,7 @@
}
// Convert Signed/Unsigned Quadword to Double
-multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
@@ -8091,7 +8052,7 @@
}
// Convert Float to Signed/Unsigned Quadword with truncation
-multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
@@ -8118,7 +8079,7 @@
}
// Convert Signed/Unsigned Quadword to Float
-multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
@@ -9716,7 +9677,8 @@
(ins _.RC:$src1, MaskRC:$mask, memop:$src2),
!strconcat(OpcodeStr#_.Suffix,
"\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
- []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>;
+ []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
}
multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
@@ -10094,7 +10056,8 @@
// op(broadcast(eltVt),imm)
// All instructions are created with FROUND_CURRENT
multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode MaskOpNode,
+ SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode,
X86FoldableSchedWrite sched,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
@@ -10139,8 +10102,8 @@
}
multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
- AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode MaskOpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _, bits<8> opc, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched,
Predicate prd>{
let Predicates = [prd] in {
defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
@@ -10338,8 +10301,8 @@
}
multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
- bits<8> opcPs, bits<8> opcPd, SDNode OpNode,
- SDNode MaskOpNode, SDNode OpNodeSAE,
+ bits<8> opcPs, bits<8> opcPd, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode, SDNode OpNodeSAE,
X86SchedWriteWidths sched, Predicate prd>{
defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
opcPs, OpNode, MaskOpNode, OpNodeSAE, sched, prd>,
@@ -11316,39 +11279,39 @@
// TODO: We should maybe have a more generalized algorithm for folding to
// vpternlog.
let Predicates = [HasAVX512] in {
- def : Pat<(xor VR512:$src, (v64i8 immAllOnesV)),
+ def : Pat<(v64i8 (vnot VR512:$src)),
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
- def : Pat<(xor VR512:$src, (v32i16 immAllOnesV)),
+ def : Pat<(v32i16 (vnot VR512:$src)),
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
- def : Pat<(xor VR512:$src, (v16i32 immAllOnesV)),
+ def : Pat<(v16i32 (vnot VR512:$src)),
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
- def : Pat<(xor VR512:$src, (v8i64 immAllOnesV)),
+ def : Pat<(v8i64 (vnot VR512:$src)),
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
}
let Predicates = [HasAVX512, NoVLX] in {
- def : Pat<(xor VR128X:$src, (v16i8 immAllOnesV)),
+ def : Pat<(v16i8 (vnot VR128X:$src)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(i8 15)), sub_xmm)>;
- def : Pat<(xor VR128X:$src, (v8i16 immAllOnesV)),
+ def : Pat<(v8i16 (vnot VR128X:$src)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(i8 15)), sub_xmm)>;
- def : Pat<(xor VR128X:$src, (v4i32 immAllOnesV)),
+ def : Pat<(v4i32 (vnot VR128X:$src)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(i8 15)), sub_xmm)>;
- def : Pat<(xor VR128X:$src, (v2i64 immAllOnesV)),
+ def : Pat<(v2i64 (vnot VR128X:$src)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
@@ -11356,28 +11319,28 @@
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(i8 15)), sub_xmm)>;
- def : Pat<(xor VR256X:$src, (v32i8 immAllOnesV)),
+ def : Pat<(v32i8 (vnot VR256X:$src)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(i8 15)), sub_ymm)>;
- def : Pat<(xor VR256X:$src, (v16i16 immAllOnesV)),
+ def : Pat<(v16i16 (vnot VR256X:$src)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(i8 15)), sub_ymm)>;
- def : Pat<(xor VR256X:$src, (v8i32 immAllOnesV)),
+ def : Pat<(v8i32 (vnot VR256X:$src)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(i8 15)), sub_ymm)>;
- def : Pat<(xor VR256X:$src, (v4i64 immAllOnesV)),
+ def : Pat<(v4i64 (vnot VR256X:$src)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
@@ -11387,22 +11350,22 @@
}
let Predicates = [HasVLX] in {
- def : Pat<(xor VR128X:$src, (v16i8 immAllOnesV)),
+ def : Pat<(v16i8 (vnot VR128X:$src)),
(VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
- def : Pat<(xor VR128X:$src, (v8i16 immAllOnesV)),
+ def : Pat<(v8i16 (vnot VR128X:$src)),
(VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
- def : Pat<(xor VR128X:$src, (v4i32 immAllOnesV)),
+ def : Pat<(v4i32 (vnot VR128X:$src)),
(VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
- def : Pat<(xor VR128X:$src, (v2i64 immAllOnesV)),
+ def : Pat<(v2i64 (vnot VR128X:$src)),
(VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
- def : Pat<(xor VR256X:$src, (v32i8 immAllOnesV)),
+ def : Pat<(v32i8 (vnot VR256X:$src)),
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
- def : Pat<(xor VR256X:$src, (v16i16 immAllOnesV)),
+ def : Pat<(v16i16 (vnot VR256X:$src)),
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
- def : Pat<(xor VR256X:$src, (v8i32 immAllOnesV)),
+ def : Pat<(v8i32 (vnot VR256X:$src)),
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
- def : Pat<(xor VR256X:$src, (v4i64 immAllOnesV)),
+ def : Pat<(v4i64 (vnot VR256X:$src)),
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
}
@@ -11563,7 +11526,7 @@
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
-multiclass AVX512_scalar_math_fp_patterns<SDNode Op, SDNode MaskedOp,
+multiclass AVX512_scalar_math_fp_patterns<SDPatternOperator Op, SDNode MaskedOp,
string OpcPrefix, SDNode MoveNode,
X86VectorVTInfo _, PatLeaf ZeroFP> {
let Predicates = [HasAVX512] in {
@@ -11635,7 +11598,7 @@
defm : AVX512_scalar_math_fp_patterns<any_fmul, fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>;
defm : AVX512_scalar_math_fp_patterns<any_fdiv, fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>;
-multiclass AVX512_scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix,
+multiclass AVX512_scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix,
SDNode Move, X86VectorVTInfo _> {
let Predicates = [HasAVX512] in {
def : Pat<(_.VT (Move _.VT:$dst,
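
One recurring change in the AVX-512 file above is the removal of the X86pcmpm_commute/X86pcmpum_commute PatFrags: the load-folding patterns now apply X86pcmpm_imm_commute directly, swapping the VPCMP predicate immediate when the memory operand ends up on the left-hand side. As a reminder of what that swap does, here is a hedged sketch of the mapping (it is meant to mirror X86::getSwappedVPCMPImm; treat the exact table as an assumption rather than a quotation of the backend):

#include <cassert>
#include <cstdint>

// VPCMP{B,W,D,Q} predicate immediates: 0=EQ, 1=LT, 2=LE, 3=FALSE,
// 4=NE, 5=NLT, 6=NLE, 7=TRUE. Swapping the compare operands flips the
// ordered predicates and leaves the symmetric ones unchanged.
static uint8_t swappedVPCMPImm(uint8_t Imm) {
  switch (Imm) {
  case 1: return 6;  // LT  -> NLE (greater-than)
  case 2: return 5;  // LE  -> NLT (greater-or-equal)
  case 5: return 2;  // NLT -> LE
  case 6: return 1;  // NLE -> LT
  case 0: case 3: case 4: case 7:
    return Imm;      // EQ, FALSE, NE, TRUE are symmetric
  default:
    assert(false && "not a VPCMP predicate immediate");
    return Imm;
  }
}
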
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td b/src/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td
index e83e1e7..ba00e7d 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -708,6 +708,19 @@
mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
+// BinOpRM - Instructions like "adc reg, reg, [mem]".
+// There is an implicit register read at the end of the operand sequence.
+class BinOpRM_ImplicitUse<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
+ : ITy<opcode, MRMSrcMem, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched.Folded, sched.ReadAfterFold,
+ // base, scale, index, offset, segment.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // implicit register read.
+ sched.ReadAfterFold]>;
+
// BinOpRM_F - Instructions like "cmp reg, [mem]".
class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
@@ -725,7 +738,7 @@
// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]".
class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
- : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC,
+ : BinOpRM_ImplicitUse<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
EFLAGS))]>;
@@ -805,7 +818,11 @@
SDNode opnode>
: BinOpMR<opcode, mnemonic, typeinfo,
[(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
- (implicit EFLAGS)]>, Sched<[WriteALURMW]>;
+ (implicit EFLAGS)]>, Sched<[WriteALURMW,
+ // base, scale, index, offset, segment
+ ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault, ReadDefault,
+ WriteALU.ReadAfterFold]>; // reg
// BinOpMR_RMW_FF - Instructions like "adc [mem], reg".
class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -813,7 +830,12 @@
: BinOpMR<opcode, mnemonic, typeinfo,
[(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
addr:$dst),
- (implicit EFLAGS)]>, Sched<[WriteADCRMW]>;
+ (implicit EFLAGS)]>, Sched<[WriteADCRMW,
+ // base, scale, index, offset, segment
+ ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault, ReadDefault,
+ WriteALU.ReadAfterFold, // reg
+ WriteALU.ReadAfterFold]>; // EFLAGS
// BinOpMR_F - Instructions like "cmp [mem], reg".
class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td b/src/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td
index dc6361a..202d320 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -69,7 +69,7 @@
let SchedRW = [WriteSystem] in {
// x86-64 va_start lowering magic.
-let usesCustomInserter = 1, Defs = [EFLAGS] in {
+let hasSideEffects = 1, Defs = [EFLAGS] in {
def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
(outs),
(ins GR8:$al,
@@ -80,7 +80,9 @@
timm:$regsavefi,
timm:$offset),
(implicit EFLAGS)]>;
+}
+let usesCustomInserter = 1, Defs = [EFLAGS] in {
// The VAARG_64 and VAARG_X32 pseudo-instructions take the address of the
// va_list, and place the address of the next argument into a register.
let Defs = [EFLAGS] in {
@@ -930,10 +932,10 @@
defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", X86cas>;
// Atomic exchange and add
-multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
- string frag> {
- let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
- SchedRW = [WriteALURMW] in {
+multiclass ATOMIC_RMW_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
+ string frag> {
+ let Constraints = "$val = $dst", Defs = [EFLAGS], mayLoad = 1, mayStore = 1,
+ isCodeGenOnly = 1, SchedRW = [WriteALURMW] in {
def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
(ins GR8:$val, i8mem:$ptr),
!strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
@@ -962,7 +964,7 @@
}
}
-defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add">, TB, LOCK;
+defm LXADD : ATOMIC_RMW_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add">, TB, LOCK;
/* The following multiclass tries to make sure that in code like
* x.store (immediate op x.load(acquire), release)
@@ -1195,6 +1197,12 @@
def : Pat<(X86call (i64 texternalsym:$dst)),
(CALL64pcrel32 texternalsym:$dst)>;
+def : Pat<(X86call_rvmarker (timm:$sel), (i64 texternalsym:$dst)),
+ (CALL64pcrel32_RVMARKER timm:$sel, texternalsym:$dst)>;
+def : Pat<(X86call_rvmarker (timm:$sel), (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32_RVMARKER timm:$sel, tglobaladdr:$dst)>;
+
+
// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
// can never use callee-saved registers. That is the purpose of the GR64_TC
// register classes.
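
The ATOMIC_LOAD_BINOP to ATOMIC_RMW_BINOP rename above also makes the mayLoad/mayStore flags on the LXADD pseudos explicit. For context, these are the instructions selected for an atomic fetch-and-add whose result is actually used; a small standalone sketch in plain C++ (illustration only, nothing here comes from the patch):

#include <atomic>

// With the result consumed, x86 selects lock xadd (the LXADD* pseudos above);
// if the return value were discarded, a plain lock add would suffice.
int bump(std::atomic<int> &Counter) {
  return Counter.fetch_add(1, std::memory_order_seq_cst);
}
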
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrControl.td b/src/llvm-project/llvm/lib/Target/X86/X86InstrControl.td
index 4f78677..a6cb17f 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrControl.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrControl.td
@@ -206,7 +206,7 @@
}
// Loop instructions
-let SchedRW = [WriteJump] in {
+let isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>;
def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>;
def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>;
@@ -415,6 +415,22 @@
}
}
+let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
+ Uses = [RSP, SSP],
+ SchedRW = [WriteJump] in {
+ def CALL64m_RVMARKER :
+ PseudoI<(outs), (ins i32imm:$sel, i64mem:$dst), [(X86call_rvmarker timm:$sel, (loadi64 addr:$dst))]>,
+ Requires<[In64BitMode]>;
+
+ def CALL64r_RVMARKER :
+ PseudoI<(outs), (ins i32imm:$sel, GR64:$dst), [(X86call_rvmarker timm:$sel, GR64:$dst)]>,
+ Requires<[In64BitMode]>;
+
+ def CALL64pcrel32_RVMARKER :
+ PseudoI<(outs), (ins i32imm:$sel, i64i32imm_brtarget:$dst), []>,
+ Requires<[In64BitMode]>;
+}
+
// Conditional tail calls are similar to the above, but they are branches
// rather than barriers, and they use EFLAGS.
let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td b/src/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td
index f9be3a7..27328fe 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td
@@ -35,7 +35,7 @@
multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
- SDNode Op, X86FoldableSchedWrite sched> {
+ SDPatternOperator Op, X86FoldableSchedWrite sched> {
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
@@ -55,7 +55,7 @@
multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
- SDNode Op, X86FoldableSchedWrite sched> {
+ SDPatternOperator Op, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
@@ -75,7 +75,7 @@
multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
- SDNode Op, X86FoldableSchedWrite sched> {
+ SDPatternOperator Op, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
@@ -100,7 +100,7 @@
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy, string Suff,
PatFrag MemFrag128, PatFrag MemFrag256,
- SDNode Op, ValueType OpTy128, ValueType OpTy256,
+ SDPatternOperator Op, ValueType OpTy128, ValueType OpTy256,
X86SchedWriteWidths sched> {
defm NAME#213#Suff : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;
@@ -241,7 +241,7 @@
hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string Suff,
- SDNode OpNode, RegisterClass RC,
+ SDPatternOperator OpNode, RegisterClass RC,
X86MemOperand x86memop, X86FoldableSchedWrite sched> {
defm NAME#213#Suff : fma3s_rm_213<opc213, !strconcat(OpStr, "213", PackTy),
x86memop, RC, OpNode, sched>;
@@ -305,7 +305,7 @@
}
multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, SDNode OpNode, X86FoldableSchedWrite sched> {
+ string OpStr, SDPatternOperator OpNode, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle in
defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode,
FR32, f32mem, sched>,
@@ -329,7 +329,7 @@
defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86any_Fnmsub,
SchedWriteFMA.Scl>, VEX_LIG;
-multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
+multiclass scalar_fma_patterns<SDPatternOperator Op, string Prefix, string Suffix,
SDNode Move, ValueType VT, ValueType EltVT,
RegisterClass RC, PatFrag mem_frag> {
let Predicates = [HasFMA, NoAVX512] in {
@@ -388,7 +388,7 @@
let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
- X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
+ X86MemOperand x86memop, ValueType OpVT, SDPatternOperator OpNode,
PatFrag mem_frag, X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rr : FMA4S<opc, MRMSrcRegOp4, (outs RC:$dst),
@@ -463,7 +463,7 @@
}
let Uses = [MXCSR], mayRaiseFPException = 1 in
-multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass fma4p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
ValueType OpVT128, ValueType OpVT256,
PatFrag ld_frag128, PatFrag ld_frag256,
X86SchedWriteWidths sched> {
@@ -602,7 +602,7 @@
loadv2f64, loadv4f64, SchedWriteFMA>;
}
-multiclass scalar_fma4_patterns<SDNode Op, string Name,
+multiclass scalar_fma4_patterns<SDPatternOperator Op, string Name,
ValueType VT, ValueType EltVT,
RegisterClass RC, PatFrag mem_frag> {
let Predicates = [HasFMA4] in {
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td b/src/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td
index 961b4e5..cda28d1 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -24,6 +24,7 @@
def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDTX86CwdLoad : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -38,6 +39,9 @@
def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
[SDNPHasChain, SDNPMayStore, SDNPSideEffect,
SDNPMemOperand]>;
+def X86fp_cwd_set16 : SDNode<"X86ISD::FLDCW16m", SDTX86CwdLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
def X86fstf32 : PatFrag<(ops node:$val, node:$ptr),
(X86fst node:$val, node:$ptr), [{
@@ -168,7 +172,7 @@
FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
// Factoring for arithmetic.
-multiclass FPBinary_rr<SDNode OpNode> {
+multiclass FPBinary_rr<SDPatternOperator OpNode> {
// Register op register -> register
// These are separated out because they have no reversed form.
def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
@@ -181,7 +185,7 @@
// The FopST0 series are not included here because of the irregularities
// in where the 'r' goes in assembly output.
// These instructions cannot address 80-bit memory.
-multiclass FPBinary<SDNode OpNode, Format fp, string asmstring,
+multiclass FPBinary<SDPatternOperator OpNode, Format fp, string asmstring,
bit Forward = 1> {
// ST(0) = ST(0) + [mem]
def _Fp32m : FpIf32<(outs RFP32:$dst),
@@ -343,7 +347,7 @@
} // SchedRW
// Unary operations.
-multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> {
+multiclass FPUnary<SDPatternOperator OpNode, Format fp, string asmstring> {
def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
[(set RFP32:$dst, (OpNode RFP32:$src))]>;
def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
@@ -373,6 +377,13 @@
} // SchedRW
} // Uses = [FPCW], mayRaiseFPException = 1
+let SchedRW = [WriteFTest] in {
+def XAM_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
+def XAM_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
+def XAM_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
+def XAM_F : FPI<0xD9, MRM_E5, (outs), (ins), "fxam">;
+} // SchedRW
+
// Versions of FP instructions that take a single memory operand. Added for the
// disassembler; remove as they are included with patterns elsewhere.
let SchedRW = [WriteFComLd], Uses = [FPCW], mayRaiseFPException = 1,
@@ -705,7 +716,8 @@
} // SchedRW
let Defs = [FPSW,FPCW], mayLoad = 1 in
def FLDCW16m : I<0xD9, MRM5m, // X87 control word = [mem16]
- (outs), (ins i16mem:$dst), "fldcw\t$dst", []>,
+ (outs), (ins i16mem:$dst), "fldcw\t$dst",
+ [(X86fp_cwd_set16 addr:$dst)]>,
Sched<[WriteLoad]>;
// FPU control instructions
@@ -727,7 +739,6 @@
let SchedRW = [WriteMicrocoded] in {
let Defs = [FPSW] in {
def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>;
-def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", []>;
def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", []>;
def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", []>;
let Uses = [FPCW], mayRaiseFPException = 1 in {
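
The new X86fp_cwd_set16 node gives FLDCW16m a selection pattern, pairing with the LowerSET_ROUNDING hook declared near the top of this section: updating the dynamic rounding mode can now lower through an x87 control-word store/modify/reload. At the source level the behaviour this serves is ordinary <cfenv> rounding-mode control; a minimal sketch in plain ISO C++ (illustration only, not code from the patch; real uses would also want #pragma STDC FENV_ACCESS where supported):

#include <cfenv>
#include <cstdio>

// fesetround() is the portable entry point whose codegen reaches
// ISD::SET_ROUNDING and, on the x87 side, the FLDCW16m pattern above.
int main() {
  std::fesetround(FE_TOWARDZERO);
  std::printf("toward zero active: %d\n",
              std::fegetround() == FE_TOWARDZERO);
  std::fesetround(FE_TONEAREST);  // restore the default rounding mode
  return 0;
}
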
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td b/src/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td
index 686b19f..dba1372 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td
@@ -352,7 +352,8 @@
bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction?
bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.
bit ExplicitVEXPrefix = 0; // Force the instruction to use VEX encoding.
-
+ // Force a predicate check before compressing EVEX to VEX encoding.
+ bit checkVEXPredicate = 0;
// TSFlags layout should be kept in sync with X86BaseInfo.h.
let TSFlags{6-0} = FormBits;
let TSFlags{8-7} = OpSizeBits;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp b/src/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp
index d9bab14..12a2d92f 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -905,7 +905,7 @@
FrameIndex =
cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
->getFrameIndex();
- return 1;
+ return MI.getOperand(0).getReg();
}
}
return 0;
@@ -940,7 +940,7 @@
FrameIndex =
cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
->getFrameIndex();
- return 1;
+ return MI.getOperand(X86::AddrNumOperands).getReg();
}
}
return 0;
@@ -1006,6 +1006,7 @@
case X86::MOV64ri:
case X86::MOV64ri32:
case X86::MOV8ri:
+ case X86::PTILEZEROV:
return true;
case X86::MOV8rm:
@@ -2669,6 +2670,58 @@
return false;
}
+static bool isConvertibleLEA(MachineInstr *MI) {
+ unsigned Opcode = MI->getOpcode();
+ if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
+ Opcode != X86::LEA64_32r)
+ return false;
+
+ const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
+ const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
+ const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
+
+ if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
+ Scale.getImm() > 1)
+ return false;
+
+ return true;
+}
+
+bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
+ // Currently we're interested in the following sequence only.
+ // r3 = lea r1, r2
+ // r5 = add r3, r4
+ // Both r3 and r4 are killed in add, we hope the add instruction has the
+ // operand order
+ // r5 = add r4, r3
+ // So later in X86FixupLEAs the lea instruction can be rewritten as add.
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
+ return false;
+
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ Register Reg1 = MI.getOperand(1).getReg();
+ Register Reg2 = MI.getOperand(2).getReg();
+
+ // Check if Reg1 comes from LEA in the same MBB.
+ if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
+ if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
+ Commute = true;
+ return true;
+ }
+ }
+
+ // Check if Reg2 comes from LEA in the same MBB.
+ if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
+ if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
+ Commute = false;
+ return true;
+ }
+ }
+
+ return false;
+}
+
X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default: return X86::COND_INVALID;
@@ -3794,7 +3847,8 @@
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
- assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
"Stack slot too small for store");
if (RC->getID() == X86::TILERegClassID) {
unsigned Opc = X86::TILESTORED;
@@ -3808,15 +3862,11 @@
MachineOperand &MO = NewMI->getOperand(2);
MO.setReg(VirtReg);
MO.setIsKill(true);
- } else if (RC->getID() == X86::TILECFGRegClassID) {
- unsigned Opc = X86::PSTTILECFG;
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
- .addReg(SrcReg, getKillRegState(isKill));
} else {
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
- RI.canRealignStack(MF);
+ (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
.addReg(SrcReg, getKillRegState(isKill));
@@ -3840,16 +3890,13 @@
MachineOperand &MO = NewMI->getOperand(3);
MO.setReg(VirtReg);
MO.setIsKill(true);
- } else if (RC->getID() == X86::TILECFGRegClassID) {
- unsigned Opc = X86::PLDTILECFG;
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
- FrameIdx);
} else {
const MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
- RI.canRealignStack(MF);
+ (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
FrameIdx);
@@ -3977,8 +4024,10 @@
/// Check whether the definition can be converted
/// to remove a comparison against zero.
-inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
+inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
+ bool &ClearsOverflowFlag) {
NoSignFlag = false;
+ ClearsOverflowFlag = false;
switch (MI.getOpcode()) {
default: return false;
@@ -4013,21 +4062,6 @@
case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
- case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
- case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
- case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
- case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
- case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
- case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
- case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
- case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
- case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
- case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
- case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
- case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
- case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
- case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
- case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC32ri:
case X86::ADC32ri8: case X86::ADC16ri: case X86::ADC16ri8:
case X86::ADC8ri: case X86::ADC64rr: case X86::ADC32rr:
@@ -4042,16 +4076,6 @@
case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
- case X86::ANDN32rr: case X86::ANDN32rm:
- case X86::ANDN64rr: case X86::ANDN64rm:
- case X86::BLSI32rr: case X86::BLSI32rm:
- case X86::BLSI64rr: case X86::BLSI64rm:
- case X86::BLSMSK32rr:case X86::BLSMSK32rm:
- case X86::BLSMSK64rr:case X86::BLSMSK64rm:
- case X86::BLSR32rr: case X86::BLSR32rm:
- case X86::BLSR64rr: case X86::BLSR64rm:
- case X86::BZHI32rr: case X86::BZHI32rm:
- case X86::BZHI64rr: case X86::BZHI64rm:
case X86::LZCNT16rr: case X86::LZCNT16rm:
case X86::LZCNT32rr: case X86::LZCNT32rm:
case X86::LZCNT64rr: case X86::LZCNT64rm:
@@ -4061,6 +4085,30 @@
case X86::TZCNT16rr: case X86::TZCNT16rm:
case X86::TZCNT32rr: case X86::TZCNT32rm:
case X86::TZCNT64rr: case X86::TZCNT64rm:
+ return true;
+ case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
+ case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
+ case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
+ case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
+ case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
+ case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
+ case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
+ case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
+ case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
+ case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
+ case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
+ case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
+ case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
+ case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
+ case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
+ case X86::ANDN32rr: case X86::ANDN32rm:
+ case X86::ANDN64rr: case X86::ANDN64rm:
+ case X86::BLSI32rr: case X86::BLSI32rm:
+ case X86::BLSI64rr: case X86::BLSI64rm:
+ case X86::BLSMSK32rr: case X86::BLSMSK32rm:
+ case X86::BLSMSK64rr: case X86::BLSMSK64rm:
+ case X86::BLSR32rr: case X86::BLSR32rm:
+ case X86::BLSR64rr: case X86::BLSR64rm:
case X86::BLCFILL32rr: case X86::BLCFILL32rm:
case X86::BLCFILL64rr: case X86::BLCFILL64rm:
case X86::BLCI32rr: case X86::BLCI32rm:
@@ -4075,16 +4123,23 @@
case X86::BLSFILL64rr: case X86::BLSFILL64rm:
case X86::BLSIC32rr: case X86::BLSIC32rm:
case X86::BLSIC64rr: case X86::BLSIC64rm:
+ case X86::BZHI32rr: case X86::BZHI32rm:
+ case X86::BZHI64rr: case X86::BZHI64rm:
case X86::T1MSKC32rr: case X86::T1MSKC32rm:
case X86::T1MSKC64rr: case X86::T1MSKC64rm:
case X86::TZMSK32rr: case X86::TZMSK32rm:
case X86::TZMSK64rr: case X86::TZMSK64rm:
+ // These instructions clear the overflow flag just like TEST.
+ // FIXME: These are not the only instructions in this switch that clear the
+ // overflow flag.
+ ClearsOverflowFlag = true;
return true;
case X86::BEXTR32rr: case X86::BEXTR64rr:
case X86::BEXTR32rm: case X86::BEXTR64rm:
case X86::BEXTRI32ri: case X86::BEXTRI32mi:
case X86::BEXTRI64ri: case X86::BEXTRI64mi:
- // BEXTR doesn't update the sign flag so we can't use it.
+ // BEXTR doesn't update the sign flag so we can't use it. It does clear
+ // the overflow flag, but that's not useful without the sign flag.
NoSignFlag = true;
return true;
}
@@ -4179,6 +4234,8 @@
}
CmpInstr.setDesc(get(NewOpcode));
CmpInstr.RemoveOperand(0);
+ // Mutating this instruction invalidates any debug data associated with it.
+ CmpInstr.dropDebugNumber();
// Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
@@ -4204,8 +4261,9 @@
// right way.
bool ShouldUpdateCC = false;
bool NoSignFlag = false;
+ bool ClearsOverflowFlag = false;
X86::CondCode NewCC = X86::COND_INVALID;
- if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag)) {
+ if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag, ClearsOverflowFlag)) {
// Scan forward from the use until we hit the use we're looking for or the
// compare instruction.
for (MachineBasicBlock::iterator J = MI;; ++J) {
@@ -4317,11 +4375,15 @@
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
+ // CF is used, we can't perform this optimization.
+ return false;
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
case X86::COND_O: case X86::COND_NO:
- // CF and OF are used, we can't perform this optimization.
- return false;
+ // If OF is used, the instruction needs to clear it like CmpZero does.
+ if (!ClearsOverflowFlag)
+ return false;
+ break;
case X86::COND_S: case X86::COND_NS:
// If SF is used, but the instruction doesn't update the SF, then we
// can't do the optimization.
@@ -4490,7 +4552,7 @@
static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
bool MinusOne) {
MachineBasicBlock &MBB = *MIB->getParent();
- DebugLoc DL = MIB->getDebugLoc();
+ const DebugLoc &DL = MIB->getDebugLoc();
Register Reg = MIB.getReg(0);
// Insert the XOR.
@@ -4509,7 +4571,7 @@
const TargetInstrInfo &TII,
const X86Subtarget &Subtarget) {
MachineBasicBlock &MBB = *MIB->getParent();
- DebugLoc DL = MIB->getDebugLoc();
+ const DebugLoc &DL = MIB->getDebugLoc();
int64_t Imm = MIB->getOperand(1).getImm();
assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
MachineBasicBlock::iterator I = MIB.getInstr();
@@ -4566,7 +4628,7 @@
static void expandLoadStackGuard(MachineInstrBuilder &MIB,
const TargetInstrInfo &TII) {
MachineBasicBlock &MBB = *MIB->getParent();
- DebugLoc DL = MIB->getDebugLoc();
+ const DebugLoc &DL = MIB->getDebugLoc();
Register Reg = MIB.getReg(0);
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
@@ -5706,7 +5768,7 @@
Align Alignment = MFI.getObjectAlign(FrameIndex);
// If the function stack isn't realigned we don't want to fold instructions
// that need increased alignment.
- if (!RI.needsStackRealignment(MF))
+ if (!RI.hasStackRealignment(MF))
Alignment =
std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
@@ -6090,15 +6152,16 @@
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
- if (MF.getTarget().isPositionIndependent()) {
- if (Subtarget.is64Bit())
- PICBase = X86::RIP;
- else
- // FIXME: PICBase = getGlobalBaseReg(&MF);
- // This doesn't work for several reasons.
- // 1. GlobalBaseReg may have been spilled.
- // 2. It may not be live at MI.
- return nullptr;
+ // Since we're using Small or Kernel code model, we can always use
+ // RIP-relative addressing for a smaller encoding.
+ if (Subtarget.is64Bit()) {
+ PICBase = X86::RIP;
+ } else if (MF.getTarget().isPositionIndependent()) {
+ // FIXME: PICBase = getGlobalBaseReg(&MF);
+ // This doesn't work for several reasons.
+ // 1. GlobalBaseReg may have been spilled.
+ // 2. It may not be live at MI.
+ return nullptr;
}
// Create a constant-pool entry.
@@ -6348,7 +6411,7 @@
case X86::CMP8ri: {
MachineOperand &MO0 = DataMI->getOperand(0);
MachineOperand &MO1 = DataMI->getOperand(1);
- if (MO1.getImm() == 0) {
+ if (MO1.isImm() && MO1.getImm() == 0) {
unsigned NewOpc;
switch (DataMI->getOpcode()) {
default: llvm_unreachable("Unreachable!");
@@ -6788,7 +6851,8 @@
// ENDBR instructions should not be scheduled around.
unsigned Opcode = MI.getOpcode();
- if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32)
+ if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
+ Opcode == X86::LDTILECFG)
return true;
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
@@ -7713,8 +7777,10 @@
}
/// Return the noop instruction to use for a noop.
-void X86InstrInfo::getNoop(MCInst &NopInst) const {
- NopInst.setOpcode(X86::NOOP);
+MCInst X86InstrInfo::getNop() const {
+ MCInst Nop;
+ Nop.setOpcode(X86::NOOP);
+ return Nop;
}
bool X86InstrInfo::isHighLatencyDef(int opc) const {
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h b/src/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h
index d7d2370..c663bb3 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h
@@ -284,6 +284,10 @@
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
+ /// Returns true if we have preference on the operands order in MI, the
+ /// commute decision is returned in Commute.
+ bool hasCommutePreference(MachineInstr &MI, bool &Commute) const override;
+
/// Returns an adjusted FMA opcode that must be used in FMA instruction that
/// performs the same computations as the given \p MI but which has the
/// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
@@ -439,7 +443,7 @@
int64_t Offset2,
unsigned NumLoads) const override;
- void getNoop(MCInst &NopInst) const override;
+ MCInst getNop() const override;
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td b/src/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td
index b006d1d..34afedb 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td
@@ -204,6 +204,11 @@
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
+def X86call_rvmarker : SDNode<"X86ISD::CALL_RVMARKER", SDT_X86Call,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
+
+
def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
@@ -2587,7 +2592,7 @@
}
multiclass bmi_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
- X86MemOperand x86memop, Intrinsic Int,
+ X86MemOperand x86memop, SDNode Int,
PatFrag ld_frag, X86FoldableSchedWrite Sched> {
def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -2962,14 +2967,15 @@
//===----------------------------------------------------------------------===//
// SERIALIZE Instruction
//
-def SERIALIZE : I<0x01, MRM_E8, (outs), (ins), "serialize",
- [(int_x86_serialize)]>, PS,
- Requires<[HasSERIALIZE]>;
+let SchedRW = [WriteSystem] in
+ def SERIALIZE : I<0x01, MRM_E8, (outs), (ins), "serialize",
+ [(int_x86_serialize)]>, PS,
+ Requires<[HasSERIALIZE]>;
//===----------------------------------------------------------------------===//
// TSXLDTRK - TSX Suspend Load Address Tracking
//
-let Predicates = [HasTSXLDTRK] in {
+let Predicates = [HasTSXLDTRK], SchedRW = [WriteSystem] in {
def XSUSLDTRK : I<0x01, MRM_E8, (outs), (ins), "xsusldtrk",
[(int_x86_xsusldtrk)]>, XD;
def XRESLDTRK : I<0x01, MRM_E9, (outs), (ins), "xresldtrk",
@@ -2979,7 +2985,7 @@
//===----------------------------------------------------------------------===//
// UINTR Instructions
//
-let Predicates = [HasUINTR, In64BitMode] in {
+let Predicates = [HasUINTR, In64BitMode], SchedRW = [WriteSystem] in {
def UIRET : I<0x01, MRM_EC, (outs), (ins), "uiret",
[]>, XS;
def CLUI : I<0x01, MRM_EE, (outs), (ins), "clui",
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td b/src/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td
index a185a20..41fda60 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td
@@ -17,7 +17,7 @@
//===----------------------------------------------------------------------===//
/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
-multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
RegisterClass RC, X86MemOperand x86memop,
Domain d, X86FoldableSchedWrite sched,
bit Is2Addr = 1> {
@@ -63,7 +63,7 @@
}
/// sse12_fp_packed - SSE 1 & 2 packed instructions class
-multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
RegisterClass RC, ValueType vt,
X86MemOperand x86memop, PatFrag mem_frag,
Domain d, X86FoldableSchedWrite sched,
@@ -616,7 +616,7 @@
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//
-multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode,
+multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode,
string base_opc, string asm_opr> {
// No pattern as they need be special cased between high and low.
let hasSideEffects = 0, mayLoad = 1 in
@@ -811,7 +811,7 @@
//===----------------------------------------------------------------------===//
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
+ SDPatternOperator OpNode, X86MemOperand x86memop, PatFrag ld_frag,
string asm, string mem, X86FoldableSchedWrite sched,
Domain d,
SchedRead Int2Fpu = ReadDefault> {
@@ -1837,7 +1837,7 @@
}
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
-multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
+multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode,
ValueType vt, X86MemOperand x86memop,
PatFrag ld_frag, string OpcodeStr, Domain d,
X86FoldableSchedWrite sched = WriteFComX> {
@@ -2589,7 +2589,7 @@
/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86SchedWriteSizes sched> {
+ SDPatternOperator OpNode, X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
@@ -2618,7 +2618,7 @@
}
}
-multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
@@ -2732,10 +2732,10 @@
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
-multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
- ValueType VT, ValueType EltTy,
- RegisterClass RC, PatFrag ld_frag,
- Predicate BasePredicate> {
+multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix, SDNode Move,
+ ValueType VT, ValueType EltTy,
+ RegisterClass RC, PatFrag ld_frag,
+ Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
// extracted scalar math op with insert via movss/movsd
def : Pat<(VT (Move (VT VR128:$dst),
@@ -2791,7 +2791,7 @@
/// the HW instructions are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType ScalarVT, X86MemOperand x86memop,
- Operand intmemop, SDNode OpNode, Domain d,
+ Operand intmemop, SDPatternOperator OpNode, Domain d,
X86FoldableSchedWrite sched, Predicate target> {
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
@@ -2857,7 +2857,7 @@
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType ScalarVT, X86MemOperand x86memop,
- Operand intmemop, SDNode OpNode, Domain d,
+ Operand intmemop, SDPatternOperator OpNode, Domain d,
X86FoldableSchedWrite sched, Predicate target> {
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
@@ -2901,7 +2901,7 @@
}
/// sse1_fp_unop_p - SSE1 unops in packed form.
-multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86SchedWriteWidths sched, list<Predicate> prds> {
let Predicates = prds in {
def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -2938,7 +2938,7 @@
/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86SchedWriteWidths sched> {
+ SDPatternOperator OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX] in {
def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat("v", OpcodeStr,
@@ -2983,7 +2983,7 @@
XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
-multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32, f32mem,
ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
@@ -2992,7 +2992,7 @@
XS, VEX_4V, VEX_LIG, VEX_WIG;
}
-multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64, f64mem,
sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
@@ -3018,7 +3018,7 @@
// There is no f64 version of the reciprocal approximation instructions.
-multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
+multiclass scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix, SDNode Move,
ValueType VT, Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
def : Pat<(VT (Move VT:$dst, (scalar_to_vector
@@ -4011,7 +4011,15 @@
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
- VEX, VEX_WIG;
+ VEX, VEX_WIG, AdSize64;
+let Uses = [EDI], Predicates = [HasAVX,In64BitMode] in
+def VMASKMOVDQUX32 : VPDI<0xF7, MRMSrcReg, (outs),
+ (ins VR128:$src, VR128:$mask), "",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
+ VEX, VEX_WIG, AdSize32 {
+ let AsmString = "addr32 vmaskmovdqu\t{$mask, $src|$src, $mask}";
+ let AsmVariantName = "NonParsable";
+}
let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
@@ -4020,7 +4028,15 @@
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
+ AdSize64;
+let Uses = [EDI], Predicates = [UseSSE2,In64BitMode] in
+def MASKMOVDQUX32 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "addr32 maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
+ AdSize32 {
+ let AsmVariantName = "NonParsable";
+}
} // ExeDomain = SSEPackedInt
@@ -4944,7 +4960,7 @@
VEX, VEX_WIG;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
- VR256, VR128, WriteShuffle256>,
+ VR256, VR128, WriteVPMOV256>,
VEX, VEX_L, VEX_WIG;
}
@@ -5386,7 +5402,7 @@
multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
- ValueType VT, PatFrag mem_frag, SDNode OpNode,
+ ValueType VT, PatFrag mem_frag, SDPatternOperator OpNode,
X86FoldableSchedWrite sched> {
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
@@ -7166,7 +7182,8 @@
//===----------------------------------------------------------------------===//
// AVX_VNNI
//===----------------------------------------------------------------------===//
-let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in
+let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst",
+ ExplicitVEXPrefix = 1, checkVEXPredicate = 1 in
multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
bit IsCommutable> {
let isCommutable = IsCommutable in
@@ -7200,10 +7217,10 @@
VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
}
-defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix;
-defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix;
-defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>, ExplicitVEXPrefix;
-defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix;
+defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>;
+defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>;
+defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>;
+defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>;
def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
(X86vpmaddwd node:$lhs, node:$rhs), [{
@@ -7881,12 +7898,12 @@
(ins VR128:$src1, memop128:$src2, VR128:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- []>, VEX, Sched<[WriteLoad]>;
+ []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
(ins RC256:$src1, memop256:$src2, RC256:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- []>, VEX, VEX_L, Sched<[WriteLoad]>;
+ []>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
}
}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td b/src/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td
index eb87408..48c2705 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td
@@ -39,7 +39,8 @@
"ud1{q} {$src2, $src1|$src1, $src2}", []>, TB;
}
-def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>;
+let isTerminator = 1 in
+ def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>;
def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", []>, TB;
// Interrupt and SysCall Instructions.
@@ -47,15 +48,12 @@
def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>, Requires<[Not64BitMode]>;
def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>;
-} // SchedRW
def UBSAN_UD1 : PseudoI<(outs), (ins i32imm:$kind), [(ubsantrap (i32 timm:$kind))]>;
// The long form of "int $3" turns into int3 as a size optimization.
// FIXME: This doesn't work because InstAlias can't match immediate constants.
//def : InstAlias<"int\t$3", (INT3)>;
-let SchedRW = [WriteSystem] in {
-
def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap",
[(int_x86_int timm:$trap)]>;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/src/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 95655dd..40174a1 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -724,30 +724,34 @@
auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());
if (isa<LoadInst>(Inst)) {
- // Try to generate target-sized register(/instruction).
- decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
-
auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
- // Perform matrix-transposition in order to compute interleaved
- // results by generating some sort of (optimized) target-specific
- // instructions.
-
switch (NumSubVecElems) {
default:
return false;
case 4:
- transpose_4x4(DecomposedVectors, TransposedVectors);
- break;
case 8:
case 16:
case 32:
case 64:
- deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
- NumSubVecElems);
+ if (ShuffleTy->getNumElements() != NumSubVecElems)
+ return false;
break;
}
+ // Try to generate target-sized register(/instruction).
+ decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
+
+ // Perform matrix-transposition in order to compute interleaved
+ // results by generating some sort of (optimized) target-specific
+ // instructions.
+
+ if (NumSubVecElems == 4)
+ transpose_4x4(DecomposedVectors, TransposedVectors);
+ else
+ deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
+ NumSubVecElems);
+
// Now replace the unoptimized-interleaved-vectors with the
// transposed-interleaved vectors.
for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/src/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 72ab3e9..de2500b 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/src/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -324,9 +324,7 @@
* Find Intrinsic data by intrinsic ID
*/
static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) {
- const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithChain),
- std::end(IntrinsicsWithChain),
- IntNo);
+ const IntrinsicData *Data = lower_bound(IntrinsicsWithChain, IntNo);
if (Data != std::end(IntrinsicsWithChain) && Data->Id == IntNo)
return Data;
return nullptr;
@@ -1152,9 +1150,7 @@
* Return nullptr if intrinsic is not defined in the table.
*/
static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) {
- const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain),
- std::end(IntrinsicsWithoutChain),
- IntNo);
+ const IntrinsicData *Data = lower_bound(IntrinsicsWithoutChain, IntNo);
if (Data != std::end(IntrinsicsWithoutChain) && Data->Id == IntNo)
return Data;
return nullptr;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/src/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp
index 1b371ac..2fd7405 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp
@@ -32,26 +32,27 @@
/// In practice, not specifying those isn't a problem, and the below functions
/// should disappear quickly as we add support for legalizing non-power-of-2
/// sized types further.
-static void
-addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
- const LegalizerInfo::SizeAndActionsVec &v) {
+static void addAndInterleaveWithUnsupported(
+ LegacyLegalizerInfo::SizeAndActionsVec &result,
+ const LegacyLegalizerInfo::SizeAndActionsVec &v) {
for (unsigned i = 0; i < v.size(); ++i) {
result.push_back(v[i]);
if (i + 1 < v[i].first && i + 1 < v.size() &&
v[i + 1].first != v[i].first + 1)
- result.push_back({v[i].first + 1, Unsupported});
+ result.push_back({v[i].first + 1, LegacyLegalizeActions::Unsupported});
}
}
-static LegalizerInfo::SizeAndActionsVec
-widen_1(const LegalizerInfo::SizeAndActionsVec &v) {
+static LegacyLegalizerInfo::SizeAndActionsVec
+widen_1(const LegacyLegalizerInfo::SizeAndActionsVec &v) {
assert(v.size() >= 1);
assert(v[0].first > 1);
- LegalizerInfo::SizeAndActionsVec result = {{1, WidenScalar},
- {2, Unsupported}};
+ LegacyLegalizerInfo::SizeAndActionsVec result = {
+ {1, LegacyLegalizeActions::WidenScalar},
+ {2, LegacyLegalizeActions::Unsupported}};
addAndInterleaveWithUnsupported(result, v);
auto Largest = result.back().first;
- result.push_back({Largest + 1, Unsupported});
+ result.push_back({Largest + 1, LegacyLegalizeActions::Unsupported});
return result;
}
@@ -75,20 +76,23 @@
.minScalar(0, LLT::scalar(32))
.libcall();
- setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1);
+ auto &LegacyInfo = getLegacyLegalizerInfo();
+ LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1);
for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR})
- setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1);
+ LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1);
for (unsigned MemOp : {G_LOAD, G_STORE})
- setLegalizeScalarToDifferentSizeStrategy(MemOp, 0,
- narrowToSmallerAndWidenToSmallest);
- setLegalizeScalarToDifferentSizeStrategy(
- G_PTR_ADD, 1, widenToLargerTypesUnsupportedOtherwise);
- setLegalizeScalarToDifferentSizeStrategy(
- G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest);
+ LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(
+ MemOp, 0, LegacyLegalizerInfo::narrowToSmallerAndWidenToSmallest);
+ LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(
+ G_PTR_ADD, 1,
+ LegacyLegalizerInfo::widenToLargerTypesUnsupportedOtherwise);
+ LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(
+ G_CONSTANT, 0,
+ LegacyLegalizerInfo::widenToLargerTypesAndNarrowToLargest);
getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();
- computeTables();
+ LegacyInfo.computeTables();
verify(*STI.getInstrInfo());
}
@@ -107,35 +111,37 @@
const LLT s64 = LLT::scalar(64);
const LLT s128 = LLT::scalar(128);
+ auto &LegacyInfo = getLegacyLegalizerInfo();
+
for (auto Ty : {p0, s1, s8, s16, s32})
- setAction({G_IMPLICIT_DEF, Ty}, Legal);
+ LegacyInfo.setAction({G_IMPLICIT_DEF, Ty}, LegacyLegalizeActions::Legal);
for (auto Ty : {s8, s16, s32, p0})
- setAction({G_PHI, Ty}, Legal);
+ LegacyInfo.setAction({G_PHI, Ty}, LegacyLegalizeActions::Legal);
for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
for (auto Ty : {s8, s16, s32})
- setAction({BinOp, Ty}, Legal);
+ LegacyInfo.setAction({BinOp, Ty}, LegacyLegalizeActions::Legal);
for (unsigned Op : {G_UADDE}) {
- setAction({Op, s32}, Legal);
- setAction({Op, 1, s1}, Legal);
+ LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({Op, 1, s1}, LegacyLegalizeActions::Legal);
}
for (unsigned MemOp : {G_LOAD, G_STORE}) {
for (auto Ty : {s8, s16, s32, p0})
- setAction({MemOp, Ty}, Legal);
+ LegacyInfo.setAction({MemOp, Ty}, LegacyLegalizeActions::Legal);
// And everything's fine in addrspace 0.
- setAction({MemOp, 1, p0}, Legal);
+ LegacyInfo.setAction({MemOp, 1, p0}, LegacyLegalizeActions::Legal);
}
// Pointer-handling
- setAction({G_FRAME_INDEX, p0}, Legal);
- setAction({G_GLOBAL_VALUE, p0}, Legal);
+ LegacyInfo.setAction({G_FRAME_INDEX, p0}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_GLOBAL_VALUE, p0}, LegacyLegalizeActions::Legal);
- setAction({G_PTR_ADD, p0}, Legal);
- setAction({G_PTR_ADD, 1, s32}, Legal);
+ LegacyInfo.setAction({G_PTR_ADD, p0}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_PTR_ADD, 1, s32}, LegacyLegalizeActions::Legal);
if (!Subtarget.is64Bit()) {
getActionDefinitionsBuilder(G_PTRTOINT)
@@ -163,29 +169,31 @@
}
// Control-flow
- setAction({G_BRCOND, s1}, Legal);
+ LegacyInfo.setAction({G_BRCOND, s1}, LegacyLegalizeActions::Legal);
// Constants
for (auto Ty : {s8, s16, s32, p0})
- setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
+ LegacyInfo.setAction({TargetOpcode::G_CONSTANT, Ty},
+ LegacyLegalizeActions::Legal);
// Extensions
for (auto Ty : {s8, s16, s32}) {
- setAction({G_ZEXT, Ty}, Legal);
- setAction({G_SEXT, Ty}, Legal);
- setAction({G_ANYEXT, Ty}, Legal);
+ LegacyInfo.setAction({G_ZEXT, Ty}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_SEXT, Ty}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_ANYEXT, Ty}, LegacyLegalizeActions::Legal);
}
- setAction({G_ANYEXT, s128}, Legal);
+ LegacyInfo.setAction({G_ANYEXT, s128}, LegacyLegalizeActions::Legal);
getActionDefinitionsBuilder(G_SEXT_INREG).lower();
// Merge/Unmerge
for (const auto &Ty : {s16, s32, s64}) {
- setAction({G_MERGE_VALUES, Ty}, Legal);
- setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ LegacyInfo.setAction({G_MERGE_VALUES, Ty}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_UNMERGE_VALUES, 1, Ty},
+ LegacyLegalizeActions::Legal);
}
for (const auto &Ty : {s8, s16, s32}) {
- setAction({G_MERGE_VALUES, 1, Ty}, Legal);
- setAction({G_UNMERGE_VALUES, Ty}, Legal);
+ LegacyInfo.setAction({G_MERGE_VALUES, 1, Ty}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_UNMERGE_VALUES, Ty}, LegacyLegalizeActions::Legal);
}
}
@@ -202,21 +210,23 @@
const LLT s64 = LLT::scalar(64);
const LLT s128 = LLT::scalar(128);
- setAction({G_IMPLICIT_DEF, s64}, Legal);
+ auto &LegacyInfo = getLegacyLegalizerInfo();
+
+ LegacyInfo.setAction({G_IMPLICIT_DEF, s64}, LegacyLegalizeActions::Legal);
// Need to have that, as tryFoldImplicitDef will create this pattern:
// s128 = EXTEND (G_IMPLICIT_DEF s32/s64) -> s128 = G_IMPLICIT_DEF
- setAction({G_IMPLICIT_DEF, s128}, Legal);
+ LegacyInfo.setAction({G_IMPLICIT_DEF, s128}, LegacyLegalizeActions::Legal);
- setAction({G_PHI, s64}, Legal);
+ LegacyInfo.setAction({G_PHI, s64}, LegacyLegalizeActions::Legal);
for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
- setAction({BinOp, s64}, Legal);
+ LegacyInfo.setAction({BinOp, s64}, LegacyLegalizeActions::Legal);
for (unsigned MemOp : {G_LOAD, G_STORE})
- setAction({MemOp, s64}, Legal);
+ LegacyInfo.setAction({MemOp, s64}, LegacyLegalizeActions::Legal);
// Pointer-handling
- setAction({G_PTR_ADD, 1, s64}, Legal);
+ LegacyInfo.setAction({G_PTR_ADD, 1, s64}, LegacyLegalizeActions::Legal);
getActionDefinitionsBuilder(G_PTRTOINT)
.legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
.maxScalar(0, s64)
@@ -224,11 +234,12 @@
getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s64}});
// Constants
- setAction({TargetOpcode::G_CONSTANT, s64}, Legal);
+ LegacyInfo.setAction({TargetOpcode::G_CONSTANT, s64},
+ LegacyLegalizeActions::Legal);
// Extensions
for (unsigned extOp : {G_ZEXT, G_SEXT, G_ANYEXT}) {
- setAction({extOp, s64}, Legal);
+ LegacyInfo.setAction({extOp, s64}, LegacyLegalizeActions::Legal);
}
getActionDefinitionsBuilder(G_SITOFP)
@@ -270,10 +281,11 @@
.clampScalar(1, s8, s8);
// Merge/Unmerge
- setAction({G_MERGE_VALUES, s128}, Legal);
- setAction({G_UNMERGE_VALUES, 1, s128}, Legal);
- setAction({G_MERGE_VALUES, 1, s128}, Legal);
- setAction({G_UNMERGE_VALUES, s128}, Legal);
+ LegacyInfo.setAction({G_MERGE_VALUES, s128}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_UNMERGE_VALUES, 1, s128},
+ LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_MERGE_VALUES, 1, s128}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_UNMERGE_VALUES, s128}, LegacyLegalizeActions::Legal);
}
void X86LegalizerInfo::setLegalizerInfoSSE1() {
@@ -282,27 +294,31 @@
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
- const LLT v4s32 = LLT::vector(4, 32);
- const LLT v2s64 = LLT::vector(2, 64);
+ const LLT v4s32 = LLT::fixed_vector(4, 32);
+ const LLT v2s64 = LLT::fixed_vector(2, 64);
+
+ auto &LegacyInfo = getLegacyLegalizerInfo();
for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
for (auto Ty : {s32, v4s32})
- setAction({BinOp, Ty}, Legal);
+ LegacyInfo.setAction({BinOp, Ty}, LegacyLegalizeActions::Legal);
for (unsigned MemOp : {G_LOAD, G_STORE})
for (auto Ty : {v4s32, v2s64})
- setAction({MemOp, Ty}, Legal);
+ LegacyInfo.setAction({MemOp, Ty}, LegacyLegalizeActions::Legal);
// Constants
- setAction({TargetOpcode::G_FCONSTANT, s32}, Legal);
+ LegacyInfo.setAction({TargetOpcode::G_FCONSTANT, s32},
+ LegacyLegalizeActions::Legal);
// Merge/Unmerge
for (const auto &Ty : {v4s32, v2s64}) {
- setAction({G_CONCAT_VECTORS, Ty}, Legal);
- setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ LegacyInfo.setAction({G_CONCAT_VECTORS, Ty}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_UNMERGE_VALUES, 1, Ty},
+ LegacyLegalizeActions::Legal);
}
- setAction({G_MERGE_VALUES, 1, s64}, Legal);
- setAction({G_UNMERGE_VALUES, s64}, Legal);
+ LegacyInfo.setAction({G_MERGE_VALUES, 1, s64}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_UNMERGE_VALUES, s64}, LegacyLegalizeActions::Legal);
}
void X86LegalizerInfo::setLegalizerInfoSSE2() {
@@ -311,44 +327,49 @@
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
- const LLT v16s8 = LLT::vector(16, 8);
- const LLT v8s16 = LLT::vector(8, 16);
- const LLT v4s32 = LLT::vector(4, 32);
- const LLT v2s64 = LLT::vector(2, 64);
+ const LLT v16s8 = LLT::fixed_vector(16, 8);
+ const LLT v8s16 = LLT::fixed_vector(8, 16);
+ const LLT v4s32 = LLT::fixed_vector(4, 32);
+ const LLT v2s64 = LLT::fixed_vector(2, 64);
- const LLT v32s8 = LLT::vector(32, 8);
- const LLT v16s16 = LLT::vector(16, 16);
- const LLT v8s32 = LLT::vector(8, 32);
- const LLT v4s64 = LLT::vector(4, 64);
+ const LLT v32s8 = LLT::fixed_vector(32, 8);
+ const LLT v16s16 = LLT::fixed_vector(16, 16);
+ const LLT v8s32 = LLT::fixed_vector(8, 32);
+ const LLT v4s64 = LLT::fixed_vector(4, 64);
+
+ auto &LegacyInfo = getLegacyLegalizerInfo();
for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
for (auto Ty : {s64, v2s64})
- setAction({BinOp, Ty}, Legal);
+ LegacyInfo.setAction({BinOp, Ty}, LegacyLegalizeActions::Legal);
for (unsigned BinOp : {G_ADD, G_SUB})
for (auto Ty : {v16s8, v8s16, v4s32, v2s64})
- setAction({BinOp, Ty}, Legal);
+ LegacyInfo.setAction({BinOp, Ty}, LegacyLegalizeActions::Legal);
- setAction({G_MUL, v8s16}, Legal);
+ LegacyInfo.setAction({G_MUL, v8s16}, LegacyLegalizeActions::Legal);
- setAction({G_FPEXT, s64}, Legal);
- setAction({G_FPEXT, 1, s32}, Legal);
+ LegacyInfo.setAction({G_FPEXT, s64}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_FPEXT, 1, s32}, LegacyLegalizeActions::Legal);
- setAction({G_FPTRUNC, s32}, Legal);
- setAction({G_FPTRUNC, 1, s64}, Legal);
+ LegacyInfo.setAction({G_FPTRUNC, s32}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_FPTRUNC, 1, s64}, LegacyLegalizeActions::Legal);
// Constants
- setAction({TargetOpcode::G_FCONSTANT, s64}, Legal);
+ LegacyInfo.setAction({TargetOpcode::G_FCONSTANT, s64},
+ LegacyLegalizeActions::Legal);
// Merge/Unmerge
for (const auto &Ty :
{v16s8, v32s8, v8s16, v16s16, v4s32, v8s32, v2s64, v4s64}) {
- setAction({G_CONCAT_VECTORS, Ty}, Legal);
- setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ LegacyInfo.setAction({G_CONCAT_VECTORS, Ty}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_UNMERGE_VALUES, 1, Ty},
+ LegacyLegalizeActions::Legal);
}
for (const auto &Ty : {v16s8, v8s16, v4s32, v2s64}) {
- setAction({G_CONCAT_VECTORS, 1, Ty}, Legal);
- setAction({G_UNMERGE_VALUES, Ty}, Legal);
+ LegacyInfo.setAction({G_CONCAT_VECTORS, 1, Ty},
+ LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_UNMERGE_VALUES, Ty}, LegacyLegalizeActions::Legal);
}
}
@@ -356,51 +377,57 @@
if (!Subtarget.hasSSE41())
return;
- const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v4s32 = LLT::fixed_vector(4, 32);
- setAction({G_MUL, v4s32}, Legal);
+ auto &LegacyInfo = getLegacyLegalizerInfo();
+
+ LegacyInfo.setAction({G_MUL, v4s32}, LegacyLegalizeActions::Legal);
}
void X86LegalizerInfo::setLegalizerInfoAVX() {
if (!Subtarget.hasAVX())
return;
- const LLT v16s8 = LLT::vector(16, 8);
- const LLT v8s16 = LLT::vector(8, 16);
- const LLT v4s32 = LLT::vector(4, 32);
- const LLT v2s64 = LLT::vector(2, 64);
+ const LLT v16s8 = LLT::fixed_vector(16, 8);
+ const LLT v8s16 = LLT::fixed_vector(8, 16);
+ const LLT v4s32 = LLT::fixed_vector(4, 32);
+ const LLT v2s64 = LLT::fixed_vector(2, 64);
- const LLT v32s8 = LLT::vector(32, 8);
- const LLT v64s8 = LLT::vector(64, 8);
- const LLT v16s16 = LLT::vector(16, 16);
- const LLT v32s16 = LLT::vector(32, 16);
- const LLT v8s32 = LLT::vector(8, 32);
- const LLT v16s32 = LLT::vector(16, 32);
- const LLT v4s64 = LLT::vector(4, 64);
- const LLT v8s64 = LLT::vector(8, 64);
+ const LLT v32s8 = LLT::fixed_vector(32, 8);
+ const LLT v64s8 = LLT::fixed_vector(64, 8);
+ const LLT v16s16 = LLT::fixed_vector(16, 16);
+ const LLT v32s16 = LLT::fixed_vector(32, 16);
+ const LLT v8s32 = LLT::fixed_vector(8, 32);
+ const LLT v16s32 = LLT::fixed_vector(16, 32);
+ const LLT v4s64 = LLT::fixed_vector(4, 64);
+ const LLT v8s64 = LLT::fixed_vector(8, 64);
+
+ auto &LegacyInfo = getLegacyLegalizerInfo();
for (unsigned MemOp : {G_LOAD, G_STORE})
for (auto Ty : {v8s32, v4s64})
- setAction({MemOp, Ty}, Legal);
+ LegacyInfo.setAction({MemOp, Ty}, LegacyLegalizeActions::Legal);
for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) {
- setAction({G_INSERT, Ty}, Legal);
- setAction({G_EXTRACT, 1, Ty}, Legal);
+ LegacyInfo.setAction({G_INSERT, Ty}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_EXTRACT, 1, Ty}, LegacyLegalizeActions::Legal);
}
for (auto Ty : {v16s8, v8s16, v4s32, v2s64}) {
- setAction({G_INSERT, 1, Ty}, Legal);
- setAction({G_EXTRACT, Ty}, Legal);
+ LegacyInfo.setAction({G_INSERT, 1, Ty}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_EXTRACT, Ty}, LegacyLegalizeActions::Legal);
}
// Merge/Unmerge
for (const auto &Ty :
{v32s8, v64s8, v16s16, v32s16, v8s32, v16s32, v4s64, v8s64}) {
- setAction({G_CONCAT_VECTORS, Ty}, Legal);
- setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ LegacyInfo.setAction({G_CONCAT_VECTORS, Ty}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_UNMERGE_VALUES, 1, Ty},
+ LegacyLegalizeActions::Legal);
}
for (const auto &Ty :
{v16s8, v32s8, v8s16, v16s16, v4s32, v8s32, v2s64, v4s64}) {
- setAction({G_CONCAT_VECTORS, 1, Ty}, Legal);
- setAction({G_UNMERGE_VALUES, Ty}, Legal);
+ LegacyInfo.setAction({G_CONCAT_VECTORS, 1, Ty},
+ LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_UNMERGE_VALUES, Ty}, LegacyLegalizeActions::Legal);
}
}
@@ -408,31 +435,35 @@
if (!Subtarget.hasAVX2())
return;
- const LLT v32s8 = LLT::vector(32, 8);
- const LLT v16s16 = LLT::vector(16, 16);
- const LLT v8s32 = LLT::vector(8, 32);
- const LLT v4s64 = LLT::vector(4, 64);
+ const LLT v32s8 = LLT::fixed_vector(32, 8);
+ const LLT v16s16 = LLT::fixed_vector(16, 16);
+ const LLT v8s32 = LLT::fixed_vector(8, 32);
+ const LLT v4s64 = LLT::fixed_vector(4, 64);
- const LLT v64s8 = LLT::vector(64, 8);
- const LLT v32s16 = LLT::vector(32, 16);
- const LLT v16s32 = LLT::vector(16, 32);
- const LLT v8s64 = LLT::vector(8, 64);
+ const LLT v64s8 = LLT::fixed_vector(64, 8);
+ const LLT v32s16 = LLT::fixed_vector(32, 16);
+ const LLT v16s32 = LLT::fixed_vector(16, 32);
+ const LLT v8s64 = LLT::fixed_vector(8, 64);
+
+ auto &LegacyInfo = getLegacyLegalizerInfo();
for (unsigned BinOp : {G_ADD, G_SUB})
for (auto Ty : {v32s8, v16s16, v8s32, v4s64})
- setAction({BinOp, Ty}, Legal);
+ LegacyInfo.setAction({BinOp, Ty}, LegacyLegalizeActions::Legal);
for (auto Ty : {v16s16, v8s32})
- setAction({G_MUL, Ty}, Legal);
+ LegacyInfo.setAction({G_MUL, Ty}, LegacyLegalizeActions::Legal);
// Merge/Unmerge
for (const auto &Ty : {v64s8, v32s16, v16s32, v8s64}) {
- setAction({G_CONCAT_VECTORS, Ty}, Legal);
- setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ LegacyInfo.setAction({G_CONCAT_VECTORS, Ty}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_UNMERGE_VALUES, 1, Ty},
+ LegacyLegalizeActions::Legal);
}
for (const auto &Ty : {v32s8, v16s16, v8s32, v4s64}) {
- setAction({G_CONCAT_VECTORS, 1, Ty}, Legal);
- setAction({G_UNMERGE_VALUES, Ty}, Legal);
+ LegacyInfo.setAction({G_CONCAT_VECTORS, 1, Ty},
+ LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_UNMERGE_VALUES, Ty}, LegacyLegalizeActions::Legal);
}
}
@@ -440,38 +471,40 @@
if (!Subtarget.hasAVX512())
return;
- const LLT v16s8 = LLT::vector(16, 8);
- const LLT v8s16 = LLT::vector(8, 16);
- const LLT v4s32 = LLT::vector(4, 32);
- const LLT v2s64 = LLT::vector(2, 64);
+ const LLT v16s8 = LLT::fixed_vector(16, 8);
+ const LLT v8s16 = LLT::fixed_vector(8, 16);
+ const LLT v4s32 = LLT::fixed_vector(4, 32);
+ const LLT v2s64 = LLT::fixed_vector(2, 64);
- const LLT v32s8 = LLT::vector(32, 8);
- const LLT v16s16 = LLT::vector(16, 16);
- const LLT v8s32 = LLT::vector(8, 32);
- const LLT v4s64 = LLT::vector(4, 64);
+ const LLT v32s8 = LLT::fixed_vector(32, 8);
+ const LLT v16s16 = LLT::fixed_vector(16, 16);
+ const LLT v8s32 = LLT::fixed_vector(8, 32);
+ const LLT v4s64 = LLT::fixed_vector(4, 64);
- const LLT v64s8 = LLT::vector(64, 8);
- const LLT v32s16 = LLT::vector(32, 16);
- const LLT v16s32 = LLT::vector(16, 32);
- const LLT v8s64 = LLT::vector(8, 64);
+ const LLT v64s8 = LLT::fixed_vector(64, 8);
+ const LLT v32s16 = LLT::fixed_vector(32, 16);
+ const LLT v16s32 = LLT::fixed_vector(16, 32);
+ const LLT v8s64 = LLT::fixed_vector(8, 64);
+
+ auto &LegacyInfo = getLegacyLegalizerInfo();
for (unsigned BinOp : {G_ADD, G_SUB})
for (auto Ty : {v16s32, v8s64})
- setAction({BinOp, Ty}, Legal);
+ LegacyInfo.setAction({BinOp, Ty}, LegacyLegalizeActions::Legal);
- setAction({G_MUL, v16s32}, Legal);
+ LegacyInfo.setAction({G_MUL, v16s32}, LegacyLegalizeActions::Legal);
for (unsigned MemOp : {G_LOAD, G_STORE})
for (auto Ty : {v16s32, v8s64})
- setAction({MemOp, Ty}, Legal);
+ LegacyInfo.setAction({MemOp, Ty}, LegacyLegalizeActions::Legal);
for (auto Ty : {v64s8, v32s16, v16s32, v8s64}) {
- setAction({G_INSERT, Ty}, Legal);
- setAction({G_EXTRACT, 1, Ty}, Legal);
+ LegacyInfo.setAction({G_INSERT, Ty}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_EXTRACT, 1, Ty}, LegacyLegalizeActions::Legal);
}
for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64}) {
- setAction({G_INSERT, 1, Ty}, Legal);
- setAction({G_EXTRACT, Ty}, Legal);
+ LegacyInfo.setAction({G_INSERT, 1, Ty}, LegacyLegalizeActions::Legal);
+ LegacyInfo.setAction({G_EXTRACT, Ty}, LegacyLegalizeActions::Legal);
}
/************ VLX *******************/
@@ -479,48 +512,52 @@
return;
for (auto Ty : {v4s32, v8s32})
- setAction({G_MUL, Ty}, Legal);
+ LegacyInfo.setAction({G_MUL, Ty}, LegacyLegalizeActions::Legal);
}
void X86LegalizerInfo::setLegalizerInfoAVX512DQ() {
if (!(Subtarget.hasAVX512() && Subtarget.hasDQI()))
return;
- const LLT v8s64 = LLT::vector(8, 64);
+ const LLT v8s64 = LLT::fixed_vector(8, 64);
- setAction({G_MUL, v8s64}, Legal);
+ auto &LegacyInfo = getLegacyLegalizerInfo();
+
+ LegacyInfo.setAction({G_MUL, v8s64}, LegacyLegalizeActions::Legal);
/************ VLX *******************/
if (!Subtarget.hasVLX())
return;
- const LLT v2s64 = LLT::vector(2, 64);
- const LLT v4s64 = LLT::vector(4, 64);
+ const LLT v2s64 = LLT::fixed_vector(2, 64);
+ const LLT v4s64 = LLT::fixed_vector(4, 64);
for (auto Ty : {v2s64, v4s64})
- setAction({G_MUL, Ty}, Legal);
+ LegacyInfo.setAction({G_MUL, Ty}, LegacyLegalizeActions::Legal);
}
void X86LegalizerInfo::setLegalizerInfoAVX512BW() {
if (!(Subtarget.hasAVX512() && Subtarget.hasBWI()))
return;
- const LLT v64s8 = LLT::vector(64, 8);
- const LLT v32s16 = LLT::vector(32, 16);
+ const LLT v64s8 = LLT::fixed_vector(64, 8);
+ const LLT v32s16 = LLT::fixed_vector(32, 16);
+
+ auto &LegacyInfo = getLegacyLegalizerInfo();
for (unsigned BinOp : {G_ADD, G_SUB})
for (auto Ty : {v64s8, v32s16})
- setAction({BinOp, Ty}, Legal);
+ LegacyInfo.setAction({BinOp, Ty}, LegacyLegalizeActions::Legal);
- setAction({G_MUL, v32s16}, Legal);
+ LegacyInfo.setAction({G_MUL, v32s16}, LegacyLegalizeActions::Legal);
/************ VLX *******************/
if (!Subtarget.hasVLX())
return;
- const LLT v8s16 = LLT::vector(8, 16);
- const LLT v16s16 = LLT::vector(16, 16);
+ const LLT v8s16 = LLT::fixed_vector(8, 16);
+ const LLT v16s16 = LLT::fixed_vector(16, 16);
for (auto Ty : {v8s16, v16s16})
- setAction({G_MUL, Ty}, Legal);
+ LegacyInfo.setAction({G_MUL, Ty}, LegacyLegalizeActions::Legal);
}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/src/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
index 810fee0..165533e 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -372,9 +372,9 @@
auto Use = DFG.addr<UseNode *>(UseID);
if (Use.Addr->getFlags() & NodeAttrs::PhiRef) { // phi node
NodeAddr<PhiNode *> Phi = Use.Addr->getOwner(DFG);
- for (auto I : L.getRealUses(Phi.Id)) {
+ for (const auto& I : L.getRealUses(Phi.Id)) {
if (DFG.getPRI().alias(RegisterRef(I.first), DefReg)) {
- for (auto UA : I.second)
+ for (const auto &UA : I.second)
Uses.emplace(UA.first);
}
}
@@ -417,7 +417,7 @@
// Check whether the use propagates to more defs.
NodeAddr<InstrNode *> Owner{Use.Addr->getOwner(DFG)};
rdf::NodeList AnalyzedChildDefs;
- for (auto &ChildDef :
+ for (const auto &ChildDef :
Owner.Addr->members_if(DataFlowGraph::IsDef, DFG)) {
if (!DefsVisited.insert(ChildDef.Id).second)
continue; // Already visited this def
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/src/llvm-project/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
new file mode 100644
index 0000000..248069f
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -0,0 +1,679 @@
+//===-- X86LowerAMXIntrinsics.cpp -X86 Scalarize AMX Intrinsics------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to transform amx intrinsics to scalar operations.
+/// This pass is always enabled, but it is skipped unless the function is
+/// compiled at -O0 or carries the optnone attribute. With -O0 or optnone,
+/// the defs of the shapes are close to the amx intrinsics that use them, and
+/// we are not able to find a point that post-dominates all the shape defs and
+/// dominates all the amx intrinsics. To decouple the dependency on the shape,
+/// we transform the amx intrinsics into scalar operations so that compilation
+/// doesn't fail. In the long term, we should improve fast register allocation
+/// to allocate amx registers.
+//
+#include "X86.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "lower-amx-intrinsics"
+
+#ifndef NDEBUG
+static bool isV256I32Ty(Type *Ty) {
+ if (auto *FVT = dyn_cast<FixedVectorType>(Ty))
+ return FVT->getNumElements() == 256 &&
+ FVT->getElementType()->isIntegerTy(32);
+ return false;
+}
+#endif
+
+static cl::opt<bool>
+ X86ScalarizeAMX("enable-x86-scalar-amx", cl::init(false), cl::Hidden,
+ cl::desc("X86: enable AMX scalarization."));
+
+namespace {
+class X86LowerAMXIntrinsics {
+ Function &Func;
+
+public:
+ X86LowerAMXIntrinsics(Function &F, DomTreeUpdater &DomTU, LoopInfo *LoopI)
+ : Func(F), DTU(DomTU), LI(LoopI) {}
+ bool visit();
+
+private:
+ DomTreeUpdater &DTU;
+ LoopInfo *LI;
+ BasicBlock *createLoop(BasicBlock *Preheader, BasicBlock *Exit, Value *Bound,
+ Value *Step, StringRef Name, IRBuilderBase &B,
+ Loop *L);
+ template <bool IsTileLoad>
+ Value *createTileLoadStoreLoops(BasicBlock *Start, BasicBlock *End,
+ IRBuilderBase &B, Value *Row, Value *Col,
+ Value *Ptr, Value *Stride, Value *Tile);
+ template <Intrinsic::ID IntrID>
+ typename std::enable_if<IntrID == Intrinsic::x86_tdpbssd_internal ||
+ IntrID == Intrinsic::x86_tdpbsud_internal ||
+ IntrID == Intrinsic::x86_tdpbusd_internal ||
+ IntrID == Intrinsic::x86_tdpbuud_internal ||
+ IntrID == Intrinsic::x86_tdpbf16ps_internal,
+ Value *>::type
+ createTileDPLoops(BasicBlock *Start, BasicBlock *End, IRBuilderBase &B,
+ Value *Row, Value *Col, Value *K, Value *Acc, Value *LHS,
+ Value *RHS);
+ template <bool IsTileLoad>
+ bool lowerTileLoadStore(Instruction *TileLoadStore);
+ template <Intrinsic::ID IntrID>
+ typename std::enable_if<IntrID == Intrinsic::x86_tdpbssd_internal ||
+ IntrID == Intrinsic::x86_tdpbsud_internal ||
+ IntrID == Intrinsic::x86_tdpbusd_internal ||
+ IntrID == Intrinsic::x86_tdpbuud_internal ||
+ IntrID == Intrinsic::x86_tdpbf16ps_internal,
+ bool>::type
+ lowerTileDP(Instruction *TileDP);
+ bool lowerTileZero(Instruction *TileZero);
+};
+} // anonymous namespace
+
+BasicBlock *X86LowerAMXIntrinsics::createLoop(BasicBlock *Preheader,
+ BasicBlock *Exit, Value *Bound,
+ Value *Step, StringRef Name,
+ IRBuilderBase &B, Loop *L) {
+ LLVMContext &Ctx = Preheader->getContext();
+ BasicBlock *Header =
+ BasicBlock::Create(Ctx, Name + ".header", Preheader->getParent(), Exit);
+ BasicBlock *Body =
+ BasicBlock::Create(Ctx, Name + ".body", Header->getParent(), Exit);
+ BasicBlock *Latch =
+ BasicBlock::Create(Ctx, Name + ".latch", Header->getParent(), Exit);
+
+ Type *I16Ty = Type::getInt16Ty(Ctx);
+ BranchInst::Create(Body, Header);
+ BranchInst::Create(Latch, Body);
+ PHINode *IV =
+ PHINode::Create(I16Ty, 2, Name + ".iv", Header->getTerminator());
+ IV->addIncoming(ConstantInt::get(I16Ty, 0), Preheader);
+
+ B.SetInsertPoint(Latch);
+ Value *Inc = B.CreateAdd(IV, Step, Name + ".step");
+ Value *Cond = B.CreateICmpNE(Inc, Bound, Name + ".cond");
+ BranchInst::Create(Header, Exit, Cond, Latch);
+ IV->addIncoming(Inc, Latch);
+
+ BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
+ BasicBlock *Tmp = PreheaderBr->getSuccessor(0);
+ PreheaderBr->setSuccessor(0, Header);
+ DTU.applyUpdatesPermissive({
+ {DominatorTree::Delete, Preheader, Tmp},
+ {DominatorTree::Insert, Header, Body},
+ {DominatorTree::Insert, Body, Latch},
+ {DominatorTree::Insert, Latch, Header},
+ {DominatorTree::Insert, Latch, Exit},
+ {DominatorTree::Insert, Preheader, Header},
+ });
+ if (LI) {
+ L->addBasicBlockToLoop(Header, *LI);
+ L->addBasicBlockToLoop(Body, *LI);
+ L->addBasicBlockToLoop(Latch, *LI);
+ }
+ return Body;
+}
+
+template <bool IsTileLoad>
+Value *X86LowerAMXIntrinsics::createTileLoadStoreLoops(
+ BasicBlock *Start, BasicBlock *End, IRBuilderBase &B, Value *Row,
+ Value *Col, Value *Ptr, Value *Stride, Value *Tile) {
+ std::string IntrinName = IsTileLoad ? "tileload" : "tilestore";
+ Loop *RowLoop = nullptr;
+ Loop *ColLoop = nullptr;
+ if (LI) {
+ RowLoop = LI->AllocateLoop();
+ ColLoop = LI->AllocateLoop();
+ RowLoop->addChildLoop(ColLoop);
+ if (Loop *ParentL = LI->getLoopFor(Start))
+ ParentL->addChildLoop(RowLoop);
+ else
+ LI->addTopLevelLoop(RowLoop);
+ }
+
+ BasicBlock *RowBody = createLoop(Start, End, Row, B.getInt16(1),
+ IntrinName + ".scalarize.rows", B, RowLoop);
+ BasicBlock *RowLatch = RowBody->getSingleSuccessor();
+
+ BasicBlock *ColBody = createLoop(RowBody, RowLatch, Col, B.getInt16(1),
+ IntrinName + ".scalarize.cols", B, ColLoop);
+
+ BasicBlock *ColLoopLatch = ColBody->getSingleSuccessor();
+ BasicBlock *ColLoopHeader = ColBody->getSinglePredecessor();
+ BasicBlock *RowLoopHeader = RowBody->getSinglePredecessor();
+ Value *CurrentRow = &*RowLoopHeader->begin();
+ Value *CurrentCol = &*ColLoopHeader->begin();
+ Type *EltTy = B.getInt32Ty();
+ FixedVectorType *V256I32Ty = FixedVectorType::get(EltTy, 256);
+
+ // Common part for tileload and tilestore
+ // *.scalarize.cols.body:
+ // Calculate %idxmem and %idxvec
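+  // With the i32 element type used below, %idxmem = %row.iv * %stride +
+  // %col.iv addresses the element in memory (the caller has already scaled
+  // the stride to dword units), while %idxvec = %row.iv * 16 + %col.iv
+  // indexes the tile viewed as a <256 x i32> vector of 16 rows by 16 dwords.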
+ B.SetInsertPoint(ColBody->getTerminator());
+ Value *CurrentRowZExt = B.CreateZExt(CurrentRow, Stride->getType());
+ Value *CurrentColZExt = B.CreateZExt(CurrentCol, Stride->getType());
+ Value *Offset =
+ B.CreateAdd(B.CreateMul(CurrentRowZExt, Stride), CurrentColZExt);
+ unsigned AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ Value *EltBasePtr = B.CreatePointerCast(Ptr, PointerType::get(EltTy, AS));
+ Value *EltPtr = B.CreateGEP(EltTy, EltBasePtr, Offset);
+ Value *Idx = B.CreateAdd(B.CreateMul(CurrentRow, B.getInt16(16)), CurrentCol);
+ if (IsTileLoad) {
+ // tileload.scalarize.rows.header:
+ // %vec.phi.row = phi <256 x i32> [ zeroinitializer, %entry ], [ %ResVec,
+ // %tileload.scalarize.rows.latch ]
+ B.SetInsertPoint(RowLoopHeader->getTerminator());
+ Value *VecZero = Constant::getNullValue(V256I32Ty);
+ PHINode *VecCPhiRowLoop = B.CreatePHI(V256I32Ty, 2, "vec.phi.row");
+ VecCPhiRowLoop->addIncoming(VecZero, Start);
+
+ // tileload.scalarize.cols.header:
+ // %vec.phi = phi <256 x i32> [ %vec.phi.row, %tileload.scalarize.rows.body
+ // ], [ %ResVec, %tileload.scalarize.cols.latch ]
+ B.SetInsertPoint(ColLoopHeader->getTerminator());
+ PHINode *VecPhi = B.CreatePHI(V256I32Ty, 2, "vec.phi");
+ VecPhi->addIncoming(VecCPhiRowLoop, RowBody);
+
+ // tileload.scalarize.cols.body:
+ // Calculate %idxmem and %idxvec
+ // %eltptr = getelementptr i32, i32* %base, i64 %idxmem
+ // %elt = load i32, i32* %ptr
+ // %ResVec = insertelement <256 x i32> %vec.phi, i32 %elt, i16 %idxvec
+ B.SetInsertPoint(ColBody->getTerminator());
+ Value *Elt = B.CreateLoad(EltTy, EltPtr);
+ Value *ResVec = B.CreateInsertElement(VecPhi, Elt, Idx);
+ VecPhi->addIncoming(ResVec, ColLoopLatch);
+ VecCPhiRowLoop->addIncoming(ResVec, RowLatch);
+
+ return ResVec;
+ } else {
+ auto *BitCast = cast<BitCastInst>(Tile);
+ Value *Vec = BitCast->getOperand(0);
+ assert(isV256I32Ty(Vec->getType()) && "bitcast from non-v256i32 to x86amx");
+ // tilestore.scalarize.cols.body:
+ // %mul = mul i16 %row.iv, i16 16
+ // %idx = add i16 %mul, i16 %col.iv
+ // %vec = extractelement <16 x i32> %vec, i16 %idx
+ // store i32 %vec, i32* %ptr
+ B.SetInsertPoint(ColBody->getTerminator());
+ Value *Elt = B.CreateExtractElement(Vec, Idx);
+
+ B.CreateStore(Elt, EltPtr);
+ return nullptr;
+ }
+}
+
+template <Intrinsic::ID IntrID>
+typename std::enable_if<IntrID == Intrinsic::x86_tdpbssd_internal ||
+ IntrID == Intrinsic::x86_tdpbsud_internal ||
+ IntrID == Intrinsic::x86_tdpbusd_internal ||
+ IntrID == Intrinsic::x86_tdpbuud_internal ||
+ IntrID == Intrinsic::x86_tdpbf16ps_internal,
+ Value *>::type
+X86LowerAMXIntrinsics::createTileDPLoops(BasicBlock *Start, BasicBlock *End,
+ IRBuilderBase &B, Value *Row,
+ Value *Col, Value *K, Value *Acc,
+ Value *LHS, Value *RHS) {
+ std::string IntrinName;
+ switch (IntrID) {
+ case Intrinsic::x86_tdpbssd_internal:
+ IntrinName = "tiledpbssd";
+ break;
+ case Intrinsic::x86_tdpbsud_internal:
+ IntrinName = "tiledpbsud";
+ break;
+ case Intrinsic::x86_tdpbusd_internal:
+ IntrinName = "tiledpbusd";
+ break;
+ case Intrinsic::x86_tdpbuud_internal:
+ IntrinName = "tiledpbuud";
+ break;
+ case Intrinsic::x86_tdpbf16ps_internal:
+ IntrinName = "tiledpbf16ps";
+ break;
+ }
+ Loop *RowLoop = nullptr;
+ Loop *ColLoop = nullptr;
+ Loop *InnerLoop = nullptr;
+ if (LI) {
+ RowLoop = LI->AllocateLoop();
+ ColLoop = LI->AllocateLoop();
+ InnerLoop = LI->AllocateLoop();
+ ColLoop->addChildLoop(InnerLoop);
+ RowLoop->addChildLoop(ColLoop);
+ if (Loop *ParentL = LI->getLoopFor(Start))
+ ParentL->addChildLoop(RowLoop);
+ else
+ LI->addTopLevelLoop(RowLoop);
+ }
+
+ BasicBlock *RowBody = createLoop(Start, End, Row, B.getInt16(1),
+ IntrinName + ".scalarize.rows", B, RowLoop);
+ BasicBlock *RowLatch = RowBody->getSingleSuccessor();
+
+ BasicBlock *ColBody = createLoop(RowBody, RowLatch, Col, B.getInt16(1),
+ IntrinName + ".scalarize.cols", B, ColLoop);
+
+ BasicBlock *ColLoopLatch = ColBody->getSingleSuccessor();
+
+ B.SetInsertPoint(ColBody->getTerminator());
+ BasicBlock *InnerBody =
+ createLoop(ColBody, ColLoopLatch, K, B.getInt16(1),
+ IntrinName + ".scalarize.inner", B, InnerLoop);
+
+ BasicBlock *ColLoopHeader = ColBody->getSinglePredecessor();
+ BasicBlock *RowLoopHeader = RowBody->getSinglePredecessor();
+ BasicBlock *InnerLoopHeader = InnerBody->getSinglePredecessor();
+ BasicBlock *InnerLoopLatch = InnerBody->getSingleSuccessor();
+ Value *CurrentRow = &*RowLoopHeader->begin();
+ Value *CurrentCol = &*ColLoopHeader->begin();
+ Value *CurrentInner = &*InnerLoopHeader->begin();
+
+ FixedVectorType *V256I32Ty = FixedVectorType::get(B.getInt32Ty(), 256);
+ auto *BitCastAcc = cast<BitCastInst>(Acc);
+ Value *VecC = BitCastAcc->getOperand(0);
+ assert(isV256I32Ty(VecC->getType()) && "bitcast from non-v256i32 to x86amx");
+  // TODO: otherwise create a bitcast from x86amx to v256i32, i.e. store the
+  // x86amx to memory and reload it from memory as a vector. However, at -O0
+  // this doesn't happen.
+ auto *BitCastLHS = cast<BitCastInst>(LHS);
+ Value *VecA = BitCastLHS->getOperand(0);
+ assert(isV256I32Ty(VecA->getType()) && "bitcast from non-v256i32 to x86amx");
+ auto *BitCastRHS = cast<BitCastInst>(RHS);
+ Value *VecB = BitCastRHS->getOperand(0);
+ assert(isV256I32Ty(VecB->getType()) && "bitcast from non-v256i32 to x86amx");
+
+ // tiledpbssd.scalarize.rows.header:
+ // %vec.c.phi.row = phi <256 x i32> [ %VecC, %continue ], [ %NewVecC,
+ // %tiledpbssd.scalarize.rows.latch ]
+
+ // %vec.d.phi.row = phi <256 x i32> [ zeroinitializer, %continue ], [
+ // %NewVecD, %tiledpbssd.scalarize.rows.latch ]
+ B.SetInsertPoint(RowLoopHeader->getTerminator());
+ PHINode *VecCPhiRowLoop = B.CreatePHI(V256I32Ty, 2, "vec.c.phi.row");
+ VecCPhiRowLoop->addIncoming(VecC, Start);
+ Value *VecZero = Constant::getNullValue(V256I32Ty);
+ PHINode *VecDPhiRowLoop = B.CreatePHI(V256I32Ty, 2, "vec.d.phi.row");
+ VecDPhiRowLoop->addIncoming(VecZero, Start);
+
+ // tiledpbssd.scalarize.cols.header:
+ // %vec.c.phi.col = phi <256 x i32> [ %vec.c.phi.row,
+ // %tiledpbssd.scalarize.rows.body ], [ %NewVecC,
+ // %tiledpbssd.scalarize.cols.latch ]
+
+ // %vec.d.phi.col = phi <256 x i32> [
+ // %vec.d.phi.row, %tiledpbssd.scalarize.rows.body ], [ %NewVecD,
+ // %tiledpbssd.scalarize.cols.latch ]
+
+ // calculate idxc.
+ B.SetInsertPoint(ColLoopHeader->getTerminator());
+ PHINode *VecCPhiColLoop = B.CreatePHI(V256I32Ty, 2, "vec.c.phi.col");
+ VecCPhiColLoop->addIncoming(VecCPhiRowLoop, RowBody);
+ PHINode *VecDPhiColLoop = B.CreatePHI(V256I32Ty, 2, "vec.d.phi.col");
+ VecDPhiColLoop->addIncoming(VecDPhiRowLoop, RowBody);
+ Value *IdxC =
+ B.CreateAdd(B.CreateMul(CurrentRow, B.getInt16(16)), CurrentCol);
+
+ // tiledpbssd.scalarize.inner.header:
+ // %vec.c.inner.phi = phi <256 x i32> [ %vec.c.phi.col,
+ // %tiledpbssd.scalarize.cols.body ], [ %NewVecC,
+ // %tiledpbssd.scalarize.inner.latch ]
+
+ B.SetInsertPoint(InnerLoopHeader->getTerminator());
+ PHINode *VecCPhi = B.CreatePHI(V256I32Ty, 2, "vec.c.inner.phi");
+ VecCPhi->addIncoming(VecCPhiColLoop, ColBody);
+
+ B.SetInsertPoint(InnerBody->getTerminator());
+ Value *IdxA =
+ B.CreateAdd(B.CreateMul(CurrentRow, B.getInt16(16)), CurrentInner);
+ Value *IdxB =
+ B.CreateAdd(B.CreateMul(CurrentInner, B.getInt16(16)), CurrentCol);
+ Value *NewVecC = nullptr;
+
+ if (IntrID != Intrinsic::x86_tdpbf16ps_internal) {
+ // tiledpbssd.scalarize.inner.body:
+ // calculate idxa, idxb
+ // %eltc = extractelement <256 x i32> %vec.c.inner.phi, i16 %idxc
+ // %elta = extractelement <256 x i32> %veca, i16 %idxa
+ // %eltav4i8 = bitcast i32 %elta to <4 x i8>
+ // %eltb = extractelement <256 x i32> %vecb, i16 %idxb
+ // %eltbv4i8 = bitcast i32 %eltb to <4 x i8>
+ // %eltav4i32 = sext <4 x i8> %eltav4i8 to <4 x i32>
+ // %eltbv4i32 = sext <4 x i8> %eltbv4i8 to <4 x i32>
+ // %mulab = mul <4 x i32> %eltbv4i32, %eltav4i32
+ // %acc = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %131)
+ // %neweltc = add i32 %elt, %acc
+ // %NewVecC = insertelement <256 x i32> %vec.c.inner.phi, i32 %neweltc,
+ // i16 %idxc
+ FixedVectorType *V4I8Ty = FixedVectorType::get(B.getInt8Ty(), 4);
+ FixedVectorType *V4I32Ty = FixedVectorType::get(B.getInt32Ty(), 4);
+ Value *EltC = B.CreateExtractElement(VecCPhi, IdxC);
+ Value *EltA = B.CreateExtractElement(VecA, IdxA);
+ Value *SubVecA = B.CreateBitCast(EltA, V4I8Ty);
+ Value *EltB = B.CreateExtractElement(VecB, IdxB);
+ Value *SubVecB = B.CreateBitCast(EltB, V4I8Ty);
+ Value *SEXTSubVecB = nullptr;
+ Value *SEXTSubVecA = nullptr;
+ switch (IntrID) {
+ case Intrinsic::x86_tdpbssd_internal:
+ SEXTSubVecB = B.CreateSExt(SubVecB, V4I32Ty);
+ SEXTSubVecA = B.CreateSExt(SubVecA, V4I32Ty);
+ break;
+ case Intrinsic::x86_tdpbsud_internal:
+ SEXTSubVecB = B.CreateZExt(SubVecB, V4I32Ty);
+ SEXTSubVecA = B.CreateSExt(SubVecA, V4I32Ty);
+ break;
+ case Intrinsic::x86_tdpbusd_internal:
+ SEXTSubVecB = B.CreateSExt(SubVecB, V4I32Ty);
+ SEXTSubVecA = B.CreateZExt(SubVecA, V4I32Ty);
+ break;
+ case Intrinsic::x86_tdpbuud_internal:
+ SEXTSubVecB = B.CreateZExt(SubVecB, V4I32Ty);
+ SEXTSubVecA = B.CreateZExt(SubVecA, V4I32Ty);
+ break;
+ default:
+ llvm_unreachable("Invalid intrinsic ID!");
+ }
+ Value *SubVecR = B.CreateAddReduce(B.CreateMul(SEXTSubVecA, SEXTSubVecB));
+ Value *ResElt = B.CreateAdd(EltC, SubVecR);
+ NewVecC = B.CreateInsertElement(VecCPhi, ResElt, IdxC);
+ } else {
+ // tiledpbf16ps.scalarize.inner.body:
+ // calculate idxa, idxb, idxc
+ // %eltc = extractelement <256 x i32> %vec.c.inner.phi, i16 %idxc
+ // %eltcf32 = bitcast i32 %eltc to float
+ // %elta = extractelement <256 x i32> %veca, i16 %idxa
+ // %eltav2i16 = bitcast i32 %elta to <2 x i16>
+ // %eltb = extractelement <256 x i32> %vecb, i16 %idxb
+ // %eltbv2i16 = bitcast i32 %eltb to <2 x i16>
+ // %shufflea = shufflevector <2 x i16> %elta, <2 x i16> zeroinitializer, <4
+ // x i32> <i32 2, i32 0, i32 3, i32 1>
+ // %eltav2f32 = bitcast <4 x i16> %shufflea to <2 x float>
+ // %shuffleb = shufflevector <2 x i16> %eltb, <2 xi16> zeroinitializer, <4 x
+ // i32> <i32 2, i32 0, i32 3, i32 1>
+ // %eltbv2f32 = bitcast <4 x i16> %shuffleb to <2 x float>
+ // %mulab = fmul <2 x float> %eltav2f32, %eltbv2f32
+ // %acc = call float
+ // @llvm.vector.reduce.fadd.v2f32(float %eltcf32, <2 x float> %mulab)
+ // %neweltc = bitcast float %acc to i32
+ // %NewVecC = insertelement <256 x i32> %vec.c.inner.phi, i32 %neweltc,
+ // i16 %idxc
+ // %NewVecD = insertelement <256 x i32> %vec.d.inner.phi, i32 %neweltc,
+ // i16 %idxc
+ FixedVectorType *V2I16Ty = FixedVectorType::get(B.getInt16Ty(), 2);
+ FixedVectorType *V2F32Ty = FixedVectorType::get(B.getFloatTy(), 2);
+ Value *EltC = B.CreateExtractElement(VecCPhi, IdxC);
+ Value *EltCF32 = B.CreateBitCast(EltC, B.getFloatTy());
+ Value *EltA = B.CreateExtractElement(VecA, IdxA);
+ Value *SubVecA = B.CreateBitCast(EltA, V2I16Ty);
+ Value *EltB = B.CreateExtractElement(VecB, IdxB);
+ Value *SubVecB = B.CreateBitCast(EltB, V2I16Ty);
+ Value *ZeroV2I16 = Constant::getNullValue(V2I16Ty);
+ int ShuffleMask[4] = {2, 0, 3, 1};
+ auto ShuffleArray = makeArrayRef(ShuffleMask);
+ Value *AV2F32 = B.CreateBitCast(
+ B.CreateShuffleVector(SubVecA, ZeroV2I16, ShuffleArray), V2F32Ty);
+ Value *BV2F32 = B.CreateBitCast(
+ B.CreateShuffleVector(SubVecB, ZeroV2I16, ShuffleArray), V2F32Ty);
+ Value *SubVecR = B.CreateFAddReduce(EltCF32, B.CreateFMul(AV2F32, BV2F32));
+ Value *ResElt = B.CreateBitCast(SubVecR, B.getInt32Ty());
+ NewVecC = B.CreateInsertElement(VecCPhi, ResElt, IdxC);
+ }
+
+ // tiledpbssd.scalarize.cols.latch:
+ // %NewEltC = extractelement <256 x i32> %vec.c.phi.col, i16 %idxc
+ // %NewVecD = insertelement <256 x i32> %vec.d.phi.col, i32 %NewEltC,
+ // i16 %idxc
+ B.SetInsertPoint(ColLoopLatch->getTerminator());
+ Value *NewEltC = B.CreateExtractElement(NewVecC, IdxC);
+ Value *NewVecD = B.CreateInsertElement(VecDPhiColLoop, NewEltC, IdxC);
+
+ VecCPhi->addIncoming(NewVecC, InnerLoopLatch);
+ VecCPhiRowLoop->addIncoming(NewVecC, RowLatch);
+ VecCPhiColLoop->addIncoming(NewVecC, ColLoopLatch);
+ VecDPhiRowLoop->addIncoming(NewVecD, RowLatch);
+ VecDPhiColLoop->addIncoming(NewVecD, ColLoopLatch);
+
+ return NewVecD;
+}
+
+template <Intrinsic::ID IntrID>
+typename std::enable_if<IntrID == Intrinsic::x86_tdpbssd_internal ||
+ IntrID == Intrinsic::x86_tdpbsud_internal ||
+ IntrID == Intrinsic::x86_tdpbusd_internal ||
+ IntrID == Intrinsic::x86_tdpbuud_internal ||
+ IntrID == Intrinsic::x86_tdpbf16ps_internal,
+ bool>::type
+X86LowerAMXIntrinsics::lowerTileDP(Instruction *TileDP) {
+ Value *M, *N, *K, *C, *A, *B;
+ match(TileDP, m_Intrinsic<IntrID>(m_Value(M), m_Value(N), m_Value(K),
+ m_Value(C), m_Value(A), m_Value(B)));
+ Instruction *InsertI = TileDP;
+ IRBuilder<> PreBuilder(TileDP);
+ PreBuilder.SetInsertPoint(TileDP);
+ // We visit the loop with (m, n/4, k/4):
+ // %n_dword = lshr i16 %n, 2
+ // %k_dword = lshr i16 %k, 2
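+  // For example (illustrative values only): with n = 64 and k = 64 byte
+  // columns, the dot-product is scalarized into m x 16 x 16 iterations, and
+  // each inner iteration consumes one 32-bit dword, i.e. four i8 values (or
+  // two bf16 values for tdpbf16ps).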
+ Value *NDWord = PreBuilder.CreateLShr(N, PreBuilder.getInt16(2));
+ Value *KDWord = PreBuilder.CreateLShr(K, PreBuilder.getInt16(2));
+ BasicBlock *Start = InsertI->getParent();
+ BasicBlock *End =
+ SplitBlock(InsertI->getParent(), InsertI, &DTU, LI, nullptr, "continue");
+ IRBuilder<> Builder(TileDP);
+ Value *ResVec = createTileDPLoops<IntrID>(Start, End, Builder, M, NDWord,
+ KDWord, C, A, B);
+  // We cannot assume there is always a bitcast after tiledpbssd, so we need
+  // to insert one bitcast as needed.
+ Builder.SetInsertPoint(End->getFirstNonPHI());
+ Value *ResAMX =
+ Builder.CreateBitCast(ResVec, Type::getX86_AMXTy(Builder.getContext()));
+ // Delete TileDP intrinsic and do some clean-up.
+ for (auto UI = TileDP->use_begin(), UE = TileDP->use_end(); UI != UE;) {
+ Instruction *I = cast<Instruction>((UI++)->getUser());
+ Value *Vec;
+ if (match(I, m_BitCast(m_Value(Vec)))) {
+ I->replaceAllUsesWith(ResVec);
+ I->eraseFromParent();
+ }
+ }
+ TileDP->replaceAllUsesWith(ResAMX);
+ TileDP->eraseFromParent();
+ return true;
+}
+
+template <bool IsTileLoad>
+bool X86LowerAMXIntrinsics::lowerTileLoadStore(Instruction *TileLoadStore) {
+ Value *M, *N, *Ptr, *Stride, *Tile;
+ if (IsTileLoad)
+ match(TileLoadStore,
+ m_Intrinsic<Intrinsic::x86_tileloadd64_internal>(
+ m_Value(M), m_Value(N), m_Value(Ptr), m_Value(Stride)));
+ else
+ match(TileLoadStore, m_Intrinsic<Intrinsic::x86_tilestored64_internal>(
+ m_Value(M), m_Value(N), m_Value(Ptr),
+ m_Value(Stride), m_Value(Tile)));
+
+ Instruction *InsertI = TileLoadStore;
+ IRBuilder<> PreBuilder(TileLoadStore);
+ PreBuilder.SetInsertPoint(TileLoadStore);
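+  // The column count and stride are given in bytes; shift them right by 2 so
+  // that the scalarized loops below index in 32-bit dword units.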
+ Value *NDWord = PreBuilder.CreateLShr(N, PreBuilder.getInt16(2));
+ Value *StrideDWord = PreBuilder.CreateLShr(Stride, PreBuilder.getInt64(2));
+ BasicBlock *Start = InsertI->getParent();
+ BasicBlock *End =
+ SplitBlock(InsertI->getParent(), InsertI, &DTU, LI, nullptr, "continue");
+ IRBuilder<> Builder(TileLoadStore);
+ Value *ResVec = createTileLoadStoreLoops<IsTileLoad>(
+ Start, End, Builder, M, NDWord, Ptr, StrideDWord,
+ IsTileLoad ? nullptr : Tile);
+ if (IsTileLoad) {
+    // We cannot assume there is always a bitcast after tileload, so we need
+    // to insert one bitcast as needed.
+ Builder.SetInsertPoint(End->getFirstNonPHI());
+ Value *ResAMX =
+ Builder.CreateBitCast(ResVec, Type::getX86_AMXTy(Builder.getContext()));
+    // Delete the tileloadd64 intrinsic and do some clean-up.
+ for (auto UI = TileLoadStore->use_begin(), UE = TileLoadStore->use_end();
+ UI != UE;) {
+ Instruction *I = cast<Instruction>((UI++)->getUser());
+ Value *Vec;
+ if (match(I, m_BitCast(m_Value(Vec)))) {
+ I->replaceAllUsesWith(ResVec);
+ I->eraseFromParent();
+ }
+ }
+ TileLoadStore->replaceAllUsesWith(ResAMX);
+ }
+ TileLoadStore->eraseFromParent();
+ return true;
+}
+
+bool X86LowerAMXIntrinsics::lowerTileZero(Instruction *TileZero) {
+ IRBuilder<> Builder(TileZero);
+ FixedVectorType *V256I32Ty = FixedVectorType::get(Builder.getInt32Ty(), 256);
+ Value *VecZero = Constant::getNullValue(V256I32Ty);
+ for (auto UI = TileZero->use_begin(), UE = TileZero->use_end(); UI != UE;) {
+ Instruction *I = cast<Instruction>((UI++)->getUser());
+ Value *Vec;
+ if (match(I, m_BitCast(m_Value(Vec)))) {
+ I->replaceAllUsesWith(VecZero);
+ I->eraseFromParent();
+ }
+ }
+ TileZero->eraseFromParent();
+ return true;
+}
+
+bool X86LowerAMXIntrinsics::visit() {
+ bool C = false;
+ SmallVector<IntrinsicInst *, 8> WorkList;
+ for (BasicBlock *BB : depth_first(&Func)) {
+ for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
+ if (auto *Inst = dyn_cast<IntrinsicInst>(&*II++)) {
+ switch (Inst->getIntrinsicID()) {
+ case Intrinsic::x86_tdpbssd_internal:
+ case Intrinsic::x86_tdpbsud_internal:
+ case Intrinsic::x86_tdpbusd_internal:
+ case Intrinsic::x86_tdpbuud_internal:
+ case Intrinsic::x86_tileloadd64_internal:
+ case Intrinsic::x86_tilestored64_internal:
+ case Intrinsic::x86_tilezero_internal:
+ case Intrinsic::x86_tdpbf16ps_internal:
+ WorkList.push_back(Inst);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ }
+
+ for (auto *Inst : WorkList) {
+ switch (Inst->getIntrinsicID()) {
+ case Intrinsic::x86_tdpbssd_internal:
+ C = lowerTileDP<Intrinsic::x86_tdpbssd_internal>(Inst) || C;
+ break;
+ case Intrinsic::x86_tdpbsud_internal:
+ C = lowerTileDP<Intrinsic::x86_tdpbsud_internal>(Inst) || C;
+ break;
+ case Intrinsic::x86_tdpbusd_internal:
+ C = lowerTileDP<Intrinsic::x86_tdpbusd_internal>(Inst) || C;
+ break;
+ case Intrinsic::x86_tdpbuud_internal:
+ C = lowerTileDP<Intrinsic::x86_tdpbuud_internal>(Inst) || C;
+ break;
+ case Intrinsic::x86_tdpbf16ps_internal:
+ C = lowerTileDP<Intrinsic::x86_tdpbf16ps_internal>(Inst) || C;
+ break;
+ case Intrinsic::x86_tileloadd64_internal:
+ C = lowerTileLoadStore<true>(Inst) || C;
+ break;
+ case Intrinsic::x86_tilestored64_internal:
+ C = lowerTileLoadStore<false>(Inst) || C;
+ break;
+ case Intrinsic::x86_tilezero_internal:
+ C = lowerTileZero(Inst) || C;
+ break;
+ default:
+ llvm_unreachable("invalid amx intrinsics!");
+ }
+ }
+
+ return C;
+}
+
+class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {
+ initializeX86LowerAMXIntrinsicsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (!X86ScalarizeAMX)
+ return false;
+ TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ if (!F.hasFnAttribute(Attribute::OptimizeNone) &&
+ TM->getOptLevel() != CodeGenOpt::None)
+ return false;
+
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+ auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+ X86LowerAMXIntrinsics LAT(F, DTU, LI);
+ return LAT.visit();
+ }
+ StringRef getPassName() const override { return "Lower AMX intrinsics"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ }
+};
+
+static const char PassName[] = "Lower AMX intrinsics";
+char X86LowerAMXIntrinsicsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName,
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName,
+ false, false)
+
+FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() {
+ return new X86LowerAMXIntrinsicsLegacyPass();
+}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp b/src/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 85166de..4ba44cc 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -1,4 +1,4 @@
-//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===//
+//===- Target/X86/X86LowerAMXType.cpp - -------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -14,6 +14,27 @@
/// load/store <256 x i32> instruction to AMX load/store. If the bitcast can
/// not be combined with load/store, we transform the bitcast to amx load/store
/// and <256 x i32> store/load.
+///
+/// If the front end does not use O0 but the mid/back end does (e.g. "Clang -O2
+/// -S -emit-llvm t.c" + "llc t.ll"), we should make sure the AMX data is
+/// volatile, because that is necessary for AMX fast register allocation. (In
+/// fast register allocation, registers are allocated before spill/reload, so
+/// there is no additional register for AMX to identify the step in spill.)
+/// volatileTileData() handles this case.
+/// e.g.
+/// ----------------------------------------------------------
+/// | def %td = ... |
+/// | ... |
+/// | "use %td" |
+/// ----------------------------------------------------------
+/// will transfer to -->
+/// ----------------------------------------------------------
+/// | def %td = ... |
+/// | call void @llvm.x86.tilestored64.internal(mem, %td) |
+/// | ... |
+/// | %td2 = call x86_amx @llvm.x86.tileloadd64.internal(mem)|
+/// | "use %td2" |
+/// ----------------------------------------------------------
//
//===----------------------------------------------------------------------===//
//
@@ -23,6 +44,7 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
@@ -33,13 +55,15 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
using namespace PatternMatch;
#define DEBUG_TYPE "lower-amx-type"
-static AllocaInst *CreateAllocaInst(IRBuilder<> &Builder, BasicBlock *BB) {
+static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder,
+ BasicBlock *BB) {
Function &F = *BB->getParent();
Module *M = BB->getModule();
const DataLayout &DL = M->getDataLayout();
@@ -54,12 +78,50 @@
return AllocaRes;
}
-static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
+namespace {
+class X86LowerAMXType {
+ Function &Func;
+ TargetMachine *TM = nullptr;
+
+  // In AMX intrinsics we let Shape = {Row, Col}, but the
+  // RealCol = Col / ElementSize. We may use the RealCol
+  // as a new Row for other newly created AMX intrinsics.
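+  // For example, the second source tile of tdpbssd has K/4 rows, so the K
+  // operand (a byte count) is divided by a granularity of 4 before it is
+  // reused as that tile's row count (see getShape below).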
+ std::map<Value *, Value *> Col2Row;
+
+public:
+ X86LowerAMXType(Function &F, TargetMachine *TargetM) : Func(F), TM(TargetM) {}
+ bool visit();
+ void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
+ void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
+ bool transformBitcast(BitCastInst *Bitcast);
+ std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
+ Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity);
+};
+
+Value *X86LowerAMXType::getRowFromCol(Instruction *II, Value *V,
+ unsigned Granularity) {
+ if (Col2Row.count(V))
+ return Col2Row[V];
+ IRBuilder<> Builder(&*II->getParent()->getFirstInsertionPt());
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ BasicBlock::iterator Iter = I->getIterator();
+ ++Iter;
+ Builder.SetInsertPoint(&*Iter);
+ }
+ ConstantInt *Gran = Builder.getInt16(Granularity);
+ Value *RealRow = Builder.CreateUDiv(V, Gran);
+ Col2Row[V] = RealRow;
+ return RealRow;
+}
+
+std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II,
+ unsigned OpNo) {
Value *Row = nullptr, *Col = nullptr;
switch (II->getIntrinsicID()) {
default:
llvm_unreachable("Expect amx intrinsics");
case Intrinsic::x86_tileloadd64_internal:
+ case Intrinsic::x86_tileloaddt164_internal:
case Intrinsic::x86_tilestored64_internal: {
Row = II->getArgOperand(0);
Col = II->getArgOperand(1);
@@ -67,7 +129,11 @@
}
// a * b + c
// The shape depends on which operand.
- case Intrinsic::x86_tdpbssd_internal: {
+ case Intrinsic::x86_tdpbssd_internal:
+ case Intrinsic::x86_tdpbsud_internal:
+ case Intrinsic::x86_tdpbusd_internal:
+ case Intrinsic::x86_tdpbuud_internal:
+ case Intrinsic::x86_tdpbf16ps_internal: {
switch (OpNo) {
case 3:
Row = II->getArgOperand(0);
@@ -79,6 +145,13 @@
break;
case 5:
Row = II->getArgOperand(2);
+      // FIXME: There is a design bug for AMX shape: the Col should be Col/4
+      // if it will be used as a Row, but the current greedy RA can't handle
+      // this case well; it may fail if we generate a new shape definition.
+      // So let's just do it at O0 for now.
+ // Row = Row / 4
+ if (TM->getOptLevel() == CodeGenOpt::None)
+ Row = getRowFromCol(II, Row, 4);
Col = II->getArgOperand(1);
break;
}
@@ -94,7 +167,7 @@
// -->
// %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
// i8* %addr, i64 %stride64)
-static void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
+void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
Value *Row = nullptr, *Col = nullptr;
Use &U = *(Bitcast->use_begin());
unsigned OpNo = U.getOperandNo();
@@ -119,7 +192,7 @@
// -->
// call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
// %stride64, %13)
-static void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
+void X86LowerAMXType::combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
Value *Tile = Bitcast->getOperand(0);
auto *II = cast<IntrinsicInst>(Tile);
@@ -151,14 +224,14 @@
}
// transform bitcast to <store, load> instructions.
-static bool transformBitcast(BitCastInst *Bitcast) {
+bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
IRBuilder<> Builder(Bitcast);
AllocaInst *AllocaAddr;
Value *I8Ptr, *Stride;
auto *Src = Bitcast->getOperand(0);
auto Prepare = [&]() {
- AllocaAddr = CreateAllocaInst(Builder, Bitcast->getParent());
+ AllocaAddr = createAllocaInstAtEntry(Builder, Bitcast->getParent());
I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy());
Stride = Builder.getInt64(64);
};
@@ -209,17 +282,9 @@
return true;
}
-namespace {
-class X86LowerAMXType {
- Function &Func;
-
-public:
- X86LowerAMXType(Function &F) : Func(F) {}
- bool visit();
-};
-
bool X86LowerAMXType::visit() {
SmallVector<Instruction *, 8> DeadInsts;
+ Col2Row.clear();
for (BasicBlock *BB : post_order(&Func)) {
for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
@@ -316,6 +381,260 @@
}
} // anonymous namespace
+static Value *getAllocaPos(BasicBlock *BB) {
+ Module *M = BB->getModule();
+ Function *F = BB->getParent();
+ IRBuilder<> Builder(&F->getEntryBlock().front());
+ const DataLayout &DL = M->getDataLayout();
+ unsigned AllocaAS = DL.getAllocaAddrSpace();
+ Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
+ AllocaInst *AllocaRes =
+ new AllocaInst(V256I32Ty, AllocaAS, "", &F->getEntryBlock().front());
+ BasicBlock::iterator Iter = AllocaRes->getIterator();
+ ++Iter;
+ Builder.SetInsertPoint(&*Iter);
+ Value *I8Ptr = Builder.CreateBitCast(AllocaRes, Builder.getInt8PtrTy());
+ return I8Ptr;
+}
+
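+// Store the tile defined by TileDef to the memory at Ptr, right after the
+// definition. For illustration, the emitted call has roughly this shape
+// (the stride is fixed to 64):
+//   call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %ptr,
+//                                             i64 64, x86_amx %def)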
+static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
+ assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!");
+ auto *II = cast<IntrinsicInst>(TileDef);
+ assert(II && "Not tile intrinsic!");
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
+
+ BasicBlock *BB = TileDef->getParent();
+ BasicBlock::iterator Iter = TileDef->getIterator();
+ IRBuilder<> Builder(BB, ++Iter);
+ Value *Stride = Builder.getInt64(64);
+ std::array<Value *, 5> Args = {Row, Col, Ptr, Stride, TileDef};
+
+ Instruction *TileStore =
+ Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
+ return TileStore;
+}
+
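+// Replace the tile value used at U with a fresh load from Ptr, emitted right
+// before the user. For illustration, the emitted call has roughly this shape:
+//   %reload = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+//                                                         i8* %ptr, i64 64)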
+static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
+ Value *V = U.get();
+ assert(V->getType()->isX86_AMXTy() && "Not define tile!");
+
+ // Get tile shape.
+ IntrinsicInst *II = nullptr;
+ if (IsPHI) {
+ Value *PhiOp = dyn_cast<PHINode>(V)->getIncomingValue(0);
+ II = cast<IntrinsicInst>(PhiOp);
+ } else {
+ II = cast<IntrinsicInst>(V);
+ }
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
+
+ Instruction *UserI = dyn_cast<Instruction>(U.getUser());
+ IRBuilder<> Builder(UserI);
+ Value *Stride = Builder.getInt64(64);
+ std::array<Value *, 4> Args = {Row, Col, Ptr, Stride};
+
+ Value *TileLoad =
+ Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args);
+ UserI->replaceUsesOfWith(V, TileLoad);
+}
+
+static bool isIncomingOfPHI(Instruction *I) {
+ for (Use &U : I->uses()) {
+ User *V = U.getUser();
+ if (isa<PHINode>(V))
+ return true;
+ }
+ return false;
+}
+
+// Make all AMX tile data volatile, shortening the live range of each tile
+// register before fast register allocation.
+namespace {
+class X86VolatileTileData {
+ Function &F;
+
+public:
+ X86VolatileTileData(Function &Func) : F(Func) {}
+ Value *updatePhiIncomings(BasicBlock *BB,
+ SmallVector<Instruction *, 2> &Incomings);
+ void replacePhiDefWithLoad(Instruction *PHI, Value *StorePtr);
+ bool volatileTileData();
+ void volatileTilePHI(PHINode *Inst);
+ void volatileTileNonPHI(Instruction *I);
+};
+
+Value *X86VolatileTileData::updatePhiIncomings(
+ BasicBlock *BB, SmallVector<Instruction *, 2> &Incomings) {
+ Value *I8Ptr = getAllocaPos(BB);
+
+ for (auto *I : Incomings) {
+ User *Store = createTileStore(I, I8Ptr);
+
+ // All its uses (except phi) should load from stored mem.
+ for (Use &U : I->uses()) {
+ User *V = U.getUser();
+ if (isa<PHINode>(V) || V == Store)
+ continue;
+ replaceWithTileLoad(U, I8Ptr);
+ }
+ }
+ return I8Ptr;
+}
+
+void X86VolatileTileData::replacePhiDefWithLoad(Instruction *PHI,
+ Value *StorePtr) {
+ for (Use &U : PHI->uses())
+ replaceWithTileLoad(U, StorePtr, true);
+ PHI->eraseFromParent();
+}
+
+// Similar to volatileTileNonPHI, this function only handles PHI nodes
+// and their related AMX intrinsics.
+// 1) The PHI def should be changed to a tileload.
+// 2) The PHI incoming values should be tilestored right after their defs.
+// 3) The memory used by these tileloads and tilestores should be the same.
+// e.g.
+// ------------------------------------------------------
+// bb_dom:
+// ...
+// br i1 %bool.cond, label %if.else, label %if.then
+//
+// if.then:
+// def %t0 = ...
+// ...
+// use %t0
+// ...
+// br label %if.end
+//
+// if.else:
+// def %t1 = ...
+// br label %if.end
+//
+// if.end:
+// %td = phi x86_amx [ %t1, %if.else ], [ %t0, %if.then ]
+// ...
+// use %td
+// ------------------------------------------------------
+// -->
+// ------------------------------------------------------
+// bb_entry:
+// %mem = alloca <256 x i32>, align 1024 *
+// ...
+// bb_dom:
+// ...
+// br i1 %bool.cond, label %if.else, label %if.then
+//
+// if.then:
+// def %t0 = ...
+// call void @llvm.x86.tilestored64.internal(mem, %t0) *
+// ...
+// %t0` = call x86_amx @llvm.x86.tileloadd64.internal(mem)*
+// use %t0` *
+// ...
+// br label %if.end
+//
+// if.else:
+// def %t1 = ...
+// call void @llvm.x86.tilestored64.internal(mem, %t1) *
+// br label %if.end
+//
+// if.end:
+// ...
+// %td = call x86_amx @llvm.x86.tileloadd64.internal(mem) *
+// use %td
+// ------------------------------------------------------
+void X86VolatileTileData::volatileTilePHI(PHINode *PHI) {
+ BasicBlock *BB = PHI->getParent();
+ SmallVector<Instruction *, 2> Incomings;
+
+ for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
+ Value *Op = PHI->getIncomingValue(I);
+ Instruction *Inst = dyn_cast<Instruction>(Op);
+    assert(Inst && "We shouldn't fold AMX instruction!");
+ Incomings.push_back(Inst);
+ }
+
+ Value *StorePtr = updatePhiIncomings(BB, Incomings);
+ replacePhiDefWithLoad(PHI, StorePtr);
+}
+
+// Store the defined tile and load it before use.
+// All its users are not PHI.
+// e.g.
+// ------------------------------------------------------
+// def %td = ...
+// ...
+// "use %td"
+// ------------------------------------------------------
+// -->
+// ------------------------------------------------------
+// def %td = ...
+// call void @llvm.x86.tilestored64.internal(mem, %td)
+// ...
+// %td2 = call x86_amx @llvm.x86.tileloadd64.internal(mem)
+// "use %td2"
+// ------------------------------------------------------
+void X86VolatileTileData::volatileTileNonPHI(Instruction *I) {
+ BasicBlock *BB = I->getParent();
+ Value *I8Ptr = getAllocaPos(BB);
+ User *Store = createTileStore(I, I8Ptr);
+
+ // All its uses should load from stored mem.
+ for (Use &U : I->uses()) {
+ User *V = U.getUser();
+ assert(!isa<PHINode>(V) && "PHI Nodes should be excluded!");
+ if (V != Store)
+ replaceWithTileLoad(U, I8Ptr);
+ }
+}
+
+// Volatile Tile Model:
+// 1) All uses of tile data come from a tileload right before the use.
+// 2) All defs of tile data are tilestored into memory immediately.
+// For example:
+// --------------------------------------------------------------------------
+// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key
+// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
+// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx
+// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
+// call void @llvm.x86.tilestored64.internal(... td) area
+// --------------------------------------------------------------------------
+// 3) No terminator, call or other amx instructions in the key amx area.
+bool X86VolatileTileData::volatileTileData() {
+ bool Changed = false;
+ for (BasicBlock &BB : F) {
+ SmallVector<Instruction *, 2> PHIInsts;
+ SmallVector<Instruction *, 8> AMXDefInsts;
+
+ for (Instruction &I : BB) {
+ if (!I.getType()->isX86_AMXTy())
+ continue;
+ if (isa<PHINode>(&I))
+ PHIInsts.push_back(&I);
+ else
+ AMXDefInsts.push_back(&I);
+ }
+
+    // First we make the non-PHI-related AMX intrinsics "volatile".
+ for (Instruction *I : AMXDefInsts) {
+ if (isIncomingOfPHI(I))
+ continue;
+ volatileTileNonPHI(I);
+ Changed = true;
+ }
+
+ for (Instruction *I : PHIInsts) {
+ volatileTilePHI(dyn_cast<PHINode>(I));
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+} // anonymous namespace
+
namespace {
class X86LowerAMXTypeLegacyPass : public FunctionPass {
@@ -327,13 +646,31 @@
}
bool runOnFunction(Function &F) override {
- X86LowerAMXType LAT(F);
+ TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+
+ X86LowerAMXType LAT(F, TM);
bool C = LAT.visit();
+
+ // Prepare for fast register allocation at O0.
+    // TODO: It may be better to check the volatile model of the AMX code
+    // itself, not just Attribute::OptimizeNone and CodeGenOpt::None.
+ if (TM->getOptLevel() == CodeGenOpt::None) {
+      // If the front end does not use O0 but the mid/back end does (e.g.
+      // "Clang -O2 -S -emit-llvm t.c" + "llc t.ll"), we should make
+      // sure the AMX data is volatile; that is necessary for AMX fast
+      // register allocation.
+ if (!F.hasFnAttribute(Attribute::OptimizeNone)) {
+ X86VolatileTileData VTD(F);
+ C = VTD.volatileTileData() || C;
+ }
+ }
+
return C;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<TargetPassConfig>();
}
};
@@ -343,6 +680,7 @@
char X86LowerAMXTypeLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
false)
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/src/llvm-project/llvm/lib/Target/X86/X86LowerTileCopy.cpp
new file mode 100644
index 0000000..03692d1
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -0,0 +1,132 @@
+//===-- X86LowerTileCopy.cpp - Expand Tile Copy Instructions---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that lowers AMX tile copy instructions. Since
+// there is no tile copy instruction, we need to store the tile register to the
+// stack and load it from the stack into another tile register. We need an
+// extra GR to hold the stride, and we need a stack slot to hold the tile data.
+// We run this pass after copy propagation, so that we don't miss any copy
+// optimization, and before prolog/epilog insertion, so that we can allocate
+// the stack slot.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-lower-tile-copy"
+
+namespace {
+
+class X86LowerTileCopy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ X86LowerTileCopy() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "X86 Lower Tile Copy"; }
+};
+
+} // namespace
+
+char X86LowerTileCopy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86LowerTileCopy, "lowertilecopy", "Tile Copy Lowering",
+ false, false)
+INITIALIZE_PASS_END(X86LowerTileCopy, "lowertilecopy", "Tile Copy Lowering",
+ false, false)
+
+void X86LowerTileCopy::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+FunctionPass *llvm::createX86LowerTileCopyPass() {
+ return new X86LowerTileCopy();
+}
+
+bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ const X86InstrInfo *TII = ST.getInstrInfo();
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
+ MII != MIE;) {
+ MachineInstr &MI = *MII++;
+ if (!MI.isCopy())
+ continue;
+ MachineOperand &DstMO = MI.getOperand(0);
+ MachineOperand &SrcMO = MI.getOperand(1);
+ Register SrcReg = SrcMO.getReg();
+ Register DstReg = DstMO.getReg();
+ if (!X86::TILERegClass.contains(DstReg, SrcReg))
+ continue;
+
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ // Allocate stack slot for tile register
+ unsigned Size = TRI->getSpillSize(X86::TILERegClass);
+ Align Alignment = TRI->getSpillAlign(X86::TILERegClass);
+ int TileSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
+ // Allocate stack slot for stride register
+ Size = TRI->getSpillSize(X86::GR64RegClass);
+ Alignment = TRI->getSpillAlign(X86::GR64RegClass);
+ int StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
+
+      // TODO: Pick a killed register to avoid the save/reload. The problem is
+      // that we cannot get the live interval at this stage.
+ Register GR64Cand = X86::RAX;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ // mov %rax (%sp)
+ BuildMI(MBB, MI, DL, TII->get(X86::IMPLICIT_DEF), GR64Cand);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)), StrideSS)
+ .addReg(GR64Cand);
+ // mov 64 %rax
+ BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
+ // tilestored %tmm, (%sp, %idx)
+ unsigned Opc = X86::TILESTORED;
+ MachineInstr *NewMI =
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc)), TileSS)
+ .addReg(SrcReg, getKillRegState(SrcMO.isKill()));
+ MachineOperand &MO = NewMI->getOperand(2);
+ MO.setReg(GR64Cand);
+ MO.setIsKill(true);
+ // tileloadd (%sp, %idx), %tmm
+ Opc = X86::TILELOADD;
+ NewMI = addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc), DstReg),
+ TileSS);
+ // restore %rax
+ // mov (%sp) %rax
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand),
+ StrideSS);
+ MI.eraseFromParent();
+ Changed = true;
+ }
+ }
+ return Changed;
+}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp b/src/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp
index 89fa3ae..7d916f9 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -2167,7 +2167,7 @@
const MachineOperand &DstOp = MI->getOperand(0);
CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
if (auto *CF = dyn_cast<ConstantFP>(C)) {
- CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false);
+ CS << "0x" << toString(CF->getValueAPF().bitcastToAPInt(), 16, false);
OutStreamer.AddComment(CS.str());
}
}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/src/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index eedad95..46d2e2a 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/src/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -108,6 +108,13 @@
/// True if this function has any preallocated calls.
bool HasPreallocatedCall = false;
+ /// Whether this function has an extended frame record [Ctx, RBP, Return
+ /// addr]. If so, bit 60 of the in-memory frame pointer will be 1 to enable
+ /// other tools to detect the extended record.
+ bool HasSwiftAsyncContext = false;
+
+ Optional<int> SwiftAsyncContextFrameIdx;
+
ValueMap<const Value *, size_t> PreallocatedIds;
SmallVector<size_t, 0> PreallocatedStackSizes;
SmallVector<SmallVector<size_t, 4>, 0> PreallocatedArgOffsets;
@@ -197,6 +204,14 @@
bool hasPreallocatedCall() const { return HasPreallocatedCall; }
void setHasPreallocatedCall(bool v) { HasPreallocatedCall = v; }
+ bool hasSwiftAsyncContext() const { return HasSwiftAsyncContext; }
+ void setHasSwiftAsyncContext(bool v) { HasSwiftAsyncContext = v; }
+
+ Optional<int> getSwiftAsyncContextFrameIdx() const {
+ return SwiftAsyncContextFrameIdx;
+ }
+ void setSwiftAsyncContextFrameIdx(int v) { SwiftAsyncContextFrameIdx = v; }
+
size_t getPreallocatedIdForCallSite(const Value *CS) {
auto Insert = PreallocatedIds.insert({CS, PreallocatedIds.size()});
if (Insert.second) {
@@ -219,7 +234,7 @@
PreallocatedArgOffsets[Id].assign(AO.begin(), AO.end());
}
- const ArrayRef<size_t> getPreallocatedArgOffsets(const size_t Id) {
+ ArrayRef<size_t> getPreallocatedArgOffsets(const size_t Id) {
assert(!PreallocatedArgOffsets[Id].empty() && "arg offsets not set");
return PreallocatedArgOffsets[Id];
}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/src/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index c8899a8..ab4d2bd 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -112,9 +112,9 @@
} // end anonymous namespace
-/// Provide DenseMapInfo for MemOpKey.
namespace llvm {
+/// Provide DenseMapInfo for MemOpKey.
template <> struct DenseMapInfo<MemOpKey> {
using PtrInfo = DenseMapInfo<const MachineOperand *>;
@@ -295,8 +295,8 @@
/// Replace debug value MI with a new debug value instruction using register
/// VReg with an appropriate offset and DIExpression to incorporate the
/// address displacement AddrDispShift. Return new debug value instruction.
- MachineInstr *replaceDebugValue(MachineInstr &MI, unsigned VReg,
- int64_t AddrDispShift);
+ MachineInstr *replaceDebugValue(MachineInstr &MI, unsigned OldReg,
+ unsigned NewReg, int64_t AddrDispShift);
/// Removes LEAs which calculate similar addresses.
bool removeRedundantLEAs(MemOpMap &LEAs);
@@ -576,21 +576,50 @@
}
MachineInstr *X86OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
- unsigned VReg,
+ unsigned OldReg,
+ unsigned NewReg,
int64_t AddrDispShift) {
const DIExpression *Expr = MI.getDebugExpression();
- if (AddrDispShift != 0)
- Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift);
+ if (AddrDispShift != 0) {
+ if (MI.isNonListDebugValue()) {
+ Expr =
+ DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift);
+ } else {
+ // Update the Expression, appending an offset of `AddrDispShift` to the
+ // Op corresponding to `OldReg`.
+ SmallVector<uint64_t, 3> Ops;
+ DIExpression::appendOffset(Ops, AddrDispShift);
+ for (MachineOperand &Op : MI.getDebugOperandsForReg(OldReg)) {
+ unsigned OpIdx = MI.getDebugOperandIndex(&Op);
+ Expr = DIExpression::appendOpsToArg(Expr, Ops, OpIdx);
+ }
+ }
+ }
// Replace DBG_VALUE instruction with modified version.
MachineBasicBlock *MBB = MI.getParent();
DebugLoc DL = MI.getDebugLoc();
bool IsIndirect = MI.isIndirectDebugValue();
const MDNode *Var = MI.getDebugVariable();
+ unsigned Opcode = MI.isNonListDebugValue() ? TargetOpcode::DBG_VALUE
+ : TargetOpcode::DBG_VALUE_LIST;
if (IsIndirect)
- assert(MI.getOperand(1).getImm() == 0 && "DBG_VALUE with nonzero offset");
- return BuildMI(*MBB, MBB->erase(&MI), DL, TII->get(TargetOpcode::DBG_VALUE),
- IsIndirect, VReg, Var, Expr);
+ assert(MI.getDebugOffset().getImm() == 0 &&
+ "DBG_VALUE with nonzero offset");
+ SmallVector<MachineOperand, 4> NewOps;
+ // If we encounter an operand using the old register, replace it with an
+ // operand that uses the new register; otherwise keep the old operand.
+ auto replaceOldReg = [OldReg, NewReg](const MachineOperand &Op) {
+ if (Op.isReg() && Op.getReg() == OldReg)
+ return MachineOperand::CreateReg(NewReg, false, false, false, false,
+ false, false, false, false, 0,
+ /*IsRenamable*/ true);
+ return Op;
+ };
+ for (const MachineOperand &Op : MI.debug_operands())
+ NewOps.push_back(replaceOldReg(Op));
+ return BuildMI(*MBB, MBB->erase(&MI), DL, TII->get(Opcode), IsIndirect,
+ NewOps, Var, Expr);
}
// Try to find similar LEAs in the list and replace one with another.
@@ -635,7 +664,7 @@
// Replace DBG_VALUE instruction with modified version using the
// register from the replacing LEA and the address displacement
// between the LEA instructions.
- replaceDebugValue(MI, FirstVReg, AddrDispShift);
+ replaceDebugValue(MI, LastVReg, FirstVReg, AddrDispShift);
continue;
}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp b/src/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp
index ec81b07..e10dab7 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -222,7 +222,7 @@
void PadShortFunc::addPadding(MachineBasicBlock *MBB,
MachineBasicBlock::iterator &MBBI,
unsigned int NOOPsToAdd) {
- DebugLoc DL = MBBI->getDebugLoc();
+ const DebugLoc &DL = MBBI->getDebugLoc();
unsigned IssueWidth = TSM.getIssueWidth();
for (unsigned i = 0, e = IssueWidth * NOOPsToAdd; i != e; ++i)
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td b/src/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td
index 833013f..3844667 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td
@@ -233,3 +233,16 @@
];
}
def : PfmCountersBinding<"znver2", ZnVer2PfmCounters>;
+
+def ZnVer3PfmCounters : ProcPfmCounters {
+ let CycleCounter = PfmCounter<"cycles_not_in_halt">;
+ let UopsCounter = PfmCounter<"retired_ops">;
+ let IssueCounters = [
+ PfmIssueCounter<"Zn3Int", "ops_type_dispatched_from_decoder:int_disp_retire_mode">,
+ PfmIssueCounter<"Zn3FPU", "ops_type_dispatched_from_decoder:fp_disp_retire_mode">,
+ PfmIssueCounter<"Zn3Load", "ls_dispatch:ld_dispatch">,
+ PfmIssueCounter<"Zn3Store", "ls_dispatch:store_dispatch">,
+ PfmIssueCounter<"Zn3Divider", "div_op_count">
+ ];
+}
+def : PfmCountersBinding<"znver3", ZnVer3PfmCounters>;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86PreAMXConfig.cpp b/src/llvm-project/llvm/lib/Target/X86/X86PreAMXConfig.cpp
new file mode 100644
index 0000000..d9c6d08
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/X86/X86PreAMXConfig.cpp
@@ -0,0 +1,423 @@
+//===- Target/X86/X86PreAMXConfig.cpp - ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// Insert a tilecfg for each key AMX intrinsic area.
+/// All of a key AMX intrinsic's tile operands must come from tileloads, and
+/// the tile defined by a key AMX intrinsic must be tilestored.
+/// Take tdpbssd for example:
+/// --------------------------------------------------------------------------
+/// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(...) key
+/// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(...) |
+/// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(...) amx
+/// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(t1, t2, t3) |
+/// call void @llvm.x86.tilestored64.internal(... td) area
+/// --------------------------------------------------------------------------
+/// This pass will insert a tilecfg before every key AMX area, something like:
+/// --------------------------------------------------------------------------
+/// %cfgmem = alloca <16 x i32>, align 4 * allocate mem
+/// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem * zero init
+/// ...
+/// ... pre-config shape of %t1 *
+/// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 *
+/// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config
+/// ... *
+/// ... pre-config shape of %t2 * shapes
+/// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 *
+/// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 *
+/// ...
+/// call void @llvm.x86.ldtilecfg(i8* %cfgmem) * tile config
+//
+//===----------------------------------------------------------------------===//
+//
+#include "X86.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "pre-amx-config"
+
+static bool isAMXIntrinsic(IntrinsicInst *II) {
+ for (Value *Operand : II->operands())
+ if (Operand->getType()->isX86_AMXTy())
+ return true;
+ return II->getType()->isX86_AMXTy();
+}
+
+static bool isTileLoad(IntrinsicInst *II) {
+ return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal ||
+ II->getIntrinsicID() == Intrinsic::x86_tileloaddt164_internal;
+}
+
+static bool isTileStore(IntrinsicInst *II) {
+ return II->getIntrinsicID() == Intrinsic::x86_tilestored64_internal;
+}
+
+#ifndef NDEBUG
+static bool onlyTileDef(IntrinsicInst *II) {
+ for (Value *Operand : II->operands())
+ if (Operand->getType()->isX86_AMXTy())
+ return false;
+ return II->getType()->isX86_AMXTy();
+}
+
+static bool brokenVolatile(Instruction *I) {
+  // TODO: this is a weak way to identify a normal call here.
+ if ((isa<CallInst>(I) && !isa<IntrinsicInst>(I)) || I->isTerminator())
+ return true;
+ return false;
+}
+#endif
+
+namespace {
+class X86PreAMXConfig {
+ Function &F;
+
+public:
+ X86PreAMXConfig(Function &Func) : F(Func) {}
+ bool preTileConfig();
+ bool addTileConfig(Instruction *ModelStart, SmallVector<Value *, 8> &Shapes);
+ bool findConfigShapes(
+ DenseMap<Instruction *, SmallVector<Value *, 8>> &PosAndShapes);
+ bool getKeyAMXShapes(IntrinsicInst *KeyAMX, SmallVector<Value *, 8> &Shapes);
+ bool preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
+ SmallVector<Value *, 8> &Shapes);
+ BasicBlock::iterator
+ getShapesAndConfigPosEnd(BasicBlock::iterator Iter,
+ SmallVector<Value *, 8> &Shapes);
+ bool checkVolatileModel(SmallSet<Value *, 4> &Loads, IntrinsicInst *Store,
+ IntrinsicInst *KeyAMX);
+};
+
+// Write the shapes into tilecfg's memory in order. This may not be right,
+// because the first shape may not correspond to the first tmm register, so we
+// need to handle it in X86FastTileConfig::materializeTileCfg() after register
+// allocation.
+// For example:
+// --------------------------------------------------------------------------
+// zeroinitialize tilecfg's mem (of ldtilecfg)
+// --------------------------------------------------------------------------
+// ... pre-config shape of %t1 *
+// %amx.tmm.0.shape.row = getelementptr i8, i8* %mem, i64 48 *
+// %amx.tmm.0.shape.col = getelementptr i16, i16* %mem, i64 16 *
+// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 *
+// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config
+// ... *
+// ... pre-config shape of %t2 *
+// %amx.tmm.1.shape.row = getelementptr i8, i8* %mem, i64 49 *
+// %amx.tmm.1.shape.col = getelementptr i16, i16* %mem, i64 18 *
+// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * shapes
+// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 *
+// ... *
+// ... pre-config shape of %t3 * of
+// %amx.tmm.2.shape.row = getelementptr i8, i8* %mem, i64 50 *
+// %amx.tmm.2.shape.col = getelementptr i16, i16* %mem, i64 20 *
+// store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1 *
+// store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2 *
+// ... * tiles
+// ... pre-config shape of %td *
+// %amx.tmm.3.shape.row = getelementptr i8, i8* %mem, i64 51 *
+// %amx.tmm.3.shape.col = getelementptr i16, i16* %mem, i64 22 *
+// store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1 *
+// store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2 *
+// --------------------------------------------------------------------------
+// call void @llvm.x86.ldtilecfg(i8* %mem) * tile config
+// --------------------------------------------------------------------------
+// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key
+// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
+// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx
+// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
+// call void @llvm.x86.tilestored64.internal(... td) area
+// --------------------------------------------------------------------------
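+//
+// A compact view of the ldtilecfg memory layout implied by the offsets used
+// below (the config buffer is 64 bytes; only the fields written here are
+// listed):
+//   byte  0        : palette (set to 1)
+//   bytes 16 + 2*i : 16-bit column count of tile i
+//   bytes 48 + i   : 8-bit row count of tile i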
+bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
+ SmallVector<Value *, 8> &Shapes) {
+ bool Write = false;
+ LLVMContext &Ctx = Pos->getParent()->getContext();
+ Type *I8Ty = Type::getInt8Ty(Ctx);
+ Type *I16Ty = Type::getInt16Ty(Ctx);
+
+  // TODO: Currently we set Palette = 1 by default; it may be assigned a
+  // different value in the future.
+ Value *PaletteOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 0);
+ Value *PaletteValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
+ Value *PalettePos =
+ GetElementPtrInst::Create(I8Ty, I8Ptr, PaletteOffset, "", Pos);
+ new StoreInst(PaletteValue, PalettePos, Pos);
+
+ for (int I = 0, E = Shapes.size() / 2; I < E; I++) {
+ Value *RowOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 48 + I);
+ Value *ColOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 16 + I * 2);
+ const std::string ShapeName = "amx.tmm." + itostr(I);
+ Value *RowPos = GetElementPtrInst::Create(I8Ty, I8Ptr, RowOffset,
+ ShapeName + ".shape.row", Pos);
+ Value *ColPos = GetElementPtrInst::Create(I8Ty, I8Ptr, ColOffset, "", Pos);
+ ColPos = new BitCastInst(ColPos, PointerType::get(I16Ty, 0),
+ ShapeName + ".shape.col", Pos);
+ Value *Row = Shapes[I * 2];
+ Value *Col = Shapes[I * 2 + 1];
+ Row = new TruncInst(Row, I8Ty, "", Pos);
+ new StoreInst(Row, RowPos, Pos);
+ new StoreInst(Col, ColPos, Pos);
+ Write = true;
+ }
+ return Write;
+}
+
+bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart,
+ SmallVector<Value *, 8> &Shapes) {
+ Module *M = F.getParent();
+ IRBuilder<> Builder(ModelStart);
+ const DataLayout &DL = M->getDataLayout();
+ unsigned AddrSpace = DL.getAllocaAddrSpace();
+ LLVMContext &Ctx = Builder.getContext();
+ Type *V512Ty = VectorType::get(Builder.getInt32Ty(), 16, false);
+ Align Alignment = DL.getPrefTypeAlign(Type::getInt32Ty(Ctx));
+
+ AllocaInst *Addr =
+ new AllocaInst(V512Ty, AddrSpace, "", &F.getEntryBlock().front());
+ Addr->setAlignment(Alignment);
+ Value *I8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy());
+
+ std::array<Value *, 1> Args = {I8Ptr};
+ Instruction *Cfg =
+ Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, None, Args);
+
+ Value *Val0 = Constant::getNullValue(V512Ty);
+ Instruction *Init0 = new StoreInst(Val0, Addr, false, Alignment, Cfg);
+  assert(Init0 && "Failed to zero-initialize the cfg mem!");
+
+ preWriteTileCfg(I8Ptr, Cfg, Shapes);
+
+ return Init0;
+}
+
+// Todo: We may need to handle "more than one store" case in the future.
+bool X86PreAMXConfig::checkVolatileModel(SmallSet<Value *, 4> &Loads,
+ IntrinsicInst *Store,
+ IntrinsicInst *KeyAMX) {
+ Value *ST = Store->getOperand(4);
+
+  // Only tileloads and a tilestore are present.
+ if (!KeyAMX)
+ return (Loads.size() == 1) && Loads.contains(ST);
+
+ // All Loads should be operands of KeyAMX.
+ // All tile operands of KeyAMX should come from Loads.
+ for (Value *Op : KeyAMX->operands()) {
+ if (Op->getType()->isX86_AMXTy())
+ if (!Loads.erase(Op))
+ return false;
+ }
+
+ // The def of KeyAMX should be stored into mem.
+  // TODO: can the key AMX intrinsic have no def?
+ return Loads.empty() && (ST == cast<Value>(KeyAMX));
+}
+
+bool X86PreAMXConfig::getKeyAMXShapes(IntrinsicInst *KeyAMX,
+ SmallVector<Value *, 8> &Shapes) {
+ for (unsigned I = 0; I < KeyAMX->getNumOperands(); I++) {
+ Value *Op = KeyAMX->getOperand(I);
+ if (!Op->getType()->isX86_AMXTy())
+ continue;
+ IntrinsicInst *TileDef = dyn_cast<IntrinsicInst>(Op);
+ assert((TileDef && isTileLoad(TileDef)) &&
+           "All of KeyAMX's tile definitions should come from TileLoad!");
+ Shapes.push_back(TileDef->getOperand(0));
+ Shapes.push_back(TileDef->getOperand(1));
+ }
+ if (!isTileStore(KeyAMX)) {
+ Shapes.push_back(KeyAMX->getOperand(0));
+ Shapes.push_back(KeyAMX->getOperand(1));
+ }
+ return Shapes.size() != 0;
+}
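+
+// For illustration (a hedged sketch of the usual tdpbssd pattern): given
+//   %td = call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, %t1, %t2, %t3)
+// where %t1, %t2 and %t3 are tile loads of shapes (m,k), (k,n) and (m,n),
+// getKeyAMXShapes() collects (m,k), (k,n), (m,n) from the loads and, since
+// tdpbssd is not a tile store, appends the def shape (m,n) of %td itself.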
+
+// Collect the shapes and skip the area of the current key AMX intrinsic.
+//
+// For example:
+// ...
+// --------------------------------------------------------------------------
+// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) record (m,k)
+// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) record (k,n)
+// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) record (m,n)
+// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3)
+// call void @llvm.x86.tilestored64.internal(m, n,... td) <--PosEnd record (m,n)
+// --------------------------------------------------------------------------
+BasicBlock::iterator
+X86PreAMXConfig::getShapesAndConfigPosEnd(BasicBlock::iterator Iter,
+ SmallVector<Value *, 8> &Shapes) {
+ IntrinsicInst *KeyAMX = nullptr;
+ BasicBlock *BB = Iter->getParent();
+ BasicBlock::iterator PosEnd = BB->end();
+ SmallSet<Value *, 4> Loads;
+
+  // Treat the TileStore as the "Config Position End" and check the volatile model.
+ for (auto I = Iter, E = BB->end(); I != E; ++I) {
+ assert(!brokenVolatile(&*I) && "Not reach tile store!");
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I);
+ if (!II || !isAMXIntrinsic(II))
+ continue;
+
+ if (isTileLoad(II)) {
+ Loads.insert(II);
+ } else if (isTileStore(II)) {
+ if (!checkVolatileModel(Loads, II, KeyAMX))
+ report_fatal_error("Not Volatile AMX Model!");
+ PosEnd = I;
+ break;
+ } else {
+      assert(!KeyAMX && "Too many key AMX intrinsics!");
+ KeyAMX = II;
+ }
+ }
+  assert(PosEnd != BB->end() && "TileStore not found!");
+
+  // Treat the TileStore as the KeyAMX if there are only TileLoads and a TileStore.
+ if (!KeyAMX)
+ KeyAMX = dyn_cast<IntrinsicInst>(&*PosEnd);
+
+ // Get Shapes in order.
+ assert(Shapes.empty() && "Shapes should be clean.");
+ getKeyAMXShapes(KeyAMX, Shapes);
+
+ return PosEnd;
+}
+
+// Record a key amx area's shapes with its position.
+// Use the first tileload as its position.
+// For example:
+// ...
+// --------------------------------------------------------------------------
+// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) <-- pos
+// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) /
+// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) shapes:
+// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3) (m,k)(k,n)
+// call void @llvm.x86.tilestored64.internal(m, n,... td) (m,n)(m,n)
+// --------------------------------------------------------------------------
+bool X86PreAMXConfig::findConfigShapes(
+ DenseMap<Instruction *, SmallVector<Value *, 8>> &PosAndShapes) {
+ bool Find = false;
+ for (BasicBlock &BB : F) {
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I);
+ if (!II)
+ continue;
+ if (!isAMXIntrinsic(II))
+ continue;
+ assert(onlyTileDef(II) && "Not volatile model for AMX at O0!");
+
+ I = getShapesAndConfigPosEnd(I, PosAndShapes[&*I]);
+ Find = true;
+ }
+ }
+ return Find;
+}
+
+// Insert ldtilecfg and preconfig the shapes for each area of key AMX intrinsic.
+// e.g. (key amx = tdpbssd)
+// --------------------------------------------------------------------------
+// %cfgmem = alloca <16 x i32>, align 4 * allocate mem
+// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem * zero init
+// ...
+// ... pre-config shape of %t1 *
+// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 *
+// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config
+// ... *
+// ... pre-config shape of %t2 *
+// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * shapes
+// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 *
+// ... *
+// ... pre-config shape of %t3 * of
+// store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1 *
+// store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2 *
+// ... * tiles
+// ... pre-config shape of %td *
+// store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1 *
+// store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2 *
+//
+// call void @llvm.x86.ldtilecfg(i8* %cfgmem) * pre-config
+// --------------------------------------------------------------------------
+// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key
+// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
+// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx
+// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
+// call void @llvm.x86.tilestored64.internal(... td) area
+// --------------------------------------------------------------------------
+bool X86PreAMXConfig::preTileConfig() {
+ DenseMap<Instruction *, SmallVector<Value *, 8>> PosAndShapes;
+ bool NeedCfg = findConfigShapes(PosAndShapes);
+ if (!NeedCfg)
+ return false;
+ for (auto &IPAndShapes : PosAndShapes)
+ addTileConfig(IPAndShapes.first, IPAndShapes.second);
+
+ return true;
+}
+} // anonymous namespace
+
+namespace {
+
+class X86PreAMXConfigPass : public FunctionPass {
+public:
+ static char ID;
+
+ X86PreAMXConfigPass() : FunctionPass(ID) {
+ initializeX86PreAMXConfigPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ bool C = false;
+
+ // Prepare for fast register allocation at O0.
+ if (TM->getOptLevel() == CodeGenOpt::None) {
+
+ // We pre-config each key AMX intrinsic at O0.
+ // In theory, one tile config can cover several AMX intrinsics, but
+      // it is very difficult to classify the tile shapes at O0. So here we
+      // keep things simple and pre-config every key AMX intrinsic.
+ X86PreAMXConfig PCFG(F);
+ C = PCFG.preTileConfig();
+ }
+
+ return C;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetPassConfig>();
+ }
+};
+
+} // anonymous namespace
+
+static const char PassName[] = "Pre AMX Tile Config";
+char X86PreAMXConfigPass::ID = 0;
+INITIALIZE_PASS_BEGIN(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, false)
+
+FunctionPass *llvm::createX86PreAMXConfigPass() {
+ return new X86PreAMXConfigPass();
+}
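+
+// Roughly how a pass like this is wired into the X86 IR pipeline (the real
+// hook lives in X86TargetMachine.cpp and is not part of this hunk; treat the
+// snippet as a sketch, not as the exact upstream code):
+//
+//   void X86PassConfig::addIRPasses() {
+//     ...
+//     if (TM->getOptLevel() == CodeGenOpt::None)
+//       addPass(createX86PreAMXConfigPass());   // pre-config only at O0
+//     ...
+//   }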
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp b/src/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp
index 05ee6c6..b85a0b6 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -1,4 +1,4 @@
-//===-- X86PreTileConfig.cpp - Tile Register Configure---------------------===//
+//===-- X86PreTileConfig.cpp - Tile Register Pre-configure-----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,31 +6,20 @@
//
//===----------------------------------------------------------------------===//
//
-/// \file Pass to pre-config the shape of AMX register
-/// AMX register need to be configured before use. The shape of AMX register
-/// is encoded in the 1st and 2nd machine operand of AMX pseudo instructions.
-/// The pldtilecfg is to config tile registers. It should dominator all AMX
-/// instructions. The pldtilecfg produce a virtual cfg register and the cfg
-/// register is used by all AMX instructions.
-/// This pass is to find the common dominator of all AMX instructions and
-/// insert the pldtilecfg instruction. Besides the cfg register that pldtilecfg
-/// produces is inserted as the last operand of each AMX instruction. We use
-/// this scheme to model the def-use relationship between AMX config instruction
-/// and other AMX instructions. Below is an example.
+/// \file Pass to pre-config the shapes of AMX registers
+/// An AMX register needs to be configured before use. The shapes of an AMX
+/// register are encoded in the 1st and 2nd machine operands of AMX pseudo
+/// instructions.
///
-/// ----B1----
-/// / \
-/// / \
-/// B2 B3
-/// %1:tile = PTILELOADDV %2:tile = PTILELOADDV
+/// The instruction ldtilecfg is used to configure the shapes. It must be
+/// reachable by all variable shape definitions. ldtilecfg will be inserted
+/// more than once if we cannot find a dominating point for all AMX
+/// instructions.
///
-/// is transformed to
+/// The tile config register is caller-saved according to the ABI. We need to
+/// insert ldtilecfg again after a call instruction if the callee clobbers any
+/// AMX registers.
///
-/// B1
-/// %25:tilecfg = PLDTILECFG
-/// / \
-/// / \
-/// %1:tile = PTILELOADDV %25 %2:tile = PTILELOADDV %25
+/// This pass calculates all points where ldtilecfg needs to be inserted and
+/// inserts it there. It reports an error if the reachability conditions aren't
+/// met.
//
//===----------------------------------------------------------------------===//
@@ -38,32 +27,141 @@
#include "X86InstrBuilder.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/CodeGen/TileShapeInfo.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
#define DEBUG_TYPE "tile-pre-config"
+#define REPORT_CONFIG_FAIL \
+ report_fatal_error( \
+ MF.getName() + \
+ ": Failed to config tile register, please define the shape earlier");
namespace {
-class X86PreTileConfig : public MachineFunctionPass {
- // context
- MachineFunction *MF = nullptr;
- const X86Subtarget *ST = nullptr;
- const TargetRegisterInfo *TRI;
- const TargetInstrInfo *TII;
- MachineDominatorTree *DomTree = nullptr;
- MachineRegisterInfo *MRI = nullptr;
+struct MIRef {
+ MachineInstr *MI = nullptr;
+ MachineBasicBlock *MBB = nullptr;
+  // A virtual position for an instruction that would be inserted after MI.
+ size_t Pos = 0;
+ MIRef() = default;
+ MIRef(MachineBasicBlock *MBB) : MBB(MBB) {
+ for (auto I = MBB->begin(), E = MBB->end(); I != E && I->isPHI();
+ ++I, ++Pos)
+ MI = &*I;
+ }
+ MIRef(MachineInstr *MI)
+ : MI(MI), MBB(MI->getParent()),
+ Pos(std::distance(MBB->instr_begin(), ++MI->getIterator())) {}
+ MIRef(MachineInstr *MI, MachineBasicBlock *MBB)
+ : MI(MI), MBB(MBB),
+ Pos(std::distance(MBB->instr_begin(), ++MI->getIterator())) {}
+ MIRef(MachineInstr *MI, MachineBasicBlock *MBB, size_t Pos)
+ : MI(MI), MBB(MBB), Pos(Pos) {}
+ operator bool() const { return MBB != nullptr; }
+ bool operator==(const MIRef &RHS) const {
+ return MI == RHS.MI && MBB == RHS.MBB;
+ }
+ bool operator!=(const MIRef &RHS) const { return !(*this == RHS); }
+ bool operator<(const MIRef &RHS) const {
+    // Comparison between different BBs happens when inserting a MIRef into a
+    // set. So we compare MBB first to keep the ordering well-defined.
+ return MBB < RHS.MBB || (MBB == RHS.MBB && Pos < RHS.Pos);
+ }
+ bool operator>(const MIRef &RHS) const {
+    // Comparison between different BBs happens when inserting a MIRef into a
+    // set. So we compare MBB first to keep the ordering well-defined.
+ return MBB > RHS.MBB || (MBB == RHS.MBB && Pos > RHS.Pos);
+ }
+};
- MachineInstr *getTileConfigPoint();
+struct BBInfo {
+ MIRef FirstAMX;
+ MIRef LastCall;
+ bool HasAMXRegLiveIn = false;
+ bool TileCfgForbidden = false;
+ bool NeedTileCfgLiveIn = false;
+};
+
+class X86PreTileConfig : public MachineFunctionPass {
+ MachineRegisterInfo *MRI;
+ const MachineLoopInfo *MLI;
+ SmallSet<MachineInstr *, 8> DefVisited;
+ DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;
+ DenseMap<MachineBasicBlock *, SmallVector<MIRef, 8>> ShapeBBs;
+
+ /// Check if the callee will clobber AMX registers.
+ bool isDestructiveCall(MachineInstr &MI, BitVector UsableRegs) {
+ auto Iter = llvm::find_if(
+ MI.operands(), [](MachineOperand &MO) { return MO.isRegMask(); });
+ if (Iter == MI.operands_end())
+ return false;
+ UsableRegs.clearBitsInMask(Iter->getRegMask());
+ return !UsableRegs.none();
+ }
+
+  /// Check if MI is an AMX pseudo instruction.
+ bool isAMXInstruction(MachineInstr &MI) {
+ if (MI.isPHI() || MI.isDebugInstr() || MI.getNumOperands() < 3)
+ return false;
+ MachineOperand &MO = MI.getOperand(0);
+    // We can simply check if it is an AMX instruction by its def.
+    // But we should exclude the old API, which uses physical registers.
+ if (MO.isReg() && MO.getReg().isVirtual() &&
+ MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) {
+ collectShapeInfo(MI);
+ return true;
+ }
+    // PTILESTOREDV is the only exception that doesn't def an AMX register.
+ return MI.getOpcode() == X86::PTILESTOREDV;
+ }
+
+ /// Check if it is an edge from loop bottom to loop head.
+ bool isLoopBackEdge(MachineBasicBlock *Header, MachineBasicBlock *Bottom) {
+ if (!MLI->isLoopHeader(Header))
+ return false;
+ auto *ML = MLI->getLoopFor(Header);
+ if (ML->contains(Bottom) && ML->isLoopLatch(Bottom))
+ return true;
+
+ return false;
+ }
+
+ /// Collect the shape def information for later use.
+ void collectShapeInfo(MachineInstr &MI);
+
+  /// Try to hoist shapes defined below AMX instructions.
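+  ///
+  /// For illustration (a hedged MIR-style sketch, not taken from this patch):
+  ///   %t1 = PTILELOADDV %r1, %c1, ...   ; FirstAMX in the block
+  ///   %c2 = COPY %k                     ; shape def below FirstAMX
+  ///   %t2 = PTILELOADDV %r2, %c2, ...
+  /// The COPY is hoisted above the first PTILELOADDV so a single ldtilecfg can
+  /// be placed after all shape defs; hoisting is refused if a shape def may
+  /// load/store memory or if one of its sources is itself defined below
+  /// FirstAMX.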
+ bool hoistShapesInBB(MachineBasicBlock *MBB, SmallVectorImpl<MIRef> &Shapes) {
+ MIRef &FirstAMX = BBVisitedInfo[MBB].FirstAMX;
+ auto FirstShapeBelowAMX = llvm::lower_bound(Shapes, FirstAMX);
+ auto InsertPoint = FirstAMX.MI->getIterator();
+ for (auto I = FirstShapeBelowAMX, E = Shapes.end(); I != E; ++I) {
+ // Do not hoist instructions that access memory.
+ if (I->MI->mayLoadOrStore())
+ return false;
+ for (auto &MO : I->MI->operands()) {
+ if (MO.isDef())
+ continue;
+        // Do not hoist an instruction if any of its sources is defined below
+        // the first AMX instruction.
+ // TODO: We can handle isMoveImmediate MI here.
+ if (MO.isReg() && MIRef(MRI->getVRegDef(MO.getReg())) > FirstAMX)
+ return false;
+ // TODO: Maybe need more checks here.
+ }
+ MBB->insert(InsertPoint, I->MI->removeFromParent());
+ }
+ // We only need to mark the last shape in the BB now.
+ Shapes.clear();
+ Shapes.push_back(MIRef(&*--InsertPoint, MBB));
+ return true;
+ }
public:
X86PreTileConfig() : MachineFunctionPass(ID) {}
@@ -74,10 +172,21 @@
}
/// X86PreTileConfig analysis usage.
- void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
- /// Perform register allocation.
- bool runOnMachineFunction(MachineFunction &mf) override;
+ /// Clear MF related structures.
+ void releaseMemory() override {
+ ShapeBBs.clear();
+ DefVisited.clear();
+ BBVisitedInfo.clear();
+ }
+
+ /// Perform ldtilecfg instructions inserting.
+ bool runOnMachineFunction(MachineFunction &MF) override;
static char ID;
};
@@ -87,176 +196,206 @@
char X86PreTileConfig::ID = 0;
INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
- "Tile Register Configure", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+ "Tile Register Pre-configure", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
- "Tile Register Configure", false, false)
+ "Tile Register Pre-configure", false, false)
-void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- AU.addRequired<MachineDominatorTree>();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
+void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
+ auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) {
+ MIRef MIR(MI, MBB);
+ auto I = llvm::lower_bound(ShapeBBs[MBB], MIR);
+ if (I == ShapeBBs[MBB].end() || *I != MIR)
+ ShapeBBs[MBB].insert(I, MIR);
+ };
-static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
- const TargetInstrInfo *TII,
- MachineRegisterInfo *MRI,
- const X86Subtarget *ST) {
- auto *MBB = MI->getParent();
-
- // FIXME: AMX should assume AVX512 enabled.
- if (ST->hasAVX512()) {
- // Zero stack slot.
- Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
- BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
- .addReg(Zmm, RegState::Undef)
- .addReg(Zmm, RegState::Undef);
- addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
- FrameIdx)
- .addReg(Zmm);
- }
-
- // build psuedo ldtilecfg
- Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
-
- addFrameReference(
- BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
-
- return VReg;
-}
-
-static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- default:
- llvm_unreachable("Unexpected machine instruction on tile");
- case X86::PTILELOADDV:
- case X86::PTDPBSSDV:
- case X86::PTILEZEROV:
- MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
- MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
- ShapeT Shape(&MO1, &MO2, MRI);
- return Shape;
+ SmallVector<Register, 8> WorkList(
+ {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()});
+ while (!WorkList.empty()) {
+ Register R = WorkList.pop_back_val();
+ MachineInstr *DefMI = MRI->getVRegDef(R);
+    assert(DefMI && "R must have a defining instruction");
+ MachineBasicBlock *DefMBB = DefMI->getParent();
+ if (DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second)
+ continue;
+ if (DefMI->isPHI()) {
+ for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2)
+ if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB()))
+ RecordShape(DefMI, DefMBB); // In this case, PHI is also a shape def.
+ else
+ WorkList.push_back(DefMI->getOperand(I).getReg());
+ } else {
+ RecordShape(DefMI, DefMBB);
+ }
}
}
-MachineInstr *X86PreTileConfig::getTileConfigPoint() {
- DenseMap<Register, ShapeT> PhysShapeInfo;
- MachineBasicBlock *MBB = nullptr;
- DenseSet<const MachineInstr *> MIs;
- for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- Register VirtReg = Register::index2VirtReg(i);
- if (MRI->reg_nodbg_empty(VirtReg))
- continue;
- const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
- if (RC.getID() != X86::TILERegClassID)
- continue;
+bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ const TargetInstrInfo *TII = ST.getInstrInfo();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
- // Find the common dominator for all MI that define tile register.
- for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
- if (MO.isUndef())
- continue;
- const auto *MI = MO.getParent();
- // PHI or IMPLICIT_DEF instructiion.
- // There must be a input tile before PHI instruction.
- if (MI->isTransient())
- continue;
- if (!MBB)
- MBB = const_cast<MachineBasicBlock *>(MI->getParent());
- MBB = DomTree->findNearestCommonDominator(
- MBB, const_cast<MachineBasicBlock *>(MI->getParent()));
+ BitVector AMXRegs(TRI->getNumRegs());
+ for (unsigned I = 0; I < RC->getNumRegs(); I++)
+ AMXRegs.set(X86::TMM0 + I);
- // Collect the instructions that define shape.
- ShapeT Shape = getShape(*MI, MRI);
- std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(),
- Shape.getCol()};
- for (auto *ShapeMO : ShapeMOs) {
- Register ShapeReg = ShapeMO->getReg();
- for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) {
- const auto *ShapeMI = MO.getParent();
- MIs.insert(ShapeMI);
+ // Iterate MF to collect information.
+ MRI = &MF.getRegInfo();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ SmallSet<MIRef, 8> CfgNeedInsert;
+ SmallVector<MachineBasicBlock *, 8> CfgLiveInBBs;
+ for (auto &MBB : MF) {
+ size_t Pos = 0;
+ for (auto &MI : MBB) {
+ ++Pos;
+ if (isAMXInstruction(MI)) {
+        // If there's a call before the AMX, we need to reload the tile config.
+ if (BBVisitedInfo[&MBB].LastCall)
+ CfgNeedInsert.insert(BBVisitedInfo[&MBB].LastCall);
+ else // Otherwise, we need tile config to live in this BB.
+ BBVisitedInfo[&MBB].NeedTileCfgLiveIn = true;
+        // Always record the first AMX in case there's a shape def after it.
+ if (!BBVisitedInfo[&MBB].FirstAMX)
+ BBVisitedInfo[&MBB].FirstAMX = MIRef(&MI, &MBB, Pos);
+ } else if (MI.isCall() && isDestructiveCall(MI, AMXRegs)) {
+        // Record the call only if the callee clobbers any AMX registers.
+ BBVisitedInfo[&MBB].LastCall = MIRef(&MI, &MBB, Pos);
+ }
+ }
+ if (BBVisitedInfo[&MBB].NeedTileCfgLiveIn) {
+ if (&MBB == &MF.front())
+ CfgNeedInsert.insert(MIRef(&MBB));
+ else
+ CfgLiveInBBs.push_back(&MBB);
+ }
+ if (BBVisitedInfo[&MBB].FirstAMX || BBVisitedInfo[&MBB].HasAMXRegLiveIn)
+ for (auto *Succ : MBB.successors())
+ if (!isLoopBackEdge(Succ, &MBB))
+ BBVisitedInfo[Succ].HasAMXRegLiveIn = true;
+ }
+
+ // Update NeedTileCfgLiveIn for predecessors.
+ while (!CfgLiveInBBs.empty()) {
+ MachineBasicBlock *MBB = CfgLiveInBBs.pop_back_val();
+ for (auto *Pred : MBB->predecessors()) {
+ if (BBVisitedInfo[Pred].LastCall) {
+ CfgNeedInsert.insert(BBVisitedInfo[Pred].LastCall);
+ } else if (!BBVisitedInfo[Pred].NeedTileCfgLiveIn) {
+ BBVisitedInfo[Pred].NeedTileCfgLiveIn = true;
+ if (Pred == &MF.front())
+ CfgNeedInsert.insert(MIRef(Pred));
+ else
+ CfgLiveInBBs.push_back(Pred);
+ }
+ }
+ }
+
+  // There's no AMX instruction if we didn't find a tile config live-in point.
+ if (CfgNeedInsert.empty())
+ return false;
+
+  // Avoid inserting ldtilecfg before any shape defs.
+ SmallVector<MachineBasicBlock *, 8> WorkList;
+ for (auto &I : ShapeBBs) {
+ // TODO: We can hoist shapes across BBs here.
+ if (BBVisitedInfo[I.first].HasAMXRegLiveIn)
+ REPORT_CONFIG_FAIL
+ if (BBVisitedInfo[I.first].FirstAMX &&
+ BBVisitedInfo[I.first].FirstAMX < I.second.back() &&
+ !hoistShapesInBB(I.first, I.second))
+ REPORT_CONFIG_FAIL
+ WorkList.push_back(I.first);
+ }
+ while (!WorkList.empty()) {
+ MachineBasicBlock *MBB = WorkList.pop_back_val();
+ for (auto *Pred : MBB->predecessors()) {
+ if (!BBVisitedInfo[Pred].TileCfgForbidden && !isLoopBackEdge(MBB, Pred)) {
+ BBVisitedInfo[Pred].TileCfgForbidden = true;
+ WorkList.push_back(Pred);
+ }
+ }
+ }
+
+ DebugLoc DL;
+ SmallSet<MIRef, 8> VisitedOrInserted;
+ int SS = MF.getFrameInfo().CreateStackObject(
+ ST.getTileConfigSize(), ST.getTileConfigAlignment(), false);
+
+  // Try to insert ldtilecfg at the tile config live-in points.
+ for (auto I : CfgNeedInsert) {
+ SmallSet<MIRef, 8> InsertPoints;
+ SmallVector<MIRef, 8> WorkList({I});
+ while (!WorkList.empty()) {
+ MIRef I = WorkList.pop_back_val();
+ if (!VisitedOrInserted.count(I)) {
+ if (!BBVisitedInfo[I.MBB].TileCfgForbidden) {
+          // If all shapes are reachable in this BB, stop sinking and try to insert.
+ InsertPoints.insert(I);
+ } else {
+          // Avoid visiting the BB multiple times.
+ VisitedOrInserted.insert(I);
+          // Sink the insertion point along the chain of successors with
+          // NeedTileCfgLiveIn = true when not all shapes are reachable in MBB.
+ for (auto *Succ : I.MBB->successors())
+ if (BBVisitedInfo[Succ].NeedTileCfgLiveIn)
+ WorkList.push_back(MIRef(Succ));
}
}
}
- }
- if (!MBB)
- return nullptr;
- // This pass is before the pass of eliminating PHI node, so it
- // is in SSA form.
- assert(MRI->isSSA() && "Not SSA form in pre-tile config");
- // Shape def should dominate tile config MBB.
- // def s s1 s2
- // / \ \ /
- // / \ \ /
- // conf s3=phi(s1,s2)
- // |
- // c
- //
- for (const auto *MI : MIs) {
- const MachineBasicBlock *ShapeMBB = MI->getParent();
- if (DomTree->dominates(ShapeMBB, MBB))
- continue;
- if (MI->isMoveImmediate())
- continue;
- report_fatal_error(MF->getName() + ": Failed to config tile register, "
- "please define the shape earlier");
- }
- // ldtilecfg should be inserted after the MI that define the shape.
- MachineBasicBlock::reverse_instr_iterator I, E;
- for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) {
- auto *MI = &*I;
- if (MIs.count(MI) && (!MI->isMoveImmediate()))
- break;
- }
- MachineBasicBlock::iterator MII;
- if (I == E)
- MII = MBB->getFirstNonPHI();
- else {
- MII = MachineBasicBlock::iterator(&*I);
- MII++;
- }
- return &*MII;
-}
-
-static void addTileCFGUse(MachineFunction &MF, Register CFG) {
- for (MachineBasicBlock &MBB : MF) {
-
- // Traverse the basic block.
- for (MachineInstr &MI : MBB) {
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- default:
- break;
- case X86::PTILELOADDV:
- case X86::PTILESTOREDV:
- case X86::PTDPBSSDV:
- case X86::PTILEZEROV:
- unsigned NumOperands = MI.getNumOperands();
- MI.RemoveOperand(NumOperands - 1);
- MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
- break;
+    // A given point might be forked because the shape conditions are not met.
+ for (MIRef I : InsertPoints) {
+ // Make sure we insert ldtilecfg after the last shape def in MBB.
+ if (ShapeBBs.count(I.MBB) && I < ShapeBBs[I.MBB].back())
+ I = ShapeBBs[I.MBB].back();
+      // There's a chance the MBB is sunk more than once. Record it to avoid
+      // inserting multiple times.
+ if (VisitedOrInserted.insert(I).second) {
+ auto II = I.MI ? I.MI->getIterator() : I.MBB->instr_begin();
+ addFrameReference(BuildMI(*I.MBB, ++II, DL, TII->get(X86::LDTILECFG)),
+ SS);
}
}
}
-}
-bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
- MF = &mf;
- MRI = &mf.getRegInfo();
- ST = &mf.getSubtarget<X86Subtarget>();
- TRI = ST->getRegisterInfo();
- TII = mf.getSubtarget().getInstrInfo();
- DomTree = &getAnalysis<MachineDominatorTree>();
+ // Zero stack slot.
+ MachineBasicBlock &MBB = MF.front();
+ MachineInstr *MI = &*MBB.begin();
+ if (ST.hasAVX512()) {
+ Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
+ BuildMI(MBB, MI, DL, TII->get(X86::VPXORDZrr), Zmm)
+ .addReg(Zmm, RegState::Undef)
+ .addReg(Zmm, RegState::Undef);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), SS)
+ .addReg(Zmm);
+ } else if (ST.hasAVX2()) {
+ Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass);
+ BuildMI(MBB, MI, DL, TII->get(X86::VPXORYrr), Ymm)
+ .addReg(Ymm, RegState::Undef)
+ .addReg(Ymm, RegState::Undef);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS)
+ .addReg(Ymm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS, 32)
+ .addReg(Ymm);
+ } else {
+ assert(ST.hasSSE2() && "AMX should assume SSE2 enabled");
+ Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass);
+ BuildMI(MBB, MI, DL, TII->get(X86::PXORrr), Xmm)
+ .addReg(Xmm, RegState::Undef)
+ .addReg(Xmm, RegState::Undef);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS)
+ .addReg(Xmm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 16)
+ .addReg(Xmm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 32)
+ .addReg(Xmm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 48)
+ .addReg(Xmm);
+ }
+ // Fill in the palette first.
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV8mi)), SS).addImm(1);
- MachineInstr *MI = getTileConfigPoint();
- if (!MI)
- return false;
- unsigned Size = ST->getTileConfigSize();
- Align Alignment = ST->getTileConfigAlignment();
- int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
- Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
- addTileCFGUse(mf, CFG);
return true;
}
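+
+// Roughly, the entry block then begins with (a hedged sketch of the AVX-512
+// path; the exact frame operands depend on the stack slot SS created above):
+//   vpxord  %zmm0, %zmm0, %zmm0     ; zero a 64-byte temporary
+//   vmovups %zmm0, <SS>(%rsp)       ; zero-initialize the config memory
+//   movb    $1, <SS>(%rsp)          ; palette = 1
+// and an ldtilecfg <SS>(%rsp) is emitted at every computed insert point,
+// after the last shape def in that block.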
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp b/src/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp
index d90b4e7..c474842 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -62,7 +62,7 @@
// This matches the simplified 32-bit pointer code in the data layout
// computation.
// FIXME: Should use the data layout?
- bool Use64BitReg = TT.getEnvironment() != Triple::GNUX32;
+ bool Use64BitReg = !TT.isX32();
StackPtr = Use64BitReg ? X86::RSP : X86::ESP;
FramePtr = Use64BitReg ? X86::RBP : X86::EBP;
BasePtr = Use64BitReg ? X86::RBX : X86::EBX;
@@ -290,6 +290,11 @@
if (MF->getFunction().hasFnAttribute("no_caller_saved_registers"))
CC = CallingConv::X86_INTR;
+  // If the attribute is specified, override the CSRs normally specified by the
+ // calling convention and use the empty set instead.
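+  // For example (a hedged IR-level illustration): a function defined as
+  //   define void @f() "no_callee_saved_registers" { ... }
+  // saves and restores no registers itself, so its callers generally have to
+  // assume every register may be clobbered across the call.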
+ if (MF->getFunction().hasFnAttribute("no_callee_saved_registers"))
+ return CSR_NoRegs_SaveList;
+
switch (CC) {
case CallingConv::GHC:
case CallingConv::HiPE:
@@ -349,6 +354,10 @@
if (!HasSSE)
return CSR_Win64_NoSSE_SaveList;
return CSR_Win64_SaveList;
+ case CallingConv::SwiftTail:
+ if (!Is64Bit)
+ return CSR_32_SaveList;
+ return IsWin64 ? CSR_Win64_SwiftTail_SaveList : CSR_64_SwiftTail_SaveList;
case CallingConv::X86_64_SysV:
if (CallsEHReturn)
return CSR_64EHRet_SaveList;
@@ -465,6 +474,10 @@
break;
case CallingConv::Win64:
return CSR_Win64_RegMask;
+ case CallingConv::SwiftTail:
+ if (!Is64Bit)
+ return CSR_32_RegMask;
+ return IsWin64 ? CSR_Win64_SwiftTail_RegMask : CSR_64_SwiftTail_RegMask;
case CallingConv::X86_64_SysV:
return CSR_64_RegMask;
case CallingConv::X86_INTR:
@@ -497,6 +510,7 @@
F.getAttributes().hasAttrSomewhere(Attribute::SwiftError);
if (IsSwiftCC)
return IsWin64 ? CSR_Win64_SwiftError_RegMask : CSR_64_SwiftError_RegMask;
+
return IsWin64 ? CSR_Win64_RegMask : CSR_64_RegMask;
}
@@ -643,7 +657,7 @@
// can't address variables from the stack pointer. MS inline asm can
// reference locals while also adjusting the stack pointer. When we can't
// use both the SP and the FP, we need a separate base pointer register.
- bool CantUseFP = needsStackRealignment(MF);
+ bool CantUseFP = hasStackRealignment(MF);
return CantUseFP && CantUseSP(MFI);
}
@@ -723,8 +737,8 @@
int FIOffset;
Register BasePtr;
if (MI.isReturn()) {
- assert((!needsStackRealignment(MF) ||
- MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) &&
+ assert((!hasStackRealignment(MF) ||
+ MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) &&
"Return instruction can only reference SP relative frame objects");
FIOffset =
TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0).getFixed();
@@ -870,10 +884,21 @@
default:
llvm_unreachable("Unexpected machine instruction on tile register!");
break;
+ case X86::COPY: {
+ Register SrcReg = MI->getOperand(1).getReg();
+ ShapeT Shape = getTileShape(SrcReg, VRM, MRI);
+ VRM->assignVirt2Shape(VirtReg, Shape);
+ return Shape;
+ }
// We only collect the tile shape that is defined.
case X86::PTILELOADDV:
+ case X86::PTILELOADDT1V:
case X86::PTDPBSSDV:
+ case X86::PTDPBSUDV:
+ case X86::PTDPBUSDV:
+ case X86::PTDPBUUDV:
case X86::PTILEZEROV:
+ case X86::PTDPBF16PSV:
MachineOperand &MO1 = MI->getOperand(1);
MachineOperand &MO2 = MI->getOperand(2);
ShapeT Shape(&MO1, &MO2, MRI);
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td b/src/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td
index 75cbd4e..1ab9d25 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -61,7 +61,7 @@
def BH : X86Reg<"bh", 7>;
// X86-64 only, requires REX.
-let CostPerUse = 1 in {
+let CostPerUse = [1] in {
def SIL : X86Reg<"sil", 6>;
def DIL : X86Reg<"dil", 7>;
def BPL : X86Reg<"bpl", 5>;
@@ -126,7 +126,7 @@
def IP : X86Reg<"ip", 0>;
// X86-64 only, requires REX.
-let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CostPerUse = 1,
+let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CostPerUse = [1],
CoveredBySubRegs = 1 in {
def R8W : X86Reg<"r8w", 8, [R8B,R8BH]>;
def R9W : X86Reg<"r9w", 9, [R9B,R9BH]>;
@@ -152,7 +152,7 @@
}
// X86-64 only, requires REX
-let SubRegIndices = [sub_16bit, sub_16bit_hi], CostPerUse = 1,
+let SubRegIndices = [sub_16bit, sub_16bit_hi], CostPerUse = [1],
CoveredBySubRegs = 1 in {
def R8D : X86Reg<"r8d", 8, [R8W,R8WH]>;
def R9D : X86Reg<"r9d", 9, [R9W,R9WH]>;
@@ -176,7 +176,7 @@
def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>;
// These also require REX.
-let CostPerUse = 1 in {
+let CostPerUse = [1] in {
def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>;
def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>;
def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>;
@@ -219,7 +219,7 @@
def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>;
// X86-64 only
-let CostPerUse = 1 in {
+let CostPerUse = [1] in {
def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>;
def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>;
def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>;
@@ -639,8 +639,3 @@
let CopyCost = -1 in // Don't allow copying of tile registers
def TILE : RegisterClass<"X86", [x86amx], 8192,
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
-def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
- let CopyCost = -1; // Don't allow copying of tile config registers.
- let isAllocatable = 1;
- let Size = 512;
-}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td b/src/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td
index 4aea7bc..d2ced1c 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -206,6 +206,10 @@
defm : X86WriteRes<WriteStoreNT, [BWPort237, BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteMove, [BWPort0156], 1, [1], 1>;
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
def : WriteRes<WriteZero, []>;
@@ -582,6 +586,7 @@
defm : BWWriteResPair<WriteFShuffle256, [BWPort5], 3, [1], 1, 6>; // Fp 256-bit width vector shuffles.
defm : BWWriteResPair<WriteFVarShuffle256, [BWPort5], 3, [1], 1, 6>; // Fp 256-bit width vector variable shuffles.
defm : BWWriteResPair<WriteShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteVPMOV256, [BWPort5], 3, [1], 1, 6>; // 256-bit width packed vector width-changing move.
defm : BWWriteResPair<WriteVarShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector variable shuffles.
// Old microcoded instructions that nobody use.
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td b/src/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td
index 746dbae..99fddcd 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -125,6 +125,10 @@
defm : X86WriteRes<WriteMove, [HWPort0156], 1, [1], 1>;
def : WriteRes<WriteZero, []>;
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
// Arithmetic.
defm : HWWriteResPair<WriteALU, [HWPort0156], 1>;
defm : HWWriteResPair<WriteADC, [HWPort06, HWPort0156], 2, [1,1], 2>;
@@ -432,6 +436,7 @@
defm : HWWriteResPair<WriteBlendY, [HWPort5], 1, [1], 1, 7>;
defm : HWWriteResPair<WriteBlendZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
defm : HWWriteResPair<WriteShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteVPMOV256, [HWPort5], 3, [1], 1, 7>;
defm : HWWriteResPair<WriteVarShuffle256, [HWPort5], 3, [1], 1, 7>;
defm : HWWriteResPair<WriteVarBlend, [HWPort5], 2, [2], 2, 6>;
defm : HWWriteResPair<WriteVarBlendY, [HWPort5], 2, [2], 2, 7>;
@@ -777,7 +782,7 @@
def : InstRW<[HWWriteP1], (instregex "TST_F")>;
// FXAM.
-def : InstRW<[HWWrite2P1], (instrs FXAM)>;
+def : InstRW<[HWWrite2P1], (instrs XAM_F)>;
// FPREM.
def HWWriteFPREM : SchedWriteRes<[]> {
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td b/src/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td
index ac32f1b..2f7157f 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -112,6 +112,7 @@
def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 5; }
def : WriteRes<WriteMove, [SBPort015]>;
def : WriteRes<WriteZero, []>;
+def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 5; let NumMicroOps = 0; }
// Arithmetic.
defm : SBWriteResPair<WriteALU, [SBPort015], 1>;
@@ -573,6 +574,7 @@
defm : SBWriteResPair<WriteFShuffle256, [SBPort5], 1, [1], 1, 7>;
defm : SBWriteResPair<WriteFVarShuffle256, [SBPort5], 1, [1], 1, 7>;
defm : SBWriteResPair<WriteShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVPMOV256, [SBPort5], 1, [1], 1, 7>;
defm : SBWriteResPair<WriteVarShuffle256, [SBPort5], 1, [1], 1, 7>;
defm : SBWriteResPair<WriteFMA, [SBPort01], 5>;
defm : SBWriteResPair<WriteFMAX, [SBPort01], 5>;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/src/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 0599564..8486bdd 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -203,6 +203,10 @@
defm : X86WriteRes<WriteStoreNT, [SKLPort237, SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteMove, [SKLPort0156], 1, [1], 1>;
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
def : WriteRes<WriteZero, []>;
@@ -582,6 +586,7 @@
defm : SKLWriteResPair<WriteFShuffle256, [SKLPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles.
defm : SKLWriteResPair<WriteFVarShuffle256, [SKLPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles.
defm : SKLWriteResPair<WriteShuffle256, [SKLPort5], 3, [1], 1, 7>; // 256-bit width vector shuffles.
+defm : SKLWriteResPair<WriteVPMOV256, [SKLPort5], 3, [1], 1, 7>; // 256-bit width packed vector width-changing move.
defm : SKLWriteResPair<WriteVarShuffle256, [SKLPort5], 3, [1], 1, 7>; // 256-bit width vector variable shuffles.
// Old microcoded instructions that nobody use.
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/src/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 7fc96d1..ba80d47 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -203,6 +203,10 @@
defm : X86WriteRes<WriteStoreNT, [SKXPort237, SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteMove, [SKXPort0156], 1, [1], 1>;
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
def : WriteRes<WriteZero, []>;
@@ -583,6 +587,7 @@
defm : SKXWriteResPair<WriteFShuffle256, [SKXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles.
defm : SKXWriteResPair<WriteFVarShuffle256, [SKXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles.
defm : SKXWriteResPair<WriteShuffle256, [SKXPort5], 3, [1], 1, 7>; // 256-bit width vector shuffles.
+defm : SKXWriteResPair<WriteVPMOV256, [SKXPort5], 3, [1], 1, 7>; // 256-bit width packed vector width-changing move.
defm : SKXWriteResPair<WriteVarShuffle256, [SKXPort5], 3, [1], 1, 7>; // 256-bit width vector variable shuffles.
// Old microcoded instructions that nobody use.
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86Schedule.td b/src/llvm-project/llvm/lib/Target/X86/X86Schedule.td
index f204d66..09148fc 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86Schedule.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86Schedule.td
@@ -125,6 +125,7 @@
def WriteStore : SchedWrite;
def WriteStoreNT : SchedWrite;
def WriteMove : SchedWrite;
+def WriteVecMaskedGatherWriteback : SchedWrite;
def WriteCopy : WriteSequence<[WriteLoad, WriteStore]>; // mem->mem copy
// Arithmetic.
@@ -488,6 +489,7 @@
defm WriteFShuffle256 : X86SchedWritePair<ReadAfterVecYLd>; // Fp 256-bit width vector shuffles.
defm WriteFVarShuffle256 : X86SchedWritePair<ReadAfterVecYLd>; // Fp 256-bit width variable shuffles.
defm WriteShuffle256 : X86SchedWritePair<ReadAfterVecYLd>; // 256-bit width vector shuffles.
+defm WriteVPMOV256 : X86SchedWritePair<ReadAfterVecYLd>; // 256-bit width packed vector width-changing move.
defm WriteVarShuffle256 : X86SchedWritePair<ReadAfterVecYLd>; // 256-bit width vector variable shuffles.
defm WriteVarVecShift : X86SchedWritePair<ReadAfterVecXLd>; // Variable vector shifts.
defm WriteVarVecShiftY : X86SchedWritePair<ReadAfterVecYLd>; // Variable vector shifts (YMM).
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td
index b90baf6..d00c2e3 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -37,6 +37,7 @@
def AtomPort1 : ProcResource<1>; // ALU: ALU1, bit processing, jump, and LEA
// SIMD/FP: SIMD ALU, FP Adder
+// NOTE: This is for ops that can use EITHER port, not for ops that require BOTH ports.
def AtomPort01 : ProcResGroup<[AtomPort0, AtomPort1]>;
// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
@@ -48,9 +49,6 @@
def : ReadAdvance<ReadInt2Fpu, 0>;
-// Many SchedWrites are defined in pairs with and without a folded load.
-// Instructions with folded loads are usually micro-fused, so they only appear
-// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass AtomWriteResPair<X86FoldableSchedWrite SchedRW,
@@ -59,14 +57,13 @@
int RRLat = 1, int RMLat = 1,
list<int> RRRes = [1],
list<int> RMRes = [1]> {
- // Register variant is using a single cycle on ExePort.
+ // Register variant.
def : WriteRes<SchedRW, RRPorts> {
let Latency = RRLat;
let ResourceCycles = RRRes;
}
- // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
- // latency.
+ // Memory variant.
def : WriteRes<SchedRW.Folded, RMPorts> {
let Latency = RMLat;
let ResourceCycles = RMRes;
@@ -168,6 +165,7 @@
def : WriteRes<WriteStore, [AtomPort0]>;
def : WriteRes<WriteStoreNT, [AtomPort0]>;
def : WriteRes<WriteMove, [AtomPort01]>;
+defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
@@ -227,30 +225,30 @@
defm : X86WriteRes<WriteEMMS, [AtomPort01], 5, [5], 1>;
-defm : AtomWriteResPair<WriteFAdd, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
-defm : AtomWriteResPair<WriteFAddX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFAdd, [AtomPort1], [AtomPort0,AtomPort1], 5, 5, [1], [1,1]>;
+defm : AtomWriteResPair<WriteFAddX, [AtomPort1], [AtomPort0,AtomPort1], 5, 5, [1], [1,1]>;
defm : X86WriteResPairUnsupported<WriteFAddY>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
-defm : AtomWriteResPair<WriteFAdd64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
-defm : AtomWriteResPair<WriteFAdd64X, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteFAdd64, [AtomPort1], [AtomPort0,AtomPort1], 5, 5, [1], [1,1]>;
+defm : AtomWriteResPair<WriteFAdd64X, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>;
defm : X86WriteResPairUnsupported<WriteFAdd64Y>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
-defm : AtomWriteResPair<WriteFCmp, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
-defm : AtomWriteResPair<WriteFCmpX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFCmp, [AtomPort1], [AtomPort0,AtomPort1], 5, 5, [1], [1,1]>;
+defm : AtomWriteResPair<WriteFCmpX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>;
defm : X86WriteResPairUnsupported<WriteFCmpY>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
-defm : AtomWriteResPair<WriteFCmp64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
-defm : AtomWriteResPair<WriteFCmp64X, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteFCmp64, [AtomPort1], [AtomPort0,AtomPort1], 5, 5, [1], [1,1]>;
+defm : AtomWriteResPair<WriteFCmp64X, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>;
defm : X86WriteResPairUnsupported<WriteFCmp64Y>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : AtomWriteResPair<WriteFCom, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
defm : AtomWriteResPair<WriteFComX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
-defm : AtomWriteResPair<WriteFMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
-defm : AtomWriteResPair<WriteFMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFMul, [AtomPort0], [AtomPort0], 4, 4, [2], [2]>;
+defm : AtomWriteResPair<WriteFMulX, [AtomPort0], [AtomPort0], 5, 5, [2], [2]>;
defm : X86WriteResPairUnsupported<WriteFMulY>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
-defm : AtomWriteResPair<WriteFMul64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
-defm : AtomWriteResPair<WriteFMul64X, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>;
+defm : AtomWriteResPair<WriteFMul64, [AtomPort0], [AtomPort0], 5, 5, [2], [2]>;
+defm : AtomWriteResPair<WriteFMul64X, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 9, 10, [9,9], [10,10]>;
defm : X86WriteResPairUnsupported<WriteFMul64Y>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
defm : AtomWriteResPair<WriteFRcp, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
@@ -315,30 +313,30 @@
// Conversions.
////////////////////////////////////////////////////////////////////////////////
-defm : AtomWriteResPair<WriteCvtSS2I, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
-defm : AtomWriteResPair<WriteCvtPS2I, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtSS2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 8, 9, [7,7], [6,6]>;
+defm : AtomWriteResPair<WriteCvtPS2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IY>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
-defm : AtomWriteResPair<WriteCvtSD2I, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
-defm : AtomWriteResPair<WriteCvtPD2I, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : AtomWriteResPair<WriteCvtSD2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 8, 9, [7,7], [6,6]>;
+defm : AtomWriteResPair<WriteCvtPD2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7]>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IY>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
-defm : AtomWriteResPair<WriteCvtI2SS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
-defm : AtomWriteResPair<WriteCvtI2PS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtI2SS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>;
+defm : AtomWriteResPair<WriteCvtI2PS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSY>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
-defm : AtomWriteResPair<WriteCvtI2SD, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
-defm : AtomWriteResPair<WriteCvtI2PD, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : AtomWriteResPair<WriteCvtI2SD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>;
+defm : AtomWriteResPair<WriteCvtI2PD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7]>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDY>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
-defm : AtomWriteResPair<WriteCvtSS2SD, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
-defm : AtomWriteResPair<WriteCvtPS2PD, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : AtomWriteResPair<WriteCvtSS2SD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>;
+defm : AtomWriteResPair<WriteCvtPS2PD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7]>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDY>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
-defm : AtomWriteResPair<WriteCvtSD2SS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
-defm : AtomWriteResPair<WriteCvtPD2PS, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : AtomWriteResPair<WriteCvtSD2SS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>;
+defm : AtomWriteResPair<WriteCvtPD2PS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7]>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSY>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
@@ -388,19 +386,19 @@
defm : AtomWriteResPair<WriteVecLogicX, [AtomPort01], [AtomPort0], 1, 1>;
defm : X86WriteResPairUnsupported<WriteVecLogicY>;
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
-defm : AtomWriteResPair<WriteVecTest, [AtomPort01], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteVecTest>;
defm : X86WriteResPairUnsupported<WriteVecTestY>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
-defm : AtomWriteResPair<WriteVecShift, [AtomPort01], [AtomPort01], 2, 3, [2], [3]>;
-defm : AtomWriteResPair<WriteVecShiftX, [AtomPort01], [AtomPort01], 2, 3, [2], [3]>;
+defm : AtomWriteResPair<WriteVecShift, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 2, 3, [1,1], [2,2]>;
+defm : AtomWriteResPair<WriteVecShiftX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 2, 3, [1,1], [2,2]>;
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
-defm : AtomWriteResPair<WriteVecShiftImm, [AtomPort01], [AtomPort01], 1, 1, [1], [1]>;
-defm : AtomWriteResPair<WriteVecShiftImmX, [AtomPort01], [AtomPort01], 1, 1, [1], [1]>;
+defm : AtomWriteResPair<WriteVecShiftImm, [AtomPort0], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteVecShiftImmX, [AtomPort0], [AtomPort0], 1, 1>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
-defm : AtomWriteResPair<WriteVecIMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
-defm : AtomWriteResPair<WriteVecIMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteVecIMul, [AtomPort0], [AtomPort0], 4, 4, [1], [1]>;
+defm : AtomWriteResPair<WriteVecIMulX, [AtomPort0], [AtomPort0], 5, 5, [2], [2]>;
defm : X86WriteResPairUnsupported<WriteVecIMulY>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
defm : X86WriteResPairUnsupported<WritePMULLD>;
@@ -410,8 +408,8 @@
defm : X86WriteResPairUnsupported<WriteMPSAD>;
defm : X86WriteResPairUnsupported<WriteMPSADY>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
-defm : AtomWriteResPair<WritePSADBW, [AtomPort01], [AtomPort01], 4, 4, [4], [4]>;
-defm : AtomWriteResPair<WritePSADBWX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WritePSADBW, [AtomPort0], [AtomPort0], 4, 4, [1], [1]>;
+defm : AtomWriteResPair<WritePSADBWX, [AtomPort0], [AtomPort0], 5, 5, [2], [2]>;
defm : X86WriteResPairUnsupported<WritePSADBWY>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : AtomWriteResPair<WriteShuffle, [AtomPort0], [AtomPort0], 1, 1>;
@@ -419,7 +417,7 @@
defm : X86WriteResPairUnsupported<WriteShuffleY>;
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
defm : AtomWriteResPair<WriteVarShuffle, [AtomPort0], [AtomPort0], 1, 1>;
-defm : AtomWriteResPair<WriteVarShuffleX, [AtomPort01], [AtomPort01], 4, 5, [4], [5]>;
+defm : AtomWriteResPair<WriteVarShuffleX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 4, 5, [3,3], [4,4]>;
defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
defm : X86WriteResPairUnsupported<WriteBlend>;
@@ -429,6 +427,7 @@
defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
defm : X86WriteResPairUnsupported<WriteShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVPMOV256>;
defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
defm : X86WriteResPairUnsupported<WriteVarVecShift>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
@@ -500,7 +499,7 @@
let Latency = 1;
let ResourceCycles = [1];
}
-def : InstRW<[AtomWrite0_1], (instrs FXAM, LD_Frr,
+def : InstRW<[AtomWrite0_1], (instrs XAM_F, LD_Frr,
MOVSX64rr32)>;
def : SchedAlias<WriteALURMW, AtomWrite0_1>;
def : SchedAlias<WriteADCRMW, AtomWrite0_1>;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td
index 0a201bc..99d4011 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -269,6 +269,7 @@
def : WriteRes<WriteStore, [PdStore]>;
def : WriteRes<WriteStoreNT, [PdStore]>;
def : WriteRes<WriteMove, [PdEX01]> { let ResourceCycles = [2]; }
+defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
// Load/store MXCSR.
// FIXME: These are copy and pasted from WriteLoad/Store.
@@ -1196,6 +1197,7 @@
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
defm : PdWriteResXMMPair<WriteShuffle256, [PdFPU01, PdFPMAL]>;
+defm : PdWriteResXMMPair<WriteVPMOV256, [PdFPU01, PdFPMAL]>;
defm : PdWriteResXMMPair<WriteVarShuffle256, [PdFPU01, PdFPMAL]>;
defm : PdWriteResXMMPair<WriteVarVecShift, [PdFPU01, PdFPMAL], 3, [1, 2]>;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index 13b6eed..cdd0383 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -273,6 +273,7 @@
def : WriteRes<WriteStore, [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
def : WriteRes<WriteMove, [JALU01]>;
+defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
// Load/store MXCSR.
def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
@@ -733,6 +734,7 @@
defm : JWriteResYMMPair<WriteVecTestY, [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
defm : X86WriteResPairUnsupported<WriteShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVPMOV256>;
defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
////////////////////////////////////////////////////////////////////////////////
@@ -833,8 +835,8 @@
let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
let NumMicroOps = 63;
}
-def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
- VMASKMOVDQU, VMASKMOVDQU64)>;
+def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, MASKMOVDQUX32,
+ VMASKMOVDQU, VMASKMOVDQU64, VMASKMOVDQUX32)>;
///////////////////////////////////////////////////////////////////////////////
// SchedWriteVariant definitions.
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td
index 3d53ef1..123844a 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -88,6 +88,7 @@
def : WriteRes<WriteLoad, [SLM_MEC_RSV]> { let Latency = 3; }
def : WriteRes<WriteMove, [SLM_IEC_RSV01]>;
def : WriteRes<WriteZero, []>;
+defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
// Load/store MXCSR.
// FIXME: These are probably wrong. They are copy pasted from WriteStore/Load.
@@ -110,6 +111,7 @@
defm : SLMWriteResPair<WriteIMul64, [SLM_IEC_RSV1], 3>;
defm : SLMWriteResPair<WriteIMul64Imm, [SLM_IEC_RSV1], 3>;
defm : SLMWriteResPair<WriteIMul64Reg, [SLM_IEC_RSV1], 3>;
+def : WriteRes<WriteIMulH, [SLM_FPC_RSV0]>;
defm : X86WriteRes<WriteBSWAP32, [SLM_IEC_RSV01], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64, [SLM_IEC_RSV01], 1, [1], 1>;
@@ -223,6 +225,10 @@
defm : SLMWriteResPair<WriteFMul64X, [SLM_FPC_RSV0, SLMFPMultiplier], 7, [1,4]>;
defm : SLMWriteResPair<WriteFMul64Y, [SLM_FPC_RSV0, SLMFPMultiplier], 7, [1,4]>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : X86WriteResPairUnsupported<WriteFMA>;
+defm : X86WriteResPairUnsupported<WriteFMAX>;
+defm : X86WriteResPairUnsupported<WriteFMAY>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
defm : SLMWriteResPair<WriteFDiv, [SLM_FPC_RSV0, SLMFPDivider], 19, [1,17]>;
defm : SLMWriteResPair<WriteFDivX, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>;
defm : SLMWriteResPair<WriteFDivY, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>;
@@ -269,35 +275,53 @@
defm : SLMWriteResPair<WriteFVarShuffleY,[SLM_FPC_RSV0], 1>;
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : SLMWriteResPair<WriteFBlend, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteFBlendY>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 4, [4], 3>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : X86WriteResPairUnsupported<WriteFShuffle256>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
// Conversion between integer and float.
-defm : SLMWriteResPair<WriteCvtSS2I, [SLM_FPC_RSV01], 4>;
-defm : SLMWriteResPair<WriteCvtPS2I, [SLM_FPC_RSV01], 4>;
-defm : SLMWriteResPair<WriteCvtPS2IY, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtSS2I, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteCvtPS2I, [SLM_FPC_RSV0], 5, [2]>;
+defm : SLMWriteResPair<WriteCvtPS2IY, [SLM_FPC_RSV0], 5, [2]>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
-defm : SLMWriteResPair<WriteCvtSD2I, [SLM_FPC_RSV01], 4>;
-defm : SLMWriteResPair<WriteCvtPD2I, [SLM_FPC_RSV01], 4>;
-defm : SLMWriteResPair<WriteCvtPD2IY, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtSD2I, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteCvtPD2I, [SLM_FPC_RSV0], 5, [2]>;
+defm : SLMWriteResPair<WriteCvtPD2IY, [SLM_FPC_RSV0], 5, [2]>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
-defm : SLMWriteResPair<WriteCvtI2SS, [SLM_FPC_RSV01], 4>;
-defm : SLMWriteResPair<WriteCvtI2PS, [SLM_FPC_RSV01], 4>;
-defm : SLMWriteResPair<WriteCvtI2PSY, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2SS, [SLM_FPC_RSV0], 5, [2]>;
+defm : SLMWriteResPair<WriteCvtI2PS, [SLM_FPC_RSV0], 5, [2]>;
+defm : SLMWriteResPair<WriteCvtI2PSY, [SLM_FPC_RSV0], 5, [2]>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
-defm : SLMWriteResPair<WriteCvtI2SD, [SLM_FPC_RSV01], 4>;
-defm : SLMWriteResPair<WriteCvtI2PD, [SLM_FPC_RSV01], 4>;
-defm : SLMWriteResPair<WriteCvtI2PDY, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2SD, [SLM_FPC_RSV0], 5, [2]>;
+defm : SLMWriteResPair<WriteCvtI2PD, [SLM_FPC_RSV0], 5, [2]>;
+defm : SLMWriteResPair<WriteCvtI2PDY, [SLM_FPC_RSV0], 5, [2]>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
-defm : SLMWriteResPair<WriteCvtSS2SD, [SLM_FPC_RSV01], 4>;
-defm : SLMWriteResPair<WriteCvtPS2PD, [SLM_FPC_RSV01], 4>;
-defm : SLMWriteResPair<WriteCvtPS2PDY, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtSS2SD, [SLM_FPC_RSV0], 4, [2]>;
+defm : SLMWriteResPair<WriteCvtPS2PD, [SLM_FPC_RSV0], 5, [2]>;
+defm : SLMWriteResPair<WriteCvtPS2PDY, [SLM_FPC_RSV0], 5, [2]>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
-defm : SLMWriteResPair<WriteCvtSD2SS, [SLM_FPC_RSV01], 4>;
-defm : SLMWriteResPair<WriteCvtPD2PS, [SLM_FPC_RSV01], 4>;
-defm : SLMWriteResPair<WriteCvtPD2PSY, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtSD2SS, [SLM_FPC_RSV0], 4, [2]>;
+defm : SLMWriteResPair<WriteCvtPD2PS, [SLM_FPC_RSV0], 5, [2]>;
+defm : SLMWriteResPair<WriteCvtPD2PSY, [SLM_FPC_RSV0], 5, [2]>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PS>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+
+defm : X86WriteResUnsupported<WriteCvtPS2PH>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHY>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHYSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
// Vector integer operations.
def : WriteRes<WriteVecLoad, [SLM_MEC_RSV]> { let Latency = 3; }
def : WriteRes<WriteVecLoadX, [SLM_MEC_RSV]> { let Latency = 3; }
@@ -321,14 +345,18 @@
def : WriteRes<WriteVecMoveToGpr, [SLM_IEC_RSV01]>;
def : WriteRes<WriteVecMoveFromGpr, [SLM_IEC_RSV01]>;
-defm : SLMWriteResPair<WriteVecShift, [SLM_FPC_RSV0], 1>;
-defm : SLMWriteResPair<WriteVecShiftX, [SLM_FPC_RSV0], 1>;
-defm : SLMWriteResPair<WriteVecShiftY, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShift, [SLM_FPC_RSV0], 2, [2], 2>;
+defm : SLMWriteResPair<WriteVecShiftX, [SLM_FPC_RSV0], 2, [2], 2>;
+defm : SLMWriteResPair<WriteVecShiftY, [SLM_FPC_RSV0], 2, [2], 2>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : SLMWriteResPair<WriteVecShiftImm, [SLM_FPC_RSV0], 1>;
defm : SLMWriteResPair<WriteVecShiftImmX,[SLM_FPC_RSV0], 1>;
defm : SLMWriteResPair<WriteVecShiftImmY,[SLM_FPC_RSV0], 1>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : SLMWriteResPair<WriteVarVecShift, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
defm : SLMWriteResPair<WriteVecLogic, [SLM_FPC_RSV01], 1>;
defm : SLMWriteResPair<WriteVecLogicX,[SLM_FPC_RSV01], 1>;
defm : SLMWriteResPair<WriteVecLogicY,[SLM_FPC_RSV01], 1>;
@@ -341,8 +369,8 @@
defm : SLMWriteResPair<WriteVecALUY, [SLM_FPC_RSV01], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : SLMWriteResPair<WriteVecIMul, [SLM_FPC_RSV0], 4>;
-defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 4>;
-defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 5, [2], 2>;
+defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 5, [2], 2>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
// FIXME: The below is closer to correct, but caused some perf regressions.
//defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 11, [11], 7>;
@@ -354,12 +382,15 @@
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
defm : SLMWriteResPair<WriteShuffleX, [SLM_FPC_RSV0], 1>;
defm : SLMWriteResPair<WriteVarShuffle, [SLM_FPC_RSV0], 1>;
-defm : SLMWriteResPair<WriteVarShuffleX, [SLM_FPC_RSV0], 1>;
-defm : SLMWriteResPair<WriteVarShuffleY, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVarShuffleX, [SLM_FPC_RSV0], 5, [5], 4>;
+defm : SLMWriteResPair<WriteVarShuffleY, [SLM_FPC_RSV0], 5, [5], 4>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
defm : SLMWriteResPair<WriteBlend, [SLM_FPC_RSV0], 1>;
defm : SLMWriteResPair<WriteBlendY, [SLM_FPC_RSV0], 1>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
defm : SLMWriteResPair<WriteMPSAD, [SLM_FPC_RSV0], 7>;
defm : SLMWriteResPair<WriteMPSADY, [SLM_FPC_RSV0], 7>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
@@ -368,6 +399,9 @@
defm : SLMWriteResPair<WritePSADBWY, [SLM_FPC_RSV0], 4>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : SLMWriteResPair<WritePHMINPOS, [SLM_FPC_RSV0], 4>;
+defm : X86WriteResPairUnsupported<WriteShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVPMOV256>;
// Vector insert/extract operations.
defm : SLMWriteResPair<WriteVecInsert, [SLM_FPC_RSV0], 1>;
@@ -397,6 +431,7 @@
// Packed Compare Explicit Length Strings, Return Mask
defm : SLMWriteResPair<WritePCmpEStrM, [SLM_FPC_RSV0], 17, [17]>;
+
// Packed Compare Implicit Length Strings, Return Index
defm : SLMWriteResPair<WritePCmpIStrI, [SLM_FPC_RSV0], 17, [17]>;
@@ -422,39 +457,6 @@
def : WriteRes<WriteFence, [SLM_MEC_RSV]>;
def : WriteRes<WriteNop, []>;
-// AVX/FMA is not supported on that architecture, but we should define the basic
-// scheduling resources anyway.
-def : WriteRes<WriteIMulH, [SLM_FPC_RSV0]>;
-defm : X86WriteResPairUnsupported<WriteFBlendY>;
-defm : X86WriteResPairUnsupported<WriteFBlendZ>;
-defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 1>;
-defm : X86WriteResPairUnsupported<WriteVarBlendY>;
-defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
-defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 4, [4], 3>;
-defm : X86WriteResPairUnsupported<WriteFVarBlendY>;
-defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
-defm : X86WriteResPairUnsupported<WriteFShuffle256>;
-defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
-defm : X86WriteResPairUnsupported<WriteShuffle256>;
-defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
-defm : SLMWriteResPair<WriteVarVecShift, [SLM_FPC_RSV0], 1>;
-defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
-defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
-defm : X86WriteResPairUnsupported<WriteFMA>;
-defm : X86WriteResPairUnsupported<WriteFMAX>;
-defm : X86WriteResPairUnsupported<WriteFMAY>;
-defm : X86WriteResPairUnsupported<WriteFMAZ>;
-
-defm : X86WriteResPairUnsupported<WriteCvtPH2PS>;
-defm : X86WriteResPairUnsupported<WriteCvtPH2PSY>;
-defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
-defm : X86WriteResUnsupported<WriteCvtPS2PH>;
-defm : X86WriteResUnsupported<WriteCvtPS2PHY>;
-defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
-defm : X86WriteResUnsupported<WriteCvtPS2PHSt>;
-defm : X86WriteResUnsupported<WriteCvtPS2PHYSt>;
-defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
-
// Remaining SLM instrs.
def SLMWriteResGroup1rr : SchedWriteRes<[SLM_FPC_RSV01]> {
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td
index fe09d6f..12f8e7c 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -179,6 +179,10 @@
def : WriteRes<WriteMove, [ZnALU]>;
def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; }
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// It does not cost anything by itself; it only has latency, matching that of WriteLoad.
+def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 8; let NumMicroOps = 0; }
+
def : WriteRes<WriteZero, []>;
def : WriteRes<WriteLEA, [ZnALU]>;
defm : ZnWriteResPair<WriteALU, [ZnALU], 1>;
@@ -437,6 +441,7 @@
defm : ZnWriteResFpuPair<WriteBlendY, [ZnFPU01], 1>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU], 2>;
+defm : ZnWriteResFpuPair<WriteVPMOV256, [ZnFPU12], 1, [1], 2>;
defm : ZnWriteResFpuPair<WriteVarShuffle256, [ZnFPU], 2>;
defm : ZnWriteResFpuPair<WritePSADBW, [ZnFPU0], 3>;
defm : ZnWriteResFpuPair<WritePSADBWX, [ZnFPU0], 3>;
@@ -969,7 +974,7 @@
def : InstRW<[ZnWriteFPU0Lat1], (instregex "TST_F")>;
// FXAM.
-def : InstRW<[ZnWriteFPU3Lat1], (instrs FXAM)>;
+def : InstRW<[ZnWriteFPU3Lat1], (instrs XAM_F)>;
// FPREM.
def : InstRW<[WriteMicrocoded], (instrs FPREM)>;
@@ -1019,11 +1024,6 @@
MMX_PACKSSWBirm,
MMX_PACKUSWBirm)>;
-// VPMOVSX/ZX BW BD BQ WD WQ DQ.
-// y <- x.
-def : InstRW<[ZnWriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrr")>;
-def : InstRW<[ZnWriteFPU12Ym], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrm")>;
-
def ZnWriteFPU013 : SchedWriteRes<[ZnFPU013]> ;
def ZnWriteFPU013Y : SchedWriteRes<[ZnFPU013]> {
let Latency = 2;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td
index 48da0d6..5b4b151 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td
+++ b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td
@@ -178,6 +178,10 @@
def : WriteRes<WriteMove, [Zn2ALU]>;
def : WriteRes<WriteLoad, [Zn2AGU]> { let Latency = 8; }
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// It does not cost anything by itself; it only has latency, matching that of WriteLoad.
+def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 8; let NumMicroOps = 0; }
+
def : WriteRes<WriteZero, []>;
def : WriteRes<WriteLEA, [Zn2ALU]>;
defm : Zn2WriteResPair<WriteALU, [Zn2ALU], 1>;
@@ -419,6 +423,7 @@
defm : Zn2WriteResFpuPair<WriteBlendY, [Zn2FPU01], 1>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
defm : Zn2WriteResFpuPair<WriteShuffle256, [Zn2FPU], 2>;
+defm : Zn2WriteResFpuPair<WriteVPMOV256, [Zn2FPU12], 4, [1], 2, 4>;
defm : Zn2WriteResFpuPair<WriteVarShuffle256, [Zn2FPU], 2>;
defm : Zn2WriteResFpuPair<WritePSADBW, [Zn2FPU0], 3>;
defm : Zn2WriteResFpuPair<WritePSADBWX, [Zn2FPU0], 3>;
@@ -978,7 +983,7 @@
def : InstRW<[Zn2WriteFPU0Lat1], (instregex "TST_F")>;
// FXAM.
-def : InstRW<[Zn2WriteFPU3Lat1], (instrs FXAM)>;
+def : InstRW<[Zn2WriteFPU3Lat1], (instrs XAM_F)>;
// FPREM.
def : InstRW<[WriteMicrocoded], (instrs FPREM)>;
@@ -1029,11 +1034,6 @@
MMX_PACKSSWBirm,
MMX_PACKUSWBirm)>;
-// VPMOVSX/ZX BW BD BQ WD WQ DQ.
-// y <- x.
-def : InstRW<[Zn2WriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrr")>;
-def : InstRW<[Zn2WriteFPU12Ym], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrm")>;
-
def Zn2WriteFPU013 : SchedWriteRes<[Zn2FPU013]> ;
def Zn2WriteFPU013Y : SchedWriteRes<[Zn2FPU013]> ;
def Zn2WriteFPU013m : SchedWriteRes<[Zn2AGU, Zn2FPU013]> {
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver3.td b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver3.td
new file mode 100644
index 0000000..4a91a91
--- /dev/null
+++ b/src/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -0,0 +1,1670 @@
+//=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Znver3 to support instruction
+// scheduling and other instruction cost heuristics.
+// Based on:
+// * AMD Software Optimization Guide for AMD Family 19h Processors.
+// https://www.amd.com/system/files/TechDocs/56665.zip
+// * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
+// http://www.agner.org/optimize/microarchitecture.pdf
+// * AMD Zen 3 Ryzen Deep Dive Review
+// https://www.anandtech.com/show/16214/
+//===----------------------------------------------------------------------===//
+
+def Znver3Model : SchedMachineModel {
+ // AMD SOG 19h, 2.9.6 Dispatch
+ // The processor may dispatch up to 6 macro ops per cycle
+ // into the execution engine.
+ let IssueWidth = 6;
+ // AMD SOG 19h, 2.10.3
+ // The retire control unit (RCU) tracks the completion status of all
+ // outstanding operations (integer, load/store, and floating-point) and is
+ // the final arbiter for exception processing and recovery.
+ // The unit can receive up to 6 macro ops dispatched per cycle and track up
+ // to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode.
+ let MicroOpBufferSize = 256;
+ // AMD SOG 19h, 2.9.1 Op Cache
+ // The op cache is organized as an associative cache with 64 sets and 8 ways.
+ // At each set-way intersection is an entry containing up to 8 macro ops.
+ // The maximum capacity of the op cache is 4K ops.
+ // Agner, 22.5 µop cache
+ // The size of the µop cache is big enough for holding most critical loops.
+ // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity;
+ // with large values here the compilation of certain loops
+ // ends up taking way too long.
+ // let LoopMicroOpBufferSize = 4096;
+ let LoopMicroOpBufferSize = 512;
+ // AMD SOG 19h, 2.6.2 L1 Data Cache
+ // The L1 data cache has a 4- or 5-cycle integer load-to-use latency.
+ // AMD SOG 19h, 2.12 L1 Data Cache
+ // The AGU and LS pipelines are optimized for simple address generation modes.
+ // <...> and can achieve 4-cycle load-to-use integer load latency.
+ let LoadLatency = 4;
+ // AMD SOG 19h, 2.12 L1 Data Cache
+ // The AGU and LS pipelines are optimized for simple address generation modes.
+ // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
+ int VecLoadLatency = 7;
+ // Latency of a simple store operation.
+ int StoreLatency = 1;
+ // FIXME
+ let HighLatency = 25; // FIXME: any better choice?
+ // AMD SOG 19h, 2.8 Optimizing Branching
+ // The branch misprediction penalty is in the range from 11 to 18 cycles,
+ // <...>. The common case penalty is 13 cycles.
+ let MispredictPenalty = 13;
+
+ let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
+
+ let CompleteModel = 1;
+}
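+
+// Note that VecLoadLatency and StoreLatency above are declared with `int`
+// rather than `let`: they are not fields of the base SchedMachineModel, but
+// extra record fields on Znver3Model that the Zn3 helper multiclasses below
+// read as Znver3Model.VecLoadLatency and Znver3Model.StoreLatency.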
+
+let SchedModel = Znver3Model in {
+
+
+//===----------------------------------------------------------------------===//
+// RCU
+//===----------------------------------------------------------------------===//
+
+// AMD SOG 19h, 2.10.3 Retire Control Unit
+// The unit can receive up to 6 macro ops dispatched per cycle and track up to
+// 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
+// The retire unit handles in-order commit of up to eight macro ops per cycle.
+def Zn3RCU : RetireControlUnit<Znver3Model.MicroOpBufferSize, 8>;
+
+//===----------------------------------------------------------------------===//
+// Units
+//===----------------------------------------------------------------------===//
+
+// There are a total of three units, each one with its own schedulers.
+
+//===----------------------------------------------------------------------===//
+// Integer Execution Unit
+//
+
+// AMD SOG 19h, 2.4 Superscalar Organization
+// The processor uses four decoupled independent integer scheduler queues,
+// each one servicing one ALU pipeline and one or two other pipelines
+
+//
+// Execution pipes
+//===----------------------------------------------------------------------===//
+
+// AMD SOG 19h, 2.10.2 Execution Units
+// The processor contains 4 general purpose integer execution pipes.
+// Each pipe has an ALU capable of general purpose integer operations.
+def Zn3ALU0 : ProcResource<1>;
+def Zn3ALU1 : ProcResource<1>;
+def Zn3ALU2 : ProcResource<1>;
+def Zn3ALU3 : ProcResource<1>;
+
+// AMD SOG 19h, 2.10.2 Execution Units
+// There is also a separate branch execution unit.
+def Zn3BRU1 : ProcResource<1>;
+
+// AMD SOG 19h, 2.10.2 Execution Units
+// There are three Address Generation Units (AGUs) for all load and store
+// address generation. There are also 3 store data movement units
+// associated with the same schedulers as the AGUs.
+def Zn3AGU0 : ProcResource<1>;
+def Zn3AGU1 : ProcResource<1>;
+def Zn3AGU2 : ProcResource<1>;
+
+//
+// Execution Units
+//===----------------------------------------------------------------------===//
+
+// AMD SOG 19h, 2.10.2 Execution Units
+// ALU0 additionally has divide <...> execution capability.
+defvar Zn3Divider = Zn3ALU0;
+
+// AMD SOG 19h, 2.10.2 Execution Units
+// ALU0 additionally has <...> branch execution capability.
+defvar Zn3BRU0 = Zn3ALU0;
+
+// Integer Multiplication issued on ALU1.
+defvar Zn3Multiplier = Zn3ALU1;
+
+// Execution pipeline grouping
+//===----------------------------------------------------------------------===//
+
+// General ALU operations
+def Zn3ALU0123 : ProcResGroup<[Zn3ALU0, Zn3ALU1, Zn3ALU2, Zn3ALU3]>;
+
+// General AGU operations
+def Zn3AGU012 : ProcResGroup<[Zn3AGU0, Zn3AGU1, Zn3AGU2]>;
+
+// Control flow: jumps, calls
+def Zn3BRU01 : ProcResGroup<[Zn3BRU0, Zn3BRU1]>;
+
+// Everything that isn't control flow but still needs to access the CC register,
+// namely conditional moves and SETcc.
+def Zn3ALU03 : ProcResGroup<[Zn3ALU0, Zn3ALU3]>;
+
+// Zn3ALU1 handles complex bit twiddling: CRC/PDEP/PEXT
+
+// Simple bit twiddling: bit test, shift/rotate, bit extraction
+def Zn3ALU12 : ProcResGroup<[Zn3ALU1, Zn3ALU2]>;
+
+
+//
+// Scheduling
+//===----------------------------------------------------------------------===//
+
+// AMD SOG 19h, 2.10.3 Retire Control Unit
+// The integer physical register file (PRF) consists of 192 registers.
+def Zn3IntegerPRF : RegisterFile<192, [GR64, CCR], [1, 1], [1, 0],
+ 6, // Max moves that can be eliminated per cycle.
+ 0>; // Restrict move elimination to zero regs.
+
+// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
+// AMD SOG 19h, 2.10.1 Schedulers
+// The schedulers can receive up to six macro ops per cycle, with a limit of
+// two per scheduler. Each scheduler can issue one micro op per cycle into
+// each of its associated pipelines
+// FIXME: these are 4 separate schedulers, not a single big one.
+def Zn3Int : ProcResGroup<[Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
+ Zn3ALU1, Zn3AGU1, // scheduler 1
+ Zn3ALU2, Zn3AGU2, // scheduler 2
+ Zn3ALU3, Zn3BRU1 // scheduler 3
+ ]> {
+ let BufferSize = !mul(4, 24);
+}
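+// In total, the four integer schedulers provide 4 x 24 = 96 scheduler entries.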
+
+
+//===----------------------------------------------------------------------===//
+// Floating-Point Unit
+//
+
+// AMD SOG 19h, 2.4 Superscalar Organization
+// The processor uses <...> two decoupled independent floating point schedulers
+// each servicing two FP pipelines and one store or FP-to-integer pipeline.
+
+//
+// Execution pipes
+//===----------------------------------------------------------------------===//
+
+// AMD SOG 19h, 2.10.1 Schedulers
+// <...>, and six FPU pipes.
+// Agner, 22.10 Floating point execution pipes
+// There are six floating point/vector execution pipes,
+def Zn3FPP0 : ProcResource<1>;
+def Zn3FPP1 : ProcResource<1>;
+def Zn3FPP2 : ProcResource<1>;
+def Zn3FPP3 : ProcResource<1>;
+def Zn3FPP45 : ProcResource<2>;
+
+//
+// Execution Units
+//===----------------------------------------------------------------------===//
+// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
+
+// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
+defvar Zn3FPFMul0 = Zn3FPP0;
+defvar Zn3FPFMul1 = Zn3FPP1;
+
+// (v)FADD*
+defvar Zn3FPFAdd0 = Zn3FPP2;
+defvar Zn3FPFAdd1 = Zn3FPP3;
+
+// All convert operations except pack/unpack
+defvar Zn3FPFCvt0 = Zn3FPP2;
+defvar Zn3FPFCvt1 = Zn3FPP3;
+
+// All Divide and Square Root except Reciprocal Approximation
+// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
+// FDIV unit can support 2 simultaneous operations in flight
+// even though it occupies a single pipe.
+// FIXME: BufferSize=2 ?
+defvar Zn3FPFDiv = Zn3FPP1;
+
+// Moves and Logical operations on Floating Point Data Types
+defvar Zn3FPFMisc0 = Zn3FPP0;
+defvar Zn3FPFMisc1 = Zn3FPP1;
+defvar Zn3FPFMisc2 = Zn3FPP2;
+defvar Zn3FPFMisc3 = Zn3FPP3;
+
+// Integer Adds, Subtracts, and Compares
+// Some complex VADD operations are not available in all pipes.
+defvar Zn3FPVAdd0 = Zn3FPP0;
+defvar Zn3FPVAdd1 = Zn3FPP1;
+defvar Zn3FPVAdd2 = Zn3FPP2;
+defvar Zn3FPVAdd3 = Zn3FPP3;
+
+// Integer Multiplies, SAD, Blendvb
+defvar Zn3FPVMul0 = Zn3FPP0;
+defvar Zn3FPVMul1 = Zn3FPP3;
+
+// Data Shuffles, Packs, Unpacks, Permute
+// Some complex shuffle operations are only available in pipe1.
+defvar Zn3FPVShuf = Zn3FPP1;
+defvar Zn3FPVShufAux = Zn3FPP2;
+
+// Bit Shift Left/Right operations
+defvar Zn3FPVShift0 = Zn3FPP1;
+defvar Zn3FPVShift1 = Zn3FPP2;
+
+// Moves and Logical operations on Packed Integer Data Types
+defvar Zn3FPVMisc0 = Zn3FPP0;
+defvar Zn3FPVMisc1 = Zn3FPP1;
+defvar Zn3FPVMisc2 = Zn3FPP2;
+defvar Zn3FPVMisc3 = Zn3FPP3;
+
+// *AES*
+defvar Zn3FPAES0 = Zn3FPP0;
+defvar Zn3FPAES1 = Zn3FPP1;
+
+// *CLM*
+defvar Zn3FPCLM0 = Zn3FPP0;
+defvar Zn3FPCLM1 = Zn3FPP1;
+
+// Execution pipeline grouping
+//===----------------------------------------------------------------------===//
+
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// Stores and floating point to general purpose register transfer
+// have 2 dedicated pipelines (pipe 5 and 6).
+def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>;
+
+// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
+def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;
+
+// (v)FADD*
+// Some complex VADD operations are not available in all pipes.
+def Zn3FPFAdd01 : ProcResGroup<[Zn3FPFAdd0, Zn3FPFAdd1]>;
+
+// All convert operations except pack/unpack
+def Zn3FPFCvt01 : ProcResGroup<[Zn3FPFCvt0, Zn3FPFCvt1]>;
+
+// All Divide and Square Root except Reciprocal Approximation
+// def Zn3FPFDiv : ProcResGroup<[Zn3FPFDiv]>;
+
+// Moves and Logical operations on Floating Point Data Types
+def Zn3FPFMisc0123 : ProcResGroup<[Zn3FPFMisc0, Zn3FPFMisc1, Zn3FPFMisc2, Zn3FPFMisc3]>;
+
+def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;
+
+// Loads, Stores and Move to General Register (EX) Operations
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// Stores and floating point to general purpose register transfer
+// have 2 dedicated pipelines (pipe 5 and 6).
+defvar Zn3FPLd01 = Zn3FPP45;
+
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// Note that FP stores are supported on two pipelines,
+// but throughput is limited to one per cycle.
+let Super = Zn3FPP45 in
+def Zn3FPSt : ProcResource<1>;
+
+// Integer Adds, Subtracts, and Compares
+// Some complex VADD operations are not available in all pipes.
+def Zn3FPVAdd0123 : ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1, Zn3FPVAdd2, Zn3FPVAdd3]>;
+
+def Zn3FPVAdd01: ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1]>;
+def Zn3FPVAdd12: ProcResGroup<[Zn3FPVAdd1, Zn3FPVAdd2]>;
+
+// Integer Multiplies, SAD, Blendvb
+def Zn3FPVMul01 : ProcResGroup<[Zn3FPVMul0, Zn3FPVMul1]>;
+
+// Data Shuffles, Packs, Unpacks, Permute
+// Some complex shuffle operations are only available in pipe1.
+def Zn3FPVShuf01 : ProcResGroup<[Zn3FPVShuf, Zn3FPVShufAux]>;
+
+// Bit Shift Left/Right operations
+def Zn3FPVShift01 : ProcResGroup<[Zn3FPVShift0, Zn3FPVShift1]>;
+
+// Moves and Logical operations on Packed Integer Data Types
+def Zn3FPVMisc0123 : ProcResGroup<[Zn3FPVMisc0, Zn3FPVMisc1, Zn3FPVMisc2, Zn3FPVMisc3]>;
+
+// *AES*
+def Zn3FPAES01 : ProcResGroup<[Zn3FPAES0, Zn3FPAES1]>;
+
+// *CLM*
+def Zn3FPCLM01 : ProcResGroup<[Zn3FPCLM0, Zn3FPCLM1]>;
+
+
+//
+// Scheduling
+//===----------------------------------------------------------------------===//
+
+// Agner, 21.8 Register renaming and out-of-order schedulers
+// The floating point register file has 160 vector registers
+// of 128 bits each in Zen 1 and 256 bits each in Zen 2.
+// anandtech also confirms this.
+def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
+ 6, // Max moves that can be eliminated per cycle.
+ 0>; // Restrict move elimination to zero regs.
+
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// The floating-point scheduler has a 2*32 entry macro op capacity.
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// <...> the scheduler can issue 1 micro op per cycle for each pipe.
+// FIXME: those are two separate schedulers, not a single big one.
+def Zn3FP : ProcResGroup<[Zn3FPP0, Zn3FPP2, /*Zn3FPP4,*/ // scheduler 0
+ Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/ // scheduler 1
+ ]> {
+ let BufferSize = !mul(2, 32);
+}
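+// In total, the two FP schedulers provide 2 x 32 = 64 scheduler entries.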
+
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// Macro ops can be dispatched to the 64-entry Non-Scheduling Queue (NSQ)
+// even if the floating-point scheduler is full.
+// FIXME: how to model this properly?
+
+
+//===----------------------------------------------------------------------===//
+// Load-Store Unit
+//
+
+// AMD SOG 19h, 2.12 Load-Store Unit
+// The LS unit contains three largely independent pipelines
+// enabling the execution of three 256-bit memory operations per cycle.
+def Zn3LSU : ProcResource<3>;
+
+// AMD SOG 19h, 2.12 Load-Store Unit
+// All three memory operations can be loads.
+let Super = Zn3LSU in
+def Zn3Load : ProcResource<3> {
+ // AMD SOG 19h, 2.12 Load-Store Unit
+ // The LS unit can process up to 72 out-of-order loads.
+ let BufferSize = 72;
+}
+
+def Zn3LoadQueue : LoadQueue<Zn3Load>;
+
+// AMD SOG 19h, 2.12 Load-Store Unit
+// A maximum of two of the memory operations can be stores.
+let Super = Zn3LSU in
+def Zn3Store : ProcResource<2> {
+ // AMD SOG 19h, 2.12 Load-Store Unit
+ // The LS unit utilizes a 64-entry store queue (STQ).
+ let BufferSize = 64;
+}
+
+def Zn3StoreQueue : StoreQueue<Zn3Store>;
+
+//===----------------------------------------------------------------------===//
+// Basic helper classes.
+//===----------------------------------------------------------------------===//
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+
+multiclass __zn3WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
+ int Lat = 1, list<int> Res = [], int UOps = 1> {
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+}
+
+multiclass __zn3WriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat,
+ list<int> Res, int UOps, int LoadLat, int LoadUOps,
+ ProcResourceKind AGU, int LoadRes> {
+ defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+
+ defm : __zn3WriteRes<SchedRW.Folded,
+ !listconcat([AGU, Zn3Load], ExePorts),
+ !add(Lat, LoadLat),
+ !if(!and(!empty(Res), !eq(LoadRes, 1)),
+ [],
+ !listconcat([1, LoadRes],
+ !if(!empty(Res),
+ !listsplat(1, !size(ExePorts)),
+ Res))),
+ !add(UOps, LoadUOps)>;
+}
+
+// For classes without folded loads.
+multiclass Zn3WriteResInt<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1> {
+ defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+multiclass Zn3WriteResXMM<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1> {
+ defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+multiclass Zn3WriteResYMM<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1> {
+ defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+// For classes with folded loads.
+multiclass Zn3WriteResIntPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0, int LoadRes = 1> {
+ defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ Znver3Model.LoadLatency,
+ LoadUOps, Zn3AGU012, LoadRes>;
+}
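+
+// As an illustration (not an additional definition): a use such as
+//   Zn3WriteResIntPair<WriteALU, [Zn3ALU0123], 1, [1], 1>
+// expands into two WriteRes records:
+//   WriteALU   on [Zn3ALU0123]                     (Latency = 1, ResourceCycles = [1])
+//   WriteALULd on [Zn3AGU012, Zn3Load, Zn3ALU0123] (Latency = 1 + LoadLatency = 5, ResourceCycles = [1, 1, 1])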
+
+multiclass Zn3WriteResXMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0, int LoadRes = 1> {
+ defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ Znver3Model.VecLoadLatency,
+ LoadUOps, Zn3FPLd01, LoadRes>;
+}
+
+multiclass Zn3WriteResYMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0, int LoadRes = 1> {
+ defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ Znver3Model.VecLoadLatency,
+ LoadUOps, Zn3FPLd01, LoadRes>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Here be dragons.
+//===----------------------------------------------------------------------===//
+
+def : ReadAdvance<ReadAfterLd, Znver3Model.LoadLatency>;
+
+def : ReadAdvance<ReadAfterVecLd, Znver3Model.VecLoadLatency>;
+def : ReadAdvance<ReadAfterVecXLd, Znver3Model.VecLoadLatency>;
+def : ReadAdvance<ReadAfterVecYLd, Znver3Model.VecLoadLatency>;
+
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// There is 1 cycle of added latency for a result to cross
+// from F to I or I to F domain.
+def : ReadAdvance<ReadInt2Fpu, -1>;
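+// The negative ReadAdvance delays the read of the integer operand by one cycle,
+// adding that domain-crossing cycle to the producer's effective latency.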
+
+// Instructions with both a load and a store folded are modeled as a folded
+// load + WriteRMW.
+defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 1], 0>;
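+// For example, a read-modify-write ALU instruction such as `addl %ecx, (%rdx)`
+// is scheduled as its folded-load write (WriteALULd) followed by this WriteRMW
+// write for the store-back (an illustration; the pairing itself comes from the
+// generic X86 scheduling definitions).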
+
+// Loads, stores, and moves, not folded with other operations.
+defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load], !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;
+
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// It does not cost anything by itself; it only has latency, matching that of WriteLoad.
+defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver3Model.LoadLatency, 1), [], 0>;
+
+def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
+ let Latency = !add(Znver3Model.LoadLatency, 1);
+ let ResourceCycles = [3, 1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;
+
+defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
+defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
+defm : Zn3WriteResInt<WriteMove, [Zn3ALU0123], 1, [4], 1>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+def Zn3WriteMOVBE16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
+ let Latency = Znver3Model.LoadLatency;
+ let ResourceCycles = [1, 1, 4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteMOVBE16rm], (instrs MOVBE16rm)>;
+
+def Zn3WriteMOVBEmr : SchedWriteRes<[Zn3ALU0123, Zn3AGU012, Zn3Store]> {
+ let Latency = Znver3Model.StoreLatency;
+ let ResourceCycles = [4, 1, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
+
+// Arithmetic.
+defm : Zn3WriteResIntPair<WriteALU, [Zn3ALU0123], 1, [1], 1>; // Simple integer ALU op.
+
+def Zn3WriteALUSlow : SchedWriteRes<[Zn3ALU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
+ AND8i8, AND16i16, AND32i32, AND64i32,
+ OR8i8, OR16i16, OR32i32, OR64i32,
+ SUB8i8, SUB16i16, SUB32i32, SUB64i32,
+ XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
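+// These are the forms with the accumulator register (AL/AX/EAX/RAX) as an
+// implicit operand; they keep the 1-cycle latency but are modeled with lower
+// throughput than the plain ALU forms (ResourceCycles [4] instead of [1]).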
+
+def Zn3WriteMoveExtend : SchedWriteRes<[Zn3ALU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;
+
+def Zn3WriteMaterialize32bitImm: SchedWriteRes<[Zn3ALU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;
+
+def Zn3WritePDEP_PEXT : SchedWriteRes<[Zn3ALU1]> {
+ let Latency = 3;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
+ PEXT32rr, PEXT64rr)>;
+
+defm : Zn3WriteResIntPair<WriteADC, [Zn3ALU0123], 1, [4], 1>; // Integer ALU + flags op.
+
+def Zn3WriteADC8mr_SBB8mr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123, Zn3Store]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 1, 7, 1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
+
+// This is for simple LEAs with one or two input operands.
+defm : Zn3WriteResInt<WriteLEA, [Zn3AGU012], 1, [1], 1>; // LEA instructions can't fold loads.
+
+// This write is used for slow LEA instructions.
+def Zn3Write3OpsLEA : SchedWriteRes<[Zn3ALU0123]> {
+ let Latency = 2;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+
+// On Znver3, a slow LEA is either a 3Ops LEA (base, index, offset),
+// or an LEA with a `Scale` value other than 1.
+def Zn3SlowLEAPredicate : MCSchedPredicate<
+ CheckAny<[
+ // A 3-operand LEA (base, index, offset).
+ IsThreeOperandsLEAFn,
+ // An LEA with a "Scale" different than 1.
+ CheckAll<[
+ CheckIsImmOperand<2>,
+ CheckNot<CheckImmOperand<2, 1>>
+ ]>
+ ]>
+>;
+
+def Zn3WriteLEA : SchedWriteVariant<[
+ SchedVar<Zn3SlowLEAPredicate, [Zn3Write3OpsLEA]>,
+ SchedVar<NoSchedPred, [WriteLEA]>
+]>;
+
+def : InstRW<[Zn3WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
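+// For example, `leaq 8(%rax,%rbx,4), %rcx` (three operands and a scale of 4)
+// takes the slow Zn3Write3OpsLEA variant, while `leaq (%rax), %rcx` keeps the
+// default WriteLEA.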
+
+def Zn3SlowLEA16r : SchedWriteRes<[Zn3ALU0123]> {
+ let Latency = 2; // FIXME: not from llvm-exegesis
+ let ResourceCycles = [4];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[Zn3SlowLEA16r], (instrs LEA16r)>;
+
+// Integer multiplication
+defm : Zn3WriteResIntPair<WriteIMul8, [Zn3Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
+defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
+defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
+defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
+defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
+
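+// MULX defines two result registers, so the InstRW entries below list a second
+// write (WriteIMulH) to cover the additional result operand.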
+def Zn3MULX32rr : SchedWriteRes<[Zn3Multiplier]> {
+ let Latency = 4;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3MULX32rr, WriteIMulH], (instrs MULX32rr)>;
+
+def Zn3MULX32rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3MULX32rr.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = Zn3MULX32rr.NumMicroOps;
+}
+def : InstRW<[Zn3MULX32rm, WriteIMulH], (instrs MULX32rm)>;
+
+defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
+defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
+defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
+
+def Zn3MULX64rr : SchedWriteRes<[Zn3Multiplier]> {
+ let Latency = 4;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3MULX64rr, WriteIMulH], (instrs MULX64rr)>;
+
+def Zn3MULX64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3MULX64rr.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = Zn3MULX64rr.NumMicroOps;
+}
+def : InstRW<[Zn3MULX64rm, WriteIMulH], (instrs MULX64rm)>;
+
+defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
+defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
+defm : Zn3WriteResInt<WriteIMulH, [], 4, [], 0>; // Integer multiplication, high part.
+
+defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
+defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
+
+defm : Zn3WriteResIntPair<WriteCMPXCHG, [Zn3ALU0123], 3, [12], 5>; // Compare and set, compare and swap.
+
+def Zn3WriteCMPXCHG8rr : SchedWriteRes<[Zn3ALU0123]> {
+ let Latency = 3;
+ let ResourceCycles = [12];
+ let NumMicroOps = 3;
+}
+def : InstRW<[Zn3WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
+
+defm : Zn3WriteResInt<WriteCMPXCHGRMW, [Zn3ALU0123], 3, [12], 6>; // Compare and set, compare and swap.
+
+def Zn3WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteCMPXCHG8rr.Latency);
+ let ResourceCycles = [1, 1, 12];
+ let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
+}
+def : InstRW<[Zn3WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
+
+def Zn3WriteCMPXCHG8B : SchedWriteRes<[Zn3ALU0123]> {
+ let Latency = 3; // FIXME: not from llvm-exegesis
+ let ResourceCycles = [24];
+ let NumMicroOps = 19;
+}
+def : InstRW<[Zn3WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
+
+def Zn3WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn3ALU0123]> {
+ let Latency = 4; // FIXME: not from llvm-exegesis
+ let ResourceCycles = [59];
+ let NumMicroOps = 28;
+}
+def : InstRW<[Zn3WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
+
+def Zn3WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn3ALU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;
+
+def Zn3WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
+ let Latency = !add(Znver3Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = 5;
+}
+def : InstRW<[Zn3WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
+
+def Zn3WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
+ let Latency = !add(Znver3Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
+
+// Integer division.
+// FIXME: uops for 8-bit division measure as 2; for the others it's a guess.
+// FIXME: latency for 8-bit division measures as 10; for the others it's a guess.
+defm : Zn3WriteResIntPair<WriteDiv8, [Zn3Divider], 10, [10], 2>;
+defm : Zn3WriteResIntPair<WriteDiv16, [Zn3Divider], 11, [11], 2>;
+defm : Zn3WriteResIntPair<WriteDiv32, [Zn3Divider], 13, [13], 2>;
+defm : Zn3WriteResIntPair<WriteDiv64, [Zn3Divider], 17, [17], 2>;
+defm : Zn3WriteResIntPair<WriteIDiv8, [Zn3Divider], 10, [10], 2>;
+defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider], 11, [11], 2>;
+defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider], 13, [13], 2>;
+defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider], 17, [17], 2>;
+
+defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1], 3, [3], 6, /*LoadUOps=*/2>; // Bit scan forward.
+defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1], 4, [4], 6, /*LoadUOps=*/2>; // Bit scan reverse.
+
+defm : Zn3WriteResIntPair<WritePOPCNT, [Zn3ALU0123], 1, [1], 1>; // Bit population count.
+
+def Zn3WritePOPCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WritePOPCNT16rr], (instrs POPCNT16rr)>;
+
+defm : Zn3WriteResIntPair<WriteLZCNT, [Zn3ALU0123], 1, [1], 1>; // Leading zero count.
+
+def Zn3WriteLZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteLZCNT16rr], (instrs LZCNT16rr)>;
+
+defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12], 2, [1], 2>; // Trailing zero count.
+
+def Zn3WriteTZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
+ let Latency = 2;
+ let ResourceCycles = [4];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteTZCNT16rr], (instrs TZCNT16rr)>;
+
+defm : Zn3WriteResIntPair<WriteCMOV, [Zn3ALU03], 1, [1], 1>; // Conditional move.
+defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
+defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>; // Set register based on condition code.
+defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
+defm : Zn3WriteResInt<WriteLAHFSAHF, [Zn3ALU3], 1, [1], 1>; // Load/Store flags in AH.
+
+defm : Zn3WriteResInt<WriteBitTest, [Zn3ALU12], 1, [1], 1>; // Bit Test
+defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
+defm : Zn3WriteResInt<WriteBitTestRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 7>;
+
+defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>; // Bit Test + Set
+defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
+defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;
+
+// Integer shifts and rotates.
+defm : Zn3WriteResIntPair<WriteShift, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
+defm : Zn3WriteResIntPair<WriteShiftCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
+defm : Zn3WriteResIntPair<WriteRotate, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
+
+def Zn3WriteRotateR1 : SchedWriteRes<[Zn3ALU12]> {
+ let Latency = 1;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
+ RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
+
+def Zn3WriteRotateM1 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateR1.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = !add(Zn3WriteRotateR1.NumMicroOps, 1);
+}
+def : InstRW<[Zn3WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
+ RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
+
+def Zn3WriteRotateRightRI : SchedWriteRes<[Zn3ALU12]> {
+ let Latency = 3;
+ let ResourceCycles = [6];
+ let NumMicroOps = 7;
+}
+def : InstRW<[Zn3WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
+
+def Zn3WriteRotateRightMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRI.Latency);
+ let ResourceCycles = [1, 1, 8];
+ let NumMicroOps = !add(Zn3WriteRotateRightRI.NumMicroOps, 3);
+}
+def : InstRW<[Zn3WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
+
+def Zn3WriteRotateLeftRI : SchedWriteRes<[Zn3ALU12]> {
+ let Latency = 4;
+ let ResourceCycles = [8];
+ let NumMicroOps = 9;
+}
+def : InstRW<[Zn3WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
+
+def Zn3WriteRotateLeftMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRI.Latency);
+ let ResourceCycles = [1, 1, 8];
+ let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2);
+}
+def : InstRW<[Zn3WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
+
+defm : Zn3WriteResIntPair<WriteRotateCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
+
+def Zn3WriteRotateRightRCL : SchedWriteRes<[Zn3ALU12]> {
+ let Latency = 3;
+ let ResourceCycles = [6];
+ let NumMicroOps = 7;
+}
+def : InstRW<[Zn3WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
+
+def Zn3WriteRotateRightMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRCL.Latency);
+ let ResourceCycles = [1, 1, 8];
+ let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2);
+}
+def : InstRW<[Zn3WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
+
+def Zn3WriteRotateLeftRCL : SchedWriteRes<[Zn3ALU12]> {
+ let Latency = 4;
+ let ResourceCycles = [8];
+ let NumMicroOps = 9;
+}
+def : InstRW<[Zn3WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
+
+def Zn3WriteRotateLeftMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRCL.Latency);
+ let ResourceCycles = [1, 1, 8];
+ let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2);
+}
+def : InstRW<[Zn3WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
+
+// Double shift instructions.
+defm : Zn3WriteResInt<WriteSHDrri, [Zn3ALU12], 2, [3], 4>;
+defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
+defm : Zn3WriteResInt<WriteSHDmri, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
+defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+defm : Zn3WriteResIntPair<WriteBEXTR, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
+defm : Zn3WriteResIntPair<WriteBLS, [Zn3ALU0123], 2, [2], 2, /*LoadUOps=*/1>;
+defm : Zn3WriteResIntPair<WriteBZHI, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
+
+// Floating point. This covers both scalar and vector operations.
+defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
+defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn3WriteResXMM<WriteFMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn3WriteResYMM<WriteFMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn3WriteResXMM<WriteFStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
+
+def Zn3WriteWriteFStoreMMX : SchedWriteRes<[Zn3FPSt, Zn3Store]> {
+ let Latency = 2; // FIXME: not from llvm-exegesis
+ let ResourceCycles = [1, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr,
+ VMOVHPDmr, VMOVHPSmr)>;
+
+defm : Zn3WriteResXMM<WriteFStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
+defm : Zn3WriteResYMM<WriteFStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
+defm : Zn3WriteResXMM<WriteFStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
+defm : Zn3WriteResXMM<WriteFStoreNTX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
+defm : Zn3WriteResYMM<WriteFStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
+
+defm : Zn3WriteResXMM<WriteFMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
+defm : Zn3WriteResXMM<WriteFMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
+defm : Zn3WriteResYMM<WriteFMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
+defm : Zn3WriteResYMM<WriteFMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
+
+defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub.
+
+def Zn3WriteX87Arith : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
+ let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
+ let ResourceCycles = [1, 1, 24];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
+ SUB_FI16m, SUB_FI32m,
+ SUBR_FI16m, SUBR_FI32m,
+ MUL_FI16m, MUL_FI32m)>;
+
+def Zn3WriteX87Div : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
+ let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
+ let ResourceCycles = [1, 1, 62];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
+ DIVR_FI16m, DIVR_FI32m)>;
+
+defm : Zn3WriteResXMMPair<WriteFAddX, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
+defm : Zn3WriteResYMMPair<WriteFAddY, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
+defm : X86WriteResPairUnsupported<WriteFAddZ>; // Floating point add/sub (ZMM).
+defm : Zn3WriteResXMMPair<WriteFAdd64, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
+defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
+defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>; // Floating point double add/sub (ZMM).
+defm : Zn3WriteResXMMPair<WriteFCmp, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare.
+defm : Zn3WriteResXMMPair<WriteFCmpX, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
+defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (YMM).
+defm : X86WriteResPairUnsupported<WriteFCmpZ>; // Floating point compare (ZMM).
+defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare.
+defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
+defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (YMM).
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>; // Floating point double compare (ZMM).
+defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
+defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
+defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication.
+defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
+defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
+defm : X86WriteResPairUnsupported<WriteFMulZ>; // Floating point multiplication (ZMM).
+defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication.
+defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
+defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
+defm : X86WriteResPairUnsupported<WriteFMul64Z>; // Floating point double multiplication (ZMM).
+defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>; // Floating point division.
+defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (XMM).
+defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (YMM).
+defm : X86WriteResPairUnsupported<WriteFDivZ>; // Floating point division (ZMM).
+defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division.
+defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
+defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>; // Floating point double division (ZMM).
+defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root.
+defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
+defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>; // Floating point square root (ZMM).
+defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root.
+defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
+defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; // Floating point double square root (ZMM).
+defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
+defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate.
+defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (XMM).
+defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (YMM).
+defm : X86WriteResPairUnsupported<WriteFRcpZ>; // Floating point reciprocal estimate (ZMM).
+defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate.
+defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (XMM).
+defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (YMM).
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; // Floating point reciprocal square root estimate (ZMM).
+defm : Zn3WriteResXMMPair<WriteFMA, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add.
+defm : Zn3WriteResXMMPair<WriteFMAX, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (XMM).
+defm : Zn3WriteResYMMPair<WriteFMAY, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (YMM).
+defm : X86WriteResPairUnsupported<WriteFMAZ>; // Fused Multiply Add (ZMM).
+defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
+defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
+defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
+defm : X86WriteResPairUnsupported<WriteDPPSZ>; // Floating point single dot product (ZMM).
+defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
+defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding.
+defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
+defm : X86WriteResPairUnsupported<WriteFRndZ>; // Floating point rounding (ZMM).
+defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
+defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
+defm : X86WriteResPairUnsupported<WriteFLogicZ>; // Floating point and/or/xor logicals (ZMM).
+defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
+defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
+defm : X86WriteResPairUnsupported<WriteFTestZ>; // Floating point TEST instructions (ZMM).
+defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
+defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>; // Floating point vector shuffles (ZMM).
+defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
+defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; // Floating point vector variable shuffles (ZMM).
+defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends.
+defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
+defm : X86WriteResPairUnsupported<WriteFBlendZ>; // Floating point vector blends (ZMM).
+defm : Zn3WriteResXMMPair<WriteFVarBlend, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends.
+defm : Zn3WriteResYMMPair<WriteFVarBlendY, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; // Fp vector variable blends (ZMM).
+
+// Horizontal Add/Sub (float and integer)
+defm : Zn3WriteResXMMPair<WriteFHAdd, [Zn3FPFAdd0], 6, [2], 4>;
+defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3, /*LoadUOps=*/1>;
+defm : X86WriteResPairUnsupported<WriteFHAddZ>;
+defm : Zn3WriteResXMMPair<WritePHAdd, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
+defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
+defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
+defm : X86WriteResPairUnsupported<WritePHAddZ>;
+
+// Vector integer operations.
+defm : Zn3WriteResXMM<WriteVecLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn3WriteResXMM<WriteVecLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn3WriteResYMM<WriteVecLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn3WriteResXMM<WriteVecLoadNT, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn3WriteResYMM<WriteVecLoadNTY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn3WriteResXMM<WriteVecMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn3WriteResYMM<WriteVecMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn3WriteResXMM<WriteVecStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
+defm : Zn3WriteResXMM<WriteVecStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
+
+def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
+ let Latency = 4;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;
+
+def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+ let ResourceCycles = [1, 1, 1];
+ let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
+}
+def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;
+
+def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+ let ResourceCycles = [1, 1, 1];
+ let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
+}
+def : InstRW<[Zn3WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;
+
+defm : Zn3WriteResYMM<WriteVecStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
+defm : Zn3WriteResXMM<WriteVecStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
+defm : Zn3WriteResYMM<WriteVecStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
+defm : Zn3WriteResXMM<WriteVecMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
+defm : Zn3WriteResXMM<WriteVecMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
+defm : Zn3WriteResYMM<WriteVecMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
+defm : Zn3WriteResYMM<WriteVecMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
+
+defm : Zn3WriteResXMM<WriteVecMoveToGpr, [Zn3FPLd01], 1, [2], 1>;
+defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01], 1, [2], 1>;
+
+def Zn3WriteMOVMMX : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
+
+def Zn3WriteMOVMMXSlow : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 4];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
+
+defm : Zn3WriteResXMMPair<WriteVecALU, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.
+
+def Zn3WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;
+
+def Zn3WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;
+
+defm : Zn3WriteResXMMPair<WriteVecALUX, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).
+
+def Zn3WriteVecALUXSlow : SchedWriteRes<[Zn3FPVAdd01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
+ PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
+ PAVGBrr, PAVGWrr,
+ PSIGNBrr, PSIGNDrr, PSIGNWrr,
+ VPABSBrr, VPABSDrr, VPABSWrr,
+ VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
+ VPAVGBrr, VPAVGWrr,
+ VPCMPEQQrr,
+ VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
+ PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
+
+def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteVecALUXMMX], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr,
+ MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr,
+ MMX_PADDSBirr, MMX_PADDSWirr, MMX_PADDUSBirr, MMX_PADDUSWirr,
+ MMX_PAVGBirr, MMX_PAVGWirr,
+ MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr)>;
+
+defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
+
+def Zn3WriteVecALUYSlow : SchedWriteRes<[Zn3FPVAdd01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
+ VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
+ VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
+ VPAVGBYrr, VPAVGWYrr,
+ VPCMPEQQYrr,
+ VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
+
+defm : X86WriteResPairUnsupported<WriteVecALUZ>; // Vector integer ALU op, no logicals (ZMM).
+defm : Zn3WriteResXMMPair<WriteVecLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
+defm : Zn3WriteResXMMPair<WriteVecLogicX, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
+defm : Zn3WriteResYMMPair<WriteVecLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>; // Vector integer and/or/xor logicals (ZMM).
+defm : Zn3WriteResXMMPair<WriteVecTest, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
+defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
+defm : X86WriteResPairUnsupported<WriteVecTestZ>; // Vector integer TEST instructions (ZMM).
+defm : Zn3WriteResXMMPair<WriteVecShift, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
+defm : Zn3WriteResXMMPair<WriteVecShiftX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (XMM).
+defm : Zn3WriteResYMMPair<WriteVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>; // Vector integer shifts (ZMM).
+defm : Zn3WriteResXMMPair<WriteVecShiftImm, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
+defm : Zn3WriteResXMMPair<WriteVecShiftImmX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
+defm : Zn3WriteResYMMPair<WriteVecShiftImmY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; // Vector integer immediate shifts (ZMM).
+defm : Zn3WriteResXMMPair<WriteVecIMul, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
+defm : Zn3WriteResXMMPair<WriteVecIMulX, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
+defm : Zn3WriteResYMMPair<WriteVecIMulY, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>; // Vector integer multiply (ZMM).
+defm : Zn3WriteResXMMPair<WritePMULLD, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD.
+defm : Zn3WriteResYMMPair<WritePMULLDY, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
+defm : X86WriteResPairUnsupported<WritePMULLDZ>; // Vector PMULLD (ZMM).
+defm : Zn3WriteResXMMPair<WriteShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles.
+defm : Zn3WriteResXMMPair<WriteShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
+defm : Zn3WriteResYMMPair<WriteShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
+defm : X86WriteResPairUnsupported<WriteShuffleZ>; // Vector shuffles (ZMM).
+defm : Zn3WriteResXMMPair<WriteVarShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles.
+defm : Zn3WriteResXMMPair<WriteVarShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
+defm : Zn3WriteResYMMPair<WriteVarShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; // Vector variable shuffles (ZMM).
+defm : Zn3WriteResXMMPair<WriteBlend, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends.
+defm : Zn3WriteResYMMPair<WriteBlendY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
+defm : X86WriteResPairUnsupported<WriteBlendZ>; // Vector blends (ZMM).
+defm : Zn3WriteResXMMPair<WriteVarBlend, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends.
+defm : Zn3WriteResYMMPair<WriteVarBlendY, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>; // Vector variable blends (ZMM).
+defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
+defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
+defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
+defm : X86WriteResPairUnsupported<WritePSADBWZ>; // Vector PSADBW (ZMM).
+defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
+defm : Zn3WriteResYMMPair<WriteMPSADY, [Zn3FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
+defm : X86WriteResPairUnsupported<WriteMPSADZ>; // Vector MPSAD (ZMM).
+defm : Zn3WriteResXMMPair<WritePHMINPOS, [Zn3FPVAdd01], 3, [1], 1>; // Vector PHMINPOS.
+
+// Vector insert/extract operations.
+defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
+defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>; // Extract vector element to gpr.
+defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store], !add(1, Znver3Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
+
+// MOVMSK operations.
+defm : Zn3WriteResXMM<WriteFMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
+defm : Zn3WriteResXMM<WriteVecMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
+defm : Zn3WriteResYMM<WriteVecMOVMSKY, [Zn3FPVMisc2], 1, [1], 1>;
+defm : Zn3WriteResXMM<WriteMMXMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
+
+// Conversion between integer and float.
+defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>; // Double -> Integer.
+defm : Zn3WriteResXMMPair<WriteCvtPD2I, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Integer (XMM).
+defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Integer (YMM).
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; // Double -> Integer (ZMM).
+
+def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> {
+ let Latency = 1;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIirm, MMX_CVTTPD2PIirm, MMX_CVTPD2PIirr, MMX_CVTTPD2PIirr)>;
+
+defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>; // Float -> Integer.
+
+defm : Zn3WriteResXMMPair<WriteCvtPS2I, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
+defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (YMM).
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; // Float -> Integer (ZMM).
+
+defm : Zn3WriteResXMMPair<WriteCvtI2SD, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
+defm : Zn3WriteResXMMPair<WriteCvtI2PD, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
+defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; // Integer -> Double (ZMM).
+
+def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> {
+ let Latency = 2;
+ let ResourceCycles = [6];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDirm, MMX_CVTPI2PDirr)>;
+
+defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
+defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
+defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; // Integer -> Float (ZMM).
+
+def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> {
+ let Latency = 3;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSirr)>;
+
+defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
+defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
+defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; // Float -> Double size conversion (ZMM).
+
+defm : Zn3WriteResXMMPair<WriteCvtSD2SS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
+defm : Zn3WriteResXMMPair<WriteCvtPD2PS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
+defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; // Double -> Float size conversion (ZMM).
+
+defm : Zn3WriteResXMMPair<WriteCvtPH2PS, [Zn3FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
+defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; // Half -> Float size conversion (ZMM).
+
+defm : Zn3WriteResXMM<WriteCvtPS2PH, [Zn3FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
+defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; // Float -> Half size conversion (ZMM).
+defm : Zn3WriteResXMM<WriteCvtPS2PHSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(3, Znver3Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
+defm : Zn3WriteResYMM<WriteCvtPS2PHYSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(6, Znver3Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; // Float -> Half + store size conversion (ZMM).
+
+// CRC32 instruction.
+defm : Zn3WriteResIntPair<WriteCRC32, [Zn3ALU1], 3, [1], 1>;
+
+def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
+
+def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
+}
+def : InstRW<[Zn3WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;
+
+def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
+
+def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
+}
+def : InstRW<[Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;
+
+def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
+ let Latency = 2;
+ let ResourceCycles = [3];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
+
+def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
+ let ResourceCycles = [1, 1, 3];
+ let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
+}
+def : InstRW<[Zn3Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;
+
+def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
+ let Latency = 3;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
+
+def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
+ let ResourceCycles = [1, 1, 8];
+ let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
+}
+def : InstRW<[Zn3WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;
+
+def Zn3WriteSHA1RNDS4rri : SchedWriteRes<[Zn3FPU0123]> {
+ let Latency = 6;
+ let ResourceCycles = [8];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;
+
+def Zn3WriteSHA256RNDS2rr : SchedWriteRes<[Zn3FPU0123]> {
+ let Latency = 4;
+ let ResourceCycles = [8];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm : Zn3WriteResXMMPair<WritePCmpIStrM, [Zn3FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
+// Packed Compare Explicit Length Strings, Return Mask
+defm : Zn3WriteResXMMPair<WritePCmpEStrM, [Zn3FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
+// Packed Compare Implicit Length Strings, Return Index
+defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123], 2, [8], 4>;
+// Packed Compare Explicit Length Strings, Return Index
+defm : Zn3WriteResXMMPair<WritePCmpEStrI, [Zn3FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;
+
+// AES instructions.
+defm : Zn3WriteResXMMPair<WriteAESDecEnc, [Zn3FPAES01], 4, [1], 1>; // Decryption, encryption.
+defm : Zn3WriteResXMMPair<WriteAESIMC, [Zn3FPAES01], 4, [1], 1>; // InvMixColumn.
+defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01], 4, [1], 1>; // Key Generation.
+
+// Carry-less multiplication instructions.
+defm : Zn3WriteResXMMPair<WriteCLMul, [Zn3FPCLM01], 4, [4], 4>;
+
+// EMMS/FEMMS
+defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
+
+// Load/store MXCSR
+defm : Zn3WriteResInt<WriteLDMXCSR, [Zn3AGU012, Zn3Load, Zn3ALU0123], !add(Znver3Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
+defm : Zn3WriteResInt<WriteSTMXCSR, [Zn3ALU0123, Zn3AGU012, Zn3Store], !add(1, Znver3Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
+
+// Catch-all for expensive system instructions.
+defm : Zn3WriteResInt<WriteSystem, [Zn3ALU0123], 100, [100], 100>;
+
+def Zn3WriteVZEROUPPER : SchedWriteRes<[Zn3FPU0123]> {
+ let Latency = 0; // FIXME: not from llvm-exegesis
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteVZEROUPPER], (instrs VZEROUPPER)>;
+
+def Zn3WriteVZEROALL : SchedWriteRes<[Zn3FPU0123]> {
+ let Latency = 10; // FIXME: not from llvm-exegesis
+ let ResourceCycles = [24];
+ let NumMicroOps = 18;
+}
+def : InstRW<[Zn3WriteVZEROALL], (instrs VZEROALL)>;
+
+// AVX2.
+defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
+defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
+defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf], 2, [1], 1>; // 256-bit width vector shuffles.
+
+def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
+ let Latency = 3;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>;
+
+def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
+ let ResourceCycles = [1, 1, 1];
+ let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
+}
+def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rm)>;
+
+def Zn3WriteVPERMPSYrr : SchedWriteRes<[Zn3FPVShuf]> {
+ let Latency = 7;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
+
+def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMPSYrr.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = !add(Zn3WriteVPERMPSYrr.NumMicroOps, 1);
+}
+def : InstRW<[Zn3WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
+
+def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
+ let Latency = 6;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
+
+def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
+}
+def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
+
+def Zn3WriteVPERMDYrr : SchedWriteRes<[Zn3FPVShuf]> {
+ let Latency = 5;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteVPERMDYrr], (instrs VPERMDYrr)>;
+
+def Zn3WriteVPERMYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
+ let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMDYrr.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = !add(Zn3WriteVPERMDYrr.NumMicroOps, 0);
+}
+def : InstRW<[Zn3WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
+
+defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
+defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
+defm : Zn3WriteResXMMPair<WriteVarVecShift, [Zn3FPVShift01], 1, [1], 1>; // Variable vector shifts.
+defm : Zn3WriteResYMMPair<WriteVarVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; // Variable vector shifts (ZMM).
+
+// Old microcoded instructions that nobody uses.
+defm : Zn3WriteResInt<WriteMicrocoded, [Zn3ALU0123], 100, [100], 100>;
+
+// Fence instructions.
+defm : Zn3WriteResInt<WriteFence, [Zn3ALU0123], 1, [100], 1>;
+
+def Zn3WriteLFENCE : SchedWriteRes<[Zn3LSU]> {
+ let Latency = 1;
+ let ResourceCycles = [30];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteLFENCE], (instrs LFENCE)>;
+
+def Zn3WriteSFENCE : SchedWriteRes<[Zn3LSU]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteSFENCE], (instrs SFENCE)>;
+
+// Nop, not very useful except that it provides a model for nops!
+defm : Zn3WriteResInt<WriteNop, [Zn3ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Zero Cycle Move
+///////////////////////////////////////////////////////////////////////////////
+
+def Zn3WriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+ let ResourceCycles = [];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn3WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV,
+ MOV64rr, MOV64rr_REV,
+ MOVSX32rr32)>;
+
+def Zn3WriteSwapRenameable : SchedWriteRes<[]> {
+ let Latency = 0;
+ let ResourceCycles = [];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn3WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar,
+ XCHG64rr, XCHG64ar)>;
+
+defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support.
+
+defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>; // Empty sched class
+defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>;
+defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>;
+
+defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>; // MMX
+defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
+defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
+
+def : IsOptimizableRegisterMove<[
+ InstructionEquivalenceClass<[
+ // GPR variants.
+ MOV32rr, MOV32rr_REV,
+ MOV64rr, MOV64rr_REV,
+ MOVSX32rr32,
+ XCHG32rr, XCHG32ar,
+ XCHG64rr, XCHG64ar,
+
+ // MMX variants.
+ // MMX moves are *NOT* eliminated.
+
+ // SSE variants.
+ MOVAPSrr, MOVAPSrr_REV,
+ MOVUPSrr, MOVUPSrr_REV,
+ MOVAPDrr, MOVAPDrr_REV,
+ MOVUPDrr, MOVUPDrr_REV,
+ MOVDQArr, MOVDQArr_REV,
+ MOVDQUrr, MOVDQUrr_REV,
+
+ // AVX variants.
+ VMOVAPSrr, VMOVAPSrr_REV,
+ VMOVUPSrr, VMOVUPSrr_REV,
+ VMOVAPDrr, VMOVAPDrr_REV,
+ VMOVUPDrr, VMOVUPDrr_REV,
+ VMOVDQArr, VMOVDQArr_REV,
+ VMOVDQUrr, VMOVDQUrr_REV,
+
+ // AVX YMM variants.
+ VMOVAPSYrr, VMOVAPSYrr_REV,
+ VMOVUPSYrr, VMOVUPSYrr_REV,
+ VMOVAPDYrr, VMOVAPDYrr_REV,
+ VMOVUPDYrr, VMOVUPDYrr_REV,
+ VMOVDQAYrr, VMOVDQAYrr_REV,
+ VMOVDQUYrr, VMOVDQUYrr_REV,
+ ], TruePred >
+]>;
+
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def Zn3WriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[Zn3WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV,
+ XOR64rr, XOR64rr_REV,
+ SUB32rr, SUB32rr_REV,
+ SUB64rr, SUB64rr_REV)>;
+
+def Zn3WriteZeroIdiomEFLAGS : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn3WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[Zn3WriteZeroIdiomEFLAGS], (instrs CMP8rr, CMP8rr_REV,
+ CMP16rr, CMP16rr_REV,
+ CMP32rr, CMP32rr_REV,
+ CMP64rr, CMP64rr_REV)>;
+
+def Zn3WriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+// NOTE: XORPSrr, XORPDrr are not zero-cycle!
+def : InstRW<[Zn3WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
+ VANDNPSrr, VANDNPDrr)>;
+
+def Zn3WriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[Zn3WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
+ VANDNPSYrr, VANDNPDYrr)>;
+
+def Zn3WriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+// NOTE: PXORrr,PANDNrr are not zero-cycle!
+def : InstRW<[Zn3WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;
+
+def Zn3WriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[Zn3WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;
+
+def Zn3WriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+// PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
+def : InstRW<[Zn3WriteVZeroIdiomALUX],
+ (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;
+
+def Zn3WriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[Zn3WriteVZeroIdiomALUY],
+ (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+ VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[ XOR32rr, XOR32rr_REV,
+ XOR64rr, XOR64rr_REV,
+ SUB32rr, SUB32rr_REV,
+ SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,
+
+ // SSE XMM Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr,
+ ANDNPSrr, ANDNPDrr,
+
+ // int variants.
+ PXORrr,
+ PANDNrr,
+ PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+ PSUBSBrr, PSUBSWrr,
+ PSUBUSBrr, PSUBUSWrr,
+ PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX XMM Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ VXORPSrr, VXORPDrr,
+ VANDNPSrr, VANDNPDrr,
+
+ // int variants.
+ VPXORrr,
+ VPANDNrr,
+ VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPSUBSBrr, VPSUBSWrr,
+ VPSUBUSBrr, VPSUBUSWrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+ ], ZeroIdiomPredicate>,
+
+ // AVX YMM Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ VXORPSYrr, VXORPDYrr,
+ VANDNPSYrr, VANDNPDYrr,
+
+ // int variants.
+ VPXORYrr,
+ VPANDNYrr,
+ VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+ VPSUBSBYrr, VPSUBSWYrr,
+ VPSUBUSBYrr, VPSUBUSWYrr,
+ VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
+ ], ZeroIdiomPredicate>,
+]>;
+
+def : IsDepBreakingFunction<[
+ // GPR
+ DepBreakingClass<[ SBB32rr, SBB32rr_REV,
+ SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
+ DepBreakingClass<[ CMP8rr, CMP8rr_REV,
+ CMP16rr, CMP16rr_REV,
+ CMP32rr, CMP32rr_REV,
+ CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
+
+ // MMX
+ DepBreakingClass<[
+ MMX_PCMPEQBirr, MMX_PCMPEQWirr, MMX_PCMPEQDirr
+ ], ZeroIdiomPredicate>,
+
+ // SSE
+ DepBreakingClass<[
+ PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX XMM
+ DepBreakingClass<[
+ VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX YMM
+ DepBreakingClass<[
+ VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
+ ], ZeroIdiomPredicate>,
+]>;
+
+} // SchedModel
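A note on the convention used by the folded-load ("rm"/"mr") classes above: their latency is the register-form latency plus the model's load latency, and their micro-op count is derived from the register form (see Zn3WriteVEXTRACTI128mr and Zn3WriteSHA1MSG1rm). A minimal standalone C++ sketch of that composition, using illustrative constants rather than the real Znver3Model values:

  // Sketch only: mirrors the `!add(Znver3Model.LoadLatency, <rr>.Latency)`
  // pattern used by the memory forms above. Numbers are placeholders.
  #include <cstdio>

  struct RRForm {
    int Latency;     // latency of the register-register form
    int NumMicroOps; // micro-ops of the register-register form
  };

  int main() {
    const int LoadLatency = 8;    // stand-in for Znver3Model.LoadLatency
    const RRForm ExtractRR{4, 1}; // e.g. Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr above

    int RmLatency = LoadLatency + ExtractRR.Latency; // folded-load latency
    int RmMicroOps = ExtractRR.NumMicroOps + 1;      // one extra uop for the memory op

    std::printf("rm latency=%d, uops=%d\n", RmLatency, RmMicroOps);
    return 0;
  }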
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/src/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index e76908e..a3238e6 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -41,11 +41,7 @@
const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
DAG.getSubtarget().getRegisterInfo());
- Register BaseReg = TRI->getBaseRegister();
- for (unsigned R : ClobberSet)
- if (BaseReg == R)
- return true;
- return false;
+ return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
}
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
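The hunk above folds a hand-rolled membership loop into llvm::is_contained (from llvm/ADT/STLExtras.h). A standalone sketch of the same check using only the standard library, with a hypothetical alias standing in for the LLVM register type:

  // Sketch: equivalent membership test without LLVM. `Register` here is just
  // an alias for illustration, not the real llvm::Register class.
  #include <algorithm>
  #include <vector>

  using Register = unsigned;

  static bool baseRegIsClobbered(const std::vector<Register> &ClobberSet,
                                 Register BaseReg) {
    return std::find(ClobberSet.begin(), ClobberSet.end(), BaseReg) !=
           ClobberSet.end();
  }

  int main() {
    const std::vector<Register> ClobberSet = {3, 5, 7};
    return baseRegIsClobbered(ClobberSet, 5) ? 0 : 1;
  }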
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/src/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index aa73d4b..fcaf7c8 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -1574,7 +1574,7 @@
MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc Loc = MI.getDebugLoc();
+ const DebugLoc &Loc = MI.getDebugLoc();
// Check if EFLAGS are alive by seeing if there is a def of them or they
// live-in, and then seeing if that def is in turn used.
@@ -1915,8 +1915,9 @@
auto *RC = MRI->getRegClass(Reg);
int Bytes = TRI->getRegSizeInBits(*RC) / 8;
-
unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+ assert((Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8) &&
+ "Unknown register size");
// FIXME: Need to teach this about 32-bit mode.
if (Bytes != 8) {
@@ -1959,7 +1960,7 @@
/// Returns the newly hardened register.
unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc Loc = MI.getDebugLoc();
+ const DebugLoc &Loc = MI.getDebugLoc();
auto &DefOp = MI.getOperand(0);
Register OldDefReg = DefOp.getReg();
@@ -2010,7 +2011,7 @@
/// predicate state from the stack pointer and continue to harden loads.
void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc Loc = MI.getDebugLoc();
+ const DebugLoc &Loc = MI.getDebugLoc();
auto InsertPt = MI.getIterator();
if (FenceCallAndRet)
@@ -2059,7 +2060,7 @@
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
auto InsertPt = MI.getIterator();
- DebugLoc Loc = MI.getDebugLoc();
+ const DebugLoc &Loc = MI.getDebugLoc();
if (FenceCallAndRet) {
if (MI.isReturn())
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp b/src/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp
index c95213c..4af0ac2 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -265,13 +265,13 @@
report_fatal_error("64-bit code requested on a subtarget that doesn't "
"support it!");
- // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and for all
+ // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD, NaCl, and for all
// 64-bit targets. On Solaris (32-bit), stack alignment is 4 bytes
// following the i386 psABI, while on Illumos it is always 16 bytes.
if (StackAlignOverride)
stackAlignment = *StackAlignOverride;
else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() ||
- In64BitMode)
+ isTargetNaCl() || In64BitMode)
stackAlignment = Align(16);
// Consume the vector width attribute or apply any target specific limit.
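The hunk above adds NaCl to the set of 32-bit targets that default to 16-byte stack alignment. A small standalone sketch of the selection logic; the flag struct and the 4-byte fallback are assumptions for illustration, not the real X86Subtarget interface:

  // Sketch of the stack-alignment choice: an explicit override wins, otherwise
  // 16 bytes on the listed targets and in 64-bit mode. The 4-byte fallback is
  // an illustrative assumption (i386 psABI), not taken from this patch.
  #include <optional>

  struct TargetFlags {
    bool IsDarwin, IsLinux, IsKFreeBSD, IsNaCl, In64BitMode;
  };

  unsigned pickStackAlignment(std::optional<unsigned> Override, const TargetFlags &T) {
    if (Override)
      return *Override;
    if (T.IsDarwin || T.IsLinux || T.IsKFreeBSD || T.IsNaCl || T.In64BitMode)
      return 16;
    return 4;
  }

  int main() {
    TargetFlags NaCl32{false, false, false, true, false};
    return pickStackAlignment(std::nullopt, NaCl32) == 16 ? 0 : 1;
  }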
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86Subtarget.h b/src/llvm-project/llvm/lib/Target/X86/X86Subtarget.h
index fa26223..935dbd8 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86Subtarget.h
+++ b/src/llvm-project/llvm/lib/Target/X86/X86Subtarget.h
@@ -247,9 +247,13 @@
/// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
bool HasLZCNTFalseDeps = false;
- /// True if its preferable to combine to a single shuffle using a variable
- /// mask over multiple fixed shuffles.
- bool HasFastVariableShuffle = false;
+ /// True if it's preferable to combine to a single cross-lane shuffle
+ /// using a variable mask over multiple fixed shuffles.
+ bool HasFastVariableCrossLaneShuffle = false;
+
+ /// True if it's preferable to combine to a single per-lane shuffle
+ /// using a variable mask over multiple fixed shuffles.
+ bool HasFastVariablePerLaneShuffle = false;
/// True if vzeroupper instructions should be inserted after code that uses
/// ymm or zmm registers.
@@ -433,6 +437,9 @@
/// Prefer a left/right vector logical shifts pair over a shift+and pair.
bool HasFastVectorShiftMasks = false;
+ /// Prefer a movbe over a single-use load + bswap / single-use bswap + store.
+ bool HasFastMOVBE = false;
+
/// Use a retpoline thunk rather than indirect calls to block speculative
/// execution.
bool UseRetpolineIndirectCalls = false;
@@ -603,14 +610,12 @@
/// Is this x86_64 with the ILP32 programming model (x32 ABI)?
bool isTarget64BitILP32() const {
- return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 ||
- TargetTriple.isOSNaCl());
+ return In64BitMode && (TargetTriple.isX32() || TargetTriple.isOSNaCl());
}
/// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
bool isTarget64BitLP64() const {
- return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 &&
- !TargetTriple.isOSNaCl());
+ return In64BitMode && (!TargetTriple.isX32() && !TargetTriple.isOSNaCl());
}
PICStyles::Style getPICStyle() const { return PICStyle; }
@@ -701,8 +706,11 @@
bool useLeaForSP() const { return UseLeaForSP; }
bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
- bool hasFastVariableShuffle() const {
- return HasFastVariableShuffle;
+ bool hasFastVariableCrossLaneShuffle() const {
+ return HasFastVariableCrossLaneShuffle;
+ }
+ bool hasFastVariablePerLaneShuffle() const {
+ return HasFastVariablePerLaneShuffle;
}
bool insertVZEROUPPER() const { return InsertVZEROUPPER; }
bool hasFastGather() const { return HasFastGather; }
@@ -714,6 +722,7 @@
bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; }
bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
+ bool hasFastMOVBE() const { return HasFastMOVBE; }
bool hasMacroFusion() const { return HasMacroFusion; }
bool hasBranchFusion() const { return HasBranchFusion; }
bool hasERMSB() const { return HasERMSB; }
@@ -885,6 +894,7 @@
case CallingConv::Fast:
case CallingConv::Tail:
case CallingConv::Swift:
+ case CallingConv::SwiftTail:
case CallingConv::X86_FastCall:
case CallingConv::X86_StdCall:
case CallingConv::X86_ThisCall:
@@ -941,7 +951,7 @@
return TargetSubtargetInfo::ANTIDEP_CRITICAL;
}
- bool enableAdvancedRASplitCost() const override { return true; }
+ bool enableAdvancedRASplitCost() const override { return false; }
};
} // end namespace llvm
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp b/src/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp
index c8f76c2..ee8cff3 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -62,7 +62,9 @@
RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target());
PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeX86LowerAMXIntrinsicsLegacyPassPass(PR);
initializeX86LowerAMXTypeLegacyPassPass(PR);
+ initializeX86PreAMXConfigPassPass(PR);
initializeGlobalISel(PR);
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
@@ -73,6 +75,8 @@
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
initializeX86TileConfigPass(PR);
+ initializeX86FastTileConfigPass(PR);
+ initializeX86LowerTileCopyPass(PR);
initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
@@ -106,9 +110,7 @@
Ret += DataLayout::getManglingComponent(TT);
// X86 and x32 have 32 bit pointers.
- if ((TT.isArch64Bit() &&
- (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) ||
- !TT.isArch64Bit())
+ if (!TT.isArch64Bit() || TT.isX32() || TT.isOSNaCl())
Ret += "-p:32:32";
// Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers.
@@ -259,7 +261,7 @@
StringRef Val = PreferVecWidthAttr.getValueAsString();
unsigned Width;
if (!Val.getAsInteger(0, Width)) {
- Key += "prefer-vector-width=";
+ Key += 'p';
Key += Val;
PreferVectorWidthOverride = Width;
}
@@ -272,7 +274,7 @@
StringRef Val = MinLegalVecWidthAttr.getValueAsString();
unsigned Width;
if (!Val.getAsInteger(0, Width)) {
- Key += "min-legal-vector-width=";
+ Key += 'm';
Key += Val;
RequiredVectorWidth = Width;
}
@@ -282,7 +284,6 @@
Key += CPU;
// Add tune CPU to the Key.
- Key += "tune=";
Key += TuneCPU;
// Keep track of the start of the feature portion of the string.
@@ -293,8 +294,7 @@
// function before we can generate a subtarget. We also need to use
// it as a key for the subtarget since that can be the only difference
// between two functions.
- bool SoftFloat =
- F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ bool SoftFloat = F.getFnAttribute("use-soft-float").getValueAsBool();
// If the soft float attribute is set on the function turn on the soft float
// subtarget feature.
if (SoftFloat)
@@ -314,8 +314,8 @@
resetTargetOptions(F);
I = std::make_unique<X86Subtarget>(
TargetTriple, CPU, TuneCPU, FS, *this,
- MaybeAlign(Options.StackAlignmentOverride), PreferVectorWidthOverride,
- RequiredVectorWidth);
+ MaybeAlign(F.getParent()->getOverrideStackAlignment()),
+ PreferVectorWidthOverride, RequiredVectorWidth);
}
return I.get();
}
@@ -377,6 +377,7 @@
bool addPreISel() override;
void addMachineSSAOptimization() override;
void addPreRegAlloc() override;
+ bool addPostFastRegAllocRewrite() override;
void addPostRegAlloc() override;
void addPreEmitPass() override;
void addPreEmitPass2() override;
@@ -410,8 +411,15 @@
void X86PassConfig::addIRPasses() {
addPass(createAtomicExpandPass());
+
+ // We add both passes anyway; when these two passes run, we skip the pass
+ // based on the optimization level and the relevant attribute.
+ addPass(createX86LowerAMXIntrinsicsPass());
addPass(createX86LowerAMXTypePass());
+ if (TM->getOptLevel() == CodeGenOpt::None)
+ addPass(createX86PreAMXConfigPass());
+
TargetPassConfig::addIRPasses();
if (TM->getOptLevel() != CodeGenOpt::None) {
@@ -464,7 +472,7 @@
}
bool X86PassConfig::addGlobalInstructionSelect() {
- addPass(new InstructionSelect());
+ addPass(new InstructionSelect(getOptLevel()));
return false;
}
@@ -508,6 +516,7 @@
}
void X86PassConfig::addPostRegAlloc() {
+ addPass(createX86LowerTileCopyPass());
addPass(createX86FloatingPointStackifierPass());
// When -O0 is enabled, the Load Value Injection Hardening pass will fall back
// to using the Speculative Execution Side Effect Suppression pass for
@@ -568,12 +577,21 @@
(!TT.isOSWindows() ||
MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI))
addPass(createCFIInstrInserter());
- // Identify valid longjmp targets for Windows Control Flow Guard.
- if (TT.isOSWindows())
+
+ if (TT.isOSWindows()) {
+ // Identify valid longjmp targets for Windows Control Flow Guard.
addPass(createCFGuardLongjmpPass());
+ // Identify valid eh continuation targets for Windows EHCont Guard.
+ addPass(createEHContGuardCatchretPass());
+ }
addPass(createX86LoadValueInjectionRetHardeningPass());
}
+bool X86PassConfig::addPostFastRegAllocRewrite() {
+ addPass(createX86FastTileConfigPass());
+ return true;
+}
+
bool X86PassConfig::addPreRewrite() {
addPass(createX86TileConfigPass());
return true;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/src/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 7145523..971c430 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -129,26 +129,30 @@
return 8;
}
-unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
+TypeSize
+X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
unsigned PreferVectorWidth = ST->getPreferVectorWidth();
- if (Vector) {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
+ case TargetTransformInfo::RGK_FixedWidthVector:
if (ST->hasAVX512() && PreferVectorWidth >= 512)
- return 512;
+ return TypeSize::getFixed(512);
if (ST->hasAVX() && PreferVectorWidth >= 256)
- return 256;
+ return TypeSize::getFixed(256);
if (ST->hasSSE1() && PreferVectorWidth >= 128)
- return 128;
- return 0;
+ return TypeSize::getFixed(128);
+ return TypeSize::getFixed(0);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(0);
}
- if (ST->is64Bit())
- return 64;
-
- return 32;
+ llvm_unreachable("Unsupported register kind");
}
unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
- return getRegisterBitWidth(true);
+ return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ .getFixedSize();
}
unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
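getRegisterBitWidth now answers per register kind and returns a TypeSize; the fixed-width-vector case picks the widest enabled width that does not exceed the preferred vector width. A standalone sketch of that selection (plain unsigned instead of llvm::TypeSize, illustrative feature flags):

  // Sketch of the RGK_FixedWidthVector selection above: widest of 512/256/128
  // that is both supported and allowed by PreferVectorWidth, else 0.
  struct VecFeatures {
    bool HasAVX512, HasAVX, HasSSE1;
    unsigned PreferVectorWidth;
  };

  unsigned fixedVectorRegisterBits(const VecFeatures &F) {
    if (F.HasAVX512 && F.PreferVectorWidth >= 512) return 512;
    if (F.HasAVX && F.PreferVectorWidth >= 256) return 256;
    if (F.HasSSE1 && F.PreferVectorWidth >= 128) return 128;
    return 0;
  }

  int main() {
    VecFeatures AVX512ButPrefer256{true, true, true, 256};
    return fixedVectorRegisterBits(AVX512ButPrefer256) == 256 ? 0 : 1;
  }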
@@ -169,21 +173,35 @@
return 2;
}
-int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind,
- TTI::OperandValueKind Op1Info,
- TTI::OperandValueKind Op2Info,
- TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo,
- ArrayRef<const Value *> Args,
- const Instruction *CxtI) {
+InstructionCost X86TTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
// TODO: Handle more cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
Op2Info, Opd1PropInfo,
Opd2PropInfo, Args, CxtI);
+
+ // vXi8 multiplications are always promoted to vXi16.
+ if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
+ Ty->getScalarSizeInBits() == 8) {
+ Type *WideVecTy =
+ VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
+ return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
+ TargetTransformInfo::CastContextHint::None,
+ CostKind) +
+ getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
+ TargetTransformInfo::CastContextHint::None,
+ CostKind) +
+ getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ }
+
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
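The new vXi8 multiply handling above prices the promotion pattern explicitly: zero-extend to vXi16, multiply there, truncate back, with each piece costed by the existing hooks. A minimal arithmetic sketch with placeholder per-step costs:

  // Sketch: total vXi8 mul cost = zext cost + wide (vXi16) mul cost + trunc cost.
  // The numbers below are placeholders; the real values come from
  // getCastInstrCost / getArithmeticInstrCost.
  struct StepCosts {
    int ZExtToI16;
    int WideMul;
    int TruncToI8;
  };

  int vXi8MulCost(const StepCosts &C) {
    return C.ZExtToI16 + C.WideMul + C.TruncToI8;
  }

  int main() {
    StepCosts C{1, 1, 1}; // illustrative only
    return vXi8MulCost(C) == 3 ? 0 : 1;
  }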
@@ -203,7 +221,6 @@
static const CostTblEntry SLMCostTable[] = {
{ ISD::MUL, MVT::v4i32, 11 }, // pmulld
{ ISD::MUL, MVT::v8i16, 2 }, // pmullw
- { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
{ ISD::FMUL, MVT::f64, 2 }, // mulsd
{ ISD::FMUL, MVT::v2f64, 4 }, // mulpd
{ ISD::FMUL, MVT::v4f32, 2 }, // mulps
@@ -261,10 +278,9 @@
// normally expanded to the sequence SRA + SRL + ADD + SRA.
// The OperandValue properties may not be the same as that of the previous
// operation; conservatively assume OP_None.
- int Cost =
+ InstructionCost Cost =
2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
- Op2Info,
- TargetTransformInfo::OP_None,
+ Op2Info, TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
Op2Info,
@@ -491,14 +507,22 @@
}
static const CostTblEntry AVX512BWShiftCostTable[] = {
+ { ISD::SHL, MVT::v16i8, 4 }, // extend/vpsllvw/pack sequence.
+ { ISD::SRL, MVT::v16i8, 4 }, // extend/vpsrlvw/pack sequence.
+ { ISD::SRA, MVT::v16i8, 4 }, // extend/vpsravw/pack sequence.
+ { ISD::SHL, MVT::v32i8, 4 }, // extend/vpsllvw/pack sequence.
+ { ISD::SRL, MVT::v32i8, 4 }, // extend/vpsrlvw/pack sequence.
+ { ISD::SRA, MVT::v32i8, 6 }, // extend/vpsravw/pack sequence.
+ { ISD::SHL, MVT::v64i8, 6 }, // extend/vpsllvw/pack sequence.
+ { ISD::SRL, MVT::v64i8, 7 }, // extend/vpsrlvw/pack sequence.
+ { ISD::SRA, MVT::v64i8, 15 }, // extend/vpsravw/pack sequence.
+
{ ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
{ ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v8i16, 1 }, // vpsravw
-
{ ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
{ ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v16i16, 1 }, // vpsravw
-
{ ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
{ ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v32i16, 1 }, // vpsravw
@@ -516,6 +540,12 @@
{ ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
{ ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
{ ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
+
+ { ISD::SHL, MVT::v8i32, 1 }, // pslld
+ { ISD::SRL, MVT::v8i32, 1 }, // psrld
+ { ISD::SRA, MVT::v8i32, 1 }, // psrad
+ { ISD::SHL, MVT::v4i64, 1 }, // psllq
+ { ISD::SRL, MVT::v4i64, 1 }, // psrlq
};
if (ST->hasAVX2() &&
@@ -549,9 +579,9 @@
}
static const CostTblEntry AVX512DQCostTable[] = {
- { ISD::MUL, MVT::v2i64, 1 },
- { ISD::MUL, MVT::v4i64, 1 },
- { ISD::MUL, MVT::v8i64, 1 }
+ { ISD::MUL, MVT::v2i64, 2 }, // pmullq
+ { ISD::MUL, MVT::v4i64, 2 }, // pmullq
+ { ISD::MUL, MVT::v8i64, 2 } // pmullq
};
// Look for AVX512DQ lowering tricks for custom cases.
@@ -563,10 +593,6 @@
{ ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
-
- { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
};
// Look for AVX512BW lowering tricks for custom cases.
@@ -575,10 +601,20 @@
return LT.first * Entry->Cost;
static const CostTblEntry AVX512CostTable[] = {
+ { ISD::SHL, MVT::v4i32, 1 },
+ { ISD::SRL, MVT::v4i32, 1 },
+ { ISD::SRA, MVT::v4i32, 1 },
+ { ISD::SHL, MVT::v8i32, 1 },
+ { ISD::SRL, MVT::v8i32, 1 },
+ { ISD::SRA, MVT::v8i32, 1 },
{ ISD::SHL, MVT::v16i32, 1 },
{ ISD::SRL, MVT::v16i32, 1 },
{ ISD::SRA, MVT::v16i32, 1 },
+ { ISD::SHL, MVT::v2i64, 1 },
+ { ISD::SRL, MVT::v2i64, 1 },
+ { ISD::SHL, MVT::v4i64, 1 },
+ { ISD::SRL, MVT::v4i64, 1 },
{ ISD::SHL, MVT::v8i64, 1 },
{ ISD::SRL, MVT::v8i64, 1 },
@@ -586,21 +622,28 @@
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
- { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
- { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
+ { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add
+ { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
{ ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
{ ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 4 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 4 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f64, 8 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f64, 16 }, // Skylake from http://www.agner.org/
+ { ISD::FNEG, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
{ ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
{ ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::f32, 3 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 3 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f32, 5 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
};
if (ST->hasAVX512())
@@ -608,18 +651,18 @@
return LT.first * Entry->Cost;
static const CostTblEntry AVX2ShiftCostTable[] = {
- // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
+ // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare to
// customize them to detect the cases where shift amount is a scalar one.
- { ISD::SHL, MVT::v4i32, 1 },
- { ISD::SRL, MVT::v4i32, 1 },
- { ISD::SRA, MVT::v4i32, 1 },
- { ISD::SHL, MVT::v8i32, 1 },
- { ISD::SRL, MVT::v8i32, 1 },
- { ISD::SRA, MVT::v8i32, 1 },
- { ISD::SHL, MVT::v2i64, 1 },
- { ISD::SRL, MVT::v2i64, 1 },
- { ISD::SHL, MVT::v4i64, 1 },
- { ISD::SRL, MVT::v4i64, 1 },
+ { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
+ { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
+ { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
+ { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
+ { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
+ { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
+ { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
+ { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
+ { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
+ { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
};
if (ST->hasAVX512()) {
@@ -634,8 +677,8 @@
TargetTransformInfo::OP_None);
}
- // Look for AVX2 lowering tricks.
- if (ST->hasAVX2()) {
+ // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
+ if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
(Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
@@ -733,22 +776,28 @@
}
static const CostTblEntry AVX2CostTable[] = {
- { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
- { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
- { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
- { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
+ { ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
+ { ISD::SHL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SHL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
- { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
- { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
- { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
- { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
+ { ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
+ { ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
- { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
- { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence.
- { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
- { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
- { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence.
+ { ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence.
{ ISD::SUB, MVT::v32i8, 1 }, // psubb
{ ISD::ADD, MVT::v32i8, 1 }, // paddb
@@ -759,16 +808,18 @@
{ ISD::SUB, MVT::v4i64, 1 }, // psubq
{ ISD::ADD, MVT::v4i64, 1 }, // paddq
- { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i16, 1 }, // pmullw
{ ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
- { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
+ { ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add
+ { ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
{ ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
{ ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
@@ -790,7 +841,9 @@
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
{ ISD::MUL, MVT::v16i16, 4 },
- { ISD::MUL, MVT::v8i32, 4 },
+ { ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/
+ { ISD::MUL, MVT::v4i64, 12 },
+
{ ISD::SUB, MVT::v32i8, 4 },
{ ISD::ADD, MVT::v32i8, 4 },
{ ISD::SUB, MVT::v16i16, 4 },
@@ -800,14 +853,34 @@
{ ISD::SUB, MVT::v4i64, 4 },
{ ISD::ADD, MVT::v4i64, 4 },
- // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
- // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
- // Because we believe v4i64 to be a legal type, we must also include the
- // extract+insert in the cost table. Therefore, the cost here is 18
- // instead of 8.
- { ISD::MUL, MVT::v4i64, 18 },
+ { ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld
+ { ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split
+ { ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend.
+ { ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
- { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
+ { ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split.
+ { ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
+
+ { ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split.
+ { ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split.
+
+ { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
+ { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/
+
+ { ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/
+ { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/
+ { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/
{ ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
@@ -841,6 +914,8 @@
{ ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
+
+ { ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add
};
if (ST->hasSSE42())
@@ -848,26 +923,16 @@
return LT.first * Entry->Cost;
static const CostTblEntry SSE41CostTable[] = {
- { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
- { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
- { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence.
{ ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
- { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
- { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
- { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
- { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
- { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
+ { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
- { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
- { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
- { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
+ { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.
{ ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
};
@@ -879,25 +944,21 @@
static const CostTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
- { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v16i8, 13 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v8i16, 25 }, // cmpgtw sequence.
+ { ISD::SHL, MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
{ ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
- { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v16i8, 14 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v8i16, 16 }, // cmpgtw sequence.
+ { ISD::SRL, MVT::v4i32, 12 }, // Shift each lane + blend.
{ ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
- { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
- { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
+ { ISD::SRA, MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v8i16, 16 }, // cmpgtw sequence.
+ { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v2i64, 8 }, // srl/xor/sub splat+shuffle sequence.
- { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v8i16, 1 }, // pmullw
{ ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
{ ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
@@ -907,6 +968,11 @@
{ ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
+ { ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/
+ { ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/
+ { ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/
+ { ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/
+
{ ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
{ ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
@@ -922,25 +988,42 @@
{ ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
+ { ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/
+ { ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
+
{ ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
{ ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
{ ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
{ ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
-
- { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
- { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
- { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
-
- { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
- { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
- { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
};
if (ST->hasSSE1())
if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
+ static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+ { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
+ { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
+ };
+
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+ { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
+
+ { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
+ };
+
+ if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
// It is not a good idea to vectorize division. We have to scalarize it and
// in the process we will often end up having to spill regular
// registers. The overhead of division is going to dominate most kernels
@@ -949,7 +1032,7 @@
// to hide "20 cycles" for each lane.
if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
ISD == ISD::UDIV || ISD == ISD::UREM)) {
- int ScalarCost = getArithmeticInstrCost(
+ InstructionCost ScalarCost = getArithmeticInstrCost(
Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
@@ -959,12 +1042,15 @@
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}
-int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
- int Index, VectorType *SubTp) {
+InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
+ VectorType *BaseTp,
+ ArrayRef<int> Mask, int Index,
+ VectorType *SubTp) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are widened to type v4i32.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
+ Kind = improveShuffleKindFromMask(Kind, Mask);
// Treat Transpose as 2-op shuffles - there's no difference in lowering.
if (Kind == TTI::SK_Transpose)
Kind = TTI::SK_PermuteTwoSrc;
@@ -981,7 +1067,8 @@
int NumElts = LT.second.getVectorNumElements();
if ((Index % NumElts) == 0)
return 0;
- std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
+ std::pair<InstructionCost, MVT> SubLT =
+ TLI->getTypeLegalizationCost(DL, SubTp);
if (SubLT.second.isVector()) {
int NumSubElts = SubLT.second.getVectorNumElements();
if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
@@ -1006,8 +1093,8 @@
auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
SubLT.second.getVectorNumElements());
int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
- int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
- ExtractIndex, SubTy);
+ InstructionCost ExtractCost = getShuffleCost(
+ TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);
// If the original size is 32-bits or more, we can use pshufd. Otherwise
// if we have SSSE3 we can use pshufb.
@@ -1022,6 +1109,20 @@
}
}
+ // Subvector insertions are cheap if the subvectors are aligned.
+ // Note that in general, the insertion starting at the beginning of a vector
+ // isn't free, because we need to preserve the rest of the wide vector.
+ if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
+ int NumElts = LT.second.getVectorNumElements();
+ std::pair<InstructionCost, MVT> SubLT =
+ TLI->getTypeLegalizationCost(DL, SubTp);
+ if (SubLT.second.isVector()) {
+ int NumSubElts = SubLT.second.getVectorNumElements();
+ if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
+ return SubLT.first;
+ }
+ }
+
// Handle some common (illegal) sub-vector types as they are often very cheap
// to shuffle even on targets without PSHUFB.
EVT VT = TLI->getValueType(DL, BaseTp);
@@ -1074,24 +1175,24 @@
// Number of source vectors after legalization:
unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Number of destination vectors after legalization:
- unsigned NumOfDests = LT.first;
+ InstructionCost NumOfDests = LT.first;
auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
LegalVT.getVectorNumElements());
- unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
- return NumOfShuffles *
- getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
+ InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
+ return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
+ None, 0, nullptr);
}
- return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
}
// For 2-input shuffles, we must account for splitting the 2 inputs into many.
if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
// We assume that source and destination have the same vector type.
- int NumOfDests = LT.first;
- int NumOfShufflesPerDest = LT.first * 2 - 1;
+ InstructionCost NumOfDests = LT.first;
+ InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
LT.first = NumOfDests * NumOfShufflesPerDest;
}
@@ -1150,6 +1251,8 @@
{TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
{TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
{TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
+ {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
+ {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
{TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
@@ -1392,26 +1495,29 @@
if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
- return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
}
-int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
// TODO: Allow non-throughput costs that aren't binary.
- auto AdjustCost = [&CostKind](int Cost) {
+ auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
if (CostKind != TTI::TCK_RecipThroughput)
return Cost == 0 ? 0 : 1;
return Cost;
};
+ // The cost tables include both specific, custom (non-legal) src/dst type
+ // conversions and generic, legalized types. We test for customs first, before
+ // falling back to legalization.
// FIXME: Need a better design of the cost table to handle non-simple types of
// potential massive combinations (elem_num x src_type x dst_type).
-
static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
@@ -1446,10 +1552,13 @@
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
@@ -1494,11 +1603,15 @@
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 },
- { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdb
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
+ { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
@@ -1554,33 +1667,40 @@
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
- { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
- { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
- { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
- { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
- { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
- { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
+ { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
+ { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
+ { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
{ ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
- { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 3 },
- { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
+ { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
+ { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
+ { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
@@ -1636,12 +1756,12 @@
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
- { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
{ ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
@@ -1662,6 +1782,9 @@
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
// sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
// zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
@@ -1696,14 +1819,31 @@
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
- { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
+
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
@@ -1711,20 +1851,17 @@
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
- { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
- { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
-
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 3 },
- { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
-
- { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
- { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
};
static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
@@ -1732,252 +1869,307 @@
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
- { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
+
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
+
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
+
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
};
static const TypeConversionCostTblEntry AVXConversionTbl[] = {
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
- { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
- { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
- { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
- { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
- { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
- { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 6 },
- // The generic code to compute the scalar overhead is currently broken.
- // Workaround this limitation by estimating the scalarization overhead
- // here. We have roughly 10 instructions per scalar element.
- // Multiply that by the vector width.
- // FIXME: remove that when PR19268 is fixed.
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 4 },
- { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f64, 3 },
- { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f64, 2 },
- { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 3 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
- { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f64, 3 },
- { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 },
- { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 },
- { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 },
- // This node is expanded into scalarized operations but BasicTTI is overly
- // optimistic estimating its cost. It computes 3 per element (one
- // vector-extract, one scalar conversion and one vector-insert). The
- // problem is that the inserts form a read-modify-write chain so latency
- // should be factored in too. Inflating the cost per element by 1.
- { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
- { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
+
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
{ ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
{ ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
};
static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
-
- { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
- { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
// These truncates end up widening elements.
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 1 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 1 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
- { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
- { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
+
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
- { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 },
- { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 },
+ { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
+ { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
+ { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
+ { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
- { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 },
- { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 },
- { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
+ { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
+ { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
};
static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
- // These are somewhat magic numbers justified by looking at the output of
- // Intel's IACA, running some kernels and making sure when we take
- // legalization into account the throughput will be overestimated.
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ // These are somewhat magic numbers justified by comparing the
+ // output of llvm-mca for our various supported scheduler models
+ // and basing it off the worst case scenario.
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
- { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 },
- { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 },
- { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
- { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
- { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
- { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 },
+ { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
+ { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
+ { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
+ { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
- { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
-
- { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 },
- { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
-
+ { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
- { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
- { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 },
- { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 },
- { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
- { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 },
- { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
- { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
+ { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },
- { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
- { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
- { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
- { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
- { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },
// These truncates are really widening elements.
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
@@ -1987,113 +2179,185 @@
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB
- { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
- { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
+ { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
- { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
- { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
- { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
};
- std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
- std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
-
- if (ST->hasSSE2() && !ST->hasAVX()) {
- if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
- LTDest.second, LTSrc.second))
- return AdjustCost(LTSrc.first * Entry->Cost);
- }
-
+ // Attempt to map directly to (simple) MVT types to let us match custom entries.
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
// The function getSimpleVT only handles simple value types.
- if (!SrcTy.isSimple() || !DstTy.isSimple())
- return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind));
+ if (SrcTy.isSimple() && DstTy.isSimple()) {
+ MVT SimpleSrcTy = SrcTy.getSimpleVT();
+ MVT SimpleDstTy = DstTy.getSimpleVT();
- MVT SimpleSrcTy = SrcTy.getSimpleVT();
- MVT SimpleDstTy = DstTy.getSimpleVT();
+ if (ST->useAVX512Regs()) {
+ if (ST->hasBWI())
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
- if (ST->useAVX512Regs()) {
+ if (ST->hasDQI())
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+
if (ST->hasBWI())
- if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
return AdjustCost(Entry->Cost);
if (ST->hasDQI())
- if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
return AdjustCost(Entry->Cost);
if (ST->hasAVX512())
- if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
+ if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
return AdjustCost(Entry->Cost);
+
+ if (ST->hasAVX2()) {
+ if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+
+ if (ST->hasAVX()) {
+ if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+
+ if (ST->hasSSE41()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+
+ if (ST->hasSSE2()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+ }
+
+ // Fall back to legalized types.
+ std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<InstructionCost, MVT> LTDest =
+ TLI->getTypeLegalizationCost(DL, Dst);
+
+ if (ST->useAVX512Regs()) {
+ if (ST->hasBWI())
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+
+ if (ST->hasDQI())
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
}
if (ST->hasBWI())
if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
if (ST->hasDQI())
if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
if (ST->hasAVX512())
if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
- if (ST->hasAVX2()) {
+ if (ST->hasAVX2())
if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
- }
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
- if (ST->hasAVX()) {
+ if (ST->hasAVX())
if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
- }
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
- if (ST->hasSSE41()) {
+ if (ST->hasSSE41())
if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+
+ // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
+ // sitofp.
+ if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
+ 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
+ Type *ExtSrc = Src->getWithNewBitWidth(32);
+ unsigned ExtOpc =
+ (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
+
+ // For scalar loads the extend would be free.
+ InstructionCost ExtCost = 0;
+ if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
+ ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
+
+ return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
+ TTI::CastContextHint::None, CostKind);
}
- if (ST->hasSSE2()) {
- if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
+ // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
+ // i32.
+ if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
+ 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
+ Type *TruncDst = Dst->getWithNewBitWidth(32);
+ return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
+ getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
+ TTI::CastContextHint::None, CostKind);
}
return AdjustCost(
BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
-int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
I);
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
@@ -2279,8 +2543,9 @@
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
-int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
- const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) {
+InstructionCost
+X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
// Costs should match the codegen from:
// BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
@@ -2312,6 +2577,9 @@
{ ISD::BITREVERSE, MVT::v16i32, 5 },
{ ISD::BITREVERSE, MVT::v32i16, 5 },
{ ISD::BITREVERSE, MVT::v64i8, 5 },
+ { ISD::BSWAP, MVT::v8i64, 1 },
+ { ISD::BSWAP, MVT::v16i32, 1 },
+ { ISD::BSWAP, MVT::v32i16, 1 },
{ ISD::CTLZ, MVT::v8i64, 23 },
{ ISD::CTLZ, MVT::v16i32, 22 },
{ ISD::CTLZ, MVT::v32i16, 18 },
@@ -2352,6 +2620,9 @@
{ ISD::BITREVERSE, MVT::v16i32, 24 },
{ ISD::BITREVERSE, MVT::v32i16, 10 },
{ ISD::BITREVERSE, MVT::v64i8, 10 },
+ { ISD::BSWAP, MVT::v8i64, 4 },
+ { ISD::BSWAP, MVT::v16i32, 4 },
+ { ISD::BSWAP, MVT::v32i16, 4 },
{ ISD::CTLZ, MVT::v8i64, 29 },
{ ISD::CTLZ, MVT::v16i32, 35 },
{ ISD::CTLZ, MVT::v32i16, 28 },
@@ -2670,6 +2941,7 @@
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i64, 14 },
+ { ISD::BSWAP, MVT::i64, 1 },
{ ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTPOP, MVT::i64, 10 },
@@ -2683,6 +2955,8 @@
{ ISD::BITREVERSE, MVT::i32, 14 },
{ ISD::BITREVERSE, MVT::i16, 14 },
{ ISD::BITREVERSE, MVT::i8, 11 },
+ { ISD::BSWAP, MVT::i32, 1 },
+ { ISD::BSWAP, MVT::i16, 1 }, // ROL
{ ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
@@ -2782,7 +3056,7 @@
if (ISD != ISD::DELETED_NODE) {
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
MVT MTy = LT.second;
// Attempt to lookup cost.
@@ -2802,7 +3076,8 @@
return LT.first * Cost;
}
- auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost,
+ auto adjustTableCost = [](const CostTblEntry &Entry,
+ InstructionCost LegalizationCost,
FastMathFlags FMF) {
// If there are no NANs to deal with, then these are reduced to a
// single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
@@ -2893,6 +3168,17 @@
return adjustTableCost(*Entry, LT.first, ICA.getFlags());
}
+ if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
+ if (const Instruction *II = ICA.getInst()) {
+ if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
+ return TTI::TCC_Free;
+ if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
+ if (LI->hasOneUse())
+ return TTI::TCC_Free;
+ }
+ }
+ }
+
// TODO - add BMI (TZCNT) scalar handling
if (ST->is64Bit())
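As a rough illustration of the fast-MOVBE special case in the hunk above, the rule can be restated as a small standalone predicate; the struct and function names here are made up for the sketch and are not the LLVM API.

    // A byte swap folds away when its only user is a store, or when it
    // directly consumes a single-use load (either becomes a MOVBE access).
    struct BswapContext {
      bool SoleUserIsStore;
      bool OperandIsSingleUseLoad;
    };

    static int bswapCost(bool HasFastMovbe, const BswapContext &Ctx,
                         int FallbackCost) {
      if (HasFastMovbe && (Ctx.SoleUserIsStore || Ctx.OperandIsSingleUseLoad))
        return 0; // modelled as TCC_Free above
      return FallbackCost;
    }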
@@ -2906,8 +3192,9 @@
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
-int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
@@ -2983,7 +3270,8 @@
if (ISD != ISD::DELETED_NODE) {
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ std::pair<InstructionCost, MVT> LT =
+ TLI->getTypeLegalizationCost(DL, RetTy);
MVT MTy = LT.second;
// Attempt to lookup cost.
@@ -3006,7 +3294,8 @@
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
-int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) {
static const CostTblEntry SLMCostTbl[] = {
{ ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
{ ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
@@ -3018,10 +3307,40 @@
Type *ScalarType = Val->getScalarType();
int RegisterFileMoveCost = 0;
+ // Non-immediate extraction/insertion can be handled as a sequence of
+ // aliased loads+stores via the stack.
+ if (Index == -1U && (Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::InsertElement)) {
+ // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
+ // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
+
+ // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
+ assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
+ Align VecAlign = DL.getPrefTypeAlign(Val);
+ Align SclAlign = DL.getPrefTypeAlign(ScalarType);
+
+ // Extract - store vector to stack, load scalar.
+ if (Opcode == Instruction::ExtractElement) {
+ return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0,
+ TTI::TargetCostKind::TCK_RecipThroughput) +
+ getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
+ TTI::TargetCostKind::TCK_RecipThroughput);
+ }
+ // Insert - store vector to stack, store scalar, load vector.
+ if (Opcode == Instruction::InsertElement) {
+ return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0,
+ TTI::TargetCostKind::TCK_RecipThroughput) +
+ getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
+ TTI::TargetCostKind::TCK_RecipThroughput) +
+ getMemoryOpCost(Instruction::Load, Val, VecAlign, 0,
+ TTI::TargetCostKind::TCK_RecipThroughput);
+ }
+ }
+
if (Index != -1U && (Opcode == Instruction::ExtractElement ||
Opcode == Instruction::InsertElement)) {
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
// This type is legalized to a scalar type.
if (!LT.second.isVector())
@@ -3079,13 +3398,14 @@
// subvector move(s).
// If the vector type is already less than 128-bits then don't reduce it.
// TODO: Under what circumstances should we shuffle using the full width?
- int ShuffleCost = 1;
+ InstructionCost ShuffleCost = 1;
if (Opcode == Instruction::InsertElement) {
auto *SubTy = cast<VectorType>(Val);
EVT VT = TLI->getValueType(DL, Val);
if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
SubTy = FixedVectorType::get(ScalarType, SubNumElts);
- ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
+ ShuffleCost =
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy);
}
int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
@@ -3099,15 +3419,16 @@
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
-unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
- const APInt &DemandedElts,
- bool Insert, bool Extract) {
- unsigned Cost = 0;
+InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert,
+ bool Extract) {
+ InstructionCost Cost = 0;
// For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
// cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
if (Insert) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
MVT MScalarTy = LT.second.getScalarType();
if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
@@ -3131,8 +3452,10 @@
// Case#2: inserting into 5th index needs extracti128 + vpinsrd +
// inserti128.
// Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
- unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first;
- unsigned NumElts = LT.second.getVectorNumElements() * LT.first;
+ const int CostValue = *LT.first.getValue();
+ assert(CostValue >= 0 && "Negative cost!");
+ unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue;
+ unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
unsigned Scale = NumElts / Num128Lanes;
// We iterate each 128-lane, and check if we need a
@@ -3182,10 +3505,11 @@
return Cost;
}
-int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
- MaybeAlign Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput) {
if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
@@ -3199,57 +3523,146 @@
return TTI::TCC_Basic;
}
- // Handle non-power-of-two vectors such as <3 x float>
- if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
- unsigned NumElem = VTy->getNumElements();
-
- // Handle a few common cases:
- // <3 x float>
- if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
- // Cost = 64 bit store + extract + 32 bit store.
- return 3;
-
- // <3 x double>
- if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
- // Cost = 128 bit store + unpack + 64 bit store.
- return 3;
-
- // Assume that all other non-power-of-two numbers are scalarized.
- if (!isPowerOf2_32(NumElem)) {
- APInt DemandedElts = APInt::getAllOnesValue(NumElem);
- int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
- AddressSpace, CostKind);
- int SplitCost = getScalarizationOverhead(VTy, DemandedElts,
- Opcode == Instruction::Load,
- Opcode == Instruction::Store);
- return NumElem * Cost + SplitCost;
- }
- }
-
+ assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+ "Invalid Opcode");
// Type legalization can't handle structs
- if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
CostKind);
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
- assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
- "Invalid Opcode");
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
- // Each load/store unit costs 1.
- int Cost = LT.first * 1;
+ auto *VTy = dyn_cast<FixedVectorType>(Src);
- // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
- // proxy for a double-pumped AVX memory interface such as on Sandybridge.
- if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
- Cost *= 2;
+ // Handle the simple case of non-vectors.
+ // NOTE: this assumes that legalization never creates a vector from scalars!
+ if (!VTy || !LT.second.isVector())
+ // Each load/store unit costs 1.
+ return LT.first * 1;
+
+ bool IsLoad = Opcode == Instruction::Load;
+
+ Type *EltTy = VTy->getElementType();
+
+ const int EltTyBits = DL.getTypeSizeInBits(EltTy);
+
+ InstructionCost Cost = 0;
+
+ // Source of truth: how many elements were there in the original IR vector?
+ const unsigned SrcNumElt = VTy->getNumElements();
+
+ // How far have we gotten?
+ int NumEltRemaining = SrcNumElt;
+ // Note that we intentionally capture by-reference, NumEltRemaining changes.
+ auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
+
+ const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
+
+ // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
+ const unsigned XMMBits = 128;
+ if (XMMBits % EltTyBits != 0)
+ // Vector size must be a multiple of the element size. I.e. no padding.
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ const int NumEltPerXMM = XMMBits / EltTyBits;
+
+ auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
+
+ for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
+ NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
+ // How many elements would a single op deal with at once?
+ if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
+ // Vector size must be a multiple of the element size. I.e. no padding.
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
+
+ assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
+ assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
+ (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
+ "Unless we haven't halved the op size yet, "
+ "we have less than two op's sized units of work left.");
+
+ auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
+ ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
+ : XMMVecTy;
+
+ assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
+ "After halving sizes, the vector elt count is no longer a multiple "
+ "of number of elements per operation?");
+ auto *CoalescedVecTy =
+ CurrNumEltPerOp == 1
+ ? CurrVecTy
+ : FixedVectorType::get(
+ IntegerType::get(Src->getContext(),
+ EltTyBits * CurrNumEltPerOp),
+ CurrVecTy->getNumElements() / CurrNumEltPerOp);
+ assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
+ DL.getTypeSizeInBits(CurrVecTy) &&
+ "coalescing elements doesn't change vector width.");
+
+ while (NumEltRemaining > 0) {
+ assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
+
+ // Can we use this vector size, as per the remaining element count?
+ // Iff the vector is naturally aligned, we can do a wide load regardless.
+ if (NumEltRemaining < CurrNumEltPerOp &&
+ (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
+ CurrOpSizeBytes != 1)
+ break; // Try a smaller vector size.
+
+ bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
+
+ // If we have fully processed the previous reg, we need to replenish it.
+ if (SubVecEltsLeft == 0) {
+ SubVecEltsLeft += CurrVecTy->getNumElements();
+ // And that's free only for the 0'th subvector of a legalized vector.
+ if (!Is0thSubVec)
+ Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
+ : TTI::ShuffleKind::SK_ExtractSubvector,
+ VTy, None, NumEltDone(), CurrVecTy);
+ }
+
+ // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
+ // for smaller widths (32/16/8) we have to insert/extract them separately.
+ // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
+ // but let's pretend that it is also true for 16/8 bit wide ops...)
+ if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
+ int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
+ assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
+ int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
+ APInt DemandedElts =
+ APInt::getBitsSet(CoalescedVecTy->getNumElements(),
+ CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
+ assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
+ Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
+ !IsLoad);
+ }
+
+ // This isn't exactly right. We're using slow unaligned 32-byte accesses
+ // as a proxy for a double-pumped AVX memory interface such as on
+ // Sandybridge.
+ if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
+ Cost += 2;
+ else
+ Cost += 1;
+
+ SubVecEltsLeft -= CurrNumEltPerOp;
+ NumEltRemaining -= CurrNumEltPerOp;
+ Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
+ }
+ }
+
+ assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
return Cost;
}
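A heavily simplified standalone model of the per-width loop above; it ignores the subvector insert/extract and scalarization terms, the slow-unaligned-32-byte penalty, and the aligned-wide-load exception, and only counts how many progressively narrower memory ops are needed to cover a vector that is not a whole number of legal registers. All names here are illustrative assumptions.

    // Assumes MaxLegalOpBytes is a power-of-two multiple of EltBytes.
    static int countMemOps(int NumElts, int EltBytes, int MaxLegalOpBytes) {
      int Ops = 0;
      for (int OpBytes = MaxLegalOpBytes; NumElts > 0; OpBytes /= 2) {
        int EltsPerOp = OpBytes / EltBytes;
        // Use this width while it does not overshoot the remaining elements;
        // the single-element op is always allowed to finish up the tail.
        while (NumElts > 0 && (NumElts >= EltsPerOp || EltsPerOp == 1)) {
          ++Ops;
          NumElts -= EltsPerOp;
        }
      }
      return Ops;
    }
    // countMemOps(/*<3 x i32>*/ 3, 4, /*XMM*/ 16) == 2: one 8-byte op plus one
    // 4-byte op; the real code additionally charges for moving the odd element
    // in or out of the XMM register.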
-int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
- Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
bool IsLoad = (Instruction::Load == Opcode);
bool IsStore = (Instruction::Store == Opcode);
@@ -3262,40 +3675,39 @@
auto *MaskTy =
FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
- (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) ||
- !isPowerOf2_32(NumElem)) {
+ (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
// Scalarization
APInt DemandedElts = APInt::getAllOnesValue(NumElem);
- int MaskSplitCost =
+ InstructionCost MaskSplitCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
- int ScalarCompareCost = getCmpSelInstrCost(
+ InstructionCost ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
CmpInst::BAD_ICMP_PREDICATE, CostKind);
- int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
- int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
- int ValueSplitCost =
+ InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
+ InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
+ InstructionCost ValueSplitCost =
getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
- int MemopCost =
+ InstructionCost MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
Alignment, AddressSpace, CostKind);
return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
auto VT = TLI->getValueType(DL, SrcVTy);
- int Cost = 0;
+ InstructionCost Cost = 0;
if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
LT.second.getVectorNumElements() == NumElem)
- // Promotion requires expand/truncate for data and a shuffle for mask.
- Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
- getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
+ // Promotion requires extend/truncate for data and a shuffle for mask.
+ Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) +
+ getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr);
- else if (LT.second.getVectorNumElements() > NumElem) {
+ else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
LT.second.getVectorNumElements());
// Expanding requires filling the mask with zeroes
- Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
+ Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy);
}
// Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
@@ -3306,8 +3718,9 @@
return Cost + LT.first;
}
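A standalone restatement of the scalarization branch earlier in this function (the path taken when the masked load/store is not legal for the target). The parameter names are assumptions for illustration; each parameter stands for a cost the real code queries from other TTI hooks.

    static int scalarizedMaskedMemOpCost(int NumElem, int MaskSplitCost,
                                         int BranchCost, int ScalarCmpCost,
                                         int ValueSplitCost,
                                         int ScalarMemOpCost) {
      // Per element: test the mask bit and branch, then do a scalar access.
      int MaskCmpCost = NumElem * (BranchCost + ScalarCmpCost);
      int MemopCost = NumElem * ScalarMemOpCost;
      return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
    }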
-int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
- const SCEV *Ptr) {
+InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
+ ScalarEvolution *SE,
+ const SCEV *Ptr) {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -3331,12 +3744,12 @@
return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}
-int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
- bool IsPairwise,
- TTI::TargetCostKind CostKind) {
- // Just use the default implementation for pair reductions.
- if (IsPairwise)
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
+InstructionCost
+X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind) {
+ if (TTI::requiresOrderedReduction(FMF))
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
// We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
// and use that as the cost.
@@ -3348,6 +3761,7 @@
static const CostTblEntry SSE2CostTblNoPairWise[] = {
{ ISD::FADD, MVT::v2f64, 2 },
+ { ISD::FADD, MVT::v2f32, 2 },
{ ISD::FADD, MVT::v4f32, 4 },
{ ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
{ ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
@@ -3394,13 +3808,23 @@
return Entry->Cost;
}
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
auto *ValVTy = cast<FixedVectorType>(ValTy);
- unsigned ArithmeticCost = 0;
+ // Special case: vXi8 mul reductions are performed as vXi16.
+ if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
+ auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
+ auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
+ return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
+ TargetTransformInfo::CastContextHint::None,
+ CostKind) +
+ getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
+ }
+
+ InstructionCost ArithmeticCost = 0;
if (LT.first != 1 && MTy.isVector() &&
MTy.getVectorNumElements() < ValVTy->getNumElements()) {
// Type needs to be split. We need LT.first - 1 arithmetic ops.
@@ -3470,7 +3894,7 @@
// Handle bool allof/anyof patterns.
if (ValVTy->getElementType()->isIntegerTy(1)) {
- unsigned ArithmeticCost = 0;
+ InstructionCost ArithmeticCost = 0;
if (LT.first != 1 && MTy.isVector() &&
MTy.getVectorNumElements() < ValVTy->getNumElements()) {
// Type needs to be split. We need LT.first - 1 arithmetic ops.
@@ -3493,8 +3917,7 @@
if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
return ArithmeticCost + Entry->Cost;
- return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
- CostKind);
+ return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
}
unsigned NumVecElts = ValVTy->getNumElements();
@@ -3503,10 +3926,9 @@
// Special case power of 2 reductions where the scalar type isn't changed
// by type legalization.
if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
- return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
- CostKind);
+ return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
- unsigned ReductionCost = 0;
+ InstructionCost ReductionCost = 0;
auto *Ty = ValVTy;
if (LT.first != 1 && MTy.isVector() &&
@@ -3529,7 +3951,7 @@
if (Size > 128) {
auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
ReductionCost +=
- getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
Ty = SubTy;
} else if (Size == 128) {
// Reducing from 128 bits is a permute of v2f64/v2i64.
@@ -3541,7 +3963,7 @@
ShufTy =
FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
ReductionCost +=
- getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
} else if (Size == 64) {
// Reducing from 64 bits is a shuffle of v4f32/v4i32.
FixedVectorType *ShufTy;
@@ -3552,7 +3974,7 @@
ShufTy =
FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
ReductionCost +=
- getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
} else {
// Reducing from smaller size is a shift by immediate.
auto *ShiftTy = FixedVectorType::get(
@@ -3572,8 +3994,9 @@
return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
-int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
+ bool IsUnsigned) {
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
MVT MTy = LT.second;
@@ -3691,21 +4114,19 @@
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// Otherwise fall back to cmp+select.
- return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
- CostKind) +
- getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ InstructionCost Result =
+ getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
+ CostKind) +
+ getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ return Result;
}
-int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
- bool IsPairwise, bool IsUnsigned,
- TTI::TargetCostKind CostKind) {
- // Just use the default implementation for pair reductions.
- if (IsPairwise)
- return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
- CostKind);
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+InstructionCost
+X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
+ bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
@@ -3785,7 +4206,7 @@
unsigned NumVecElts = ValVTy->getNumElements();
auto *Ty = ValVTy;
- unsigned MinMaxCost = 0;
+ InstructionCost MinMaxCost = 0;
if (LT.first != 1 && MTy.isVector() &&
MTy.getVectorNumElements() < ValVTy->getNumElements()) {
// Type needs to be split. We need LT.first - 1 operations ops.
@@ -3820,8 +4241,7 @@
// by type legalization.
if (!isPowerOf2_32(ValVTy->getNumElements()) ||
ScalarSize != MTy.getScalarSizeInBits())
- return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
- CostKind);
+ return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);
// Now handle reduction with the legal type, taking into account size changes
// at each level.
@@ -3833,7 +4253,7 @@
if (Size > 128) {
auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
MinMaxCost +=
- getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
Ty = SubTy;
} else if (Size == 128) {
// Reducing from 128 bits is a permute of v2f64/v2i64.
@@ -3844,7 +4264,7 @@
else
ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
MinMaxCost +=
- getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
} else if (Size == 64) {
// Reducing from 64 bits is a shuffle of v4f32/v4i32.
FixedVectorType *ShufTy;
@@ -3853,7 +4273,7 @@
else
ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
MinMaxCost +=
- getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
} else {
// Reducing from smaller size is a shift by immediate.
auto *ShiftTy = FixedVectorType::get(
@@ -3878,7 +4298,7 @@
/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
-int X86TTIImpl::getIntImmCost(int64_t Val) {
+InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
if (Val == 0)
return TTI::TCC_Free;
@@ -3888,8 +4308,8 @@
return 2 * TTI::TCC_Basic;
}
-int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3913,20 +4333,20 @@
// Split the constant into 64-bit chunks and calculate the cost for each
// chunk.
- int Cost = 0;
+ InstructionCost Cost = 0;
for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
int64_t Val = Tmp.getSExtValue();
Cost += getIntImmCost(Val);
}
// We need at least one instruction to materialize the constant.
- return std::max(1, Cost);
+ return std::max<InstructionCost>(1, Cost);
}
-int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind,
- Instruction *Inst) {
+InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -4013,7 +4433,7 @@
if (Idx == ImmIdx) {
int NumConstants = divideCeil(BitSize, 64);
- int Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
+ InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
@@ -4022,9 +4442,9 @@
return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -4058,12 +4478,13 @@
return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-unsigned
-X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (CostKind != TTI::TCK_RecipThroughput)
return Opcode == Instruction::PHI ? 0 : 1;
// Branches are assumed to be predicted.
- return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
+ return 0;
}
int X86TTIImpl::getGatherOverhead() const {
@@ -4088,8 +4509,9 @@
// Return an average cost of Gather / Scatter instruction, maybe improved later.
// FIXME: Add TargetCostKind support.
-int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
- Align Alignment, unsigned AddressSpace) {
+InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
+ const Value *Ptr, Align Alignment,
+ unsigned AddressSpace) {
assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
@@ -4131,9 +4553,12 @@
auto *IndexVTy = FixedVectorType::get(
IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
- std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
- std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
- int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
+ std::pair<InstructionCost, MVT> IdxsLT =
+ TLI->getTypeLegalizationCost(DL, IndexVTy);
+ std::pair<InstructionCost, MVT> SrcLT =
+ TLI->getTypeLegalizationCost(DL, SrcVTy);
+ InstructionCost::CostType SplitFactor =
+ *std::max(IdxsLT.first, SrcLT.first).getValue();
if (SplitFactor > 1) {
// Handle splitting of vector of pointers
auto *SplitSrcTy =
@@ -4161,32 +4586,32 @@
/// AddressSpace - pointer[s] address space.
///
/// FIXME: Add TargetCostKind support.
-int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
- bool VariableMask, Align Alignment,
- unsigned AddressSpace) {
+InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
+ bool VariableMask, Align Alignment,
+ unsigned AddressSpace) {
unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
APInt DemandedElts = APInt::getAllOnesValue(VF);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- int MaskUnpackCost = 0;
+ InstructionCost MaskUnpackCost = 0;
if (VariableMask) {
auto *MaskTy =
FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
MaskUnpackCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
- int ScalarCompareCost = getCmpSelInstrCost(
+ InstructionCost ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
CmpInst::BAD_ICMP_PREDICATE, CostKind);
- int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
+ InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
}
// The cost of the scalar loads/stores.
- int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- MaybeAlign(Alignment), AddressSpace,
- CostKind);
+ InstructionCost MemoryOpCost =
+ VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ MaybeAlign(Alignment), AddressSpace, CostKind);
- int InsertExtractCost = 0;
+ InstructionCost InsertExtractCost = 0;
if (Opcode == Instruction::Load)
for (unsigned i = 0; i < VF; ++i)
// Add the cost of inserting each scalar load into the vector
@@ -4202,11 +4627,10 @@
}
/// Calculate the cost of Gather / Scatter operation
-int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
- const Value *Ptr, bool VariableMask,
- Align Alignment,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr) {
+InstructionCost X86TTIImpl::getGatherScatterOpCost(
+ unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
+ Align Alignment, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr) {
if (CostKind != TTI::TCK_RecipThroughput) {
if ((Opcode == Instruction::Load &&
isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
@@ -4218,7 +4642,6 @@
}
assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
- unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
if (!PtrTy && Ptr->getType()->isVectorTy())
PtrTy = dyn_cast<PointerType>(
@@ -4226,22 +4649,10 @@
assert(PtrTy && "Unexpected type for Ptr argument");
unsigned AddressSpace = PtrTy->getAddressSpace();
- bool Scalarize = false;
if ((Opcode == Instruction::Load &&
!isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
(Opcode == Instruction::Store &&
!isLegalMaskedScatter(SrcVTy, Align(Alignment))))
- Scalarize = true;
- // Gather / Scatter for vector 2 is not profitable on KNL / SKX
- // Vector-4 of gather/scatter instruction does not exist on KNL.
- // We can extend it to 8 elements, but zeroing upper bits of
- // the mask vector will add more instructions. Right now we give the scalar
- // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
- // is better in the VariableMask case.
- if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
- Scalarize = true;
-
- if (Scalarize)
return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
AddressSpace);
@@ -4377,6 +4788,14 @@
unsigned NumElts = DataVTy->getNumElements();
if (NumElts == 1)
return false;
+ // Gather / Scatter for vector 2 is not profitable on KNL / SKX
+ // Vector-4 of gather/scatter instruction does not exist on KNL.
+ // We can extend it to 8 elements, but zeroing upper bits of
+ // the mask vector will add more instructions. Right now we give the scalar
+ // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter
+ // instruction is better in the VariableMask case.
+ if (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())))
+ return false;
}
Type *ScalarTy = DataTy->getScalarType();
if (ScalarTy->isPointerTy())
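Restated as a standalone predicate (illustrative only, not the LLVM API), the AVX-512 rule that this hunk moves into the legality check reads roughly as follows; rejecting these shapes makes the cost model fall back to the scalarized gather/scatter estimate.

    static bool gatherScatterConsideredLegal(unsigned NumElts, bool HasAVX512,
                                             bool HasVLX) {
      if (NumElts == 1)
        return false;
      // 2-element gathers/scatters are not profitable on KNL/SKX, and the
      // 4-element form does not exist on KNL (no AVX512VL).
      if (HasAVX512 && (NumElts == 2 || (NumElts == 4 && !HasVLX)))
        return false;
      return true;
    }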
@@ -4493,7 +4912,7 @@
// computing the cost using a generic formula as a function of generic
// shuffles. We therefore use a lookup table instead, filled according to
// the instruction sequences that codegen currently generates.
-int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
+InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX2(
unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
@@ -4507,8 +4926,7 @@
// TODO: Support also strided loads (interleaved-groups with gaps).
if (Indices.size() && Indices.size() != Factor)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- CostKind);
+ Alignment, AddressSpace, CostKind);
// VecTy for interleave memop is <VF*Factor x Elt>.
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -4520,86 +4938,78 @@
// (see MachineValueType.h::getVectorVT()).
if (!LegalVT.isVector())
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- CostKind);
+ Alignment, AddressSpace, CostKind);
unsigned VF = VecTy->getNumElements() / Factor;
Type *ScalarTy = VecTy->getElementType();
+ // Deduplicate entries, model floats/pointers as appropriately-sized integers.
+ if (!ScalarTy->isIntegerTy())
+ ScalarTy =
+ Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
- // Calculate the number of memory operations (NumOfMemOps), required
- // for load/store the VecTy.
- unsigned VecTySize = DL.getTypeStoreSize(VecTy);
- unsigned LegalVTSize = LegalVT.getStoreSize();
- unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
-
- // Get the cost of one memory operation.
- auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
- LegalVT.getVectorNumElements());
- unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
- MaybeAlign(Alignment), AddressSpace,
- CostKind);
+ // Get the cost of all the memory operations.
+ InstructionCost MemOpCosts = getMemoryOpCost(
+ Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
auto *VT = FixedVectorType::get(ScalarTy, VF);
EVT ETy = TLI->getValueType(DL, VT);
if (!ETy.isSimple())
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- CostKind);
+ Alignment, AddressSpace, CostKind);
// TODO: Complete for other data-types and strides.
- // Each combination of Stride, ElementTy and VF results in a different
+ // Each combination of Stride, element bit width and VF results in a different
// sequence; The cost tables are therefore accessed with:
- // Factor (stride) and VectorType=VFxElemType.
+ // Factor (stride) and VectorType=VFxiN.
// The Cost accounts only for the shuffle sequence;
// The cost of the loads/stores is accounted for separately.
//
static const CostTblEntry AVX2InterleavedLoadTbl[] = {
- { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
- { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
+ {2, MVT::v4i64, 6}, // (load 8i64 and) deinterleave into 2 x 4i64
- { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
- { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
- { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
- { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
- { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
- { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
+ {3, MVT::v2i8, 10}, // (load 6i8 and) deinterleave into 3 x 2i8
+ {3, MVT::v4i8, 4}, // (load 12i8 and) deinterleave into 3 x 4i8
+ {3, MVT::v8i8, 9}, // (load 24i8 and) deinterleave into 3 x 8i8
+ {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
+ {3, MVT::v32i8, 13}, // (load 96i8 and) deinterleave into 3 x 32i8
- { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
- { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
- { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
- { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
- { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
+ {3, MVT::v8i32, 17}, // (load 24i32 and) deinterleave into 3 x 8i32
- { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32
+ {4, MVT::v2i8, 12}, // (load 8i8 and) deinterleave into 4 x 2i8
+ {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
+ {4, MVT::v8i8, 20}, // (load 32i8 and) deinterleave into 4 x 8i8
+ {4, MVT::v16i8, 39}, // (load 64i8 and) deinterleave into 4 x 16i8
+ {4, MVT::v32i8, 80}, // (load 128i8 and) deinterleave into 4 x 32i8
+
+ {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
};
static const CostTblEntry AVX2InterleavedStoreTbl[] = {
- { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
- { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
+ {2, MVT::v4i64, 6}, // interleave 2 x 4i64 into 8i64 (and store)
- { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
- { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
- { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
- { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
- { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
+ {3, MVT::v2i8, 7}, // interleave 3 x 2i8 into 6i8 (and store)
+ {3, MVT::v4i8, 8}, // interleave 3 x 4i8 into 12i8 (and store)
+ {3, MVT::v8i8, 11}, // interleave 3 x 8i8 into 24i8 (and store)
+ {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
+ {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
- { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
- { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
- { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
- { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
- { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
+ {4, MVT::v2i8, 12}, // interleave 4 x 2i8 into 8i8 (and store)
+ {4, MVT::v4i8, 9}, // interleave 4 x 4i8 into 16i8 (and store)
+ {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
+ {4, MVT::v16i8, 10}, // interleave 4 x 16i8 into 64i8 (and store)
+ {4, MVT::v32i8, 12} // interleave 4 x 32i8 into 128i8 (and store)
};
if (Opcode == Instruction::Load) {
if (const auto *Entry =
CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
- return NumOfMemOps * MemOpCost + Entry->Cost;
+ return MemOpCosts + Entry->Cost;
} else {
assert(Opcode == Instruction::Store &&
"Expected Store Instruction at this point");
if (const auto *Entry =
CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
- return NumOfMemOps * MemOpCost + Entry->Cost;
+ return MemOpCosts + Entry->Cost;
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
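A sketch of how the AVX2 interleaved-access costing above combines its terms (names assumed for illustration): the table entry prices only the (de)interleaving shuffle sequence, while the memory traffic for the full wide vector is costed separately and added on top.

    static int interleavedAccessCost(int WideVectorMemOpCost,
                                     int ShuffleSequenceCostFromTable) {
      return WideVectorMemOpCost + ShuffleSequenceCostFromTable;
    }
    // e.g. a stride-3 deinterleaving load of 3 x v8i32 would be the cost of
    // loading the whole <24 x i32> plus the 17 units listed for
    // {3, MVT::v8i32, 17} in AVX2InterleavedLoadTbl.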
@@ -4610,7 +5020,7 @@
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduces the cost.
-int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
+InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
@@ -4634,9 +5044,8 @@
// Get the cost of one memory operation.
auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
LegalVT.getVectorNumElements());
- unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
- MaybeAlign(Alignment), AddressSpace,
- CostKind);
+ InstructionCost MemOpCost = getMemoryOpCost(
+ Opcode, SingleMemOpTy, MaybeAlign(Alignment), AddressSpace, CostKind);
unsigned VF = VecTy->getNumElements() / Factor;
MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
@@ -4665,14 +5074,14 @@
TTI::ShuffleKind ShuffleKind =
(NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
- unsigned ShuffleCost =
- getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
+ InstructionCost ShuffleCost =
+ getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr);
unsigned NumOfLoadsInInterleaveGrp =
Indices.size() ? Indices.size() : Factor;
auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
VecTy->getNumElements() / Factor);
- unsigned NumOfResults =
+ InstructionCost NumOfResults =
getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
NumOfLoadsInInterleaveGrp;
@@ -4688,12 +5097,12 @@
// The SK_MergeTwoSrc shuffle clobbers one of src operands.
// When we have more than one destination, we need additional instructions
// to keep sources.
- unsigned NumOfMoves = 0;
+ InstructionCost NumOfMoves = 0;
if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
- int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
- NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+ InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+ NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
return Cost;
}
@@ -4721,19 +5130,20 @@
// There are no strided stores yet, and a store can't be folded into a
// shuffle.
unsigned NumOfSources = Factor; // The number of values to be merged.
- unsigned ShuffleCost =
- getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
+ InstructionCost ShuffleCost =
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr);
unsigned NumOfShufflesPerStore = NumOfSources - 1;
// The SK_MergeTwoSrc shuffle clobbers one of src operands.
// We need additional instructions to keep sources.
unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
- int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
- NumOfMoves;
+ InstructionCost Cost =
+ NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+ NumOfMoves;
return Cost;
}
-int X86TTIImpl::getInterleavedMemoryOpCost(
+InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) {
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h b/src/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 17570f1..69ff658 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/src/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -55,7 +55,8 @@
X86::FeatureFastSHLDRotate,
X86::FeatureFastScalarShiftMasks,
X86::FeatureFastVectorShiftMasks,
- X86::FeatureFastVariableShuffle,
+ X86::FeatureFastVariableCrossLaneShuffle,
+ X86::FeatureFastVariablePerLaneShuffle,
X86::FeatureFastVectorFSQRT,
X86::FeatureLEAForSP,
X86::FeatureLEAUsesAG,
@@ -115,10 +116,10 @@
/// @{
unsigned getNumberOfRegisters(unsigned ClassID) const;
- unsigned getRegisterBitWidth(bool Vector) const;
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
unsigned getMaxInterleaveFactor(unsigned VF);
- int getArithmeticInstrCost(
+ InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
@@ -127,31 +128,37 @@
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
- VectorType *SubTp);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
- int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
- unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
- bool Insert, bool Extract);
- int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
- int getMaskedMemoryOpCost(
- unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
- int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
- bool VariableMask, Align Alignment,
- TTI::TargetCostKind CostKind,
- const Instruction *I);
- int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
- const SCEV *Ptr);
+ InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ ArrayRef<int> Mask, int Index,
+ VectorType *SubTp);
+ InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index);
+ InstructionCost getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract);
+ InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost
+ getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
+ InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I);
+ InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+ const SCEV *Ptr);
Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const;
@@ -167,48 +174,53 @@
unsigned getAtomicMemIntrinsicMaxElementSize() const;
- int getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind);
- int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind);
+ InstructionCost
+ getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
- int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
- bool IsPairwiseForm,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
+ InstructionCost getArithmeticReductionCost(
+ unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
- int getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned);
+ InstructionCost getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned);
- int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
- bool IsPairwiseForm, bool IsUnsigned,
- TTI::TargetCostKind CostKind);
+ InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsUnsigned,
+ TTI::TargetCostKind CostKind);
- int getInterleavedMemoryOpCost(
+ InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
- int getInterleavedMemoryOpCostAVX512(
+ InstructionCost getInterleavedMemoryOpCostAVX512(
unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
- int getInterleavedMemoryOpCostAVX2(
+ InstructionCost getInterleavedMemoryOpCostAVX2(
unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
- int getIntImmCost(int64_t);
+ InstructionCost getIntImmCost(int64_t);
- int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
+ InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
- unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind,
- Instruction *Inst = nullptr);
- int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind);
+ InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
+ InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
bool canMacroFuseCmp();
@@ -231,19 +243,13 @@
bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
- /// Allow vectorizers to form reduction intrinsics in IR. The IR is expanded
- /// into shuffles and vector math/logic by the backend
- /// (see TTI::shouldExpandReduction)
- bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
- TTI::ReductionFlags Flags) const {
- return true;
- }
-
private:
- int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
- Align Alignment, unsigned AddressSpace);
- int getGSVectorCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
- Align Alignment, unsigned AddressSpace);
+ InstructionCost getGSScalarCost(unsigned Opcode, Type *DataTy,
+ bool VariableMask, Align Alignment,
+ unsigned AddressSpace);
+ InstructionCost getGSVectorCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, Align Alignment,
+ unsigned AddressSpace);
int getGatherOverhead() const;
int getScatterOverhead() const;
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp b/src/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp
index ef010bc..8114a0b 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -23,7 +23,6 @@
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -41,28 +40,20 @@
namespace {
-class X86TileConfig : public MachineFunctionPass {
- // context
- MachineFunction *MF = nullptr;
- const X86Subtarget *ST = nullptr;
- const TargetRegisterInfo *TRI;
- const TargetInstrInfo *TII;
- MachineDominatorTree *DomTree = nullptr;
- MachineRegisterInfo *MRI = nullptr;
- VirtRegMap *VRM = nullptr;
- LiveIntervals *LIS = nullptr;
+struct X86TileConfig : public MachineFunctionPass {
- MachineInstr *getTileConfigPoint();
- void tileConfig();
-
-public:
X86TileConfig() : MachineFunctionPass(ID) {}
/// Return the pass name.
StringRef getPassName() const override { return "Tile Register Configure"; }
/// X86TileConfig analysis usage.
- void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<VirtRegMap>();
+ AU.addRequired<LiveIntervals>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
/// Perform register allocation.
bool runOnMachineFunction(MachineFunction &mf) override;
@@ -81,167 +72,124 @@
INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure",
false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure",
false, false)
-void X86TileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineDominatorTree>();
- AU.addRequired<LiveIntervals>();
- AU.addPreserved<SlotIndexes>();
- AU.addRequired<VirtRegMap>();
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
+bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetInstrInfo *TII = ST.getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+ VirtRegMap &VRM = getAnalysis<VirtRegMap>();
-static unsigned getTilePhysRegIndex(Register PhysReg) {
- assert((PhysReg >= X86::TMM0 && X86::TMM0 <= X86::TMM7) &&
- "Tile register number is invalid");
- return (PhysReg - X86::TMM0);
-}
-
-static MachineInstr *
-storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- Register SrcReg, unsigned BitSize, int FrameIdx, int Offset,
- const TargetInstrInfo *TII, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) {
-
- unsigned SubIdx = (BitSize == 8) ? X86::sub_8bit : X86::sub_16bit;
- unsigned Opc = (BitSize == 8) ? X86::MOV8mr : X86::MOV16mr;
- if (BitSize == TRI->getRegSizeInBits(*RC))
- SubIdx = 0;
- MachineInstr *NewMI =
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx,
- Offset)
- .addReg(SrcReg, 0, SubIdx);
- return NewMI;
-}
-
-static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- int64_t Imm, unsigned BitSize,
- int FrameIdx, int Offset,
- const TargetInstrInfo *TII) {
- unsigned Opc = (BitSize == 8) ? X86::MOV8mi : X86::MOV16mi;
- return addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)),
- FrameIdx, Offset)
- .addImm(Imm);
-}
-
-MachineInstr *X86TileConfig::getTileConfigPoint() {
- for (MachineBasicBlock &MBB : *MF) {
-
- // Traverse the basic block.
- for (MachineInstr &MI : MBB)
- // Refer X86PreTileConfig.cpp.
- // We only support one tile config for now.
- if (MI.getOpcode() == X86::PLDTILECFG)
- return &MI;
- }
-
- return nullptr;
-}
-
-void X86TileConfig::tileConfig() {
- MachineInstr *MI = getTileConfigPoint();
- if (!MI)
- return;
- MachineBasicBlock *MBB = MI->getParent();
- int SS = MI->getOperand(1).getIndex();
- BitVector PhysRegs(TRI->getNumRegs());
-
- // Fill in the palette first.
- auto *NewMI = storeImmToStackSlot(*MBB, *MI, 1, 8, SS, 0, TII);
- LIS->InsertMachineInstrInMaps(*NewMI);
- // Fill in the shape of each tile physical register.
- for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- Register VirtReg = Register::index2VirtReg(i);
- if (MRI->reg_nodbg_empty(VirtReg))
- continue;
- const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
- if (RC.getID() != X86::TILERegClassID)
- continue;
- Register PhysReg = VRM->getPhys(VirtReg);
- if (PhysRegs.test(PhysReg))
- continue;
- PhysRegs.set(PhysReg);
- ShapeT Shape = VRM->getShape(VirtReg);
- Register RowReg = Shape.getRow()->getReg();
- Register ColReg = Shape.getCol()->getReg();
-
- // Here is the data format for the tile config.
- // 0 palette
- // 1 start_row
- // 2-15 reserved, must be zero
- // 16-17 tile0.colsb Tile 0 bytes per row.
- // 18-19 tile1.colsb Tile 1 bytes per row.
- // 20-21 tile2.colsb Tile 2 bytes per row.
- // ... (sequence continues)
- // 30-31 tile7.colsb Tile 7 bytes per row.
- // 32-47 reserved, must be zero
- // 48 tile0.rows Tile 0 rows.
- // 49 tile1.rows Tile 1 rows.
- // 50 tile2.rows Tile 2 rows.
- // ... (sequence continues)
- // 55 tile7.rows Tile 7 rows.
- // 56-63 reserved, must be zero
- unsigned Index = getTilePhysRegIndex(PhysReg);
- int RowOffset = 48 + Index;
- int ColOffset = 16 + Index * 2;
-
- unsigned BitSize = 8;
- for (const auto &Pair : {std::make_pair(RowReg, RowOffset),
- std::make_pair(ColReg, ColOffset)}) {
- int64_t Imm;
- int ImmCount = 0;
- // All def must be the same value, otherwise it is invalid MIs.
- // Immediate is prefered.
- for (const MachineOperand &MO : MRI->def_operands(Pair.first)) {
- const auto *Inst = MO.getParent();
- if (Inst->isMoveImmediate()) {
- ImmCount++;
- Imm = Inst->getOperand(1).getImm();
- break;
- }
- }
- auto StoreConfig = [&](int Offset) {
- MachineInstr *NewMI = nullptr;
- if (ImmCount)
- NewMI = storeImmToStackSlot(*MBB, *MI, Imm, BitSize, SS, Offset, TII);
- else {
- const TargetRegisterClass *RC = MRI->getRegClass(Pair.first);
- NewMI = storeRegToStackSlot(*MBB, *MI, Pair.first, BitSize, SS,
- Offset, TII, RC, TRI);
- }
- SlotIndex SIdx = LIS->InsertMachineInstrInMaps(*NewMI);
- if (!ImmCount) {
- // Extend the live interval.
- SmallVector<SlotIndex, 8> EndPoints = {SIdx.getRegSlot()};
- LiveInterval &Int = LIS->getInterval(Pair.first);
- LIS->extendToIndices(Int, EndPoints);
- }
- };
- StoreConfig(Pair.second);
- BitSize += 8;
- }
- }
-}
-
-bool X86TileConfig::runOnMachineFunction(MachineFunction &mf) {
- MF = &mf;
- MRI = &mf.getRegInfo();
- ST = &mf.getSubtarget<X86Subtarget>();
- TRI = ST->getRegisterInfo();
- TII = mf.getSubtarget().getInstrInfo();
- DomTree = &getAnalysis<MachineDominatorTree>();
- VRM = &getAnalysis<VirtRegMap>();
- LIS = &getAnalysis<LiveIntervals>();
-
- if (VRM->isShapeMapEmpty())
+ if (VRM.isShapeMapEmpty())
return false;
- tileConfig();
+ int SS = INT_MAX;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() == X86::LDTILECFG) {
+ SS = MI.getOperand(0).getIndex();
+ break;
+ }
+ }
+ if (SS != INT_MAX)
+ break;
+ }
+
+ // Try to find a point to insert MIs for constant shapes.
+ // Here we are leveraging the palette id inserted in PreRA pass.
+ unsigned ConstPos = 0;
+ MachineInstr *ConstMI = nullptr;
+ for (MachineInstr &MI : MF.front()) {
+ if (MI.getOpcode() == X86::MOV8mi && SS == MI.getOperand(0).getIndex()) {
+ ConstMI = &MI;
+ break;
+ }
+ ++ConstPos;
+ }
+ assert(ConstMI && "Cannot find an insertion point");
+
+ unsigned AMXRegNum = TRI->getRegClass(X86::TILERegClassID)->getNumRegs();
+ SmallVector<Register, 8> Phys2Virt(AMXRegNum, 0);
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ Register VirtReg = Register::index2VirtReg(I);
+ if (MRI.reg_nodbg_empty(VirtReg))
+ continue;
+ if (MRI.getRegClass(VirtReg)->getID() != X86::TILERegClassID)
+ continue;
+ unsigned Index = VRM.getPhys(VirtReg) - X86::TMM0;
+ if (!Phys2Virt[Index])
+ Phys2Virt[Index] = VirtReg;
+ }
+
+ // Fill in the shape of each tile physical register.
+ for (unsigned I = 0; I < AMXRegNum; ++I) {
+ if (!Phys2Virt[I])
+ continue;
+ DebugLoc DL;
+ bool IsRow = true;
+ MachineInstr *NewMI = nullptr;
+ ShapeT Shape = VRM.getShape(Phys2Virt[I]);
+ for (auto &R : {Shape.getRow()->getReg(), Shape.getCol()->getReg()}) {
+ // Here is the data format for the tile config.
+ // 0 palette
+ // 1 start_row
+ // 2-15 reserved, must be zero
+ // 16-17 tile0.colsb Tile 0 bytes per row.
+ // 18-19 tile1.colsb Tile 1 bytes per row.
+ // 20-21 tile2.colsb Tile 2 bytes per row.
+ // ... (sequence continues)
+ // 30-31 tile7.colsb Tile 7 bytes per row.
+ // 32-47 reserved, must be zero
+ // 48 tile0.rows Tile 0 rows.
+ // 49 tile1.rows Tile 1 rows.
+ // 50 tile2.rows Tile 2 rows.
+ // ... (sequence continues)
+ // 55 tile7.rows Tile 7 rows.
+ // 56-63 reserved, must be zero
+ int64_t Imm = INT64_MAX;
+ int Offset = IsRow ? 48 + I : 16 + I * 2;
+ for (auto &DefMI : MRI.def_instructions(R)) {
+ MachineBasicBlock &MBB = *DefMI.getParent();
+ if (DefMI.isMoveImmediate()) {
+ if (Imm != INT64_MAX) {
+ // FIXME: We should handle this case in future.
+ assert(Imm == DefMI.getOperand(1).getImm() &&
+ "Cannot initialize with different shapes");
+ continue;
+ }
+ Imm = DefMI.getOperand(1).getImm();
+ NewMI = addFrameReference(
+ BuildMI(MF.front(), ++ConstMI->getIterator(), DL,
+ TII->get(IsRow ? X86::MOV8mi : X86::MOV16mi)),
+ SS, Offset)
+ .addImm(Imm);
+ ConstMI = NewMI;
+ LIS.InsertMachineInstrInMaps(*NewMI);
+ } else {
+ unsigned SubIdx = IsRow ? X86::sub_8bit : X86::sub_16bit;
+ unsigned RegSize = TRI->getRegSizeInBits(*MRI.getRegClass(R));
+ if ((IsRow && RegSize == 8) || (!IsRow && RegSize == 16))
+ SubIdx = 0;
+ auto Iter = DefMI.getIterator();
+ if (&MBB == &MF.front() &&
+ (unsigned)std::distance(MBB.instr_begin(), Iter) < ConstPos)
+ Iter = ConstMI->getIterator();
+ NewMI = addFrameReference(
+ BuildMI(MBB, ++Iter, DL,
+ TII->get(IsRow ? X86::MOV8mr : X86::MOV16mr)),
+ SS, Offset)
+ .addReg(R, 0, SubIdx);
+ SlotIndex SIdx = LIS.InsertMachineInstrInMaps(*NewMI);
+ LIS.extendToIndices(LIS.getInterval(R), {SIdx.getRegSlot()});
+ }
+ }
+ IsRow = false;
+ }
+ }
return true;
}
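For reference, the rewritten X86TileConfig::runOnMachineFunction above writes the AMX tile configuration directly into the LDTILECFG stack slot: byte 0 holds the palette id stored by the pre-RA pass, byte 48 + I holds tileI.rows (one byte per tile), and bytes 16 + 2*I hold tileI.colsb (two bytes per tile), with all reserved bytes kept zero. A minimal standalone sketch of that 64-byte layout; the fillTileConfig helper and TileShape struct are hypothetical and not part of the patch:

#include <cstdint>
#include <cstring>

struct TileShape {
  uint8_t Rows;   // tileI.rows: number of rows
  uint16_t ColsB; // tileI.colsb: bytes per row
};

// Fill a 64-byte AMX tile configuration buffer using the same offsets the
// pass stores through MOV8mi/MOV16mi/MOV8mr/MOV16mr: byte 0 = palette,
// bytes 16..31 = colsb (2 bytes per tile), bytes 48..55 = rows (1 byte per
// tile); reserved bytes must stay zero.
static void fillTileConfig(uint8_t (&Cfg)[64], const TileShape *Shapes,
                           unsigned NumTiles) {
  std::memset(Cfg, 0, sizeof(Cfg));
  Cfg[0] = 1; // palette id
  for (unsigned I = 0; I < NumTiles && I < 8; ++I) {
    Cfg[48 + I] = Shapes[I].Rows;
    std::memcpy(&Cfg[16 + I * 2], &Shapes[I].ColsB, 2);
  }
}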
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp b/src/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp
index c188c74..c3031b6 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -182,8 +182,7 @@
/// Insert a vzeroupper instruction before I.
void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
MachineBasicBlock &MBB) {
- DebugLoc dl = I->getDebugLoc();
- BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
+ BuildMI(MBB, I, I->getDebugLoc(), TII->get(X86::VZEROUPPER));
++NumVZU;
EverMadeChange = true;
}
diff --git a/src/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/src/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
index 72593af..9ada0a8 100644
--- a/src/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/src/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -196,7 +196,7 @@
}
void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
MachineBasicBlock *MBB = MI->getParent();
MachineBasicBlock::iterator I = *MI;
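The X86WinAllocaExpander hunk above binds a const reference to the DebugLoc owned by the instruction instead of constructing a copy, which avoids the extra metadata-tracking reference a DebugLoc copy sets up. A minimal sketch of the pattern using hypothetical stand-in types (not LLVM's):

#include <memory>

struct Loc { std::shared_ptr<int> Node; };  // stand-in for DebugLoc
struct Inst {
  Loc L;
  const Loc &getLoc() const { return L; }   // like MI->getDebugLoc()
};

static void lower(const Inst &I) {
  const Loc &DL = I.getLoc(); // new form: aliases the existing object, no copy
  Loc Copied = I.getLoc();    // old form: constructs a copy on every call
  (void)DL;
  (void)Copied;
}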
diff --git a/src/llvm-project/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/src/llvm-project/llvm/lib/Target/XCore/XCoreISelLowering.cpp
index db3dd7f..51fdfe5 100644
--- a/src/llvm-project/llvm/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/src/llvm-project/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -937,8 +937,8 @@
LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const {
AtomicSDNode *N = cast<AtomicSDNode>(Op);
assert(N->getOpcode() == ISD::ATOMIC_LOAD && "Bad Atomic OP");
- assert((N->getOrdering() == AtomicOrdering::Unordered ||
- N->getOrdering() == AtomicOrdering::Monotonic) &&
+ assert((N->getSuccessOrdering() == AtomicOrdering::Unordered ||
+ N->getSuccessOrdering() == AtomicOrdering::Monotonic) &&
"setInsertFencesForAtomic(true) expects unordered / monotonic");
if (N->getMemoryVT() == MVT::i32) {
if (N->getAlignment() < 4)
@@ -968,8 +968,8 @@
LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const {
AtomicSDNode *N = cast<AtomicSDNode>(Op);
assert(N->getOpcode() == ISD::ATOMIC_STORE && "Bad Atomic OP");
- assert((N->getOrdering() == AtomicOrdering::Unordered ||
- N->getOrdering() == AtomicOrdering::Monotonic) &&
+ assert((N->getSuccessOrdering() == AtomicOrdering::Unordered ||
+ N->getSuccessOrdering() == AtomicOrdering::Monotonic) &&
"setInsertFencesForAtomic(true) expects unordered / monotonic");
if (N->getMemoryVT() == MVT::i32) {
if (N->getAlignment() < 4)
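The XCoreISelLowering hunks above switch the asserts from getOrdering() to getSuccessOrdering(), matching the AtomicSDNode interface in which success and failure orderings are queried separately. A small sketch that factors out the asserted condition, assuming only the calls visible in the patch; the helper name isUnorderedOrMonotonic is hypothetical:

#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/Support/AtomicOrdering.h"

using namespace llvm;

// Condition asserted by LowerATOMIC_LOAD/LowerATOMIC_STORE: with
// setInsertFencesForAtomic(true), the node's success ordering must be
// unordered or monotonic, since stronger orderings are lowered via fences.
static bool isUnorderedOrMonotonic(const AtomicSDNode *N) {
  AtomicOrdering AO = N->getSuccessOrdering();
  return AO == AtomicOrdering::Unordered || AO == AtomicOrdering::Monotonic;
}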
diff --git a/src/llvm-project/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp b/src/llvm-project/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
index 9fec74a..52a0a09 100644
--- a/src/llvm-project/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
+++ b/src/llvm-project/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -46,13 +46,13 @@
ELF::SHF_ALLOC | ELF::XCORE_SHF_CP_SECTION);
MergeableConst4Section = Ctx.getELFSection(
".cp.rodata.cst4", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 4, "");
+ ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 4);
MergeableConst8Section = Ctx.getELFSection(
".cp.rodata.cst8", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 8, "");
+ ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 8);
MergeableConst16Section = Ctx.getELFSection(
".cp.rodata.cst16", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 16, "");
+ ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 16);
CStringSection =
Ctx.getELFSection(".cp.rodata.string", ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::SHF_STRINGS |