blob: 43b3bf43fe56dbfc3ab95f58117d4ca909de443d [file] [log] [blame]
//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU
/// instructions that produce single-use VGPR values. If the value is forwarded
/// to the consumer instruction prior to VGPR writeback, the hardware can
/// then skip (kill) the VGPR write.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUGenSearchableTables.inc"
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegister.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include <array>
using namespace llvm;
#define DEBUG_TYPE "amdgpu-insert-single-use-vdst"
namespace {
class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
private:
// Target instruction info. Not initialised at construction; it is assigned
// from the subtarget at the start of runOnMachineFunction and read when the
// S_SINGLEUSE_VDST instructions are built.
const SIInstrInfo *SII;
class SingleUseInstruction {
private:
static const unsigned MaxSkipRange = 0b111;
static const unsigned MaxNumberOfSkipRegions = 2;
unsigned LastEncodedPositionEnd;
MachineInstr *ProducerInstr;
std::array<unsigned, MaxNumberOfSkipRegions + 1> SingleUseRegions;
SmallVector<unsigned, MaxNumberOfSkipRegions> SkipRegions;
// Adds a skip region into the instruction.
void skip(const unsigned ProducerPosition) {
while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) {
SkipRegions.push_back(MaxSkipRange);
LastEncodedPositionEnd += MaxSkipRange;
}
SkipRegions.push_back(ProducerPosition - LastEncodedPositionEnd);
LastEncodedPositionEnd = ProducerPosition;
}
bool currentRegionHasSpace() {
const auto Region = SkipRegions.size();
// The first region has an extra bit of encoding space.
return SingleUseRegions[Region] <
((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U);
}
unsigned encodeImm() {
// Handle the first Single Use Region separately as it has an extra bit
// of encoding space.
unsigned Imm = SingleUseRegions[SkipRegions.size()];
unsigned ShiftAmount = 4;
for (unsigned i = SkipRegions.size(); i > 0; i--) {
Imm |= SkipRegions[i - 1] << ShiftAmount;
ShiftAmount += 3;
Imm |= SingleUseRegions[i - 1] << ShiftAmount;
ShiftAmount += 3;
}
return Imm;
}
public:
SingleUseInstruction(const unsigned ProducerPosition,
MachineInstr *Producer)
: LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer),
SingleUseRegions({1, 0, 0}) {}
// Returns false if adding a new single use producer failed. This happens
// because it could not be encoded, either because there is no room to
// encode another single use producer region or that this single use
// producer is too far away to encode the amount of instructions to skip.
bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) {
// Producer is too far away to encode into this instruction or another
// skip region is needed and SkipRegions.size() = 2 so there's no room for
// another skip region, therefore a new instruction is needed.
if (LastEncodedPositionEnd +
(MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) <
ProducerPosition)
return false;
// If a skip region is needed.
if (LastEncodedPositionEnd != ProducerPosition ||
!currentRegionHasSpace()) {
// If the current region is out of space therefore a skip region would
// be needed, but there is no room for another skip region.
if (SkipRegions.size() == MaxNumberOfSkipRegions)
return false;
skip(ProducerPosition);
}
SingleUseRegions[SkipRegions.size()]++;
LastEncodedPositionEnd = ProducerPosition + 1;
ProducerInstr = MI;
return true;
}
auto emit(const SIInstrInfo *SII) {
return BuildMI(*ProducerInstr->getParent(), ProducerInstr, DebugLoc(),
SII->get(AMDGPU::S_SINGLEUSE_VDST))
.addImm(encodeImm());
}
};
public:
// Pass identifier. It is handed to the MachineFunctionPass base class by the
// constructor below and defined after the anonymous namespace.
static char ID;
// Constructs the pass. SII is left unset here; it is assigned when
// runOnMachineFunction runs.
AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
/// Packs the given single-use producers into as few S_SINGLEUSE_VDST
/// instructions as possible and inserts them.
///
/// \param SingleUseProducers (position, instruction) pairs in the order the
/// caller collected them. Each producer is first offered to the instruction
/// created most recently; a new instruction is started only when that fails.
void insertSingleUseInstructions(
    ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const {
  SmallVector<SingleUseInstruction> Pending;

  for (const auto &Entry : SingleUseProducers) {
    const unsigned Position = Entry.first;
    MachineInstr *const Producer = Entry.second;

    // Greedy packing: only the newest instruction is ever extended.
    const bool Merged =
        !Pending.empty() && Pending.back().tryAddProducer(Position, Producer);
    if (!Merged)
      Pending.emplace_back(Position, Producer);
  }

  // Nothing is inserted until every producer has been assigned.
  for (SingleUseInstruction &Hint : Pending)
    Hint.emit(SII);
}
/// Finds VALU instructions whose results are read exactly once and marks them
/// with S_SINGLEUSE_VDST instructions.
///
/// Every basic block is walked bottom-up while counting, per register unit,
/// how many instructions read the unit since (in program order, before) its
/// next redefinition. An eligible VALU instruction whose defined units have
/// all been read at most once is recorded as a single-use producer.
///
/// \returns false if the subtarget has no single-use hint instructions or no
/// producer was found; true if at least one producer was recorded (and hence
/// at least one instruction was inserted).
bool runOnMachineFunction(MachineFunction &MF) override {
  const auto &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasVGPRSingleUseHintInsts())
    return false;

  SII = ST.getInstrInfo();
  const auto *TRI = &SII->getRegisterInfo();
  bool InstructionEmitted = false;

  for (MachineBasicBlock &MBB : MF) {
    // Reads observed so far for each register unit. A count above one means
    // "not single use".
    DenseMap<MCRegUnit, unsigned> RegisterUseCount;

    // Handle boundaries at the end of basic block separately to avoid
    // false positives. If a unit is live out of the block, assume it has
    // more uses later on by starting its count at two.
    for (const auto &Liveout : MBB.liveouts()) {
      for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid();
           ++Units) {
        const auto [Unit, Mask] = *Units;
        if ((Mask & Liveout.LaneMask).any())
          RegisterUseCount[Unit] = 2;
      }
    }

    // (position, instruction) for each producer found, where the position is
    // the number of eligible VALU instructions between it and the block end.
    // NOTE(review): VALU instructions rejected by
    // isInvalidSingleUseProducerInst are not counted in the position; confirm
    // this matches how the hardware counts instructions.
    SmallVector<std::pair<unsigned, MachineInstr *>>
        SingleUseProducerPositions;
    unsigned VALUInstrCount = 0;

    for (MachineInstr &MI : reverse(MBB.instrs())) {
      // Debug instructions are not executed. Counting their operands as
      // reads would make the generated code depend on whether debug info is
      // present.
      if (MI.isDebugInstr())
        continue;

      // All registers in all operands need to be single use for an
      // instruction to be marked as a single use producer.
      bool AllProducerOperandsAreSingleUse = true;

      // Gather a list of Registers used before updating use counts to avoid
      // double counting registers that appear multiple times in a single
      // MachineInstr.
      SmallVector<MCRegUnit> RegistersUsed;

      for (const auto &Operand : MI.all_defs()) {
        const auto Reg = Operand.getReg();
        // A null register has no register units to track.
        if (!Reg.isPhysical())
          continue;

        const auto RegUnits = TRI->regunits(Reg);
        if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit Unit) {
              return RegisterUseCount[Unit] > 1;
            }))
          AllProducerOperandsAreSingleUse = false;

        // Reset uses count when a register is no longer live.
        for (const MCRegUnit Unit : RegUnits)
          RegisterUseCount.erase(Unit);
      }

      for (const auto &Operand : MI.all_uses()) {
        const auto Reg = Operand.getReg();
        // A null register has no register units to track.
        if (!Reg.isPhysical())
          continue;

        // Collect each register unit read by this instruction once.
        for (const MCRegUnit Unit : TRI->regunits(Reg)) {
          if (!is_contained(RegistersUsed, Unit))
            RegistersUsed.push_back(Unit);
        }
      }
      // Uses are counted after the defs were processed, so a register that
      // is both written and read by MI counts as a read of the earlier value.
      for (const MCRegUnit Unit : RegistersUsed)
        RegisterUseCount[Unit]++;

      // Do not attempt to optimise across exec mask changes or across
      // instructions that are invalid single-use consumers: every unit
      // tracked at this point is treated as having multiple uses.
      if (MI.modifiesRegister(AMDGPU::EXEC, TRI) ||
          AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) {
        for (auto &UsedReg : RegisterUseCount)
          UsedReg.second = 2;
      }

      // Only eligible VALU instructions are candidates and advance the
      // position counter.
      if (!SIInstrInfo::isVALU(MI) ||
          AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode()))
        continue;
      if (AllProducerOperandsAreSingleUse) {
        SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
        InstructionEmitted = true;
      }
      VALUInstrCount++;
    }
    insertSingleUseInstructions(SingleUseProducerPositions);
  }
  return InstructionEmitted;
}
};
} // namespace
// Definition of the static pass identifier declared inside the class.
char AMDGPUInsertSingleUseVDST::ID = 0;
// Makes the pass identifier reachable from outside this file under the name
// llvm::AMDGPUInsertSingleUseVDSTID.
char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;
// Registers the pass with DEBUG_TYPE as its command-line name and the string
// below as its description. Per LLVM's PassSupport.h the two trailing
// arguments are the CFG-only and is-analysis flags, both false here.
INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
"AMDGPU Insert SingleUseVDST", false, false)