blob: 1558278c9e97a02e909815bf15ebebab718f2bae [file] [log] [blame]
/*
* Copyright © 2024 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "aco_builder.h"
#include "aco_ir.h"
namespace aco {
namespace {
struct branch_ctx {
Program* program;
branch_ctx(Program* program_) : program(program_) {}
};
void
remove_linear_successor(branch_ctx& ctx, Block& block, uint32_t succ_index)
{
Block& succ = ctx.program->blocks[succ_index];
ASSERTED auto it = std::remove(succ.linear_preds.begin(), succ.linear_preds.end(), block.index);
assert(std::next(it) == succ.linear_preds.end());
succ.linear_preds.pop_back();
it = std::remove(block.linear_succs.begin(), block.linear_succs.end(), succ_index);
assert(std::next(it) == block.linear_succs.end());
block.linear_succs.pop_back();
if (succ.linear_preds.empty()) {
/* This block became unreachable - Recursively remove successors. */
succ.instructions.clear();
for (unsigned i : succ.linear_succs)
remove_linear_successor(ctx, succ, i);
}
}
/**
* Check if the branch instruction can be removed:
* This is beneficial when executing the next block with an empty exec mask
* is faster than the branch instruction itself.
*
* Override this judgement when:
* - The application prefers to remove control flow
* - The compiler stack knows that it's a divergent branch never taken
*/
bool
can_remove_branch(branch_ctx& ctx, Block& block, Pseudo_branch_instruction* branch)
{
const uint32_t target = branch->target[0];
const bool uniform_branch =
!((branch->opcode == aco_opcode::p_cbranch_z || branch->opcode == aco_opcode::p_cbranch_nz) &&
branch->operands[0].physReg() == exec);
if (branch->never_taken) {
assert(!uniform_branch);
return true;
}
/* Cannot remove back-edges. */
if (block.index >= target)
return false;
const bool prefer_remove = branch->rarely_taken;
unsigned num_scalar = 0;
unsigned num_vector = 0;
/* Check the instructions between branch and target */
for (unsigned i = block.index + 1; i < target; i++) {
/* Uniform conditional branches must not be ignored if they
* are about to jump over actual instructions */
if (uniform_branch && !ctx.program->blocks[i].instructions.empty())
return false;
for (aco_ptr<Instruction>& instr : ctx.program->blocks[i].instructions) {
if (instr->isSOPP()) {
/* Discard early exits and loop breaks and continues should work fine with
* an empty exec mask.
*/
if (instr->opcode == aco_opcode::s_cbranch_scc0 ||
instr->opcode == aco_opcode::s_cbranch_scc1 ||
instr->opcode == aco_opcode::s_cbranch_execz ||
instr->opcode == aco_opcode::s_cbranch_execnz) {
bool is_break_continue =
ctx.program->blocks[i].kind & (block_kind_break | block_kind_continue);
bool discard_early_exit =
ctx.program->blocks[instr->salu().imm].kind & block_kind_discard_early_exit;
if (is_break_continue || discard_early_exit)
continue;
}
return false;
} else if (instr->isSALU()) {
num_scalar++;
} else if (instr->isVALU() || instr->isVINTRP()) {
if (instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64) {
/* writelane ignores exec, writing inactive lanes results in UB. */
return false;
}
num_vector++;
/* VALU which writes SGPRs are always executed on GFX10+ */
if (ctx.program->gfx_level >= GFX10) {
for (Definition& def : instr->definitions) {
if (def.regClass().type() == RegType::sgpr)
num_scalar++;
}
}
} else if (instr->isEXP() || instr->isSMEM() || instr->isBarrier()) {
/* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs),
* SMEM might be an invalid access, and barriers are probably expensive. */
return false;
} else if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isLDSDIR()) {
// TODO: GFX6-9 can use vskip
if (!prefer_remove)
return false;
} else if (instr->opcode != aco_opcode::p_debug_info) {
assert(false && "Pseudo instructions should be lowered by this point.");
return false;
}
if (!prefer_remove) {
/* Under these conditions, we shouldn't remove the branch.
* Don't care about the estimated cycles when the shader prefers flattening.
*/
unsigned est_cycles;
if (ctx.program->gfx_level >= GFX10)
est_cycles = num_scalar * 2 + num_vector;
else
est_cycles = num_scalar * 4 + num_vector * 4;
if (est_cycles > 16)
return false;
}
}
}
return true;
}
void
lower_branch_instruction(branch_ctx& ctx, Block& block)
{
if (block.instructions.empty() || !block.instructions.back()->isBranch())
return;
aco_ptr<Instruction> branch = std::move(block.instructions.back());
const uint32_t target = branch->branch().target[0];
block.instructions.pop_back();
if (can_remove_branch(ctx, block, &branch->branch())) {
if (branch->opcode != aco_opcode::p_branch)
remove_linear_successor(ctx, block, target);
return;
}
/* emit branch instruction */
Builder bld(ctx.program, &block.instructions);
switch (branch->opcode) {
case aco_opcode::p_branch:
assert(block.linear_succs[0] == target);
bld.sopp(aco_opcode::s_branch, target);
break;
case aco_opcode::p_cbranch_nz:
assert(block.linear_succs[1] == target);
if (branch->operands[0].physReg() == exec)
bld.sopp(aco_opcode::s_cbranch_execnz, target);
else if (branch->operands[0].physReg() == vcc)
bld.sopp(aco_opcode::s_cbranch_vccnz, target);
else {
assert(branch->operands[0].physReg() == scc);
bld.sopp(aco_opcode::s_cbranch_scc1, target);
}
break;
case aco_opcode::p_cbranch_z:
assert(block.linear_succs[1] == target);
if (branch->operands[0].physReg() == exec)
bld.sopp(aco_opcode::s_cbranch_execz, target);
else if (branch->operands[0].physReg() == vcc)
bld.sopp(aco_opcode::s_cbranch_vccz, target);
else {
assert(branch->operands[0].physReg() == scc);
bld.sopp(aco_opcode::s_cbranch_scc0, target);
}
break;
default: unreachable("Unknown Pseudo branch instruction!");
}
}
} /* end namespace */
void
lower_branches(Program* program)
{
branch_ctx ctx(program);
for (int i = program->blocks.size() - 1; i >= 0; i--) {
Block& block = program->blocks[i];
lower_branch_instruction(ctx, block);
}
}
} // namespace aco