blob: 3b293c7df0ac861f2e680c42d838e88fc4bfcb03 [file] [log] [blame]
/*
* Copyright © 2018 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "aco_builder.h"
#include "aco_ir.h"
#include <map>
#include <stack>
#include <vector>
namespace aco {
namespace {
/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
 * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
 * that we should switch to a different wave; it also contains info on the dependencies
 * that determine when we can switch back.
 *
 * This seems to apply only to ALU->ALU dependencies, as other instructions have better
 * integration with the frontend.
 *
 * Note that if we do not emit s_delay_alu, things will still be correct, but the wave
 * will stall in the ALU (and the ALU will be doing nothing else). We rely on this
 * because our cycle info is sometimes necessarily wrong (e.g. wave64 VALU
 * instructions can take a different number of cycles based on the exec mask).
 */
/* Tracks how long ago (in instructions and cycles) the value in a register was
 * written by each kind of ALU instruction, so we know what s_delay_alu wait is
 * needed before reading it.
 */
struct alu_delay_info {
   /* These are the values directly above the max representable value, i.e. the wait
    * would turn into a no-op when we try to wait for something further back than
    * this.
    */
   static constexpr int8_t valu_nop = 5;
   static constexpr int8_t trans_nop = 4;

   /* How many VALU instructions ago this value was written */
   int8_t valu_instrs = valu_nop;
   /* Cycles until the writing VALU instruction is finished */
   int8_t valu_cycles = 0;
   /* How many Transcendent instructions ago this value was written */
   int8_t trans_instrs = trans_nop;
   /* Cycles until the writing Transcendent instruction is finished */
   int8_t trans_cycles = 0;
   /* Cycles until the writing SALU instruction is finished */
   int8_t salu_cycles = 0;
   /* VALU wrote this as lane mask. */
   bool lane_mask_forwarding = true;

   /* Merge another dependency into this one, keeping the most conservative wait
    * of each kind (fewest instructions back, most cycles remaining).
    * Returns true if any field changed. */
   bool combine(const alu_delay_info& other)
   {
      bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
                     other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles ||
                     other.trans_cycles > trans_cycles;
      valu_instrs = std::min(valu_instrs, other.valu_instrs);
      trans_instrs = std::min(trans_instrs, other.trans_instrs);
      salu_cycles = std::max(salu_cycles, other.salu_cycles);
      valu_cycles = std::max(valu_cycles, other.valu_cycles);
      trans_cycles = std::max(trans_cycles, other.trans_cycles);
      lane_mask_forwarding &= other.lane_mask_forwarding;
      return changed;
   }

   /* Needs to be called after any change to keep the data consistent: clamps
    * expired or unrepresentable waits back to their no-op values. */
   bool fixup()
   {
      if (valu_instrs >= valu_nop || valu_cycles <= 0) {
         valu_instrs = valu_nop;
         valu_cycles = 0;
      }
      if (trans_instrs >= trans_nop || trans_cycles <= 0) {
         trans_instrs = trans_nop;
         trans_cycles = 0;
      }
      salu_cycles = std::max<int8_t>(salu_cycles, 0);
      return empty();
   }

   /* Returns true if a wait would be a no-op */
   bool empty() const
   {
      return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
   }

   UNUSED void print(FILE* output) const
   {
      /* The int8_t fields promote to (signed) int in varargs, so use %d;
       * %u would mismatch the argument type (-Wformat, UB per the C standard). */
      if (valu_instrs != valu_nop)
         fprintf(output, "valu_instrs: %d\n", valu_instrs);
      if (valu_cycles)
         fprintf(output, "valu_cycles: %d\n", valu_cycles);
      if (trans_instrs != trans_nop)
         fprintf(output, "trans_instrs: %d\n", trans_instrs);
      if (trans_cycles)
         fprintf(output, "trans_cycles: %d\n", trans_cycles);
      if (salu_cycles)
         fprintf(output, "salu_cycles: %d\n", salu_cycles);
   }
};
/* Per-basic-block state: the outstanding ALU delay for every physical register
 * that still has an unfinished/recent ALU write. */
struct delay_ctx {
   /* Default-initialize so the default constructor doesn't leave this pointer
    * indeterminate (reading it would be UB). */
   Program* program = nullptr;
   /* Maps each written physical register to its pending dependency info. */
   std::map<PhysReg, alu_delay_info> gpr_map;

   delay_ctx() {}
   delay_ctx(Program* program_) : program(program_) {}

   UNUSED void print(FILE* output) const
   {
      for (const auto& entry : gpr_map) {
         /* Registers >= 256 are VGPRs, below are SGPRs. */
         fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
                 entry.first.reg() & 0xff);
         entry.second.print(output);
         fprintf(output, "}\n");
      }
   }
};
/* Accumulate into `delay` the wait required before `instr` may read its
 * operands, based on the outstanding writes recorded in ctx.gpr_map. */
void
check_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr)
{
   for (unsigned op_idx = 0; op_idx < instr->operands.size(); op_idx++) {
      const Operand& op = instr->operands[op_idx];
      if (op.isConstant() || op.isUndefined())
         continue;

      /* Gather the delays of all consecutively read gprs of this operand. */
      alu_delay_info src_delay;
      for (unsigned reg_ofs = 0; reg_ofs < op.size(); reg_ofs++) {
         auto entry = ctx.gpr_map.find(PhysReg{op.physReg() + reg_ofs});
         if (entry != ctx.gpr_map.end())
            src_delay.combine(entry->second);
      }

      /* The lane-mask selector operand of v_cndmask can be forwarded, so no
       * wait is needed for it when the producer allows forwarding. */
      bool fast_forward = false;
      if (op_idx == 2) {
         fast_forward = instr->opcode == aco_opcode::v_cndmask_b32 ||
                        instr->opcode == aco_opcode::v_cndmask_b16 ||
                        instr->opcode == aco_opcode::v_dual_cndmask_b32;
      }
      if (instr->isVOPD() && instr->vopd().opy == aco_opcode::v_dual_cndmask_b32 &&
          op_idx + 1 == instr->operands.size())
         fast_forward = true;

      if (!fast_forward || !src_delay.lane_mask_forwarding)
         delay.combine(src_delay);
   }
}
/* Advance all tracked dependencies by one issued instruction (of the given
 * kinds) and `cycles` elapsed cycles, dropping entries that become no-ops. */
void
update_alu(delay_ctx& ctx, bool is_valu, bool is_trans, int cycles)
{
   for (auto it = ctx.gpr_map.begin(); it != ctx.gpr_map.end();) {
      alu_delay_info& info = it->second;
      if (is_valu)
         info.valu_instrs++;
      if (is_trans)
         info.trans_instrs++;
      info.salu_cycles -= cycles;
      info.valu_cycles -= cycles;
      info.trans_cycles -= cycles;
      if (info.fixup())
         it = ctx.gpr_map.erase(it);
      else
         ++it;
   }
}
/* Compute into `delay` the s_delay_alu wait needed before `instr`, after first
 * applying any waits the hardware performs implicitly for this instruction.
 */
void
kill_alu(alu_delay_info& delay, Instruction* instr, delay_ctx& ctx)
{
   /* Consider frontend waits first. These are automatically done by the hardware,
    * so we don't need to insert s_delay_alu.
    * They are also lower granularity, waiting for accesses of a counter instead
    * of only the real per register dependencies.
    */
   depctr_wait wait = parse_depctr_wait(instr);
   int8_t implict_cycles = 0;
   if (!wait.va_vdst || !wait.va_sdst || !wait.va_vcc || !wait.sa_sdst || !wait.sa_exec ||
       !wait.va_exec) {
      std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end()) {
         alu_delay_info& entry = it->second;
         /* Does the implicit wait cover VALU/trans writes to this register?
          * va_vdst covers everything; va_sdst covers SGPRs below vcc; va_vcc
          * covers vcc/vcc_hi; va_exec covers exec/exec_hi. */
         bool wait_valu = !wait.va_vdst || (it->first < vcc && !wait.va_sdst) ||
                          (it->first >= vcc && it->first <= vcc_hi && !wait.va_vcc) ||
                          (it->first >= exec && it->first <= exec_hi && !wait.va_exec);
         if (wait_valu) {
            /* Track the longest wait so the elapsed time can be applied below. */
            implict_cycles = MAX3(implict_cycles, entry.valu_cycles, entry.trans_cycles);
            entry.valu_cycles = 0;
            entry.trans_cycles = 0;
         }
         /* Same check for SALU writes (SGPRs/vcc/scc via sa_sdst, exec via sa_exec). */
         bool wait_salu = ((it->first <= vcc_hi || it->first == scc) && !wait.sa_sdst) ||
                          (it->first >= exec && it->first <= exec_hi && !wait.sa_exec);
         if (wait_salu) {
            implict_cycles = MAX2(implict_cycles, entry.salu_cycles);
            entry.salu_cycles = 0;
         }
         /* fixup() returns true when the entry became a no-op and can be dropped. */
         it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
      }
   }
   /* Previous alu progresses as usual while the frontend waits. */
   if (implict_cycles != 0)
      update_alu(ctx, false, false, implict_cycles);
   /* Only ALU->ALU dependencies need an explicit s_delay_alu (see file comment). */
   if (instr->isVALU() || instr->isSALU())
      check_alu(ctx, delay, instr);
   if (!delay.empty()) {
      /* While we wait, all other outstanding ALU results progress by the same cycles. */
      update_alu(ctx, false, false, MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));
      /* remove all gprs with higher counter from map */
      std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end()) {
         if (delay.valu_instrs <= it->second.valu_instrs)
            it->second.valu_instrs = alu_delay_info::valu_nop;
         if (delay.trans_instrs <= it->second.trans_instrs)
            it->second.trans_instrs = alu_delay_info::trans_nop;
         it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
      }
   }
}
/* Record the delays produced by `instr`'s definitions and advance all
 * previously tracked dependencies past this instruction's issue. */
void
gen_alu(Instruction* instr, delay_ctx& ctx)
{
   const Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
   const bool is_valu = instr->isVALU();
   const bool is_trans = instr->isTrans();
   const bool is_salu = instr->isSALU();

   if (is_trans || is_valu || is_salu) {
      /* The delay a reader of this instruction's results would have to wait for. */
      alu_delay_info delay;
      delay.lane_mask_forwarding = false;
      if (is_trans) {
         delay.trans_instrs = 0;
         delay.trans_cycles = cycle_info.latency;
      } else if (is_valu) {
         delay.valu_instrs = 0;
         delay.valu_cycles = cycle_info.latency;
      } else {
         delay.salu_cycles = cycle_info.latency;
      }

      for (Definition& def : instr->definitions) {
         /* Lane masks written by most VALU instructions can be forwarded to
          * v_cndmask, except for readlane-style opcodes. */
         if (is_valu && def.regClass() == ctx.program->lane_mask) {
            delay.lane_mask_forwarding = instr->opcode != aco_opcode::v_readlane_b32_e64 &&
                                         instr->opcode != aco_opcode::v_readfirstlane_b32;
         }
         for (unsigned reg_ofs = 0; reg_ofs < def.size(); reg_ofs++) {
            auto res = ctx.gpr_map.emplace(PhysReg{def.physReg().reg() + reg_ofs}, delay);
            if (!res.second)
               res.first->second.combine(delay);
         }
      }
   }

   /* WMMA doesn't count towards the VALU instruction counters. */
   update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma, is_trans,
              cycle_info.issue_cycles);
}
/* Encode `delay` into an s_delay_alu instruction, append it, and clear `delay`. */
void
emit_delay_alu(delay_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
               alu_delay_info& delay)
{
   uint32_t imm = 0;

   /* Transcendental dependency goes into the first wait slot. */
   if (delay.trans_instrs != delay.trans_nop)
      imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;

   /* VALU dependency uses the second slot (bits 7+) if the first is occupied. */
   if (delay.valu_instrs != delay.valu_nop)
      imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0);

   /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
    * drop the SALU one. Here we use that this doesn't really affect correctness so occasionally
    * getting this wrong isn't an issue. */
   if (delay.salu_cycles && imm <= 0xf) {
      unsigned cycles = std::min<uint8_t>(3, delay.salu_cycles);
      imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
   }

   Instruction* delay_instr = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
   delay_instr->salu().imm = imm;
   instructions.emplace_back(delay_instr);

   /* Everything queued is now covered by the emitted wait. */
   delay = alu_delay_info();
}
/* Insert the required s_delay_alu instructions into one block, updating ctx. */
void
handle_block(Program* program, Block& block, delay_ctx& ctx)
{
   std::vector<aco_ptr<Instruction>> result;
   result.reserve(block.instructions.size());
   alu_delay_info queued_delay;

   for (aco_ptr<Instruction>& instr : block.instructions) {
      /* This pass must run before any s_delay_alu exists. */
      assert(instr->opcode != aco_opcode::s_delay_alu);
      kill_alu(queued_delay, instr.get(), ctx);
      gen_alu(instr.get(), ctx);
      /* Emit the accumulated wait immediately before the instruction needing it. */
      if (!queued_delay.empty())
         emit_delay_alu(ctx, result, queued_delay);
      result.emplace_back(std::move(instr));
   }

   block.instructions.swap(result);
}
} /* end namespace */
/* Entry point: insert s_delay_alu instructions for the whole program. */
void
insert_delay_alu(Program* program)
{
   /* per BB ctx */
   delay_ctx ctx(program);

   for (Block& block : program->blocks) {
      if (block.instructions.empty())
         continue;

      handle_block(program, block, ctx);

      /* Reset ctx if there is a jump, assuming ALU will be done
       * because branch latency is pretty high.
       */
      const bool ends_in_jump = block.linear_succs.empty() ||
                                block.instructions.back()->opcode == aco_opcode::s_branch;
      if (ends_in_jump)
         ctx = delay_ctx(program);
   }
}
/* Merge consecutive single-wait s_delay_alu instructions: the later wait is
 * folded into the earlier instruction's second slot (bits 7+) together with a
 * skip distance (bits 4-6), and the later instruction is removed.
 */
void
combine_delay_alu(Program* program)
{
   /* Combine s_delay_alu using the skip field. */
   for (Block& block : program->blocks) {
      /* Write cursor used to compact the instruction vector in place. */
      int i = 0;
      /* Index (after compaction) of a prior s_delay_alu whose second wait slot
       * is still free, or -1 if there is no merge candidate. */
      int prev_delay_alu = -1;
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (instr->opcode != aco_opcode::s_delay_alu) {
            block.instructions[i++] = std::move(instr);
            continue;
         }
         uint16_t imm = instr->salu().imm;
         /* Number of instructions between the candidate and this one. */
         int skip = i - prev_delay_alu - 1;
         /* Can't merge if this instruction already uses both slots (imm >> 7),
          * there is no candidate, or the distance doesn't fit the skip encoding. */
         if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) {
            /* A single-slot s_delay_alu becomes the next merge candidate. */
            if (imm >> 7 == 0)
               prev_delay_alu = i;
            block.instructions[i++] = std::move(instr);
            continue;
         }
         /* Fold this wait into the candidate's second slot and drop it
          * (note: `i` is not incremented, so this instruction is overwritten). */
         block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7);
         prev_delay_alu = -1;
      }
      /* Discard the tail slots freed by merged-away instructions. */
      block.instructions.resize(i);
   }
}
} // namespace aco