/* -*- c++ -*- */
/*
* Copyright © 2010-2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#pragma once
#include "brw_ir.h"
#include "brw_ir_allocator.h"
struct fs_inst : public exec_node {
private:
fs_inst &operator=(const fs_inst &);
void init(enum opcode opcode, uint8_t exec_width, const brw_reg &dst,
const brw_reg *src, unsigned sources);
public:
DECLARE_RALLOC_CXX_OPERATORS(fs_inst)
fs_inst();
fs_inst(enum opcode opcode, uint8_t exec_size);
fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst);
fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
const brw_reg &src0);
fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
const brw_reg &src0, const brw_reg &src1);
fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
const brw_reg &src0, const brw_reg &src1, const brw_reg &src2);
fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
const brw_reg src[], unsigned sources);
fs_inst(const fs_inst &that);
~fs_inst();
void resize_sources(uint8_t num_sources);
bool is_send_from_grf() const;
bool is_payload(unsigned arg) const;
bool is_partial_write() const;
unsigned components_read(unsigned i) const;
unsigned size_read(const struct intel_device_info *devinfo, int arg) const;
bool can_do_source_mods(const struct intel_device_info *devinfo) const;
bool can_do_cmod() const;
bool can_change_types() const;
bool has_source_and_destination_hazard() const;
bool is_3src(const struct brw_compiler *compiler) const;
bool is_math() const;
bool is_control_flow_begin() const;
bool is_control_flow_end() const;
bool is_control_flow() const;
bool is_commutative() const;
bool is_raw_move() const;
bool can_do_saturate() const;
bool reads_accumulator_implicitly() const;
bool writes_accumulator_implicitly(const struct intel_device_info *devinfo) const;
/**
* Instructions that use indirect addressing have additional register
* regioning restrictions.
*/
bool uses_indirect_addressing() const;
void remove(bblock_t *block, bool defer_later_block_ip_updates = false);
void insert_after(bblock_t *block, fs_inst *inst);
void insert_before(bblock_t *block, fs_inst *inst);
/**
* True if the instruction has side effects other than writing to
* its destination registers. You are expected not to reorder or
* optimize these out unless you know what you are doing.
*/
bool has_side_effects() const;
/**
* True if the instruction might be affected by side effects of other
* instructions.
*/
bool is_volatile() const;
/**
* Return whether \p arg is a control source of a virtual instruction which
* shouldn't contribute to the execution type and usual regioning
* restriction calculations of arithmetic instructions.
*/
bool is_control_source(unsigned arg) const;
/**
* Return the subset of flag registers read by the instruction as a bitset
* with byte granularity.
*/
unsigned flags_read(const intel_device_info *devinfo) const;
/**
* Return the subset of flag registers updated by the instruction (either
* partially or fully) as a bitset with byte granularity.
*/
unsigned flags_written(const intel_device_info *devinfo) const;
/**
* Return true if this instruction is a sampler message gathering residency
* data.
*/
bool has_sampler_residency() const;
/**
* Return true if this instruction is using the address register
* implicitly.
*/
bool uses_address_register_implicitly() const;
uint8_t sources; /**< Number of brw_reg sources. */
/**
* Execution size of the instruction. This is used by the generator to
* generate the correct binary for the given instruction. Currently valid
* values are 1, 4, 8, 16 and 32.
*/
uint8_t exec_size;
/**
* Channel group from the hardware execution and predication mask that
* should be applied to the instruction. The subset of channel enable
* signals (calculated from the EU control flow and predication state)
* given by [group, group + exec_size) will be used to mask GRF writes and
* any other side effects of the instruction.
*/
uint8_t group;
uint8_t mlen; /**< SEND message length */
uint8_t ex_mlen; /**< SENDS extended message length */
uint8_t sfid; /**< SFID for SEND instructions */
/** The number of hardware registers used for a message header. */
uint8_t header_size;
uint8_t target; /**< MRT target. */
uint32_t desc; /**< SEND[S] message descriptor immediate */
uint32_t ex_desc; /**< SEND[S] extended message descriptor immediate */
uint32_t offset; /**< spill/unspill offset or texture offset bitfield */
unsigned size_written; /**< Data written to the destination register in bytes. */
enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */
enum brw_predicate predicate;
tgl_swsb sched; /**< Scheduling info. */
union {
struct {
/* Chooses which flag subregister (f0.0 to f3.1) is used for
* conditional mod and predication.
*/
unsigned flag_subreg:3;
/**
* Systolic depth used by DPAS instruction.
*/
unsigned sdepth:4;
/**
* Repeat count used by DPAS instruction.
*/
unsigned rcount:4;
unsigned pad:2;
bool predicate_inverse:1;
bool writes_accumulator:1; /**< instruction implicitly writes accumulator */
bool force_writemask_all:1;
bool no_dd_clear:1;
bool no_dd_check:1;
bool saturate:1;
bool shadow_compare:1;
bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */
bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */
bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended
* bindless surface offset (26 bits instead of
* 20 bits)
*/
/**
* The predication mask applied to this instruction is guaranteed to
* be uniform and a superset of the execution mask of the present block.
* No currently enabled channel will be disabled by the predicate.
*/
bool predicate_trivial:1;
bool eot:1;
bool last_rt:1;
bool pi_noperspective:1; /**< Pixel interpolator noperspective flag */
bool keep_payload_trailing_zeros:1;
/**
* Whether the parameters of the SEND instruction are built with
* NoMask (for A32 messages this covers only the surface handle, for
* A64 messages this covers the load address).
*/
bool has_no_mask_send_params:1;
};
uint32_t bits;
};
brw_reg dst;
brw_reg *src;
brw_reg builtin_src[4];
#ifndef NDEBUG
/** @{
* Annotation for the generated IR.
*/
const char *annotation;
/** @} */
#endif
};
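/*
 * Illustrative construction sketch (not part of the original header): fs_inst
 * gets ralloc-based operator new from DECLARE_RALLOC_CXX_OPERATORS, so an
 * instruction can be allocated against a memory context and filled in through
 * one of the constructors above.  `mem_ctx`, `dst`, `src0` and `src1` are
 * assumed to exist in the caller.
 *
 *    fs_inst *add = new (mem_ctx)
 *       fs_inst(BRW_OPCODE_ADD, 16, dst, src0, src1);
 */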
/**
* Make the execution of \p inst dependent on the evaluation of a possibly
* inverted predicate.
*/
static inline fs_inst *
set_predicate_inv(enum brw_predicate pred, bool inverse,
fs_inst *inst)
{
inst->predicate = pred;
inst->predicate_inverse = inverse;
return inst;
}
/**
* Make the execution of \p inst dependent on the evaluation of a predicate.
*/
static inline fs_inst *
set_predicate(enum brw_predicate pred, fs_inst *inst)
{
return set_predicate_inv(pred, false, inst);
}
/**
* Write the result of evaluating the condition given by \p mod to a flag
* register.
*/
static inline fs_inst *
set_condmod(enum brw_conditional_mod mod, fs_inst *inst)
{
inst->conditional_mod = mod;
return inst;
}
/**
* Clamp the result of \p inst to the saturation range of its destination
* datatype.
*/
static inline fs_inst *
set_saturate(bool saturate, fs_inst *inst)
{
inst->saturate = saturate;
return inst;
}
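/*
 * Sketch of how the helpers above are typically chained when emitting IR
 * (illustrative only; `cmp` and `sel` are assumed to be fs_inst pointers
 * created elsewhere):
 *
 *    set_condmod(BRW_CONDITIONAL_GE, cmp);      // CMP updates the flag register
 *    set_predicate(BRW_PREDICATE_NORMAL, sel);  // SEL executes per enabled channel
 *    set_saturate(true, sel);                   // clamp the SEL result on write-back
 */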
/**
* Return the number of dataflow registers written by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->dst) /
* register_size)'. The somewhat arbitrary register size unit is 4B for the
* UNIFORM and IMM files and 32B for all other files.
*/
inline unsigned
regs_written(const fs_inst *inst)
{
assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
inst->size_written -
MIN2(inst->size_written, reg_padding(inst->dst)),
REG_SIZE);
}
/**
* Return the number of dataflow registers read by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
* register_size)'. The somewhat arbitrary register size unit is 4B for the
* UNIFORM files and 32B for all other files.
*/
inline unsigned
regs_read(const struct intel_device_info *devinfo, const fs_inst *inst, unsigned i)
{
if (inst->src[i].file == IMM)
return 1;
const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
inst->size_read(devinfo, i) -
MIN2(inst->size_read(devinfo, i), reg_padding(inst->src[i])),
reg_size);
}
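/*
 * Sketch of a typical consumer of regs_written()/regs_read(), e.g. a liveness
 * or interference walk over VGRFs (illustrative; `inst`, `devinfo` and the
 * hypothetical mark_read()/mark_written() callbacks are assumptions):
 *
 *    if (inst->dst.file == VGRF) {
 *       for (unsigned r = 0; r < regs_written(inst); r++)
 *          mark_written(inst->dst.nr, reg_offset(inst->dst) / REG_SIZE + r);
 *    }
 *    for (unsigned i = 0; i < inst->sources; i++) {
 *       if (inst->src[i].file == VGRF) {
 *          for (unsigned r = 0; r < regs_read(devinfo, inst, i); r++)
 *             mark_read(inst->src[i].nr, reg_offset(inst->src[i]) / REG_SIZE + r);
 *       }
 *    }
 */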
static inline enum brw_reg_type
get_exec_type(const fs_inst *inst)
{
brw_reg_type exec_type = BRW_TYPE_B;
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file != BAD_FILE &&
!inst->is_control_source(i)) {
const brw_reg_type t = get_exec_type(inst->src[i].type);
if (brw_type_size_bytes(t) > brw_type_size_bytes(exec_type))
exec_type = t;
else if (brw_type_size_bytes(t) == brw_type_size_bytes(exec_type) &&
brw_type_is_float(t))
exec_type = t;
}
}
if (exec_type == BRW_TYPE_B)
exec_type = inst->dst.type;
assert(exec_type != BRW_TYPE_B);
/* Promotion of the execution type to 32-bit for conversions from or to
* half-float seems to be consistent with the following text from the
* Cherryview PRM Vol. 7, "Execution Data Type":
*
* "When single precision and half precision floats are mixed between
* source operands or between source and destination operand [..] single
* precision float is the execution datatype."
*
* and from "Register Region Restrictions":
*
* "Conversion between Integer and HF (Half Float) must be DWord aligned
* and strided by a DWord on the destination."
*/
if (brw_type_size_bytes(exec_type) == 2 &&
inst->dst.type != exec_type) {
if (exec_type == BRW_TYPE_HF)
exec_type = BRW_TYPE_F;
else if (inst->dst.type == BRW_TYPE_HF)
exec_type = BRW_TYPE_D;
}
return exec_type;
}
static inline unsigned
get_exec_type_size(const fs_inst *inst)
{
return brw_type_size_bytes(get_exec_type(inst));
}
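/*
 * Worked example for the execution-type rules above (illustrative): an
 * instruction whose non-control sources are typed W and D executes as D, so
 * get_exec_type_size() returns 4.  Per the half-float promotion, an HF
 * execution type paired with a non-HF destination is widened to F, while a
 * 16-bit integer execution type paired with an HF destination is widened to D.
 */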
static inline bool
is_send(const fs_inst *inst)
{
return inst->mlen || inst->is_send_from_grf();
}
/**
* Return whether the instruction isn't an ALU instruction and cannot be
* assumed to complete in-order.
*/
static inline bool
is_unordered(const intel_device_info *devinfo, const fs_inst *inst)
{
return is_send(inst) || (devinfo->ver < 20 && inst->is_math()) ||
inst->opcode == BRW_OPCODE_DPAS ||
(devinfo->has_64bit_float_via_math_pipe &&
(get_exec_type(inst) == BRW_TYPE_DF ||
inst->dst.type == BRW_TYPE_DF));
}
/**
* Return whether the following regioning restriction applies to the specified
* instruction. From the Cherryview PRM Vol 7. "Register Region
* Restrictions":
*
* "When source or destination datatype is 64b or operation is integer DWord
* multiply, regioning in Align1 must follow these rules:
*
* 1. Source and Destination horizontal stride must be aligned to the same qword.
* 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
* 3. Source and Destination offset must be the same, except the case of
* scalar source."
*/
static inline bool
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
const fs_inst *inst,
brw_reg_type dst_type)
{
const brw_reg_type exec_type = get_exec_type(inst);
/* Even though the hardware spec claims that "integer DWord multiply"
* operations are restricted, empirical evidence and the behavior of the
* simulator suggest that only 32x32-bit integer multiplication is
* restricted.
*/
const bool is_dword_multiply = !brw_type_is_float(exec_type) &&
((inst->opcode == BRW_OPCODE_MUL &&
MIN2(brw_type_size_bytes(inst->src[0].type), brw_type_size_bytes(inst->src[1].type)) >= 4) ||
(inst->opcode == BRW_OPCODE_MAD &&
MIN2(brw_type_size_bytes(inst->src[1].type), brw_type_size_bytes(inst->src[2].type)) >= 4));
if (brw_type_size_bytes(dst_type) > 4 || brw_type_size_bytes(exec_type) > 4 ||
(brw_type_size_bytes(exec_type) == 4 && is_dword_multiply))
return intel_device_info_is_9lp(devinfo) || devinfo->verx10 >= 125;
else if (brw_type_is_float(dst_type))
return devinfo->verx10 >= 125;
else
return false;
}
static inline bool
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
const fs_inst *inst)
{
return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
}
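/*
 * Worked example (illustrative): a DF- or Q-typed destination makes
 * brw_type_size_bytes(dst_type) > 4, so the restriction applies on platforms
 * where intel_device_info_is_9lp() is true and on verx10 >= 125 platforms,
 * whereas a plain F-typed destination with a 32-bit execution type and no
 * dword multiply only triggers it on verx10 >= 125.
 */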
/**
* Return true if the instruction can be potentially affected by the Xe2+
* regioning restrictions that apply to integer types smaller than a dword.
* The restriction isn't quoted here due to its length, see BSpec #56640 for
* details.
*/
static inline bool
has_subdword_integer_region_restriction(const intel_device_info *devinfo,
const fs_inst *inst,
const brw_reg *srcs, unsigned num_srcs)
{
if (devinfo->ver >= 20 &&
brw_type_is_int(inst->dst.type) &&
MAX2(byte_stride(inst->dst),
brw_type_size_bytes(inst->dst.type)) < 4) {
for (unsigned i = 0; i < num_srcs; i++) {
if (brw_type_is_int(srcs[i].type) &&
((brw_type_size_bytes(srcs[i].type) < 4 &&
byte_stride(srcs[i]) >= 4) ||
(MAX2(byte_stride(inst->dst),
brw_type_size_bytes(inst->dst.type)) == 1 &&
brw_type_size_bytes(srcs[i].type) == 1 &&
byte_stride(srcs[i]) >= 2)))
return true;
}
}
return false;
}
static inline bool
has_subdword_integer_region_restriction(const intel_device_info *devinfo,
const fs_inst *inst)
{
return has_subdword_integer_region_restriction(devinfo, inst,
inst->src, inst->sources);
}
/**
* Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
* the specified register file into a VGRF.
*
* This implies identity register regions without any source-destination
* overlap, but otherwise has no implications for the location of sources and
* destination in the register file: Gathering any number of portions from
* multiple virtual registers in any order is allowed.
*/
inline bool
is_copy_payload(const struct intel_device_info *devinfo,
brw_reg_file file, const fs_inst *inst)
{
if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD ||
inst->is_partial_write() || inst->saturate ||
inst->dst.file != VGRF)
return false;
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file != file ||
inst->src[i].abs || inst->src[i].negate)
return false;
if (!inst->src[i].is_contiguous())
return false;
if (regions_overlap(inst->dst, inst->size_written,
inst->src[i], inst->size_read(devinfo, i)))
return false;
}
return true;
}
/**
* Like is_copy_payload(), but the instruction is required to copy a single
* contiguous block of registers from the given register file into the
* destination without any reordering.
*/
inline bool
is_identity_payload(const struct intel_device_info *devinfo,
brw_reg_file file, const fs_inst *inst)
{
if (is_copy_payload(devinfo, file, inst)) {
brw_reg reg = inst->src[0];
for (unsigned i = 0; i < inst->sources; i++) {
reg.type = inst->src[i].type;
if (!inst->src[i].equals(reg))
return false;
reg = byte_offset(reg, inst->size_read(devinfo, i));
}
return true;
} else {
return false;
}
}
/**
* Like is_copy_payload(), but the instruction is required to source data from
* at least two disjoint VGRFs.
*
* This doesn't necessarily rule out the elimination of this instruction
* through register coalescing, but due to limitations of the register
* coalesce pass it might be impossible to do so directly until a later stage,
* when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV
* instructions.
*/
inline bool
is_multi_copy_payload(const struct intel_device_info *devinfo,
const fs_inst *inst)
{
if (is_copy_payload(devinfo, VGRF, inst)) {
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].nr != inst->src[0].nr)
return true;
}
}
return false;
}
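/*
 * Example of the distinction above (illustrative register numbers): a
 * LOAD_PAYLOAD whose sources are consecutive, contiguous portions of the same
 * VGRF (v1+0, v1+1) is an identity payload, while one gathering from two
 * different VGRFs (v2, v1) is a multi-copy payload; both are copy payloads as
 * long as the regions are contiguous, unmodified and do not overlap the
 * destination.
 */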
/**
* Like is_identity_payload(), but the instruction is required to copy the
* whole contents of a single VGRF into the destination.
*
* This means that there is a good chance that the instruction will be
* eliminated through register coalescing, but it's neither a necessary nor a
* sufficient condition for that to happen -- E.g. consider the case where
* source and destination registers diverge due to other instructions in the
* program overwriting part of their contents, which isn't something we can
* predict up front based on a cheap strictly local test of the copy
* instruction.
*/
inline bool
is_coalescing_payload(const struct intel_device_info *devinfo,
const brw::simple_allocator &alloc, const fs_inst *inst)
{
return is_identity_payload(devinfo, VGRF, inst) &&
inst->src[0].offset == 0 &&
alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written;
}
bool
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst);
/* Return the subset of flag registers that an instruction could
* potentially read or write based on the execution controls and flag
* subregister number of the instruction.
*/
static inline unsigned
brw_fs_flag_mask(const fs_inst *inst, unsigned width)
{
assert(util_is_power_of_two_nonzero(width));
const unsigned start = (inst->flag_subreg * 16 + inst->group) &
~(width - 1);
const unsigned end = start + ALIGN(inst->exec_size, width);
return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
}
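/*
 * Worked example of the arithmetic above (illustrative): a SIMD16 instruction
 * with group == 0 and flag_subreg == 0 gives start = 0 and end = 16 for
 * brw_fs_flag_mask(inst, 1), i.e. a mask of 0x3 covering the two low bytes of
 * f0.0.  A SIMD8 instruction in group 8 with flag_subreg == 2 (f1.0) gives
 * start = 40 and end = 48, i.e. a mask of 0x20.
 */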
static inline unsigned
brw_fs_bit_mask(unsigned n)
{
return (n >= CHAR_BIT * sizeof(brw_fs_bit_mask(n)) ? ~0u : (1u << n) - 1);
}
static inline unsigned
brw_fs_flag_mask(const brw_reg &r, unsigned sz)
{
if (r.file == ARF) {
const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
const unsigned end = start + sz;
return brw_fs_bit_mask(end) & ~brw_fs_bit_mask(start);
} else {
return 0;
}
}