/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"

using namespace brw;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    *    "A mov with the same source and destination type, no source modifier,
    *     and no saturation is a raw move. A packed byte destination region (B
    *     or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    *     using raw move."
    */
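   /* For illustration (a hypothetical example, not from the PRM):
    * mov(16) dst<1>:UB src<16;16,1>:UB is a raw byte move, while the same
    * MOV with a UW source performs an implicit conversion and therefore
    * cannot write a packed byte destination directly.
    */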
   bool
   is_byte_raw_mov(const fs_inst *inst)
   {
      return brw_type_size_bytes(inst->dst.type) == 1 &&
             inst->opcode == BRW_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

   /*
    * Return an acceptable byte stride for the specified source of an
    * instruction affected by a regioning restriction.
    */
   unsigned
   required_src_byte_stride(const intel_device_info *devinfo, const fs_inst *inst,
                            unsigned i)
   {
      if (has_dst_aligned_region_restriction(devinfo, inst)) {
         return MAX2(brw_type_size_bytes(inst->dst.type),
                     byte_stride(inst->dst));

      } else if (has_subdword_integer_region_restriction(devinfo, inst,
                                                         &inst->src[i], 1)) {
         /* Use a stride of 32 bits if possible, since that will guarantee
          * that the copy emitted to lower this region won't be affected by
          * the sub-dword integer region restrictions.  This may not be
          * possible for the second source of an instruction if we're
          * required to use packed data due to Wa_16012383669.
          */
         return (i == 1 ? brw_type_size_bytes(inst->src[i].type) : 4);

      } else {
         return byte_stride(inst->src[i]);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the specified source
    * of an instruction affected by a regioning restriction.
    */
   unsigned
   required_src_byte_offset(const intel_device_info *devinfo, const fs_inst *inst,
                            unsigned i)
   {
      if (has_dst_aligned_region_restriction(devinfo, inst)) {
         return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);

      } else if (has_subdword_integer_region_restriction(devinfo, inst,
                                                         &inst->src[i], 1)) {
         const unsigned dst_byte_stride =
            MAX2(byte_stride(inst->dst), brw_type_size_bytes(inst->dst.type));
         const unsigned src_byte_stride = required_src_byte_stride(devinfo, inst, i);
         const unsigned dst_byte_offset =
            reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const unsigned src_byte_offset =
            reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

         if (src_byte_stride > brw_type_size_bytes(inst->src[i].type)) {
            assert(src_byte_stride >= dst_byte_stride);
            /* The source is affected by the Xe2+ sub-dword integer regioning
             * restrictions.  For the case of source 0, BSpec#56640 specifies
             * a number of equations relating the source and destination
             * sub-register numbers in all cases where a source stride of
             * 32 bits is allowed.  These equations have the form:
             *
             *    k * (Dst.SubReg % m) = Src.SubReg / l
             *
             * for some constants k, l and m that differ for each combination
             * of source and destination types and strides.  The expression
             * in the return statement below computes a valid source offset
             * by inverting the equation like:
             *
             *    Src.SubReg = l * k * (Dst.SubReg % m)
             *
             * and then scaling by the element type sizes in order to get an
             * expression in terms of byte offsets instead of sub-register
             * numbers.  It can be easily verified that in all cases listed
             * in the hardware spec where the source has a well-defined
             * uniform stride the product l*k is equal to the ratio between
             * the source and destination strides.
             */
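            /* Hypothetical worked example (values chosen for illustration
             * only): a B-type source read at a 32-bit stride
             * (src_byte_stride = 4) feeding a W-type destination of stride 2
             * (dst_byte_stride = 2) gives m = 64 * 2 / 4 = 32, so a
             * destination byte offset of 34 maps to a source byte offset of
             * 34 % 32 * 4 / 2 = 4.
             */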
            const unsigned m = 64 * dst_byte_stride / src_byte_stride;
            return dst_byte_offset % m * src_byte_stride / dst_byte_stride;
         } else {
            assert(src_byte_stride == brw_type_size_bytes(inst->src[i].type));
            /* A packed source is required, likely due to the stricter
             * requirements of the second source region.  The source being
             * packed guarantees that the region of the original instruction
             * will be valid, but the copy may break the regioning
             * restrictions.  Do our best to prevent that from happening by
             * making sure the offset of the temporary matches the original
             * source based on the same equation above.  However, that may
             * not be sufficient if the source had a stride larger than
             * 32 bits, in which case lowering the copy recursively may be
             * necessary.
             */
            return src_byte_offset * src_byte_stride / byte_stride(inst->src[i]);
         }

      } else {
         return reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
      }
   }

   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone.  We cannot "fix" accumulator destinations by
          * writing to a temporary and emitting a MOV into the original
          * destination.  For multiply instructions (our one use of the
          * accumulator), the MUL writes the full 66 bits of the accumulator
          * whereas the MOV we would emit only writes 33 bits and leaves the
          * top 33 bits undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
          */
         return inst->dst.hstride * brw_type_size_bytes(inst->dst.type);
      } else if (brw_type_size_bytes(inst->dst.type) < get_exec_type_size(inst) &&
                 !is_byte_raw_mov(inst)) {
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required
          * to lower.
          */
         unsigned max_stride = inst->dst.stride * brw_type_size_bytes(inst->dst.type);
         unsigned min_size = brw_type_size_bytes(inst->dst.type);
         unsigned max_size = brw_type_size_bytes(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = brw_type_size_bytes(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present
          * operands, but never exceed a stride of 4 since that would lead to
          * illegal destination regions during lowering.
          */
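         /* Worked example (hypothetical values): a UW destination with
          * stride 1 (max_stride initially 2), a UD source with stride 1
          * (byte stride 4) and a UB source with stride 1 (byte stride 1)
          * give max_stride = 4 and min_size = 1, so we return
          * MIN2(4, 4) = 4, i.e. a destination stride of two UW elements.
          */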
         return MIN2(max_stride, 4 * min_size);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
   unsigned
   required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
   {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
               return 0;
      }

      return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
   }

   /*
    * Return the closest legal execution type for an instruction on
    * the specified platform.
    */
   brw_reg_type
   required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool has_64bit = brw_type_is_float(t) ?
         devinfo->has_64bit_float : devinfo->has_64bit_int;

      switch (inst->opcode) {
      case SHADER_OPCODE_SHUFFLE:
         /* IVB has an issue (which we found empirically) where it reads
          * two address register components per channel for indirectly
          * addressed 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *     integer DWord multiply, indirect addressing must not be
          *     used."
          *
          * Work around both of the above and handle platforms that
          * don't support 64-bit types at all.
          */
         if ((!devinfo->has_64bit_int ||
              intel_device_info_is_9lp(devinfo) ||
              devinfo->ver >= 20) && brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(brw_type_size_bytes(t), false);
         else
            return t;

      case SHADER_OPCODE_SEL_EXEC:
         if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
             brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else
            return t;

      case SHADER_OPCODE_QUAD_SWIZZLE:
         if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(brw_type_size_bytes(t), false);
         else
            return t;

      case SHADER_OPCODE_CLUSTER_BROADCAST:
         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *     integer DWord multiply, indirect addressing must not be
          *     used."
          *
          * For MTL (verx10 == 125), float64 is supported, but int64 is not.
          * Therefore we need to lower cluster broadcast using 32-bit int ops.
          *
          * For gfx12.5+ platforms that support int64, the register regions
          * used by cluster broadcast aren't supported by the 64-bit pipeline.
          *
          * Work around the above and handle platforms that don't
          * support 64-bit types at all.
          */
         if ((!has_64bit || devinfo->verx10 >= 125 ||
              intel_device_info_is_9lp(devinfo) ||
              devinfo->ver >= 20) && brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else
            return brw_int_type(brw_type_size_bytes(t), false);

      default:
         return t;
      }
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
                          unsigned i)
   {
      /* Wa_22016140776:
       *
       *    Scalar broadcast on HF math (packed or unpacked) must not be used.
       *    Compiler must use a mov instruction to expand the scalar value to
       *    a vector before using in a HF (packed or unpacked) math operation.
       */
      if (inst->is_math() && intel_needs_workaround(devinfo, 22016140776) &&
          is_uniform(inst->src[i]) && inst->src[i].type == BRW_TYPE_HF) {
         return true;
      }

      if (is_send(inst) || inst->is_control_source(i) ||
          inst->opcode == BRW_OPCODE_DPAS) {
         return false;
      }

      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

      return (has_dst_aligned_region_restriction(devinfo, inst) &&
              !is_uniform(inst->src[i]) &&
              (byte_stride(inst->src[i]) != required_src_byte_stride(devinfo, inst, i) ||
               src_byte_offset != dst_byte_offset)) ||
             (has_subdword_integer_region_restriction(devinfo, inst) &&
              (byte_stride(inst->src[i]) != required_src_byte_stride(devinfo, inst, i) ||
               src_byte_offset != required_src_byte_offset(devinfo, inst, i)));
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const intel_device_info *devinfo,
                          const fs_inst *inst)
   {
      if (is_send(inst)) {
         return false;
      } else {
         const brw_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            brw_type_size_bytes(inst->dst.type) < brw_type_size_bytes(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
                  required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != byte_stride(inst->dst));
      }
   }

   /**
    * Return a non-zero value if the execution type of the instruction is
    * unsupported.  The destination and sources matching the returned mask
    * will be bit-cast to an integer type of appropriate size, lowering any
    * source or destination modifiers into separate MOV instructions.
    */
   unsigned
   has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
         switch (inst->opcode) {
         case SHADER_OPCODE_SHUFFLE:
         case SHADER_OPCODE_QUAD_SWIZZLE:
         case SHADER_OPCODE_CLUSTER_BROADCAST:
         case SHADER_OPCODE_BROADCAST:
         case SHADER_OPCODE_MOV_INDIRECT:
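            /* Only the data in the first source is interpreted in the
             * execution type; the remaining sources are control values, so
             * only bit 0 of the mask is set.
             */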
            return 0x1;

         case SHADER_OPCODE_SEL_EXEC:
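            /* Both sources carry data of the execution type and must be
             * bit-cast along with the destination, so bits 0 and 1 are set.
             */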
            return 0x3;

         default:
            unreachable("Unknown invalid execution type source mask.");
         }
      } else {
         return 0;
      }
   }

   /**
    * Return whether the instruction has an unsupported type conversion
    * that must be handled by expanding the source operand.
    */
   bool
   has_invalid_src_conversion(const intel_device_info *devinfo,
                              const fs_inst *inst)
   {
      /* Scalar byte to float conversion is not allowed on DG2+ */
      return devinfo->verx10 >= 125 &&
             inst->opcode == BRW_OPCODE_MOV &&
             brw_type_is_float(inst->dst.type) &&
             brw_type_size_bits(inst->src[0].type) == 8 &&
             is_uniform(inst->src[0]);
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const intel_device_info *devinfo,
                             const fs_inst *inst, unsigned i)
   {
      return (!inst->can_do_source_mods(devinfo) &&
              (inst->src[i].negate || inst->src[i].abs)) ||
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
              (inst->src[i].negate || inst->src[i].abs ||
               inst->src[i].type != get_exec_type(inst))) ||
             has_invalid_src_conversion(devinfo, inst);
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         return false;
      case BRW_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      default:
         /* FIXME: We assume the opcodes not explicitly mentioned before just
          * work fine with arbitrary conversions, unless they need to be
          * bit-cast.
          */
         return has_invalid_exec_type(devinfo, inst) &&
                inst->dst.type != get_exec_type(inst);
      }
   }

   /**
    * Return whether the instruction has unsupported destination modifiers.
    */
   bool
   has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
   {
      return (has_invalid_exec_type(devinfo, inst) &&
              (inst->saturate || inst->conditional_mod)) ||
             has_invalid_conversion(devinfo, inst);
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
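   /* For example, SEL uses the conditional mod to choose between its two
    * sources (implementing MIN/MAX) rather than to write a comparison
    * result to the flag register.
    */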
   bool
   has_inconsistent_cmod(const fs_inst *inst)
   {
      return inst->opcode == BRW_OPCODE_SEL ||
             inst->opcode == BRW_OPCODE_CSEL ||
             inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_WHILE;
   }

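   /* Forward declaration: the helpers below emit copies that may themselves
    * violate regioning restrictions, in which case they call back into
    * lower_instruction() recursively.
    */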
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}

namespace brw {
   /**
    * Remove any modifiers from the \p i-th source region of the instruction,
    * including negate, abs and any implicit type conversion to the execution
    * type.  Instead any source modifiers will be implemented as a separate
    * MOV instruction prior to the original instruction.
    */
   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      assert(v->devinfo->has_integer_dword_mul ||
             inst->opcode != BRW_OPCODE_MUL ||
             brw_type_is_float(get_exec_type(inst)) ||
             MIN2(brw_type_size_bytes(inst->src[0].type), brw_type_size_bytes(inst->src[1].type)) >= 4 ||
             brw_type_size_bytes(inst->src[i].type) == get_exec_type_size(inst));

      const fs_builder ibld(v, block, inst);
      const brw_reg tmp = ibld.vgrf(get_exec_type(inst));

      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
      inst->src[i] = tmp;

      return true;
   }
}

namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type.  Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
      const unsigned stride =
         brw_type_size_bytes(inst->dst.type) * inst->dst.stride <= brw_type_size_bytes(type) ? 1 :
         brw_type_size_bytes(inst->dst.type) * inst->dst.stride / brw_type_size_bytes(type);
      brw_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction.  Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
   bool
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const intel_device_info *devinfo = v->devinfo;
      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_src_byte_stride(devinfo, inst, i) /
                              brw_type_size_bytes(inst->src[i].type);
      assert(stride > 0);
      /* Calculate the size of the temporary allocation manually instead of
       * relying on the builder, since we may have to add some amount of
       * padding mandated by the hardware for Xe2+ instructions with sub-dword
       * integer regions.
       */
      const unsigned size =
         DIV_ROUND_UP(required_src_byte_offset(v->devinfo, inst, i) +
                      inst->exec_size * stride *
                      brw_type_size_bytes(inst->src[i].type),
                      reg_unit(devinfo) * REG_SIZE) * reg_unit(devinfo);
      brw_reg tmp = brw_vgrf(v->alloc.allocate(size), inst->src[i].type);
      ibld.UNDEF(tmp);
      tmp = byte_offset(horiz_stride(tmp, stride),
                        required_src_byte_offset(devinfo, inst, i));

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(brw_type_size_bytes(tmp.type), 4),
                                                 false);
      const unsigned n = brw_type_size_bytes(tmp.type) / brw_type_size_bytes(raw_type);
      brw_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++) {
         fs_inst *jnst = ibld.MOV(subscript(tmp, raw_type, j),
                                  subscript(raw_src, raw_type, j));
         if (has_subdword_integer_region_restriction(devinfo, jnst)) {
            /* The copy isn't guaranteed to comply with all subdword integer
             * regioning restrictions in some cases.  Lower it recursively.
             */
            lower_instruction(v, block, jnst);
         }
      }

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      brw_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction.  Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
             brw_type_is_float(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              brw_type_size_bytes(inst->dst.type);
      assert(stride > 0);
      brw_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      if (!inst->dst.is_null()) {
         /* Emit a series of 32-bit integer copies from the temporary into the
          * original destination.
          */
         const brw_reg_type raw_type =
            brw_int_type(MIN2(brw_type_size_bytes(tmp.type), 4), false);

         const unsigned n =
            brw_type_size_bytes(tmp.type) / brw_type_size_bytes(raw_type);

         if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
            /* Note that in general we cannot simply predicate the copies on
             * the same flag register as the original instruction, since it
             * may have been overwritten by the instruction itself.  Instead
             * initialize the temporary with the previous contents of the
             * destination register.
             */
            for (unsigned j = 0; j < n; j++)
               ibld.MOV(subscript(tmp, raw_type, j),
                        subscript(inst->dst, raw_type, j));
         }

         for (unsigned j = 0; j < n; j++) {
            fs_inst *jnst = ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                                           subscript(tmp, raw_type, j));
            if (has_subdword_integer_region_restriction(v->devinfo, jnst)) {
               /* The copy isn't guaranteed to comply with all subdword integer
                * regioning restrictions in some cases.  Lower it recursively.
                */
               lower_instruction(v, block, jnst);
            }
         }

         /* If the destination was an accumulator, after lowering it will be a
          * GRF.  Clear writes_accumulator for the instruction.
          */
         if (inst->dst.is_accumulator())
            inst->writes_accumulator = false;
      }

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

   /**
    * Change sources and destination of the instruction to an
    * appropriate legal type, splitting the instruction into multiple
    * ones of smaller execution type if necessary, to be used in cases
    * where the execution type of an instruction is unsupported.
    */
   bool
   lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      assert(inst->dst.type == get_exec_type(inst));
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
      const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
      const unsigned n = get_exec_type_size(inst) / brw_type_size_bytes(raw_type);
      const fs_builder ibld(v, block, inst);

      brw_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, inst->dst.stride);

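      /* Emit one copy of the instruction per raw-type slice of the execution
       * type, bit-casting the operands selected by the mask, then copy each
       * slice of the temporary back into the original destination.
       */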
      for (unsigned j = 0; j < n; j++) {
         fs_inst sub_inst = *inst;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (mask & (1u << i)) {
               assert(inst->src[i].type == inst->dst.type);
               sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
            }
         }

         sub_inst.dst = subscript(tmp, raw_type, j);

         assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
         assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
         ibld.emit(sub_inst);

         fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
                                 subscript(tmp, raw_type, j));
         if (inst->opcode != BRW_OPCODE_SEL) {
            mov->predicate = inst->predicate;
            mov->predicate_inverse = inst->predicate_inverse;
         }
         lower_instruction(v, block, mov);
      }

      inst->remove(block);

      return true;
   }

   /**
    * Fast-path for very specific kinds of invalid regions.
    *
    * Gfx12.5+ does not allow MOVs of scalar B or UB sources to
    * floating-point destinations.  This restriction can be resolved more
    * efficiently than by the general lowering in lower_src_modifiers or
    * lower_src_region.
    */
   void
   lower_src_conversion(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      const fs_builder ibld = fs_builder(v, block, inst).scalar_group();

      /* We only handle scalar conversions from small types for now. */
      assert(is_uniform(inst->src[0]));

      brw_reg tmp = ibld.vgrf(brw_type_with_size(inst->src[0].type, 32));
      fs_inst *mov = ibld.MOV(tmp, inst->src[0]);

      inst->src[0] = component(tmp, 0);

      /* Assert that neither the added MOV nor the original instruction will
       * need any additional lowering.
       */
      assert(!has_invalid_src_region(devinfo, mov, 0));
      assert(!has_invalid_src_modifiers(devinfo, mov, 0));
      assert(!has_invalid_dst_region(devinfo, mov));

      assert(!has_invalid_src_region(devinfo, inst, 0));
      assert(!has_invalid_src_modifiers(devinfo, inst, 0));
   }

   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      bool progress = false;

      /* BROADCAST is special.  Its destination region is a bit of a lie, and
       * it gets lowered in brw_eu_emit.  For the purposes of region
       * restrictions, let's assume that the final code emission will do the
       * right thing.  Doing a bunch of shuffling here is only going to make
       * a mess of things.
       */
      if (inst->opcode == SHADER_OPCODE_BROADCAST)
         return false;

      if (has_invalid_dst_modifiers(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      if (has_invalid_src_conversion(devinfo, inst)) {
         lower_src_conversion(v, block, inst);
         progress = true;
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= lower_src_modifiers(v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      if (has_invalid_exec_type(devinfo, inst))
         progress |= lower_exec_type(v, block, inst);

      return progress;
   }
}

bool
brw_lower_regioning(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg)
      progress |= lower_instruction(&s, block, inst);

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}