| /* |
| * Copyright © 2023 Advanced Micro Devices, Inc. |
| * |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| /* Introduction |
| * ============ |
| * |
| * This pass optimizes varyings between 2 shaders, which means dead input/ |
| * output removal, constant and uniform load propagation, deduplication, |
| * compaction, and inter-shader code motion. This is used during the shader |
| * linking process. |
| * |
| * |
| * Notes on behavior |
| * ================= |
| * |
| * The pass operates on scalar varyings using 32-bit and 16-bit types. Vector |
| * varyings are not allowed. |
| * |
| * Indirectly-indexed varying slots (not vertices) are not optimized or |
| * compacted, but unused slots of indirectly-indexed varyings are still filled |
| * with directly-indexed varyings during compaction. Indirectly-indexed |
| * varyings are still removed if they are unused by the other shader. |
| * |
| * Indirectly-indexed vertices don't disallow optimizations, but compromises |
| * are made depending on how they are accessed. They are common in TCS, TES, |
| * and GS, so there is a desire to optimize them as much as possible. More on |
| * that in various sections below. |
| * |
| * Transform feedback doesn't prevent most optimizations such as constant |
| * propagation and compaction. Shaders can be left with output stores that set |
| * the no_varying flag, meaning the output is not consumed by the next shader, |
| * which means that optimizations did their job and now the output is only |
| * consumed by transform feedback. |
| * |
| * All legacy varying slots are optimized when it's allowed. |
| * |
| * |
| * Convergence property of shader outputs |
| * ====================================== |
| * |
 * If an output stores an SSA value that is convergent, all stores of that
 * output appear in unconditional blocks or in conditional blocks with
 * a convergent entry condition, and the shader is not a GS, then all
 * vertices of that output have the same value, so the output can be
 * promoted to flat because all interpolation modes lead to the same result
 * as flat. Such outputs are opportunistically compacted with both flat and
| * non-flat varyings based on whichever has unused slots in their vec4s. This |
| * pass refers to such inputs, outputs, and varyings as "convergent" (meaning |
| * all vertices are always equal). |
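 *
 * As a purely illustrative example (not taken from a real shader), a vertex
 * shader output like this is convergent because the stored value only
 * depends on uniforms, so all vertices of a primitive store the same value:
 * ```
 *    out_scale = uniform_a * uniform_b + 1.0;
 * ```
 * Such an output can be promoted to flat, and it can be packed into either
 * a flat or an interpolated vec4 during compaction.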
| * |
 * By default, flat varyings are never considered convergent
| * because we want the flexibility to pack convergent varyings with both flat |
| * and non-flat varyings, and since flat varyings can contain integers and |
| * doubles, we can never interpolate them as FP32 or FP16. Optimizations start |
| * with separate interpolated, flat, and convergent groups of varyings, and |
| * they choose whether they want to promote convergent to interpolated or |
| * flat, or whether to leave that decision to the end when the compaction |
| * happens. |
| * |
| * The above default behavior doesn't apply when the hw supports convergent |
 * flat loads with interpolated vec4 slots (there is a NIR option for this).
| * |
| * TES patch inputs are always convergent because they are uniform within |
| * a primitive. |
| * |
| * |
| * Optimization steps |
| * ================== |
| * |
| * 1. Determine which varying slots can be optimized and how. |
| * |
| * * When a varying is said to be "optimized" in the following text, it |
| * means all optimizations are performed, such as removal, constant |
| * propagation, and deduplication. |
| * * All VARn, PATCHn, and FOGC varyings are always optimized and |
| * compacted. |
| * * PRIMITIVE_ID is treated as VARn in (GS, FS). |
| * * TEXn are removed if they are dead (except TEXn inputs, which can't be |
 *      removed due to being affected by the coord replace state). TEXn also
 *      can't be optimized or compacted because they are affected by the coord
 *      replace state. TEXn not consumed by FS are treated as VARn.
| * * COLn and BFCn only propagate constants if they are between 0 and 1 |
| * because of the clamp vertex color state, and they are only |
| * deduplicated and compacted among themselves because they are affected |
| * by the flat shade, provoking vertex, two-side color selection, and |
| * clamp vertex color states. COLn and BFCn not consumed by FS are |
| * treated as VARn. |
 *    * All system value outputs like POS, PSIZ, CLIP_DISTn, etc. can't be
| * removed, but they are demoted to sysval-only outputs by setting |
| * the "no_varying" flag (i.e. they can be removed as varyings), so |
| * drivers should look at the "no_varying" flag. If an output is not |
| * a sysval output in a specific stage, it's treated as VARn. (such as |
| * POS in TCS) |
 *    * TESS_LEVEL_* inputs in TES can't be touched if TCS is missing.
| * |
| * 2. Remove unused inputs and outputs |
| * |
| * * Outputs not used in the next shader are removed. |
| * * Inputs not initialized by the previous shader are replaced with undef |
| * except: |
| * * LAYER and VIEWPORT are replaced with 0 in FS. |
| * * TEXn.xy is untouched because the coord replace state can set it, and |
| * TEXn.zw is replaced by (0, 1), which is equal to the coord replace |
| * value. |
| * * Output loads that have no output stores anywhere in the shader are |
| * replaced with undef. (for TCS, though it works with any shader) |
| * * Output stores with transform feedback are preserved, but get |
 *      the "no_varying" flag, meaning they are not consumed by the next
| * shader stage. Later, transform-feedback-only varyings are compacted |
| * (relocated) such that they are always last. |
| * * TCS outputs that are read by TCS, but not used by TES get |
| * the "no_varying" flag to indicate that they are only read by TCS and |
| * not consumed by TES. Later, such TCS outputs are compacted (relocated) |
| * such that they are always last to keep all outputs consumed by TES |
| * consecutive without holes. |
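 *
 *    As a small illustrative sketch of the rules above (all names are made
 *    up), for a (VS, FS) pair:
 *    ```
 *    VS:  out_a = value1;   // not read by FS, no xfb -> store removed
 *         out_b = value2;   // not read by FS, has xfb -> kept, but the
 *                           // store gets the "no_varying" flag
 *    FS:  x = in_c;         // never written by VS -> replaced with undef
 *    ```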
| * |
| * 3. Constant, uniform, UBO load, and uniform expression propagation |
| * |
 *    * Define "uniform expressions" as ALU expressions only sourcing
| * constants, uniforms, and UBO loads. |
| * * Constants, uniforms, UBO loads, and uniform expressions stored |
| * in outputs are moved into the next shader, and the outputs are removed. |
| * * The same propagation is done from output stores to output loads. |
| * (for TCS, though it works with any shader) |
| * * If there are multiple stores to the same output, all such stores |
| * should store the same constant, uniform, UBO load, or uniform |
| * expression for the expression to be propagated. If an output has |
| * multiple vertices, all vertices should store the same expression. |
| * * nir->options has callbacks that are used to estimate the cost of |
| * uniform expressions that drivers can set to control the complexity of |
| * uniform expressions that are propagated. This is to ensure that |
| * we don't increase the GPU overhead measurably by moving code across |
| * pipeline stages that amplify GPU work. |
| * * Special cases: |
| * * Constant COLn and BFCn are propagated only if the constants are |
| * in the [0, 1] range because of the clamp vertex color state. |
| * If both COLn and BFCn are written, they must write the same |
| * constant. If BFCn is written but not COLn, the constant is |
| * propagated from BFCn to COLn. |
| * * TEX.xy is untouched because of the coord replace state. |
| * If TEX.zw is (0, 1), only those constants are propagated because |
| * they match the coord replace values. |
 *      * CLIP_DISTn, LAYER, and VIEWPORT are always propagated.
 *    * Eliminated output stores get the "no_varying" flag if they are also
 *      xfb stores or write sysval outputs.
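 *
 *    For example (an illustrative sketch, all names are made up), if the
 *    producer only ever stores a uniform expression into an output:
 *    ```
 *    VS:  out_fog = u_fog_scale * u_fog_density + 1.0;
 *    FS:  color *= in_fog;
 *    ```
 *    the expression is rebuilt in the FS (subject to the cost callbacks in
 *    nir->options), the FS input load is replaced with it, and the VS
 *    output store is removed.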
| * |
| * 4. Remove duplicated output components |
| * |
| * * By comparing SSA defs. |
| * * If there are multiple stores to the same output, all such stores |
| * should store the same SSA as all stores of another output for |
| * the output to be considered duplicated. If an output has multiple |
| * vertices, all vertices should store the same SSA. |
| * * Deduplication can only be done between outputs of the same category. |
| * Those are: interpolated, patch, flat, interpolated color, flat color, |
 *      and conditionally interpolated color based on the flat shade state.
| * * Everything is deduplicated except TEXn due to the coord replace state. |
| * * Eliminated output stores get the "no_varying" flag if they are also |
| * xfb stores or write sysval outputs. |
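 *
 *    Illustrative sketch (made-up names): if two outputs in the same
 *    category always store the same SSA value:
 *    ```
 *    VS:  out_a = value;
 *         out_b = value;
 *    FS:  ... = in_a + in_b;
 *    ```
 *    out_b is removed and the FS loads of in_b are rewritten to load in_a
 *    instead.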
| * |
| * 5. Backward inter-shader code motion |
| * |
| * "Backward" refers to moving code in the opposite direction that shaders |
| * are executed, i.e. moving code from the consumer to the producer. |
| * |
| * Fragment shader example: |
| * ``` |
| * result = input0 * uniform + input1 * constant + UBO.variable; |
| * ``` |
| * |
| * The computation of "result" in the above example can be moved into |
| * the previous shader and both inputs can be replaced with a new input |
| * holding the value of "result", thus making the shader smaller and |
| * possibly reducing the number of inputs, uniforms, and UBOs by 1. |
| * |
| * Such code motion can be performed for any expression sourcing only |
| * inputs, constants, and uniforms except for fragment shaders, which can |
| * also do it but with the following limitations: |
 *    * Only these transformations can be performed with interpolated inputs
| * and any composition of these transformations (such as lerp), which can |
| * all be proven mathematically: |
| * * interp(x, i, j) + interp(y, i, j) = interp(x + y, i, j) |
| * * interp(x, i, j) + convergent_expr = interp(x + convergent_expr, i, j) |
| * * interp(x, i, j) * convergent_expr = interp(x * convergent_expr, i, j) |
| * * all of these transformations are considered "inexact" in NIR |
| * * interp interpolates an input according to the barycentric |
| * coordinates (i, j), which are different for perspective, |
| * noperspective, center, centroid, sample, at_offset, and at_sample |
| * modes. |
| * * convergent_expr is any expression sourcing only constants, |
| * uniforms, and convergent inputs. The only requirement on |
| * convergent_expr is that it doesn't vary between vertices of |
| * the same primitive, but it can vary between primitives. |
| * * If inputs are flat or convergent, there are no limitations on |
| * expressions that can be moved. |
| * * Interpolated and flat inputs can't mix in the same expression, but |
| * convergent inputs can mix with both. |
| * * The interpolation qualifier of the new input is inherited from |
| * the removed non-convergent inputs that should all have the same (i, j). |
| * If there are no non-convergent inputs, then the new input is declared |
| * as flat (for simplicity; we can't choose the barycentric coordinates |
| * at random because AMD doesn't like when there are multiple sets of |
| * barycentric coordinates in the same shader unnecessarily). |
| * * Inf values break code motion across interpolation. See the section |
| * discussing how we handle it near the end. |
| * |
| * The above rules also apply to open-coded TES input interpolation, which |
| * is handled the same as FS input interpolation. The only differences are: |
| * * Open-coded TES input interpolation must match one of the allowed |
| * equations. Different interpolation equations are treated the same as |
| * different interpolation qualifiers in FS. |
| * * Patch varyings are always treated as convergent. |
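 *
 *   For reference, this is the kind of open-coded TES interpolation the
 *   pass is meant to recognize (an illustrative sketch of barycentric
 *   interpolation of one per-vertex input slot using gl_TessCoord):
 *   ```
 *   x = in[0] * gl_TessCoord.x + in[1] * gl_TessCoord.y +
 *       in[2] * gl_TessCoord.z;
 *   ```
 *   The whole expression is treated like a single interpolated load, and
 *   the interpolation equation plays the role of the interpolation
 *   qualifier in FS.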
| * |
| * Prerequisites: |
| * * We need a post-dominator tree that is constructed from a graph where |
| * vertices are instructions and directed edges going into them are |
| * the values of their source operands. This is different from how NIR |
| * dominance works, which represents all instructions within a basic |
| * block as a linear chain of vertices in the graph. |
| * In our graph, all loads without source operands and all constants are |
| * entry nodes in the graph, and all stores and discards are exit nodes |
| * in the graph. Each shader can have multiple disjoint graphs where |
| * the Lowest Common Ancestor of 2 instructions doesn't exist. |
| * * Given the above definition, the instruction whose result is the best |
| * candidate for a new input is the farthest instruction that |
 *      post-dominates one or more inputs and is movable between shaders.
| * |
| * Algorithm Idea Part 1: Search |
| * * Pick any input load that is hypothetically movable and call it |
| * the iterator. |
| * * Get the immediate post-dominator of the iterator, and if it's movable, |
| * replace the iterator with it. |
| * * Repeat the previous step until the obtained immediate post-dominator |
| * is not movable. |
| * * The iterator now contains the farthest post-dominator that is movable. |
| * * Gather all input loads that the post-dominator consumes. |
| * * For each of those input loads, all matching output stores must be |
| * in the same block (because they will be replaced by a single store). |
| * |
| * Algorithm Idea Part 2: Code Motion |
| * * Clone the post-dominator in the producer except input loads, which |
| * should be replaced by stored output values. Uniform and UBO loads, |
| * if any, should be cloned too. |
| * * Remove the original output stores. |
| * * Replace the post-dominator from the consumer with a new input load. |
| * * The step above makes the post-dominated input load that we picked |
| * at the beginning dead, but other input loads used by the post- |
| * dominator might still have other uses (shown in the example below). |
| * |
| * Example SSA-use graph - initial shader and the result: |
| * ``` |
| * input0 input1 input0 input1 |
| * \ / \ | \ |
| * constant alu ... ======> | ... |
| * \ / |
| * alu |
| * (post-dominator) |
| * ``` |
| * |
| * Description: |
| * On the right, the algorithm moved the constant and both ALU opcodes |
| * into the previous shader and input0 now contains the value of |
| * the post-dominator. input1 stays the same because it still has one |
| * use left. If input1 hadn't had the other use, it would have been |
| * removed. |
| * |
| * If the algorithm moves any code, the algorithm is repeated until there |
| * is no code that it can move. |
| * |
| * Which shader pairs are supported: |
| * * (VS, FS), (TES, FS): yes, fully |
| * * Limitation: If Infs must be preserved, no code is moved across |
| * interpolation, so only flat varyings are optimized. |
| * * (VS, TCS), (VS, GS), (TES, GS): no, but possible -- TODO |
| * * Current behavior: |
| * * Per-vertex inputs are rejected. |
| * * Possible solution: |
| * * All input loads used by an accepted post-dominator must use |
| * the same vertex index. The post-dominator must use all loads with |
| * that vertex index. |
| * * If a post-dominator is found for an input load from a specific |
| * slot, all other input loads from that slot must also have |
| * an accepted post-dominator, and all such post-dominators should |
| * be identical expressions. |
| * * (TCS, TES), (VS, TES): yes, with limitations |
| * * Limitations: |
| * * Only 1 store and 1 load per slot allowed. |
| * * No output loads allowed. |
| * * All stores used by an accepted post-dominator must be in |
| * the same block. |
| * * TCS barriers don't matter because there are no output loads. |
| * * Patch varyings are handled trivially with the above constraints. |
| * * Per-vertex outputs should only be indexed by gl_InvocationID. |
| * * An interpolated TES load is any ALU instruction that computes |
| * the result of linear interpolation of per-vertex inputs from |
| * the same slot using gl_TessCoord. If such an ALU instruction is |
| * found, it must be the only one, and all per-vertex input loads |
| * from that slot must feed into it. The interpolation equation must |
| * be equal to one of the allowed equations. Then the same rules as |
| * for interpolated FS inputs are used, treating different |
| * interpolation equations just like different interpolation |
| * qualifiers. |
| * * Patch inputs are treated as convergent, which means they are |
| * allowed to be in the same movable expression as interpolated TES |
| * inputs, and the same rules as for convergent FS inputs apply. |
| * * (GS, FS), (MS, FS): no |
| * * Workaround: Add a passthrough VS between GS/MS and FS, run |
| * the pass on the (VS, FS) pair to move code out of FS, |
| * and inline that VS at the end of your hw-specific |
| * GS/MS if it's possible. |
| * * (TS, MS): no |
| * |
| * The disadvantage of using the post-dominator tree is that it's a tree, |
| * which means there is only 1 post-dominator of each input. This example |
| * shows a case that could be optimized by replacing 3 inputs with 2 inputs, |
| * reducing the number of inputs by 1, but the immediate post-dominator of |
| * all input loads is NULL: |
| * ``` |
| * temp0 = input0 + input1 + input2; |
| * temp1 = input0 + input1 * const1 + input2 * const2; |
| * ``` |
| * |
| * If there is a graph algorithm that returns the best solution to |
| * the above case (which is temp0 and temp1 to replace all 3 inputs), let |
| * us know. |
| * |
| * 6. Forward inter-shader code motion |
| * |
| * TODO: Not implemented. The text below is a draft of the description. |
| * |
| * "Forward" refers to moving code in the direction that shaders are |
| * executed, i.e. moving code from the producer to the consumer. |
| * |
| * Vertex shader example: |
| * ``` |
| * output0 = value + 1; |
| * output1 = value * 2; |
| * ``` |
| * |
| * Both outputs can be replaced by 1 output storing "value", and both ALU |
| * operations can be moved into the next shader. |
| * |
| * The same dominance algorithm as in the previous optimization is used, |
| * except that: |
| * * Instead of inputs, we use outputs. |
| * * Instead of a post-dominator tree, we use a dominator tree of the exact |
| * same graph. |
| * |
| * The algorithm idea is: For each pair of 2 output stores, find their |
| * Lowest Common Ancestor in the dominator tree, and that's a candidate |
| * for a new output. All movable loads like load_const should be removed |
| * from the graph, otherwise the LCA wouldn't exist. |
| * |
| * The limitations on instructions that can be moved between shaders across |
| * interpolated loads are exactly the same as the previous optimization. |
| * |
| * nir->options has callbacks that are used to estimate the cost of |
| * expressions that drivers can set to control the complexity of |
| * expressions that can be moved to later shaders. This is to ensure that |
| * we don't increase the GPU overhead measurably by moving code across |
| * pipeline stages that amplify GPU work. |
| * |
| * 7. Compaction to vec4 slots (AKA packing) |
| * |
| * First, varyings are divided into these groups, and components from each |
| * group are assigned locations in this order (effectively forcing |
| * components from the same group to be in the same vec4 slot or adjacent |
| * vec4 slots) with some exceptions listed below: |
| * |
| * Non-FS groups (patch and non-patch are packed separately): |
| * * 32-bit cross-invocation (TCS inputs using cross-invocation access) |
| * * 16-bit cross-invocation (TCS inputs using cross-invocation access) |
| * * 32-bit flat |
| * * 16-bit flat |
| * * 32-bit no-varying (TCS outputs read by TCS but not TES) |
| * * 16-bit no-varying (TCS outputs read by TCS but not TES) |
| * |
| * FS groups: |
| * * 32-bit interpolated (always FP32) |
| * * 32-bit flat |
| * * 32-bit convergent (always FP32) |
| * * 16-bit interpolated (always FP16) |
| * * 16-bit flat |
| * * 16-bit convergent (always FP16) |
| * * 32-bit transform feedback only |
| * * 16-bit transform feedback only |
| * |
| * When the driver/hw can't mix different interpolation qualifiers |
| * in the same vec4, the interpolated groups are further split into 6 |
| * groups, one for each qualifier. |
| * |
| * Then, all scalar varyings are relocated into new slots, starting from |
| * VAR0.x and increasing the scalar slot offset in 32-bit or 16-bit |
| * increments. Rules: |
| * * Both 32-bit and 16-bit flat varyings are packed in the same vec4. |
| * * Convergent varyings can be packed with interpolated varyings of |
| * the same type or flat. The group to pack with is chosen based on |
| * whichever has unused scalar slots because we want to reduce the total |
| * number of vec4s. After filling all unused scalar slots, the remaining |
| * convergent varyings are packed as flat. |
| * * Transform-feedback-only slots and no-varying slots are packed last, |
| * so that they are consecutive and not intermixed with varyings consumed |
| * by the next shader stage, and 32-bit and 16-bit slots are packed in |
| * the same vec4. This allows reducing memory for outputs by ignoring |
| * the trailing outputs that the next shader stage doesn't read. |
| * |
| * In the end, we should end up with these groups for FS: |
| * * 32-bit interpolated (always FP32) on separate vec4s |
| * * 16-bit interpolated (always FP16) on separate vec4s |
| * * 32-bit flat and 16-bit flat, mixed in the same vec4 |
| * * 32-bit and 16-bit transform feedback only, sharing vec4s with flat |
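 *
 *    A purely illustrative packing sketch (the exact slot assignment is up
 *    to the pass): given FS inputs A and B (interpolated FP32), C
 *    (convergent FP32), D (32-bit flat), and E (32-bit transform feedback
 *    only), the result could be:
 *    ```
 *    VAR0.x = A (interpolated)
 *    VAR0.y = B (interpolated)
 *    VAR0.z = C (convergent, packed as interpolated to fill the vec4)
 *    VAR1.x = D (flat)
 *    VAR1.y = E (xfb-only, packed last, sharing the flat vec4)
 *    ```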
| * |
 * Colors are compacted the same way but separately because they can't be mixed
| * with VARn. Colors are divided into 3 FS groups. They are: |
| * * 32-bit maybe-interpolated (affected by the flat-shade state) |
| * * 32-bit interpolated (not affected by the flat-shade state) |
| * * 32-bit flat (not affected by the flat-shade state) |
| * |
| * To facilitate driver-specific output merging, color channels are |
 * assigned in a rotated order depending on which VARn channel is the first
 * unused one. For example, if the first unused VARn channel is VAR0.z,
| * color channels are allocated in this order: |
| * COL0.z, COL0.w, COL0.x, COL0.y, COL1.z, COL1.w, COL1.x, COL1.y |
| * The reason is that some drivers merge outputs if each output sets |
| * different components, for example 2 outputs defining VAR0.xy and COL0.z. |
| * If drivers do interpolation in the fragment shader and color |
 * interpolation can differ for each component, VAR0.xy and COL0.z can be
| * stored in the same output storage slot, and the consumer can load VAR0 |
| * and COL0 from the same slot. |
| * |
| * If COLn, BFCn, and TEXn are transform-feedback-only, they are moved to |
| * VARn. PRIMITIVE_ID in (GS, FS) and FOGC in (xx, FS) are always moved to |
| * VARn for better packing. |
| * |
| * |
| * Issue: Interpolation converts Infs to NaNs |
| * ========================================== |
| * |
| * Interpolation converts Infs to NaNs, i.e. interp(Inf, i, j) = NaN, which |
| * impacts and limits backward inter-shader code motion, uniform expression |
| * propagation, and compaction. |
| * |
| * When we decide not to interpolate a varying, we need to convert Infs to |
| * NaNs manually. Infs can be converted to NaNs like this: x*0 + x |
| * (suggested by Ian Romanick, the multiplication must be "exact") |
| * |
| * Changes to optimizations: |
| * - When we propagate a uniform expression and NaNs must be preserved, |
| * convert Infs in the result to NaNs using "x*0 + x" in the consumer. |
| * - When we change interpolation to flat for convergent varyings and NaNs |
| * must be preserved, apply "x*0 + x" to the stored output value |
| * in the producer. |
| * - There is no solution for backward inter-shader code motion with |
| * interpolation if Infs must be preserved. As an alternative, we can allow |
| * code motion across interpolation only for specific shader hashes in |
| * can_move_alu_across_interp. We can use shader-db to automatically produce |
| * a list of shader hashes that benefit from this optimization. |
| * |
| * |
| * Usage |
| * ===== |
| * |
| * Requirements: |
| * - ALUs should be scalarized |
| * - Dot products and other vector opcodes should be lowered (recommended) |
| * - Input loads and output stores should be scalarized |
| * - 64-bit varyings should be lowered to 32 bits |
| * - nir_vertex_divergence_analysis must be called on the producer if |
 *   the consumer is a fragment shader
| * |
 * It's recommended to first run this on all shader pairs from the first shader
 * to the last shader (to propagate constants etc.). If the optimization
| * of (S1, S2) stages leads to changes in S1, remember the highest S1. Then |
| * re-run this for all shader pairs in the descending order from S1 to VS. |
| * |
| * NIR optimizations should be performed after every run that changes the IR. |
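 *
 * A rough sketch of that loop, assuming a hypothetical wrapper
 * run_opt_varyings(producer, consumer) around this pass that reports whether
 * the producer changed (the real entry point, its signature, and the array
 * names here are placeholders and may differ):
 * ```
 *    // Forward walk: pairs from the first stage to the last one.
 *    int highest_changed = -1;
 *    for (int i = 0; i < num_shaders - 1; i++) {
 *       if (run_opt_varyings(shaders[i], shaders[i + 1]))
 *          highest_changed = i;
 *       // Run NIR optimizations on whichever shaders changed here.
 *    }
 *    // Backward walk: pairs in descending order from the highest changed
 *    // producer down to the first stage.
 *    for (int i = highest_changed; i > 0; i--)
 *       run_opt_varyings(shaders[i - 1], shaders[i]);
 * ```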
| * |
| * |
| * Analyzing the optimization potential of linking separate shaders |
| * ================================================================ |
| * |
| * We can use this pass in an analysis pass that decides whether a separate |
| * shader has the potential to benefit from full draw-time linking. The way |
| * it would work is that we would create a passthrough shader adjacent to |
| * the separate shader, run this pass on both shaders, and check if the number |
| * of varyings decreased. This way we can decide to perform the draw-time |
| * linking only if we are confident that it would help performance. |
| * |
| * TODO: not implemented, mention the pass that implements it |
| */ |
| |
| #include "nir.h" |
| #include "nir_builder.h" |
| #include "util/hash_table.h" |
| #include "util/u_math.h" |
| #include "util/u_memory.h" |
| |
| /* nir_opt_varyings works at scalar 16-bit granularity across all varyings. |
| * |
| * Slots (i % 8 == 0,2,4,6) are 32-bit channels or low bits of 16-bit channels. |
| * Slots (i % 8 == 1,3,5,7) are high bits of 16-bit channels. 32-bit channels |
| * don't set these slots as used in bitmasks. |
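 *
 * For example, given the mapping in get_scalar_16bit_slot() below, a 32-bit
 * load of VAR0.z maps to scalar slot VARYING_SLOT_VAR0 * 8 + 2 * 2 + 0, and
 * the high 16-bit half of VAR0.y maps to VARYING_SLOT_VAR0 * 8 + 1 * 2 + 1.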
| */ |
| #define NUM_SCALAR_SLOTS (NUM_TOTAL_VARYING_SLOTS * 8) |
| |
| /* Fragment shader input slots can be packed with indirectly-indexed vec4 |
| * slots if there are unused components, but only if the vec4 slot has |
| * the same interpolation type. There are only 3 types: FLAT, FP32, FP16. |
| */ |
| enum fs_vec4_type { |
| FS_VEC4_TYPE_NONE = 0, |
| FS_VEC4_TYPE_FLAT, |
| FS_VEC4_TYPE_INTERP_EXPLICIT, |
| FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT, |
| FS_VEC4_TYPE_PER_PRIMITIVE, |
| /* When nir_io_has_flexible_input_interpolation_except_flat is set: */ |
| FS_VEC4_TYPE_INTERP_FP32, |
| FS_VEC4_TYPE_INTERP_FP16, |
| FS_VEC4_TYPE_INTERP_COLOR, /* only for glShadeModel, i.e. INTERP_MODE_NONE */ |
| /* When nir_io_has_flexible_input_interpolation_except_flat is not set: */ |
| FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL, |
| FS_VEC4_TYPE_INTERP_FP32_PERSP_CENTROID, |
| FS_VEC4_TYPE_INTERP_FP32_PERSP_SAMPLE, |
| FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL, |
| FS_VEC4_TYPE_INTERP_FP32_LINEAR_CENTROID, |
| FS_VEC4_TYPE_INTERP_FP32_LINEAR_SAMPLE, |
| FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL, |
| FS_VEC4_TYPE_INTERP_FP16_PERSP_CENTROID, |
| FS_VEC4_TYPE_INTERP_FP16_PERSP_SAMPLE, |
| FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL, |
| FS_VEC4_TYPE_INTERP_FP16_LINEAR_CENTROID, |
| FS_VEC4_TYPE_INTERP_FP16_LINEAR_SAMPLE, |
| FS_VEC4_TYPE_INTERP_COLOR_PIXEL, /* only for glShadeModel, i.e. INTERP_MODE_NONE */ |
| FS_VEC4_TYPE_INTERP_COLOR_CENTROID, /* same */ |
| FS_VEC4_TYPE_INTERP_COLOR_SAMPLE, /* same */ |
| }; |
| |
| enum { |
| PERSP_PIXEL, |
| PERSP_CENTROID, |
| PERSP_SAMPLE, |
| LINEAR_PIXEL, |
| LINEAR_CENTROID, |
| LINEAR_SAMPLE, |
| NUM_INTERP_QUALIFIERS, |
| }; |
| |
| enum { |
| COLOR_PIXEL, |
| COLOR_CENTROID, |
| COLOR_SAMPLE, |
| NUM_COLOR_QUALIFIERS, |
| }; |
| |
| #if PRINT_RELOCATE_SLOT |
| static const char *fs_vec4_type_strings[] = { |
| "NONE", |
| "FLAT", |
| "INTERP_EXPLICIT", |
| "INTERP_EXPLICIT_STRICT", |
| "PER_PRIMITIVE", |
| "INTERP_FP32", |
| "INTERP_FP16", |
| "INTERP_COLOR", |
| "INTERP_FP32_PERSP_PIXEL", |
| "INTERP_FP32_PERSP_CENTROID", |
| "INTERP_FP32_PERSP_SAMPLE", |
| "INTERP_FP32_LINEAR_PIXEL", |
| "INTERP_FP32_LINEAR_CENTROID", |
| "INTERP_FP32_LINEAR_SAMPLE", |
| "INTERP_FP16_PERSP_PIXEL", |
| "INTERP_FP16_PERSP_CENTROID", |
| "INTERP_FP16_PERSP_SAMPLE", |
| "INTERP_FP16_LINEAR_PIXEL", |
| "INTERP_FP16_LINEAR_CENTROID", |
| "INTERP_FP16_LINEAR_SAMPLE", |
| "INTERP_COLOR_PIXEL", |
| "INTERP_COLOR_CENTROID", |
| "INTERP_COLOR_SAMPLE", |
| }; |
| #endif // PRINT_RELOCATE_SLOT |
| |
| typedef BITSET_WORD INTERP_QUAL_BITSET[NUM_INTERP_QUALIFIERS][BITSET_WORDS(NUM_SCALAR_SLOTS)]; |
| typedef BITSET_WORD COLOR_QUAL_BITSET[NUM_COLOR_QUALIFIERS][BITSET_WORDS(NUM_SCALAR_SLOTS)]; |
| |
| static unsigned |
| get_scalar_16bit_slot(nir_io_semantics sem, unsigned component) |
| { |
| return sem.location * 8 + component * 2 + sem.high_16bits; |
| } |
| |
| static unsigned |
| intr_get_scalar_16bit_slot(nir_intrinsic_instr *intr) |
| { |
| return get_scalar_16bit_slot(nir_intrinsic_io_semantics(intr), |
| nir_intrinsic_component(intr)); |
| } |
| |
| static unsigned |
| vec4_slot(unsigned scalar_slot) |
| { |
| return scalar_slot / 8; |
| } |
| |
| struct list_node { |
| struct list_head head; |
| nir_intrinsic_instr *instr; |
| }; |
| |
| /* Information about 1 scalar varying slot for both shader stages. */ |
| struct scalar_slot { |
| struct { |
| /* Linked list of all store instructions writing into the scalar slot |
| * in the producer. |
| */ |
| struct list_head stores; |
| |
      /* Only for TCS: Linked list of all load instructions reading the scalar
| * slot in the producer. |
| */ |
| struct list_head loads; |
| |
| /* If there is only one store instruction or if all store instructions |
| * store the same value in the producer, this is the instruction |
| * computing the stored value. Used by constant and uniform propagation |
| * to the next shader. |
| */ |
| nir_instr *value; |
| } producer; |
| |
| struct { |
| /* Linked list of all load instructions loading from the scalar slot |
| * in the consumer. |
| */ |
| struct list_head loads; |
| |
| /* The result of TES input interpolation. */ |
| nir_alu_instr *tes_interp_load; |
| unsigned tes_interp_mode; /* FLAG_INTERP_TES_* */ |
| nir_def *tes_load_tess_coord; |
| } consumer; |
| |
| /* The number of accessed slots if this slot has indirect indexing. */ |
| unsigned num_slots; |
| }; |
| |
| struct linkage_info { |
| struct scalar_slot slot[NUM_SCALAR_SLOTS]; |
| |
| bool spirv; |
| bool can_move_uniforms; |
| bool can_move_ubos; |
| bool can_mix_convergent_flat_with_interpolated; |
| bool has_flexible_interp; |
| bool always_interpolate_convergent_fs_inputs; |
| |
| gl_shader_stage producer_stage; |
| gl_shader_stage consumer_stage; |
| nir_builder producer_builder; |
| nir_builder consumer_builder; |
| unsigned max_varying_expression_cost; |
| unsigned (*varying_estimate_instr_cost)(struct nir_instr *instr); |
| |
| /* Memory context for linear_alloc_child (fast allocation). */ |
| void *linear_mem_ctx; |
| |
   /* Hash table for efficiently cloning instructions between shaders. */
| struct hash_table *clones_ht; |
| |
| /* If any component of a vec4 slot is accessed indirectly, this is its |
| * FS vec4 qualifier type, which is either FLAT, FP32, or FP16. |
| * Components with different qualifier types can't be compacted |
| * in the same vec4. |
| */ |
| uint8_t fs_vec4_type[NUM_TOTAL_VARYING_SLOTS]; |
| |
| /* Mask of all varyings that can be removed. Only a few non-VARn non-PATCHn |
| * varyings can't be removed. |
| */ |
| BITSET_DECLARE(removable_mask, NUM_SCALAR_SLOTS); |
| |
| /* Mask of all slots that have transform feedback info. */ |
| BITSET_DECLARE(xfb_mask, NUM_SCALAR_SLOTS); |
| |
| /* Mask of all slots that have transform feedback info, but are not used |
| * by the next shader. Separate masks for 32-bit and 16-bit outputs. |
| */ |
| BITSET_DECLARE(xfb32_only_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(xfb16_only_mask, NUM_SCALAR_SLOTS); |
| |
| /* Mask of all TCS inputs using cross-invocation access. */ |
| BITSET_DECLARE(tcs_cross_invoc32_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(tcs_cross_invoc16_mask, NUM_SCALAR_SLOTS); |
| |
| /* Mask of all TCS->TES slots that are read by TCS, but not TES. */ |
| BITSET_DECLARE(no_varying32_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(no_varying16_mask, NUM_SCALAR_SLOTS); |
| |
| /* Mask of all slots accessed with indirect indexing. */ |
| BITSET_DECLARE(indirect_mask, NUM_SCALAR_SLOTS); |
| |
| /* The following masks only contain slots that can be compacted and |
| * describe the groups in which they should be compacted. Non-fragment |
| * shaders only use the flat bitmasks. |
| * |
| * Some legacy varyings are excluded when they can't be compacted due to |
| * being affected by pipeline states (like coord replace). That only |
| * applies to xx->FS shader pairs. Other shader pairs get all legacy |
| * varyings compacted and relocated to VARn. |
| * |
| * Indirectly-indexed varyings are also excluded because they are not |
| * compacted. |
| */ |
| BITSET_DECLARE(interp_fp32_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(interp_fp16_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(flat32_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(flat16_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(interp_explicit32_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(interp_explicit16_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(interp_explicit_strict32_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(interp_explicit_strict16_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(per_primitive32_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(per_primitive16_mask, NUM_SCALAR_SLOTS); |
| |
| /* Color interpolation unqualified (follows the flat-shade state). */ |
| BITSET_DECLARE(color32_mask, NUM_SCALAR_SLOTS); |
| |
| /* A separate bitmask for each qualifier when |
| * nir_io_has_flexible_input_interpolation_except_flat is not set. |
| */ |
| INTERP_QUAL_BITSET interp_fp32_qual_masks; |
| INTERP_QUAL_BITSET interp_fp16_qual_masks; |
| COLOR_QUAL_BITSET color32_qual_masks; |
| |
| /* Mask of output components that have only one store instruction, or if |
| * they have multiple store instructions, all those instructions store |
| * the same value. If the output has multiple vertices, all vertices store |
| * the same value. This is a useful property for: |
| * - constant and uniform propagation to the next shader |
| * - deduplicating outputs |
| */ |
| BITSET_DECLARE(output_equal_mask, NUM_SCALAR_SLOTS); |
| |
| /* Mask of output components that store values that are convergent, |
| * i.e. all values stored into the outputs are equal within a primitive. |
| * |
| * This is different from output_equal_mask, which says that all stores |
| * to the same slot in the same thread are equal, while this says that |
| * each store to the same slot can be different, but it always stores |
| * a convergent value, which means the stored value is equal among all |
| * threads within a primitive. |
| * |
| * The advantage is that these varyings can always be promoted to flat |
| * regardless of the original interpolation mode, and they can always be |
| * compacted with both interpolated and flat varyings. |
| */ |
| BITSET_DECLARE(convergent32_mask, NUM_SCALAR_SLOTS); |
| BITSET_DECLARE(convergent16_mask, NUM_SCALAR_SLOTS); |
| }; |
| |
| /****************************************************************** |
| * HELPERS |
| ******************************************************************/ |
| |
/* Return whether the bit of the low or the high 16-bit half of a 32-bit slot is set. */
| #define BITSET_TEST32(m, b) \ |
| (BITSET_TEST(m, (b) & ~0x1) || BITSET_TEST(m, ((b) & ~0x1) + 1)) |
| |
| #define BITSET3_TEST_ANY(bitsets, b) (BITSET_TEST((bitsets)[0], (b)) || \ |
| BITSET_TEST((bitsets)[1], (b)) || \ |
| BITSET_TEST((bitsets)[2], (b))) |
| #define BITSET6_TEST_ANY(bitsets, b) (BITSET3_TEST_ANY((bitsets), (b)) || \ |
| BITSET3_TEST_ANY(&(bitsets)[3], (b))) |
| |
| static void |
| print_linkage(struct linkage_info *linkage) |
| { |
| printf("Linkage: %s -> %s\n", |
| _mesa_shader_stage_to_abbrev(linkage->producer_stage), |
| _mesa_shader_stage_to_abbrev(linkage->consumer_stage)); |
| |
| for (unsigned i = 0; i < NUM_SCALAR_SLOTS; i++) { |
| struct scalar_slot *slot = &linkage->slot[i]; |
| |
| if (!slot->num_slots && |
| list_is_empty(&slot->producer.stores) && |
| list_is_empty(&slot->producer.loads) && |
| list_is_empty(&slot->consumer.loads) && |
| !BITSET_TEST(linkage->removable_mask, i) && |
| !BITSET_TEST(linkage->indirect_mask, i) && |
| !BITSET_TEST(linkage->xfb32_only_mask, i) && |
| !BITSET_TEST(linkage->xfb16_only_mask, i) && |
| !BITSET_TEST(linkage->tcs_cross_invoc32_mask, i) && |
| !BITSET_TEST(linkage->tcs_cross_invoc16_mask, i) && |
| !BITSET_TEST(linkage->no_varying32_mask, i) && |
| !BITSET_TEST(linkage->no_varying16_mask, i) && |
| !BITSET_TEST(linkage->interp_fp32_mask, i) && |
| !BITSET_TEST(linkage->interp_fp16_mask, i) && |
| !BITSET6_TEST_ANY(linkage->interp_fp32_qual_masks, i) && |
| !BITSET6_TEST_ANY(linkage->interp_fp16_qual_masks, i) && |
| !BITSET_TEST(linkage->color32_mask, i) && |
| !BITSET3_TEST_ANY(linkage->color32_qual_masks, i) && |
| !BITSET_TEST(linkage->flat32_mask, i) && |
| !BITSET_TEST(linkage->flat16_mask, i) && |
| !BITSET_TEST(linkage->interp_explicit32_mask, i) && |
| !BITSET_TEST(linkage->interp_explicit16_mask, i) && |
| !BITSET_TEST(linkage->interp_explicit_strict32_mask, i) && |
| !BITSET_TEST(linkage->interp_explicit_strict16_mask, i) && |
| !BITSET_TEST(linkage->per_primitive32_mask, i) && |
| !BITSET_TEST(linkage->per_primitive16_mask, i) && |
| !BITSET_TEST(linkage->convergent32_mask, i) && |
| !BITSET_TEST(linkage->convergent16_mask, i) && |
| !BITSET_TEST(linkage->output_equal_mask, i)) |
| continue; |
| |
| printf(" %7s.%c.%s: num_slots=%2u%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", |
| gl_varying_slot_name_for_stage(vec4_slot(i), |
| linkage->producer_stage) + 13, |
| "xyzw"[(i / 2) % 4], |
| i % 2 ? "hi" : "lo", |
| slot->num_slots, |
| BITSET_TEST(linkage->removable_mask, i) ? " removable" : "", |
| BITSET_TEST(linkage->indirect_mask, i) ? " indirect" : "", |
| BITSET_TEST(linkage->xfb32_only_mask, i) ? " xfb32_only" : "", |
| BITSET_TEST(linkage->xfb16_only_mask, i) ? " xfb16_only" : "", |
| BITSET_TEST(linkage->tcs_cross_invoc32_mask, i) ? " tcs_cross_invoc32" : "", |
| BITSET_TEST(linkage->tcs_cross_invoc16_mask, i) ? " tcs_cross_invoc16" : "", |
| BITSET_TEST(linkage->no_varying32_mask, i) ? " no_varying32" : "", |
| BITSET_TEST(linkage->no_varying16_mask, i) ? " no_varying16" : "", |
| BITSET_TEST(linkage->interp_fp32_mask, i) ? " interp_fp32" : "", |
| BITSET_TEST(linkage->interp_fp32_qual_masks[0], i) ? " interp_fp32_persp_pixel" : "", |
| BITSET_TEST(linkage->interp_fp32_qual_masks[1], i) ? " interp_fp32_persp_centroid" : "", |
| BITSET_TEST(linkage->interp_fp32_qual_masks[2], i) ? " interp_fp32_persp_sample" : "", |
| BITSET_TEST(linkage->interp_fp32_qual_masks[3], i) ? " interp_fp32_linear_pixel" : "", |
| BITSET_TEST(linkage->interp_fp32_qual_masks[4], i) ? " interp_fp32_linear_centroid" : "", |
| BITSET_TEST(linkage->interp_fp32_qual_masks[5], i) ? " interp_fp32_linear_sample" : "", |
| BITSET_TEST(linkage->interp_fp16_mask, i) ? " interp_fp16" : "", |
| BITSET_TEST(linkage->interp_fp16_qual_masks[0], i) ? " interp_fp16_persp_pixel" : "", |
| BITSET_TEST(linkage->interp_fp16_qual_masks[1], i) ? " interp_fp16_persp_centroid" : "", |
| BITSET_TEST(linkage->interp_fp16_qual_masks[2], i) ? " interp_fp16_persp_sample" : "", |
| BITSET_TEST(linkage->interp_fp16_qual_masks[3], i) ? " interp_fp16_linear_pixel" : "", |
| BITSET_TEST(linkage->interp_fp16_qual_masks[4], i) ? " interp_fp16_linear_centroid" : "", |
| BITSET_TEST(linkage->interp_fp16_qual_masks[5], i) ? " interp_fp16_linear_sample" : "", |
| BITSET_TEST(linkage->color32_mask, i) ? " color32" : "", |
| BITSET_TEST(linkage->color32_qual_masks[0], i) ? " color32_pixel" : "", |
| BITSET_TEST(linkage->color32_qual_masks[1], i) ? " color32_centroid" : "", |
| BITSET_TEST(linkage->color32_qual_masks[2], i) ? " color32_sample" : "", |
| BITSET_TEST(linkage->flat32_mask, i) ? " flat32" : "", |
| BITSET_TEST(linkage->flat16_mask, i) ? " flat16" : "", |
| BITSET_TEST(linkage->interp_explicit32_mask, i) ? " interp_explicit32" : "", |
| BITSET_TEST(linkage->interp_explicit16_mask, i) ? " interp_explicit16" : "", |
| BITSET_TEST(linkage->interp_explicit_strict32_mask, i) ? " interp_explicit_strict32" : "", |
| BITSET_TEST(linkage->interp_explicit_strict16_mask, i) ? " interp_explicit_strict16" : "", |
| BITSET_TEST(linkage->per_primitive32_mask, i) ? " per_primitive32" : "", |
          BITSET_TEST(linkage->per_primitive16_mask, i) ? " per_primitive16" : "",
| BITSET_TEST(linkage->convergent32_mask, i) ? " convergent32" : "", |
| BITSET_TEST(linkage->convergent16_mask, i) ? " convergent16" : "", |
| BITSET_TEST(linkage->output_equal_mask, i) ? " output_equal" : "", |
| !list_is_empty(&slot->producer.stores) ? " producer_stores" : "", |
| !list_is_empty(&slot->producer.loads) ? " producer_loads" : "", |
| !list_is_empty(&slot->consumer.loads) ? " consumer_loads" : ""); |
| } |
| } |
| |
| static void |
| slot_disable_optimizations_and_compaction(struct linkage_info *linkage, |
| unsigned i) |
| { |
| BITSET_CLEAR(linkage->output_equal_mask, i); |
| BITSET_CLEAR(linkage->convergent32_mask, i); |
| BITSET_CLEAR(linkage->convergent16_mask, i); |
| BITSET_CLEAR(linkage->interp_fp32_mask, i); |
| BITSET_CLEAR(linkage->interp_fp16_mask, i); |
| for (unsigned b = 0; b < NUM_INTERP_QUALIFIERS; b++) { |
| BITSET_CLEAR(linkage->interp_fp32_qual_masks[b], i); |
| BITSET_CLEAR(linkage->interp_fp16_qual_masks[b], i); |
| } |
| BITSET_CLEAR(linkage->flat32_mask, i); |
| BITSET_CLEAR(linkage->flat16_mask, i); |
| BITSET_CLEAR(linkage->interp_explicit32_mask, i); |
| BITSET_CLEAR(linkage->interp_explicit16_mask, i); |
| BITSET_CLEAR(linkage->interp_explicit_strict32_mask, i); |
| BITSET_CLEAR(linkage->interp_explicit_strict16_mask, i); |
| BITSET_CLEAR(linkage->per_primitive32_mask, i); |
| BITSET_CLEAR(linkage->per_primitive16_mask, i); |
| BITSET_CLEAR(linkage->tcs_cross_invoc32_mask, i); |
| BITSET_CLEAR(linkage->tcs_cross_invoc16_mask, i); |
| BITSET_CLEAR(linkage->no_varying32_mask, i); |
| BITSET_CLEAR(linkage->no_varying16_mask, i); |
| BITSET_CLEAR(linkage->color32_mask, i); |
| for (unsigned b = 0; b < NUM_COLOR_QUALIFIERS; b++) |
| BITSET_CLEAR(linkage->color32_qual_masks[b], i); |
| } |
| |
| static void |
| clear_slot_info_after_removal(struct linkage_info *linkage, unsigned i, bool uses_xfb) |
| { |
| slot_disable_optimizations_and_compaction(linkage, i); |
| |
| if (uses_xfb) |
| return; |
| |
| linkage->slot[i].num_slots = 0; |
| |
| BITSET_CLEAR(linkage->indirect_mask, i); |
| BITSET_CLEAR(linkage->removable_mask, i); |
| |
| /* Transform feedback stores can't be removed. */ |
| assert(!BITSET_TEST(linkage->xfb32_only_mask, i)); |
| assert(!BITSET_TEST(linkage->xfb16_only_mask, i)); |
| } |
| |
| static bool |
| has_xfb(nir_intrinsic_instr *intr) |
| { |
   /* This only checks whether the intrinsic is ABLE to have xfb info. */
| if (!nir_intrinsic_has_io_xfb(intr)) |
| return false; |
| |
| unsigned comp = nir_intrinsic_component(intr); |
| |
| if (comp >= 2) |
| return nir_intrinsic_io_xfb2(intr).out[comp - 2].num_components > 0; |
| else |
| return nir_intrinsic_io_xfb(intr).out[comp].num_components > 0; |
| } |
| |
| static bool |
| is_interpolated_color(struct linkage_info *linkage, unsigned i) |
| { |
| if (linkage->consumer_stage != MESA_SHADER_FRAGMENT) |
| return false; |
| |
| /* BFCn stores are bunched in the COLn slots with COLn, so we should never |
| * get BFCn here. |
| */ |
| assert(vec4_slot(i) != VARYING_SLOT_BFC0 && |
| vec4_slot(i) != VARYING_SLOT_BFC1); |
| |
| return vec4_slot(i) == VARYING_SLOT_COL0 || |
| vec4_slot(i) == VARYING_SLOT_COL1; |
| } |
| |
| static bool |
| is_interpolated_texcoord(struct linkage_info *linkage, unsigned i) |
| { |
| if (linkage->consumer_stage != MESA_SHADER_FRAGMENT) |
| return false; |
| |
| return vec4_slot(i) >= VARYING_SLOT_TEX0 && |
| vec4_slot(i) <= VARYING_SLOT_TEX7; |
| } |
| |
| static bool |
| color_uses_shade_model(struct linkage_info *linkage, unsigned i) |
| { |
| if (!is_interpolated_color(linkage, i)) |
| return false; |
| |
| list_for_each_entry(struct list_node, iter, |
| &linkage->slot[i].consumer.loads, head) { |
| assert(iter->instr->intrinsic == nir_intrinsic_load_interpolated_input); |
| |
| nir_intrinsic_instr *baryc = |
| nir_instr_as_intrinsic(iter->instr->src[0].ssa->parent_instr); |
| if (nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static enum fs_vec4_type |
| get_interp_vec4_type(struct linkage_info *linkage, unsigned slot, |
| nir_intrinsic_instr *load) |
| { |
| assert(!linkage->has_flexible_interp); |
| assert(load->intrinsic == nir_intrinsic_load_interpolated_input); |
| |
| nir_intrinsic_instr *baryc = |
| nir_instr_as_intrinsic(load->src[0].ssa->parent_instr); |
| enum fs_vec4_type base; |
| |
| if (color_uses_shade_model(linkage, slot)) |
| base = FS_VEC4_TYPE_INTERP_COLOR_PIXEL; |
| else if (load->def.bit_size == 32) |
| base = FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL; |
| else if (load->def.bit_size == 16) |
| base = FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL; |
| else |
| unreachable("invalid load_interpolated_input type"); |
| |
| bool linear = nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NOPERSPECTIVE; |
| |
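   /* The FS_VEC4_TYPE_INTERP_FP* enums are laid out as PERSP_{PIXEL,
    * CENTROID,SAMPLE} followed by LINEAR_{PIXEL,CENTROID,SAMPLE}, so +3
    * below switches from perspective to linear, and +1/+2 select centroid
    * and sample.
    */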
| if (linear) |
| base += 3; |
| |
| switch (baryc->intrinsic) { |
| case nir_intrinsic_load_barycentric_pixel: |
| case nir_intrinsic_load_barycentric_at_offset: |
| case nir_intrinsic_load_barycentric_at_sample: |
| return base; |
| case nir_intrinsic_load_barycentric_centroid: |
| return base + 1; |
| case nir_intrinsic_load_barycentric_sample: |
| return base + 2; |
| default: |
| unreachable("unexpected barycentric intrinsic"); |
| } |
| } |
| |
| static bool |
| preserve_infs_nans(nir_shader *nir, unsigned bit_size) |
| { |
| unsigned mode = nir->info.float_controls_execution_mode; |
| |
| return nir_is_float_control_inf_preserve(mode, bit_size) || |
| nir_is_float_control_nan_preserve(mode, bit_size); |
| } |
| |
| static bool |
| preserve_nans(nir_shader *nir, unsigned bit_size) |
| { |
| unsigned mode = nir->info.float_controls_execution_mode; |
| |
| return nir_is_float_control_nan_preserve(mode, bit_size); |
| } |
| |
| static nir_def * |
| build_convert_inf_to_nan(nir_builder *b, nir_def *x) |
| { |
   /* Do x*0 + x. Marking it exact ensures the multiplication by 0 isn't optimized out. */
| nir_def *fma = nir_ffma_imm1(b, x, 0, x); |
| nir_instr_as_alu(fma->parent_instr)->exact = true; |
| return fma; |
| } |
| |
| static bool |
| is_sysval(nir_instr *instr, gl_system_value sysval) |
| { |
| if (instr->type == nir_instr_type_intrinsic) { |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| |
| if (intr->intrinsic == nir_intrinsic_from_system_value(sysval)) |
| return true; |
| |
| if (intr->intrinsic == nir_intrinsic_load_deref) { |
| nir_deref_instr *deref = |
| nir_instr_as_deref(intr->src[0].ssa->parent_instr); |
| |
| return nir_deref_mode_is_one_of(deref, nir_var_system_value) && |
| nir_deref_instr_get_variable(deref)->data.location == sysval; |
| } |
| } |
| |
| return false; |
| } |
| |
| /****************************************************************** |
| * GATHERING INPUTS & OUTPUTS |
| ******************************************************************/ |
| |
| static bool |
| is_active_sysval_output(struct linkage_info *linkage, unsigned slot, |
| nir_intrinsic_instr *intr) |
| { |
| return nir_slot_is_sysval_output(vec4_slot(slot), |
| linkage->consumer_stage) && |
| !nir_intrinsic_io_semantics(intr).no_sysval_output; |
| } |
| |
| /** |
| * This function acts like a filter. The pass won't touch varyings that |
| * return false here, and the return value is saved in the linkage bitmasks, |
| * so that all subpasses will *automatically* skip such varyings. |
| */ |
| static bool |
| can_remove_varying(struct linkage_info *linkage, gl_varying_slot location) |
| { |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) { |
| /* User-defined varyings and fog coordinates can always be removed. */ |
| if (location >= VARYING_SLOT_VAR0 || |
| location == VARYING_SLOT_FOGC) |
| return true; |
| |
| /* These can be removed as varyings, which means they will be demoted to |
| * sysval-only outputs keeping their culling/rasterization functions |
| * while not passing the values to FS. Drivers should handle |
| * the "no_varying" semantic to benefit from this. |
| * |
| * Note: When removing unset LAYER and VIEWPORT FS inputs, they will |
| * be replaced by 0 instead of undef. |
| */ |
| if (location == VARYING_SLOT_CLIP_DIST0 || |
| location == VARYING_SLOT_CLIP_DIST1 || |
| location == VARYING_SLOT_CULL_DIST0 || |
| location == VARYING_SLOT_CULL_DIST1 || |
| location == VARYING_SLOT_LAYER || |
| location == VARYING_SLOT_VIEWPORT) |
| return true; |
| |
| /* COLn inputs can be removed only if both COLn and BFCn are not |
| * written. Both COLn and BFCn outputs can be removed if COLn inputs |
| * aren't read. |
| * |
| * TEXn inputs can never be removed in FS because of the coord replace |
| * state, but TEXn outputs can be removed if they are not read by FS. |
| */ |
| if (location == VARYING_SLOT_COL0 || |
| location == VARYING_SLOT_COL1 || |
| location == VARYING_SLOT_BFC0 || |
| location == VARYING_SLOT_BFC1 || |
| (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7)) |
| return true; |
| |
      /* GS -> FS and MS -> FS can remove the primitive ID if it's not written or not read. */
| if ((linkage->producer_stage == MESA_SHADER_GEOMETRY || |
| linkage->producer_stage == MESA_SHADER_MESH) && |
| location == VARYING_SLOT_PRIMITIVE_ID) |
| return true; |
| |
| /* No other varyings can be removed. */ |
| return false; |
| } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) { |
| /* Only VS->TES shouldn't remove TESS_LEVEL_* inputs because the values |
| * come from glPatchParameterfv. |
| * |
| * For TCS->TES, TESS_LEVEL_* outputs can be removed as varyings, which |
| * means they will be demoted to sysval-only outputs, so that drivers |
| * know that TES doesn't read them. |
| */ |
| if (linkage->producer_stage == MESA_SHADER_VERTEX && |
| (location == VARYING_SLOT_TESS_LEVEL_INNER || |
| location == VARYING_SLOT_TESS_LEVEL_OUTER)) |
| return false; |
| |
| return true; |
| } |
| |
| /* All other varyings can be removed. */ |
| return true; |
| } |
| |
| struct opt_options { |
| bool propagate_uniform_expr:1; |
| bool deduplicate:1; |
| bool inter_shader_code_motion:1; |
| bool compact:1; |
| bool disable_all:1; |
| }; |
| |
| /** |
| * Return which optimizations are allowed. |
| */ |
| static struct opt_options |
| can_optimize_varying(struct linkage_info *linkage, gl_varying_slot location) |
| { |
| struct opt_options options_var = { |
| .propagate_uniform_expr = true, |
| .deduplicate = true, |
| .inter_shader_code_motion = true, |
| .compact = true, |
| }; |
| struct opt_options options_color = { |
| .propagate_uniform_expr = true, /* only constants in [0, 1] */ |
| .deduplicate = true, |
| .compact = true, |
| }; |
| struct opt_options options_tex = { |
| .propagate_uniform_expr = true, /* only TEX.zw if equal to (0, 1) */ |
| }; |
| struct opt_options options_sysval_output = { |
| .propagate_uniform_expr = true, |
| .deduplicate = true, |
| }; |
| struct opt_options options_tess_levels = { |
| .propagate_uniform_expr = true, |
| .deduplicate = true, |
| }; |
| struct opt_options options_disable_all = { |
| .disable_all = true, |
| }; |
| |
| assert(can_remove_varying(linkage, location)); |
| |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) { |
| /* xx -> FS */ |
| /* User-defined varyings and fog coordinates can always be optimized. */ |
| if (location >= VARYING_SLOT_VAR0 || |
| location == VARYING_SLOT_FOGC) |
| return options_var; |
| |
| /* The primitive ID can always be optimized in GS -> FS and MS -> FS. */ |
| if ((linkage->producer_stage == MESA_SHADER_GEOMETRY || |
| linkage->producer_stage == MESA_SHADER_MESH) && |
| location == VARYING_SLOT_PRIMITIVE_ID) |
| return options_var; |
| |
| /* Colors can only do constant propagation if COLn and BFCn store the |
| * same constant and the constant is between 0 and 1 (because clamp |
| * vertex color state is unknown). Uniform propagation isn't possible |
| * because of the clamping. |
| * |
| * Color components can only be deduplicated and compacted among |
| * themselves if they have the same interpolation qualifier, and can't |
| * be mixed with other varyings. |
| */ |
| if (location == VARYING_SLOT_COL0 || |
| location == VARYING_SLOT_COL1 || |
| location == VARYING_SLOT_BFC0 || |
| location == VARYING_SLOT_BFC1) |
| return options_color; |
| |
| /* TEXn.zw can only be constant-propagated if the value is (0, 1) |
| * because it matches the coord replace values. |
| */ |
| if (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7) |
| return options_tex; |
| |
| /* LAYER, VIEWPORT, CLIP_DISTn, and CULL_DISTn can only propagate |
| * uniform expressions and be compacted (moved to VARn while keeping |
| * the sysval outputs where they are). |
| */ |
| if (location == VARYING_SLOT_LAYER || |
| location == VARYING_SLOT_VIEWPORT || |
| location == VARYING_SLOT_CLIP_DIST0 || |
| location == VARYING_SLOT_CLIP_DIST1 || |
| location == VARYING_SLOT_CULL_DIST0 || |
| location == VARYING_SLOT_CULL_DIST1) |
| return options_sysval_output; |
| |
| /* Everything else can't be read by the consumer, such as POS, PSIZ, |
| * CLIP_VERTEX, EDGE, PRIMITIVE_SHADING_RATE, etc. |
| */ |
| return options_disable_all; |
| } |
| |
| if (linkage->producer_stage == MESA_SHADER_TESS_CTRL) { |
| /* TESS_LEVEL_* can only propagate uniform expressions. |
| * Compaction is disabled because AMD doesn't want the varying to be |
| * moved to PATCHn while keeping the sysval output where it is. |
| */ |
| if (location == VARYING_SLOT_TESS_LEVEL_INNER || |
| location == VARYING_SLOT_TESS_LEVEL_OUTER) |
| return options_tess_levels; |
| } |
| |
| /* All other shader pairs, which are (VS, TCS), (TCS, TES), (VS, TES), |
| * (TES, GS), and (VS, GS) can compact and optimize all varyings. |
| */ |
| return options_var; |
| } |
| |
| static bool |
| gather_inputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_data) |
| { |
| struct linkage_info *linkage = (struct linkage_info *)cb_data; |
| |
| if (intr->intrinsic != nir_intrinsic_load_input && |
| intr->intrinsic != nir_intrinsic_load_per_vertex_input && |
| intr->intrinsic != nir_intrinsic_load_per_primitive_input && |
| intr->intrinsic != nir_intrinsic_load_interpolated_input && |
| intr->intrinsic != nir_intrinsic_load_input_vertex) |
| return false; |
| |
| /* nir_lower_io_to_scalar is required before this */ |
| assert(intr->def.num_components == 1); |
| /* Non-zero constant offsets should have been folded by |
| * nir_io_add_const_offset_to_base. |
| */ |
| nir_src offset = *nir_get_io_offset_src(intr); |
| assert(!nir_src_is_const(offset) || nir_src_as_uint(offset) == 0); |
| |
| nir_io_semantics sem = nir_intrinsic_io_semantics(intr); |
| |
| if (!can_remove_varying(linkage, sem.location)) |
| return false; |
| |
| /* Insert the load into the list of loads for this scalar slot. */ |
| unsigned slot = intr_get_scalar_16bit_slot(intr); |
| struct scalar_slot *in = &linkage->slot[slot]; |
| struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx, |
| sizeof(struct list_node)); |
| node->instr = intr; |
| list_addtail(&node->head, &in->consumer.loads); |
| in->num_slots = MAX2(in->num_slots, sem.num_slots); |
| |
| BITSET_SET(linkage->removable_mask, slot); |
| |
| enum fs_vec4_type fs_vec4_type = FS_VEC4_TYPE_NONE; |
| |
| /* Determine the type of the input for compaction. Other inputs |
| * can be compacted with indirectly-indexed vec4 slots if they |
| * have unused components, but only if they are of the same type. |
| */ |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) { |
| switch (intr->intrinsic) { |
| case nir_intrinsic_load_input: |
| fs_vec4_type = FS_VEC4_TYPE_FLAT; |
| break; |
| case nir_intrinsic_load_per_primitive_input: |
| fs_vec4_type = FS_VEC4_TYPE_PER_PRIMITIVE; |
| break; |
| case nir_intrinsic_load_input_vertex: |
| if (sem.interp_explicit_strict) |
| fs_vec4_type = FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT; |
| else |
| fs_vec4_type = FS_VEC4_TYPE_INTERP_EXPLICIT; |
| break; |
| case nir_intrinsic_load_interpolated_input: |
| if (linkage->has_flexible_interp) { |
| if (color_uses_shade_model(linkage, slot)) |
| fs_vec4_type = FS_VEC4_TYPE_INTERP_COLOR; |
| else if (intr->def.bit_size == 32) |
| fs_vec4_type = FS_VEC4_TYPE_INTERP_FP32; |
| else if (intr->def.bit_size == 16) |
| fs_vec4_type = FS_VEC4_TYPE_INTERP_FP16; |
| else |
| unreachable("invalid load_interpolated_input type"); |
| } else { |
| fs_vec4_type = get_interp_vec4_type(linkage, slot, intr); |
| } |
| break; |
| default: |
| unreachable("unexpected input load intrinsic"); |
| } |
| |
| linkage->fs_vec4_type[sem.location] = fs_vec4_type; |
| } |
| |
| /* Indirect indexing. */ |
| if (!nir_src_is_const(offset)) { |
| /* Only the indirectly-indexed component is marked as indirect. */ |
| for (unsigned i = 0; i < sem.num_slots; i++) |
| BITSET_SET(linkage->indirect_mask, slot + i * 8); |
| |
| /* Set the same vec4 type as the first element in all slots. */ |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) { |
| for (unsigned i = 1; i < sem.num_slots; i++) |
| linkage->fs_vec4_type[sem.location + i] = fs_vec4_type; |
| } |
| return false; |
| } |
| |
| if (!can_optimize_varying(linkage, sem.location).compact) |
| return false; |
| |
| /* Record inputs that can be compacted. */ |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) { |
| unsigned i; |
| assert(intr->def.bit_size == 32 || intr->def.bit_size == 16); |
| |
| switch (fs_vec4_type) { |
| case FS_VEC4_TYPE_FLAT: |
| if (intr->def.bit_size == 32) |
| BITSET_SET(linkage->flat32_mask, slot); |
| else |
| BITSET_SET(linkage->flat16_mask, slot); |
| break; |
| case FS_VEC4_TYPE_INTERP_EXPLICIT: |
| if (intr->def.bit_size == 32) |
| BITSET_SET(linkage->interp_explicit32_mask, slot); |
| else |
| BITSET_SET(linkage->interp_explicit16_mask, slot); |
| break; |
| case FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT: |
| if (intr->def.bit_size == 32) |
| BITSET_SET(linkage->interp_explicit_strict32_mask, slot); |
| else |
| BITSET_SET(linkage->interp_explicit_strict16_mask, slot); |
| break; |
| case FS_VEC4_TYPE_PER_PRIMITIVE: |
| if (intr->def.bit_size == 32) |
| BITSET_SET(linkage->per_primitive32_mask, slot); |
| else |
| BITSET_SET(linkage->per_primitive16_mask, slot); |
| break; |
| |
| case FS_VEC4_TYPE_INTERP_FP32: |
| BITSET_SET(linkage->interp_fp32_mask, slot); |
| break; |
| case FS_VEC4_TYPE_INTERP_FP16: |
| BITSET_SET(linkage->interp_fp16_mask, slot); |
| break; |
| case FS_VEC4_TYPE_INTERP_COLOR: |
| BITSET_SET(linkage->color32_mask, slot); |
| break; |
| |
| case FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL: |
| case FS_VEC4_TYPE_INTERP_FP32_PERSP_CENTROID: |
| case FS_VEC4_TYPE_INTERP_FP32_PERSP_SAMPLE: |
| case FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL: |
| case FS_VEC4_TYPE_INTERP_FP32_LINEAR_CENTROID: |
| case FS_VEC4_TYPE_INTERP_FP32_LINEAR_SAMPLE: |
| i = fs_vec4_type - FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL; |
| BITSET_SET(linkage->interp_fp32_qual_masks[i], slot); |
| break; |
| |
| case FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL: |
| case FS_VEC4_TYPE_INTERP_FP16_PERSP_CENTROID: |
| case FS_VEC4_TYPE_INTERP_FP16_PERSP_SAMPLE: |
| case FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL: |
| case FS_VEC4_TYPE_INTERP_FP16_LINEAR_CENTROID: |
| case FS_VEC4_TYPE_INTERP_FP16_LINEAR_SAMPLE: |
| i = fs_vec4_type - FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL; |
| BITSET_SET(linkage->interp_fp16_qual_masks[i], slot); |
| break; |
| |
| case FS_VEC4_TYPE_INTERP_COLOR_PIXEL: |
| case FS_VEC4_TYPE_INTERP_COLOR_CENTROID: |
| case FS_VEC4_TYPE_INTERP_COLOR_SAMPLE: |
| i = fs_vec4_type - FS_VEC4_TYPE_INTERP_COLOR_PIXEL; |
| BITSET_SET(linkage->color32_qual_masks[i], slot); |
| break; |
| |
| case FS_VEC4_TYPE_NONE: |
| unreachable("unexpected fs_vec4_type"); |
| } |
| |
| if (!linkage->has_flexible_interp && |
| intr->intrinsic == nir_intrinsic_load_interpolated_input) { |
| /* interpolateAtCentroid can occur simultaneously with any other |
| * qualifier. If centroid is flagged with any other qualifier, |
| * unflag centroid. Even though we track such outputs as the other |
| * qualifier, the load_barycentric_centroid intrinsic must be |
| * preserved by all optimizations. The only case when it's not |
| * preserved is when the input is convergent, in which case |
| * all qualifiers have the same behavior and we opportunistically |
| * change it during compaction. |
| */ |
| if (color_uses_shade_model(linkage, slot)) { |
| if (BITSET_TEST(linkage->color32_qual_masks[COLOR_CENTROID], slot) && |
| (BITSET_TEST(linkage->color32_qual_masks[COLOR_PIXEL], slot) || |
| BITSET_TEST(linkage->color32_qual_masks[COLOR_SAMPLE], slot))) |
| BITSET_CLEAR(linkage->color32_qual_masks[COLOR_CENTROID], slot); |
| } else { |
| INTERP_QUAL_BITSET *bitsets = |
| intr->def.bit_size == 32 ? &linkage->interp_fp32_qual_masks : |
| &linkage->interp_fp16_qual_masks; |
| |
| if (BITSET_TEST((*bitsets)[PERSP_CENTROID], slot) && |
| (BITSET_TEST((*bitsets)[PERSP_PIXEL], slot) || |
| BITSET_TEST((*bitsets)[PERSP_SAMPLE], slot))) |
| BITSET_CLEAR((*bitsets)[PERSP_CENTROID], slot); |
| |
| if (BITSET_TEST((*bitsets)[LINEAR_CENTROID], slot) && |
| (BITSET_TEST((*bitsets)[LINEAR_PIXEL], slot) || |
| BITSET_TEST((*bitsets)[LINEAR_SAMPLE], slot))) |
| BITSET_CLEAR((*bitsets)[LINEAR_CENTROID], slot); |
| } |
| } |
| } else { |
| if (intr->def.bit_size == 32) |
| BITSET_SET(linkage->flat32_mask, slot); |
| else if (intr->def.bit_size == 16) |
| BITSET_SET(linkage->flat16_mask, slot); |
| else |
| unreachable("invalid load_input type"); |
| |
| if (linkage->consumer_stage == MESA_SHADER_TESS_CTRL && |
| intr->intrinsic == nir_intrinsic_load_per_vertex_input) { |
| nir_src *vertex_index_src = nir_get_io_arrayed_index_src(intr); |
| nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr; |
| |
| if (!is_sysval(vertex_index_instr, SYSTEM_VALUE_INVOCATION_ID)) { |
| if (intr->def.bit_size == 32) |
| BITSET_SET(linkage->tcs_cross_invoc32_mask, slot); |
| else if (intr->def.bit_size == 16) |
| BITSET_SET(linkage->tcs_cross_invoc16_mask, slot); |
| else |
| unreachable("invalid load_input type"); |
| } |
| } |
| } |
| return false; |
| } |
| |
| static bool |
| gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_data) |
| { |
| struct linkage_info *linkage = (struct linkage_info *)cb_data; |
| |
| if (intr->intrinsic != nir_intrinsic_store_output && |
| intr->intrinsic != nir_intrinsic_load_output && |
| intr->intrinsic != nir_intrinsic_store_per_vertex_output && |
| intr->intrinsic != nir_intrinsic_store_per_view_output && |
| intr->intrinsic != nir_intrinsic_store_per_primitive_output && |
| intr->intrinsic != nir_intrinsic_load_per_vertex_output && |
| intr->intrinsic != nir_intrinsic_load_per_view_output && |
| intr->intrinsic != nir_intrinsic_load_per_primitive_output) |
| return false; |
| |
| bool is_store = |
| intr->intrinsic == nir_intrinsic_store_output || |
| intr->intrinsic == nir_intrinsic_store_per_vertex_output || |
| intr->intrinsic == nir_intrinsic_store_per_view_output || |
| intr->intrinsic == nir_intrinsic_store_per_primitive_output; |
| |
| if (is_store) { |
| /* nir_lower_io_to_scalar is required before this */ |
| assert(intr->src[0].ssa->num_components == 1); |
|       /* nir_opt_undef is required before this. */ |
| assert(intr->src[0].ssa->parent_instr->type != |
| nir_instr_type_undef); |
| } else { |
| /* nir_lower_io_to_scalar is required before this */ |
| assert(intr->def.num_components == 1); |
|       /* Output loads are only allowed in TCS. */ |
| assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL); |
| } |
| |
| /* Non-zero constant offsets should have been folded by |
| * nir_io_add_const_offset_to_base. |
| */ |
| nir_src offset = *nir_get_io_offset_src(intr); |
| assert(!nir_src_is_const(offset) || nir_src_as_uint(offset) == 0); |
| |
| nir_io_semantics sem = nir_intrinsic_io_semantics(intr); |
| |
| if (!can_remove_varying(linkage, sem.location)) |
| return false; |
| |
| /* For "xx -> FS", treat BFCn stores as COLn to make dead varying |
| * elimination do the right thing automatically. The rules are: |
| * - COLn inputs can be removed only if both COLn and BFCn are not |
| * written. |
| * - Both COLn and BFCn outputs can be removed if COLn inputs |
| * aren't read. |
| */ |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) { |
| if (sem.location == VARYING_SLOT_BFC0) |
| sem.location = VARYING_SLOT_COL0; |
| else if (sem.location == VARYING_SLOT_BFC1) |
| sem.location = VARYING_SLOT_COL1; |
| } |
| |
| /* Insert the instruction into the list of stores or loads for this |
| * scalar slot. |
| */ |
| unsigned slot = |
| get_scalar_16bit_slot(sem, nir_intrinsic_component(intr)); |
| |
| struct scalar_slot *out = &linkage->slot[slot]; |
| struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx, |
| sizeof(struct list_node)); |
| node->instr = intr; |
| out->num_slots = MAX2(out->num_slots, sem.num_slots); |
| |
| if (is_store) { |
| list_addtail(&node->head, &out->producer.stores); |
| |
| if (has_xfb(intr)) { |
| BITSET_SET(linkage->xfb_mask, slot); |
| |
| if (sem.no_varying && |
| !is_active_sysval_output(linkage, slot, intr)) { |
| if (intr->src[0].ssa->bit_size == 32) |
| BITSET_SET(linkage->xfb32_only_mask, slot); |
| else if (intr->src[0].ssa->bit_size == 16) |
| BITSET_SET(linkage->xfb16_only_mask, slot); |
| else |
|                unreachable("invalid store_output type"); |
| } |
| } |
| } else { |
| list_addtail(&node->head, &out->producer.loads); |
| } |
| |
| BITSET_SET(linkage->removable_mask, slot); |
| |
| /* Indirect indexing. */ |
| if (!nir_src_is_const(offset)) { |
| /* Only the indirectly-indexed component is marked as indirect. */ |
| for (unsigned i = 0; i < sem.num_slots; i++) |
| BITSET_SET(linkage->indirect_mask, slot + i * 8); |
| |
| /* Set the same vec4 type as the first element in all slots. */ |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) { |
| enum fs_vec4_type fs_vec4_type = |
| linkage->fs_vec4_type[sem.location]; |
| |
| for (unsigned i = 1; i < sem.num_slots; i++) |
| linkage->fs_vec4_type[sem.location + i] = fs_vec4_type; |
| } |
| return false; |
| } |
| |
| if (can_optimize_varying(linkage, sem.location).disable_all) |
| return false; |
| |
| if (is_store) { |
| nir_def *value = intr->src[0].ssa; |
| |
| const bool constant = value->parent_instr->type == nir_instr_type_load_const; |
| |
| /* If the store instruction is executed in a divergent block, the value |
| * that's stored in the output becomes divergent. |
| * |
| * Mesh shaders get special treatment because we can't follow their topology, |
| * so we only propagate constants. |
| * TODO: revisit this when workgroup divergence analysis is merged. |
| */ |
| const bool divergent = (!constant && linkage->producer_stage == MESA_SHADER_MESH) || |
| intr->instr.block->divergent || |
| nir_src_is_divergent(&intr->src[0]); |
| |
| if (!out->producer.value) { |
| /* This is the first store to this output. */ |
| BITSET_SET(linkage->output_equal_mask, slot); |
| out->producer.value = value->parent_instr; |
| |
| /* Set whether the value is convergent. Such varyings can be |
| * promoted to flat regardless of their original interpolation |
| * mode. |
| */ |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && !divergent) { |
| if (value->bit_size == 32) |
| BITSET_SET(linkage->convergent32_mask, slot); |
| else if (value->bit_size == 16) |
| BITSET_SET(linkage->convergent16_mask, slot); |
| else |
| unreachable("invalid store_output type"); |
| } |
| } else { |
| /* There are multiple stores to the same output. If they store |
| * different values, clear the mask. |
| */ |
| if (out->producer.value != value->parent_instr) |
| BITSET_CLEAR(linkage->output_equal_mask, slot); |
| |
| /* Update divergence information. */ |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && divergent) { |
| if (value->bit_size == 32) |
| BITSET_CLEAR(linkage->convergent32_mask, slot); |
| else if (value->bit_size == 16) |
| BITSET_CLEAR(linkage->convergent16_mask, slot); |
| else |
| unreachable("invalid store_output type"); |
| } |
| } |
| } else { |
| /* Only TCS output loads can get here. |
| * |
| * We need to record output loads as flat32 or flat16, otherwise |
| * compaction will think that the slot is free and will put some |
| * other output in its place. |
| */ |
| assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL); |
| |
| if (!can_optimize_varying(linkage, sem.location).compact) |
| return false; |
| |
| if (intr->def.bit_size == 32) |
| BITSET_SET(linkage->flat32_mask, slot); |
| else if (intr->def.bit_size == 16) |
| BITSET_SET(linkage->flat16_mask, slot); |
| else |
|          unreachable("invalid load_output type"); |
| } |
| return false; |
| } |
| |
| /****************************************************************** |
| * TIDYING UP INDIRECT VARYINGS (BEFORE DEAD VARYINGS REMOVAL) |
| ******************************************************************/ |
| |
| static void |
| tidy_up_indirect_varyings(struct linkage_info *linkage) |
| { |
| unsigned i; |
| |
|    /* Indirectly-indexed slots can also have direct accesses, which set |
|     * various bitmasks, so clear those bitmasks to make sure such slots are |
|     * neither optimized nor compacted. |
|     */ |
| BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) { |
| slot_disable_optimizations_and_compaction(linkage, i); |
| } |
| |
| /* If some slots have both direct and indirect accesses, move instructions |
| * of such slots to the slot representing the first array element, so that |
| * we can remove all loads/stores of dead indirectly-indexed varyings |
| * by only looking at the first element. |
| */ |
| BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) { |
| struct scalar_slot *first = &linkage->slot[i]; |
| |
| /* Skip if this is not the first array element. The first element |
| * always sets num_slots to at least 2. |
| */ |
| if (first->num_slots <= 1) |
| continue; |
| |
| /* Move instructions from other elements of the indirectly-accessed |
| * array to the first element (by merging the linked lists). |
| */ |
| for (unsigned elem = 1; elem < first->num_slots; elem++) { |
| /* The component slots are at 16-bit granularity, so we need to |
| * increment by 8 to get the same component in the next vec4 slot. |
| */ |
| struct scalar_slot *other = &linkage->slot[i + elem * 8]; |
| |
| list_splicetail(&other->producer.stores, &first->producer.stores); |
| list_splicetail(&other->producer.loads, &first->producer.loads); |
| list_splicetail(&other->consumer.loads, &first->consumer.loads); |
| list_inithead(&other->producer.stores); |
| list_inithead(&other->producer.loads); |
| list_inithead(&other->consumer.loads); |
| } |
| } |
| } |
| |
| /****************************************************************** |
| * TIDYING UP CONVERGENT VARYINGS |
| ******************************************************************/ |
| |
| /** |
|  * Reorganize the FS bitmasks. They are initialized such that they can |
|  * intersect with the convergent bitmasks, but we want the masks of |
|  * interpolated, flat, and convergent varyings to be mutually disjoint. |
| */ |
| static void |
| tidy_up_convergent_varyings(struct linkage_info *linkage) |
| { |
| if (linkage->consumer_stage != MESA_SHADER_FRAGMENT) |
| return; |
| |
| unsigned i; |
| /* Whether to promote convergent interpolated slots to flat if it |
| * doesn't lead to worse compaction. |
| */ |
| bool optimize_convergent_slots = true; /* only turn off for debugging */ |
| |
| if (optimize_convergent_slots) { |
| /* If a slot is flat and convergent and the driver can't load as flat |
| * from interpolated vec4 slots, keep the flat bit and remove |
| * the convergent bit. If the driver can load as flat from interpolated |
| * vec4 slots, keep the convergent bit. |
| * |
| * If a slot is interpolated and convergent, remove the interpolated |
| * bit and keep the convergent bit, which means that it's interpolated, |
| * but can be promoted to flat. |
| * |
| * Since the geometry shader is the only shader that can store values |
| * in multiple vertices before FS, it's required that all stores are |
| * equal to be considered convergent (output_equal_mask), otherwise |
| * the promotion to flat would be incorrect. |
| */ |
| BITSET_FOREACH_SET(i, linkage->convergent32_mask, NUM_SCALAR_SLOTS) { |
| if (!BITSET_TEST(linkage->interp_fp32_mask, i) && |
| !BITSET_TEST(linkage->color32_mask, i) && |
| !BITSET_TEST(linkage->flat32_mask, i) && |
| !BITSET6_TEST_ANY(linkage->interp_fp32_qual_masks, i) && |
| !BITSET3_TEST_ANY(linkage->color32_qual_masks, i)) { |
| /* Clear the flag - not used by FS. */ |
| BITSET_CLEAR(linkage->convergent32_mask, i); |
| } else if ((!linkage->can_mix_convergent_flat_with_interpolated && |
| BITSET_TEST(linkage->flat32_mask, i)) || |
| (linkage->producer_stage == MESA_SHADER_GEOMETRY && |
| !BITSET_TEST(linkage->output_equal_mask, i))) { |
| /* Keep the original qualifier. */ |
| BITSET_CLEAR(linkage->convergent32_mask, i); |
| } else { |
| /* Keep it convergent. */ |
| BITSET_CLEAR(linkage->interp_fp32_mask, i); |
| for (unsigned b = 0; b < NUM_INTERP_QUALIFIERS; b++) |
| BITSET_CLEAR(linkage->interp_fp32_qual_masks[b], i); |
| BITSET_CLEAR(linkage->color32_mask, i); |
| for (unsigned b = 0; b < NUM_COLOR_QUALIFIERS; b++) |
| BITSET_CLEAR(linkage->color32_qual_masks[b], i); |
| BITSET_CLEAR(linkage->flat32_mask, i); |
| } |
| } |
| |
| BITSET_FOREACH_SET(i, linkage->convergent16_mask, NUM_SCALAR_SLOTS) { |
| if (!BITSET_TEST(linkage->interp_fp16_mask, i) && |
| !BITSET_TEST(linkage->flat16_mask, i) && |
| !BITSET6_TEST_ANY(linkage->interp_fp16_qual_masks, i)) { |
| /* Clear the flag - not used by FS. */ |
| BITSET_CLEAR(linkage->convergent16_mask, i); |
| } else if ((!linkage->can_mix_convergent_flat_with_interpolated && |
| BITSET_TEST(linkage->flat16_mask, i)) || |
| (linkage->producer_stage == MESA_SHADER_GEOMETRY && |
| !BITSET_TEST(linkage->output_equal_mask, i))) { |
| /* Keep the original qualifier. */ |
| BITSET_CLEAR(linkage->convergent16_mask, i); |
| } else { |
| /* Keep it convergent. */ |
| BITSET_CLEAR(linkage->interp_fp16_mask, i); |
| for (unsigned b = 0; b < NUM_INTERP_QUALIFIERS; b++) |
| BITSET_CLEAR(linkage->interp_fp16_qual_masks[b], i); |
| BITSET_CLEAR(linkage->flat16_mask, i); |
| } |
| } |
| } else { |
| /* Don't do anything with convergent slots. */ |
| BITSET_ZERO(linkage->convergent32_mask); |
| BITSET_ZERO(linkage->convergent16_mask); |
| } |
| } |
| |
| /****************************************************************** |
| * DETERMINING UNIFORM AND UBO MOVABILITY BASED ON DRIVER LIMITS |
| ******************************************************************/ |
| |
| static bool |
| is_variable_present(nir_shader *nir, nir_variable *var, |
| nir_variable_mode mode, bool spirv) |
| { |
| nir_foreach_variable_with_modes(it, nir, mode) { |
| if ((spirv && it->data.binding == var->data.binding) || |
| (!spirv && !strcmp(it->name, var->name))) |
| return true; |
| } |
| return false; |
| } |
| |
| /* TODO: this should be a helper in common code */ |
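| /* Each vec4 slot counts as 4 scalar components. For example, "mat4 m[2]" |
|  * counts as 2 (array size) * 4 (columns) * 4 = 32 scalar components. |
|  */ |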
| static unsigned |
| get_uniform_components(const struct glsl_type *type) |
| { |
| unsigned size = glsl_get_aoa_size(type); |
| size = MAX2(size, 1); |
| size *= glsl_get_matrix_columns(glsl_without_array(type)); |
| |
| if (glsl_type_is_dual_slot(glsl_without_array(type))) |
| size *= 2; |
| |
| /* Convert from vec4 to scalar. */ |
| return size * 4; |
| } |
| |
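| /* Return the number of UBO slots that "var" occupies: an array of interface |
|  * blocks counts as one slot per element (e.g. "Foo buf[4]" counts as 4), |
|  * anything else counts as 1. |
|  */ |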
| static unsigned |
| get_ubo_slots(const nir_variable *var) |
| { |
| if (glsl_type_is_interface(glsl_without_array(var->type))) { |
| unsigned slots = glsl_get_aoa_size(var->type); |
| return MAX2(slots, 1); |
| } |
| |
| return 1; |
| } |
| |
| /** |
| * Count uniforms and see if the combined uniform component count is over |
|  * the limit. If it is, don't move any uniforms. Drivers that don't want to |
|  * limit this can simply declare a very high limit. |
| */ |
| static void |
| determine_uniform_movability(struct linkage_info *linkage, |
| unsigned max_uniform_components) |
| { |
| nir_shader *producer = linkage->producer_builder.shader; |
| nir_shader *consumer = linkage->consumer_builder.shader; |
| unsigned num_producer_uniforms = 0; |
| unsigned num_consumer_uniforms = 0; |
| unsigned num_shared_uniforms = 0; |
| |
| nir_foreach_variable_with_modes(var, producer, nir_var_uniform) { |
| if (is_variable_present(consumer, var, nir_var_uniform, linkage->spirv)) |
| num_shared_uniforms += get_uniform_components(var->type); |
| else |
| num_producer_uniforms += get_uniform_components(var->type); |
| } |
| |
| nir_foreach_variable_with_modes(var, consumer, nir_var_uniform) { |
| if (!is_variable_present(producer, var, nir_var_uniform, linkage->spirv)) |
| num_consumer_uniforms += get_uniform_components(var->type); |
| } |
| |
| linkage->can_move_uniforms = |
| num_producer_uniforms + num_consumer_uniforms + num_shared_uniforms <= |
| max_uniform_components; |
| } |
| |
| /** |
| * Count UBOs and see if the combined UBO count is over the limit. If it is, |
|  * don't move any UBOs. Drivers that don't want to limit this can simply |
|  * declare a very high limit. |
| */ |
| static void |
| determine_ubo_movability(struct linkage_info *linkage, |
| unsigned max_ubos_per_stage) |
| { |
| nir_shader *producer = linkage->producer_builder.shader; |
| nir_shader *consumer = linkage->consumer_builder.shader; |
| unsigned num_producer_ubos = 0; |
| unsigned num_consumer_ubos = 0; |
| unsigned num_shared_ubos = 0; |
| |
| nir_foreach_variable_with_modes(var, producer, nir_var_mem_ubo) { |
| if (is_variable_present(consumer, var, nir_var_mem_ubo, linkage->spirv)) |
| num_shared_ubos += get_ubo_slots(var); |
| else |
| num_producer_ubos += get_ubo_slots(var); |
| } |
| |
| nir_foreach_variable_with_modes(var, consumer, nir_var_mem_ubo) { |
| if (!is_variable_present(producer, var, nir_var_mem_ubo, |
| linkage->spirv)) |
| num_consumer_ubos += get_ubo_slots(var); |
| } |
| |
| linkage->can_move_ubos = |
| num_producer_ubos + num_consumer_ubos + num_shared_ubos <= |
| max_ubos_per_stage; |
| } |
| |
| /****************************************************************** |
| * DEAD VARYINGS REMOVAL |
| ******************************************************************/ |
| |
| static void |
| remove_all_stores(struct linkage_info *linkage, unsigned i, |
| bool *uses_xfb, nir_opt_varyings_progress *progress) |
| { |
| struct scalar_slot *slot = &linkage->slot[i]; |
| |
| assert(!list_is_empty(&slot->producer.stores) && |
| list_is_empty(&slot->producer.loads) && |
| list_is_empty(&slot->consumer.loads)); |
| |
| /* Remove all stores. */ |
| list_for_each_entry_safe(struct list_node, iter, &slot->producer.stores, head) { |
| if (nir_remove_varying(iter->instr, linkage->consumer_stage)) { |
| list_del(&iter->head); |
| *progress |= nir_progress_producer; |
| } else { |
| if (has_xfb(iter->instr)) { |
| *uses_xfb = true; |
| |
| if (!is_active_sysval_output(linkage, i, iter->instr)) { |
| if (iter->instr->src[0].ssa->bit_size == 32) |
| BITSET_SET(linkage->xfb32_only_mask, i); |
| else if (iter->instr->src[0].ssa->bit_size == 16) |
| BITSET_SET(linkage->xfb16_only_mask, i); |
| else |
|                   unreachable("invalid store_output type"); |
| } |
| } |
| } |
| } |
| } |
| |
| static void |
| remove_dead_varyings(struct linkage_info *linkage, |
| nir_opt_varyings_progress *progress) |
| { |
| unsigned i; |
| |
| /* Remove dead inputs and outputs. */ |
| BITSET_FOREACH_SET(i, linkage->removable_mask, NUM_SCALAR_SLOTS) { |
| struct scalar_slot *slot = &linkage->slot[i]; |
| |
| /* Only indirect access can have no loads and stores because we moved |
| * them to the first element in tidy_up_indirect_varyings(). |
| */ |
| assert(!list_is_empty(&slot->producer.stores) || |
| !list_is_empty(&slot->producer.loads) || |
| !list_is_empty(&slot->consumer.loads) || |
| BITSET_TEST(linkage->indirect_mask, i)); |
| |
| /* Nothing to do if there are no loads and stores. */ |
| if (list_is_empty(&slot->producer.stores) && |
| list_is_empty(&slot->producer.loads) && |
| list_is_empty(&slot->consumer.loads)) |
| continue; |
| |
| /* If there are producer loads (e.g. TCS) but no consumer loads |
| * (e.g. TES), set the "no_varying" flag to indicate that the outputs |
| * are not consumed by the next shader stage (e.g. TES). |
| */ |
| if (!list_is_empty(&slot->producer.stores) && |
| !list_is_empty(&slot->producer.loads) && |
| list_is_empty(&slot->consumer.loads)) { |
| for (unsigned list_index = 0; list_index < 2; list_index++) { |
| struct list_head *list = list_index ? &slot->producer.stores : |
| &slot->producer.loads; |
| |
| list_for_each_entry(struct list_node, iter, list, head) { |
| nir_io_semantics sem = nir_intrinsic_io_semantics(iter->instr); |
| sem.no_varying = 1; |
| nir_intrinsic_set_io_semantics(iter->instr, sem); |
| } |
| } |
| |
| /* This tells the compaction to move these varyings to the end. */ |
| if (BITSET_TEST(linkage->flat32_mask, i)) { |
| assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT); |
| BITSET_CLEAR(linkage->flat32_mask, i); |
| BITSET_SET(linkage->no_varying32_mask, i); |
| } |
| if (BITSET_TEST(linkage->flat16_mask, i)) { |
| assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT); |
| BITSET_CLEAR(linkage->flat16_mask, i); |
| BITSET_SET(linkage->no_varying16_mask, i); |
| } |
| continue; |
| } |
| |
| /* The varyings aren't dead if both loads and stores are present. */ |
| if (!list_is_empty(&slot->producer.stores) && |
| (!list_is_empty(&slot->producer.loads) || |
| !list_is_empty(&slot->consumer.loads))) |
| continue; |
| |
| bool uses_xfb = false; |
| |
| if (list_is_empty(&slot->producer.stores)) { |
| /* There are no stores. */ |
| assert(!list_is_empty(&slot->producer.loads) || |
| !list_is_empty(&slot->consumer.loads)); |
| |
| /* TEXn.xy loads can't be removed in FS because of the coord |
| * replace state, but TEXn outputs can be removed if they are |
| * not read by FS. |
| * |
| * TEXn.zw loads can be eliminated and replaced by (0, 1), which |
| * is equal to the coord replace value. |
| */ |
| if (is_interpolated_texcoord(linkage, i)) { |
| assert(i % 2 == 0); /* high 16-bit slots disallowed */ |
| /* Keep TEXn.xy. */ |
| if (i % 8 < 4) |
| continue; |
| } |
| |
| /* Replace all loads with undef. Do that for both input loads |
| * in the consumer stage and output loads in the producer stage |
| * because we also want to eliminate TCS loads that have no |
| * corresponding TCS stores. |
| */ |
| for (unsigned list_index = 0; list_index < 2; list_index++) { |
| struct list_head *list = list_index ? &slot->producer.loads : |
| &slot->consumer.loads; |
| nir_builder *b = list_index ? &linkage->producer_builder : |
| &linkage->consumer_builder; |
| |
| list_for_each_entry(struct list_node, iter, list, head) { |
| nir_intrinsic_instr *loadi = iter->instr; |
| nir_def *replacement = NULL; |
| |
| b->cursor = nir_before_instr(&loadi->instr); |
| |
| /* LAYER and VIEWPORT FS inputs should be replaced by 0 |
| * instead of undef. |
| */ |
| gl_varying_slot location = (gl_varying_slot)(vec4_slot(i)); |
| |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && |
| (location == VARYING_SLOT_LAYER || |
| location == VARYING_SLOT_VIEWPORT || |
| /* TEXn.z is replaced by 0 (matching coord replace) */ |
| (is_interpolated_texcoord(linkage, i) && i % 8 == 4))) |
| replacement = nir_imm_intN_t(b, 0, loadi->def.bit_size); |
| else if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && |
| /* TEXn.w is replaced by 1 (matching coord replace) */ |
| is_interpolated_texcoord(linkage, i) && i % 8 == 6) |
| replacement = nir_imm_floatN_t(b, 1, loadi->def.bit_size); |
| else |
| replacement = nir_undef(b, 1, loadi->def.bit_size); |
| |
| nir_def_replace(&loadi->def, replacement); |
| |
| *progress |= list_index ? nir_progress_producer : |
| nir_progress_consumer; |
| } |
| } |
| |
| /* Clear the lists. */ |
| list_inithead(&slot->producer.loads); |
| list_inithead(&slot->consumer.loads); |
| } else { |
| /* There are no loads. */ |
| remove_all_stores(linkage, i, &uses_xfb, progress); |
| } |
| |
| /* Clear bitmasks associated with this varying slot or array. */ |
| for (unsigned elem = 0; elem < slot->num_slots; elem++) |
| clear_slot_info_after_removal(linkage, i + elem, uses_xfb); |
| } |
| } |
| |
| /****************************************************************** |
| * SSA CLONING HELPERS |
| ******************************************************************/ |
| |
| /* Pass flags for inter-shader code motion. Also used by helpers. */ |
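| /* The 3-bit interpolation field below is stage-dependent: values 2..7 encode |
|  * FS interpolation qualifiers when the consumer is FS and TES interpolation |
|  * equations when the consumer is TES, so the two sets share the same |
|  * encoding space. |
|  */ |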
| #define FLAG_ALU_IS_TES_INTERP_LOAD BITFIELD_BIT(0) |
| #define FLAG_MOVABLE BITFIELD_BIT(1) |
| #define FLAG_UNMOVABLE BITFIELD_BIT(2) |
| #define FLAG_POST_DOMINATOR_PROCESSED BITFIELD_BIT(3) |
| #define FLAG_GATHER_LOADS_VISITED BITFIELD_BIT(4) |
| |
| #define FLAG_INTERP_MASK BITFIELD_RANGE(5, 3) |
| #define FLAG_INTERP_CONVERGENT (0 << 5) |
| #define FLAG_INTERP_FLAT (1 << 5) |
| /* FS-only interpolation modes. */ |
| #define FLAG_INTERP_PERSP_PIXEL (2 << 5) |
| #define FLAG_INTERP_PERSP_CENTROID (3 << 5) |
| #define FLAG_INTERP_PERSP_SAMPLE (4 << 5) |
| #define FLAG_INTERP_LINEAR_PIXEL (5 << 5) |
| #define FLAG_INTERP_LINEAR_CENTROID (6 << 5) |
| #define FLAG_INTERP_LINEAR_SAMPLE (7 << 5) |
| /* TES-only interpolation modes. (these were found in shaders) */ |
| #define FLAG_INTERP_TES_TRIANGLE_UVW (2 << 5) /* v0*u + v1*v + v2*w */ |
| #define FLAG_INTERP_TES_TRIANGLE_WUV (3 << 5) /* v0*w + v1*u + v2*v */ |
| /* TODO: Feel free to insert more TES interpolation equations here. */ |
| |
| static bool |
| can_move_deref_between_shaders(struct linkage_info *linkage, nir_instr *instr) |
| { |
| nir_deref_instr *deref = nir_instr_as_deref(instr); |
| unsigned allowed_modes = |
| (linkage->can_move_uniforms ? nir_var_uniform : 0) | |
| (linkage->can_move_ubos ? nir_var_mem_ubo : 0); |
| |
| if (!nir_deref_mode_is_one_of(deref, allowed_modes)) |
| return false; |
| |
| switch (deref->deref_type) { |
| case nir_deref_type_var: |
| case nir_deref_type_struct: |
| case nir_deref_type_array: |
| break; |
| default: |
| return false; |
| } |
| |
| nir_variable *var = nir_deref_instr_get_variable(deref); |
| |
|    /* Subroutine uniforms are not moved. Moving them would work (subroutines |
|     * have been inlined at this point), but subroutine functions aren't moved |
|     * along with them, and the linker rejects a shader that contains |
|     * a subroutine uniform without any subroutine functions. This could be |
|     * fixed in the linker, but for now, don't move subroutine uniforms. |
| */ |
| if (var->name && strstr(var->name, "__subu_") == var->name) |
| return false; |
| |
| return true; |
| } |
| |
| static nir_intrinsic_instr * |
| find_per_vertex_load_for_tes_interp(nir_instr *instr) |
| { |
| switch (instr->type) { |
| case nir_instr_type_alu: { |
| nir_alu_instr *alu = nir_instr_as_alu(instr); |
| unsigned num_srcs = nir_op_infos[alu->op].num_inputs; |
| |
| for (unsigned i = 0; i < num_srcs; i++) { |
| nir_instr *src = alu->src[i].src.ssa->parent_instr; |
| nir_intrinsic_instr *intr = find_per_vertex_load_for_tes_interp(src); |
| |
| if (intr) |
| return intr; |
| } |
| return NULL; |
| } |
| |
| case nir_instr_type_intrinsic: { |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| |
| return intr->intrinsic == nir_intrinsic_load_per_vertex_input ? |
| intr : NULL; |
| } |
| |
| default: |
| unreachable("unexpected instruction type"); |
| } |
| } |
| |
| static nir_def * |
| get_stored_value_for_load(struct linkage_info *linkage, nir_instr *instr) |
| { |
| nir_intrinsic_instr *intr; |
| |
| if (instr->type == nir_instr_type_intrinsic) { |
| intr = nir_instr_as_intrinsic(instr); |
| } else { |
| assert(instr->type == nir_instr_type_alu && |
| instr->pass_flags & FLAG_ALU_IS_TES_INTERP_LOAD); |
| intr = find_per_vertex_load_for_tes_interp(instr); |
| } |
| |
| unsigned slot_index = intr_get_scalar_16bit_slot(intr); |
| assert(list_is_singular(&linkage->slot[slot_index].producer.stores)); |
| |
| nir_def *stored_value = |
| list_first_entry(&linkage->slot[slot_index].producer.stores, |
| struct list_node, head)->instr->src[0].ssa; |
| assert(stored_value->num_components == 1); |
| return stored_value; |
| } |
| |
| /* Clone the SSA, which can be in a different shader. */ |
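| /* Results are memoized in linkage->clones_ht, so shared subexpressions are |
|  * cloned only once per clone_ssa() call. |
|  */ |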
| static nir_def * |
| clone_ssa_impl(struct linkage_info *linkage, nir_builder *b, nir_def *ssa) |
| { |
| struct hash_entry *entry = _mesa_hash_table_search(linkage->clones_ht, |
| ssa->parent_instr); |
| if (entry) |
| return entry->data; |
| |
| nir_def *clone = NULL; |
| |
| switch (ssa->parent_instr->type) { |
| case nir_instr_type_load_const: |
| clone = nir_build_imm(b, ssa->num_components, ssa->bit_size, |
| nir_instr_as_load_const(ssa->parent_instr)->value); |
| break; |
| |
| case nir_instr_type_undef: |
| clone = nir_undef(b, ssa->num_components, ssa->bit_size); |
| break; |
| |
| case nir_instr_type_alu: { |
| nir_alu_instr *alu = nir_instr_as_alu(ssa->parent_instr); |
| |
| if (alu->instr.pass_flags & FLAG_ALU_IS_TES_INTERP_LOAD) { |
| /* We are cloning an interpolated TES load in the producer for |
| * backward inter-shader code motion. |
| */ |
| assert(&linkage->producer_builder == b); |
| return get_stored_value_for_load(linkage, &alu->instr); |
| } |
| |
| nir_def *src[4] = {0}; |
| unsigned num_srcs = nir_op_infos[alu->op].num_inputs; |
| assert(num_srcs <= ARRAY_SIZE(src)); |
| |
| for (unsigned i = 0; i < num_srcs; i++) |
| src[i] = clone_ssa_impl(linkage, b, alu->src[i].src.ssa); |
| |
| clone = nir_build_alu(b, alu->op, src[0], src[1], src[2], src[3]); |
| nir_alu_instr *alu_clone = nir_instr_as_alu(clone->parent_instr); |
| |
| alu_clone->exact = alu->exact; |
| alu_clone->no_signed_wrap = alu->no_signed_wrap; |
| alu_clone->no_unsigned_wrap = alu->no_unsigned_wrap; |
| alu_clone->def.num_components = alu->def.num_components; |
| alu_clone->def.bit_size = alu->def.bit_size; |
| |
| for (unsigned i = 0; i < num_srcs; i++) { |
| memcpy(alu_clone->src[i].swizzle, alu->src[i].swizzle, |
| NIR_MAX_VEC_COMPONENTS); |
| } |
| break; |
| } |
| |
| case nir_instr_type_intrinsic: { |
| /* Clone load_deref of uniform or ubo. It's the only thing that can |
| * occur here. |
| */ |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(ssa->parent_instr); |
| |
| switch (intr->intrinsic) { |
| case nir_intrinsic_load_deref: { |
| nir_def *ssa = clone_ssa_impl(linkage, b, intr->src[0].ssa); |
| clone = nir_load_deref(b, nir_instr_as_deref(ssa->parent_instr)); |
| break; |
| } |
| |
| case nir_intrinsic_load_input: |
| case nir_intrinsic_load_per_primitive_input: |
| case nir_intrinsic_load_interpolated_input: { |
| /* We are cloning load_input in the producer for backward |
| * inter-shader code motion. Replace the input load with the stored |
| * output value. That way we can clone any expression using inputs |
| * from the consumer in the producer. |
| */ |
| assert(&linkage->producer_builder == b); |
| clone = get_stored_value_for_load(linkage, &intr->instr); |
| break; |
| } |
| |
| default: |
| unreachable("unexpected intrinsic"); |
| } |
| break; |
| } |
| |
| case nir_instr_type_deref: { |
| nir_deref_instr *deref = nir_instr_as_deref(ssa->parent_instr); |
| assert(nir_deref_mode_is_one_of(deref, nir_var_uniform | nir_var_mem_ubo)); |
| |
| /* Get the uniform from the original shader. */ |
| nir_variable *var = nir_deref_instr_get_variable(deref); |
| assert(!(var->data.mode & nir_var_mem_ubo) || linkage->can_move_ubos); |
| |
| /* Declare the uniform in the target shader. If it's the same shader |
| * (in the case of replacing output loads with a uniform), this has |
| * no effect. If the variable already exists in the target shader, this |
| * just returns the existing one. |
| */ |
| var = nir_clone_uniform_variable(b->shader, var, linkage->spirv); |
| |
| if (deref->deref_type == nir_deref_type_var) { |
| clone = &nir_build_deref_var(b, var)->def; |
| } else { |
| nir_deref_instr *parent_orig = nir_deref_instr_parent(deref); |
| nir_deref_instr *parent_clone = |
| nir_instr_as_deref(clone_ssa_impl(linkage, b, &parent_orig->def) |
| ->parent_instr); |
| |
| switch (deref->deref_type) { |
| case nir_deref_type_array: { |
| nir_def *index = clone_ssa_impl(linkage, b, deref->arr.index.ssa); |
| clone = &nir_build_deref_array(b, parent_clone, index)->def; |
| break; |
| } |
| case nir_deref_type_struct: |
| clone = &nir_build_deref_struct(b, parent_clone, |
| deref->strct.index)->def; |
| break; |
| default: |
| unreachable("invalid deref type"); |
| } |
| } |
| break; |
| } |
| |
| default: |
| unreachable("unexpected instruction type"); |
| } |
| |
| _mesa_hash_table_insert(linkage->clones_ht, ssa->parent_instr, clone); |
| return clone; |
| } |
| |
| static nir_def * |
| clone_ssa(struct linkage_info *linkage, nir_builder *b, nir_def *ssa) |
| { |
| assert(!linkage->clones_ht); |
| linkage->clones_ht = _mesa_pointer_hash_table_create(NULL); |
| |
| nir_def *clone = clone_ssa_impl(linkage, b, ssa); |
| |
| _mesa_hash_table_destroy(linkage->clones_ht, NULL); |
| linkage->clones_ht = NULL; |
| return clone; |
| } |
| |
| /****************************************************************** |
| * UNIFORM EXPRESSION PROPAGATION (CONSTANTS, UNIFORMS, UBO LOADS) |
| ******************************************************************/ |
| |
| static void |
| remove_all_stores_and_clear_slot(struct linkage_info *linkage, unsigned slot, |
| nir_opt_varyings_progress *progress) |
| { |
| bool uses_xfb = false; |
| remove_all_stores(linkage, slot, &uses_xfb, progress); |
| clear_slot_info_after_removal(linkage, slot, uses_xfb); |
| } |
| |
| struct is_uniform_expr_state { |
| struct linkage_info *linkage; |
| unsigned cost; |
| }; |
| |
| static bool |
| is_uniform_expression(nir_instr *instr, struct is_uniform_expr_state *state); |
| |
| static bool |
| src_is_uniform_expression(nir_src *src, void *data) |
| { |
| return is_uniform_expression(src->ssa->parent_instr, |
| (struct is_uniform_expr_state*)data); |
| } |
| |
| /** |
| * Return whether instr is a uniform expression that can be moved into |
| * the next shader. |
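|  * A uniform expression sources only constants, undefs, uniforms, and UBO |
|  * loads, e.g. (illustrative GLSL) "u_scale * 2.0 + u_bias". |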
| */ |
| static bool |
| is_uniform_expression(nir_instr *instr, struct is_uniform_expr_state *state) |
| { |
| switch (instr->type) { |
| case nir_instr_type_load_const: |
| case nir_instr_type_undef: |
| return true; |
| |
| case nir_instr_type_alu: |
| break; |
| |
| case nir_instr_type_intrinsic: |
| if (nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_deref) |
| break; |
| return false; |
| |
| case nir_instr_type_deref: |
| if (!can_move_deref_between_shaders(state->linkage, instr)) |
| return false; |
| /* We need to iterate over the deref chain recursively. */ |
| break; |
| |
| default: |
| return false; |
| } |
| |
| if (!instr->pass_flags) { |
| state->cost += state->linkage->varying_estimate_instr_cost ? |
| state->linkage->varying_estimate_instr_cost(instr) : 1; |
| instr->pass_flags = 1; |
| return nir_foreach_src(instr, src_is_uniform_expression, state); |
| } |
| return true; |
| } |
| |
| /** |
| * Propagate constants, uniforms, UBO loads, and uniform expressions |
| * in output components to inputs loads in the next shader and output |
| * loads in the current stage, and remove the output components. |
| * |
| * Uniform expressions are ALU expressions only sourcing constants, uniforms, |
| * and UBO loads. |
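|  * |
|  * For example (illustrative), if the producer only ever stores |
|  * "2.0 * u_scale" to an output, every load of that varying is replaced by |
|  * a clone of the expression and the output stores are removed. |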
| */ |
| static void |
| propagate_uniform_expressions(struct linkage_info *linkage, |
| nir_opt_varyings_progress *progress) |
| { |
| unsigned i; |
| |
| /* Find uniform expressions. If there are multiple stores, they should all |
| * store the same value. That's guaranteed by output_equal_mask. |
| */ |
| BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) { |
| if (!can_optimize_varying(linkage, vec4_slot(i)).propagate_uniform_expr) |
| continue; |
| |
| struct scalar_slot *slot = &linkage->slot[i]; |
| assert(!list_is_empty(&slot->producer.loads) || |
| !list_is_empty(&slot->consumer.loads)); |
| |
| struct is_uniform_expr_state state = { |
| .linkage = linkage, |
| .cost = 0, |
| }; |
| |
| /* Clear pass_flags, which is used to prevent adding the cost of |
| * the same instruction multiple times. |
| */ |
| nir_shader_clear_pass_flags(linkage->producer_builder.shader); |
| |
| if (!is_uniform_expression(slot->producer.value, &state)) |
| continue; |
| |
| if (state.cost > linkage->max_varying_expression_cost) |
| continue; |
| |
| /* Colors can be propagated only if they are constant between [0, 1] |
| * because that's the only case when the clamp vertex color state has |
| * no effect. |
| */ |
| if (is_interpolated_color(linkage, i) && |
| (slot->producer.value->type != nir_instr_type_load_const || |
| nir_instr_as_load_const(slot->producer.value)->value[0].f32 < 0 || |
| nir_instr_as_load_const(slot->producer.value)->value[0].f32 > 1)) |
| continue; |
| |
| /* TEXn.zw can be propagated only if it's equal to (0, 1) because it's |
| * the coord replace value. |
| */ |
| if (is_interpolated_texcoord(linkage, i)) { |
| assert(i % 2 == 0); /* high 16-bit slots disallowed */ |
| |
| if (i % 8 == 0 || /* TEXn.x */ |
| i % 8 == 2 || /* TEXn.y */ |
| slot->producer.value->type != nir_instr_type_load_const) |
| continue; |
| |
| float value = |
| nir_instr_as_load_const(slot->producer.value)->value[0].f32; |
| |
| /* This ignores signed zeros, but those are destroyed by |
| * interpolation, so it doesn't matter. |
| */ |
| if ((i % 8 == 4 && value != 0) || |
| (i % 8 == 6 && value != 1)) |
| continue; |
| } |
| |
| /* Clear pass_flags, which is used by clone_ssa. */ |
| nir_shader_clear_pass_flags(linkage->producer_builder.shader); |
| |
| /* Replace all loads. Do that for both input and output loads. */ |
| for (unsigned list_index = 0; list_index < 2; list_index++) { |
| struct list_head *load = list_index ? &slot->producer.loads : |
| &slot->consumer.loads; |
| nir_builder *b = list_index ? &linkage->producer_builder : |
| &linkage->consumer_builder; |
| |
| list_for_each_entry(struct list_node, node, load, head) { |
| nir_intrinsic_instr *loadi = node->instr; |
| b->cursor = nir_before_instr(&loadi->instr); |
| |
| /* Copy the uniform expression before the load. */ |
| nir_def *clone = clone_ssa(linkage, b, |
| nir_instr_def(slot->producer.value)); |
| |
| /* Interpolation converts Infs to NaNs. If we skip it, we need to |
| * convert Infs to NaNs manually. |
| */ |
| if (loadi->intrinsic == nir_intrinsic_load_interpolated_input && |
| preserve_nans(b->shader, clone->bit_size)) |
| clone = build_convert_inf_to_nan(b, clone); |
| |
| /* Replace the original load. */ |
| nir_def_replace(&loadi->def, clone); |
| *progress |= list_index ? nir_progress_producer : |
| nir_progress_consumer; |
| } |
| } |
| |
| /* Clear the lists. */ |
| list_inithead(&slot->producer.loads); |
| list_inithead(&slot->consumer.loads); |
| |
| /* Remove all stores now that loads have been replaced. */ |
| remove_all_stores_and_clear_slot(linkage, i, progress); |
| } |
| } |
| |
| /****************************************************************** |
| * OUTPUT DEDUPLICATION |
| ******************************************************************/ |
| |
| /* We can only deduplicate outputs that have the same qualifier, and color |
| * components must be deduplicated separately because they are affected by GL |
| * states. |
| * |
| * QUAL_*_INTERP_ANY means that the interpolation qualifier doesn't matter for |
| * deduplication as long as it's not flat. |
| * |
| * QUAL_COLOR_SHADEMODEL_ANY is the same, but can be switched to flat |
| * by the flatshade state, so it can't be deduplicated with |
| * QUAL_COLOR_INTERP_ANY, which is never flat. |
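|  * |
|  * For example, two outputs that store the same value but are read with |
|  * different interpolation qualifiers in FS must not be deduplicated into |
|  * one varying. |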
| */ |
| enum var_qualifier { |
| QUAL_PATCH, |
| QUAL_VAR_FLAT, |
| QUAL_COLOR_FLAT, |
| QUAL_EXPLICIT, |
| QUAL_EXPLICIT_STRICT, |
| QUAL_PER_PRIMITIVE, |
| /* When nir_io_has_flexible_input_interpolation_except_flat is set: */ |
| QUAL_VAR_INTERP_ANY, |
| QUAL_COLOR_INTERP_ANY, |
| QUAL_COLOR_SHADEMODEL_ANY, |
| /* When nir_io_has_flexible_input_interpolation_except_flat is not set: */ |
| QUAL_VAR_PERSP_PIXEL, |
| QUAL_VAR_PERSP_CENTROID, |
| QUAL_VAR_PERSP_SAMPLE, |
| QUAL_VAR_LINEAR_PIXEL, |
| QUAL_VAR_LINEAR_CENTROID, |
| QUAL_VAR_LINEAR_SAMPLE, |
| QUAL_COLOR_PERSP_PIXEL, |
| QUAL_COLOR_PERSP_CENTROID, |
| QUAL_COLOR_PERSP_SAMPLE, |
| QUAL_COLOR_LINEAR_PIXEL, |
| QUAL_COLOR_LINEAR_CENTROID, |
| QUAL_COLOR_LINEAR_SAMPLE, |
| QUAL_COLOR_SHADEMODEL_PIXEL, |
| QUAL_COLOR_SHADEMODEL_CENTROID, |
| QUAL_COLOR_SHADEMODEL_SAMPLE, |
| NUM_DEDUP_QUALIFIERS, |
| |
| QUAL_SKIP, |
| QUAL_UNKNOWN, |
| }; |
| |
| /* Return the input qualifier if all loads use the same one, else skip. |
| * This is only used by output deduplication to determine input compatibility. |
| */ |
| static enum var_qualifier |
| get_input_qualifier(struct linkage_info *linkage, unsigned i) |
| { |
| assert(linkage->consumer_stage == MESA_SHADER_FRAGMENT); |
| struct scalar_slot *slot = &linkage->slot[i]; |
| bool is_color = is_interpolated_color(linkage, i); |
| nir_intrinsic_instr *load = |
| list_first_entry(&slot->consumer.loads, struct list_node, head)->instr; |
| |
| if (load->intrinsic == nir_intrinsic_load_input) |
| return is_color ? QUAL_COLOR_FLAT : QUAL_VAR_FLAT; |
| |
| if (load->intrinsic == nir_intrinsic_load_per_primitive_input) |
| return QUAL_PER_PRIMITIVE; |
| |
| if (load->intrinsic == nir_intrinsic_load_input_vertex) { |
| return nir_intrinsic_io_semantics(load).interp_explicit_strict ? |
| QUAL_EXPLICIT_STRICT : QUAL_EXPLICIT; |
| } |
| |
| assert(load->intrinsic == nir_intrinsic_load_interpolated_input); |
| nir_intrinsic_instr *baryc = |
| nir_instr_as_intrinsic(load->src[0].ssa->parent_instr); |
| |
| if (linkage->has_flexible_interp) { |
| if (is_color) { |
| return nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE ? |
| QUAL_COLOR_SHADEMODEL_ANY : QUAL_COLOR_INTERP_ANY; |
| } else { |
| return QUAL_VAR_INTERP_ANY; |
| } |
| } |
| |
| /* If interpolateAt{Centroid,Offset,Sample} is used, see if there is |
| * another load that doesn't use those, so that we get the real qualifier. |
| */ |
| if (baryc->intrinsic == nir_intrinsic_load_barycentric_centroid || |
| baryc->intrinsic == nir_intrinsic_load_barycentric_at_offset || |
| baryc->intrinsic == nir_intrinsic_load_barycentric_at_sample) { |
| list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) { |
| nir_intrinsic_instr *bar = |
| nir_instr_as_intrinsic(iter->instr->src[0].ssa->parent_instr); |
| |
| if (bar->intrinsic != nir_intrinsic_load_barycentric_centroid && |
| bar->intrinsic != nir_intrinsic_load_barycentric_at_offset && |
| bar->intrinsic != nir_intrinsic_load_barycentric_at_sample) { |
| baryc = bar; |
| break; |
| } |
| } |
| } |
| |
| /* Get the exact interpolation qualifier. */ |
| unsigned pixel_location; |
| enum var_qualifier qual; |
| |
| switch (baryc->intrinsic) { |
| case nir_intrinsic_load_barycentric_pixel: |
| pixel_location = 0; |
| break; |
| case nir_intrinsic_load_barycentric_centroid: |
| pixel_location = 1; |
| break; |
| case nir_intrinsic_load_barycentric_sample: |
| pixel_location = 2; |
| break; |
| case nir_intrinsic_load_barycentric_at_offset: |
| case nir_intrinsic_load_barycentric_at_sample: |
| /* Don't deduplicate outputs that are interpolated at offset/sample. */ |
| return QUAL_SKIP; |
| default: |
| unreachable("unexpected barycentric src"); |
| } |
| |
| switch (nir_intrinsic_interp_mode(baryc)) { |
| case INTERP_MODE_NONE: |
| qual = is_color ? QUAL_COLOR_SHADEMODEL_PIXEL : |
| QUAL_VAR_PERSP_PIXEL; |
| break; |
| case INTERP_MODE_SMOOTH: |
| qual = is_color ? QUAL_COLOR_PERSP_PIXEL : QUAL_VAR_PERSP_PIXEL; |
| break; |
| case INTERP_MODE_NOPERSPECTIVE: |
| qual = is_color ? QUAL_COLOR_LINEAR_PIXEL : QUAL_VAR_LINEAR_PIXEL; |
| break; |
| default: |
| unreachable("unexpected interp mode"); |
| } |
| |
| /* The ordering of the "qual" enum was carefully chosen to make this |
| * addition correct. |
| */ |
| STATIC_ASSERT(QUAL_VAR_PERSP_PIXEL + 1 == QUAL_VAR_PERSP_CENTROID); |
| STATIC_ASSERT(QUAL_VAR_PERSP_PIXEL + 2 == QUAL_VAR_PERSP_SAMPLE); |
| STATIC_ASSERT(QUAL_VAR_LINEAR_PIXEL + 1 == QUAL_VAR_LINEAR_CENTROID); |
| STATIC_ASSERT(QUAL_VAR_LINEAR_PIXEL + 2 == QUAL_VAR_LINEAR_SAMPLE); |
| STATIC_ASSERT(QUAL_COLOR_PERSP_PIXEL + 1 == QUAL_COLOR_PERSP_CENTROID); |
| STATIC_ASSERT(QUAL_COLOR_PERSP_PIXEL + 2 == QUAL_COLOR_PERSP_SAMPLE); |
| STATIC_ASSERT(QUAL_COLOR_LINEAR_PIXEL + 1 == QUAL_COLOR_LINEAR_CENTROID); |
| STATIC_ASSERT(QUAL_COLOR_LINEAR_PIXEL + 2 == QUAL_COLOR_LINEAR_SAMPLE); |
| STATIC_ASSERT(QUAL_COLOR_SHADEMODEL_PIXEL + 1 == |
| QUAL_COLOR_SHADEMODEL_CENTROID); |
| STATIC_ASSERT(QUAL_COLOR_SHADEMODEL_PIXEL + 2 == |
| QUAL_COLOR_SHADEMODEL_SAMPLE); |
| return qual + pixel_location; |
| } |
| |
| static void |
| deduplicate_outputs(struct linkage_info *linkage, |
| nir_opt_varyings_progress *progress) |
| { |
| struct hash_table *tables[NUM_DEDUP_QUALIFIERS] = {NULL}; |
| unsigned i; |
| |
| /* Find duplicated outputs. If there are multiple stores, they should all |
| * store the same value as all stores of some other output. That's |
| * guaranteed by output_equal_mask. |
| */ |
| BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) { |
| if (!can_optimize_varying(linkage, vec4_slot(i)).deduplicate) |
| continue; |
| |
| struct scalar_slot *slot = &linkage->slot[i]; |
| enum var_qualifier qualifier; |
| gl_varying_slot var_slot = vec4_slot(i); |
| |
| /* Determine which qualifier this slot has. */ |
| if ((var_slot >= VARYING_SLOT_PATCH0 && |
| var_slot <= VARYING_SLOT_PATCH31) || |
| var_slot == VARYING_SLOT_TESS_LEVEL_INNER || |
| var_slot == VARYING_SLOT_TESS_LEVEL_OUTER) |
| qualifier = QUAL_PATCH; |
| else if (linkage->consumer_stage != MESA_SHADER_FRAGMENT) |
| qualifier = QUAL_VAR_FLAT; |
| else |
| qualifier = get_input_qualifier(linkage, i); |
| |
| if (qualifier == QUAL_SKIP) |
| continue; |
| |
| struct hash_table **table = &tables[qualifier]; |
| if (!*table) |
| *table = _mesa_pointer_hash_table_create(NULL); |
| |
| nir_instr *value = slot->producer.value; |
| |
| struct hash_entry *entry = _mesa_hash_table_search(*table, value); |
| if (!entry) { |
| _mesa_hash_table_insert(*table, value, (void*)(uintptr_t)i); |
| continue; |
| } |
| |
| /* We've found a duplicate. Redirect loads and remove stores. */ |
| struct scalar_slot *found_slot = &linkage->slot[(uintptr_t)entry->data]; |
| nir_intrinsic_instr *store = |
| list_first_entry(&found_slot->producer.stores, |
| struct list_node, head)->instr; |
| nir_io_semantics sem = nir_intrinsic_io_semantics(store); |
| unsigned component = nir_intrinsic_component(store); |
| |
| /* Redirect loads. */ |
| for (unsigned list_index = 0; list_index < 2; list_index++) { |
| struct list_head *src_loads = list_index ? &slot->producer.loads : |
| &slot->consumer.loads; |
| struct list_head *dst_loads = list_index ? &found_slot->producer.loads : |
| &found_slot->consumer.loads; |
| bool has_progress = !list_is_empty(src_loads); |
| |
| list_for_each_entry(struct list_node, iter, src_loads, head) { |
| nir_intrinsic_instr *loadi = iter->instr; |
| |
| nir_intrinsic_set_io_semantics(loadi, sem); |
| nir_intrinsic_set_component(loadi, component); |
| |
| /* We also need to set the base to match the duplicate load, so |
| * that CSE can eliminate it. |
| */ |
| if (!list_is_empty(dst_loads)) { |
| struct list_node *first = |
| list_first_entry(dst_loads, struct list_node, head); |
| nir_intrinsic_set_base(loadi, nir_intrinsic_base(first->instr)); |
| } else { |
| /* Use the base of the found store if there are no loads (it can |
| * only happen with TCS). |
| */ |
| assert(list_index == 0); |
| nir_intrinsic_set_base(loadi, nir_intrinsic_base(store)); |
| } |
| } |
| |
| if (has_progress) { |
| /* Move the redirected loads to the found slot, so that compaction |
| * can find them. |
| */ |
| list_splicetail(src_loads, dst_loads); |
| list_inithead(src_loads); |
| |
| *progress |= list_index ? nir_progress_producer : |
| nir_progress_consumer; |
| } |
| } |
| |
| /* Remove all duplicated stores now that loads have been redirected. */ |
| remove_all_stores_and_clear_slot(linkage, i, progress); |
| } |
| |
| for (unsigned i = 0; i < ARRAY_SIZE(tables); i++) |
| _mesa_hash_table_destroy(tables[i], NULL); |
| } |
| |
| /****************************************************************** |
| * FIND OPEN-CODED TES INPUT INTERPOLATION |
| ******************************************************************/ |
| |
| static nir_alu_instr * |
| get_single_use_as_alu(nir_def *def) |
| { |
| /* Only 1 use allowed. */ |
| if (!list_is_singular(&def->uses)) |
| return NULL; |
| |
| nir_instr *instr = |
| nir_src_parent_instr(list_first_entry(&def->uses, nir_src, use_link)); |
| if (instr->type != nir_instr_type_alu) |
| return NULL; |
| |
| return nir_instr_as_alu(instr); |
| } |
| |
| static nir_alu_instr * |
| check_tes_input_load_get_single_use_alu(nir_intrinsic_instr *load, |
| unsigned *vertex_index, |
| unsigned *vertices_used, |
| unsigned max_vertices) |
| { |
| if (load->intrinsic != nir_intrinsic_load_per_vertex_input) |
| return NULL; |
| |
| /* Check the vertex index. Each vertex can be loaded only once. */ |
| if (!nir_src_is_const(load->src[0])) |
|       return NULL; |
| |
| *vertex_index = nir_src_as_uint(load->src[0]); |
| if (*vertex_index >= max_vertices || |
| *vertices_used & BITFIELD_BIT(*vertex_index)) |
|       return NULL; |
| |
| *vertices_used |= BITFIELD_BIT(*vertex_index); |
| |
| return get_single_use_as_alu(&load->def); |
| } |
| |
| static bool |
| gather_fmul_tess_coord(nir_intrinsic_instr *load, nir_alu_instr *fmul, |
| unsigned vertex_index, unsigned *tess_coord_swizzle, |
| unsigned *tess_coord_used, nir_def **load_tess_coord) |
| { |
| unsigned other_src = fmul->src[0].src.ssa == &load->def; |
| nir_instr *other_instr = fmul->src[other_src].src.ssa->parent_instr; |
| |
| assert(fmul->src[!other_src].swizzle[0] == 0); |
| |
| if (!is_sysval(other_instr, SYSTEM_VALUE_TESS_COORD)) |
| return false; |
| |
| unsigned tess_coord_component = fmul->src[other_src].swizzle[0]; |
| /* Each tesscoord component can be used only once. */ |
| if (*tess_coord_used & BITFIELD_BIT(tess_coord_component)) |
| return false; |
| |
| *tess_coord_swizzle |= tess_coord_component << (4 * vertex_index); |
| *tess_coord_used |= BITFIELD_BIT(tess_coord_component); |
| *load_tess_coord = &nir_instr_as_intrinsic(other_instr)->def; |
| return true; |
| } |
| |
| /** |
| * Find interpolation of the form: |
| * input[0].slot * TessCoord.a + |
| * input[1].slot * TessCoord.b + |
| * input[2].slot * TessCoord.c; |
| * |
| * a,b,c can be any of x,y,z, but each can occur only once. |
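|  * |
|  * e.g. hand-written barycentric interpolation in TES source (illustrative |
|  * GLSL): |
|  *    v = gl_TessCoord.x * in_v[0] + |
|  *        gl_TessCoord.y * in_v[1] + |
|  *        gl_TessCoord.z * in_v[2]; |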
| */ |
| static bool |
| find_tes_triangle_interp_3fmul_2fadd(struct linkage_info *linkage, unsigned i) |
| { |
| struct scalar_slot *slot = &linkage->slot[i]; |
| unsigned vertices_used = 0; |
| unsigned tess_coord_used = 0; |
| unsigned tess_coord_swizzle = 0; |
| unsigned num_fmuls = 0, num_fadds = 0; |
| nir_alu_instr *fadds[2]; |
| nir_def *load_tess_coord = NULL; |
| |
| /* Find 3 multiplications by TessCoord and their uses, which must be |
| * fadds. |
| */ |
| list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) { |
| unsigned vertex_index; |
| nir_alu_instr *fmul = |
| check_tes_input_load_get_single_use_alu(iter->instr, &vertex_index, |
| &vertices_used, 3); |
|       /* At most 3 fmuls are expected. Also reject exact ops because we |
|        * are going to apply an inexact transformation to them. |
| */ |
| if (!fmul || fmul->op != nir_op_fmul || fmul->exact || num_fmuls == 3 || |
| !gather_fmul_tess_coord(iter->instr, fmul, vertex_index, |
| &tess_coord_swizzle, &tess_coord_used, |
| &load_tess_coord)) |
| return false; |
| |
| num_fmuls++; |
| |
| /* The multiplication must only be used by fadd. Also reject exact ops. |
| */ |
| nir_alu_instr *fadd = get_single_use_as_alu(&fmul->def); |
| if (!fadd || fadd->op != nir_op_fadd || fadd->exact) |
| return false; |
| |
| /* The 3 fmuls must only be used by 2 fadds. */ |
| unsigned i; |
| for (i = 0; i < num_fadds; i++) { |
| if (fadds[i] == fadd) |
| break; |
| } |
| if (i == num_fadds) { |
| if (num_fadds == 2) |
| return false; |
| |
| fadds[num_fadds++] = fadd; |
| } |
| } |
| |
| if (num_fmuls != 3 || num_fadds != 2) |
| return false; |
| |
| assert(tess_coord_used == 0x7); |
| |
| /* We have found that the only uses of the 3 fmuls are 2 fadds, which |
| * implies that at least 2 fmuls are used by the same fadd. |
| * |
| * Check that 1 fadd is used by the other fadd, which can only be |
| * the result of the TessCoord interpolation. |
| */ |
| for (unsigned i = 0; i < 2; i++) { |
| if (get_single_use_as_alu(&fadds[i]->def) == fadds[!i]) { |
| switch (tess_coord_swizzle) { |
| case 0x210: |
| slot->consumer.tes_interp_load = fadds[!i]; |
| slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_UVW; |
| slot->consumer.tes_load_tess_coord = load_tess_coord; |
| return true; |
| |
| case 0x102: |
| slot->consumer.tes_interp_load = fadds[!i]; |
| slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_WUV; |
| slot->consumer.tes_load_tess_coord = load_tess_coord; |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
| } |
| |
| return false; |
| } |
| |
| /** |
| * Find interpolation of the form: |
| * fma(input[0].slot, TessCoord.a, |
| * fma(input[1].slot, TessCoord.b, |
| * input[2].slot * TessCoord.c)) |
| * |
| * a,b,c can be any of x,y,z, but each can occur only once. |
| */ |
| static bool |
| find_tes_triangle_interp_1fmul_2ffma(struct linkage_info *linkage, unsigned i) |
| { |
| struct scalar_slot *slot = &linkage->slot[i]; |
| unsigned vertices_used = 0; |
| unsigned tess_coord_used = 0; |
| unsigned tess_coord_swizzle = 0; |
| unsigned num_fmuls = 0, num_ffmas = 0; |
| nir_alu_instr *ffmas[2], *fmul = NULL; |
| nir_def *load_tess_coord = NULL; |
| |
| list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) { |
| unsigned vertex_index; |
| nir_alu_instr *alu = |
| check_tes_input_load_get_single_use_alu(iter->instr, &vertex_index, |
| &vertices_used, 3); |
| |
| /* Reject exact ops because we are going to do an inexact transformation |
| * with it. |
| */ |
| if (!alu || (alu->op != nir_op_fmul && alu->op != nir_op_ffma) || |
| alu->exact || |
| !gather_fmul_tess_coord(iter->instr, alu, vertex_index, |
| &tess_coord_swizzle, &tess_coord_used, |
| &load_tess_coord)) |
| return false; |
| |
| /* The multiplication must only be used by ffma. */ |
| if (alu->op == nir_op_fmul) { |
| nir_alu_instr *ffma = get_single_use_as_alu(&alu->def); |
| if (!ffma || ffma->op != nir_op_ffma) |
| return false; |
| |
| if (num_fmuls == 1) |
| return false; |
| |
| fmul = alu; |
| num_fmuls++; |
| } else { |
| if (num_ffmas == 2) |
| return false; |
| |
| ffmas[num_ffmas++] = alu; |
| } |
| } |
| |
| if (num_fmuls != 1 || num_ffmas != 2) |
| return false; |
| |
| assert(tess_coord_used == 0x7); |
| |
| /* We have found that the fmul has only 1 use and that use is an ffma, |
| * and that there are 2 ffmas. Fail if neither ffma uses the fmul. |
| */ |
| if (ffmas[0]->src[2].src.ssa != &fmul->def && |
| ffmas[1]->src[2].src.ssa != &fmul->def) |
| return false; |
| |
| /* If one ffma is using the other ffma, it's guaranteed to be src[2]. */ |
| for (unsigned i = 0; i < 2; i++) { |
| if (get_single_use_as_alu(&ffmas[i]->def) == ffmas[!i]) { |
| switch (tess_coord_swizzle) { |
| case 0x210: |
| slot->consumer.tes_interp_load = ffmas[!i]; |
| slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_UVW; |
| slot->consumer.tes_load_tess_coord = load_tess_coord; |
| return true; |
| |
| case 0x102: |
| slot->consumer.tes_interp_load = ffmas[!i]; |
| slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_WUV; |
| slot->consumer.tes_load_tess_coord = load_tess_coord; |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
| } |
| |
| return false; |
| } |
| |
| static void |
| find_open_coded_tes_input_interpolation(struct linkage_info *linkage) |
| { |
| if (linkage->consumer_stage != MESA_SHADER_TESS_EVAL) |
| return; |
| |
| unsigned i; |
| BITSET_FOREACH_SET(i, linkage->flat32_mask, NUM_SCALAR_SLOTS) { |
| if (vec4_slot(i) >= VARYING_SLOT_PATCH0 && |
| vec4_slot(i) <= VARYING_SLOT_PATCH31) |
| continue; |
| if (find_tes_triangle_interp_3fmul_2fadd(linkage, i)) |
| continue; |
| if (find_tes_triangle_interp_1fmul_2ffma(linkage, i)) |
| continue; |
| } |
| |
| BITSET_FOREACH_SET(i, linkage->flat16_mask, NUM_SCALAR_SLOTS) { |
| if (vec4_slot(i) >= VARYING_SLOT_PATCH0 && |
| vec4_slot(i) <= VARYING_SLOT_PATCH31) |
| continue; |
| if (find_tes_triangle_interp_3fmul_2fadd(linkage, i)) |
| continue; |
| if (find_tes_triangle_interp_1fmul_2ffma(linkage, i)) |
| continue; |
| } |
| } |
| |
| /****************************************************************** |
| * BACKWARD INTER-SHADER CODE MOTION |
| ******************************************************************/ |
| |
| #define NEED_UPDATE_MOVABLE_FLAGS(instr) \ |
| (!((instr)->pass_flags & (FLAG_MOVABLE | FLAG_UNMOVABLE))) |
| |
| #define GET_SRC_INTERP(alu, i) \ |
| ((alu)->src[i].src.ssa->parent_instr->pass_flags & FLAG_INTERP_MASK) |
| |
| static bool |
| can_move_alu_across_interp(struct linkage_info *linkage, nir_alu_instr *alu) |
| { |
| /* Exact ALUs can't be moved across interpolation. */ |
| if (alu->exact) |
| return false; |
| |
| /* Interpolation converts Infs to NaNs. If we turn the result of an ALU |
| * instruction into a new interpolated input, Infs are converted to NaNs |
| * for that result, while the Inf-to-NaN conversion is removed from the |
| * interpolated values that the instruction sources. We can't do that if |
| * Infs and NaNs must be preserved. |
| */ |
| if (preserve_infs_nans(linkage->consumer_builder.shader, alu->def.bit_size)) |
| return false; |
| |
| switch (alu->op) { |
| /* Always legal if the sources are interpolated identically because: |
| * interp(x, i, j) + interp(y, i, j) = interp(x + y, i, j) |
| * interp(x, i, j) + convergent_expr = interp(x + convergent_expr, i, j) |
| */ |
| case nir_op_fadd: |
| case nir_op_fsub: |
| /* This is the same as multiplying by -1, which is always legal, see fmul. |
| */ |
| case nir_op_fneg: |
| case nir_op_mov: |
| return true; |
| |
| /* At least one side of the multiplication must be convergent because this |
| * is the only equation with multiplication that is true: |
| * interp(x, i, j) * convergent_expr = interp(x * convergent_expr, i, j) |
| */ |
| case nir_op_fmul: |
| case nir_op_fmulz: |
| case nir_op_ffma: |
| case nir_op_ffmaz: |
| return GET_SRC_INTERP(alu, 0) == FLAG_INTERP_CONVERGENT || |
| GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT; |
| |
| case nir_op_fdiv: |
| /* The divisor (right side) must be convergent, which then follows the |
| * fmul rule: dividing by a convergent expression is the same as |
| * multiplying by its (also convergent) reciprocal. |
| */ |
| return GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT; |
| |
| case nir_op_flrp: |
| /* Follows the fmul rule: either both endpoints are convergent, or |
| * the interpolation factor (src2) is convergent. |
| */ |
| return (GET_SRC_INTERP(alu, 0) == FLAG_INTERP_CONVERGENT && |
| GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT) || |
| GET_SRC_INTERP(alu, 2) == FLAG_INTERP_CONVERGENT; |
| |
| default: |
| /* Moving other ALU instructions across interpolation is illegal. */ |
| return false; |
| } |
| } |
| |
| /* Determine whether an instruction is movable from the consumer to |
| * the producer. Also determine which interpolation modes each ALU instruction |
| * should use if its value was promoted to a new input. |
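| * |
| * A source with convergent interpolation is compatible with any other |
| * qualifier, while two different non-convergent qualifiers (e.g. flat |
| * and perspective-pixel) conflict and make the instruction unmovable. |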
| */ |
| static void |
| update_movable_flags(struct linkage_info *linkage, nir_instr *instr) |
| { |
| /* This function shouldn't be called more than once for each instruction |
| * to minimize recursive calling. |
| */ |
| assert(NEED_UPDATE_MOVABLE_FLAGS(instr)); |
| |
| switch (instr->type) { |
| case nir_instr_type_undef: |
| case nir_instr_type_load_const: |
| /* Treat constants as convergent, which means compatible with both flat |
| * and non-flat inputs. |
| */ |
| instr->pass_flags |= FLAG_MOVABLE | FLAG_INTERP_CONVERGENT; |
| return; |
| |
| case nir_instr_type_alu: { |
| nir_alu_instr *alu = nir_instr_as_alu(instr); |
| unsigned num_srcs = nir_op_infos[alu->op].num_inputs; |
| unsigned alu_interp; |
| |
| /* Make vector ops unmovable. They are technically movable but more |
| * complicated, and NIR should be scalarized for this pass anyway. |
| * The only remaining vector ops should be vecN for intrinsic sources. |
| */ |
| if (alu->def.num_components > 1) { |
| instr->pass_flags |= FLAG_UNMOVABLE; |
| return; |
| } |
| |
| alu_interp = FLAG_INTERP_CONVERGENT; |
| |
| for (unsigned i = 0; i < num_srcs; i++) { |
| nir_instr *src_instr = alu->src[i].src.ssa->parent_instr; |
| |
| if (NEED_UPDATE_MOVABLE_FLAGS(src_instr)) |
| update_movable_flags(linkage, src_instr); |
| |
| if (src_instr->pass_flags & FLAG_UNMOVABLE) { |
| instr->pass_flags |= FLAG_UNMOVABLE; |
| return; |
| } |
| |
| /* Determine which interpolation mode this ALU instruction should |
| * use if it was promoted to a new input. |
| */ |
| unsigned src_interp = src_instr->pass_flags & FLAG_INTERP_MASK; |
| |
| if (alu_interp == src_interp || |
| src_interp == FLAG_INTERP_CONVERGENT) { |
| /* Nothing to do. */ |
| } else if (alu_interp == FLAG_INTERP_CONVERGENT) { |
| alu_interp = src_interp; |
| } else { |
| assert(alu_interp != FLAG_INTERP_CONVERGENT && |
| src_interp != FLAG_INTERP_CONVERGENT && |
| alu_interp != src_interp); |
| /* The ALU instruction sources conflicting interpolation flags. |
| * It can never become a new input. |
| */ |
| instr->pass_flags |= FLAG_UNMOVABLE; |
| return; |
| } |
| } |
| |
| /* Check if we can move the ALU instruction across an interpolated |
| * load into the previous shader. |
| */ |
| if (alu_interp > FLAG_INTERP_FLAT && |
| !can_move_alu_across_interp(linkage, alu)) { |
| instr->pass_flags |= FLAG_UNMOVABLE; |
| return; |
| } |
| |
| instr->pass_flags |= FLAG_MOVABLE | alu_interp; |
| return; |
| } |
| |
| case nir_instr_type_intrinsic: { |
| /* Movable input loads already have FLAG_MOVABLE on them. |
| * Unmovable input loads skipped by initialization get UNMOVABLE here. |
| * (e.g. colors, texcoords) |
| * |
| * The only other movable intrinsic is load_deref for uniforms and UBOs. |
| * Other intrinsics are not movable. |
| */ |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| |
| if (intr->intrinsic == nir_intrinsic_load_deref) { |
| nir_instr *deref = intr->src[0].ssa->parent_instr; |
| |
| if (NEED_UPDATE_MOVABLE_FLAGS(deref)) |
| update_movable_flags(linkage, deref); |
| |
| instr->pass_flags |= deref->pass_flags; |
| return; |
| } |
| |
| instr->pass_flags |= FLAG_UNMOVABLE; |
| return; |
| } |
| |
| case nir_instr_type_deref: { |
| if (!can_move_deref_between_shaders(linkage, instr)) { |
| instr->pass_flags |= FLAG_UNMOVABLE; |
| return; |
| } |
| |
| nir_deref_instr *deref = nir_instr_as_deref(instr); |
| nir_deref_instr *parent = nir_deref_instr_parent(deref); |
| |
| if (parent) { |
| if (NEED_UPDATE_MOVABLE_FLAGS(&parent->instr)) |
| update_movable_flags(linkage, &parent->instr); |
| |
| if (parent->instr.pass_flags & FLAG_UNMOVABLE) { |
| instr->pass_flags |= FLAG_UNMOVABLE; |
| return; |
| } |
| } |
| |
| switch (deref->deref_type) { |
| case nir_deref_type_var: |
| instr->pass_flags |= FLAG_MOVABLE; |
| return; |
| |
| case nir_deref_type_struct: |
| assert(parent->instr.pass_flags & FLAG_MOVABLE); |
| instr->pass_flags |= parent->instr.pass_flags; |
| return; |
| |
| case nir_deref_type_array: { |
| nir_instr *index = deref->arr.index.ssa->parent_instr; |
| |
| if (NEED_UPDATE_MOVABLE_FLAGS(index)) |
| update_movable_flags(linkage, index); |
| |
| /* Integer array indices should be movable only if they are |
| * convergent or flat. |
| */ |
| ASSERTED unsigned index_interp = index->pass_flags & FLAG_INTERP_MASK; |
| assert(index->pass_flags & FLAG_UNMOVABLE || |
| (index_interp == FLAG_INTERP_CONVERGENT || |
| index_interp == FLAG_INTERP_FLAT)); |
| |
| if (parent) { |
| unsigned parent_interp = parent->instr.pass_flags & FLAG_INTERP_MASK; |
| |
| /* Check if the interpolation flags are compatible. */ |
| if (parent_interp != FLAG_INTERP_CONVERGENT && |
| index_interp != FLAG_INTERP_CONVERGENT && |
| parent_interp != index_interp) { |
| instr->pass_flags |= FLAG_UNMOVABLE; |
| return; |
| } |
| |
| /* Pick the one that isn't convergent because convergent inputs |
| * can be in expressions with any other qualifier. |
| */ |
| if (parent_interp == FLAG_INTERP_CONVERGENT) |
| instr->pass_flags |= index->pass_flags; |
| else |
| instr->pass_flags |= parent->instr.pass_flags; |
| } else { |
| instr->pass_flags |= index->pass_flags; |
| } |
| return; |
| } |
| |
| default: |
| instr->pass_flags |= FLAG_UNMOVABLE; |
| return; |
| } |
| } |
| |
| default: |
| instr->pass_flags |= FLAG_UNMOVABLE; |
| return; |
| } |
| } |
| |
| /* Gather the input loads used by the post-dominator using DFS. */ |
| static void |
| gather_used_input_loads(nir_instr *instr, |
| nir_intrinsic_instr *loads[NUM_SCALAR_SLOTS], |
| unsigned *num_loads) |
| { |
| switch (instr->type) { |
| case nir_instr_type_undef: |
| case nir_instr_type_load_const: |
| return; |
| |
| case nir_instr_type_alu: { |
| nir_alu_instr *alu = nir_instr_as_alu(instr); |
| unsigned num_srcs = nir_op_infos[alu->op].num_inputs; |
| |
| for (unsigned i = 0; i < num_srcs; i++) { |
| gather_used_input_loads(alu->src[i].src.ssa->parent_instr, |
| loads, num_loads); |
| } |
| return; |
| } |
| |
| case nir_instr_type_intrinsic: { |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| |
| switch (intr->intrinsic) { |
| case nir_intrinsic_load_tess_coord: |
| return; |
| |
| case nir_intrinsic_load_deref: |
| gather_used_input_loads(intr->src[0].ssa->parent_instr, |
| loads, num_loads); |
| return; |
| |
| case nir_intrinsic_load_input: |
| case nir_intrinsic_load_per_vertex_input: |
| case nir_intrinsic_load_interpolated_input: |
| if (!(intr->instr.pass_flags & FLAG_GATHER_LOADS_VISITED)) { |
| assert(*num_loads < NUM_SCALAR_SLOTS*8); |
| loads[(*num_loads)++] = intr; |
| intr->instr.pass_flags |= FLAG_GATHER_LOADS_VISITED; |
| } |
| return; |
| |
| default: |
| printf("%u\n", intr->intrinsic); |
| unreachable("unexpected intrinsic"); |
| } |
| } |
| |
| case nir_instr_type_deref: { |
| nir_deref_instr *deref = nir_instr_as_deref(instr); |
| nir_deref_instr *parent = nir_deref_instr_parent(deref); |
| |
| if (parent) |
| gather_used_input_loads(&parent->instr, loads, num_loads); |
| |
| switch (deref->deref_type) { |
| case nir_deref_type_var: |
| case nir_deref_type_struct: |
| return; |
| |
| case nir_deref_type_array: |
| gather_used_input_loads(deref->arr.index.ssa->parent_instr, |
| loads, num_loads); |
| return; |
| |
| default: |
| unreachable("unexpected deref type"); |
| } |
| } |
| |
| default: |
| unreachable("unexpected instr type"); |
| } |
| } |
| |
| /* Move a post-dominator, which is an ALU opcode, into the previous shader, |
| * and replace the post-dominator with a new input load. |
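| * |
| * For example, if the consumer only uses inputs A and B as "A + B", the |
| * fadd (the post-dominator of both loads) is cloned at the end of the |
| * producer block that contains the output stores, the store of A is |
| * moved after it and rewritten to store the sum, the now-dead store of B |
| * is removed, and the consumer simply loads the sum from A's slot. |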
| */ |
| static bool |
| try_move_postdominator(struct linkage_info *linkage, |
| struct nir_use_dominance_state *postdom_state, |
| nir_instr *postdom, |
| nir_def *load_def, |
| nir_intrinsic_instr *first_load, |
| nir_opt_varyings_progress *progress) |
| { |
| #define PRINT 0 |
| #if PRINT |
| printf("Trying to move post-dom: "); |
| nir_print_instr(postdom, stdout); |
| puts(""); |
| #endif |
| |
| /* Gather the input loads used by the post-dominator using DFS. */ |
| nir_intrinsic_instr *loads[NUM_SCALAR_SLOTS*8]; |
| unsigned num_loads = 0; |
| gather_used_input_loads(postdom, loads, &num_loads); |
| assert(num_loads && "no loads were gathered"); |
| |
| /* Clear the flag set by gather_used_input_loads. */ |
| for (unsigned i = 0; i < num_loads; i++) |
| loads[i]->instr.pass_flags &= ~FLAG_GATHER_LOADS_VISITED; |
| |
| /* For all the loads, the previous shader must have the corresponding |
| * output stores in the same basic block because we are going to replace |
| * them with 1 store. Only TCS and GS can have stores of different outputs |
| * in different blocks. |
| */ |
| nir_block *block = NULL; |
| |
| for (unsigned i = 0; i < num_loads; i++) { |
| unsigned slot_index = intr_get_scalar_16bit_slot(loads[i]); |
| struct scalar_slot *slot = &linkage->slot[slot_index]; |
| |
| assert(list_is_singular(&slot->producer.stores)); |
| nir_intrinsic_instr *store = |
| list_first_entry(&slot->producer.stores, struct list_node, |
| head)->instr; |
| |
| if (!block) { |
| block = store->instr.block; |
| continue; |
| } |
| if (block != store->instr.block) |
| return false; |
| } |
| |
| assert(block); |
| |
| #if PRINT |
| printf("Post-dom accepted: "); |
| nir_print_instr(postdom, stdout); |
| puts("\n"); |
| #endif |
| |
| /* Determine the scalar slot index of the new varying. It will reuse |
| * the slot of the load we started from because the load will be |
| * removed. |
| */ |
| unsigned final_slot = intr_get_scalar_16bit_slot(first_load); |
| |
| /* Replace the post-dominator in the consumer with a new input load. |
| * Since we are reusing the same slot as the first load and it has |
| * the right interpolation qualifiers, use it as the new load by using |
| * it in place of the post-dominator. |
| * |
| * Boolean post-dominators are upcast in the producer and then downcast |
| * in the consumer. |
| */ |
| unsigned slot_index = final_slot; |
| struct scalar_slot *slot = &linkage->slot[slot_index]; |
| nir_builder *b = &linkage->consumer_builder; |
| b->cursor = nir_after_instr(load_def->parent_instr); |
| nir_def *postdom_def = nir_instr_def(postdom); |
| unsigned alu_interp = postdom->pass_flags & FLAG_INTERP_MASK; |
| nir_def *new_input, *new_tes_loads[3]; |
| BITSET_WORD *mask; |
| |
| /* Convergent instruction results that are not interpolatable (integer or |
| * FP64) should not be moved because compaction can relocate convergent |
| * varyings to interpolated vec4 slots; the definition of convergent |
| * varyings implies that they can be interpolated, which doesn't work with |
| * integer and FP64 values. |
| * |
| * Check the result type: if it's not float and the driver doesn't |
| * support convergent flat loads from interpolated vec4 slots, don't move |
| * it. |
| */ |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && |
| alu_interp == FLAG_INTERP_CONVERGENT && |
| !linkage->can_mix_convergent_flat_with_interpolated && |
| (postdom->type != nir_instr_type_alu || |
| (postdom_def->bit_size != 16 && postdom_def->bit_size != 32) || |
| !(nir_op_infos[nir_instr_as_alu(postdom)->op].output_type & nir_type_float))) |
| return false; |
| |
| /* NIR can't do 1-bit inputs. Convert them to a bigger size. */ |
| assert(postdom_def->bit_size & (1 | 16 | 32)); |
| unsigned new_bit_size = postdom_def->bit_size; |
| |
| if (new_bit_size == 1) { |
| assert(alu_interp == FLAG_INTERP_CONVERGENT || |
| alu_interp == FLAG_INTERP_FLAT); |
| /* TODO: We could use 16 bits instead, but that currently fails on AMD. |
| */ |
| new_bit_size = 32; |
| } |
| |
| bool rewrite_convergent_to_flat = |
| alu_interp == FLAG_INTERP_CONVERGENT && |
| linkage->can_mix_convergent_flat_with_interpolated; |
| |
| /* Create the new input load. This creates a new load (or a series of |
| * loads in case of open-coded TES interpolation) that's identical to |
| * the original load(s). |
| */ |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && |
| alu_interp != FLAG_INTERP_FLAT && !rewrite_convergent_to_flat) { |
| nir_def *baryc = NULL; |
| |
| /* Determine the barycentric coordinates. */ |
| switch (alu_interp) { |
| case FLAG_INTERP_PERSP_PIXEL: |
| case FLAG_INTERP_LINEAR_PIXEL: |
| baryc = nir_load_barycentric_pixel(b, 32); |
| break; |
| case FLAG_INTERP_PERSP_CENTROID: |
| case FLAG_INTERP_LINEAR_CENTROID: |
| baryc = nir_load_barycentric_centroid(b, 32); |
| break; |
| case FLAG_INTERP_PERSP_SAMPLE: |
| case FLAG_INTERP_LINEAR_SAMPLE: |
| baryc = nir_load_barycentric_sample(b, 32); |
| break; |
| default: |
| baryc = first_load->src[0].ssa; |
| break; |
| } |
| |
| if (baryc != first_load->src[0].ssa) { |
| nir_intrinsic_instr *baryc_i = |
| nir_instr_as_intrinsic(baryc->parent_instr); |
| |
| if (alu_interp == FLAG_INTERP_LINEAR_PIXEL || |
| alu_interp == FLAG_INTERP_LINEAR_CENTROID || |
| alu_interp == FLAG_INTERP_LINEAR_SAMPLE) |
| nir_intrinsic_set_interp_mode(baryc_i, INTERP_MODE_NOPERSPECTIVE); |
| else |
| nir_intrinsic_set_interp_mode(baryc_i, INTERP_MODE_SMOOTH); |
| } |
| |
| new_input = nir_load_interpolated_input( |
| b, 1, new_bit_size, baryc, nir_imm_int(b, 0), |
| .base = nir_intrinsic_base(first_load), |
| .component = nir_intrinsic_component(first_load), |
| .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) | |
| new_bit_size, |
| .io_semantics = nir_intrinsic_io_semantics(first_load)); |
| |
| if (alu_interp == FLAG_INTERP_CONVERGENT) { |
| mask = new_bit_size == 16 ? linkage->convergent16_mask |
| : linkage->convergent32_mask; |
| } else if (linkage->has_flexible_interp) { |
| mask = new_bit_size == 16 ? linkage->interp_fp16_mask |
| : linkage->interp_fp32_mask; |
| } else { |
| /* The index of the qualifier is encoded in alu_interp, so extract it. */ |
| unsigned i = (alu_interp - FLAG_INTERP_PERSP_PIXEL) >> 5; |
| mask = new_bit_size == 16 ? linkage->interp_fp16_qual_masks[i] |
| : linkage->interp_fp32_qual_masks[i]; |
| } |
| } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL && |
| alu_interp > FLAG_INTERP_FLAT) { |
| nir_def *zero = nir_imm_int(b, 0); |
| |
| for (unsigned i = 0; i < 3; i++) { |
| new_tes_loads[i] = |
| nir_load_per_vertex_input(b, 1, new_bit_size, |
| i ? nir_imm_int(b, i) : zero, zero, |
| .base = nir_intrinsic_base(first_load), |
| .component = nir_intrinsic_component(first_load), |
| .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) | |
| new_bit_size, |
| .io_semantics = nir_intrinsic_io_semantics(first_load)); |
| } |
| |
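| /* remap[i] selects the TessCoord channel that multiplies the i-th |
| * per-vertex load below, reproducing the UVW or WUV order that was |
| * detected by find_tes_triangle_interp_*. |
| */ |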
| int remap_uvw[3] = {0, 1, 2}; |
| int remap_wuv[3] = {2, 0, 1}; |
| int *remap; |
| |
| switch (alu_interp) { |
| case FLAG_INTERP_TES_TRIANGLE_UVW: |
| remap = remap_uvw; |
| break; |
| case FLAG_INTERP_TES_TRIANGLE_WUV: |
| remap = remap_wuv; |
| break; |
| default: |
| unreachable("invalid TES interpolation mode"); |
| } |
| |
| nir_def *tesscoord = slot->consumer.tes_load_tess_coord; |
| nir_def *defs[3]; |
| |
| for (unsigned i = 0; i < 3; i++) { |
| if (i == 0) { |
| defs[i] = nir_fmul(b, new_tes_loads[i], |
| nir_channel(b, tesscoord, remap[i])); |
| } else { |
| defs[i] = nir_ffma(b, new_tes_loads[i], |
| nir_channel(b, tesscoord, remap[i]), |
| defs[i - 1]); |
| } |
| } |
| new_input = defs[2]; |
| |
| mask = new_bit_size == 16 ? linkage->flat16_mask |
| : linkage->flat32_mask; |
| } else { |
| /* We have to rewrite convergent to flat here and not during compaction |
| * because compaction adds code to convert Infs to NaNs for |
| * "load_interpolated_input -> load_input" replacements, which corrupts |
| * integer data. |
| */ |
| assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT || |
| alu_interp == FLAG_INTERP_FLAT || rewrite_convergent_to_flat); |
| |
| new_input = |
| nir_load_input(b, 1, new_bit_size, nir_imm_int(b, 0), |
| .base = nir_intrinsic_base(first_load), |
| .component = nir_intrinsic_component(first_load), |
| .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) | |
| new_bit_size, |
| .io_semantics = nir_intrinsic_io_semantics(first_load)); |
| |
| mask = new_bit_size == 16 ? linkage->flat16_mask |
| : linkage->flat32_mask; |
| |
| if (rewrite_convergent_to_flat) { |
| mask = new_bit_size == 16 ? linkage->convergent16_mask |
| : linkage->convergent32_mask; |
| } |
| } |
| |
| assert(!BITSET_TEST(linkage->no_varying32_mask, slot_index)); |
| assert(!BITSET_TEST(linkage->no_varying16_mask, slot_index)); |
| |
| /* Re-set the category of the new scalar input. This will cause |
| * the compaction to treat it as a different type, so that it will be moved |
| * into the vec4 that has compatible interpolation qualifiers. |
| * |
| * This shouldn't be done if the slot is not present in any of the masks, |
| * which indicates that compaction is disallowed for it. |
| */ |
| if (BITSET_TEST(linkage->interp_fp32_mask, slot_index) || |
| BITSET_TEST(linkage->interp_fp16_mask, slot_index) || |
| BITSET6_TEST_ANY(linkage->interp_fp32_qual_masks, slot_index) || |
| BITSET6_TEST_ANY(linkage->interp_fp16_qual_masks, slot_index) || |
| BITSET_TEST(linkage->flat32_mask, slot_index) || |
| BITSET_TEST(linkage->flat16_mask, slot_index) || |
| BITSET_TEST(linkage->convergent32_mask, slot_index) || |
| BITSET_TEST(linkage->convergent16_mask, slot_index)) { |
| BITSET_CLEAR(linkage->interp_fp32_mask, slot_index); |
| for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) |
| BITSET_CLEAR(linkage->interp_fp32_qual_masks[i], slot_index); |
| BITSET_CLEAR(linkage->interp_fp16_mask, slot_index); |
| for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) |
| BITSET_CLEAR(linkage->interp_fp16_qual_masks[i], slot_index); |
| BITSET_CLEAR(linkage->flat16_mask, slot_index); |
| BITSET_CLEAR(linkage->flat32_mask, slot_index); |
| BITSET_CLEAR(linkage->convergent16_mask, slot_index); |
| BITSET_CLEAR(linkage->convergent32_mask, slot_index); |
| BITSET_SET(mask, slot_index); |
| } |
| |
| /* Replace the existing load with the new load in the slot. */ |
| if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL && |
| alu_interp >= FLAG_INTERP_TES_TRIANGLE_UVW) { |
| /* For TES, replace all 3 loads. */ |
| unsigned i = 0; |
| list_for_each_entry(struct list_node, iter, &slot->consumer.loads, |
| head) { |
| assert(i < 3); |
| iter->instr = nir_instr_as_intrinsic(new_tes_loads[i]->parent_instr); |
| i++; |
| } |
| |
| assert(i == 3); |
| assert(postdom_def->bit_size != 1); |
| |
| slot->consumer.tes_interp_load = |
| nir_instr_as_alu(new_input->parent_instr); |
| } else { |
| assert(list_is_singular(&slot->consumer.loads)); |
| list_first_entry(&slot->consumer.loads, struct list_node, head)->instr = |
| nir_instr_as_intrinsic(new_input->parent_instr); |
| |
| /* The input is a bigger type even if the post-dominator is boolean. */ |
| if (postdom_def->bit_size == 1) |
| new_input = nir_ine_imm(b, new_input, 0); |
| } |
| |
| nir_def_rewrite_uses(postdom_def, new_input); |
| |
| /* Clone the post-dominator at the end of the block in the producer |
| * where the output stores are. |
| */ |
| b = &linkage->producer_builder; |
| b->cursor = nir_after_block_before_jump(block); |
| nir_def *producer_clone = clone_ssa(linkage, b, postdom_def); |
| |
| /* Boolean post-dominators are upcast in the producer because we can't |
| * use 1-bit outputs. |
| */ |
| if (producer_clone->bit_size == 1) |
| producer_clone = nir_b2bN(b, producer_clone, new_bit_size); |
| |
| /* Move the existing store to the end of the block and rewrite it to use |
| * the post-dominator result. |
| */ |
| nir_intrinsic_instr *store = |
| list_first_entry(&linkage->slot[final_slot].producer.stores, |
| struct list_node, head)->instr; |
| nir_instr_move(b->cursor, &store->instr); |
| if (nir_src_bit_size(store->src[0]) != producer_clone->bit_size) |
| nir_intrinsic_set_src_type(store, nir_alu_type_get_base_type(nir_intrinsic_src_type(store)) | |
| producer_clone->bit_size); |
| nir_src_rewrite(&store->src[0], producer_clone); |
| |
| /* Remove all loads and stores that we are replacing from the producer |
| * and consumer. |
| */ |
| for (unsigned i = 0; i < num_loads; i++) { |
| unsigned slot_index = intr_get_scalar_16bit_slot(loads[i]); |
| |
| if (slot_index == final_slot) { |
| /* Keep the load and store that we reused. */ |
| continue; |
| } |
| |
| /* Remove loads and stores that are dead after the code motion. Only |
| * those loads that are post-dominated by the post-dominator are dead. |
| */ |
| struct scalar_slot *slot = &linkage->slot[slot_index]; |
| nir_instr *load; |
| |
| if (slot->consumer.tes_interp_load) { |
| load = &slot->consumer.tes_interp_load->instr; |
| |
| /* With interpolated TES loads, we get here 3 times, once for each |
| * per-vertex load. Skip this if we've been here before. |
| */ |
| if (list_is_empty(&slot->producer.stores)) { |
| assert(list_is_empty(&slot->consumer.loads)); |
| continue; |
| } |
| } else { |
| assert(list_is_singular(&slot->consumer.loads)); |
| load = &list_first_entry(&slot->consumer.loads, |
| struct list_node, head)->instr->instr; |
| } |
| |
| if (nir_instr_dominates_use(postdom_state, postdom, load)) { |
| list_inithead(&slot->consumer.loads); |
| |
| /* Remove stores. (transform feedback is allowed here, just not |
| * in final_slot) |
| */ |
| remove_all_stores_and_clear_slot(linkage, slot_index, progress); |
| } else { |
| /* If a load has 2 uses and one of those uses is moved into the previous |
| * shader, making that "use" dead, the load and its associated store |
| * can't be removed because there is still one use remaining. However, |
| * there are actually 2 uses remaining because the use that is dead isn't |
| * removed from NIR, but is left dangling there. |
| * |
| * When we run this optimization again and make the second use dead, |
| * which makes the load dead, the output store in the producer isn't removed |
| * because the post-dominator of the second use doesn't post-dominate |
| * the load because we left the first use dangling there. |
| * |
| * To fix that, we could run DCE, but that would be costly because we would |
| * need to re-gather all IO. Instead, remove dead uses by replacing them |
| * with undef here, so that when this code motion pass is entered again, |
| * the load has its number of uses reduced and the corresponding output store |
| * will be removed by the code above. |
| */ |
| nir_foreach_use_safe(src, nir_instr_def(load)) { |
| if (nir_instr_dominates_use(postdom_state, postdom, |
| nir_src_parent_instr(src))) { |
| nir_src_rewrite(src, nir_undef(&linkage->consumer_builder, |
| src->ssa->num_components, |
| src->ssa->bit_size)); |
| } |
| } |
| } |
| } |
| |
| *progress |= nir_progress_producer | nir_progress_consumer; |
| return true; |
| } |
| |
| static bool |
| backward_inter_shader_code_motion(struct linkage_info *linkage, |
| nir_opt_varyings_progress *progress) |
| { |
| /* These producers are not supported. The description at the beginning |
| * suggests a possible workaround. |
| */ |
| if (linkage->producer_stage == MESA_SHADER_GEOMETRY || |
| linkage->producer_stage == MESA_SHADER_MESH || |
| linkage->producer_stage == MESA_SHADER_TASK) |
| return false; |
| |
| /* Clear pass_flags. */ |
| nir_shader_clear_pass_flags(linkage->consumer_builder.shader); |
| |
| /* Gather inputs that can be moved into the previous shader. These are only |
| * checked for the basic constraints for movability. |
| */ |
| struct { |
| nir_def *def; |
| nir_intrinsic_instr *first_load; |
| } movable_loads[NUM_SCALAR_SLOTS]; |
| unsigned num_movable_loads = 0; |
| unsigned i; |
| |
| BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) { |
| if (!can_optimize_varying(linkage, |
| vec4_slot(i)).inter_shader_code_motion) |
| continue; |
| |
| struct scalar_slot *slot = &linkage->slot[i]; |
| |
| assert(!list_is_empty(&slot->producer.stores)); |
| assert(!is_interpolated_texcoord(linkage, i)); |
| assert(!is_interpolated_color(linkage, i)); |
| |
| /* Disallow producer loads. */ |
| if (!list_is_empty(&slot->producer.loads)) |
| continue; |
| |
| /* There should be only 1 store per output. */ |
| if (!list_is_singular(&slot->producer.stores)) |
| continue; |
| |
| nir_def *load_def = NULL; |
| nir_intrinsic_instr *load = |
| list_first_entry(&slot->consumer.loads, struct list_node, |
| head)->instr; |
| |
| nir_intrinsic_instr *store = |
| list_first_entry(&slot->producer.stores, struct list_node, |
| head)->instr; |
| |
| /* Set interpolation flags. |
| * Handle interpolated TES loads first because they are special. |
| */ |
| if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL && |
| slot->consumer.tes_interp_load) { |
| if (linkage->producer_stage == MESA_SHADER_VERTEX) { |
| /* VS -> TES has no constraints on VS stores. */ |
| load_def = &slot->consumer.tes_interp_load->def; |
| load_def->parent_instr->pass_flags |= FLAG_ALU_IS_TES_INTERP_LOAD | |
| slot->consumer.tes_interp_mode; |
| } else { |
| assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL); |
| assert(store->intrinsic == nir_intrinsic_store_per_vertex_output); |
| |
| /* The vertex index of the store must be InvocationID. */ |
| if (is_sysval(store->src[1].ssa->parent_instr, |
| SYSTEM_VALUE_INVOCATION_ID)) { |
| load_def = &slot->consumer.tes_interp_load->def; |
| load_def->parent_instr->pass_flags |= FLAG_ALU_IS_TES_INTERP_LOAD | |
| slot->consumer.tes_interp_mode; |
| } else { |
| continue; |
| } |
| } |
| } else { |
| /* Allow only 1 load per input. CSE should be run before this. */ |
| if (!list_is_singular(&slot->consumer.loads)) |
| continue; |
| |
| /* This can only be TCS -> TES, which is handled above and rejected |
| * otherwise. |
| */ |
| if (store->intrinsic == nir_intrinsic_store_per_vertex_output) { |
| assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL); |
| continue; |
| } |
| |
| /* TODO: handle load_per_vertex_input for TCS and GS. |
| * TES can also occur here if tes_interp_load is NULL. |
| */ |
| if (load->intrinsic == nir_intrinsic_load_per_vertex_input) |
| continue; |
| |
| load_def = &load->def; |
| |
| switch (load->intrinsic) { |
| case nir_intrinsic_load_interpolated_input: { |
| assert(linkage->consumer_stage == MESA_SHADER_FRAGMENT); |
| nir_intrinsic_instr *baryc = |
| nir_instr_as_intrinsic(load->src[0].ssa->parent_instr); |
| nir_intrinsic_op op = baryc->intrinsic; |
| enum glsl_interp_mode interp = nir_intrinsic_interp_mode(baryc); |
| bool linear = interp == INTERP_MODE_NOPERSPECTIVE; |
| bool convergent = BITSET_TEST(linkage->convergent32_mask, i) || |
| BITSET_TEST(linkage->convergent16_mask, i); |
| |
| assert(interp == INTERP_MODE_NONE || |
| interp == INTERP_MODE_SMOOTH || |
| interp == INTERP_MODE_NOPERSPECTIVE); |
| |
| if (convergent) { |
| load->instr.pass_flags |= FLAG_INTERP_CONVERGENT; |
| } else if (op == nir_intrinsic_load_barycentric_pixel) { |
| load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_PIXEL |
| : FLAG_INTERP_PERSP_PIXEL; |
| } else if (op == nir_intrinsic_load_barycentric_centroid) { |
| load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_CENTROID |
| : FLAG_INTERP_PERSP_CENTROID; |
| } else if (op == nir_intrinsic_load_barycentric_sample) { |
| load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_SAMPLE |
| : FLAG_INTERP_PERSP_SAMPLE; |
| } else { |
| /* Optimizing at_offset and at_sample would be possible but |
| * maybe not worth it if they are not convergent. Convergent |
| * inputs can trivially switch the barycentric coordinates |
| * to different ones or flat. |
| */ |
| continue; |
| } |
| break; |
| } |
| case nir_intrinsic_load_input: |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) { |
| if (BITSET_TEST(linkage->convergent32_mask, i) || |
| BITSET_TEST(linkage->convergent16_mask, i)) |
| load->instr.pass_flags |= FLAG_INTERP_CONVERGENT; |
| else |
| load->instr.pass_flags |= FLAG_INTERP_FLAT; |
| } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) { |
| assert(vec4_slot(i) >= VARYING_SLOT_PATCH0 && |
| vec4_slot(i) <= VARYING_SLOT_PATCH31); |
| /* Patch inputs are always convergent. */ |
| load->instr.pass_flags |= FLAG_INTERP_CONVERGENT; |
| } else { |
| /* It's not a fragment shader. We still need to set this. */ |
| load->instr.pass_flags |= FLAG_INTERP_FLAT; |
| } |
| break; |
| case nir_intrinsic_load_per_primitive_input: |
| case nir_intrinsic_load_input_vertex: |
| /* Inter-shader code motion is unimplemented for these. */ |
| continue; |
| default: |
| unreachable("unexpected load intrinsic"); |
| } |
| } |
| |
| load_def->parent_instr->pass_flags |= FLAG_MOVABLE; |
| |
| /* Disallow transform feedback. The load is "movable" for the purpose of |
| * finding a movable post-dominator; we just can't rewrite the store |
| * because we need to keep it for xfb, so the post-dominator search |
| * will have to start from a different load (only that varying will have |
| * its value rewritten). |
| */ |
| if (BITSET_TEST(linkage->xfb_mask, i)) |
| continue; |
| |
| assert(num_movable_loads < ARRAY_SIZE(movable_loads)); |
| movable_loads[num_movable_loads].def = load_def; |
| movable_loads[num_movable_loads].first_load = load; |
| num_movable_loads++; |
| } |
| |
| if (!num_movable_loads) |
| return false; |
| |
| /* Inter-shader code motion turns ALU results into outputs, but not all |
| * bit sizes are supported by outputs. |
| * |
| * The 1-bit type is allowed because the pass always promotes 1-bit |
| * outputs to 16 or 32 bits, whichever is supported. |
| * |
| * TODO: We could support replacing 2 32-bit inputs with one 64-bit |
| * post-dominator by supporting 64 bits here, but the likelihood of that |
| * occurring seems low. |
| */ |
| unsigned supported_io_types = 32 | 1; |
| |
| if (linkage->producer_builder.shader->options->io_options & |
| linkage->consumer_builder.shader->options->io_options & |
| nir_io_16bit_input_output_support) |
| supported_io_types |= 16; |
| |
| struct nir_use_dominance_state *postdom_state = |
| nir_calc_use_dominance_impl(linkage->consumer_builder.impl, true); |
| |
| for (unsigned i = 0; i < num_movable_loads; i++) { |
| nir_def *load_def = movable_loads[i].def; |
| nir_instr *iter = load_def->parent_instr; |
| nir_instr *movable_postdom = NULL; |
| |
| /* Find the farthest post-dominator that is movable. */ |
| while (iter) { |
| iter = nir_get_immediate_use_dominator(postdom_state, iter); |
| if (iter) { |
| if (NEED_UPDATE_MOVABLE_FLAGS(iter)) |
| update_movable_flags(linkage, iter); |
| |
| if (iter->pass_flags & FLAG_UNMOVABLE) |
| break; |
| |
| /* We can't move derefs into the previous shader, but we can move |
| * instructions that use derefs. |
| */ |
| if (iter->type == nir_instr_type_deref) |
| continue; |
| |
| unsigned bit_size; |
| |
| if (iter->type == nir_instr_type_alu) { |
| nir_alu_instr *alu = nir_instr_as_alu(iter); |
| |
| /* Skip comparison opcodes that directly source the first load |
| * and a constant because any 1-bit values would have to be |
| * converted to 32 bits in the producer and then converted back |
| * to 1 bit using nir_op_ine in the consumer, achieving nothing. |
| */ |
| if (alu->def.bit_size == 1 && |
| ((nir_op_infos[alu->op].num_inputs == 1 && |
| alu->src[0].src.ssa == load_def) || |
| (nir_op_infos[alu->op].num_inputs == 2 && |
| ((alu->src[0].src.ssa == load_def && |
| alu->src[1].src.ssa->parent_instr->type == |
| nir_instr_type_load_const) || |
| (alu->src[0].src.ssa->parent_instr->type == |
| nir_instr_type_load_const && |
| alu->src[1].src.ssa == load_def))))) |
| continue; |
| |
| bit_size = alu->def.bit_size; |
| } else if (iter->type == nir_instr_type_intrinsic) { |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(iter); |
| |
| /* This is a uniform load with a non-constant index because |
| * only a non-constant index can be post-dominated by a load. |
| */ |
| assert(intr->intrinsic == nir_intrinsic_load_deref); |
| |
| /* Uniform loads must be scalar if their result is immediately |
| * stored into an output because this pass only works with |
| * scalar outputs. |
| */ |
| if (intr->num_components > 1) |
| continue; |
| |
| bit_size = intr->def.bit_size; |
| } else { |
| unreachable("unexpected instr type"); |
| } |
| |
| /* Skip unsupported bit sizes and keep searching. */ |
| if (!(bit_size & supported_io_types)) |
| continue; |
| |
| movable_postdom = iter; |
| } |
| } |
| |
| /* Add the post-dominator to the list unless it's been added already. */ |
| if (movable_postdom && |
| !(movable_postdom->pass_flags & FLAG_POST_DOMINATOR_PROCESSED)) { |
| if (try_move_postdominator(linkage, postdom_state, movable_postdom, |
| load_def, movable_loads[i].first_load, |
| progress)) { |
| /* Moving only one postdominator can change the IR enough that |
| * we should start from scratch. |
| */ |
| ralloc_free(postdom_state); |
| return true; |
| } |
| |
| movable_postdom->pass_flags |= FLAG_POST_DOMINATOR_PROCESSED; |
| } |
| } |
| |
| ralloc_free(postdom_state); |
| return false; |
| } |
| |
| /****************************************************************** |
| * COMPACTION |
| ******************************************************************/ |
| |
| /* Relocate a slot to a new index. Used by compaction. new_index is |
| * the component index at 16-bit granularity, so the size of vec4 is 8 |
| * in that representation. |
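| * For example, new_index == VARYING_SLOT_VAR0 * 8 + 3 addresses the high |
| * 16 bits of VAR0.y (component (3 % 8) / 2 == 1, high_16bits == 3 % 2). |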
| */ |
| static void |
| relocate_slot(struct linkage_info *linkage, struct scalar_slot *slot, |
| unsigned i, unsigned new_index, enum fs_vec4_type fs_vec4_type, |
| bool convergent, nir_opt_varyings_progress *progress) |
| { |
| assert(!list_is_empty(&slot->producer.stores)); |
| |
| list_for_each_entry(struct list_node, iter, &slot->producer.stores, head) { |
| assert(!nir_intrinsic_io_semantics(iter->instr).no_varying || |
| has_xfb(iter->instr) || |
| linkage->producer_stage == MESA_SHADER_TESS_CTRL); |
| assert(!is_active_sysval_output(linkage, i, iter->instr)); |
| } |
| |
| /* Relocate the slot in all loads and stores. */ |
| struct list_head *instruction_lists[3] = { |
| &slot->producer.stores, |
| &slot->producer.loads, |
| &slot->consumer.loads, |
| }; |
| |
| for (unsigned i = 0; i < ARRAY_SIZE(instruction_lists); i++) { |
| list_for_each_entry(struct list_node, iter, instruction_lists[i], head) { |
| nir_intrinsic_instr *intr = iter->instr; |
| |
| gl_varying_slot new_semantic = vec4_slot(new_index); |
| unsigned new_component = (new_index % 8) / 2; |
| bool new_high_16bits = new_index % 2; |
| |
| /* We also need to relocate xfb info because it's always relative |
| * to component 0. This just moves it into the correct xfb slot. |
| */ |
| if (has_xfb(intr)) { |
| unsigned old_component = nir_intrinsic_component(intr); |
| static const nir_io_xfb clear_xfb; |
| nir_io_xfb xfb; |
| bool new_is_odd = new_component % 2 == 1; |
| |
| memset(&xfb, 0, sizeof(xfb)); |
| |
| if (old_component >= 2) { |
| xfb.out[new_is_odd] = nir_intrinsic_io_xfb2(intr).out[old_component - 2]; |
| nir_intrinsic_set_io_xfb2(intr, clear_xfb); |
| } else { |
| xfb.out[new_is_odd] = nir_intrinsic_io_xfb(intr).out[old_component]; |
| nir_intrinsic_set_io_xfb(intr, clear_xfb); |
| } |
| |
| if (new_component >= 2) |
| nir_intrinsic_set_io_xfb2(intr, xfb); |
| else |
| nir_intrinsic_set_io_xfb(intr, xfb); |
| } |
| |
| nir_io_semantics sem = nir_intrinsic_io_semantics(intr); |
| unsigned bit_size = nir_intrinsic_infos[intr->intrinsic].has_dest ? |
| intr->def.bit_size : intr->src[0].ssa->bit_size; |
| |
| /* Set all types to float to facilitate full IO vectorization. |
| * This is skipped only if mediump is not lowered to 16 bits. |
| * |
| * Set nir_io_mediump_is_32bit if you never lower mediump IO to 16 |
| * bits, which sets nir_io_semantics::mediump_precision = 0 during |
| * nir_lower_io. |
| * |
| * Set nir_shader_compiler_options::lower_mediump_io if you want to |
| * lower mediump to 16 bits in the GLSL linker before this pass. |
| */ |
| if (bit_size != 32 || !sem.medium_precision) { |
| nir_alu_type type = nir_intrinsic_has_src_type(intr) ? |
| nir_intrinsic_src_type(intr) : |
| nir_intrinsic_dest_type(intr); |
| type = nir_alu_type_get_type_size(type) | nir_type_float; |
| |
| if (nir_intrinsic_has_src_type(intr)) |
| nir_intrinsic_set_src_type(intr, type); |
| else |
| nir_intrinsic_set_dest_type(intr, type); |
| } |
| |
| /* When relocating a back color store, don't change it to a front |
| * color as that would be incorrect. Keep it as back color and only |
| * relocate it between BFC0 and BFC1. |
| */ |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && |
| (sem.location == VARYING_SLOT_BFC0 || |
| sem.location == VARYING_SLOT_BFC1)) { |
| assert(new_semantic == VARYING_SLOT_COL0 || |
| new_semantic == VARYING_SLOT_COL1); |
| new_semantic = VARYING_SLOT_BFC0 + |
| (new_semantic - VARYING_SLOT_COL0); |
| } |
| |
| #if PRINT_RELOCATE_SLOT |
| assert(bit_size == 16 || bit_size == 32); |
| |
| fprintf(stderr, "--- relocating: %s.%c%s%s -> %s.%c%s%s FS_VEC4_TYPE_%s\n", |
| gl_varying_slot_name_for_stage(sem.location, linkage->producer_stage) + 13, |
| "xyzw"[nir_intrinsic_component(intr) % 4], |
| (bit_size == 16 && !sem.high_16bits) ? ".lo" : "", |
| (bit_size == 16 && sem.high_16bits) ? ".hi" : "", |
| gl_varying_slot_name_for_stage(new_semantic, linkage->producer_stage) + 13, |
| "xyzw"[new_component % 4], |
| (bit_size == 16 && !new_high_16bits) ? ".lo" : "", |
| (bit_size == 16 && new_high_16bits) ? ".hi" : "", |
| fs_vec4_type_strings[fs_vec4_type]); |
| #endif /* PRINT_RELOCATE_SLOT */ |
| |
| sem.location = new_semantic; |
| sem.high_16bits = new_high_16bits; |
| |
| /* This is never indirectly indexed. Simplify num_slots. */ |
| sem.num_slots = 1; |
| |
| nir_intrinsic_set_io_semantics(intr, sem); |
| nir_intrinsic_set_component(intr, new_component); |
| |
| if (fs_vec4_type == FS_VEC4_TYPE_PER_PRIMITIVE) { |
| assert(intr->intrinsic == nir_intrinsic_store_per_primitive_output || |
| intr->intrinsic == nir_intrinsic_load_per_primitive_output || |
| intr->intrinsic == nir_intrinsic_load_per_primitive_input); |
| } else { |
| assert(intr->intrinsic != nir_intrinsic_store_per_primitive_output && |
| intr->intrinsic != nir_intrinsic_load_per_primitive_output && |
| intr->intrinsic != nir_intrinsic_load_per_primitive_input); |
| } |
| |
| if (intr->intrinsic != nir_intrinsic_load_interpolated_input) |
| continue; |
| |
| /* This path is used when promoting convergent interpolated |
| * inputs to flat. Replace load_interpolated_input with load_input. |
| */ |
| if (fs_vec4_type == FS_VEC4_TYPE_FLAT || |
| /* Promote all convergent loads to flat if the driver supports it. */ |
| (convergent && |
| linkage->can_mix_convergent_flat_with_interpolated)) { |
| assert(instruction_lists[i] == &slot->consumer.loads); |
| nir_builder *b = &linkage->consumer_builder; |
| |
| b->cursor = nir_before_instr(&intr->instr); |
| nir_def *load = |
| nir_load_input(b, 1, intr->def.bit_size, |
| nir_get_io_offset_src(intr)->ssa, |
| .io_semantics = sem, |
| .component = new_component, |
| .dest_type = nir_intrinsic_dest_type(intr)); |
| |
| nir_def_rewrite_uses(&intr->def, load); |
| iter->instr = nir_instr_as_intrinsic(load->parent_instr); |
| nir_instr_remove(&intr->instr); |
| *progress |= nir_progress_consumer; |
| |
| /* Interpolation converts Infs to NaNs. If we change it to flat, |
| * we need to convert Infs to NaNs manually in the producer to |
| * preserve that. |
| */ |
| if (preserve_nans(linkage->consumer_builder.shader, |
| load->bit_size)) { |
| list_for_each_entry(struct list_node, iter, |
| &slot->producer.stores, head) { |
| nir_intrinsic_instr *store = iter->instr; |
| |
| nir_builder *b = &linkage->producer_builder; |
| b->cursor = nir_before_instr(&store->instr); |
| nir_def *repl = |
| build_convert_inf_to_nan(b, store->src[0].ssa); |
| nir_src_rewrite(&store->src[0], repl); |
| } |
| } |
| continue; |
| } |
| |
| /* We are packing convergent inputs with any other interpolated |
| * inputs in the same vec4, but the interpolation qualifier might not |
| * be the same between the two. Set the qualifier of the convergent |
| * input to match the input it's being packed with. |
| */ |
| if (!linkage->has_flexible_interp && convergent) { |
| enum fs_vec4_type current_vec4_type = |
| get_interp_vec4_type(linkage, i, intr); |
| |
| /* Make the interpolation qualifier match the slot where we are |
| * moving this input. |
| */ |
| if (current_vec4_type != fs_vec4_type) { |
| nir_builder *b = &linkage->consumer_builder; |
| nir_def *baryc; |
| |
| b->cursor = nir_before_instr(&intr->instr); |
| |
| switch (fs_vec4_type) { |
| case FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL: |
| case FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL: |
| baryc = nir_load_barycentric_pixel(b, 32, |
| .interp_mode = INTERP_MODE_SMOOTH); |
| break; |
| case FS_VEC4_TYPE_INTERP_FP32_PERSP_CENTROID: |
| case FS_VEC4_TYPE_INTERP_FP16_PERSP_CENTROID: |
| baryc = nir_load_barycentric_centroid(b, 32, |
| .interp_mode = INTERP_MODE_SMOOTH); |
| break; |
| case FS_VEC4_TYPE_INTERP_FP32_PERSP_SAMPLE: |
| case FS_VEC4_TYPE_INTERP_FP16_PERSP_SAMPLE: |
| baryc = nir_load_barycentric_sample(b, 32, |
| .interp_mode = INTERP_MODE_SMOOTH); |
| break; |
| case FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL: |
| case FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL: |
| baryc = nir_load_barycentric_pixel(b, 32, |
| .interp_mode = INTERP_MODE_NOPERSPECTIVE); |
| break; |
| case FS_VEC4_TYPE_INTERP_FP32_LINEAR_CENTROID: |
| case FS_VEC4_TYPE_INTERP_FP16_LINEAR_CENTROID: |
| baryc = nir_load_barycentric_centroid(b, 32, |
| .interp_mode = INTERP_MODE_NOPERSPECTIVE); |
| break; |
| case FS_VEC4_TYPE_INTERP_FP32_LINEAR_SAMPLE: |
| case FS_VEC4_TYPE_INTERP_FP16_LINEAR_SAMPLE: |
| baryc = nir_load_barycentric_sample(b, 32, |
| .interp_mode = INTERP_MODE_NOPERSPECTIVE); |
| break; |
| case FS_VEC4_TYPE_INTERP_COLOR_PIXEL: |
| baryc = nir_load_barycentric_pixel(b, 32, |
| .interp_mode = INTERP_MODE_NONE); |
| break; |
| case FS_VEC4_TYPE_INTERP_COLOR_CENTROID: |
| baryc = nir_load_barycentric_centroid(b, 32, |
| .interp_mode = INTERP_MODE_NONE); |
| break; |
| case FS_VEC4_TYPE_INTERP_COLOR_SAMPLE: |
| baryc = nir_load_barycentric_sample(b, 32, |
| .interp_mode = INTERP_MODE_NONE); |
| break; |
| default: |
| unreachable("invalid qualifier"); |
| } |
| |
| nir_src_rewrite(&intr->src[0], baryc); |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * A helper function for compact_varyings(). Assign new slot indices for |
| * existing slots of a certain vec4 type (FLAT, FP16, or FP32). Skip already- |
| * assigned scalar slots (determined by assigned_mask) and don't assign to |
| * vec4 slots that have an incompatible vec4 type (determined by |
| * assigned_fs_vec4_type). This works with both 32-bit and 16-bit types. |
| * slot_size is the component size in the units of 16 bits (2 means 32 bits). |
| * |
| * The number of slots to assign can optionally be limited by |
| * max_assigned_slots. |
| * |
| * Return how many 16-bit slots are left unused in the vec4s that were |
| * assigned this vec4 type (counting only whole unused 32-bit slots). |
| */ |
| static unsigned |
| fs_assign_slots(struct linkage_info *linkage, |
| BITSET_WORD *assigned_mask, |
| uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS], |
| BITSET_WORD *input_mask, |
| enum fs_vec4_type fs_vec4_type, |
| unsigned slot_size, |
| unsigned max_assigned_slots, |
| bool convergent, |
| bool assign_colors, |
| unsigned color_channel_rotate, |
| nir_opt_varyings_progress *progress) |
| { |
| unsigned i, slot_index, max_slot; |
| unsigned num_assigned_slots = 0; |
| |
| if (assign_colors) { |
| slot_index = VARYING_SLOT_COL0 * 8; /* starting slot */ |
| max_slot = VARYING_SLOT_COL1 + 1; |
| } else { |
| slot_index = VARYING_SLOT_VAR0 * 8; /* starting slot */ |
| max_slot = VARYING_SLOT_MAX; |
| } |
| |
| /* Assign new slot indices for scalar slots. */ |
| BITSET_FOREACH_SET(i, input_mask, NUM_SCALAR_SLOTS) { |
| if (is_interpolated_color(linkage, i) != assign_colors) |
| continue; |
| |
| /* Skip indirectly-indexed scalar slots and slots incompatible |
| * with the FS vec4 type. |
| */ |
| while (1) { |
| /* If the FS vec4 type is incompatible, move to the next vec4. */ |
| if (fs_vec4_type != FS_VEC4_TYPE_NONE && |
| assigned_fs_vec4_type[vec4_slot(slot_index)] != |
| FS_VEC4_TYPE_NONE && |
| assigned_fs_vec4_type[vec4_slot(slot_index)] != fs_vec4_type) { |
| slot_index = align(slot_index + slot_size, 8); /* move to next vec4 */ |
| continue; |
| } |
| |
| /* This slot is already assigned (assigned_mask is set). Move to |
| * the next one. |
| */ |
| if (BITSET_TEST(assigned_mask, slot_index)) { |
| slot_index += slot_size; |
| continue; |
| } |
| break; |
| } |
| |
| /* Assign color channels in this order, starting |
| * at the color_channel_rotate component first. Cases: |
| * color_channel_rotate = 0: xyzw |
| * color_channel_rotate = 1: yzwx |
| * color_channel_rotate = 2: zwxy |
| * color_channel_rotate = 3: wxyz |
| * |
| * This has no effect on behavior per se, but some drivers merge VARn |
| * and COLn into one output if each defines different components. |
| * For example, if we store VAR0.xy and COL0.z, a driver can merge them |
| * by mapping the same output to 2 different inputs (VAR0 and COL0) if |
| * color-specific behavior is per component, but it can't merge VAR0.xy |
| * and COL0.x because they both define x. |
| */ |
| unsigned new_slot_index = slot_index; |
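| /* color_channel_rotate is in 32-bit components, so scale it by 2 to |
| * step in 16-bit units and wrap within the 8 16-bit slots of the vec4. |
| */ |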
| if (assign_colors && color_channel_rotate) { |
| new_slot_index = (vec4_slot(new_slot_index)) * 8 + |
| (new_slot_index + color_channel_rotate * 2) % 8; |
| } |
| |
| /* Relocate the slot. */ |
| assert(slot_index < max_slot * 8); |
| relocate_slot(linkage, &linkage->slot[i], i, new_slot_index, |
| fs_vec4_type, convergent, progress); |
| |
| for (unsigned i = 0; i < slot_size; ++i) |
| BITSET_SET(assigned_mask, slot_index + i); |
| |
| if (assigned_fs_vec4_type) |
| assigned_fs_vec4_type[vec4_slot(slot_index)] = fs_vec4_type; |
| slot_index += slot_size; /* move to the next slot */ |
| num_assigned_slots += slot_size; |
| |
| /* Remove the slot from the input (unassigned) mask. */ |
| BITSET_CLEAR(input_mask, i); |
| |
| /* The number of slots to assign can optionally be limited. */ |
| assert(num_assigned_slots <= max_assigned_slots); |
| if (num_assigned_slots == max_assigned_slots) |
| break; |
| } |
| |
| assert(slot_index <= max_slot * 8); |
| |
| if (!convergent && fs_vec4_type != FS_VEC4_TYPE_NONE) { |
| /* Count the number of unused 16-bit components. There can be holes |
| * because indirect inputs are not moved from their original locations. |
| * The result is used to determine which components should be filled |
| * with convergent inputs. |
| */ |
| unsigned unused_slots = 0; |
| |
| for (unsigned i = assign_colors ? VARYING_SLOT_COL0 : VARYING_SLOT_VAR0; |
| i < max_slot; i++) { |
| if (assigned_fs_vec4_type[i] != fs_vec4_type) |
| continue; |
| |
| unsigned comp_mask = |
| BITSET_GET_RANGE_INSIDE_WORD(assigned_mask, i * 8, i * 8 + 7); |
| assert(comp_mask); |
| assert(comp_mask <= 0xff); |
| |
| if (comp_mask == 0xff) |
| continue; |
| |
| /* Only count full unused 32-bit slots, so that 2 disjoint unused |
| * 16-bit slots don't give the misleading impression that there is |
| * a full unused 32-bit slot. |
| */ |
| for (unsigned i = 0; i < 4; i++) { |
| if (!(comp_mask & BITFIELD_RANGE(i * 2, 2))) |
| unused_slots += 2; |
| } |
| } |
| return unused_slots; |
| } |
| |
| return 0; |
| } |
| |
| /** |
| * This is called once for 32-bit inputs and once for 16-bit inputs. |
| * It assigns new slot indices to all scalar slots specified in the masks. |
| * |
| * \param linkage Linkage info |
| * \param assigned_mask Which scalar (16-bit) slots are already taken. |
| * \param assigned_fs_vec4_type Which vec4 slots have an assigned qualifier |
| * and can only be filled with compatible slots. |
| * \param interp_mask The list of interp slots to assign locations for. |
| * \param flat_mask The list of flat slots to assign locations for. |
| * \param convergent_mask The list of slots that have convergent output |
| * stores. |
| * \param sized_interp_type One of FS_VEC4_TYPE_INTERP_{FP32, FP16, COLOR}*. |
| * \param slot_size 1 for 16 bits, 2 for 32 bits |
| * \param color_channel_rotate Assign color channels starting with this index, |
| * e.g. 2 assigns channels in the zwxy order. |
| * \param assign_colors Whether to assign only color varyings or only |
| * non-color varyings. |
| */ |
| static void |
| fs_assign_slot_groups(struct linkage_info *linkage, |
| BITSET_WORD *assigned_mask, |
| uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS], |
| BITSET_WORD *interp_mask, |
| BITSET_WORD *flat_mask, |
| BITSET_WORD *convergent_mask, |
| BITSET_WORD *color_interp_mask, |
| enum fs_vec4_type sized_interp_type, |
| unsigned slot_size, |
| bool assign_colors, |
| unsigned color_channel_rotate, |
| nir_opt_varyings_progress *progress) |
| { |
| /* Put interpolated slots first. */ |
| unsigned unused_interp_slots = |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| interp_mask, sized_interp_type, |
| slot_size, NUM_SCALAR_SLOTS, false, assign_colors, |
| color_channel_rotate, progress); |
| |
| unsigned unused_color_interp_slots = 0; |
| if (color_interp_mask) { |
| unused_color_interp_slots = |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| color_interp_mask, FS_VEC4_TYPE_INTERP_COLOR, |
| slot_size, NUM_SCALAR_SLOTS, false, assign_colors, |
| color_channel_rotate, progress); |
| } |
| |
| /* Put flat slots next. |
| * Note that only flat vec4 slots can have both 32-bit and 16-bit types |
| * packed in the same vec4. 32-bit flat inputs are packed first, followed |
| * by 16-bit flat inputs. |
| */ |
| unsigned unused_flat_slots = |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| flat_mask, FS_VEC4_TYPE_FLAT, |
| slot_size, NUM_SCALAR_SLOTS, false, assign_colors, |
| color_channel_rotate, progress); |
| |
| /* Take the inputs with convergent values and assign them as follows. |
| * Since they can be assigned as both interpolated and flat, we can |
| * choose. We prefer them to be flat, but if interpolated vec4s have |
| * unused components, try to fill those before starting a new flat vec4. |
| * |
| * First, fill the unused components of flat (if any), then fill |
| * the unused components of interpolated (if any), and then make |
| * the remaining convergent inputs flat. |
| */ |
| if (!linkage->always_interpolate_convergent_fs_inputs && |
| unused_flat_slots) { |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| convergent_mask, FS_VEC4_TYPE_FLAT, |
| slot_size, unused_flat_slots, true, assign_colors, |
| color_channel_rotate, progress); |
| } |
| if (unused_interp_slots) { |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| convergent_mask, sized_interp_type, |
| slot_size, unused_interp_slots, true, assign_colors, |
| color_channel_rotate, progress); |
| } |
| if (unused_color_interp_slots) { |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| convergent_mask, FS_VEC4_TYPE_INTERP_COLOR, |
| slot_size, unused_color_interp_slots, true, assign_colors, |
| color_channel_rotate, progress); |
| } |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| convergent_mask, |
| linkage->always_interpolate_convergent_fs_inputs ? |
| (slot_size == 2 ? FS_VEC4_TYPE_INTERP_FP32 : |
| FS_VEC4_TYPE_INTERP_FP16) : |
| FS_VEC4_TYPE_FLAT, |
| slot_size, NUM_SCALAR_SLOTS, true, assign_colors, |
| color_channel_rotate, progress); |
| } |
| |
| /** |
| * Same as fs_assign_slot_groups, but don't mix different interpolation |
| * qualifiers in the same vec4. |
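 *
 * For example, a perspective/pixel-interpolated input and a perspective/
 * sample-interpolated input never share a vec4 slot here, even if both
 * leave components unused.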
| */ |
| static void |
| fs_assign_slot_groups_separate_qual(struct linkage_info *linkage, |
| BITSET_WORD *assigned_mask, |
| uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS], |
| INTERP_QUAL_BITSET *interp_masks, |
| BITSET_WORD *flat_mask, |
| BITSET_WORD *convergent_mask, |
| COLOR_QUAL_BITSET *color_interp_masks, |
| enum fs_vec4_type sized_interp_type_base, |
| unsigned slot_size, |
| bool assign_colors, |
| unsigned color_channel_rotate, |
| nir_opt_varyings_progress *progress) |
| { |
| unsigned unused_interp_slots[NUM_INTERP_QUALIFIERS] = {0}; |
| unsigned unused_color_slots[NUM_COLOR_QUALIFIERS] = {0}; |
| |
| /* Put interpolated slots first. */ |
| for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) { |
| unused_interp_slots[i] = |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| (*interp_masks)[i], sized_interp_type_base + i, |
| slot_size, NUM_SCALAR_SLOTS, false, assign_colors, |
| color_channel_rotate, progress); |
| } |
| |
| if (color_interp_masks) { |
| for (unsigned i = 0; i < NUM_COLOR_QUALIFIERS; i++) { |
| unused_color_slots[i] = |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| (*color_interp_masks)[i], |
| FS_VEC4_TYPE_INTERP_COLOR_PIXEL + i, |
| slot_size, NUM_SCALAR_SLOTS, false, assign_colors, |
| color_channel_rotate, progress); |
| } |
| } |
| |
| /* Put flat slots next. |
| * Note that only flat vec4 slots can have both 32-bit and 16-bit types |
| * packed in the same vec4. 32-bit flat inputs are packed first, followed |
| * by 16-bit flat inputs. |
| */ |
| unsigned unused_flat_slots = |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| flat_mask, FS_VEC4_TYPE_FLAT, |
| slot_size, NUM_SCALAR_SLOTS, false, assign_colors, |
| color_channel_rotate, progress); |
| |
| /* Take the inputs with convergent values and assign them as follows. |
| * Since they can be assigned as both interpolated and flat, we can |
| * choose. We prefer them to be flat, but if interpolated vec4s have |
| * unused components, try to fill those before starting a new flat vec4. |
| * |
| * First, fill the unused components of flat (if any) with convergent |
| * inputs. |
| */ |
| if (!linkage->always_interpolate_convergent_fs_inputs && |
| unused_flat_slots) { |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| convergent_mask, FS_VEC4_TYPE_FLAT, |
| slot_size, unused_flat_slots, true, assign_colors, |
| color_channel_rotate, progress); |
| } |
| |
| /* Then fill the unused components of interpolated slots (if any) with |
| * convergent inputs. |
| */ |
| for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) { |
| if (unused_interp_slots[i]) { |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| convergent_mask, sized_interp_type_base + i, |
| slot_size, unused_interp_slots[i], true, |
| assign_colors, color_channel_rotate, progress); |
| } |
| } |
| |
| for (unsigned i = 0; i < NUM_COLOR_QUALIFIERS; i++) { |
| if (unused_color_slots[i]) { |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| convergent_mask, FS_VEC4_TYPE_INTERP_COLOR_PIXEL + i, |
| slot_size, unused_color_slots[i], true, assign_colors, |
| color_channel_rotate, progress); |
| } |
| } |
| |
   /* Then make the remaining convergent inputs flat (or interpolated if the
    * driver always interpolates convergent FS inputs).
    */
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| convergent_mask, |
| linkage->always_interpolate_convergent_fs_inputs ? |
| (slot_size == 2 ? FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL : |
| FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL) : |
| FS_VEC4_TYPE_FLAT, |
| slot_size, NUM_SCALAR_SLOTS, true, assign_colors, |
| color_channel_rotate, progress); |
| } |
| |
| static void |
| vs_tcs_tes_gs_assign_slots(struct linkage_info *linkage, |
| BITSET_WORD *input_mask, |
| unsigned *slot_index, |
| unsigned *patch_slot_index, |
| unsigned slot_size, |
| nir_opt_varyings_progress *progress) |
| { |
| unsigned i; |
| |
| BITSET_FOREACH_SET(i, input_mask, NUM_SCALAR_SLOTS) { |
| if (i >= VARYING_SLOT_PATCH0 * 8 && i < VARYING_SLOT_TESS_MAX * 8) { |
| /* Skip indirectly-indexed scalar slots at 32-bit granularity. |
          * We have to do it at this granularity because only the low 16-bit
          * slot is set in the mask for a 32-bit input, not the high one.
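          * E.g. a 32-bit indirect input occupying 16-bit slots {2k, 2k+1}
          * only has bit 2k set, which is why we always skip to the next
          * 32-bit boundary below.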
| */ |
| while (BITSET_TEST32(linkage->indirect_mask, *patch_slot_index)) |
| *patch_slot_index = align(*patch_slot_index + 1, 2); |
| |
| assert(*patch_slot_index < VARYING_SLOT_TESS_MAX * 8); |
| relocate_slot(linkage, &linkage->slot[i], i, *patch_slot_index, |
| FS_VEC4_TYPE_NONE, false, progress); |
| *patch_slot_index += slot_size; /* increment by 16 or 32 bits */ |
| } else { |
| /* If the driver wants to use POS and we've already used it, move |
| * to VARn. |
| */ |
         if (*slot_index < VARYING_SLOT_VAR0 * 8 &&
             *slot_index >= (VARYING_SLOT_POS + 1) * 8)
| *slot_index = VARYING_SLOT_VAR0 * 8; |
| |
| /* Skip indirectly-indexed scalar slots at 32-bit granularity. */ |
| while (BITSET_TEST32(linkage->indirect_mask, *slot_index)) |
| *slot_index = align(*slot_index + 1, 2); |
| |
| assert(*slot_index < VARYING_SLOT_MAX * 8); |
| relocate_slot(linkage, &linkage->slot[i], i, *slot_index, |
| FS_VEC4_TYPE_NONE, false, progress); |
| *slot_index += slot_size; /* increment by 16 or 32 bits */ |
| } |
| } |
| } |
| |
| static void |
| vs_tcs_tes_gs_assign_slots_2sets(struct linkage_info *linkage, |
| BITSET_WORD *input32_mask, |
| BITSET_WORD *input16_mask, |
| unsigned *slot_index, |
| unsigned *patch_slot_index, |
| nir_opt_varyings_progress *progress) |
| { |
| /* Compact 32-bit inputs, followed by 16-bit inputs allowing them to |
| * share vec4 slots with 32-bit inputs. |
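    *
    * E.g. if the 32-bit inputs end in the middle of a vec4, the 16-bit
    * inputs start in that vec4's leftover 16-bit slots instead of opening
    * a new vec4.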
| */ |
| vs_tcs_tes_gs_assign_slots(linkage, input32_mask, slot_index, |
| patch_slot_index, 2, progress); |
| vs_tcs_tes_gs_assign_slots(linkage, input16_mask, slot_index, |
| patch_slot_index, 1, progress); |
| |
| assert(*slot_index <= VARYING_SLOT_MAX * 8); |
| assert(!patch_slot_index || *patch_slot_index <= VARYING_SLOT_TESS_MAX * 8); |
| } |
| |
| /** |
| * Compaction means scalarizing and then packing scalar components into full |
| * vec4s, so that we minimize the number of unused components in vec4 slots. |
| * |
| * Compaction is as simple as moving a scalar input from one scalar slot |
| * to another. Indirectly-indexed slots are not touched, so the compaction |
| * has to compact around them. Unused 32-bit components of indirectly-indexed |
| * slots are still filled, so no space is wasted there, but if indirectly- |
| * indexed 16-bit components have the other 16-bit half unused, that half is |
| * wasted. |
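 *
 * For example, with an FS consumer a flat vec4 slot can hold two 32-bit
 * scalars plus four 16-bit scalars (in its eight 16-bit half-slots), whereas
 * interpolated FP32 and FP16 scalars only share a vec4 with inputs of the
 * same type.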
| */ |
| static void |
| compact_varyings(struct linkage_info *linkage, |
| nir_opt_varyings_progress *progress) |
| { |
| if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) { |
| /* These arrays are used to track which scalar slots we've already |
| * assigned. We can fill unused components of indirectly-indexed slots, |
| * but only if the vec4 slot type (FLAT, FP16, or FP32) is the same. |
| * Assign vec4 slot type separately, skipping over already assigned |
| * scalar slots. |
| */ |
| uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS] = {0}; |
| BITSET_DECLARE(assigned_mask, NUM_SCALAR_SLOTS); |
| BITSET_ZERO(assigned_mask); |
| |
| /* Iterate over all indirectly accessed inputs and set the assigned vec4 |
| * type of each occupied slot to the vec4 type of indirect inputs, so |
| * that compaction doesn't put inputs of a different vec4 type in |
| * the same vec4. |
| * |
| * We don't try to compact indirect input arrays, though we could. |
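       * E.g. if an indirect array marked its vec4 slots as FLAT, only flat
       * scalars may be packed into the unused components of those slots.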
| */ |
| unsigned i; |
| BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) { |
| struct scalar_slot *slot = &linkage->slot[i]; |
| |
| /* The slot of the first array element contains all loads for all |
| * elements, including all direct accesses, while all other array |
| * elements are empty (on purpose). |
| */ |
| if (list_is_empty(&linkage->slot[i].consumer.loads)) |
| continue; |
| |
| assert(slot->num_slots >= 2); |
| |
| for (unsigned array_index = 0; array_index < slot->num_slots; |
| array_index++) { |
| unsigned vec4_index = vec4_slot(i) + array_index; |
| unsigned scalar_index = i + array_index * 8; |
| assigned_fs_vec4_type[vec4_index] = linkage->fs_vec4_type[vec4_index]; |
            /* Indirectly-indexed slots are marked to always occupy 32 bits
             * (2 16-bit slots); the high 16 bits are wasted if they are
             * unused.
             */
| BITSET_SET_RANGE_INSIDE_WORD(assigned_mask, scalar_index, scalar_index + 1); |
| } |
| } |
| |
| if (linkage->has_flexible_interp) { |
| /* This codepath packs convergent varyings with both interpolated and |
| * flat, whichever has free space. |
| */ |
| fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type, |
| linkage->interp_fp32_mask, linkage->flat32_mask, |
| linkage->convergent32_mask, NULL, |
| FS_VEC4_TYPE_INTERP_FP32, 2, false, 0, progress); |
| |
| /* Now do the same thing, but for 16-bit inputs. */ |
| fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type, |
| linkage->interp_fp16_mask, linkage->flat16_mask, |
| linkage->convergent16_mask, NULL, |
| FS_VEC4_TYPE_INTERP_FP16, 1, false, 0, progress); |
| } else { |
| /* Basically the same as above. */ |
| fs_assign_slot_groups_separate_qual( |
| linkage, assigned_mask, assigned_fs_vec4_type, |
| &linkage->interp_fp32_qual_masks, linkage->flat32_mask, |
| linkage->convergent32_mask, NULL, |
| FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL, 2, false, 0, progress); |
| |
| fs_assign_slot_groups_separate_qual( |
| linkage, assigned_mask, assigned_fs_vec4_type, |
| &linkage->interp_fp16_qual_masks, linkage->flat16_mask, |
| linkage->convergent16_mask, NULL, |
| FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL, 1, false, 0, progress); |
| } |
| |
| /* Assign INTERP_MODE_EXPLICIT. Both FP32 and FP16 can occupy the same |
| * slot because the vertex data is passed to FS as-is. |
| */ |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| linkage->interp_explicit32_mask, FS_VEC4_TYPE_INTERP_EXPLICIT, |
| 2, NUM_SCALAR_SLOTS, false, false, 0, progress); |
| |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| linkage->interp_explicit16_mask, FS_VEC4_TYPE_INTERP_EXPLICIT, |
| 1, NUM_SCALAR_SLOTS, false, false, 0, progress); |
| |
| /* Same for strict vertex ordering. */ |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| linkage->interp_explicit_strict32_mask, FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT, |
| 2, NUM_SCALAR_SLOTS, false, false, 0, progress); |
| |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| linkage->interp_explicit_strict16_mask, FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT, |
| 1, NUM_SCALAR_SLOTS, false, false, 0, progress); |
| |
| /* Same for per-primitive. */ |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| linkage->per_primitive32_mask, FS_VEC4_TYPE_PER_PRIMITIVE, |
| 2, NUM_SCALAR_SLOTS, false, false, 0, progress); |
| |
| fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, |
| linkage->per_primitive16_mask, FS_VEC4_TYPE_PER_PRIMITIVE, |
| 1, NUM_SCALAR_SLOTS, false, false, 0, progress); |
| |
| /* Put transform-feedback-only outputs last. */ |
| fs_assign_slots(linkage, assigned_mask, NULL, |
| linkage->xfb32_only_mask, FS_VEC4_TYPE_NONE, 2, |
| NUM_SCALAR_SLOTS, false, false, 0, progress); |
| |
| fs_assign_slots(linkage, assigned_mask, NULL, |
| linkage->xfb16_only_mask, FS_VEC4_TYPE_NONE, 1, |
| NUM_SCALAR_SLOTS, false, false, 0, progress); |
| |
      /* Color varyings are only compacted among themselves, so first
       * determine whether the shader contains any color varyings at all.
       */
| unsigned col0 = VARYING_SLOT_COL0 * 8; |
| bool has_colors = |
| !BITSET_TEST_RANGE_INSIDE_WORD(linkage->interp_fp32_mask, col0, |
| col0 + 15, 0) || |
| !BITSET_TEST_RANGE_INSIDE_WORD(linkage->convergent32_mask, col0, |
| col0 + 15, 0) || |
| !BITSET_TEST_RANGE_INSIDE_WORD(linkage->color32_mask, col0, |
| col0 + 15, 0) || |
| !BITSET_TEST_RANGE_INSIDE_WORD(linkage->flat32_mask, col0, |
| col0 + 15, 0) || |
| !BITSET_TEST_RANGE_INSIDE_WORD(linkage->xfb32_only_mask, col0, |
| col0 + 15, 0); |
| |
| for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) { |
| has_colors |= |
| !BITSET_TEST_RANGE_INSIDE_WORD(linkage->interp_fp32_qual_masks[i], |
| col0, col0 + 15, 0); |
| } |
| for (unsigned i = 0; i < NUM_COLOR_QUALIFIERS; i++) { |
| has_colors |= |
| !BITSET_TEST_RANGE_INSIDE_WORD(linkage->color32_qual_masks[i], |
| col0, col0 + 15, 0); |
| } |
| |
| if (has_colors) { |
| unsigned color_channel_rotate = 0; |
| |
| if (linkage->consumer_builder.shader->options->io_options & |
| nir_io_compaction_rotates_color_channels) { |
| color_channel_rotate = |
| DIV_ROUND_UP(BITSET_LAST_BIT(assigned_mask), 2) % 4; |
| } |
| |
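      /* E.g. if the already-assigned inputs end after 7 occupied 32-bit
       * channels, color_channel_rotate is 7 % 4 = 3 and the first color
       * scalar lands in channel w.
       */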
| if (linkage->has_flexible_interp) { |
| fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type, |
| linkage->interp_fp32_mask, linkage->flat32_mask, |
| linkage->convergent32_mask, linkage->color32_mask, |
| FS_VEC4_TYPE_INTERP_FP32, 2, true, |
| color_channel_rotate, progress); |
| } else { |
| fs_assign_slot_groups_separate_qual( |
| linkage, assigned_mask, assigned_fs_vec4_type, |
| &linkage->interp_fp32_qual_masks, linkage->flat32_mask, |
| linkage->convergent32_mask, &linkage->color32_qual_masks, |
| FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL, 2, true, |
| color_channel_rotate, progress); |
| } |
| |
| /* Put transform-feedback-only outputs last. */ |
| fs_assign_slots(linkage, assigned_mask, NULL, |
| linkage->xfb32_only_mask, FS_VEC4_TYPE_NONE, 2, |
| NUM_SCALAR_SLOTS, false, true, color_channel_rotate, |
| progress); |
| } |
| return; |
| } |
| |
| /* If we get here, the consumer can only be TCS, TES, or GS. |
| * |
| * "use_pos" says whether the driver prefers that compaction with non-FS |
| * consumers puts varyings into POS first before using any VARn. |
| */ |
| bool use_pos = !(linkage->producer_builder.shader->options->io_options & |
| nir_io_dont_use_pos_for_non_fs_varyings); |
| unsigned slot_index = (use_pos ? VARYING_SLOT_POS |
| : VARYING_SLOT_VAR0) * 8; |
| |
| if (linkage->consumer_stage == MESA_SHADER_TESS_CTRL) { |
| /* Make tcs_cross_invoc*_mask bits disjoint with flat*_mask bits |
| * because tcs_cross_invoc*_mask is initially a subset of flat*_mask, |
| * but we must assign each scalar slot only once. |
| */ |
| BITSET_ANDNOT(linkage->flat32_mask, linkage->flat32_mask, |
| linkage->tcs_cross_invoc32_mask); |
| BITSET_ANDNOT(linkage->flat16_mask, linkage->flat16_mask, |
| linkage->tcs_cross_invoc16_mask); |
| |
| /* Put cross-invocation-accessed TCS inputs first. */ |
| vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->tcs_cross_invoc32_mask, |
| linkage->tcs_cross_invoc16_mask, |
| &slot_index, NULL, progress); |
| /* Remaining TCS inputs. */ |
| vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->flat32_mask, |
| linkage->flat16_mask, &slot_index, |
| NULL, progress); |
| return; |
| } |
| |
| if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) { |
| unsigned patch_slot_index = VARYING_SLOT_PATCH0 * 8; |
| |
| vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->flat32_mask, |
| linkage->flat16_mask, &slot_index, |
| &patch_slot_index, progress); |
| |
| /* Put no-varying slots last. These are TCS outputs read by TCS but |
| * not TES. |
| */ |
| vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->no_varying32_mask, |
| linkage->no_varying16_mask, &slot_index, |
| &patch_slot_index, progress); |
| return; |
| } |
| |
| assert(linkage->consumer_stage == MESA_SHADER_GEOMETRY); |
| vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->flat32_mask, |
| linkage->flat16_mask, &slot_index, |
| NULL, progress); |
| } |
| |
| /****************************************************************** |
| * PUTTING IT ALL TOGETHER |
| ******************************************************************/ |
| |
/* A costing function that estimates the cost of a uniform expression, used to
 * decide whether it's worth propagating the expression from output stores to
 * the next shader stage. It roughly models the instruction cost of a scalar
 * desktop GPU.
| * |
| * It's used by uniform expression propagation when drivers provide a cost |
| * limit for such an optimization but don't provide their own costing function, |
| * which are the majority of drivers. |
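 *
 * For example, under this model a uniform expression like
 * cos(uniform_arr[2]) * 0.5 (all 32-bit, constant array index) costs roughly
 * 2 (uniform load) + 4 (fcos) + 1 (fmul) = 7, so it is only propagated if
 * the driver's cost limit is at least 7.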
| */ |
| static unsigned |
| default_varying_estimate_instr_cost(nir_instr *instr) |
| { |
| unsigned dst_bit_size, src_bit_size, num_dst_dwords; |
| nir_op alu_op; |
| |
| switch (instr->type) { |
| case nir_instr_type_alu: |
| dst_bit_size = nir_instr_as_alu(instr)->def.bit_size; |
| src_bit_size = nir_instr_as_alu(instr)->src[0].src.ssa->bit_size; |
| alu_op = nir_instr_as_alu(instr)->op; |
| num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32); |
| |
| switch (alu_op) { |
| /* Moves are free. */ |
| case nir_op_mov: |
| case nir_op_vec2: |
| case nir_op_vec3: |
| case nir_op_vec4: |
| case nir_op_vec5: |
| case nir_op_vec8: |
| case nir_op_vec16: |
| /* These are usually folded into FP instructions as src or dst |
| * modifiers. |
| */ |
| case nir_op_fabs: |
| case nir_op_fneg: |
| case nir_op_fsat: |
| return 0; |
| |
| /* 16-bit multiplication should be cheap. Greater sizes not so much. */ |
| case nir_op_imul: |
| case nir_op_umul_low: |
| case nir_op_imul_2x32_64: |
| case nir_op_umul_2x32_64: |
| return dst_bit_size <= 16 ? 1 : 4 * num_dst_dwords; |
| |
| /* High bits of 64-bit multiplications. */ |
| case nir_op_imul_high: |
| case nir_op_umul_high: |
| /* Lowered into multiple instructions typically. */ |
| case nir_op_fsign: |
| return 4; |
| |
| /* Transcendental opcodes typically run at 1/4 rate of FMA. */ |
| case nir_op_fexp2: |
| case nir_op_flog2: |
| case nir_op_frcp: |
| case nir_op_frsq: |
| case nir_op_fsqrt: |
| case nir_op_fsin: |
| case nir_op_fcos: |
| case nir_op_fsin_amd: |
| case nir_op_fcos_amd: |
| /* FP64 is usually much slower. */ |
| return dst_bit_size == 64 ? 32 : 4; |
| |
| case nir_op_fpow: |
| return 4 + 1 + 4; /* log2 + mul + exp2 */ |
| |
| /* Integer division is slow. */ |
| case nir_op_idiv: |
| case nir_op_udiv: |
| case nir_op_imod: |
| case nir_op_umod: |
| case nir_op_irem: |
| return dst_bit_size == 64 ? 80 : 40; |
| |
| case nir_op_fdiv: |
| return dst_bit_size == 64 ? 80 : 5; /* FP16 & FP32: rcp + mul */ |
| |
| case nir_op_fmod: |
| case nir_op_frem: |
| return dst_bit_size == 64 ? 80 : 8; |
| |
| default: |
| /* FP64 is usually much slower. */ |
| if ((dst_bit_size == 64 && |
| nir_op_infos[alu_op].output_type & nir_type_float) || |
| (src_bit_size == 64 && |
| nir_op_infos[alu_op].input_types[0] & nir_type_float)) |
| return 16; |
| |
      /* 1 per 32 bits of the wider of the source and destination. */
| return DIV_ROUND_UP(MAX2(dst_bit_size, src_bit_size), 32); |
| } |
| |
| case nir_instr_type_intrinsic: |
| dst_bit_size = nir_instr_as_intrinsic(instr)->def.bit_size; |
| num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32); |
| |
| /* This can only be a uniform load. Other intrinsics and variables are |
| * rejected before this is called. |
| */ |
| switch (nir_instr_as_intrinsic(instr)->intrinsic) { |
| case nir_intrinsic_load_deref: |
| /* Uniform loads can appear fast if latency hiding is effective. */ |
| return 2 * num_dst_dwords; |
| |
| default: |
| unreachable("unexpected intrinsic"); |
| } |
| |
| case nir_instr_type_deref: { |
| nir_deref_instr *deref = nir_instr_as_deref(instr); |
| |
| switch (deref->deref_type) { |
| case nir_deref_type_var: |
| case nir_deref_type_struct: |
| return 0; |
| case nir_deref_type_array: |
| /* Indexing uniforms with a divergent index has a high cost. This cost |
| * is likely only going to be accepted by the driver if the next |
| * shader doesn't run after amplification (e.g. VS->TCS, TES->GS). |
| */ |
| return nir_src_is_const(deref->arr.index) ? 0 : 128; |
| |
| default: |
| unreachable("unexpected deref type"); |
| } |
| } |
| |
| default: |
| unreachable("unexpected instr type"); |
| } |
| } |
| |
| static void |
| init_linkage(nir_shader *producer, nir_shader *consumer, bool spirv, |
| unsigned max_uniform_components, unsigned max_ubos_per_stage, |
| struct linkage_info *linkage, nir_opt_varyings_progress *progress) |
| { |
| *linkage = (struct linkage_info){ |
| .spirv = spirv, |
| .can_mix_convergent_flat_with_interpolated = |
| consumer->info.stage == MESA_SHADER_FRAGMENT && |
| consumer->options->io_options & |
| nir_io_mix_convergent_flat_with_interpolated, |
| .has_flexible_interp = |
| consumer->info.stage == MESA_SHADER_FRAGMENT && |
| consumer->options->io_options & |
| nir_io_has_flexible_input_interpolation_except_flat, |
| .always_interpolate_convergent_fs_inputs = |
| consumer->info.stage == MESA_SHADER_FRAGMENT && |
| consumer->options->io_options & |
| nir_io_always_interpolate_convergent_fs_inputs, |
| .producer_stage = producer->info.stage, |
| .consumer_stage = consumer->info.stage, |
| .producer_builder = |
| nir_builder_create(nir_shader_get_entrypoint(producer)), |
| .consumer_builder = |
| nir_builder_create(nir_shader_get_entrypoint(consumer)), |
| |
| .max_varying_expression_cost = |
| producer->options->varying_expression_max_cost ? |
| producer->options->varying_expression_max_cost(producer, consumer) : |
| producer->options->max_varying_expression_cost, |
| .varying_estimate_instr_cost = |
| producer->options->varying_estimate_instr_cost ? |
| producer->options->varying_estimate_instr_cost : |
| default_varying_estimate_instr_cost, |
| |
| .linear_mem_ctx = linear_context(ralloc_context(NULL)), |
| }; |
| |
| for (unsigned i = 0; i < ARRAY_SIZE(linkage->slot); i++) { |
| list_inithead(&linkage->slot[i].producer.loads); |
| list_inithead(&linkage->slot[i].producer.stores); |
| list_inithead(&linkage->slot[i].consumer.loads); |
| } |
| |
| /* Preparation. */ |
| nir_shader_intrinsics_pass(consumer, gather_inputs, 0, linkage); |
| nir_shader_intrinsics_pass(producer, gather_outputs, 0, linkage); |
| tidy_up_indirect_varyings(linkage); |
| determine_uniform_movability(linkage, max_uniform_components); |
| determine_ubo_movability(linkage, max_ubos_per_stage); |
| /* This must always be done because it also cleans up bitmasks. */ |
| remove_dead_varyings(linkage, progress); |
| } |
| |
| static void |
| free_linkage(struct linkage_info *linkage) |
| { |
| ralloc_free(ralloc_parent_of_linear_context(linkage->linear_mem_ctx)); |
| } |
| |
| static void |
| print_shader_linkage(nir_shader *producer, nir_shader *consumer) |
| { |
| struct linkage_info *linkage = MALLOC_STRUCT(linkage_info); |
| nir_opt_varyings_progress progress = 0; |
| |
| init_linkage(producer, consumer, false, 0, 0, linkage, &progress); |
| print_linkage(linkage); |
| free_linkage(linkage); |
| FREE(linkage); |
| } |
| |
| /** |
| * Run lots of optimizations on varyings. See the description at the beginning |
| * of this file. |
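 *
 * A minimal usage sketch (the component/UBO limits and the follow-up
 * re-optimization step are hypothetical; drivers choose their own):
 *
 *    nir_opt_varyings_progress p =
 *       nir_opt_varyings(vs, fs, false, 16, 4);
 *
 *    if (p & nir_progress_producer)
 *       reoptimize_shader(vs); // hypothetical driver helper
 *    if (p & nir_progress_consumer)
 *       reoptimize_shader(fs);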
| */ |
| nir_opt_varyings_progress |
| nir_opt_varyings(nir_shader *producer, nir_shader *consumer, bool spirv, |
| unsigned max_uniform_components, unsigned max_ubos_per_stage) |
| { |
| /* Task -> Mesh I/O uses payload variables and not varying slots, |
| * so this pass can't do anything about it. |
| */ |
| if (producer->info.stage == MESA_SHADER_TASK) |
| return 0; |
| |
| nir_opt_varyings_progress progress = 0; |
| struct linkage_info *linkage = MALLOC_STRUCT(linkage_info); |
| if (linkage == NULL) |
| return 0; |
| |
| /* Producers before a fragment shader must have up-to-date vertex |
| * divergence information. |
| */ |
| if (consumer->info.stage == MESA_SHADER_FRAGMENT) { |
| nir_vertex_divergence_analysis(producer); |
| } |
| |
| /* This also removes dead varyings. */ |
| init_linkage(producer, consumer, spirv, max_uniform_components, |
| max_ubos_per_stage, linkage, &progress); |
| |
   /* Part 1: Run optimizations that only remove varyings (though they can
    * move instructions between shaders).
| */ |
| propagate_uniform_expressions(linkage, &progress); |
| |
| /* Part 2: Deduplicate outputs. */ |
| deduplicate_outputs(linkage, &progress); |
| |
| /* Run CSE on the consumer after output deduplication because duplicated |
| * loads can prevent finding the post-dominator for inter-shader code |
| * motion. |
| */ |
| NIR_PASS(_, consumer, nir_opt_cse); |
| |
| /* Re-gather linkage info after CSE. */ |
| free_linkage(linkage); |
| init_linkage(producer, consumer, spirv, max_uniform_components, |
| max_ubos_per_stage, linkage, &progress); |
| |
| /* This must be done after deduplication and before inter-shader code |
| * motion. |
| */ |
| tidy_up_convergent_varyings(linkage); |
| find_open_coded_tes_input_interpolation(linkage); |
| |
| /* Part 3: Run optimizations that completely change varyings. */ |
| #if PRINT |
| int i = 0; |
| puts("Before:"); |
| nir_print_shader(linkage->producer_builder.shader, stdout); |
| nir_print_shader(linkage->consumer_builder.shader, stdout); |
| print_linkage(linkage); |
| puts(""); |
| #endif |
| |
| while (backward_inter_shader_code_motion(linkage, &progress)) { |
| #if PRINT |
| i++; |
| printf("Finished: %i\n", i); |
| nir_print_shader(linkage->producer_builder.shader, stdout); |
| nir_print_shader(linkage->consumer_builder.shader, stdout); |
| print_linkage(linkage); |
| puts(""); |
| #endif |
| } |
| |
| /* Part 4: Do compaction. */ |
| compact_varyings(linkage, &progress); |
| |
| nir_metadata_preserve(linkage->producer_builder.impl, |
| progress & nir_progress_producer ? |
| (nir_metadata_control_flow) : |
| nir_metadata_all); |
| nir_metadata_preserve(linkage->consumer_builder.impl, |
| progress & nir_progress_consumer ? |
| (nir_metadata_control_flow) : |
| nir_metadata_all); |
| free_linkage(linkage); |
| FREE(linkage); |
| |
| /* Compaction moves CLIP_DIST and CULL_DIST outputs to VARn if the next |
| * shader is not FS. Clear those fields in shader_info. |
| */ |
| if (consumer->info.stage <= MESA_SHADER_GEOMETRY) { |
| producer->info.clip_distance_array_size = 0; |
| producer->info.cull_distance_array_size = 0; |
| } |
| |
| if (progress & nir_progress_producer) |
| nir_validate_shader(producer, "nir_opt_varyings"); |
| if (progress & nir_progress_consumer) |
| nir_validate_shader(consumer, "nir_opt_varyings"); |
| |
| if (consumer->info.stage == MESA_SHADER_FRAGMENT) { |
| /* We have called nir_vertex_divergence_analysis on the producer here. |
| * We need to reset the divergent field to true, otherwise it will be |
| * garbage after some other passes are run, and then we end up failing |
| * assertions in some passes because src is divergent and dst isn't. |
| */ |
| nir_clear_divergence_info(producer); |
| } |
| |
| return progress; |
| } |