/*
* Copyright (C) 2020 Collabora Ltd.
* Copyright (C) 2022 Alyssa Rosenzweig <[email protected]>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors (Collabora):
* Alyssa Rosenzweig <[email protected]>
*/
#include "compiler/glsl/glsl_to_nir.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_debug.h"
#include "bifrost/disassemble.h"
#include "panfrost/lib/pan_props.h"
#include "valhall/disassemble.h"
#include "valhall/va_compiler.h"
#include "bi_builder.h"
#include "bi_quirks.h"
#include "bifrost_compile.h"
#include "bifrost_nir.h"
#include "compiler.h"
/* clang-format off */
static const struct debug_named_value bifrost_debug_options[] = {
{"msgs", BIFROST_DBG_MSGS, "Print debug messages"},
{"shaders", BIFROST_DBG_SHADERS, "Dump shaders in NIR and MIR"},
{"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"},
{"verbose", BIFROST_DBG_VERBOSE, "Disassemble verbosely"},
{"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"},
{"nosched", BIFROST_DBG_NOSCHED, "Force trivial bundling"},
{"nopsched", BIFROST_DBG_NOPSCHED, "Disable scheduling for pressure"},
{"inorder", BIFROST_DBG_INORDER, "Force in-order bundling"},
{"novalidate", BIFROST_DBG_NOVALIDATE, "Skip IR validation"},
{"noopt", BIFROST_DBG_NOOPT, "Skip optimization passes"},
{"noidvs", BIFROST_DBG_NOIDVS, "Disable IDVS"},
{"nosb", BIFROST_DBG_NOSB, "Disable scoreboarding"},
{"nopreload", BIFROST_DBG_NOPRELOAD, "Disable message preloading"},
{"spill", BIFROST_DBG_SPILL, "Test register spilling"},
DEBUG_NAMED_VALUE_END
};
/* clang-format on */
DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG",
bifrost_debug_options, 0)
/* How many bytes are prefetched by the Bifrost shader core. Starting from the
* final clause of the shader, this range must contain valid instructions or
* zeroes. */
#define BIFROST_SHADER_PREFETCH 128
int bifrost_debug = 0;
#define DBG(fmt, ...) \
do { \
if (bifrost_debug & BIFROST_DBG_MSGS) \
fprintf(stderr, "%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__); \
} while (0)
static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
static bi_index
bi_preload(bi_builder *b, unsigned reg)
{
if (bi_is_null(b->shader->preloaded[reg])) {
/* Insert at the beginning of the shader */
bi_builder b_ = *b;
b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks));
/* Cache the result */
b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg));
}
return b->shader->preloaded[reg];
}
static bi_index
bi_coverage(bi_builder *b)
{
if (bi_is_null(b->shader->coverage))
b->shader->coverage = bi_preload(b, 60);
return b->shader->coverage;
}
/*
* Vertex ID and Instance ID are preloaded registers. Where they are preloaded
* changed from Bifrost to Valhall. Provide helpers that smooth over the
* architectural difference.
*/
static inline bi_index
bi_vertex_id(bi_builder *b)
{
return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
}
static inline bi_index
bi_instance_id(bi_builder *b)
{
return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
}
static inline bi_index
bi_draw_id(bi_builder *b)
{
assert(b->shader->arch >= 9);
return bi_preload(b, 62);
}
static void
bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
{
bi_instr *branch = bi_jump(b, bi_zero());
switch (instr->type) {
case nir_jump_break:
branch->branch_target = b->shader->break_block;
break;
case nir_jump_continue:
branch->branch_target = b->shader->continue_block;
break;
default:
unreachable("Unhandled jump type");
}
bi_block_add_successor(b->shader->current_block, branch->branch_target);
b->shader->current_block->unconditional_jumps = true;
}
/* Builds a 64-bit hash table key for an index */
static uint64_t
bi_index_to_key(bi_index idx)
{
static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
uint64_t key = 0;
memcpy(&key, &idx, sizeof(idx));
return key;
}
/*
* Extract a single channel out of a vector source. We split vectors with SPLIT
* so we can use the split components directly, without emitting an extract.
* This has advantages for RA, as the split can usually be optimized away.
*/
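/* Illustrative example (not from the original sources): a 2-component UBO
* load is followed by bi_emit_cached_split_i32(), which records the SPLIT
* destinations in allocated_vec; a later bi_extract(b, vec, 1) then returns
* the cached second component directly instead of emitting a new extract.
*/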
static bi_index
bi_extract(bi_builder *b, bi_index vec, unsigned channel)
{
bi_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
bi_index_to_key(vec));
/* No extract needed for scalars.
*
* This is a bit imprecise, but actual bugs (missing splits for vectors)
* should be caught by the following assertion. It is too difficult to
* ensure bi_extract is only called for real vectors.
*/
if (components == NULL && channel == 0)
return vec;
assert(components != NULL && "missing bi_cache_collect()");
return components[channel];
}
static void
bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
{
/* Lifetime of a hash table entry has to be at least as long as the table */
bi_index *channels = ralloc_array(b->shader, bi_index, n);
memcpy(channels, s, sizeof(bi_index) * n);
_mesa_hash_table_u64_insert(b->shader->allocated_vec, bi_index_to_key(dst),
channels);
}
/*
* Splits an n-component vector (vec) into n scalar destinations (dests) using a
* split pseudo-instruction.
*
* Pre-condition: dests is filled with bi_null().
*/
static void
bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
{
/* Setup the destinations */
for (unsigned i = 0; i < n; ++i) {
dests[i] = bi_temp(b->shader);
}
/* Emit the split */
if (n == 1) {
bi_mov_i32_to(b, dests[0], vec);
} else {
bi_instr *I = bi_split_i32_to(b, n, vec);
bi_foreach_dest(I, j)
I->dest[j] = dests[j];
}
}
static void
bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
{
bi_index dests[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
bi_emit_split_i32(b, dests, vec, n);
bi_cache_collect(b, vec, dests, n);
}
/*
* Emit and cache a split for a vector of a given bitsize. The vector may not be
* composed of 32-bit words, but it will be split at 32-bit word boundaries.
*/
static void
bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
{
bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32));
}
static void
bi_split_def(bi_builder *b, nir_def *def)
{
bi_emit_cached_split(b, bi_def_index(def),
def->bit_size * def->num_components);
}
static bi_instr *
bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
{
/* Special case: COLLECT of a single value is a scalar move */
if (n == 1)
return bi_mov_i32_to(b, dst, chan[0]);
bi_instr *I = bi_collect_i32_to(b, dst, n);
bi_foreach_src(I, i)
I->src[i] = chan[i];
bi_cache_collect(b, dst, chan, n);
return I;
}
static bi_instr *
bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
{
return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1}, 2);
}
static bi_instr *
bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1,
bi_index s2)
{
return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1, s2}, 3);
}
static bi_index
bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
{
bi_index dst = bi_temp(b->shader);
bi_collect_v2i32_to(b, dst, s0, s1);
return dst;
}
static bi_index
bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
{
switch (intr->intrinsic) {
case nir_intrinsic_load_barycentric_centroid:
case nir_intrinsic_load_barycentric_sample:
return bi_preload(b, 61);
/* Need to put the sample ID in the top 16-bits */
case nir_intrinsic_load_barycentric_at_sample:
return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false),
bi_half(bi_src_index(&intr->src[0]), false));
/* Interpret as 8:8 signed fixed point positions in pixels along X and
* Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
* is the center of the pixel so we first fixup and then convert. For
* fp16 input:
*
* f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
* f2i16((256 * (x, y)) + (128, 128)) =
* V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
*
* For fp32 input, that lacks enough precision for MSAA 16x, but the
* idea is the same. FIXME: still doesn't pass
*/
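/* Worked example (illustrative): for a NIR offset of x = 0.25, the 8:8
* encoding wants (0.25 + 0.5) * 256 = 192, and FMA.v2f16(0.25, #256, #128)
* indeed yields 64 + 128 = 192 before the V2F16_TO_V2S16 conversion.
*/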
case nir_intrinsic_load_barycentric_at_offset: {
bi_index offset = bi_src_index(&intr->src[0]);
bi_index f16 = bi_null();
unsigned sz = nir_src_bit_size(intr->src[0]);
if (sz == 16) {
f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), bi_imm_f16(128.0));
} else {
assert(sz == 32);
bi_index f[2];
for (unsigned i = 0; i < 2; ++i) {
f[i] =
bi_fadd_rscale_f32(b, bi_extract(b, offset, i), bi_imm_f32(0.5),
bi_imm_u32(8), BI_SPECIAL_NONE);
}
f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
}
return bi_v2f16_to_v2s16(b, f16);
}
case nir_intrinsic_load_barycentric_pixel:
default:
return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
}
}
static enum bi_sample
bi_interp_for_intrinsic(nir_intrinsic_op op)
{
switch (op) {
case nir_intrinsic_load_barycentric_centroid:
return BI_SAMPLE_CENTROID;
case nir_intrinsic_load_barycentric_sample:
case nir_intrinsic_load_barycentric_at_sample:
return BI_SAMPLE_SAMPLE;
case nir_intrinsic_load_barycentric_at_offset:
return BI_SAMPLE_EXPLICIT;
case nir_intrinsic_load_barycentric_pixel:
default:
return BI_SAMPLE_CENTER;
}
}
/* auto, 64-bit omitted */
static enum bi_register_format
bi_reg_fmt_for_nir(nir_alu_type T)
{
switch (T) {
case nir_type_float16:
return BI_REGISTER_FORMAT_F16;
case nir_type_float32:
return BI_REGISTER_FORMAT_F32;
case nir_type_int16:
return BI_REGISTER_FORMAT_S16;
case nir_type_uint16:
return BI_REGISTER_FORMAT_U16;
case nir_type_int32:
return BI_REGISTER_FORMAT_S32;
case nir_type_uint32:
return BI_REGISTER_FORMAT_U32;
default:
unreachable("Invalid type for register format");
}
}
static bool
va_is_valid_const_narrow_index(bi_index idx)
{
if (idx.type != BI_INDEX_CONSTANT)
return false;
unsigned index = pan_res_handle_get_index(idx.value);
unsigned table_index = pan_res_handle_get_table(idx.value);
return index < 1024 && va_is_valid_const_table(table_index);
}
/* Checks if the _IMM variant of an intrinsic can be used, returning in
* *immediate the immediate to be used (which applies even if _IMM can't be
* used). */
static bool
bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate,
unsigned max)
{
nir_src *offset = nir_get_io_offset_src(instr);
if (!nir_src_is_const(*offset))
return false;
*immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
return (*immediate) < max;
}
static bool
bi_is_imm_desc_handle(bi_builder *b, nir_intrinsic_instr *instr,
uint32_t *immediate, unsigned max)
{
nir_src *offset = nir_get_io_offset_src(instr);
if (!nir_src_is_const(*offset))
return false;
if (b->shader->arch >= 9) {
uint32_t res_handle =
nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
uint32_t table_index = pan_res_handle_get_table(res_handle);
uint32_t res_index = pan_res_handle_get_index(res_handle);
if (!va_is_valid_const_table(table_index) || res_index >= max)
return false;
*immediate = res_handle;
return true;
}
return bi_is_intr_immediate(instr, immediate, max);
}
static bool
bi_is_imm_var_desc_handle(bi_builder *b, nir_intrinsic_instr *instr,
uint32_t *immediate)
{
unsigned max = b->shader->arch >= 9 ? 256 : 20;
return bi_is_imm_desc_handle(b, instr, immediate, max);
}
static void bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src,
unsigned *channel, unsigned count, unsigned bitsize);
/* Bifrost's load instructions lack a component offset despite operating in
* terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
* but they may be unavoidable with separate shaders in use. To solve this, we
* lower to a larger load and an explicit copy of the desired components. */
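/* Illustrative example: a 2-component load at component 2 is emitted as a
* 4-component load into a temporary, followed by a copy of channels 2-3 into
* the NIR destination via bi_make_vec_to().
*/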
static void
bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
{
unsigned component = nir_intrinsic_component(instr);
unsigned nr = instr->num_components;
unsigned total = nr + component;
unsigned bitsize = instr->def.bit_size;
assert(total <= 4 && "should be vec4");
bi_emit_cached_split(b, tmp, total * bitsize);
if (component == 0)
return;
bi_index srcs[] = {tmp, tmp, tmp};
unsigned channels[] = {component, component + 1, component + 2};
bi_make_vec_to(b, bi_def_index(&instr->def), srcs, channels, nr,
instr->def.bit_size);
}
static void
bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
{
bi_index vertex_id =
instr->intrinsic == nir_intrinsic_load_attribute_pan ?
bi_src_index(&instr->src[0]) :
bi_vertex_id(b);
bi_index instance_id =
instr->intrinsic == nir_intrinsic_load_attribute_pan ?
bi_src_index(&instr->src[1]) :
bi_instance_id(b);
/* Disregard the signedness of an integer, since loading 32 bits into a
* 32-bit register is bit-exact and so incurs no clamping.
*
* If we are reading as a u32, then it must be paired with an integer (u32 or
* s32) source, so use .auto32 to disregard the signedness.
*/
nir_alu_type T = nir_intrinsic_dest_type(instr);
assert(T == nir_type_uint32 || T == nir_type_int32 || T == nir_type_float32);
enum bi_register_format regfmt =
T == nir_type_float32 ? BI_REGISTER_FORMAT_F32 : BI_REGISTER_FORMAT_AUTO;
nir_src *offset = nir_get_io_offset_src(instr);
unsigned component = nir_intrinsic_component(instr);
enum bi_vecsize vecsize = (instr->num_components + component - 1);
unsigned imm_index = 0;
unsigned base = nir_intrinsic_base(instr);
bool constant = nir_src_is_const(*offset);
bool immediate = bi_is_imm_desc_handle(b, instr, &imm_index, 16);
bi_index dest =
(component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
bi_instr *I;
if (immediate) {
I = bi_ld_attr_imm_to(b, dest, vertex_id, instance_id, regfmt,
vecsize, pan_res_handle_get_index(imm_index));
if (b->shader->arch >= 9)
I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
} else {
bi_index idx = bi_src_index(&instr->src[0]);
if (constant)
idx = bi_imm_u32(imm_index);
else if (base != 0)
idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
I = bi_ld_attr_to(b, dest, vertex_id, instance_id, idx, regfmt, vecsize);
}
bi_copy_component(b, instr, dest);
}
/*
* ABI: Special (desktop GL) slots come first, tightly packed. General varyings
* come later, sparsely packed. This handles both linked and separable shaders
* with a common code path, with minimal keying only for desktop GL. Each slot
* consumes 16 bytes (TODO: fp16, partial vectors).
*/
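/* Worked example (illustrative): with a fixed_varying_mask covering two
* special slots, those specials occupy bytes 0-15 and 16-31, VAR0 starts at
* byte 32, and VAR3 sits at 16 * (2 + 3) = 80 bytes.
*/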
static unsigned
bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
uint32_t mask = ctx->inputs->fixed_varying_mask;
if (sem.location >= VARYING_SLOT_VAR0) {
unsigned nr_special = util_bitcount(mask);
unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
return 16 * (nr_special + general_index);
} else {
return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
}
}
/*
* Compute the offset in bytes of a varying with an immediate offset, adding the
* offset to the base computed above. Convenience method.
*/
static unsigned
bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
{
nir_src *src = nir_get_io_offset_src(intr);
assert(nir_src_is_const(*src) && "assumes immediate offset");
return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
}
static void
bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
{
enum bi_sample sample = BI_SAMPLE_CENTER;
enum bi_update update = BI_UPDATE_STORE;
enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
bi_index src0 = bi_null();
unsigned component = nir_intrinsic_component(instr);
enum bi_vecsize vecsize = (instr->num_components + component - 1);
bi_index dest =
(component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
unsigned sz = instr->def.bit_size;
if (smooth) {
nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
assert(parent);
sample = bi_interp_for_intrinsic(parent->intrinsic);
src0 = bi_varying_src0_for_barycentric(b, parent);
assert(sz == 16 || sz == 32);
regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 : BI_REGISTER_FORMAT_F32;
} else {
assert(sz == 32);
regfmt = BI_REGISTER_FORMAT_U32;
/* Valhall can't have bi_null() here, although the source is
* logically unused for flat varyings
*/
if (b->shader->arch >= 9)
src0 = bi_preload(b, 61);
/* Gather info as we go */
b->shader->info.bifrost->uses_flat_shading = true;
}
enum bi_source_format source_format =
smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
nir_src *offset = nir_get_io_offset_src(instr);
unsigned imm_index = 0;
bool immediate = bi_is_imm_var_desc_handle(b, instr, &imm_index);
unsigned base = nir_intrinsic_base(instr);
/* On Valhall, ensure the table and index are valid for usage with immediate
* form when IDVS isn't used */
if (b->shader->arch >= 9 && !b->shader->malloc_idvs)
immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
pan_res_handle_get_index(base) < 256;
if (b->shader->malloc_idvs && immediate) {
/* Immediate index given in bytes. */
bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
update, vecsize,
bi_varying_offset(b->shader, instr));
} else if (immediate) {
bi_instr *I;
if (smooth) {
I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
pan_res_handle_get_index(imm_index));
} else {
I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
pan_res_handle_get_index(imm_index));
}
/* Valhall usually uses machine-allocated IDVS. If this is disabled,
* use a simple Midgard-style ABI.
*/
if (b->shader->arch >= 9)
I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
} else {
bi_index idx = bi_src_index(offset);
if (b->shader->malloc_idvs) {
/* Index needs to be in bytes, but NIR gives the index
* in slots. For now assume 16 bytes per element.
*/
bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
unsigned vbase = bi_varying_base_bytes(b->shader, instr);
if (vbase != 0)
idx_bytes = bi_iadd_u32(b, idx_bytes, bi_imm_u32(vbase), false);
bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample,
source_format, update, vecsize);
} else {
if (base != 0)
idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
if (smooth)
bi_ld_var_to(b, dest, src0, idx, regfmt, sample, update, vecsize);
else
bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE, regfmt, vecsize);
}
}
bi_copy_component(b, instr, dest);
}
static bi_index
bi_make_vec8_helper(bi_builder *b, bi_index *src, unsigned *channel,
unsigned count)
{
assert(1 <= count && count <= 4);
bi_index bytes[4] = {bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0)};
for (unsigned i = 0; i < count; ++i) {
unsigned chan = channel ? channel[i] : 0;
unsigned lane = chan & 3;
bi_index raw_data = bi_extract(b, src[i], chan >> 2);
/* On Bifrost, MKVEC.v4i8 cannot select b1 or b3 */
if (b->shader->arch < 9 && lane != 0 && lane != 2) {
bytes[i] = bi_byte(bi_rshift_or(b, 32, raw_data, bi_zero(),
bi_imm_u8(lane * 8), false),
0);
} else {
bytes[i] = bi_byte(raw_data, lane);
}
assert(b->shader->arch >= 9 || bytes[i].swizzle == BI_SWIZZLE_B0000 ||
bytes[i].swizzle == BI_SWIZZLE_B2222);
}
if (b->shader->arch >= 9) {
bi_index vec = bi_zero();
if (count >= 3)
vec = bi_mkvec_v2i8(b, bytes[2], bytes[3], vec);
return bi_mkvec_v2i8(b, bytes[0], bytes[1], vec);
} else {
return bi_mkvec_v4i8(b, bytes[0], bytes[1], bytes[2], bytes[3]);
}
}
static bi_index
bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel,
unsigned count)
{
unsigned chan0 = channel ? channel[0] : 0;
bi_index w0 = bi_extract(b, src[0], chan0 >> 1);
bi_index h0 = bi_half(w0, chan0 & 1);
/* Zero extend */
if (count == 1)
return bi_mkvec_v2i16(b, h0, bi_imm_u16(0));
/* Else, create a vector */
assert(count == 2);
unsigned chan1 = channel ? channel[1] : 0;
bi_index w1 = bi_extract(b, src[1], chan1 >> 1);
bi_index h1 = bi_half(w1, chan1 & 1);
if (bi_is_word_equiv(w0, w1) && (chan0 & 1) == 0 && ((chan1 & 1) == 1))
return bi_mov_i32(b, w0);
else if (bi_is_word_equiv(w0, w1))
return bi_swz_v2i16(b, bi_swz_16(w0, chan0 & 1, chan1 & 1));
else
return bi_mkvec_v2i16(b, h0, h1);
}
static void
bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel,
unsigned count, unsigned bitsize)
{
assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
unsigned shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
unsigned chan_per_word = 1 << shift;
assert(DIV_ROUND_UP(count * bitsize, 32) <= BI_MAX_SRCS &&
"unnecessarily large vector should have been lowered");
bi_index srcs[BI_MAX_VEC];
for (unsigned i = 0; i < count; i += chan_per_word) {
unsigned rem = MIN2(count - i, chan_per_word);
unsigned *channel_offset = channel ? (channel + i) : NULL;
if (bitsize == 32)
srcs[i] = bi_extract(b, src[i], channel_offset ? *channel_offset : 0);
else if (bitsize == 16)
srcs[i >> 1] = bi_make_vec16_helper(b, src + i, channel_offset, rem);
else
srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem);
}
bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word));
}
static inline bi_instr *
bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0,
bi_index src1)
{
bi_instr *I;
if (b->shader->arch >= 9) {
I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1);
I->seg = BI_SEG_UBO;
} else {
I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0);
}
bi_emit_cached_split(b, dest0, bitsize);
return I;
}
static void
bi_load_sample_id_to(bi_builder *b, bi_index dst)
{
/* r61[16:23] contains the sample ID; shift and mask to extract it. Upper
* bits seem to read garbage (despite being architecturally defined as
* zero), so use a 5-bit mask instead of 8 bits */
bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
bi_imm_u8(16), false);
}
static bi_index
bi_load_sample_id(bi_builder *b)
{
bi_index sample_id = bi_temp(b->shader);
bi_load_sample_id_to(b, sample_id);
return sample_id;
}
static bi_index
bi_pixel_indices(bi_builder *b, unsigned rt)
{
/* We want to load the current pixel. */
struct bifrost_pixel_indices pix = {.y = BIFROST_CURRENT_PIXEL, .rt = rt};
uint32_t indices_u32 = 0;
memcpy(&indices_u32, &pix, sizeof(indices_u32));
bi_index indices = bi_imm_u32(indices_u32);
/* Sample index above is left as zero. For multisampling, we need to
* fill in the actual sample ID in the lower byte */
if (b->shader->inputs->blend.nr_samples > 1)
indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
return indices;
}
/* Source color is passed through r0-r3, or r4-r7 for the second source when
* dual-source blending. Preload the corresponding vector.
*/
static void
bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0;
unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
assert(size == 16 || size == 32);
bi_index srcs[] = {bi_preload(b, base + 0), bi_preload(b, base + 1),
bi_preload(b, base + 2), bi_preload(b, base + 3)};
bi_emit_collect_to(b, bi_def_index(&instr->def), srcs, size == 32 ? 4 : 2);
}
static void
bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, bi_index rgba2,
nir_alu_type T2, unsigned rt)
{
/* Reads 2 or 4 staging registers to cover the input */
unsigned size = nir_alu_type_get_type_size(T);
unsigned size_2 = nir_alu_type_get_type_size(T2);
unsigned sr_count = (size <= 16) ? 2 : 4;
unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4;
const struct panfrost_compile_inputs *inputs = b->shader->inputs;
uint64_t blend_desc = inputs->blend.bifrost_blend_desc;
enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
/* Workaround for NIR-to-TGSI */
if (b->shader->nir->info.fs.untyped_color_outputs)
regfmt = BI_REGISTER_FORMAT_AUTO;
if (inputs->is_blend && inputs->blend.nr_samples > 1) {
/* Conversion descriptor comes from the compile inputs, pixel
* indices derived at run time based on sample ID */
bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b),
bi_imm_u32(blend_desc >> 32), regfmt, BI_VECSIZE_V4);
} else if (b->shader->inputs->is_blend) {
uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;
/* Blend descriptor comes from the compile inputs */
/* Put the result in r0 */
bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
bi_imm_u32(blend_desc), bi_imm_u32(blend_desc >> 32),
bi_null(), regfmt, sr_count, 0);
} else {
/* Blend descriptor comes from the FAU RAM. By convention, the
* return address on Bifrost is stored in r48 and will be used
* by the blend shader to jump back to the fragment shader */
bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
bi_fau(BIR_FAU_BLEND_0 + rt, false),
bi_fau(BIR_FAU_BLEND_0 + rt, true), rgba2, regfmt, sr_count,
sr_count_2);
}
assert(rt < 8);
b->shader->info.bifrost->blend[rt].type = T;
if (T2)
b->shader->info.bifrost->blend_src1_type = T2;
}
/* Blend shaders do not need to run ATEST since they are dependent on a
* fragment shader that runs it. Blit shaders may not need to run ATEST, since
* ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
* there are no writes to the coverage mask. The latter two are satisfied for
* all blit shaders, so we just care about early-z, which blit shaders force
* iff they do not write depth or stencil */
static bool
bi_skip_atest(bi_context *ctx, bool emit_zs)
{
return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
}
static void
bi_emit_atest(bi_builder *b, bi_index alpha)
{
b->shader->coverage =
bi_atest(b, bi_coverage(b), alpha, bi_fau(BIR_FAU_ATEST_PARAM, false));
b->shader->emitted_atest = true;
}
static bi_index
bi_src_color_vec4(bi_builder *b, nir_src *src, nir_alu_type T)
{
unsigned num_components = nir_src_num_components(*src);
bi_index base = bi_src_index(src);
/* short-circuit the common case */
if (num_components == 4)
return base;
unsigned size = nir_alu_type_get_type_size(T);
assert(size == 16 || size == 32);
bi_index src_vals[4];
unsigned i;
for (i = 0; i < num_components; i++)
src_vals[i] = bi_extract(b, base, i);
for (; i < 3; i++)
src_vals[i] = (size == 16) ? bi_imm_f16(0.0) : bi_imm_f32(0.0);
src_vals[3] = (size == 16) ? bi_imm_f16(1.0) : bi_imm_f32(1.0);
bi_index temp = bi_temp(b->shader);
bi_make_vec_to(b, temp, src_vals, NULL, 4, size);
return temp;
}
static void
bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
{
bool combined = instr->intrinsic == nir_intrinsic_store_combined_output_pan;
unsigned writeout =
combined ? nir_intrinsic_component(instr) : PAN_WRITEOUT_C;
bool emit_blend = writeout & (PAN_WRITEOUT_C);
bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
unsigned loc = nir_intrinsic_io_semantics(instr).location;
bi_index src0 = bi_src_index(&instr->src[0]);
/* By ISA convention, the coverage mask is stored in R60. The store
* itself will be handled by a subsequent ATEST instruction */
if (loc == FRAG_RESULT_SAMPLE_MASK) {
b->shader->coverage = bi_extract(b, src0, 0);
return;
}
/* Emit ATEST if we have to, note ATEST requires a floating-point alpha
* value, but render target #0 might not be floating point. However the
* alpha value is only used for alpha-to-coverage, a stage which is
* skipped for pure integer framebuffers, so the issue is moot. */
if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
nir_alu_type T = nir_intrinsic_src_type(instr);
bi_index rgba = bi_src_index(&instr->src[0]);
bi_index alpha;
if (nir_src_num_components(instr->src[0]) < 4) {
/* Don't read out-of-bounds */
alpha = bi_imm_f32(1.0);
} else if (T == nir_type_float16) {
alpha = bi_half(bi_extract(b, rgba, 1), true);
} else if (T == nir_type_float32) {
alpha = bi_extract(b, rgba, 3);
} else {
alpha = bi_dontcare(b);
}
bi_emit_atest(b, alpha);
}
if (emit_zs) {
bi_index z = bi_dontcare(b), s = bi_dontcare(b);
if (writeout & PAN_WRITEOUT_Z)
z = bi_src_index(&instr->src[2]);
if (writeout & PAN_WRITEOUT_S)
s = bi_src_index(&instr->src[3]);
b->shader->coverage =
bi_zs_emit(b, z, s, bi_coverage(b), writeout & PAN_WRITEOUT_S,
writeout & PAN_WRITEOUT_Z);
}
if (emit_blend) {
unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0;
bool dual = (writeout & PAN_WRITEOUT_2);
nir_alu_type T = nir_intrinsic_src_type(instr);
nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0;
bi_index color = bi_src_color_vec4(b, &instr->src[0], T);
bi_index color2 =
dual ? bi_src_color_vec4(b, &instr->src[4], T2) : bi_null();
if (instr->intrinsic == nir_intrinsic_store_output &&
loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) {
assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");
unsigned rt_offs = nir_src_as_uint(instr->src[1]);
assert(rt + rt_offs < 8 && "RT not in the [0-7] range");
rt += rt_offs;
}
/* Explicit copy since BLEND inputs are precoloured to R0-R3.
* TODO: maybe schedule around this or implement it in RA as a
* spill */
bool has_mrt =
(b->shader->nir->info.outputs_written >> FRAG_RESULT_DATA1);
if (has_mrt) {
bi_index srcs[4] = {color, color, color, color};
unsigned channels[4] = {0, 1, 2, 3};
color = bi_temp(b->shader);
bi_make_vec_to(
b, color, srcs, channels, nir_src_num_components(instr->src[0]),
nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
}
bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), color2, T2, rt);
}
if (b->shader->inputs->is_blend) {
/* Jump back to the fragment shader, return address is stored
* in r48 (see above). On Valhall, only jump if the address is
* nonzero. The check is free there and it implements the "jump
* to 0 terminates the blend shader" that's automatic on
* Bifrost.
*/
if (b->shader->arch >= 8)
bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
else
bi_jump(b, bi_preload(b, 48));
}
}
/**
* In a vertex shader, is the specified variable a position output? These kinds
* of outputs are written from position shaders when IDVS is enabled. All other
* outputs are written from the varying shader.
*/
static bool
bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
switch (sem.location) {
case VARYING_SLOT_POS:
case VARYING_SLOT_PSIZ:
case VARYING_SLOT_LAYER:
return idvs == BI_IDVS_VARYING;
default:
return idvs == BI_IDVS_POSITION;
}
}
static bool
bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
{
enum bi_idvs_mode *idvs = data;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_store_output &&
intr->intrinsic != nir_intrinsic_store_per_view_output)
return false;
if (bi_should_remove_store(intr, *idvs)) {
nir_instr_remove(instr);
return true;
}
return false;
}
static void
bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
{
/* In principle we can do better for 16-bit. At the moment we require
* 32-bit to permit the use of .auto, in order to force .u32 for flat
* varyings, to handle internal TGSI shaders that set flat in the VS
* but smooth in the FS */
ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16));
enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
unsigned imm_index = 0;
bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
/* Only look at the total components needed. In effect, we fill in all
* the intermediate "holes" in the write mask, since we can't mask off
* stores. Since nir_lower_io_to_temporaries ensures each varying is
* written at most once, anything that's masked out is undefined, so it
* doesn't matter what we write there. So we may as well do the
* simplest thing possible. */
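/* For instance (illustrative): a write mask of 0b0101 stores
* util_last_bit(0b0101) = 3 components, with the y channel carrying whatever
* happens to be in the source vector.
*/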
unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
bi_index data = bi_src_index(&instr->src[0]);
/* To keep the vector dimensions consistent, we need to drop some
* components. This should be coalesced.
*
* TODO: This is ugly and maybe inefficient. Would we rather
* introduce a TRIM.i32 pseudoinstruction?
*/
if (nr < nir_intrinsic_src_components(instr, 0)) {
assert(T_size == 32 && "todo: 16-bit trim");
bi_index chans[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
unsigned src_comps = nir_intrinsic_src_components(instr, 0);
bi_emit_split_i32(b, chans, data, src_comps);
bi_index tmp = bi_temp(b->shader);
bi_instr *collect = bi_collect_i32_to(b, tmp, nr);
bi_foreach_src(collect, w)
collect->src[w] = chans[w];
data = tmp;
}
bool psiz =
(nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ);
bool layer =
(nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_LAYER);
bi_index a[4] = {bi_null()};
if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
/* Bifrost position shaders have a fast path */
assert(T == nir_type_float16 || T == nir_type_float32);
unsigned regfmt = (T == nir_type_float16) ? 0 : 1;
unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
unsigned snap4 = 0x5E;
uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
bi_imm_u32(format), regfmt, nr - 1);
} else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
bi_index index = bi_preload(b, 59);
unsigned index_offset = 0;
unsigned pos_attr_offset = 0;
unsigned src_bit_sz = nir_src_bit_size(instr->src[0]);
if (psiz || layer)
index_offset += 4;
if (layer) {
assert(nr == 1 && src_bit_sz == 32);
src_bit_sz = 8;
pos_attr_offset = 2;
data = bi_byte(data, 0);
}
if (psiz)
assert(T_size == 16 && "should've been lowered");
bool varying = (b->shader->idvs == BI_IDVS_VARYING);
if (instr->intrinsic == nir_intrinsic_store_per_view_output) {
unsigned view_index = nir_src_as_uint(instr->src[1]);
if (varying) {
index_offset += view_index * 4;
} else {
/* We don't patch these offsets in the no_psiz variant, so if
* multiview is enabled we can't switch to the basic format by
* using no_psiz */
bool extended_position_fifo = b->shader->nir->info.outputs_written &
(VARYING_BIT_LAYER | VARYING_BIT_PSIZ);
unsigned position_fifo_stride = extended_position_fifo ? 8 : 4;
index_offset += view_index * position_fifo_stride;
}
}
if (index_offset != 0)
index = bi_iadd_imm_i32(b, index, index_offset);
bi_index address = bi_lea_buf_imm(b, index);
bi_emit_split_i32(b, a, address, 2);
bi_store(b, nr * src_bit_sz, data, a[0], a[1],
varying ? BI_SEG_VARY : BI_SEG_POS,
varying ? bi_varying_offset(b->shader, instr) : pos_attr_offset);
} else if (immediate) {
bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b), bi_instance_id(b),
regfmt, imm_index);
bi_emit_split_i32(b, a, address, 3);
bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
} else {
bi_index idx = bi_iadd_u32(b, bi_src_index(nir_get_io_offset_src(instr)),
bi_imm_u32(nir_intrinsic_base(instr)), false);
bi_index address =
bi_lea_attr(b, bi_vertex_id(b), bi_instance_id(b), idx, regfmt);
bi_emit_split_i32(b, a, address, 3);
bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
}
}
static void
bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
{
nir_src *offset = nir_get_io_offset_src(instr);
bool offset_is_const = nir_src_is_const(*offset);
bi_index dyn_offset = bi_src_index(offset);
uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
bi_load_ubo_to(b, instr->num_components * instr->def.bit_size,
bi_def_index(&instr->def),
offset_is_const ? bi_imm_u32(const_offset) : dyn_offset,
bi_src_index(&instr->src[0]));
}
static void
bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr)
{
assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms");
nir_src *offset = &instr->src[0];
assert(!nir_intrinsic_base(instr) && "base must be zero");
assert(!nir_intrinsic_range(instr) && "range must be zero");
assert(nir_src_is_const(*offset) && "no indirect push constants");
uint32_t base = nir_src_as_uint(*offset);
assert((base & 3) == 0 && "unaligned push constants");
unsigned bits = instr->def.bit_size * instr->def.num_components;
unsigned n = DIV_ROUND_UP(bits, 32);
assert(n <= 4);
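/* Each FAU uniform slot holds 64 bits, addressed as a lo/hi pair of 32-bit
* words. Illustrative example: a 64-bit load at byte offset 20 covers words 5
* and 6, i.e. the high half of FAU slot 2 and the low half of FAU slot 3.
*/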
bi_index channels[4] = {bi_null()};
for (unsigned i = 0; i < n; ++i) {
unsigned word = (base >> 2) + i;
channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1);
}
bi_emit_collect_to(b, bi_def_index(&instr->def), channels, n);
/* Update push->count to report the highest push constant word being accessed
* by this shader.
*/
b->shader->info.push->count =
MAX2((base / 4) + n, b->shader->info.push->count);
}
static bi_index
bi_addr_high(bi_builder *b, nir_src *src)
{
return (nir_src_bit_size(*src) == 64) ? bi_extract(b, bi_src_index(src), 1)
: bi_zero();
}
static void
bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi,
enum bi_seg seg, int16_t *offset)
{
/* Not needed on Bifrost or for global accesses */
if (b->shader->arch < 9 || seg == BI_SEG_NONE)
return;
/* There is no segment modifier on Valhall. Instead, we need to
* emit the arithmetic ourselves. We do have an offset
* available, which saves an instruction for constant offsets.
*/
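/* Illustrative example: a constant shared-memory address of 0x40 becomes
* addr_lo = WLS_PTR.lo with *offset = 0x40, avoiding the IADD; a dynamic
* address instead folds in as addr_lo = IADD.u32(WLS_PTR.lo, addr_lo).
*/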
bool wls = (seg == BI_SEG_WLS);
assert(wls || (seg == BI_SEG_TL));
enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR;
bi_index base_lo = bi_fau(fau, false);
if (offset && addr_lo->type == BI_INDEX_CONSTANT &&
addr_lo->value == (int16_t)addr_lo->value) {
*offset = addr_lo->value;
*addr_lo = base_lo;
} else {
*addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false);
}
/* Do not allow overflow for WLS or TLS */
*addr_hi = bi_fau(fau, true);
}
static void
bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
{
int16_t offset = 0;
unsigned bits = instr->num_components * instr->def.bit_size;
bi_index dest = bi_def_index(&instr->def);
bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0);
bi_index addr_hi = bi_addr_high(b, &instr->src[0]);
bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset);
bi_emit_cached_split(b, dest, bits);
}
static void
bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
{
/* Require contiguous masks, guaranteed by nir_lower_wrmasks */
assert(nir_intrinsic_write_mask(instr) ==
BITFIELD_MASK(instr->num_components));
int16_t offset = 0;
bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0);
bi_index addr_hi = bi_addr_high(b, &instr->src[1]);
bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
bi_src_index(&instr->src[0]), addr_lo, addr_hi, seg, offset);
}
/* Exchanges the staging register with memory */
static void
bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg,
enum bi_seg seg)
{
assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
unsigned sz = nir_src_bit_size(*arg);
assert(sz == 32 || sz == 64);
bi_index data = bi_src_index(arg);
bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
if (b->shader->arch >= 9)
bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
else if (seg == BI_SEG_WLS)
addr_hi = bi_zero();
bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg);
}
/* Exchanges the second staging register with memory if comparison with first
* staging register passes */
static void
bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1,
nir_src *arg_2, enum bi_seg seg)
{
assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
/* hardware is swapped from NIR */
bi_index src0 = bi_src_index(arg_2);
bi_index src1 = bi_src_index(arg_1);
unsigned sz = nir_src_bit_size(*arg_1);
assert(sz == 32 || sz == 64);
bi_index data_words[] = {
bi_extract(b, src0, 0),
sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1),
/* 64-bit */
bi_extract(b, src1, 0),
sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1),
};
bi_index in = bi_temp(b->shader);
bi_emit_collect_to(b, in, data_words, 2 * (sz / 32));
bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
if (b->shader->arch >= 9)
bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
else if (seg == BI_SEG_WLS)
addr_hi = bi_zero();
bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg);
bi_emit_cached_split(b, out, sz);
bi_index inout_words[] = {bi_extract(b, out, 0),
sz == 64 ? bi_extract(b, out, 1) : bi_null()};
bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
}
static enum bi_atom_opc
bi_atom_opc_for_nir(nir_atomic_op op)
{
/* clang-format off */
switch (op) {
case nir_atomic_op_iadd: return BI_ATOM_OPC_AADD;
case nir_atomic_op_imin: return BI_ATOM_OPC_ASMIN;
case nir_atomic_op_umin: return BI_ATOM_OPC_AUMIN;
case nir_atomic_op_imax: return BI_ATOM_OPC_ASMAX;
case nir_atomic_op_umax: return BI_ATOM_OPC_AUMAX;
case nir_atomic_op_iand: return BI_ATOM_OPC_AAND;
case nir_atomic_op_ior: return BI_ATOM_OPC_AOR;
case nir_atomic_op_ixor: return BI_ATOM_OPC_AXOR;
default: unreachable("Unexpected computational atomic");
}
/* clang-format on */
}
/* Optimized unary atomics are available with an implied #1 argument */
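/* For example (illustrative): AADD with a constant #1 argument becomes AINC,
* AADD with #-1 becomes ADEC, and AOR with #1 becomes AOR1, dropping the
* staging source.
*/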
static bool
bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
{
/* Check we have a compatible constant */
if (arg.type != BI_INDEX_CONSTANT)
return false;
if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
return false;
/* Check for a compatible operation */
switch (op) {
case BI_ATOM_OPC_AADD:
*out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
return true;
case BI_ATOM_OPC_ASMAX:
*out = BI_ATOM_OPC_ASMAX1;
return true;
case BI_ATOM_OPC_AUMAX:
*out = BI_ATOM_OPC_AUMAX1;
return true;
case BI_ATOM_OPC_AOR:
*out = BI_ATOM_OPC_AOR1;
return true;
default:
return false;
}
}
/*
* Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to
* translate between these forms (with MKVEC.v2i16).
*
* Additionally on Valhall, cube maps in the attribute pipe are treated as 2D
* arrays. For uniform handling, we also treat 3D textures like 2D arrays.
*
* Our indexing needs to reflect this. Since Valhall and Bifrost are quite
* different, we provide separate functions for these.
*/
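/* Illustrative example: for a 2D array image with NIR coordinates
* (x, y, layer), the Bifrost path packs src0 = MKVEC.v2i16(x.h0, y.h0) and
* passes src1 = layer, while a 1D array passes x directly in src0 and the
* layer in src1.
*/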
static bi_index
bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
unsigned coord_comps, bool is_array, bool is_msaa)
{
assert(coord_comps > 0 && coord_comps <= 3);
/* MSAA load store should have been lowered */
assert(!is_msaa);
if (src_idx == 0) {
if (coord_comps == 1 || (coord_comps == 2 && is_array))
return bi_extract(b, coord, 0);
else
return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false),
bi_half(bi_extract(b, coord, 1), false));
} else {
if (coord_comps == 3)
return bi_extract(b, coord, 2);
else if (coord_comps == 2 && is_array)
return bi_extract(b, coord, 1);
else
return bi_zero();
}
}
static bi_index
va_emit_image_coord(bi_builder *b, bi_index coord, bi_index sample_index,
unsigned src_idx, unsigned coord_comps, bool is_array,
bool is_msaa)
{
assert(coord_comps > 0 && coord_comps <= 3);
if (src_idx == 0) {
if (coord_comps == 1 || (coord_comps == 2 && is_array))
return bi_extract(b, coord, 0);
else
return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false),
bi_half(bi_extract(b, coord, 1), false));
} else if (is_msaa) {
bi_index array_idx = bi_extract(b, sample_index, 0);
if (coord_comps == 3)
return bi_mkvec_v2i16(b, bi_half(array_idx, false),
bi_half(bi_extract(b, coord, 2), false));
else if (coord_comps == 2)
return array_idx;
} else if (coord_comps == 3 && is_array) {
return bi_mkvec_v2i16(b, bi_imm_u16(0),
bi_half(bi_extract(b, coord, 2), false));
} else if (coord_comps == 3 && !is_array) {
return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 2), false),
bi_imm_u16(0));
} else if (coord_comps == 2 && is_array) {
return bi_mkvec_v2i16(b, bi_imm_u16(0),
bi_half(bi_extract(b, coord, 1), false));
}
return bi_zero();
}
static void
bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
{
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
bool array =
nir_intrinsic_image_array(instr) || dim == GLSL_SAMPLER_DIM_CUBE;
bi_index coords = bi_src_index(&instr->src[1]);
bi_index indexvar = bi_src_index(&instr->src[2]);
bi_index xy, zw;
bool is_ms = (dim == GLSL_SAMPLER_DIM_MS);
if (b->shader->arch < 9) {
xy = bi_emit_image_coord(b, coords, 0, coord_comps, array, is_ms);
zw = bi_emit_image_coord(b, coords, 1, coord_comps, array, is_ms);
} else {
xy =
va_emit_image_coord(b, coords, indexvar, 0, coord_comps, array, is_ms);
zw =
va_emit_image_coord(b, coords, indexvar, 1, coord_comps, array, is_ms);
}
bi_index dest = bi_def_index(&instr->def);
enum bi_register_format regfmt =
bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr));
enum bi_vecsize vecsize = instr->num_components - 1;
if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
const unsigned raw_value = nir_src_as_uint(instr->src[0]);
const unsigned table_index = pan_res_handle_get_table(raw_value);
const unsigned texture_index = pan_res_handle_get_index(raw_value);
if (texture_index < 16 && va_is_valid_const_table(table_index)) {
bi_instr *I =
bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize, texture_index);
I->table = va_res_fold_table_idx(table_index);
} else {
bi_ld_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
vecsize);
}
} else if (b->shader->arch >= 9) {
bi_ld_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
vecsize);
} else {
bi_ld_attr_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
vecsize);
}
bi_split_def(b, &instr->def);
}
static void
bi_emit_lea_image_to(bi_builder *b, bi_index dest, nir_intrinsic_instr *instr)
{
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
bool array =
nir_intrinsic_image_array(instr) || dim == GLSL_SAMPLER_DIM_CUBE;
unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
enum bi_register_format type =
(instr->intrinsic == nir_intrinsic_image_store)
? bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr))
: BI_REGISTER_FORMAT_AUTO;
bi_index coords = bi_src_index(&instr->src[1]);
bi_index indices = bi_src_index(&instr->src[2]);
bi_index xy, zw;
bool is_ms = dim == GLSL_SAMPLER_DIM_MS;
if (b->shader->arch < 9) {
xy = bi_emit_image_coord(b, coords, 0, coord_comps, array, is_ms);
zw = bi_emit_image_coord(b, coords, 1, coord_comps, array, is_ms);
} else {
xy =
va_emit_image_coord(b, coords, indices, 0, coord_comps, array, is_ms);
zw =
va_emit_image_coord(b, coords, indices, 1, coord_comps, array, is_ms);
}
if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
const unsigned raw_value = nir_src_as_uint(instr->src[0]);
unsigned table_index = pan_res_handle_get_table(raw_value);
unsigned texture_index = pan_res_handle_get_index(raw_value);
if (texture_index < 16 && va_is_valid_const_table(table_index)) {
bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false, texture_index);
I->table = va_res_fold_table_idx(table_index);
} else {
bi_lea_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), false);
}
} else if (b->shader->arch >= 9) {
bi_lea_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), false);
} else {
bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw,
bi_src_index(&instr->src[0]), type);
/* LEA_ATTR_TEX defaults to the secondary attribute table, but
* our ABI has all images in the primary attribute table
*/
I->table = BI_TABLE_ATTRIBUTE_1;
}
bi_emit_cached_split(b, dest, 3 * 32);
}
static bi_index
bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
{
bi_index dest = bi_temp(b->shader);
bi_emit_lea_image_to(b, dest, instr);
return dest;
}
static void
bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
{
bi_index a[4] = {bi_null()};
bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3);
/* Due to SPIR-V limitations, the source type is not fully reliable: it
* reports uint32 even for write_imagei. This causes an incorrect
* u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32
* instead, which will match per the OpenCL spec. Of course this does
* not work for 16-bit stores, but those are not available in OpenCL.
*/
nir_alu_type T = nir_intrinsic_src_type(instr);
assert(nir_alu_type_get_type_size(T) == 32);
bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2],
BI_REGISTER_FORMAT_AUTO, instr->num_components - 1);
}
static void
bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, bi_index addr, bi_index arg,
nir_atomic_op op)
{
enum bi_atom_opc opc = bi_atom_opc_for_nir(op);
enum bi_atom_opc post_opc = opc;
bool bifrost = b->shader->arch <= 8;
/* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
* take any vector but can still output in RETURN mode */
bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst;
unsigned sr_count = bifrost ? 2 : 1;
/* Generate either ATOM or ATOM1 as required */
if (bi_promote_atom_c1(opc, arg, &opc)) {
bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0),
bi_extract(b, addr, 1), opc, sr_count);
} else {
bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0),
bi_extract(b, addr, 1), opc, sr_count);
}
if (bifrost) {
/* Post-process it */
bi_emit_cached_split_i32(b, tmp_dest, 2);
bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0),
bi_extract(b, tmp_dest, 1), post_opc);
}
}
static void
bi_emit_load_frag_coord_zw_pan(bi_builder *b, nir_intrinsic_instr *instr)
{
bi_index dst = bi_def_index(&instr->def);
unsigned channel = nir_intrinsic_component(instr);
nir_intrinsic_instr *bary = nir_src_as_intrinsic(instr->src[0]);
enum bi_sample sample = bi_interp_for_intrinsic(bary->intrinsic);
bi_index src0 = bi_varying_src0_for_barycentric(b, bary);
/* .explicit is not supported with frag_z */
if (channel == 2)
assert(sample != BI_SAMPLE_EXPLICIT);
bi_ld_var_special_to(
b, dst, src0, BI_REGISTER_FORMAT_F32, sample, BI_UPDATE_CLOBBER,
(channel == 2) ? BI_VARYING_NAME_FRAG_Z : BI_VARYING_NAME_FRAG_W,
BI_VECSIZE_NONE);
}
static void
bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
{
bi_index dest = bi_def_index(&instr->def);
nir_alu_type T = nir_intrinsic_dest_type(instr);
enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
unsigned size = instr->def.bit_size;
unsigned nr = instr->num_components;
/* Get the render target */
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
unsigned loc = sem.location;
assert(loc >= FRAG_RESULT_DATA0);
unsigned rt = (loc - FRAG_RESULT_DATA0);
bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b),
bi_src_index(&instr->src[0]), regfmt, nr - 1);
bi_emit_cached_split(b, dest, size * nr);
}
/*
* Older Bifrost hardware has a limited CLPER instruction. Add a safe helper
* that uses the hardware functionality if available and lowers otherwise.
*/
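/* Illustrative summary: with BIFROST_LIMITED_CLPER, an XOR lane op is lowered
* by XORing the current lane ID into s1 up front and using CLPER_OLD.i32;
* newer cores use CLPER.i32 with the lane op and subgroup size encoded
* directly.
*/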
static bi_index
bi_clper(bi_builder *b, bi_index s0, bi_index s1, enum bi_lane_op lop)
{
if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
if (lop == BI_LANE_OP_XOR) {
bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
s1 = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
} else {
assert(lop == BI_LANE_OP_NONE);
}
return bi_clper_old_i32(b, s0, s1);
} else {
return bi_clper_i32(b, s0, s1, BI_INACTIVE_RESULT_ZERO, lop,
BI_SUBGROUP_SUBGROUP4);
}
}
static void
bi_emit_derivative(bi_builder *b, bi_index dst, nir_intrinsic_instr *instr,
unsigned axis, bool coarse)
{
bi_index left, right;
bi_index s0 = bi_src_index(&instr->src[0]);
unsigned sz = instr->def.bit_size;
/* If all uses are fabs, the sign of the derivative doesn't matter. This is
* inherently based on fine derivatives so we can't do it for coarse.
*/
if (nir_def_all_uses_ignore_sign_bit(&instr->def) && !coarse) {
left = s0;
right = bi_clper(b, s0, bi_imm_u32(axis), BI_LANE_OP_XOR);
} else {
bi_index lane1, lane2;
if (coarse) {
lane1 = bi_imm_u32(0);
lane2 = bi_imm_u32(axis);
} else {
lane1 = bi_lshift_and_i32(b, bi_fau(BIR_FAU_LANE_ID, false),
bi_imm_u32(0x3 & ~axis), bi_imm_u8(0));
lane2 = bi_iadd_u32(b, lane1, bi_imm_u32(axis), false);
}
left = bi_clper(b, s0, lane1, BI_LANE_OP_NONE);
right = bi_clper(b, s0, lane2, BI_LANE_OP_NONE);
}
bi_fadd_to(b, sz, dst, right, bi_neg(left));
}
static void
bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
{
bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest
? bi_def_index(&instr->def)
: bi_null();
gl_shader_stage stage = b->shader->stage;
switch (instr->intrinsic) {
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_centroid:
case nir_intrinsic_load_barycentric_sample:
case nir_intrinsic_load_barycentric_at_sample:
case nir_intrinsic_load_barycentric_at_offset:
/* handled later via load_vary */
break;
case nir_intrinsic_load_attribute_pan:
assert(stage == MESA_SHADER_VERTEX);
bi_emit_load_attr(b, instr);
break;
case nir_intrinsic_load_interpolated_input:
case nir_intrinsic_load_input:
if (b->shader->inputs->is_blend)
bi_emit_load_blend_input(b, instr);
else if (stage == MESA_SHADER_FRAGMENT)
bi_emit_load_vary(b, instr);
else if (stage == MESA_SHADER_VERTEX)
bi_emit_load_attr(b, instr);
else
unreachable("Unsupported shader stage");
break;
case nir_intrinsic_store_output:
case nir_intrinsic_store_per_view_output:
if (stage == MESA_SHADER_FRAGMENT)
bi_emit_fragment_out(b, instr);
else if (stage == MESA_SHADER_VERTEX)
bi_emit_store_vary(b, instr);
else
unreachable("Unsupported shader stage");
break;
case nir_intrinsic_store_combined_output_pan:
assert(stage == MESA_SHADER_FRAGMENT);
bi_emit_fragment_out(b, instr);
break;
case nir_intrinsic_load_ubo:
bi_emit_load_ubo(b, instr);
break;
case nir_intrinsic_load_push_constant:
bi_emit_load_push_constant(b, instr);
break;
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_constant:
bi_emit_load(b, instr, BI_SEG_NONE);
break;
case nir_intrinsic_store_global:
bi_emit_store(b, instr, BI_SEG_NONE);
break;
case nir_intrinsic_load_scratch:
bi_emit_load(b, instr, BI_SEG_TL);
break;
case nir_intrinsic_store_scratch:
bi_emit_store(b, instr, BI_SEG_TL);
break;
case nir_intrinsic_load_shared:
bi_emit_load(b, instr, BI_SEG_WLS);
break;
case nir_intrinsic_store_shared:
bi_emit_store(b, instr, BI_SEG_WLS);
break;
case nir_intrinsic_barrier:
if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
assert(b->shader->stage != MESA_SHADER_FRAGMENT);
assert(nir_intrinsic_execution_scope(instr) > SCOPE_SUBGROUP &&
"todo: subgroup barriers (different divergence rules)");
bi_barrier(b);
}
/* Blob doesn't seem to do anything for memory barriers, so no need to
* check nir_intrinsic_memory_scope().
*/
break;
case nir_intrinsic_shared_atomic: {
nir_atomic_op op = nir_intrinsic_atomic_op(instr);
if (op == nir_atomic_op_xchg) {
bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
BI_SEG_WLS);
} else {
assert(nir_src_bit_size(instr->src[1]) == 32);
bi_index addr = bi_src_index(&instr->src[0]);
bi_index addr_hi;
if (b->shader->arch >= 9) {
bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL);
addr = bi_collect_v2i32(b, addr, addr_hi);
} else {
addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS);
bi_emit_cached_split(b, addr, 64);
}
bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), op);
}
bi_split_def(b, &instr->def);
break;
}
case nir_intrinsic_global_atomic: {
nir_atomic_op op = nir_intrinsic_atomic_op(instr);
if (op == nir_atomic_op_xchg) {
bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
BI_SEG_NONE);
} else {
assert(nir_src_bit_size(instr->src[1]) == 32);
bi_emit_atomic_i32_to(b, dst, bi_src_index(&instr->src[0]),
bi_src_index(&instr->src[1]), op);
}
bi_split_def(b, &instr->def);
break;
}
case nir_intrinsic_image_texel_address:
bi_emit_lea_image_to(b, dst, instr);
break;
case nir_intrinsic_image_load:
bi_emit_image_load(b, instr);
break;
case nir_intrinsic_image_store:
bi_emit_image_store(b, instr);
break;
case nir_intrinsic_global_atomic_swap:
bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
&instr->src[2], BI_SEG_NONE);
bi_split_def(b, &instr->def);
break;
case nir_intrinsic_shared_atomic_swap:
bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
&instr->src[2], BI_SEG_WLS);
bi_split_def(b, &instr->def);
break;
case nir_intrinsic_load_pixel_coord:
/* Vectorized load of the preloaded i16vec2 */
bi_mov_i32_to(b, dst, bi_preload(b, 59));
break;
case nir_intrinsic_load_frag_coord_zw_pan:
bi_emit_load_frag_coord_zw_pan(b, instr);
break;
case nir_intrinsic_load_converted_output_pan:
bi_emit_ld_tile(b, instr);
break;
case nir_intrinsic_terminate_if:
bi_discard_b32(b, bi_src_index(&instr->src[0]));
break;
case nir_intrinsic_terminate:
bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
break;
case nir_intrinsic_load_sample_positions_pan:
bi_collect_v2i32_to(b, dst, bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false),
bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
break;
case nir_intrinsic_load_sample_mask_in:
/* r61[0:15] contains the coverage bitmap */
bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
break;
case nir_intrinsic_load_sample_mask:
bi_mov_i32_to(b, dst, bi_coverage(b));
break;
case nir_intrinsic_load_sample_id:
bi_load_sample_id_to(b, dst);
break;
case nir_intrinsic_load_front_face:
/* r58 == 0 means primitive is front facing */
bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
BI_RESULT_TYPE_M1);
break;
case nir_intrinsic_load_point_coord:
bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
bi_emit_cached_split_i32(b, dst, 2);
break;
/* It appears vertex_id is zero-based with Bifrost geometry flows, but
* not with Valhall's memory-allocated IDVS geometry flow. We only support
* the new flow on Valhall, so this is lowered in NIR.
*/
case nir_intrinsic_load_vertex_id:
assert(b->shader->malloc_idvs);
bi_mov_i32_to(b, dst, bi_vertex_id(b));
break;
case nir_intrinsic_load_raw_vertex_id_pan:
assert(!b->shader->malloc_idvs);
bi_mov_i32_to(b, dst, bi_vertex_id(b));
break;
case nir_intrinsic_load_instance_id:
bi_mov_i32_to(b, dst, bi_instance_id(b));
break;
case nir_intrinsic_load_draw_id:
bi_mov_i32_to(b, dst, bi_draw_id(b));
break;
case nir_intrinsic_load_subgroup_invocation:
bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
break;
case nir_intrinsic_load_local_invocation_id:
bi_collect_v3i32_to(b, dst,
bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
break;
case nir_intrinsic_load_workgroup_id:
bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
bi_preload(b, 59));
break;
case nir_intrinsic_load_global_invocation_id:
bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
bi_preload(b, 62));
break;
case nir_intrinsic_shader_clock:
bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
bi_split_def(b, &instr->def);
break;
case nir_intrinsic_ddx:
case nir_intrinsic_ddx_fine:
bi_emit_derivative(b, dst, instr, 1, false);
break;
case nir_intrinsic_ddx_coarse:
bi_emit_derivative(b, dst, instr, 1, true);
break;
case nir_intrinsic_ddy:
case nir_intrinsic_ddy_fine:
bi_emit_derivative(b, dst, instr, 2, false);
break;
case nir_intrinsic_ddy_coarse:
bi_emit_derivative(b, dst, instr, 2, true);
break;
case nir_intrinsic_load_view_index:
case nir_intrinsic_load_layer_id:
assert(b->shader->arch >= 9);
bi_mov_i32_to(b, dst, bi_u8_to_u32(b, bi_byte(bi_preload(b, 62), 0)));
break;
case nir_intrinsic_load_ssbo_address:
assert(b->shader->arch >= 9);
bi_lea_buffer_to(b, dst, bi_src_index(&instr->src[1]),
bi_src_index(&instr->src[0]));
bi_emit_cached_split(b, dst, 64);
break;
case nir_intrinsic_load_ssbo: {
assert(b->shader->arch >= 9);
unsigned dst_bits = instr->num_components * instr->def.bit_size;
bi_ld_buffer_to(b, dst_bits, dst, bi_src_index(&instr->src[1]),
bi_src_index(&instr->src[0]));
bi_emit_cached_split(b, dst, dst_bits);
break;
}
default:
fprintf(stderr, "Unhandled intrinsic %s\n",
nir_intrinsic_infos[instr->intrinsic].name);
assert(0);
}
}
static void
bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
{
/* Make sure we've been lowered */
assert(instr->def.num_components <= (32 / instr->def.bit_size));
/* Accumulate all the channels of the constant, as if we did an
* implicit SEL over them */
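/* For example, a v2i16 constant (0x1234, 0xabcd) accumulates to 0xabcd1234 */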
uint32_t acc = 0;
for (unsigned i = 0; i < instr->def.num_components; ++i) {
unsigned v =
nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
acc |= (v << (i * instr->def.bit_size));
}
bi_mov_i32_to(b, bi_get_index(instr->def.index), bi_imm_u32(acc));
}
static bi_index
bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
{
unsigned bitsize = nir_src_bit_size(src.src);
/* The bi_index carries the 32-bit (word) offset separately from the
* subword swizzle; handle the offset first */
unsigned offset = 0;
assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
for (unsigned i = 0; i < comps; ++i) {
unsigned new_offset = (src.swizzle[i] >> subword_shift);
if (i > 0)
assert(offset == new_offset && "wrong vectorization");
offset = new_offset;
}
bi_index idx = bi_extract(b, bi_src_index(&src.src), offset);
/* Compose the subword swizzle with the existing (identity) swizzle */
assert(idx.swizzle == BI_SWIZZLE_H01);
/* Bigger vectors should have been lowered */
assert(comps <= (1 << subword_shift));
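/* For example, a 16-bit .yy source stays at word offset 0 and picks up the
* subword swizzle H11, while a .zw pair becomes word offset 1 with the
* identity swizzle H01 */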
if (bitsize == 16) {
unsigned c0 = src.swizzle[0] & 1;
unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
} else if (bitsize == 8 && comps == 1) {
idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3);
} else if (bitsize == 8) {
/* XXX: Use optimized swizzle when possible */
bi_index unoffset_srcs[NIR_MAX_VEC_COMPONENTS] = {bi_null()};
unsigned channels[NIR_MAX_VEC_COMPONENTS] = {0};
for (unsigned i = 0; i < comps; ++i) {
unoffset_srcs[i] = bi_src_index(&src.src);
channels[i] = src.swizzle[i];
}
bi_index temp = bi_temp(b->shader);
bi_make_vec_to(b, temp, unoffset_srcs, channels, comps, bitsize);
static const enum bi_swizzle swizzle_lut[] = {
BI_SWIZZLE_B0000, BI_SWIZZLE_B0011, BI_SWIZZLE_H01, BI_SWIZZLE_H01};
assert(comps - 1 < ARRAY_SIZE(swizzle_lut));
/* Assign a coherent swizzle for the vector */
temp.swizzle = swizzle_lut[comps - 1];
return temp;
}
return idx;
}
static enum bi_round
bi_nir_round(nir_op op)
{
switch (op) {
case nir_op_fround_even:
return BI_ROUND_NONE;
case nir_op_ftrunc:
return BI_ROUND_RTZ;
case nir_op_fceil:
return BI_ROUND_RTP;
case nir_op_ffloor:
return BI_ROUND_RTN;
default:
unreachable("invalid nir round op");
}
}
/* Convenience for lowered transcendentals */
static bi_index
bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
{
return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
}
/* Approximate with FRCP_APPROX.f32 and apply a single iteration of
* Newton-Raphson to improve precision */
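/* A Newton-Raphson step for a reciprocal refines an estimate x of 1/a as
* x' = x * (2 - a * x) = x + x * (1 - a * x); the FMA_RSCALE pair below
* evaluates this form with the mantissa/exponent split so the 2^e rescale
* folds into the final FMA */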
static void
bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
{
bi_index x1 = bi_frcp_approx_f32(b, s0);
bi_index m = bi_frexpm_f32(b, s0, false, false);
bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);
bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), bi_zero(),
BI_SPECIAL_N);
bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
}
static void
bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
{
bi_index x1 = bi_frsq_approx_f32(b, s0);
bi_index m = bi_frexpm_f32(b, s0, false, true);
bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);
bi_index t1 = bi_fmul_f32(b, x1, x1);
bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
bi_imm_u32(-1), BI_SPECIAL_N);
bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
}
/* More complex transcendentals, see
* https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
* for documentation */
static void
bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
{
bi_index t1 = bi_temp(b->shader);
bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
a2->clamp = BI_CLAMP_CLAMP_M1_1;
bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
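/* p3 below approximates 2^a2 - 1 with a degree-3 polynomial in a2; the
* coefficients are roughly ln(2)^3/6 (0x3d635635), ln(2)^2/2 (0x3e75fffa)
* and ln(2) (0x3f317218) */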
bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
bi_imm_u32(0x3e75fffa));
bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), p3, a1t, a1t, a1i,
BI_SPECIAL_NONE);
x->clamp = BI_CLAMP_CLAMP_0_INF;
bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
max->sem = BI_SEM_NAN_PROPAGATE;
}
static void
bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
{
/* Scale by the base, multiply by 2^24 and convert to an integer to get an
* 8:24 fixed-point input */
bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
bi_imm_u32(24), BI_SPECIAL_NONE);
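/* For instance, if s0 * log2_base is 1.5, the fixed-point input becomes
* 1.5 * 2^24 = 0x01800000 */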
bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
fixed_pt->round = BI_ROUND_NONE; // XXX
/* Compute the result for the fixed-point input, but pass along
* the floating-point scale for correct NaN propagation */
bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
}
static void
bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
{
/* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
bi_index a1 = bi_frexpm_f32(b, s0, true, false);
bi_index ei = bi_frexpe_f32(b, s0, true, false);
bi_index ef = bi_s32_to_f32(b, ei);
/* xt estimates -log(r1), a coarse approximation of log(a1) */
bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
/* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
* log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
* and then log(s0) = x1 + x2 */
bi_index x1 = bi_fadd_f32(b, ef, xt);
/* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
* polynomial approximation around 1. The series is expressed around
* 1, so set y = (a1 * r1) - 1.0 */
bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
/* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
* log_e(1 + y) by the Taylor series (lower precision than the blob):
* y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
bi_index loge =
bi_fmul_f32(b, y, bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
/* log(s0) = x1 + x2 */
bi_fadd_f32_to(b, dst, x1, x2);
}
static void
bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
{
bi_index frexp = bi_frexpe_f32(b, s0, true, false);
bi_index frexpi = bi_s32_to_f32(b, frexp);
bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
}
static void
bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
{
bi_index log2_base = bi_null();
if (base.type == BI_INDEX_CONSTANT) {
log2_base = bi_imm_f32(log2f(uif(base.value)));
} else {
log2_base = bi_temp(b->shader);
bi_lower_flog2_32(b, log2_base, base);
}
return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
}
static void
bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
{
bi_index log2_base = bi_null();
if (base.type == BI_INDEX_CONSTANT) {
log2_base = bi_imm_f32(log2f(uif(base.value)));
} else {
log2_base = bi_temp(b->shader);
bi_flog2_32(b, log2_base, base);
}
return bi_fexp_32(b, dst, exp, log2_base);
}
/* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
* FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and
* calculates the results. We use them to calculate sin/cos via a Taylor
* approximation:
*
* f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
* sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
* cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
*/
#define TWO_OVER_PI bi_imm_f32(2.0f / 3.14159f)
#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
#define SINCOS_BIAS bi_imm_u32(0x49400000)
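/* SINCOS_BIAS is 1.5 * 2^19 = 786432.0f, whose ULP is 1/16: adding it rounds
* s0 * (2/pi) to a multiple of 1/16 (i.e. the angle to a multiple of pi/32,
* the table step) and leaves that quantized value in the low mantissa bits */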
static void
bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
{
/* The bottom 6 bits of the result, times pi/32, approximate s0 mod 2pi */
bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
/* Approximate domain error (small) */
bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
MPI_OVER_TWO, s0);
/* Lookup sin(x), cos(x) */
bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
/* e^2 / 2 */
bi_index e2_over_2 =
bi_fma_rscale_f32(b, e, e, bi_negzero(), bi_imm_u32(-1), BI_SPECIAL_NONE);
/* (-e^2)/2 f''(x) */
bi_index quadratic =
bi_fma_f32(b, bi_neg(e2_over_2), cos ? cosx : sinx, bi_negzero());
/* e f'(x) - (e^2/2) f''(x) */
bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
cos ? bi_neg(sinx) : cosx, quadratic);
I->clamp = BI_CLAMP_CLAMP_M1_1;
/* f(x) + e f'(x) - (e^2/2) f''(x) */
bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
}
static enum bi_cmpf
bi_translate_cmpf(nir_op op)
{
switch (op) {
case nir_op_ieq8:
case nir_op_ieq16:
case nir_op_ieq32:
case nir_op_feq16:
case nir_op_feq32:
return BI_CMPF_EQ;
case nir_op_ine8:
case nir_op_ine16:
case nir_op_ine32:
case nir_op_fneu16:
case nir_op_fneu32:
return BI_CMPF_NE;
case nir_op_ilt8:
case nir_op_ilt16:
case nir_op_ilt32:
case nir_op_flt16:
case nir_op_flt32:
case nir_op_ult8:
case nir_op_ult16:
case nir_op_ult32:
return BI_CMPF_LT;
case nir_op_ige8:
case nir_op_ige16:
case nir_op_ige32:
case nir_op_fge16:
case nir_op_fge32:
case nir_op_uge8:
case nir_op_uge16:
case nir_op_uge32:
return BI_CMPF_GE;
default:
unreachable("invalid comparison");
}
}
static bool
bi_nir_is_replicated(nir_alu_src *src)
{
for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) {
if (src->swizzle[0] != src->swizzle[i])
return false;
}
return true;
}
static void
bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
{
bi_index dst = bi_def_index(&instr->def);
unsigned srcs = nir_op_infos[instr->op].num_inputs;
unsigned sz = instr->def.bit_size;
unsigned comps = instr->def.num_components;
unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
/* Indicate scalarness */
if (sz == 16 && comps == 1)
dst.swizzle = BI_SWIZZLE_H00;
/* First, match against the various moves in NIR. These are
* special-cased because they can operate on vectors even after
* lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
* instruction is no "bigger" than SIMD-within-a-register. These moves
* are the exceptions that need to handle swizzles specially. */
switch (instr->op) {
case nir_op_vec2:
case nir_op_vec3:
case nir_op_vec4:
case nir_op_vec8:
case nir_op_vec16: {
bi_index unoffset_srcs[16] = {bi_null()};
unsigned channels[16] = {0};
for (unsigned i = 0; i < srcs; ++i) {
unoffset_srcs[i] = bi_src_index(&instr->src[i].src);
channels[i] = instr->src[i].swizzle[0];
}
bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
return;
}
case nir_op_unpack_32_2x16: {
/* Should have been scalarized */
assert(comps == 2 && sz == 16);
bi_index vec = bi_src_index(&instr->src[0].src);
unsigned chan = instr->src[0].swizzle[0];
bi_mov_i32_to(b, dst, bi_extract(b, vec, chan));
return;
}
case nir_op_unpack_64_2x32_split_x: {
unsigned chan = (instr->src[0].swizzle[0] * 2) + 0;
bi_mov_i32_to(b, dst,
bi_extract(b, bi_src_index(&instr->src[0].src), chan));
return;
}
case nir_op_unpack_64_2x32_split_y: {
unsigned chan = (instr->src[0].swizzle[0] * 2) + 1;
bi_mov_i32_to(b, dst,
bi_extract(b, bi_src_index(&instr->src[0].src), chan));
return;
}
case nir_op_pack_64_2x32_split:
bi_collect_v2i32_to(b, dst,
bi_extract(b, bi_src_index(&instr->src[0].src),
instr->src[0].swizzle[0]),
bi_extract(b, bi_src_index(&instr->src[1].src),
instr->src[1].swizzle[0]));
return;
case nir_op_pack_64_2x32:
bi_collect_v2i32_to(b, dst,
bi_extract(b, bi_src_index(&instr->src[0].src),
instr->src[0].swizzle[0]),
bi_extract(b, bi_src_index(&instr->src[0].src),
instr->src[0].swizzle[1]));
return;
case nir_op_pack_uvec2_to_uint: {
bi_index src = bi_src_index(&instr->src[0].src);
assert(sz == 32 && src_sz == 32);
bi_mkvec_v2i16_to(
b, dst, bi_half(bi_extract(b, src, instr->src[0].swizzle[0]), false),
bi_half(bi_extract(b, src, instr->src[0].swizzle[1]), false));
return;
}
case nir_op_pack_uvec4_to_uint: {
bi_index src = bi_src_index(&instr->src[0].src);
assert(sz == 32 && src_sz == 32);
bi_index srcs[4] = {
bi_extract(b, src, instr->src[0].swizzle[0]),
bi_extract(b, src, instr->src[0].swizzle[1]),
bi_extract(b, src, instr->src[0].swizzle[2]),
bi_extract(b, src, instr->src[0].swizzle[3]),
};
unsigned channels[4] = {0};
bi_make_vec_to(b, dst, srcs, channels, 4, 8);
return;
}
case nir_op_mov: {
bi_index idx = bi_src_index(&instr->src[0].src);
bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
unsigned channels[4] = {
comps > 0 ? instr->src[0].swizzle[0] : 0,
comps > 1 ? instr->src[0].swizzle[1] : 0,
comps > 2 ? instr->src[0].swizzle[2] : 0,
comps > 3 ? instr->src[0].swizzle[3] : 0,
};
bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz);
return;
}
case nir_op_pack_32_2x16: {
assert(comps == 1);
bi_index idx = bi_src_index(&instr->src[0].src);
bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
unsigned channels[2] = {instr->src[0].swizzle[0],
instr->src[0].swizzle[1]};
bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16);
return;
}
case nir_op_f2f16:
case nir_op_f2f16_rtz:
case nir_op_f2f16_rtne: {
assert(src_sz == 32);
bi_index idx = bi_src_index(&instr->src[0].src);
bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
bi_index s1 =
comps > 1 ? bi_extract(b, idx, instr->src[0].swizzle[1]) : s0;
bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
/* Override rounding if explicitly requested. Otherwise, the
* default rounding mode is selected by the builder. Depending
* on the float controls required by the shader, the default
* mode may not be nearest-even.
*/
if (instr->op == nir_op_f2f16_rtz)
I->round = BI_ROUND_RTZ;
else if (instr->op == nir_op_f2f16_rtne)
I->round = BI_ROUND_NONE; /* Nearest even */
return;
}
/* Vectorized downcasts */
case nir_op_u2u16:
case nir_op_i2i16: {
if (!(src_sz == 32 && comps == 2))
break;
bi_index idx = bi_src_index(&instr->src[0].src);
bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]);
bi_mkvec_v2i16_to(b, dst, bi_half(s0, false), bi_half(s1, false));
return;
}
/* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to
* MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than
* scalarizing due to scheduling (equal cost on Valhall). Additionally
* if the source is replicated, the MKVEC.v2i16 can be optimized out.
*/
case nir_op_u2f16:
case nir_op_i2f16: {
if (!(src_sz == 32 && comps == 2))
break;
nir_alu_src *src = &instr->src[0];
bi_index idx = bi_src_index(&src->src);
bi_index s0 = bi_extract(b, idx, src->swizzle[0]);
bi_index s1 = bi_extract(b, idx, src->swizzle[1]);
bi_index t =
(src->swizzle[0] == src->swizzle[1])
? bi_half(s0, false)
: bi_mkvec_v2i16(b, bi_half(s0, false), bi_half(s1, false));
if (instr->op == nir_op_u2f16)
bi_v2u16_to_v2f16_to(b, dst, t);
else
bi_v2s16_to_v2f16_to(b, dst, t);
return;
}
case nir_op_i2i8:
case nir_op_u2u8: {
/* Acts like an 8-bit swizzle */
bi_index idx = bi_src_index(&instr->src[0].src);
unsigned factor = src_sz / 8;
unsigned chan[4] = {0};
for (unsigned i = 0; i < comps; ++i)
chan[i] = instr->src[0].swizzle[i] * factor;
bi_make_vec_to(b, dst, &idx, chan, comps, 8);
return;
}
case nir_op_b32csel: {
if (sz != 16)
break;
/* We allow vectorizing b32csel(cond, A, B) which can be
* translated as MUX.v2i16, even though cond is a 32-bit vector.
*
* If the source condition vector is replicated, we can use
* MUX.v2i16 directly, letting each component use the
* corresponding half of the 32-bit source. NIR uses 0/~0
* booleans so that's guaranteed to work (that is, 32-bit NIR
* booleans are 16-bit replicated).
*
* If we're not replicated, we use the same trick but must
* insert a MKVEC.v2i16 first to convert down to 16-bit.
*/
bi_index idx = bi_src_index(&instr->src[0].src);
bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
bi_index s1 = bi_alu_src_index(b, instr->src[1], comps);
bi_index s2 = bi_alu_src_index(b, instr->src[2], comps);
if (!bi_nir_is_replicated(&instr->src[0])) {
s0 = bi_mkvec_v2i16(
b, bi_half(s0, false),
bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false));
}
bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
return;
}
default:
break;
}
bi_index s0 =
srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null();
bi_index s1 =
srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null();
bi_index s2 =
srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null();
switch (instr->op) {
case nir_op_ffma:
bi_fma_to(b, sz, dst, s0, s1, s2);
break;
case nir_op_fmul:
bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
break;
case nir_op_fadd:
bi_fadd_to(b, sz, dst, s0, s1);
break;
case nir_op_fsat: {
bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
I->clamp = BI_CLAMP_CLAMP_0_1;
break;
}
case nir_op_fsat_signed: {
bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
I->clamp = BI_CLAMP_CLAMP_M1_1;
break;
}
case nir_op_fclamp_pos: {
bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
I->clamp = BI_CLAMP_CLAMP_0_INF;
break;
}
case nir_op_fneg:
bi_fabsneg_to(b, sz, dst, bi_neg(s0));
break;
case nir_op_fabs:
bi_fabsneg_to(b, sz, dst, bi_abs(s0));
break;
case nir_op_fsin:
bi_lower_fsincos_32(b, dst, s0, false);
break;
case nir_op_fcos:
bi_lower_fsincos_32(b, dst, s0, true);
break;
case nir_op_fexp2:
assert(sz == 32); /* should've been lowered */
if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
bi_lower_fexp2_32(b, dst, s0);
else
bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
break;
case nir_op_flog2:
assert(sz == 32); /* should've been lowered */
if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
bi_lower_flog2_32(b, dst, s0);
else
bi_flog2_32(b, dst, s0);
break;
case nir_op_fpow:
assert(sz == 32); /* should've been lowered */
if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
bi_lower_fpow_32(b, dst, s0, s1);
else
bi_fpow_32(b, dst, s0, s1);
break;
case nir_op_frexp_exp:
bi_frexpe_to(b, sz, dst, s0, false, false);
break;
case nir_op_frexp_sig:
bi_frexpm_to(b, sz, dst, s0, false, false);
break;
case nir_op_ldexp:
bi_ldexp_to(b, sz, dst, s0, s1);
break;
case nir_op_b8csel:
bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
break;
case nir_op_b16csel:
bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
break;
case nir_op_b32csel:
bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
break;
case nir_op_extract_u8:
case nir_op_extract_i8: {
assert(comps == 1 && "should be scalarized");
assert((src_sz == 16 || src_sz == 32) && "should be lowered");
unsigned byte = nir_alu_src_as_uint(instr->src[1]);
if (s0.swizzle == BI_SWIZZLE_H11) {
assert(byte < 2);
byte += 2;
} else if (s0.swizzle != BI_SWIZZLE_H01) {
assert(s0.swizzle == BI_SWIZZLE_H00);
}
assert(byte < 4);
s0.swizzle = BI_SWIZZLE_H01;
if (instr->op == nir_op_extract_i8)
bi_s8_to_s32_to(b, dst, bi_byte(s0, byte));
else
bi_u8_to_u32_to(b, dst, bi_byte(s0, byte));
break;
}
case nir_op_extract_u16:
case nir_op_extract_i16: {
assert(comps == 1 && "should be scalarized");
assert(src_sz == 32 && "should be lowered");
unsigned half = nir_alu_src_as_uint(instr->src[1]);
assert(half == 0 || half == 1);
if (instr->op == nir_op_extract_i16)
bi_s16_to_s32_to(b, dst, bi_half(s0, half));
else
bi_u16_to_u32_to(b, dst, bi_half(s0, half));
break;
}
case nir_op_insert_u16: {
assert(comps == 1 && "should be scalarized");
unsigned half = nir_alu_src_as_uint(instr->src[1]);
assert(half == 0 || half == 1);
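/* Offset 0 zero-extends into the low half; offset 1 builds <0, x> so the
* value lands in the high half */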
if (half == 0)
bi_u16_to_u32_to(b, dst, bi_half(s0, 0));
else
bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0));
break;
}
case nir_op_ishl:
bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
break;
case nir_op_ushr:
bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false);
break;
case nir_op_ishr:
if (b->shader->arch >= 9)
bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true);
else
bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
break;
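/* Integer min/max lower to CSEL: compare the two operands and select the
* first or the second accordingly (LT for min, GT for max) */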
case nir_op_imin:
case nir_op_umin:
bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
s1, BI_CMPF_LT);
break;
case nir_op_imax:
case nir_op_umax:
bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
s1, BI_CMPF_GT);
break;
case nir_op_f2f32:
bi_f16_to_f32_to(b, dst, s0);
break;
case nir_op_fquantize2f16: {
bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
if (b->shader->arch < 9) {
/* Bifrost has pseudo-ftz on conversions, which is lowered to an ftz
* flag in the clause header */
f16->ftz = true;
} else {
/* Valhall doesn't have clauses, and uses a separate flush
* instruction */
f16 = bi_flush_to(b, 16, bi_temp(b->shader), f16->dest[0]);
f16->ftz = true;
}
bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
if (b->shader->arch < 9)
f32->ftz = true;
break;
}
case nir_op_f2i32:
if (src_sz == 32)
bi_f32_to_s32_to(b, dst, s0);
else
bi_f16_to_s32_to(b, dst, s0);
break;
/* Note 32-bit sources => no vectorization, so 32-bit works */
case nir_op_f2u16:
if (src_sz == 32)
bi_f32_to_u32_to(b, dst, s0);
else
bi_v2f16_to_v2u16_to(b, dst, s0);
break;
case nir_op_f2i16:
if (src_sz == 32)
bi_f32_to_s32_to(b, dst, s0);
else
bi_v2f16_to_v2s16_to(b, dst, s0);
break;
case nir_op_f2u32:
if (src_sz == 32)
bi_f32_to_u32_to(b, dst, s0);
else
bi_f16_to_u32_to(b, dst, s0);
break;
case nir_op_u2f16:
if (src_sz == 32)
bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false));
else if (src_sz == 16)
bi_v2u16_to_v2f16_to(b, dst, s0);
else if (src_sz == 8)
bi_v2u8_to_v2f16_to(b, dst, s0);
break;
case nir_op_u2f32:
if (src_sz == 32)
bi_u32_to_f32_to(b, dst, s0);
else if (src_sz == 16)
bi_u16_to_f32_to(b, dst, s0);
else
bi_u8_to_f32_to(b, dst, s0);
break;
case nir_op_i2f16:
if (src_sz == 32)
bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false));
else if (src_sz == 16)
bi_v2s16_to_v2f16_to(b, dst, s0);
else if (src_sz == 8)
bi_v2s8_to_v2f16_to(b, dst, s0);
break;
case nir_op_i2f32:
assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
if (src_sz == 32)
bi_s32_to_f32_to(b, dst, s0);
else if (src_sz == 16)
bi_s16_to_f32_to(b, dst, s0);
else if (src_sz == 8)
bi_s8_to_f32_to(b, dst, s0);
break;
case nir_op_i2i32:
assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
if (src_sz == 32)
bi_mov_i32_to(b, dst, s0);
else if (src_sz == 16)
bi_s16_to_s32_to(b, dst, s0);
else if (src_sz == 8)
bi_s8_to_s32_to(b, dst, s0);
break;
case nir_op_u2u32:
assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
if (src_sz == 32)
bi_mov_i32_to(b, dst, s0);
else if (src_sz == 16)
bi_u16_to_u32_to(b, dst, s0);
else if (src_sz == 8)
bi_u8_to_u32_to(b, dst, s0);
break;
case nir_op_i2i16:
assert(src_sz == 8 || src_sz == 32);
if (src_sz == 8)
bi_v2s8_to_v2s16_to(b, dst, s0);
else
bi_mov_i32_to(b, dst, s0);
break;
case nir_op_u2u16:
assert(src_sz == 8 || src_sz == 32);
if (src_sz == 8)
bi_v2u8_to_v2u16_to(b, dst, s0);
else
bi_mov_i32_to(b, dst, s0);
break;
case nir_op_b2i8:
case nir_op_b2i16:
case nir_op_b2i32:
bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0,
BI_MUX_INT_ZERO);
break;
case nir_op_ieq8:
case nir_op_ine8:
case nir_op_ilt8:
case nir_op_ige8:
case nir_op_ieq16:
case nir_op_ine16:
case nir_op_ilt16:
case nir_op_ige16:
case nir_op_ieq32:
case nir_op_ine32:
case nir_op_ilt32:
case nir_op_ige32:
bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
BI_RESULT_TYPE_M1);
break;
case nir_op_ult8:
case nir_op_uge8:
case nir_op_ult16:
case nir_op_uge16:
case nir_op_ult32:
case nir_op_uge32:
bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1,
bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
break;
case nir_op_feq32:
case nir_op_feq16:
case nir_op_flt32:
case nir_op_flt16:
case nir_op_fge32:
case nir_op_fge16:
case nir_op_fneu32:
case nir_op_fneu16:
bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
BI_RESULT_TYPE_M1);
break;
case nir_op_fround_even:
case nir_op_fceil:
case nir_op_ffloor:
case nir_op_ftrunc:
bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
break;
case nir_op_fmin:
bi_fmin_to(b, sz, dst, s0, s1);
break;
case nir_op_fmax:
bi_fmax_to(b, sz, dst, s0, s1);
break;
case nir_op_iadd:
bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
break;
case nir_op_iadd_sat:
bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
break;
case nir_op_uadd_sat:
bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
break;
case nir_op_ihadd:
bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
break;
case nir_op_irhadd:
bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
break;
case nir_op_uhadd:
bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTN);
break;
case nir_op_urhadd:
bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTP);
break;
case nir_op_ineg:
bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
break;
case nir_op_isub:
bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
break;
case nir_op_isub_sat:
bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
break;
case nir_op_usub_sat:
bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
break;
case nir_op_imul:
bi_imul_to(b, sz, dst, s0, s1);
break;
case nir_op_iabs:
bi_iabs_to(b, sz, dst, s0);
break;
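/* Bitwise ops map onto the shifter: LSHIFT_AND/OR/XOR with a zero shift
* amount; NOT is an OR of zero with the inverted source */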
case nir_op_iand:
bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
break;
case nir_op_ior:
bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
break;
case nir_op_ixor:
bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
break;
case nir_op_inot:
bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
break;
case nir_op_frsq:
if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
bi_lower_frsq_32(b, dst, s0);
else
bi_frsq_to(b, sz, dst, s0);
break;
case nir_op_frcp:
if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
bi_lower_frcp_32(b, dst, s0);
else
bi_frcp_to(b, sz, dst, s0);
break;
case nir_op_uclz:
bi_clz_to(b, sz, dst, s0, false);
break;
case nir_op_bit_count:
assert(sz == 32 && src_sz == 32 && "should've been lowered");
bi_popcount_i32_to(b, dst, s0);
break;
case nir_op_bitfield_reverse:
assert(sz == 32 && src_sz == 32 && "should've been lowered");
bi_bitrev_i32_to(b, dst, s0);
break;
case nir_op_ufind_msb: {
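/* ufind_msb(x) = (src_sz - 1) - clz(x); assuming CLZ of zero returns the
* full bit width, this also yields the expected -1 for x == 0 */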
bi_index clz = bi_clz(b, src_sz, s0, false);
if (sz == 8)
clz = bi_byte(clz, 0);
else if (sz == 16)
clz = bi_half(clz, false);
bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
break;
}
default:
fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
unreachable("Unknown ALU op");
}
}
/* Returns the dimension, with 0 special-casing cubemaps. Shamelessly copied
* from Midgard */
static unsigned
bifrost_tex_format(enum glsl_sampler_dim dim)
{
switch (dim) {
case GLSL_SAMPLER_DIM_1D:
case GLSL_SAMPLER_DIM_BUF:
return 1;
case GLSL_SAMPLER_DIM_2D:
case GLSL_SAMPLER_DIM_MS:
case GLSL_SAMPLER_DIM_EXTERNAL:
case GLSL_SAMPLER_DIM_RECT:
case GLSL_SAMPLER_DIM_SUBPASS:
case GLSL_SAMPLER_DIM_SUBPASS_MS:
return 2;
case GLSL_SAMPLER_DIM_3D:
return 3;
case GLSL_SAMPLER_DIM_CUBE:
return 0;
default:
DBG("Unknown sampler dim type\n");
assert(0);
return 0;
}
}
static enum bi_dimension
valhall_tex_dimension(enum glsl_sampler_dim dim)
{
switch (dim) {
case GLSL_SAMPLER_DIM_1D:
case GLSL_SAMPLER_DIM_BUF:
return BI_DIMENSION_1D;
case GLSL_SAMPLER_DIM_2D:
case GLSL_SAMPLER_DIM_MS:
case GLSL_SAMPLER_DIM_EXTERNAL:
case GLSL_SAMPLER_DIM_RECT:
case GLSL_SAMPLER_DIM_SUBPASS:
case GLSL_SAMPLER_DIM_SUBPASS_MS:
return BI_DIMENSION_2D;
case GLSL_SAMPLER_DIM_3D:
return BI_DIMENSION_3D;
case GLSL_SAMPLER_DIM_CUBE:
return BI_DIMENSION_CUBE;
default:
unreachable("Unknown sampler dim type");
}
}
static enum bifrost_texture_format_full
bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
{
switch (T) {
case nir_type_float16:
return BIFROST_TEXTURE_FORMAT_F16 + clamp;
case nir_type_float32:
return BIFROST_TEXTURE_FORMAT_F32 + clamp;
case nir_type_uint16:
return BIFROST_TEXTURE_FORMAT_U16;
case nir_type_int16:
return BIFROST_TEXTURE_FORMAT_S16;
case nir_type_uint32:
return BIFROST_TEXTURE_FORMAT_U32;
case nir_type_int32:
return BIFROST_TEXTURE_FORMAT_S32;
default:
unreachable("Invalid type for texturing");
}
}
/* Array indices are specified as 32-bit uints and need converting; NIR
* supplies them in the .z component */
static bi_index
bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
{
/* For (u)int we can just pass through */
nir_alu_type base = nir_alu_type_get_base_type(T);
if (base == nir_type_int || base == nir_type_uint)
return idx;
/* Otherwise we convert */
assert(T == nir_type_float32);
/* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
* Texel Selection") defines the layer to be taken from clamp(RNE(r),
* 0, dt - 1). So we use RTE rounding; clamping is handled at the data
* structure level */
bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx);
I->round = BI_ROUND_NONE;
return I->dest[0];
}
/* TEXC's explicit and bias LOD modes require the LOD to be transformed to a
* 16-bit 8:8 fixed-point format. We lower as:
*
* F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
* MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
*/
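/* For example, an LOD of 2.5 encodes as 2.5 * 256 = 640 = 0x0280, and -3.0
* as (-768) & 0xFFFF = 0xFD00 */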
static bi_index
bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
{
/* Precompute for constant LODs to avoid general constant folding */
if (lod.type == BI_INDEX_CONSTANT) {
uint32_t raw = lod.value;
float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
return bi_imm_u32(s32 & 0xFFFF);
}
/* Sort of arbitrary. Must be less than 128.0, greater than or equal to
* the max LOD (16 since we cap at 2^16 texture dimensions), and
* preferably small to minimize precision loss */
const float max_lod = 16.0;
bi_instr *fsat =
bi_fma_f32_to(b, bi_temp(b->shader), fp16 ? bi_half(lod, false) : lod,
bi_imm_f32(1.0f / max_lod), bi_negzero());
fsat->clamp = BI_CLAMP_CLAMP_M1_1;
bi_index fmul =
bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), bi_negzero());
return bi_mkvec_v2i16(b, bi_half(bi_f32_to_s32(b, fmul), false),
bi_imm_u16(0));
}
/* FETCH takes a 32-bit staging register containing the LOD as an integer in
* the bottom 16-bits and (if present) the cube face index in the top 16-bits.
* TODO: Cube face.
*/
static bi_index
bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
{
return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
}
/* The hardware specifies texel offsets and multisample indices together as a
* u8vec4 <offset, ms index>. By default all are zero, so if we have either a
* nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
* the bits we need and return that to be passed as a staging register. Else we
* return 0 to avoid allocating a data register when everything is zero. */
static bi_index
bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
{
bi_index dest = bi_zero();
int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
bi_index idx = bi_src_index(&instr->src[offs_idx].src);
dest = bi_mkvec_v4i8(
b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
(nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
(nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0),
bi_imm_u8(0));
}
int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[ms_idx].src), dest,
bi_imm_u8(24));
}
return dest;
}
/*
* Valhall specifies texel offsets, multisample indices, and (for fetches) the
* LOD together as a u8vec4 <offset.xyz, LOD>, where the third component is
* either offset.z or the multisample index depending on context. Build
* this register.
*/
static bi_index
bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr)
{
bi_index dest = bi_zero();
int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
/* Components 0-2: offsets */
if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
bi_index idx = bi_src_index(&instr->src[offs_idx].src);
/* No multisample index with 3D */
assert((nr <= 2) || (ms_idx < 0));
/* Zero extend the Z byte so we can use it with MKVEC.v2i8 */
bi_index z = (nr > 2)
? bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0),
bi_imm_u8(0), bi_zero())
: bi_zero();
dest = bi_mkvec_v2i8(
b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
(nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), z);
}
/* Component 2: multisample index */
if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
dest = bi_mkvec_v2i16(b, dest, bi_src_index(&instr->src[ms_idx].src));
}
/* Component 3: 8-bit LOD */
if (lod_idx >= 0 &&
(!nir_src_is_const(instr->src[lod_idx].src) ||
nir_src_as_uint(instr->src[lod_idx].src) != 0) &&
nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) {
dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[lod_idx].src), dest,
bi_imm_u8(24));
}
return dest;
}
static void
bi_emit_cube_coord(bi_builder *b, bi_index coord, bi_index *face, bi_index *s,
bi_index *t)
{
/* Compute max { |x|, |y|, |z| } */
bi_index maxxyz = bi_temp(b->shader);
*face = bi_temp(b->shader);
bi_index cx = bi_extract(b, coord, 0), cy = bi_extract(b, coord, 1),
cz = bi_extract(b, coord, 2);
/* Use a pseudo op on Bifrost due to tuple restrictions */
if (b->shader->arch <= 8) {
bi_cubeface_to(b, maxxyz, *face, cx, cy, cz);
} else {
bi_cubeface1_to(b, maxxyz, cx, cy, cz);
bi_cubeface2_v9_to(b, *face, cx, cy, cz);
}
/* Select coordinates */
bi_index ssel =
bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face);
bi_index tsel =
bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2), *face);
/* The OpenGL ES specification requires us to transform an input vector
* (x, y, z) to the coordinate, given the selected S/T:
*
* (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
*
* We implement (s shown, t similar) in a form friendlier to FMA
* instructions, and clamp coordinates at the end for correct
* NaN/infinity handling:
*
* fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
*
* Take the reciprocal of max{x, y, z}
*/
bi_index rcp = bi_frcp_f32(b, maxxyz);
/* Calculate 0.5 * (1.0 / max{x, y, z}) */
bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero());
/* Transform the coordinates */
*s = bi_temp(b->shader);
*t = bi_temp(b->shader);
bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f));
bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f));
S->clamp = BI_CLAMP_CLAMP_0_1;
T->clamp = BI_CLAMP_CLAMP_0_1;
}
/* Emits a cube map descriptor, returning the lower 32 bits and putting the
* upper 32 bits in the passed pointer t. The packing of the face with the S
* coordinate exploits the redundancy of floating-point values given the range
* restriction of the CUBEFACE output.
*
* struct cube_map_descriptor {
* float s : 29;
* unsigned face : 3;
* float t : 32;
* }
*
* Since the cube face index is preshifted, this is easy to pack with a bitwise
* MUX.i32 and a fixed mask, selecting the lower 29 bits from s and the upper 3
* bits from face.
*/
static bi_index
bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
{
bi_index face, s;
bi_emit_cube_coord(b, coord, &face, &s, t);
bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
}
/* Map to the main texture op used. Some of these (txd in particular) will
* lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
* sequence). We assume that lowering is handled elsewhere.
*/
static enum bifrost_tex_op
bi_tex_op(nir_texop op)
{
switch (op) {
case nir_texop_tex:
case nir_texop_txb:
case nir_texop_txl:
case nir_texop_txd:
return BIFROST_TEX_OP_TEX;
case nir_texop_txf:
case nir_texop_txf_ms:
case nir_texop_tg4:
return BIFROST_TEX_OP_FETCH;
case nir_texop_lod:
return BIFROST_TEX_OP_GRDESC;
case nir_texop_txs:
case nir_texop_query_levels:
case nir_texop_texture_samples:
case nir_texop_samples_identical:
unreachable("should've been lowered");
default:
unreachable("unsupported tex op");
}
}
/* Data registers required by texturing in the order they appear. All are
* optional; the texture operation descriptor determines which are present.
* Note that since 3D arrays are not permitted at an API level, Z_COORD and
* ARRAY/SHADOW are exclusive, so TEXC in practice reads at most 8 registers */
enum bifrost_tex_dreg {
BIFROST_TEX_DREG_Z_COORD = 0,
BIFROST_TEX_DREG_Y_DELTAS = 1,
BIFROST_TEX_DREG_LOD = 2,
BIFROST_TEX_DREG_GRDESC_HI = 3,
BIFROST_TEX_DREG_SHADOW = 4,
BIFROST_TEX_DREG_ARRAY = 5,
BIFROST_TEX_DREG_OFFSETMS = 6,
BIFROST_TEX_DREG_SAMPLER = 7,
BIFROST_TEX_DREG_TEXTURE = 8,
BIFROST_TEX_DREG_COUNT,
};
static void
bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
{
struct bifrost_texture_operation desc = {
.op = bi_tex_op(instr->op),
.offset_or_bias_disable = false, /* TODO */
.shadow_or_clamp_disable = instr->is_shadow,
.array = instr->is_array && instr->op != nir_texop_lod,
.dimension = bifrost_tex_format(instr->sampler_dim),
.format = bi_texture_format(instr->dest_type | instr->def.bit_size,
BI_CLAMP_NONE), /* TODO */
.mask = 0xF,
};
switch (desc.op) {
case BIFROST_TEX_OP_TEX:
desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
break;
case BIFROST_TEX_OP_FETCH:
desc.lod_or_fetch = (enum bifrost_lod_mode)(
instr->op == nir_texop_tg4
? BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component
: BIFROST_TEXTURE_FETCH_TEXEL);
break;
case BIFROST_TEX_OP_GRDESC:
break;
default:
unreachable("texture op unsupported");
}
/* 32-bit indices to be allocated as consecutive staging registers */
bi_index dregs[BIFROST_TEX_DREG_COUNT] = {};
bi_index cx = bi_null(), cy = bi_null();
bi_index ddx = bi_null();
bi_index ddy = bi_null();
for (unsigned i = 0; i < instr->num_srcs; ++i) {
bi_index index = bi_src_index(&instr->src[i].src);
unsigned sz = nir_src_bit_size(instr->src[i].src);
unsigned components = nir_src_num_components(instr->src[i].src);
ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
nir_alu_type T = base | sz;
switch (instr->src[i].src_type) {
case nir_tex_src_coord:
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
cx = bi_emit_texc_cube_coord(b, index, &cy);
} else {
/* Copy XY (for 2D+) or XX (for 1D) */
cx = bi_extract(b, index, 0);
cy = bi_extract(b, index, MIN2(1, components - 1));
assert(components >= 1 && components <= 3);
if (components == 3 && !desc.array) {
/* 3D */
dregs[BIFROST_TEX_DREG_Z_COORD] = bi_extract(b, index, 2);
}
}
if (desc.array) {
dregs[BIFROST_TEX_DREG_ARRAY] = bi_emit_texc_array_index(
b, bi_extract(b, index, components - 1), T);
}
break;
case nir_tex_src_lod:
if (desc.op == BIFROST_TEX_OP_TEX &&
nir_src_is_const(instr->src[i].src) &&
nir_src_as_uint(instr->src[i].src) == 0) {
desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
} else if (desc.op == BIFROST_TEX_OP_TEX) {
assert(base == nir_type_float);
assert(sz == 16 || sz == 32);
dregs[BIFROST_TEX_DREG_LOD] =
bi_emit_texc_lod_88(b, index, sz == 16);
desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
} else {
assert(desc.op == BIFROST_TEX_OP_FETCH);
assert(base == nir_type_uint || base == nir_type_int);
assert(sz == 16 || sz == 32);
dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, index);
}
break;
case nir_tex_src_ddx:
ddx = index;
break;
case nir_tex_src_ddy:
ddy = index;
break;
case nir_tex_src_bias:
/* Upper 16-bits interpreted as a clamp, leave zero */
assert(desc.op == BIFROST_TEX_OP_TEX);
assert(base == nir_type_float);
assert(sz == 16 || sz == 32);
dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
break;
case nir_tex_src_ms_index:
case nir_tex_src_offset:
if (desc.offset_or_bias_disable)
break;
dregs[BIFROST_TEX_DREG_OFFSETMS] =
bi_emit_texc_offset_ms_index(b, instr);
if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
desc.offset_or_bias_disable = true;
break;
case nir_tex_src_comparator:
dregs[BIFROST_TEX_DREG_SHADOW] = index;
break;
case nir_tex_src_texture_offset:
dregs[BIFROST_TEX_DREG_TEXTURE] = index;
break;
case nir_tex_src_sampler_offset:
dregs[BIFROST_TEX_DREG_SAMPLER] = index;
break;
default:
unreachable("Unhandled src type in texc emit");
}
}
if (desc.op == BIFROST_TEX_OP_FETCH &&
bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, bi_zero());
}
/* Choose an index mode */
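/* Prefer fully immediate indices. Failing that, IMMEDIATE_SHARED covers
* matching immediate indices, the IMMEDIATE_SAMPLER/TEXTURE modes keep one
* index immediate with the other in a staging register, and REGISTER mode
* passes both indices in staging registers */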
bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
bool direct = direct_tex && direct_samp;
desc.immediate_indices =
direct && (instr->sampler_index < 16 && instr->texture_index < 128);
if (desc.immediate_indices) {
desc.sampler_index_or_mode = instr->sampler_index;
desc.index = instr->texture_index;
} else {
unsigned mode = 0;
if (direct && instr->sampler_index == instr->texture_index &&
instr->sampler_index < 128) {
mode = BIFROST_INDEX_IMMEDIATE_SHARED;
desc.index = instr->texture_index;
} else if (direct && instr->sampler_index < 128) {
mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
desc.index = instr->sampler_index;
dregs[BIFROST_TEX_DREG_TEXTURE] =
bi_mov_i32(b, bi_imm_u32(instr->texture_index));
} else if (direct_tex && instr->texture_index < 128) {
mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
desc.index = instr->texture_index;
if (direct_samp) {
dregs[BIFROST_TEX_DREG_SAMPLER] =
bi_mov_i32(b, bi_imm_u32(instr->sampler_index));
}
} else if (direct_samp && instr->sampler_index < 128) {
mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
desc.index = instr->sampler_index;
if (direct_tex) {
dregs[BIFROST_TEX_DREG_TEXTURE] =
bi_mov_i32(b, bi_imm_u32(instr->texture_index));
}
} else {
mode = BIFROST_INDEX_REGISTER;
if (direct_tex) {
dregs[BIFROST_TEX_DREG_TEXTURE] =
bi_mov_i32(b, bi_imm_u32(instr->texture_index));
}
if (direct_samp) {
dregs[BIFROST_TEX_DREG_SAMPLER] =
bi_mov_i32(b, bi_imm_u32(instr->sampler_index));
}
}
mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2);
desc.sampler_index_or_mode = mode;
}
if (!bi_is_null(ddx) || !bi_is_null(ddy)) {
assert(!bi_is_null(ddx) && !bi_is_null(ddy));
struct bifrost_texture_operation gropdesc = {
.sampler_index_or_mode = desc.sampler_index_or_mode,
.index = desc.index,
.immediate_indices = desc.immediate_indices,
.op = BIFROST_TEX_OP_GRDESC_DER,
.offset_or_bias_disable = true,
.shadow_or_clamp_disable = true,
.array = false,
.dimension = desc.dimension,
.format = desc.format,
.mask = desc.mask,
};
unsigned coords_comp_count =
instr->coord_components -
(instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE);
bi_index derivs[4];
unsigned sr_count = 0;
if (coords_comp_count > 2)
derivs[sr_count++] = bi_extract(b, ddx, 2);
derivs[sr_count++] = bi_extract(b, ddy, 0);
if (coords_comp_count > 1)
derivs[sr_count++] = bi_extract(b, ddy, 1);
if (coords_comp_count > 2)
derivs[sr_count++] = bi_extract(b, ddy, 2);
bi_index derivs_packed = bi_temp(b->shader);
bi_make_vec_to(b, derivs_packed, derivs, NULL, sr_count, 32);
bi_index grdesc = bi_temp(b->shader);
bi_instr *I =
bi_texc_to(b, grdesc, derivs_packed, bi_extract(b, ddx, 0),
coords_comp_count > 1 ? bi_extract(b, ddx, 1) : bi_zero(),
bi_imm_u32(gropdesc.packed), true, sr_count, 0);
I->register_format = BI_REGISTER_FORMAT_U32;
bi_emit_cached_split_i32(b, grdesc, 4);
dregs[BIFROST_TEX_DREG_LOD] = bi_extract(b, grdesc, 0);
desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
}
/* Allocate staging registers contiguously by compacting the array. */
unsigned sr_count = 0;
for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
if (!bi_is_null(dregs[i]))
dregs[sr_count++] = dregs[i];
}
unsigned res_size = instr->def.bit_size == 16 ? 2 : 4;
bi_index sr = sr_count ? bi_temp(b->shader) : bi_null();
if (sr_count)
bi_emit_collect_to(b, sr, dregs, sr_count);
if (instr->op == nir_texop_lod) {
assert(instr->def.num_components == 2 && instr->def.bit_size == 32);
bi_index res[2];
for (unsigned i = 0; i < 2; i++) {
desc.shadow_or_clamp_disable = i != 0;
bi_index grdesc = bi_temp(b->shader);
bi_instr *I = bi_texc_to(b, grdesc, sr, cx, cy,
bi_imm_u32(desc.packed), false, sr_count, 0);
I->register_format = BI_REGISTER_FORMAT_U32;
bi_emit_cached_split_i32(b, grdesc, 4);
bi_index lod = bi_s16_to_f32(b, bi_half(bi_extract(b, grdesc, 0), 0));
lod = bi_fmul_f32(b, lod, bi_imm_f32(1.0f / 256));
if (i == 0)
lod = bi_fround_f32(b, lod, BI_ROUND_NONE);
res[i] = lod;
}
bi_make_vec_to(b, bi_def_index(&instr->def), res, NULL, 2, 32);
return;
}
bi_index dst = bi_temp(b->shader);
bi_instr *I =
bi_texc_to(b, dst, sr, cx, cy, bi_imm_u32(desc.packed),
!nir_tex_instr_has_implicit_derivative(instr), sr_count, 0);
I->register_format = bi_reg_fmt_for_nir(instr->dest_type);
bi_index w[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
bi_emit_split_i32(b, w, dst, res_size);
bi_emit_collect_to(b, bi_def_index(&instr->def), w,
DIV_ROUND_UP(instr->def.num_components * res_size, 4));
}
/* Staging registers required by texturing in the order they appear (Valhall) */
enum valhall_tex_sreg {
VALHALL_TEX_SREG_X_COORD = 0,
VALHALL_TEX_SREG_Y_COORD = 1,
VALHALL_TEX_SREG_Z_COORD = 2,
VALHALL_TEX_SREG_Y_DELTAS = 3,
VALHALL_TEX_SREG_ARRAY = 4,
VALHALL_TEX_SREG_SHADOW = 5,
VALHALL_TEX_SREG_OFFSETMS = 6,
VALHALL_TEX_SREG_LOD = 7,
VALHALL_TEX_SREG_GRDESC0 = 8,
VALHALL_TEX_SREG_GRDESC1 = 9,
VALHALL_TEX_SREG_COUNT,
};
static void
bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
{
bool explicit_offset = false;
enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD;
bool has_lod_mode = (instr->op == nir_texop_tex) ||
(instr->op == nir_texop_txl) ||
(instr->op == nir_texop_txd) ||
(instr->op == nir_texop_txb);
/* 32-bit indices to be allocated as consecutive staging registers */
bi_index sregs[VALHALL_TEX_SREG_COUNT] = {};
bi_index sampler = bi_imm_u32(instr->sampler_index);
bi_index texture = bi_imm_u32(instr->texture_index);
bi_index ddx = bi_null();
bi_index ddy = bi_null();
for (unsigned i = 0; i < instr->num_srcs; ++i) {
bi_index index = bi_src_index(&instr->src[i].src);
unsigned sz = nir_src_bit_size(instr->src[i].src);
switch (instr->src[i].src_type) {
case nir_tex_src_coord: {
bool is_array = instr->is_array && instr->op != nir_texop_lod;
unsigned components = nir_tex_instr_src_size(instr, i) - is_array;
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
sregs[VALHALL_TEX_SREG_X_COORD] = bi_emit_texc_cube_coord(
b, index, &sregs[VALHALL_TEX_SREG_Y_COORD]);
} else {
assert(components >= 1 && components <= 3);
/* Copy XY (for 2D+) or XX (for 1D) */
sregs[VALHALL_TEX_SREG_X_COORD] = index;
if (components >= 2)
sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1);
if (components == 3)
sregs[VALHALL_TEX_SREG_Z_COORD] = bi_extract(b, index, 2);
}
if (is_array)
sregs[VALHALL_TEX_SREG_ARRAY] = bi_extract(b, index, components);
break;
}
case nir_tex_src_lod:
if (nir_src_is_const(instr->src[i].src) &&
nir_src_as_uint(instr->src[i].src) == 0) {
lod_mode = BI_VA_LOD_MODE_ZERO_LOD;
} else if (has_lod_mode) {
lod_mode = BI_VA_LOD_MODE_EXPLICIT;
assert(sz == 16 || sz == 32);
sregs[VALHALL_TEX_SREG_LOD] =
bi_emit_texc_lod_88(b, index, sz == 16);
}
break;
case nir_tex_src_ddx:
ddx = index;
break;
case nir_tex_src_ddy:
ddy = index;
break;
case nir_tex_src_bias:
/* Upper 16-bits interpreted as a clamp, leave zero */
assert(sz == 16 || sz == 32);
sregs[VALHALL_TEX_SREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS;
break;
case nir_tex_src_ms_index:
case nir_tex_src_offset:
/* Handled below */
break;
case nir_tex_src_comparator:
sregs[VALHALL_TEX_SREG_SHADOW] = index;
break;
case nir_tex_src_texture_offset:
/* This should always be 0 as lower_index_to_offset is expected to be
* set */
assert(instr->texture_index == 0);
texture = index;
break;
case nir_tex_src_sampler_offset:
/* This should always be 0 as lower_index_to_offset is expected to be
* set */
assert(instr->sampler_index == 0);
sampler = index;
break;
default:
unreachable("Unhandled src type in tex emit");
}
}
/* Generate packed offset + ms index + LOD register. These default to
* zero so we only need to encode if these features are actually in use.
*/
bi_index offsets = bi_emit_valhall_offsets(b, instr);
if (!bi_is_equiv(offsets, bi_zero())) {
sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets;
explicit_offset = true;
}
bool narrow_indices = va_is_valid_const_narrow_index(texture) &&
va_is_valid_const_narrow_index(sampler);
bi_index src0;
bi_index src1;
if (narrow_indices) {
unsigned tex_set =
va_res_fold_table_idx(pan_res_handle_get_table(texture.value));
unsigned sampler_set =
va_res_fold_table_idx(pan_res_handle_get_table(sampler.value));
unsigned texture_index = pan_res_handle_get_index(texture.value);
unsigned sampler_index = pan_res_handle_get_index(sampler.value);
unsigned packed_handle = (tex_set << 27) | (texture_index << 16) |
(sampler_set << 11) | sampler_index;
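/* As packed here: bits [31:27] texture table, [26:16] texture index,
* [15:11] sampler table, [10:0] sampler index */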
src0 = bi_imm_u32(packed_handle);
/* TODO: narrow offsetms */
src1 = bi_zero();
} else {
src0 = sampler;
src1 = texture;
}
enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim);
if (!bi_is_null(ddx) || !bi_is_null(ddy)) {
unsigned coords_comp_count =
instr->coord_components -
(instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE);
assert(!bi_is_null(ddx) && !bi_is_null(ddy));
lod_mode = BI_VA_LOD_MODE_GRDESC;
bi_index derivs[6] = {
bi_extract(b, ddx, 0),
bi_extract(b, ddy, 0),
coords_comp_count > 1 ? bi_extract(b, ddx, 1) : bi_null(),
coords_comp_count > 1 ? bi_extract(b, ddy, 1) : bi_null(),
coords_comp_count > 2 ? bi_extract(b, ddx, 2) : bi_null(),
coords_comp_count > 2 ? bi_extract(b, ddy, 2) : bi_null(),
};
bi_index derivs_packed = bi_temp(b->shader);
bi_make_vec_to(b, derivs_packed, derivs, NULL, coords_comp_count * 2, 32);
bi_index grdesc = bi_temp(b->shader);
bi_instr *I = bi_tex_gradient_to(b, grdesc, derivs_packed, src0, src1, dim,
!narrow_indices, 3, coords_comp_count * 2);
I->derivative_enable = true;
I->force_delta_enable = false;
I->lod_clamp_disable = true;
I->lod_bias_disable = true;
I->register_format = BI_REGISTER_FORMAT_U32;
bi_emit_cached_split_i32(b, grdesc, 2);
sregs[VALHALL_TEX_SREG_GRDESC0] = bi_extract(b, grdesc, 0);
sregs[VALHALL_TEX_SREG_GRDESC1] = bi_extract(b, grdesc, 1);
}
/* Allocate staging registers contiguously by compacting the array. */
unsigned sr_count = 0;
for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) {
if (!bi_is_null(sregs[i]))
sregs[sr_count++] = sregs[i];
}
bi_index idx = sr_count ? bi_temp(b->shader) : bi_null();
if (sr_count)
bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32);
if (instr->op == nir_texop_lod) {
assert(instr->def.num_components == 2 && instr->def.bit_size == 32);
bi_index res[2];
for (unsigned i = 0; i < 2; i++) {
bi_index grdesc = bi_temp(b->shader);
bi_instr *I = bi_tex_gradient_to(b, grdesc, idx, src0, src1, dim,
!narrow_indices, 1, sr_count);
I->derivative_enable = false;
I->force_delta_enable = true;
I->lod_clamp_disable = i != 0;
I->register_format = BI_REGISTER_FORMAT_U32;
bi_index lod = bi_s16_to_f32(b, bi_half(grdesc, 0));
lod = bi_fmul_f32(b, lod, bi_imm_f32(1.0f / 256));
if (i == 0)
lod = bi_fround_f32(b, lod, BI_ROUND_NONE);
res[i] = lod;
}
bi_make_vec_to(b, bi_def_index(&instr->def), res, NULL, 2, 32);
return;
}
/* Only write the components that we actually read */
unsigned mask = nir_def_components_read(&instr->def);
unsigned comps_per_reg = instr->def.bit_size == 16 ? 2 : 1;
unsigned res_size = DIV_ROUND_UP(util_bitcount(mask), comps_per_reg);
enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type);
bi_index dest = bi_temp(b->shader);
switch (instr->op) {
case nir_texop_tex:
case nir_texop_txb:
case nir_texop_txl:
case nir_texop_txd:
bi_tex_single_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt,
instr->is_shadow, explicit_offset, lod_mode,
!narrow_indices, mask, sr_count);
break;
case nir_texop_txf:
case nir_texop_txf_ms:
bi_tex_fetch_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt,
explicit_offset, !narrow_indices, mask, sr_count);
break;
case nir_texop_tg4:
bi_tex_gather_to(b, dest, idx, src0, src1, instr->is_array, dim,
instr->component, false, regfmt, instr->is_shadow,
explicit_offset, !narrow_indices, mask, sr_count);
break;
default:
unreachable("Unhandled Valhall texture op");
}
/* The hardware will write only what we read, and it will write into
* contiguous registers without gaps (different from Bifrost). NIR
* expects the gaps, so fill in the holes (they'll be copypropped and
* DCE'd away later).
*/
bi_index unpacked[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
bi_emit_cached_split_i32(b, dest, res_size);
/* Index into the packed component array */
unsigned j = 0;
unsigned comps[4] = {0};
unsigned nr_components = instr->def.num_components;
for (unsigned i = 0; i < nr_components; ++i) {
if (mask & BITFIELD_BIT(i)) {
unpacked[i] = dest;
comps[i] = j++;
} else {
unpacked[i] = bi_zero();
}
}
bi_make_vec_to(b, bi_def_index(&instr->def), unpacked, comps,
instr->def.num_components, instr->def.bit_size);
}
/* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
* textures with sufficiently small immediate indices. Anything else
* needs a complete texture op. */
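/* For example, a plain 2D texture fetch with texture/sampler indices below
* 8 (below 4 for cube maps) takes the TEXS fast path, while shadow, array,
* offset, or explicit non-zero LOD variants fall back to the full TEXC
* path. */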
static void
bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
{
int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
assert(coord_idx >= 0);
bi_index coords = bi_src_index(&instr->src[coord_idx].src);
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
bi_index face, s, t;
bi_emit_cube_coord(b, coords, &face, &s, &t);
bi_texs_cube_to(b, instr->def.bit_size, bi_def_index(&instr->def), s, t,
face, instr->sampler_index, instr->texture_index);
} else {
bi_texs_2d_to(b, instr->def.bit_size, bi_def_index(&instr->def),
bi_extract(b, coords, 0), bi_extract(b, coords, 1),
instr->op != nir_texop_tex, /* zero LOD */
instr->sampler_index, instr->texture_index);
}
bi_split_def(b, &instr->def);
}
static bool
bi_is_simple_tex(nir_tex_instr *instr)
{
if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
return false;
if (instr->dest_type != nir_type_float32 &&
instr->dest_type != nir_type_float16)
return false;
if (instr->is_shadow || instr->is_array)
return false;
switch (instr->sampler_dim) {
case GLSL_SAMPLER_DIM_2D:
case GLSL_SAMPLER_DIM_EXTERNAL:
case GLSL_SAMPLER_DIM_RECT:
break;
case GLSL_SAMPLER_DIM_CUBE:
/* LOD can't be specified with TEXS_CUBE */
if (instr->op == nir_texop_txl)
return false;
break;
default:
return false;
}
for (unsigned i = 0; i < instr->num_srcs; ++i) {
if (instr->src[i].src_type != nir_tex_src_lod &&
instr->src[i].src_type != nir_tex_src_coord)
return false;
}
/* Indices need to fit in provided bits */
unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
return false;
int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
if (lod_idx < 0)
return true;
nir_src lod = instr->src[lod_idx].src;
return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
}
static void
bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
{
/* If txf is used, we assume there is a valid sampler bound at index 0. Use
* it for txf operations, since there may be no other valid samplers. This is
* a workaround: txf does not require a sampler in NIR (so sampler_index is
* undefined) but we need one in the hardware. This is ABI with the driver.
*
* On Valhall, as the descriptor table is encoded in the index, this should
* be handled by the driver.
*/
if (!nir_tex_instr_need_sampler(instr) && b->shader->arch < 9)
instr->sampler_index = 0;
if (b->shader->arch >= 9)
bi_emit_tex_valhall(b, instr);
else if (bi_is_simple_tex(instr))
bi_emit_texs(b, instr);
else
bi_emit_texc(b, instr);
}
static void
bi_emit_phi(bi_builder *b, nir_phi_instr *instr)
{
unsigned nr_srcs = exec_list_length(&instr->srcs);
bi_instr *I = bi_phi_to(b, bi_def_index(&instr->def), nr_srcs);
/* Deferred */
I->phi = instr;
}
/* Look up the bi_block corresponding to a given NIR block. Used when
* translating phi nodes after emitting all blocks.
*/
static bi_block *
bi_from_nir_block(bi_context *ctx, nir_block *block)
{
return ctx->indexed_nir_blocks[block->index];
}
static void
bi_emit_phi_deferred(bi_context *ctx, bi_block *block, bi_instr *I)
{
nir_phi_instr *phi = I->phi;
/* Guaranteed by lower_phis_to_scalar */
assert(phi->def.num_components == 1);
nir_foreach_phi_src(src, phi) {
bi_block *pred = bi_from_nir_block(ctx, src->pred);
unsigned i = bi_predecessor_index(block, pred);
assert(i < I->nr_srcs);
I->src[i] = bi_src_index(&src->src);
}
I->phi = NULL;
}
static void
bi_emit_phis_deferred(bi_context *ctx)
{
bi_foreach_block(ctx, block) {
bi_foreach_instr_in_block(block, I) {
if (I->op == BI_OPCODE_PHI)
bi_emit_phi_deferred(ctx, block, I);
}
}
}
static void
bi_emit_instr(bi_builder *b, struct nir_instr *instr)
{
switch (instr->type) {
case nir_instr_type_load_const:
bi_emit_load_const(b, nir_instr_as_load_const(instr));
break;
case nir_instr_type_intrinsic:
bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
break;
case nir_instr_type_alu:
bi_emit_alu(b, nir_instr_as_alu(instr));
break;
case nir_instr_type_tex:
bi_emit_tex(b, nir_instr_as_tex(instr));
break;
case nir_instr_type_jump:
bi_emit_jump(b, nir_instr_as_jump(instr));
break;
case nir_instr_type_phi:
bi_emit_phi(b, nir_instr_as_phi(instr));
break;
default:
unreachable("should've been lowered");
}
}
static bi_block *
create_empty_block(bi_context *ctx)
{
bi_block *blk = rzalloc(ctx, bi_block);
util_dynarray_init(&blk->predecessors, blk);
return blk;
}
static bi_block *
emit_block(bi_context *ctx, nir_block *block)
{
if (ctx->after_block) {
ctx->current_block = ctx->after_block;
ctx->after_block = NULL;
} else {
ctx->current_block = create_empty_block(ctx);
}
list_addtail(&ctx->current_block->link, &ctx->blocks);
list_inithead(&ctx->current_block->instructions);
bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
ctx->indexed_nir_blocks[block->index] = ctx->current_block;
nir_foreach_instr(instr, block) {
bi_emit_instr(&_b, instr);
}
return ctx->current_block;
}
static void
emit_if(bi_context *ctx, nir_if *nif)
{
bi_block *before_block = ctx->current_block;
/* Speculatively emit the branch, but we can't fill it in until later */
bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
bi_instr *then_branch =
bi_branchz_i16(&_b, bi_half(bi_src_index(&nif->condition), false),
bi_zero(), BI_CMPF_EQ);
/* Emit the two subblocks. */
bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
bi_block *end_then_block = ctx->current_block;
/* Emit second block */
bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
bi_block *end_else_block = ctx->current_block;
ctx->after_block = create_empty_block(ctx);
/* Now that we have the subblocks emitted, fix up the branches */
assert(then_block);
assert(else_block);
then_branch->branch_target = else_block;
/* Emit a jump from the end of the then block to the end of the else */
_b.cursor = bi_after_block(end_then_block);
bi_instr *then_exit = bi_jump(&_b, bi_zero());
then_exit->branch_target = ctx->after_block;
bi_block_add_successor(end_then_block, then_exit->branch_target);
bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
bi_block_add_successor(before_block,
then_branch->branch_target); /* then_branch */
bi_block_add_successor(before_block, then_block); /* fallthrough */
}
static void
emit_loop(bi_context *ctx, nir_loop *nloop)
{
assert(!nir_loop_has_continue_construct(nloop));
/* Remember where we are */
bi_block *start_block = ctx->current_block;
bi_block *saved_break = ctx->break_block;
bi_block *saved_continue = ctx->continue_block;
ctx->continue_block = create_empty_block(ctx);
ctx->break_block = create_empty_block(ctx);
ctx->after_block = ctx->continue_block;
ctx->after_block->loop_header = true;
/* Emit the body itself */
emit_cf_list(ctx, &nloop->body);
/* Branch back to the loop header */
bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
bi_instr *I = bi_jump(&_b, bi_zero());
I->branch_target = ctx->continue_block;
bi_block_add_successor(start_block, ctx->continue_block);
bi_block_add_successor(ctx->current_block, ctx->continue_block);
ctx->after_block = ctx->break_block;
/* Pop off */
ctx->break_block = saved_break;
ctx->continue_block = saved_continue;
++ctx->loop_count;
}
static bi_block *
emit_cf_list(bi_context *ctx, struct exec_list *list)
{
bi_block *start_block = NULL;
foreach_list_typed(nir_cf_node, node, node, list) {
switch (node->type) {
case nir_cf_node_block: {
bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
if (!start_block)
start_block = block;
break;
}
case nir_cf_node_if:
emit_if(ctx, nir_cf_node_as_if(node));
break;
case nir_cf_node_loop:
emit_loop(ctx, nir_cf_node_as_loop(node));
break;
default:
unreachable("Unknown control flow");
}
}
return start_block;
}
/* shader-db stuff */
struct bi_stats {
unsigned nr_clauses, nr_tuples, nr_ins;
unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
};
static void
bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
{
/* Count instructions */
stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
/* Non-message passing tuples are always arithmetic */
if (tuple->add != clause->message) {
stats->nr_arith++;
return;
}
/* A tuple with both a message and an FMA counts as arithmetic _and_ message */
if (tuple->fma)
stats->nr_arith++;
switch (clause->message_type) {
case BIFROST_MESSAGE_VARYING:
/* Check components interpolated */
stats->nr_varying +=
(clause->message->vecsize + 1) *
(bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
break;
case BIFROST_MESSAGE_VARTEX:
/* 2 coordinates, fp32 each */
stats->nr_varying += (2 * 2);
FALLTHROUGH;
case BIFROST_MESSAGE_TEX:
stats->nr_texture++;
break;
case BIFROST_MESSAGE_ATTRIBUTE:
case BIFROST_MESSAGE_LOAD:
case BIFROST_MESSAGE_STORE:
case BIFROST_MESSAGE_ATOMIC:
stats->nr_ldst++;
break;
case BIFROST_MESSAGE_NONE:
case BIFROST_MESSAGE_BARRIER:
case BIFROST_MESSAGE_BLEND:
case BIFROST_MESSAGE_TILE:
case BIFROST_MESSAGE_Z_STENCIL:
case BIFROST_MESSAGE_ATEST:
case BIFROST_MESSAGE_JOB:
case BIFROST_MESSAGE_64BIT:
/* Nothing to do */
break;
};
}
/*
* v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the
* shader completes. These costs are not accounted for in the general cycle
* counts, so this function calculates the effective cost of these messages, as
* if they were executed by shader code.
*/
static unsigned
bi_count_preload_cost(bi_context *ctx)
{
/* Units: 1/16 of a normalized cycle, assuming that we may interpolate
* 16 fp16 varying components per cycle or fetch two texels per cycle.
*/
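/* For example, a preloaded VAR_TEX message costs 12 units (3/4 of a
* cycle): 2 coordinates x 2 half-words of interpolation plus 8 units for
* the texel fetch. A preloaded fp32 vec4 LD_VAR costs 4 x 2 = 8 units,
* i.e. half a cycle. */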
unsigned cost = 0;
for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) {
struct bifrost_message_preload msg = ctx->info.bifrost->messages[i];
if (msg.enabled && msg.texture) {
/* 2 coordinates, 2 half-words each, plus the texture fetch */
cost += 12;
} else if (msg.enabled) {
cost += (msg.num_components * (msg.fp16 ? 1 : 2));
}
}
return cost;
}
static const char *
bi_shader_stage_name(bi_context *ctx)
{
if (ctx->idvs == BI_IDVS_VARYING)
return "MESA_SHADER_VARYING";
else if (ctx->idvs == BI_IDVS_POSITION)
return "MESA_SHADER_POSITION";
else if (ctx->inputs->is_blend)
return "MESA_SHADER_BLEND";
else
return gl_shader_stage_name(ctx->stage);
}
static char *
bi_print_stats(bi_context *ctx, unsigned size)
{
struct bi_stats stats = {0};
/* Count instructions, clauses, and tuples. Also attempt to construct
* normalized execution engine cycle counts, using the following ratio:
*
* 24 arith tuples/cycle
* 2 texture messages/cycle
* 16 x 16-bit varying channels interpolated/cycle
* 1 load store message/cycle
*
* These numbers seem to match Arm Mobile Studio's heuristic. The real
* cycle counts are surely more complicated.
*/
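/* Worked example: 48 arithmetic tuples, 2 texture messages, 16 half-word
* varying channels, and 1 load/store message give
* max(48/24, max(2/2, 16/16, 1/1)) = 2.0 estimated cycles, i.e. an
* arithmetic-bound shader. */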
bi_foreach_block(ctx, block) {
bi_foreach_clause_in_block(block, clause) {
stats.nr_clauses++;
stats.nr_tuples += clause->tuple_count;
for (unsigned i = 0; i < clause->tuple_count; ++i)
bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
}
}
float cycles_arith = ((float)stats.nr_arith) / 24.0;
float cycles_texture = ((float)stats.nr_texture) / 2.0;
float cycles_varying = ((float)stats.nr_varying) / 16.0;
float cycles_ldst = ((float)stats.nr_ldst) / 1.0;
float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
float cycles_bound = MAX2(cycles_arith, cycles_message);
/* Thread count and register pressure are traded off only on v7 */
bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
unsigned nr_threads = full_threads ? 2 : 1;
/* Dump stats */
char *str = ralloc_asprintf(
NULL,
"%s shader: "
"%u inst, %u tuples, %u clauses, "
"%f cycles, %f arith, %f texture, %f vary, %f ldst, "
"%u quadwords, %u threads",
bi_shader_stage_name(ctx), stats.nr_ins, stats.nr_tuples,
stats.nr_clauses, cycles_bound, cycles_arith, cycles_texture,
cycles_varying, cycles_ldst, size / 16, nr_threads);
if (ctx->arch == 7) {
ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx));
}
ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills",
ctx->loop_count, ctx->spills, ctx->fills);
return str;
}
static char *
va_print_stats(bi_context *ctx, unsigned size)
{
unsigned nr_ins = 0;
struct va_stats stats = {0};
/* Count instructions */
bi_foreach_instr_global(ctx, I) {
nr_ins++;
va_count_instr_stats(I, &stats);
}
/* Mali G78 peak performance:
*
* 64 FMA instructions per cycle
* 64 CVT instructions per cycle
* 16 SFU instructions per cycle
* 8 x 32-bit varying channels interpolated per cycle
* 4 texture instructions per cycle
* 1 load/store operation per cycle
*/
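/* Worked example: 128 FMA, 16 CVT, 8 SFU, 2 texture, and 1 load/store
* instruction (with no varying interpolation) give
* max(128/64, 16/64, 8/16, 2/4, 1/1) = 2.0 cycles, i.e. an FMA-bound
* shader. */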
float cycles_fma = ((float)stats.fma) / 64.0;
float cycles_cvt = ((float)stats.cvt) / 64.0;
float cycles_sfu = ((float)stats.sfu) / 16.0;
float cycles_v = ((float)stats.v) / 16.0;
float cycles_t = ((float)stats.t) / 4.0;
float cycles_ls = ((float)stats.ls) / 1.0;
/* Calculate the bound */
float cycles = MAX2(MAX3(cycles_fma, cycles_cvt, cycles_sfu),
MAX3(cycles_v, cycles_t, cycles_ls));
/* Thread count and register pressure are traded off */
unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1;
/* Dump stats */
return ralloc_asprintf(NULL,
"%s shader: "
"%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, "
"%f t, %f ls, %u quadwords, %u threads, %u loops, "
"%u:%u spills:fills",
bi_shader_stage_name(ctx), nr_ins, cycles, cycles_fma,
cycles_cvt, cycles_sfu, cycles_v, cycles_t, cycles_ls,
size / 16, nr_threads, ctx->loop_count, ctx->spills,
ctx->fills);
}
static int
glsl_type_size(const struct glsl_type *type, bool bindless)
{
return glsl_count_attribute_slots(type, false);
}
/* Split stores to memory. We don't split stores to vertex outputs, since
* nir_lower_io_to_temporaries will ensure there's only a single write.
*/
static bool
should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
{
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
switch (intr->intrinsic) {
case nir_intrinsic_store_ssbo:
case nir_intrinsic_store_shared:
case nir_intrinsic_store_global:
case nir_intrinsic_store_scratch:
return true;
default:
return false;
}
}
/*
* Some operations are only available as 32-bit instructions. 64-bit floats are
* unsupported and ints are lowered with nir_lower_int64. Certain 8-bit and
* 16-bit instructions, however, are lowered here.
*/
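/* For example, a 16-bit fsin is lowered by nir_lower_bit_size to run at 32
* bits: the source is converted up, the op executes at 32-bit, and the
* result is converted back down to 16-bit. */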
static unsigned
bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
{
if (instr->type != nir_instr_type_alu)
return 0;
nir_alu_instr *alu = nir_instr_as_alu(instr);
switch (alu->op) {
case nir_op_fexp2:
case nir_op_flog2:
case nir_op_fpow:
case nir_op_fsin:
case nir_op_fcos:
case nir_op_bit_count:
case nir_op_bitfield_reverse:
return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32;
default:
return 0;
}
}
/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
* transcendentals are an exception. Shifts are also scalarized because of a
* lane size mismatch (8-bit on Bifrost, 32-bit in NIR; TODO: work around
* this). Some conversions need to be scalarized due to type size. */
static uint8_t
bi_vectorize_filter(const nir_instr *instr, const void *data)
{
/* Defaults work for everything else */
if (instr->type != nir_instr_type_alu)
return 0;
const nir_alu_instr *alu = nir_instr_as_alu(instr);
switch (alu->op) {
case nir_op_frcp:
case nir_op_frsq:
case nir_op_ishl:
case nir_op_ishr:
case nir_op_ushr:
case nir_op_f2i16:
case nir_op_f2u16:
case nir_op_extract_u8:
case nir_op_extract_i8:
case nir_op_extract_u16:
case nir_op_extract_i16:
case nir_op_insert_u16:
return 1;
default:
break;
}
/* Vectorized instructions cannot write more than 32 bits */
int dst_bit_size = alu->def.bit_size;
if (dst_bit_size == 16)
return 2;
else
return 1;
}
static bool
bi_scalarize_filter(const nir_instr *instr, const void *data)
{
if (instr->type != nir_instr_type_alu)
return false;
const nir_alu_instr *alu = nir_instr_as_alu(instr);
switch (alu->op) {
case nir_op_pack_uvec2_to_uint:
case nir_op_pack_uvec4_to_uint:
return false;
default:
return true;
}
}
/* Ensure we write exactly 4 components */
static nir_def *
bifrost_nir_valid_channel(nir_builder *b, nir_def *in, unsigned channel,
unsigned first, unsigned mask)
{
if (!(mask & BITFIELD_BIT(channel)))
channel = first;
return nir_channel(b, in, channel);
}
/* Lower fragment store_output instructions to always write 4 components,
* matching the hardware semantic. This may require additional moves. Skipping
* these moves is possible in theory, but invokes undefined behaviour in the
* compiler. The DDK inserts these moves, so we will as well. */
static bool
bifrost_nir_lower_blend_components(struct nir_builder *b,
nir_intrinsic_instr *intr, void *data)
{
if (intr->intrinsic != nir_intrinsic_store_output)
return false;
nir_def *in = intr->src[0].ssa;
unsigned first = nir_intrinsic_component(intr);
unsigned mask = nir_intrinsic_write_mask(intr);
assert(first == 0 && "shouldn't get nonzero components");
/* Nothing to do */
if (mask == BITFIELD_MASK(4))
return false;
b->cursor = nir_before_instr(&intr->instr);
/* Replicate the first valid component instead */
nir_def *replicated =
nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
bifrost_nir_valid_channel(b, in, 1, first, mask),
bifrost_nir_valid_channel(b, in, 2, first, mask),
bifrost_nir_valid_channel(b, in, 3, first, mask));
/* Rewrite to use our replicated version */
nir_src_rewrite(&intr->src[0], replicated);
nir_intrinsic_set_component(intr, 0);
nir_intrinsic_set_write_mask(intr, 0xF);
intr->num_components = 4;
return true;
}
static nir_mem_access_size_align
mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
uint8_t bit_size, uint32_t align_mul,
uint32_t align_offset, bool offset_is_const,
enum gl_access_qualifier access, const void *cb_data)
{
uint32_t align = nir_combined_align(align_mul, align_offset);
assert(util_is_power_of_two_nonzero(align));
/* No more than 16 bytes at a time. */
bytes = MIN2(bytes, 16);
/* If the number of bytes is a multiple of 4, use 32-bit loads. Else if it's
* a multiple of 2, use 16-bit loads. Else use 8-bit loads.
*
* But if we're only aligned to 1 byte, use 8-bit loads. If we're only
* aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads due to
* the size.
*/
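/* For example: 12 bytes at 4-byte alignment become a 3-component 32-bit
* access, 6 bytes at 2-byte alignment become a 3-component 16-bit access,
* and any odd byte count falls back to 8-bit accesses. */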
if ((bytes & 1) || (align == 1))
bit_size = 8;
else if ((bytes & 2) || (align == 2))
bit_size = 16;
else if (bit_size >= 32)
bit_size = 32;
unsigned num_comps = MIN2(bytes / (bit_size / 8), 4);
/* Push constants require 32-bit loads. */
if (intrin == nir_intrinsic_load_push_constant) {
if (align_mul >= 4) {
/* If align_mul is at least 4, we can use align_offset to find
* the exact number of words we need to read.
*/
num_comps = DIV_ROUND_UP((align_offset % 4) + bytes, 4);
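/* e.g. a 6-byte load at align_offset 2 reads DIV_ROUND_UP(2 + 6, 4) = 2
* words. */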
} else {
/* If bytes is aligned on 32-bit, the access might still cross one
* word at the beginning, and one word at the end. If bytes is not
* aligned on 32-bit, the extra two words should cover for both the
* size and offset mis-alignment.
*/
num_comps = (bytes / 4) + 2;
}
bit_size = MAX2(bit_size, 32);
align = 4;
} else {
align = bit_size / 8;
}
return (nir_mem_access_size_align){
.num_components = num_comps,
.bit_size = bit_size,
.align = align,
.shift = nir_mem_access_shift_method_scalar,
};
}
static bool
mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
unsigned num_components, int64_t hole_size,
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
void *data)
{
if (hole_size > 0)
return false;
/* Must be aligned to the size of the load */
unsigned align = nir_combined_align(align_mul, align_offset);
if ((bit_size / 8) > align)
return false;
if (num_components > 4)
return false;
if (bit_size > 32)
return false;
return true;
}
static void
bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
{
NIR_PASS(_, nir, nir_opt_shrink_stores, true);
bool progress;
do {
progress = false;
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_remove_phis);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_dead_cf);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
NIR_PASS(progress, nir, nir_opt_algebraic);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_undef);
NIR_PASS(progress, nir, nir_lower_undef_to_zero);
NIR_PASS(progress, nir, nir_opt_shrink_vectors, false);
NIR_PASS(progress, nir, nir_opt_loop_unroll);
} while (progress);
NIR_PASS(
progress, nir, nir_opt_load_store_vectorize,
&(const nir_load_store_vectorize_options){
.modes = nir_var_mem_global | nir_var_mem_shared | nir_var_shader_temp,
.callback = mem_vectorize_cb,
});
NIR_PASS(progress, nir, nir_lower_pack);
/* nir_lower_pack can generate split operations; run algebraic again to
* handle them */
NIR_PASS(progress, nir, nir_opt_algebraic);
/* TODO: Why is 64-bit getting rematerialized?
* KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
NIR_PASS(progress, nir, nir_lower_int64);
/* We need to clean up after each iteration of late algebraic
* optimizations, since otherwise NIR can produce weird edge cases
* (like fneg of a constant) which we don't handle */
bool late_algebraic = true;
while (late_algebraic) {
late_algebraic = false;
NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_cse);
}
/* This opt currently helps on Bifrost but not Valhall */
if (gpu_id < 0x9000)
NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise);
NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
NIR_PASS(progress, nir, nir_lower_bool_to_bitsize);
/* Prepass to simplify instruction selection */
late_algebraic = false;
NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late);
while (late_algebraic) {
late_algebraic = false;
NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_cse);
}
NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
NIR_PASS(progress, nir, nir_opt_dce);
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
NIR_PASS(_, nir, nir_shader_intrinsics_pass,
bifrost_nir_lower_blend_components, nir_metadata_control_flow,
NULL);
}
/* Backend scheduler is purely local, so do some global optimizations
* to reduce register pressure. */
nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
nir_move_load_input | nir_move_comparisons |
nir_move_copies | nir_move_load_ssbo;
NIR_PASS(_, nir, nir_opt_sink, move_all);
NIR_PASS(_, nir, nir_opt_move, move_all);
/* We might lower attribute, varying, and image indirects. Use the
* gathered info to skip the extra analysis in the happy path. */
bool any_indirects = nir->info.inputs_read_indirectly ||
nir->info.outputs_accessed_indirectly ||
nir->info.patch_inputs_read_indirectly ||
nir->info.patch_outputs_accessed_indirectly ||
nir->info.images_used[0];
if (any_indirects) {
nir_divergence_analysis(nir);
NIR_PASS(_, nir, bi_lower_divergent_indirects,
pan_subgroup_size(pan_arch(gpu_id)));
}
}
static void
bi_opt_post_ra(bi_context *ctx)
{
bi_foreach_instr_global_safe(ctx, ins) {
if (ins->op == BI_OPCODE_MOV_I32 &&
bi_is_equiv(ins->dest[0], ins->src[0]))
bi_remove_instruction(ins);
}
}
/* Dead code elimination for branches at the end of a block: only one branch
* per block is legal semantically, but unreachable jumps can be generated.
* Additionally, on Bifrost we can generate jumps to the terminal block, which
* need to be lowered away to a jump to #0x0; that induces successful
* termination. The trick doesn't work on Valhall, which instead needs a NOP
* inserted in the terminal block.
*/
static void
bi_lower_branch(bi_context *ctx, bi_block *block)
{
bool cull_terminal = (ctx->arch <= 8);
bool branched = false;
bi_foreach_instr_in_block_safe(block, ins) {
if (!ins->branch_target)
continue;
if (branched) {
bi_remove_instruction(ins);
continue;
}
branched = true;
if (!bi_is_terminal_block(ins->branch_target))
continue;
if (cull_terminal)
ins->branch_target = NULL;
else if (ins->branch_target)
ins->branch_target->needs_nop = true;
}
}
static void
bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset)
{
unsigned final_clause = bi_pack(ctx, binary);
/* If we need to wait for ATEST or BLEND in the first clause, pass the
* corresponding bits through to the renderer state descriptor */
bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
unsigned first_deps = first_clause ? first_clause->dependencies : 0;
ctx->info.bifrost->wait_6 = (first_deps & (1 << 6));
ctx->info.bifrost->wait_7 = (first_deps & (1 << 7));
/* Pad the shader with enough zero bytes to trick the prefetcher,
* unless we're compiling an empty shader (in which case we don't pad
* so the size remains 0) */
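/* For example, if the final clause packs to 16 bytes, we append
* BIFROST_SHADER_PREFETCH - 16 = 112 zero bytes so the 128-byte prefetch
* window starting at that clause stays within the binary. */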
unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
if (binary->size - offset) {
memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 0,
prefetch_size);
}
}
/*
* Build a bit mask of varyings (by location) that are flatshaded. This
* information is needed by lower_mediump_io, as we don't yet support 16-bit
* flat varyings.
*
* Also varyings that are used as texture coordinates should be kept at fp32 so
* the texture instruction may be promoted to VAR_TEX. In general this is a good
* idea, as fp16 texture coordinates are not supported by the hardware and are
* usually inappropriate. (There are relevant CTS bugs for both cases.)
*
* TODO: If we compacted the varyings with some fixup code in the vertex shader,
* we could implement 16-bit flat varyings. Consider if this case matters.
*
* TODO: The texture coordinate handling could be less heavyhanded.
*/
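/* For example, a flat-shaded color input and a varying that feeds a
* texture coordinate both get their location bit set here, so the caller's
* nir_lower_mediump_io invocation leaves them at fp32. */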
static bool
bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
{
uint64_t *mask = data;
if (instr->type != nir_instr_type_tex)
return false;
nir_tex_instr *tex = nir_instr_as_tex(instr);
int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
if (coord_idx < 0)
return false;
nir_src src = tex->src[coord_idx].src;
nir_scalar x = nir_scalar_resolved(src.ssa, 0);
nir_scalar y = nir_scalar_resolved(src.ssa, 1);
if (x.def != y.def)
return false;
nir_instr *parent = x.def->parent_instr;
if (parent->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
return false;
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
*mask |= BITFIELD64_BIT(sem.location);
return false;
}
static uint64_t
bi_fp32_varying_mask(nir_shader *nir)
{
uint64_t mask = 0;
assert(nir->info.stage == MESA_SHADER_FRAGMENT);
nir_foreach_shader_in_variable(var, nir) {
if (var->data.interpolation == INTERP_MODE_FLAT)
mask |= BITFIELD64_BIT(var->data.location);
}
nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all,
&mask);
return mask;
}
static bool
bi_lower_sample_mask_writes(nir_builder *b, nir_intrinsic_instr *intr,
void *data)
{
if (intr->intrinsic != nir_intrinsic_store_output)
return false;
assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
if (nir_intrinsic_io_semantics(intr).location != FRAG_RESULT_SAMPLE_MASK)
return false;
b->cursor = nir_before_instr(&intr->instr);
nir_def *orig = nir_load_sample_mask(b);
nir_src_rewrite(&intr->src[0], nir_iand(b, orig, intr->src[0].ssa));
return true;
}
static bool
bi_lower_load_output(nir_builder *b, nir_intrinsic_instr *intr,
UNUSED void *data)
{
if (intr->intrinsic != nir_intrinsic_load_output)
return false;
unsigned loc = nir_intrinsic_io_semantics(intr).location;
assert(loc >= FRAG_RESULT_DATA0);
unsigned rt = loc - FRAG_RESULT_DATA0;
b->cursor = nir_before_instr(&intr->instr);
nir_def *conversion = nir_load_rt_conversion_pan(
b, .base = rt, .src_type = nir_intrinsic_dest_type(intr));
nir_def *lowered = nir_load_converted_output_pan(
b, intr->def.num_components, intr->def.bit_size, conversion,
.dest_type = nir_intrinsic_dest_type(intr),
.io_semantics = nir_intrinsic_io_semantics(intr));
nir_def_rewrite_uses(&intr->def, lowered);
return true;
}
bool
bifrost_nir_lower_load_output(nir_shader *nir)
{
assert(nir->info.stage == MESA_SHADER_FRAGMENT);
return nir_shader_intrinsics_pass(
nir, bi_lower_load_output,
nir_metadata_control_flow, NULL);
}
void
bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
{
/* Lower gl_Position pre-optimisation, but after lowering vars to ssa
* (so we don't accidentally duplicate the epilogue since mesa/st has
* messed with our I/O quite a bit already) */
NIR_PASS(_, nir, nir_lower_vars_to_ssa);
if (nir->info.stage == MESA_SHADER_VERTEX) {
if (pan_arch(gpu_id) <= 7)
NIR_PASS(_, nir, pan_nir_lower_vertex_id);
NIR_PASS(_, nir, nir_lower_viewport_transform);
NIR_PASS(_, nir, nir_lower_point_size, 1.0, 0.0);
nir_variable *psiz = nir_find_variable_with_location(
nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
if (psiz != NULL)
psiz->data.precision = GLSL_PRECISION_MEDIUM;
}
/* Get rid of any global vars before we lower to scratch. */
NIR_PASS(_, nir, nir_lower_global_vars_to_local);
/* Valhall introduces packed thread local storage, which improves cache
* locality of TLS access. However, access to packed TLS cannot
* straddle 16-byte boundaries. As such, when packed TLS is in use
* (currently unconditional for Valhall), we force vec4 alignment for
* scratch access.
*/
glsl_type_size_align_func vars_to_scratch_size_align_func =
(gpu_id >= 0x9000) ? glsl_get_vec4_size_align_bytes
: glsl_get_natural_size_align_bytes;
/* Lower large arrays to scratch and small arrays to bcsel */
NIR_PASS(_, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
vars_to_scratch_size_align_func, vars_to_scratch_size_align_func);
NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
NIR_PASS(_, nir, nir_split_var_copies);
NIR_PASS(_, nir, nir_lower_var_copies);
NIR_PASS(_, nir, nir_lower_vars_to_ssa);
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
glsl_type_size, nir_lower_io_use_interpolated_input_intrinsics);
if (nir->info.stage == MESA_SHADER_VERTEX)
NIR_PASS_V(nir, pan_nir_lower_noperspective_vs);
if (nir->info.stage == MESA_SHADER_FRAGMENT)
NIR_PASS_V(nir, pan_nir_lower_noperspective_fs);
/* nir_lower[_explicit]_io is lazy and emits mul+add chains even for
* offsets it could figure out are constant. Do some constant folding
* before bifrost_nir_lower_store_component below.
*/
NIR_PASS(_, nir, nir_opt_constant_folding);
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
NIR_PASS(_, nir, nir_lower_mediump_io,
nir_var_shader_in | nir_var_shader_out,
~bi_fp32_varying_mask(nir), false);
NIR_PASS(_, nir, nir_shader_intrinsics_pass, bi_lower_sample_mask_writes,
nir_metadata_control_flow, NULL);
NIR_PASS(_, nir, bifrost_nir_lower_load_output);
} else if (nir->info.stage == MESA_SHADER_VERTEX) {
if (gpu_id >= 0x9000) {
NIR_PASS(_, nir, nir_lower_mediump_io, nir_var_shader_out,
BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
}
NIR_PASS(_, nir, pan_nir_lower_store_component);
}
nir_lower_mem_access_bit_sizes_options mem_size_options = {
.modes = nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_ssbo |
nir_var_mem_constant | nir_var_mem_task_payload |
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_global | nir_var_mem_shared,
.callback = mem_access_size_align_cb,
};
NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &mem_size_options);
nir_lower_ssbo_options ssbo_opts = {
.native_loads = pan_arch(gpu_id) >= 9,
.native_offset = pan_arch(gpu_id) >= 9,
};
NIR_PASS(_, nir, nir_lower_ssbo, &ssbo_opts);
NIR_PASS(_, nir, pan_lower_sample_pos);
NIR_PASS(_, nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
NIR_PASS(_, nir, nir_lower_64bit_phis);
NIR_PASS(_, nir, pan_lower_helper_invocation);
NIR_PASS(_, nir, nir_lower_int64);
NIR_PASS(_, nir, nir_opt_idiv_const, 8);
NIR_PASS(_, nir, nir_lower_idiv,
&(nir_lower_idiv_options){.allow_fp16 = true});
NIR_PASS(_, nir, nir_lower_tex,
&(nir_lower_tex_options){
.lower_txs_lod = true,
.lower_txp = ~0,
.lower_tg4_broadcom_swizzle = true,
.lower_txd_cube_map = true,
.lower_invalid_implicit_lod = true,
.lower_index_to_offset = true,
});
NIR_PASS(_, nir, nir_lower_image_atomics_to_global);
/* On Bifrost, lower MSAA load/stores to 3D load/stores */
if (pan_arch(gpu_id) < 9)
NIR_PASS(_, nir, pan_nir_lower_image_ms);
NIR_PASS(_, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
NIR_PASS(_, nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */);
NIR_PASS(_, nir, nir_lower_var_copies);
NIR_PASS(_, nir, nir_lower_alu);
NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord);
NIR_PASS(_, nir, pan_nir_lower_frag_coord_zw);
}
static bi_context *
bi_compile_variant_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs,
struct util_dynarray *binary, struct bi_shader_info info,
enum bi_idvs_mode idvs)
{
bi_context *ctx = rzalloc(NULL, bi_context);
/* There may be another program in the dynarray; start at the end */
unsigned offset = binary->size;
ctx->inputs = inputs;
ctx->nir = nir;
ctx->stage = nir->info.stage;
ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
ctx->arch = pan_arch(inputs->gpu_id);
ctx->info = info;
ctx->idvs = idvs;
ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
if (idvs != BI_IDVS_NONE) {
/* Specializing shaders for IDVS is destructive, so we need to
* clone. However, the last (second) IDVS shader does not need
* to be preserved so we can skip cloning that one.
*/
if (offset == 0)
ctx->nir = nir = nir_shader_clone(ctx, nir);
NIR_PASS(_, nir, nir_shader_instructions_pass,
bifrost_nir_specialize_idvs, nir_metadata_control_flow, &idvs);
/* After specializing, clean up the mess */
bool progress = true;
while (progress) {
progress = false;
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_dead_cf);
}
}
/* If nothing is pushed, all UBOs need to be uploaded */
ctx->ubo_mask = ~0;
list_inithead(&ctx->blocks);
bool skip_internal = nir->info.internal;
skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
nir_print_shader(nir, stdout);
}
ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
nir_foreach_function_impl(impl, nir) {
nir_index_blocks(impl);
ctx->indexed_nir_blocks =
rzalloc_array(ctx, bi_block *, impl->num_blocks);
ctx->ssa_alloc += impl->ssa_alloc;
emit_cf_list(ctx, &impl->body);
bi_emit_phis_deferred(ctx);
break; /* TODO: Multi-function shaders */
}
/* Index blocks now that we're done emitting */
bi_foreach_block(ctx, block) {
block->index = ctx->num_blocks++;
}
bi_validate(ctx, "NIR -> BIR");
/* If the shader doesn't write any colour or depth outputs, it may
* still need an ATEST at the very end! */
bool need_dummy_atest = (ctx->stage == MESA_SHADER_FRAGMENT) &&
!ctx->emitted_atest && !bi_skip_atest(ctx, false);
if (need_dummy_atest) {
bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
bi_builder b = bi_init_builder(ctx, bi_after_block(end));
bi_emit_atest(&b, bi_zero());
}
bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);
/* Runs before constant folding */
bi_lower_swizzle(ctx);
bi_validate(ctx, "Early lowering");
/* Runs before copy prop */
if (optimize && !ctx->inputs->no_ubo_to_push) {
bi_opt_push_ubo(ctx);
}
if (likely(optimize)) {
bi_opt_copy_prop(ctx);
while (bi_opt_constant_fold(ctx))
bi_opt_copy_prop(ctx);
bi_opt_mod_prop_forward(ctx);
bi_opt_mod_prop_backward(ctx);
/* Push LD_VAR_IMM/VAR_TEX instructions. Must run after
* mod_prop_backward to fuse VAR_TEX */
if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT &&
!(bifrost_debug & BIFROST_DBG_NOPRELOAD)) {
bi_opt_dce(ctx, false);
bi_opt_message_preload(ctx);
bi_opt_copy_prop(ctx);
}
bi_opt_dce(ctx, false);
bi_opt_cse(ctx);
bi_opt_dce(ctx, false);
if (!ctx->inputs->no_ubo_to_push)
bi_opt_reorder_push(ctx);
bi_validate(ctx, "Optimization passes");
}
bi_lower_opt_instructions(ctx);
if (ctx->arch >= 9) {
va_optimize(ctx);
va_lower_isel(ctx);
bi_foreach_instr_global_safe(ctx, I) {
/* Phis become single moves so shouldn't be affected */
if (I->op == BI_OPCODE_PHI)
continue;
va_lower_constants(ctx, I);
bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
va_repair_fau(&b, I);
}
/* We need to clean up after constant lowering */
if (likely(optimize)) {
bi_opt_cse(ctx);
bi_opt_dce(ctx, false);
}
bi_validate(ctx, "Valhall passes");
}
bi_foreach_block(ctx, block) {
bi_lower_branch(ctx, block);
}
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
bi_print_shader(ctx, stdout);
/* Analyze before register allocation to avoid false dependencies. The
* skip bit is a function of only the data flow graph and is invariant
* under valid scheduling. Helpers are only defined for fragment
* shaders, so this analysis is only required in fragment shaders.
*/
if (ctx->stage == MESA_SHADER_FRAGMENT) {
bi_opt_dce(ctx, false);
bi_analyze_helper_requirements(ctx);
}
/* Fuse TEXC after analyzing helper requirements so the analysis
* doesn't have to know about dual textures */
if (likely(optimize)) {
bi_opt_fuse_dual_texture(ctx);
}
/* Lower FAU after fusing dual texture, because fusing dual texture
* creates new immediates that themselves may need lowering.
*/
if (ctx->arch <= 8) {
bi_lower_fau(ctx);
}
/* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */
if (likely(optimize)) {
bi_opt_cse(ctx);
bi_opt_dce(ctx, false);
}
bi_validate(ctx, "Late lowering");
if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) {
bi_pressure_schedule(ctx);
bi_validate(ctx, "Pre-RA scheduling");
}
bi_register_allocate(ctx);
if (likely(optimize))
bi_opt_post_ra(ctx);
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
bi_print_shader(ctx, stdout);
if (ctx->arch >= 9) {
va_assign_slots(ctx);
va_insert_flow_control_nops(ctx);
va_merge_flow(ctx);
va_mark_last(ctx);
} else {
bi_schedule(ctx);
bi_assign_scoreboard(ctx);
/* Analyze after scheduling since we depend on instruction
* order. Valhall does this analysis as part of
* va_insert_flow_control_nops, since the handling differs between
* clauses and individual instructions.
*/
bi_analyze_helper_terminate(ctx);
bi_mark_clauses_td(ctx);
}
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
bi_print_shader(ctx, stdout);
if (ctx->arch <= 8) {
bi_pack_clauses(ctx, binary, offset);
} else {
bi_pack_valhall(ctx, binary);
}
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
if (ctx->arch <= 8) {
disassemble_bifrost(stdout, binary->data + offset,
binary->size - offset,
bifrost_debug & BIFROST_DBG_VERBOSE);
} else {
disassemble_valhall(stdout, binary->data + offset,
binary->size - offset,
bifrost_debug & BIFROST_DBG_VERBOSE);
}
fflush(stdout);
}
if (!skip_internal &&
((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) {
char *shaderdb;
if (ctx->arch >= 9) {
shaderdb = va_print_stats(ctx, binary->size - offset);
} else {
shaderdb = bi_print_stats(ctx, binary->size - offset);
}
if (bifrost_debug & BIFROST_DBG_SHADERDB)
fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
if (inputs->debug)
util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb);
ralloc_free(shaderdb);
}
return ctx;
}
static void
bi_compile_variant(nir_shader *nir,
const struct panfrost_compile_inputs *inputs,
struct util_dynarray *binary, struct pan_shader_info *info,
enum bi_idvs_mode idvs)
{
struct bi_shader_info local_info = {
.push = &info->push,
.bifrost = &info->bifrost,
.tls_size = info->tls_size,
.push_offset = info->push.count,
};
unsigned offset = binary->size;
/* If there is no position shader (gl_Position is not written), then
* there is no need to build a varying shader either. This case is hit
* for transform-feedback-only vertex shaders, which only make sense with
* rasterizer discard.
*/
if ((offset == 0) && (idvs == BI_IDVS_VARYING))
return;
/* Software invariant: Only a secondary shader can appear at a nonzero
* offset, to keep the ABI simple. */
assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
bi_context *ctx =
bi_compile_variant_nir(nir, inputs, binary, local_info, idvs);
/* A register is preloaded <==> it is live before the first block */
bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
uint64_t preload = first_block->reg_live_in;
/* If multisampling is used with a blend shader, the blend shader needs
* to access the sample coverage mask in r60 and the sample ID in r61.
* Blend shaders run in the same context as fragment shaders, so if a
* blend shader could run, we need to preload these registers
* conservatively. There is believed to be little cost to doing so, so we
* always do it to avoid needing variants of the preload descriptor.
*
* We only do this on Valhall, as Bifrost has to update the RSD for
* multisampling w/ blend shader anyway, so this is handled in the
* driver. We could unify the paths if the cost is acceptable.
*/
if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
info->ubo_mask |= ctx->ubo_mask;
info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);
if (idvs == BI_IDVS_VARYING) {
info->vs.secondary_enable = (binary->size > offset);
info->vs.secondary_offset = offset;
info->vs.secondary_preload = preload;
info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
} else {
info->preload = preload;
info->work_reg_count = ctx->info.work_reg_count;
}
if (idvs == BI_IDVS_POSITION && !nir->info.internal &&
nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) {
/* Find the psiz write */
bi_instr *write = NULL;
bi_foreach_instr_global(ctx, I) {
if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) {
write = I;
break;
}
}
assert(write != NULL);
/* NOP it out, preserving its flow control. TODO: maybe DCE */
if (write->flow) {
bi_builder b = bi_init_builder(ctx, bi_before_instr(write));
bi_instr *nop = bi_nop(&b);
nop->flow = write->flow;
}
bi_remove_instruction(write);
info->vs.no_psiz_offset = binary->size;
bi_pack_valhall(ctx, binary);
}
ralloc_free(ctx);
}
/* Decide if Index-Driven Vertex Shading should be used for a given shader */
static bool
bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs)
{
/* Opt-out */
if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS)
return false;
/* IDVS splits up vertex shaders; it is not defined for other shader stages */
if (nir->info.stage != MESA_SHADER_VERTEX)
return false;
/* Bifrost cannot write gl_PointSize during IDVS */
if ((inputs->gpu_id < 0x9000) &&
nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ))
return false;
/* Otherwise, IDVS is usually better */
return true;
}
void
bifrost_compile_shader_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs,
struct util_dynarray *binary,
struct pan_shader_info *info)
{
bifrost_debug = debug_get_option_bifrost_debug();
/* Combine stores late, to give the driver a chance to lower dual-source
* blending as regular store_output intrinsics.
*/
NIR_PASS(_, nir, pan_nir_lower_zs_store);
bi_optimize_nir(nir, inputs->gpu_id, inputs->is_blend);
info->tls_size = nir->scratch_size;
info->vs.idvs = bi_should_idvs(nir, inputs);
pan_nir_collect_varyings(nir, info);
if (info->vs.idvs) {
bi_compile_variant(nir, inputs, binary, info, BI_IDVS_POSITION);
bi_compile_variant(nir, inputs, binary, info, BI_IDVS_VARYING);
} else {
bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE);
}
if (gl_shader_stage_is_compute(nir->info.stage)) {
/* Workgroups may be merged if the structure of the workgroup is
* not software visible. This is true if neither shared memory
* nor barriers are used. The hardware may be able to optimize
* compute shaders that set this flag.
*/
info->cs.allow_merging_workgroups = (nir->info.shared_size == 0) &&
!nir->info.uses_control_barrier &&
!nir->info.uses_memory_barrier;
}
info->ubo_mask &= (1 << nir->info.num_ubos) - 1;
}