/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_kernel.h"
#include "brw_nir.h"
#include "elk/elk_nir_options.h"
#include "intel_nir.h"
#include "intel_nir.h"
#include "nir_clc_helpers.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/spirv/nir_spirv.h"
#include "compiler/spirv/spirv_info.h"
#include "dev/intel_debug.h"
#include "util/u_atomic.h"
#include "util/u_dynarray.h"
static const nir_shader *
load_clc_shader(struct brw_compiler *compiler, struct disk_cache *disk_cache,
const nir_shader_compiler_options *nir_options,
const struct spirv_to_nir_options *spirv_options)
{
if (compiler->clc_shader)
return compiler->clc_shader;
nir_shader *nir = nir_load_libclc_shader(64, disk_cache,
spirv_options, nir_options,
disk_cache != NULL);
if (nir == NULL)
return NULL;
const nir_shader *old_nir =
p_atomic_cmpxchg(&compiler->clc_shader, NULL, nir);
if (old_nir == NULL) {
/* We won the race */
ralloc_steal(compiler, nir);
return nir;
} else {
/* Someone else built the shader first */
ralloc_free(nir);
return old_nir;
}
}
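/* Create an empty nir_function_impl for the function and return a builder
 * positioned at the top of it.
 */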
static nir_builder
builder_init_new_impl(nir_function *func)
{
nir_function_impl *impl = nir_function_impl_create(func);
return nir_builder_at(nir_before_impl(impl));
}
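/* Emit a body for an OpenCL atomic builtin.  The first parameter is a
 * pointer to the return slot, the next is the memory operand (cast to a
 * deref in the requested mode) and any remaining parameters feed the deref
 * atomic intrinsic directly.
 */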
static void
implement_atomic_builtin(nir_function *func, nir_atomic_op atomic_op,
enum glsl_base_type data_base_type,
nir_variable_mode mode)
{
nir_builder b = builder_init_new_impl(func);
const struct glsl_type *data_type = glsl_scalar_type(data_base_type);
unsigned p = 0;
nir_deref_instr *ret = NULL;
ret = nir_build_deref_cast(&b, nir_load_param(&b, p++),
nir_var_function_temp, data_type, 0);
nir_intrinsic_op op = nir_intrinsic_deref_atomic;
nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b.shader, op);
nir_intrinsic_set_atomic_op(atomic, atomic_op);
for (unsigned i = 0; i < nir_intrinsic_infos[op].num_srcs; i++) {
nir_def *src = nir_load_param(&b, p++);
if (i == 0) {
/* The first source is our deref */
assert(nir_intrinsic_infos[op].src_components[i] == -1);
src = &nir_build_deref_cast(&b, src, mode, data_type, 0)->def;
}
atomic->src[i] = nir_src_for_ssa(src);
}
nir_def_init_for_type(&atomic->instr, &atomic->def, data_type);
nir_builder_instr_insert(&b, &atomic->instr);
nir_store_deref(&b, ret, &atomic->def, ~0);
}
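/* Emit a body for intel_sub_group_ballot(): run a ballot intrinsic on the
 * condition parameter and store the result into the return slot.
 */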
static void
implement_sub_group_ballot_builtin(nir_function *func)
{
nir_builder b = builder_init_new_impl(func);
nir_deref_instr *ret =
nir_build_deref_cast(&b, nir_load_param(&b, 0),
nir_var_function_temp, glsl_uint_type(), 0);
nir_def *cond = nir_load_param(&b, 1);
nir_intrinsic_instr *ballot =
nir_intrinsic_instr_create(b.shader, nir_intrinsic_ballot);
ballot->src[0] = nir_src_for_ssa(cond);
ballot->num_components = 1;
nir_def_init(&ballot->instr, &ballot->def, 1, 32);
nir_builder_instr_insert(&b, &ballot->instr);
nir_store_deref(&b, ret, &ballot->def, ~0);
}
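/* Provide NIR implementations, in terms of native intrinsics, for a handful
 * of OpenCL builtins identified by their mangled names.
 */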
static bool
implement_intel_builtins(nir_shader *nir)
{
bool progress = false;
nir_foreach_function(func, nir) {
if (strcmp(func->name, "_Z10atomic_minPU3AS1Vff") == 0) {
/* float atom_min(__global float volatile *p, float val) */
implement_atomic_builtin(func, nir_atomic_op_fmin,
GLSL_TYPE_FLOAT, nir_var_mem_global);
progress = true;
} else if (strcmp(func->name, "_Z10atomic_maxPU3AS1Vff") == 0) {
/* float atom_max(__global float volatile *p, float val) */
implement_atomic_builtin(func, nir_atomic_op_fmax,
GLSL_TYPE_FLOAT, nir_var_mem_global);
progress = true;
} else if (strcmp(func->name, "_Z10atomic_minPU3AS3Vff") == 0) {
/* float atomic_min(__shared float volatile *, float) */
implement_atomic_builtin(func, nir_atomic_op_fmin,
GLSL_TYPE_FLOAT, nir_var_mem_shared);
progress = true;
} else if (strcmp(func->name, "_Z10atomic_maxPU3AS3Vff") == 0) {
/* float atomic_max(__shared float volatile *, float) */
implement_atomic_builtin(func, nir_atomic_op_fmax,
GLSL_TYPE_FLOAT, nir_var_mem_shared);
progress = true;
} else if (strcmp(func->name, "intel_sub_group_ballot") == 0) {
implement_sub_group_ballot_builtin(func);
progress = true;
}
}
nir_shader_preserve_all_metadata(nir);
return progress;
}
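/* Lower kernel-specific intrinsics to uniform (push constant) loads: kernel
 * arguments live after the brw_kernel_sysvals block, the constant base
 * pointer is rebuilt from relocation constants and the workgroup count is
 * read out of the sysvals.
 */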
static bool
lower_kernel_intrinsics(nir_shader *nir)
{
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
bool progress = false;
unsigned kernel_sysvals_start = 0;
unsigned kernel_arg_start = sizeof(struct brw_kernel_sysvals);
nir->num_uniforms += kernel_arg_start;
nir_builder b = nir_builder_create(impl);
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_kernel_input: {
b.cursor = nir_instr_remove(&intrin->instr);
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
load->num_components = intrin->num_components;
load->src[0] = nir_src_for_ssa(nir_u2u32(&b, intrin->src[0].ssa));
nir_intrinsic_set_base(load, kernel_arg_start);
nir_intrinsic_set_range(load, nir->num_uniforms);
nir_def_init(&load->instr, &load->def,
intrin->def.num_components,
intrin->def.bit_size);
nir_builder_instr_insert(&b, &load->instr);
nir_def_rewrite_uses(&intrin->def, &load->def);
progress = true;
break;
}
case nir_intrinsic_load_constant_base_ptr: {
b.cursor = nir_instr_remove(&intrin->instr);
nir_def *const_data_base_addr = nir_pack_64_2x32_split(&b,
nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW),
nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH));
nir_def_rewrite_uses(&intrin->def, const_data_base_addr);
progress = true;
break;
}
case nir_intrinsic_load_num_workgroups: {
b.cursor = nir_instr_remove(&intrin->instr);
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
load->num_components = 3;
load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
nir_intrinsic_set_base(load, kernel_sysvals_start +
offsetof(struct brw_kernel_sysvals, num_work_groups));
nir_intrinsic_set_range(load, 3 * 4);
nir_def_init(&load->instr, &load->def, 3, 32);
nir_builder_instr_insert(&b, &load->instr);
nir_def_rewrite_uses(&intrin->def, &load->def);
progress = true;
break;
}
default:
break;
}
}
}
if (progress) {
nir_metadata_preserve(impl, nir_metadata_control_flow);
} else {
nir_metadata_preserve(impl, nir_metadata_all);
}
return progress;
}
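/* SPIR-V capabilities we accept when translating OpenCL kernels. */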
static const struct spirv_capabilities spirv_caps = {
.Addresses = true,
.Float16 = true,
.Float64 = true,
.Groups = true,
.StorageImageWriteWithoutFormat = true,
.Int8 = true,
.Int16 = true,
.Int64 = true,
.Int64Atomics = true,
.Kernel = true,
.Linkage = true, /* We receive a linked kernel from clc */
.DenormFlushToZero = true,
.DenormPreserve = true,
.SignedZeroInfNanPreserve = true,
.RoundingModeRTE = true,
.RoundingModeRTZ = true,
.GenericPointer = true,
.GroupNonUniform = true,
.GroupNonUniformArithmetic = true,
.GroupNonUniformClustered = true,
.GroupNonUniformBallot = true,
.GroupNonUniformQuad = true,
.GroupNonUniformShuffle = true,
.GroupNonUniformVote = true,
.SubgroupDispatch = true,
.SubgroupShuffleINTEL = true,
.SubgroupBufferBlockIOINTEL = true,
};
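/* Compile an OpenCL kernel from SPIR-V: translate with spirv_to_nir, link
 * against libclc, inline and lower everything down to explicit I/O, record
 * the kernel argument layout in *kernel and finally hand the shader to
 * brw_compile_cs.
 */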
bool
brw_kernel_from_spirv(struct brw_compiler *compiler,
struct disk_cache *disk_cache,
struct brw_kernel *kernel,
void *log_data, void *mem_ctx,
const uint32_t *spirv, size_t spirv_size,
const char *entrypoint_name,
char **error_str)
{
const struct intel_device_info *devinfo = compiler->devinfo;
const nir_shader_compiler_options *nir_options =
compiler->nir_options[MESA_SHADER_KERNEL];
struct spirv_to_nir_options spirv_options = {
.environment = NIR_SPIRV_OPENCL,
.capabilities = &spirv_caps,
.printf = true,
.shared_addr_format = nir_address_format_62bit_generic,
.global_addr_format = nir_address_format_62bit_generic,
.temp_addr_format = nir_address_format_62bit_generic,
.constant_addr_format = nir_address_format_64bit_global,
};
spirv_options.clc_shader = load_clc_shader(compiler, disk_cache,
nir_options, &spirv_options);
if (spirv_options.clc_shader == NULL) {
fprintf(stderr, "ERROR: libclc shader missing."
" Consider installing the libclc package\n");
abort();
}
assert(spirv_size % 4 == 0);
nir_shader *nir =
spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
entrypoint_name, &spirv_options, nir_options);
nir_validate_shader(nir, "after spirv_to_nir");
nir_validate_ssa_dominance(nir, "after spirv_to_nir");
ralloc_steal(mem_ctx, nir);
nir->info.name = ralloc_strdup(nir, entrypoint_name);
if (INTEL_DEBUG(DEBUG_CS)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
fprintf(stderr, "NIR (from SPIR-V) for kernel\n");
nir_print_shader(nir, stderr);
}
nir_lower_printf_options printf_opts = {
.ptr_bit_size = 64,
.max_buffer_size = 1024 * 1024,
.use_printf_base_identifier = true,
};
NIR_PASS_V(nir, nir_lower_printf, &printf_opts);
NIR_PASS_V(nir, implement_intel_builtins);
NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader);
/* We have to lower away local constant initializers right before we
* inline functions. That way they get properly initialized at the top
* of the function and not at the top of its caller.
*/
NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
NIR_PASS_V(nir, nir_lower_returns);
NIR_PASS_V(nir, nir_inline_functions);
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_deref);
/* Pick off the single entrypoint that we want */
nir_remove_non_entrypoints(nir);
/* Now that we've deleted all but the main function, we can go ahead and
* lower the rest of the constant initializers. We do this here so that
* nir_remove_dead_variables and split_per_member_structs below see the
* corresponding stores.
*/
NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
* LLVM loves to take advantage of the fact that vec3s in OpenCL are 16B
* aligned and so it can just read/write them as vec4s. This results in a
* LOT of vec4->vec3 casts on loads and stores. One solution to this
* problem is to get rid of all vec3 variables.
*/
NIR_PASS_V(nir, nir_lower_vec3_to_vec4,
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global |
nir_var_mem_constant);
/* We assign explicit types early so that the optimizer can take advantage
* of that information and hopefully get rid of some of our memcpys.
*/
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_uniform |
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global,
glsl_get_cl_type_size_align);
struct brw_nir_compiler_opts opts = {};
brw_preprocess_nir(compiler, nir, &opts);
int max_arg_idx = -1;
nir_foreach_uniform_variable(var, nir) {
assert(var->data.location < 256);
max_arg_idx = MAX2(max_arg_idx, var->data.location);
}
kernel->args_size = nir->num_uniforms;
kernel->arg_count = max_arg_idx + 1;
/* No bindings */
struct brw_kernel_arg_desc *args =
rzalloc_array(mem_ctx, struct brw_kernel_arg_desc, kernel->arg_count);
kernel->args = args;
nir_foreach_uniform_variable(var, nir) {
struct brw_kernel_arg_desc arg_desc = {
.offset = var->data.driver_location,
.size = glsl_get_explicit_size(var->type, false),
};
assert(arg_desc.offset + arg_desc.size <= nir->num_uniforms);
assert(var->data.location >= 0);
args[var->data.location] = arg_desc;
}
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_all, NULL);
/* Lower again, this time after dead-variables to get more compact variable
* layouts.
*/
nir->global_mem_size = 0;
nir->scratch_size = 0;
nir->info.shared_size = 0;
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant,
glsl_get_cl_type_size_align);
if (nir->constant_data_size > 0) {
assert(nir->constant_data == NULL);
nir->constant_data = rzalloc_size(nir, nir->constant_data_size);
nir_gather_explicit_io_initializers(nir, nir->constant_data,
nir->constant_data_size,
nir_var_mem_constant);
}
if (INTEL_DEBUG(DEBUG_CS)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
fprintf(stderr, "NIR (before I/O lowering) for kernel\n");
nir_print_shader(nir, stderr);
}
NIR_PASS_V(nir, nir_lower_memcpy);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant,
nir_address_format_64bit_global);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform,
nir_address_format_32bit_offset_as_64bit);
NIR_PASS_V(nir, nir_lower_explicit_io,
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global,
nir_address_format_62bit_generic);
NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL);
NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL);
NIR_PASS_V(nir, lower_kernel_intrinsics);
struct brw_cs_prog_key key = { };
memset(&kernel->prog_data, 0, sizeof(kernel->prog_data));
kernel->prog_data.base.nr_params = DIV_ROUND_UP(nir->num_uniforms, 4);
struct brw_compile_cs_params params = {
.base = {
.nir = nir,
.stats = kernel->stats,
.log_data = log_data,
.mem_ctx = mem_ctx,
},
.key = &key,
.prog_data = &kernel->prog_data,
};
kernel->code = brw_compile_cs(compiler, &params);
if (error_str)
*error_str = params.base.error_str;
return kernel->code != NULL;
}
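/* Find the constant-offset scratch store that fully covers the given loaded
 * value and return the SSA value it stored.
 */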
static nir_def *
rebuild_value_from_store(struct util_dynarray *stores,
nir_def *value, unsigned read_offset)
{
unsigned read_size = value->num_components * value->bit_size / 8;
util_dynarray_foreach(stores, nir_intrinsic_instr *, _store) {
nir_intrinsic_instr *store = *_store;
unsigned write_offset = nir_src_as_uint(store->src[1]);
unsigned write_size = nir_src_num_components(store->src[0]) *
nir_src_bit_size(store->src[0]) / 8;
if (write_offset <= read_offset &&
(write_offset + write_size) >= (read_offset + read_size)) {
assert(nir_block_dominates(store->instr.block, value->parent_instr->block));
assert(write_size == read_size);
return store->src[0].ssa;
}
}
unreachable("Matching scratch store not found");
}
/**
* Remove temporary variables that are stored to scratch only to be reloaded
* immediately, remapping each load to the stored SSA value.
*
* This workaround is only meant to be applied to shaders in src/intel/shaders
* where we know there should be no issue. More complex cases might not work
* with this approach.
*/
static bool
nir_remove_llvm17_scratch(nir_shader *nir)
{
struct util_dynarray scratch_stores;
void *mem_ctx = ralloc_context(NULL);
util_dynarray_init(&scratch_stores, mem_ctx);
nir_foreach_function_impl(func, nir) {
nir_foreach_block(block, func) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_store_scratch)
continue;
nir_const_value *offset = nir_src_as_const_value(intrin->src[1]);
if (offset != NULL) {
util_dynarray_append(&scratch_stores, nir_intrinsic_instr *, intrin);
}
}
}
}
bool progress = false;
if (util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) > 0) {
nir_foreach_function_impl(func, nir) {
nir_foreach_block(block, func) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_scratch)
continue;
nir_const_value *offset = nir_src_as_const_value(intrin->src[0]);
if (offset == NULL)
continue;
nir_def_replace(&intrin->def,
rebuild_value_from_store(&scratch_stores, &intrin->def, nir_src_as_uint(intrin->src[0])));
progress = true;
}
}
}
}
util_dynarray_foreach(&scratch_stores, nir_intrinsic_instr *, _store) {
nir_intrinsic_instr *store = *_store;
nir_instr_remove(&store->instr);
}
/* Quick sanity check */
assert(util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) == 0 ||
progress);
ralloc_free(mem_ctx);
return progress;
}
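/* Optimize, strip the redundant scratch traffic introduced by LLVM 17, then
 * optimize again to clean up after the removal.
 */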
static void
cleanup_llvm17_scratch(nir_shader *nir)
{
{
bool progress;
do {
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_opt_algebraic);
} while (progress);
}
nir_remove_llvm17_scratch(nir);
{
bool progress;
do {
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_opt_algebraic);
} while (progress);
}
}
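/* Translate a SPIR-V module into a NIR library (create_library = true) and
 * run the shared lowering/optimization pipeline without compiling to native
 * code.  gfx_version selects between the brw and elk NIR options; llvm17_wa
 * enables the scratch cleanup workaround above.
 */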
nir_shader *
brw_nir_from_spirv(void *mem_ctx, unsigned gfx_version, const uint32_t *spirv,
size_t spirv_size, bool llvm17_wa)
{
struct spirv_to_nir_options spirv_options = {
.environment = NIR_SPIRV_OPENCL,
.capabilities = &spirv_caps,
.printf = true,
.shared_addr_format = nir_address_format_62bit_generic,
.global_addr_format = nir_address_format_62bit_generic,
.temp_addr_format = nir_address_format_62bit_generic,
.constant_addr_format = nir_address_format_64bit_global,
.create_library = true,
};
assert(spirv_size % 4 == 0);
assert(gfx_version);
const nir_shader_compiler_options *nir_options =
gfx_version >= 9 ? &brw_scalar_nir_options
: &elk_scalar_nir_options;
nir_shader *nir =
spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
"library", &spirv_options, nir_options);
nir_validate_shader(nir, "after spirv_to_nir");
nir_validate_ssa_dominance(nir, "after spirv_to_nir");
ralloc_steal(mem_ctx, nir);
nir->info.name = ralloc_strdup(nir, "library");
if (INTEL_DEBUG(DEBUG_CS)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
fprintf(stderr, "NIR (from SPIR-V) for kernel\n");
nir_print_shader(nir, stderr);
}
nir_lower_printf_options printf_opts = {
.ptr_bit_size = 64,
.use_printf_base_identifier = true,
};
NIR_PASS_V(nir, nir_lower_printf, &printf_opts);
NIR_PASS_V(nir, implement_intel_builtins);
NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader);
/* We have to lower away local constant initializers right before we
* inline functions. That way they get properly initialized at the top
* of the function and not at the top of its caller.
*/
NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_shader_temp |
nir_var_function_temp));
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo |
nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL);
{
bool progress;
do
{
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
NIR_PASS(progress, nir, nir_opt_deref);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_undef);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_opt_algebraic);
} while (progress);
}
NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
NIR_PASS_V(nir, nir_lower_returns);
NIR_PASS_V(nir, nir_inline_functions);
assert(nir->scratch_size == 0);
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_cl_type_size_align);
{
bool progress;
do
{
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
NIR_PASS(progress, nir, nir_opt_deref);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_undef);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_split_var_copies);
NIR_PASS(progress, nir, nir_lower_var_copies);
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_opt_algebraic);
NIR_PASS(progress, nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
NIR_PASS(progress, nir, nir_opt_dead_cf);
NIR_PASS(progress, nir, nir_opt_remove_phis);
NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform);
NIR_PASS(progress, nir, nir_opt_memcpy);
} while (progress);
}
NIR_PASS_V(nir, nir_scale_fdiv);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo |
nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL);
nir->scratch_size = 0;
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp |
nir_var_mem_global | nir_var_mem_constant,
glsl_get_cl_type_size_align);
// Lower memcpy - needs to wait until types are sized
{
bool progress;
do {
progress = false;
NIR_PASS(progress, nir, nir_opt_memcpy);
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
NIR_PASS(progress, nir, nir_opt_deref);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_split_var_copies);
NIR_PASS(progress, nir, nir_lower_var_copies);
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
} while (progress);
}
NIR_PASS_V(nir, nir_lower_memcpy);
NIR_PASS_V(nir, nir_lower_explicit_io,
nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | nir_var_uniform,
nir_address_format_32bit_offset_as_64bit);
NIR_PASS_V(nir, nir_lower_system_values);
/* Hopefully we can drop this once lower_vars_to_ssa has improved to not
* lower everything to scratch.
*/
if (llvm17_wa)
cleanup_llvm17_scratch(nir);
/* Lower again, this time after dead-variables to get more compact variable
* layouts.
*/
nir->global_mem_size = 0;
nir->scratch_size = 0;
nir->info.shared_size = 0;
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant,
glsl_get_cl_type_size_align);
if (nir->constant_data_size > 0) {
assert(nir->constant_data == NULL);
nir->constant_data = rzalloc_size(nir, nir->constant_data_size);
nir_gather_explicit_io_initializers(nir, nir->constant_data,
nir->constant_data_size,
nir_var_mem_constant);
}
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant,
nir_address_format_64bit_global);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform,
nir_address_format_32bit_offset_as_64bit);
NIR_PASS_V(nir, nir_lower_explicit_io,
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global,
nir_address_format_62bit_generic);
if (INTEL_DEBUG(DEBUG_CS)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
fprintf(stderr, "NIR (before I/O lowering) for kernel\n");
nir_print_shader(nir, stderr);
}
return nir;
}