/* src/mesa/program/prog_to_nir.c - platform/external/mesa3d - Git at Google */

 /*
  * Copyright © 2015 Intel Corporation
  * Copyright © 2014-2015 Broadcom
  * Copyright (C) 2014 Rob Clark <[email protected]>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  */

 #include "compiler/nir/nir.h"
 #include "compiler/nir/nir_builder.h"
 #include "compiler/glsl/list.h"

 #include "main/mtypes.h"
 #include "main/shader_types.h"
 #include "util/ralloc.h"

 #include "prog_to_nir.h"
 #include "prog_instruction.h"
 #include "prog_parameter.h"
 #include "prog_print.h"
 #include "program.h"
 #include "state_tracker/st_nir.h"

 /**
  * \file prog_to_nir.c
  *
  * A translator from Mesa IR (prog_instruction.h) to NIR.  This is primarily
  * intended to support ARB_vertex_program, ARB_fragment_program, and fixed-function
  * vertex processing.  Full GLSL support should use glsl_to_nir instead.
  */

 /**
  * Per-translation state shared by the helpers in this file while a single
  * gl_program is converted to a NIR shader.
  */
 struct ptn_compile {
    /* GL context; read for driver constants such as GLSLFragCoordIsSysVal. */
    const struct gl_context *ctx;
    /* The Mesa IR program being translated. */
    const struct gl_program *prog;
    /* Builder used to emit every instruction; build.shader is the result. */
    nir_builder build;
    /* Checked by the emit loop; nothing in the visible code sets it. */
    bool error;

    /* Uniform vec4 array backing the program parameters, or NULL when the
     * program has no parameters.
     */
    nir_variable *parameters;
    /* Sampler uniforms, created on first use per texture unit. */
    nir_variable *sampler_vars[32]; /* matches number of bits in TexSrcUnit */
    /* Registers shadowing outputs, indexed by output slot; only slots present
     * in outputs_written are populated.
     */
    nir_def **output_regs;
    /* One vec4 register per program temporary. */
    nir_def **temp_regs;

    /* Scalar address register used for relative addressing. */
    nir_def *addr_reg;
 };

 /* Expands to a temporary unsigned[4] holding the four named SWIZZLE_*
  * selectors, in the form nir_swizzle() takes.
  */
 #define SWIZ(X, Y, Z, W) \
    (unsigned[4]){ SWIZZLE_##X, SWIZZLE_##Y, SWIZZLE_##Z, SWIZZLE_##W }
 /* Extracts the single channel named by ch (X, Y, Z or W) from src. */
 #define ptn_channel(b, src, ch) nir_channel(b, src, SWIZZLE_##ch)

 static nir_def *
 ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
 {
    nir_builder *b = &c->build;
    nir_alu_src src;

    memset(&src, 0, sizeof(src));

    switch (prog_src->File) {
    case PROGRAM_UNDEFINED:
       return nir_imm_float(b, 0.0);
    case PROGRAM_TEMPORARY:
       assert(!prog_src->RelAddr && prog_src->Index >= 0);
       src.src = nir_src_for_ssa(nir_load_reg(b, c->temp_regs[prog_src->Index]));
       break;
    case PROGRAM_INPUT: {
       /* ARB_vertex_program doesn't allow relative addressing on vertex
        * attributes; ARB_fragment_program has no relative addressing at all.
        */
       assert(!prog_src->RelAddr);
       assert(prog_src->Index >= 0 && prog_src->Index < VARYING_SLOT_MAX);

       unsigned slot = prog_src->Index;
       nir_def *input;

       if (c->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
          if (slot == VARYING_SLOT_POS && c->ctx->Const.GLSLFragCoordIsSysVal) {
             nir_variable *pos =
                nir_get_variable_with_location(b->shader, nir_var_system_value,
                                               SYSTEM_VALUE_FRAG_COORD,
                                               glsl_vec4_type());
             src.src = nir_src_for_ssa(nir_load_var(b, pos));
             break;
          }

          nir_def *baryc = nir_load_barycentric_pixel(b, 32);

          if (slot != VARYING_SLOT_COL0 && slot != VARYING_SLOT_COL1) {
             nir_intrinsic_set_interp_mode(nir_instr_as_intrinsic(baryc->parent_instr),
                                           INTERP_MODE_SMOOTH);
          }

          input = nir_load_interpolated_input(b, 4, 32, baryc, nir_imm_int(b, 0),
                                              .io_semantics.location = slot);

          /* fogcoord is defined as <f, 0.0, 0.0, 1.0>.  Make the actual
           * input variable a float, and create a local containing the
           * full vec4 value.
           */
          if (slot == VARYING_SLOT_FOGC) {
             input = nir_vec4(b, nir_channel(b, input, 0),
                              nir_imm_float(b, 0),
                              nir_imm_float(b, 0),
                              nir_imm_float(b, 1));
          }
       } else {
          input = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
                                 .io_semantics.location = slot);
       }

       src.src = nir_src_for_ssa(input);
       break;
    }
    case PROGRAM_STATE_VAR:
    case PROGRAM_CONSTANT: {
       /* We actually want to look at the type in the Parameters list for this,
        * because it lets us upload constant builtin uniforms as actual
        * constants.
        */
       struct gl_program_parameter_list *plist = c->prog->Parameters;
       gl_register_file file = prog_src->RelAddr ? prog_src->File :
          plist->Parameters[prog_src->Index].Type;

       switch (file) {
       case PROGRAM_CONSTANT:
          if ((c->prog->arb.IndirectRegisterFiles &
               (1 << PROGRAM_CONSTANT)) == 0) {
             unsigned pvo = plist->Parameters[prog_src->Index].ValueOffset;
             float *v = (float *) plist->ParameterValues + pvo;
             src.src = nir_src_for_ssa(nir_imm_vec4(b, v[0], v[1], v[2], v[3]));
             break;
          }
          FALLTHROUGH;
       case PROGRAM_STATE_VAR: {
          assert(c->parameters != NULL);

          nir_deref_instr *deref = nir_build_deref_var(b, c->parameters);

          nir_def *index = nir_imm_int(b, prog_src->Index);

          /* Add the address register. Note this is (uniquely) a scalar, so the
           * component sizes match.
           */
          if (prog_src->RelAddr)
             index = nir_iadd(b, index, nir_load_reg(b, c->addr_reg));

          deref = nir_build_deref_array(b, deref, index);
          src.src = nir_src_for_ssa(nir_load_deref(b, deref));
          break;
       }
       default:
          fprintf(stderr, "bad uniform src register file: %s (%d)\n",
                  _mesa_register_file_name(file), file);
          abort();
       }
       break;
    }
    default:
       fprintf(stderr, "unknown src register file: %s (%d)\n",
               _mesa_register_file_name(prog_src->File), prog_src->File);
       abort();
    }

    nir_def *def;
    if (!HAS_EXTENDED_SWIZZLE(prog_src->Swizzle) &&
        (prog_src->Negate == NEGATE_NONE || prog_src->Negate == NEGATE_XYZW)) {
       /* The simple non-SWZ case. */
       for (int i = 0; i < 4; i++)
          src.swizzle[i] = GET_SWZ(prog_src->Swizzle, i);

       def = nir_mov_alu(b, src, 4);

       if (prog_src->Negate)
          def = nir_fneg(b, def);
    } else {
       /* The SWZ instruction allows per-component zero/one swizzles, and also
        * per-component negation.
        */
       nir_def *chans[4];
       for (int i = 0; i < 4; i++) {
          int swizzle = GET_SWZ(prog_src->Swizzle, i);
          if (swizzle == SWIZZLE_ZERO) {
             chans[i] = nir_imm_float(b, 0.0);
          } else if (swizzle == SWIZZLE_ONE) {
             chans[i] = nir_imm_float(b, 1.0);
          } else {
             assert(swizzle != SWIZZLE_NIL);
             nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_mov);
             nir_def_init(&mov->instr, &mov->def, 1, 32);
             mov->src[0] = src;
             mov->src[0].swizzle[0] = swizzle;
             nir_builder_instr_insert(b, &mov->instr);

             chans[i] = &mov->def;
          }

          if (prog_src->Negate & (1 << i))
             chans[i] = nir_fneg(b, chans[i]);
       }
       def = nir_vec4(b, chans[0], chans[1], chans[2], chans[3]);
    }

    return def;
 }

 /* EXP - Approximate Exponential Base 2
  *  dst.x = 2^{\lfloor src.x\rfloor}
  *  dst.y = src.x - \lfloor src.x\rfloor
  *  dst.z = 2^{src.x}
  *  dst.w = 1.0
  *
  * Only src[0].x is read.  Each intermediate is built exactly once and in a
  * fixed order, so floor(src.x) is shared between .x and .y and the emitted
  * instruction order does not depend on how the compiler orders the
  * evaluation of call arguments.
  */
 static nir_def *
 ptn_exp(nir_builder *b, nir_def **src)
 {
    nir_def *srcx = ptn_channel(b, src[0], X);
    nir_def *floor_x = nir_ffloor(b, srcx);
    nir_def *exp2_floor = nir_fexp2(b, floor_x);
    nir_def *fract_x = nir_fsub(b, srcx, floor_x);
    nir_def *exp2_x = nir_fexp2(b, srcx);
    nir_def *one = nir_imm_float(b, 1.0);

    return nir_vec4(b, exp2_floor, fract_x, exp2_x, one);
 }

 /* LOG - Approximate Logarithm Base 2
  *  dst.x = \lfloor\log_2{|src.x|}\rfloor
  *  dst.y = \frac{|src.x|}{2^{\lfloor\log_2{|src.x|}\rfloor}}
  *  dst.z = \log_2{|src.x|}
  *  dst.w = 1.0
  *
  * Only src[0].x is read.  log2(|src.x|) and its floor are each built once
  * and reused, and every intermediate is emitted in a fixed order rather than
  * in the compiler's unspecified argument evaluation order.
  */
 static nir_def *
 ptn_log(nir_builder *b, nir_def **src)
 {
    nir_def *abs_srcx = nir_fabs(b, ptn_channel(b, src[0], X));
    nir_def *log2_x = nir_flog2(b, abs_srcx);
    nir_def *floor_log2 = nir_ffloor(b, log2_x);
    nir_def *mantissa = nir_fdiv(b, abs_srcx, nir_fexp2(b, floor_log2));
    nir_def *one = nir_imm_float(b, 1.0);

    return nir_vec4(b, floor_log2, mantissa, log2_x, one);
 }

 /* DST - Distance Vector
  *   dst.x = 1.0
  *   dst.y = src0.y \times src1.y
  *   dst.z = src0.z
  *   dst.w = src1.w
  *
  * Reads src[0] and src[1]; returns the assembled vec4.
  */
 static nir_def *
 ptn_dst(nir_builder *b, nir_def **src)
 {
    nir_def *x = nir_imm_float(b, 1.0);
    nir_def *src0_y = ptn_channel(b, src[0], Y);
    nir_def *src1_y = ptn_channel(b, src[1], Y);
    nir_def *y = nir_fmul(b, src0_y, src1_y);
    nir_def *z = ptn_channel(b, src[0], Z);
    nir_def *w = ptn_channel(b, src[1], W);

    return nir_vec4(b, x, y, z, w);
 }

 /* LIT - Light Coefficients
  *  dst.x = 1.0
  *  dst.y = max(src.x, 0.0)
  *  dst.z = (src.x > 0.0) ? max(src.y, 0.0)^{clamp(src.w, -128.0, 128.0)} : 0
  *  dst.w = 1.0
  *
  * Only src[0] is read.
  */
 static nir_def *
 ptn_lit(nir_builder *b, nir_def **src)
 {
    nir_def *y = ptn_channel(b, src[0], Y);

    /* Exponent: src.w clamped to [-128, 128]. */
    nir_def *w_capped = nir_fmin(b, ptn_channel(b, src[0], W),
                                 nir_imm_float(b, 128.0));
    nir_def *exponent = nir_fmax(b, w_capped, nir_imm_float(b, -128.0));

    nir_def *base = nir_fmax(b, y, nir_imm_float(b, 0.0));
    nir_def *powered = nir_fpow(b, base, exponent);

    /* The power term only contributes when src.x is strictly positive. */
    nir_def *x_not_positive = nir_fle_imm(b, ptn_channel(b, src[0], X), 0.0);
    nir_def *z = nir_bcsel(b, x_not_positive, nir_imm_float(b, 0.0), powered);

    nir_def *clamped_x = nir_fmax(b, ptn_channel(b, src[0], X),
                                  nir_imm_float(b, 0.0));

    return nir_vec4(b, nir_imm_float(b, 1.0),
                       clamped_x,
                       z,
                       nir_imm_float(b, 1.0));
 }

 /* SCS - Sine Cosine
  *   dst.x = \cos{src.x}
  *   dst.y = \sin{src.x}
  *   dst.z = 0.0
  *   dst.w = 1.0
  *
  * Only src[0].x is read.
  */
 static nir_def *
 ptn_scs(nir_builder *b, nir_def **src)
 {
    nir_def *cosine = nir_fcos(b, ptn_channel(b, src[0], X));
    nir_def *sine = nir_fsin(b, ptn_channel(b, src[0], X));
    nir_def *zero = nir_imm_float(b, 0.0);
    nir_def *one = nir_imm_float(b, 1.0);

    return nir_vec4(b, cosine, sine, zero, one);
 }

 /* XPD - Cross Product
  *   dst.xyz = src0.yzx \times src1.zxy - src1.yzx \times src0.zxy
  *   dst.w   = 1.0
  *
  * The products are computed on 3-component swizzles of src[0] and src[1];
  * the result is widened to a vec4 with a constant 1.0 in .w.
  */
 static nir_def *
 ptn_xpd(nir_builder *b, nir_def **src)
 {
    nir_def *a_yzx = nir_swizzle(b, src[0], SWIZ(Y, Z, X, W), 3);
    nir_def *b_zxy = nir_swizzle(b, src[1], SWIZ(Z, X, Y, W), 3);
    nir_def *lhs = nir_fmul(b, a_yzx, b_zxy);

    nir_def *b_yzx = nir_swizzle(b, src[1], SWIZ(Y, Z, X, W), 3);
    nir_def *a_zxy = nir_swizzle(b, src[0], SWIZ(Z, X, Y, W), 3);
    nir_def *rhs = nir_fmul(b, b_yzx, a_zxy);

    nir_def *cross = nir_fsub(b, lhs, rhs);

    nir_def *x = nir_channel(b, cross, 0);
    nir_def *y = nir_channel(b, cross, 1);
    nir_def *z = nir_channel(b, cross, 2);

    return nir_vec4(b, x, y, z, nir_imm_float(b, 1.0));
 }

 /**
  * KIL - discard the fragment when any component of src[0] is below zero.
  *
  * The comparison is built with the builder's "exact" flag set so that a NaN
  * component never triggers the discard (applications rely on this).  The
  * flag is cleared again before the discard itself is emitted.
  */
 static void
 ptn_kil(nir_builder *b, nir_def **src)
 {
    nir_def *any_negative;

    b->exact = true;
    {
       nir_def *negative = nir_flt_imm(b, src[0], 0.0);
       any_negative = nir_bany(b, negative);
    }
    b->exact = false;

    nir_discard_if(b, any_negative);
 }

 /**
  * Maps a Mesa texture target index to the matching GLSL sampler
  * dimensionality.
  *
  * \param index     texture target index to translate
  * \param is_array  written with true for the array targets (1D/2D array,
  *                  2D multisample array, cube array) and false otherwise
  * \return the sampler dimension for \p index
  *
  * Any value that is not a real texture target hits unreachable().
  */
 enum glsl_sampler_dim
 _mesa_texture_index_to_sampler_dim(gl_texture_index index, bool *is_array)
 {
    /* First decide arrayness ... */
    switch (index) {
    case TEXTURE_1D_ARRAY_INDEX:
    case TEXTURE_2D_ARRAY_INDEX:
    case TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX:
    case TEXTURE_CUBE_ARRAY_INDEX:
       *is_array = true;
       break;
    default:
       *is_array = false;
       break;
    }

    /* ... then the dimensionality, which array and non-array targets share.
     * No default label here, so the compiler can still warn about a missing
     * enumerator.
     */
    switch (index) {
    case TEXTURE_1D_INDEX:
    case TEXTURE_1D_ARRAY_INDEX:
       return GLSL_SAMPLER_DIM_1D;
    case TEXTURE_2D_INDEX:
    case TEXTURE_2D_ARRAY_INDEX:
       return GLSL_SAMPLER_DIM_2D;
    case TEXTURE_3D_INDEX:
       return GLSL_SAMPLER_DIM_3D;
    case TEXTURE_CUBE_INDEX:
    case TEXTURE_CUBE_ARRAY_INDEX:
       return GLSL_SAMPLER_DIM_CUBE;
    case TEXTURE_RECT_INDEX:
       return GLSL_SAMPLER_DIM_RECT;
    case TEXTURE_BUFFER_INDEX:
       return GLSL_SAMPLER_DIM_BUF;
    case TEXTURE_2D_MULTISAMPLE_INDEX:
    case TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX:
       return GLSL_SAMPLER_DIM_MS;
    case TEXTURE_EXTERNAL_INDEX:
       return GLSL_SAMPLER_DIM_EXTERNAL;
    case NUM_TEXTURE_TARGETS:
       break;
    }
    unreachable("unknown texture target");
 }

 /**
  * Emits the NIR texture instruction for a TEX/TXB/TXD/TXL/TXP opcode.
  *
  * \param c          translation state; supplies the builder and the
  *                   per-unit sampler variables (created here on first use)
  * \param src        the already-fetched source operands of the instruction
  * \param prog_inst  the Mesa IR instruction being translated
  * \return the vec4 float result of the texture lookup
  *
  * src[0] provides the coordinate; its .w doubles as projector (TXP), bias
  * (TXB) or LOD (TXL).  For TXD, src[1] and src[2] provide the explicit
  * derivatives.  Shadow lookups take the comparator from src[0].z, or
  * src[0].w when the coordinate already uses three components.
  *
  * An opcode outside that set prints a diagnostic and aborts.
  *
  * NOTE(review): the is_array flag reported for the target is not copied to
  * instr->is_array and coord_components ignores it -- confirm that array
  * targets cannot reach this path.
  */
 static nir_def *
 ptn_tex(struct ptn_compile *c, nir_def **src,
         struct prog_instruction *prog_inst)
 {
    nir_builder *b = &c->build;
    nir_tex_instr *instr;
    nir_texop op;
    unsigned num_srcs;

    /* num_srcs counts the opcode-specific operands: the coordinate plus
     * projector/bias/LOD, or plus the two derivatives for TXD.
     */
    switch (prog_inst->Opcode) {
    case OPCODE_TEX:
       op = nir_texop_tex;
       num_srcs = 1;
       break;
    case OPCODE_TXB:
       op = nir_texop_txb;
       num_srcs = 2;
       break;
    case OPCODE_TXD:
       op = nir_texop_txd;
       num_srcs = 3;
       break;
    case OPCODE_TXL:
       op = nir_texop_txl;
       num_srcs = 2;
       break;
    case OPCODE_TXP:
       op = nir_texop_tex;
       num_srcs = 2;
       break;
    default:
       fprintf(stderr, "unknown tex op %d\n", prog_inst->Opcode);
       abort();
    }

    /* Deref sources */
    num_srcs += 2;

    if (prog_inst->TexShadow)
       num_srcs++;

    instr = nir_tex_instr_create(b->shader, num_srcs);
    instr->op = op;
    instr->dest_type = nir_type_float32;
    instr->is_shadow = prog_inst->TexShadow;

    bool is_array;
    instr->sampler_dim = _mesa_texture_index_to_sampler_dim(prog_inst->TexSrcTarget, &is_array);

    instr->coord_components =
       glsl_get_sampler_dim_coordinate_components(instr->sampler_dim);

    /* One sampler uniform per texture unit, created lazily and bound to the
     * unit number.
     */
    nir_variable *var = c->sampler_vars[prog_inst->TexSrcUnit];
    if (!var) {
       const struct glsl_type *type =
          glsl_sampler_type(instr->sampler_dim, instr->is_shadow, false, GLSL_TYPE_FLOAT);
       char samplerName[20];
       snprintf(samplerName, sizeof(samplerName), "sampler_%d", prog_inst->TexSrcUnit);
       var = nir_variable_create(b->shader, nir_var_uniform, type, samplerName);
       var->data.binding = prog_inst->TexSrcUnit;
       var->data.explicit_binding = true;
       c->sampler_vars[prog_inst->TexSrcUnit] = var;
    }

    nir_deref_instr *deref = nir_build_deref_var(b, var);

    unsigned src_number = 0;

    /* The same variable serves as both texture and sampler. */
    instr->src[src_number] = nir_tex_src_for_ssa(nir_tex_src_texture_deref,
                                                 &deref->def);
    src_number++;
    instr->src[src_number] = nir_tex_src_for_ssa(nir_tex_src_sampler_deref,
                                                 &deref->def);
    src_number++;

    instr->src[src_number] = nir_tex_src_for_ssa(nir_tex_src_coord,
                                                 nir_trim_vector(b, src[0],
                                                                 instr->coord_components));
    src_number++;

    if (prog_inst->Opcode == OPCODE_TXP) {
       instr->src[src_number] = nir_tex_src_for_ssa(nir_tex_src_projector,
                                                    ptn_channel(b, src[0], W));
       src_number++;
    }

    if (prog_inst->Opcode == OPCODE_TXB) {
       instr->src[src_number] = nir_tex_src_for_ssa(nir_tex_src_bias,
                                                    ptn_channel(b, src[0], W));
       src_number++;
    }

    if (prog_inst->Opcode == OPCODE_TXL) {
       instr->src[src_number] = nir_tex_src_for_ssa(nir_tex_src_lod,
                                                    ptn_channel(b, src[0], W));
       src_number++;
    }

    if (prog_inst->Opcode == OPCODE_TXD) {
       /* Explicit derivatives come from the second and third operands and
        * are trimmed to the coordinate width, like the coordinate itself.
        * Without them the two slots reserved above stay empty and the
        * source-count assertion below fails.
        */
       instr->src[src_number] = nir_tex_src_for_ssa(nir_tex_src_ddx,
                                                    nir_trim_vector(b, src[1],
                                                                    instr->coord_components));
       src_number++;
       instr->src[src_number] = nir_tex_src_for_ssa(nir_tex_src_ddy,
                                                    nir_trim_vector(b, src[2],
                                                                    instr->coord_components));
       src_number++;
    }

    if (instr->is_shadow) {
       /* The comparator sits in the first channel after the coordinate:
        * .z for 1- and 2-component coordinates, .w otherwise.
        */
       nir_def *comparator = instr->coord_components < 3 ?
          ptn_channel(b, src[0], Z) : ptn_channel(b, src[0], W);

       instr->src[src_number] = nir_tex_src_for_ssa(nir_tex_src_comparator,
                                                    comparator);
       src_number++;
    }

    assert(src_number == num_srcs);

    nir_def_init(&instr->instr, &instr->def, 4, 32);
    nir_builder_instr_insert(b, &instr->instr);

    return &instr->def;
 }

 /**
  * Opcode-indexed table of Mesa IR opcodes that map 1:1 onto a NIR ALU op.
  *
  * A zero entry means "no direct translation"; ptn_emit_instruction() only
  * consults this table from its default case and aborts on a zero entry.
  */
 static const nir_op op_trans[MAX_OPCODE] = {
    /* Direct ALU translations. */
    [OPCODE_ABS] = nir_op_fabs,
    [OPCODE_ADD] = nir_op_fadd,
    [OPCODE_FLR] = nir_op_ffloor,
    [OPCODE_FRC] = nir_op_ffract,
    [OPCODE_MAX] = nir_op_fmax,
    [OPCODE_MIN] = nir_op_fmin,
    [OPCODE_MOV] = nir_op_mov,
    [OPCODE_MUL] = nir_op_fmul,
    [OPCODE_SSG] = nir_op_fsign,
    [OPCODE_SUB] = nir_op_fsub,

    /* No direct translation. */
    [OPCODE_NOP] = 0,
    [OPCODE_ARL] = 0,
    [OPCODE_CMP] = 0,
    [OPCODE_COS] = 0,
    [OPCODE_DDX] = 0,
    [OPCODE_DDY] = 0,
    [OPCODE_DP2] = 0,
    [OPCODE_DP3] = 0,
    [OPCODE_DP4] = 0,
    [OPCODE_DPH] = 0,
    [OPCODE_DST] = 0,
    [OPCODE_END] = 0,
    [OPCODE_EX2] = 0,
    [OPCODE_EXP] = 0,
    [OPCODE_LG2] = 0,
    [OPCODE_LIT] = 0,
    [OPCODE_LOG] = 0,
    [OPCODE_LRP] = 0,
    [OPCODE_MAD] = 0,
    [OPCODE_POW] = 0,
    [OPCODE_RCP] = 0,
    [OPCODE_RSQ] = 0,
    [OPCODE_SCS] = 0,
    [OPCODE_SGE] = 0,
    [OPCODE_SIN] = 0,
    [OPCODE_SLT] = 0,
    [OPCODE_SWZ] = 0,
    [OPCODE_TEX] = 0,
    [OPCODE_TXB] = 0,
    [OPCODE_TXD] = 0,
    [OPCODE_TXL] = 0,
    [OPCODE_TXP] = 0,
    [OPCODE_XPD] = 0,
 };

 /**
  * Translates one Mesa IR instruction and appends the result to the shader.
  *
  * All three source operands are fetched up front (unused operands come back
  * as a constant 0.0), the opcode is lowered to NIR, and the value is then
  * replicated to a vec4 if scalar, optionally saturated, and stored to the
  * destination register under the instruction's write mask.
  *
  * OPCODE_END, OPCODE_NOP and OPCODE_KIL produce no destination write.  An
  * opcode without a translation, or a destination register file that cannot
  * be written, prints a diagnostic and aborts.
  */
 static void
 ptn_emit_instruction(struct ptn_compile *c, struct prog_instruction *prog_inst)
 {
    nir_builder *b = &c->build;
    unsigned i;
    const unsigned op = prog_inst->Opcode;

    if (op == OPCODE_END)
       return;

    nir_def *src[3];
    for (i = 0; i < 3; i++) {
       src[i] = ptn_get_src(c, &prog_inst->SrcReg[i]);
    }

    nir_def *dst = NULL;
    if (c->error)
       return;

    switch (op) {
    case OPCODE_DDX:
       dst = nir_ddx(b, src[0]);
       break;

    case OPCODE_DDY:
       dst = nir_ddy(b, src[0]);
       break;

    /* The scalar opcodes below read .x only; the scalar result is
     * replicated across all channels further down.
     */
    case OPCODE_RSQ:
       dst = nir_frsq(b, nir_fabs(b, ptn_channel(b, src[0], X)));
       break;

    case OPCODE_RCP:
       dst = nir_frcp(b, ptn_channel(b, src[0], X));
       break;

    case OPCODE_EX2:
       dst = nir_fexp2(b, ptn_channel(b, src[0], X));
       break;

    case OPCODE_LG2:
       dst = nir_flog2(b, ptn_channel(b, src[0], X));
       break;

    case OPCODE_POW:
       dst = nir_fpow(b, ptn_channel(b, src[0], X), ptn_channel(b, src[1], X));
       break;

    case OPCODE_COS:
       dst = nir_fcos(b, ptn_channel(b, src[0], X));
       break;

    case OPCODE_SIN:
       dst = nir_fsin(b, ptn_channel(b, src[0], X));
       break;

    case OPCODE_ARL:
       /* Address load: floor, then convert to a signed integer. */
       dst = nir_f2i32(b, nir_ffloor(b, src[0]));
       break;

    case OPCODE_EXP:
       dst = ptn_exp(b, src);
       break;

    case OPCODE_LOG:
       dst = ptn_log(b, src);
       break;

    case OPCODE_LRP:
       /* src[0] is the interpolation weight, hence the reversed order. */
       dst = nir_flrp(b, src[2], src[1], src[0]);
       break;

    case OPCODE_MAD:
       dst = nir_fadd(b, nir_fmul(b, src[0], src[1]), src[2]);
       break;

    case OPCODE_DST:
       dst = ptn_dst(b, src);
       break;

    case OPCODE_LIT:
       dst = ptn_lit(b, src);
       break;

    case OPCODE_XPD:
       dst = ptn_xpd(b, src);
       break;

    case OPCODE_DP2:
       dst = nir_fdot2(b, src[0], src[1]);
       break;

    case OPCODE_DP3:
       dst = nir_fdot3(b, src[0], src[1]);
       break;

    case OPCODE_DP4:
       dst = nir_fdot4(b, src[0], src[1]);
       break;

    case OPCODE_DPH:
       dst = nir_fdph(b, src[0], src[1]);
       break;

    case OPCODE_KIL:
       ptn_kil(b, src);
       break;

    case OPCODE_CMP:
       /* Per component: src[0] < 0 ? src[1] : src[2]. */
       dst = nir_bcsel(b, nir_flt_imm(b, src[0], 0.0), src[1], src[2]);
       break;

    case OPCODE_SCS:
       dst = ptn_scs(b, src);
       break;

    case OPCODE_SLT:
       dst = nir_slt(b, src[0], src[1]);
       break;

    case OPCODE_SGE:
       dst = nir_sge(b, src[0], src[1]);
       break;

    case OPCODE_TEX:
    case OPCODE_TXB:
    case OPCODE_TXD:
    case OPCODE_TXL:
    case OPCODE_TXP:
       dst = ptn_tex(c, src, prog_inst);
       break;

    case OPCODE_SWZ:
       /* Extended swizzles were already handled in ptn_get_src(). */
       dst = nir_build_alu_src_arr(b, nir_op_mov, src);
       break;

    case OPCODE_NOP:
       break;

    default:
       if (op_trans[op] != 0) {
          dst = nir_build_alu_src_arr(b, op_trans[op], src);
       } else {
          fprintf(stderr, "unknown opcode: %s\n", _mesa_opcode_string(op));
          abort();
       }
       break;
    }

    /* Nothing to store (NOP, KIL). */
    if (dst == NULL)
       return;

    if (dst->num_components == 1)
       dst = nir_replicate(b, dst, 4);

    assert(dst->num_components == 4);

    if (prog_inst->Saturate)
       dst = nir_fsat(b, dst);

    const struct prog_dst_register *prog_dst = &prog_inst->DstReg;
    assert(!prog_dst->RelAddr);

    nir_def *reg = NULL;
    unsigned write_mask = prog_dst->WriteMask;

    switch (prog_dst->File) {
    case PROGRAM_TEMPORARY:
       reg = c->temp_regs[prog_dst->Index];
       break;
    case PROGRAM_OUTPUT:
       reg = c->output_regs[prog_dst->Index];
       break;
    case PROGRAM_ADDRESS:
       assert(prog_dst->Index == 0);
       reg = c->addr_reg;

       /* The address register (uniquely) is scalar. */
       dst = nir_channel(b, dst, 0);
       write_mask &= 1;
       break;
    case PROGRAM_UNDEFINED:
       return;
    default:
       /* reg stays NULL and is rejected below. */
       break;
    }

    /* In case there was some silly .y write to the scalar address reg */
    if (write_mask == 0)
       return;

    /* Reject unwritable destination files explicitly; an assert alone would
     * let a NULL register reach the store in NDEBUG builds.
     */
    if (reg == NULL) {
       fprintf(stderr, "unknown dst register file: %s (%d)\n",
               _mesa_register_file_name(prog_dst->File), prog_dst->File);
       abort();
    }

    nir_build_store_reg(b, dst, reg, .write_mask = write_mask);
 }

 /**
  * Puts a NIR intrinsic to store of each PROGRAM_OUTPUT value to the output
  * variables at the end of the shader.
  *
  * We don't generate these incrementally as the PROGRAM_OUTPUT values are
  * written, because there's no output load intrinsic, which means we couldn't
  * handle writemasks.
  */
 static void
 ptn_add_output_stores(struct ptn_compile *c)
 {
    nir_builder *b = &c->build;

    u_foreach_bit64(slot, b->shader->info.outputs_written) {
       nir_def *src = nir_load_reg(b, c->output_regs[slot]);
       if (c->prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
           slot == FRAG_RESULT_DEPTH) {
          /* result.depth has this strange convention of being the .z component of
           * a vec4 with undefined .xyw components.  We resolve it to a scalar, to
           * match GLSL's gl_FragDepth and the expectations of most backends.
           */
          src = nir_channel(b, src, 2);
       }
       if (c->prog->Target == GL_VERTEX_PROGRAM_ARB &&
           (slot == VARYING_SLOT_FOGC || slot == VARYING_SLOT_PSIZ)) {
          /* result.{fogcoord,psiz} is a single component value */
          src = nir_channel(b, src, 0);
       }

       nir_store_output(b, src, nir_imm_int(b, 0),
                        .io_semantics.location = slot);
    }
 }

 static void
 setup_registers_and_variables(struct ptn_compile *c)
 {
    nir_builder *b = &c->build;

    /* Create output registers. */
    int max_outputs = util_last_bit64(c->prog->info.outputs_written);
    c->output_regs = rzalloc_array(c, nir_def *, max_outputs);

    u_foreach_bit64(i, c->prog->info.outputs_written) {
       /* Since we can't load from outputs in the IR, we make temporaries
        * for the outputs and emit stores to the real outputs at the end of
        * the shader.
        */
       c->output_regs[i] = nir_decl_reg(b, 4, 32, 0);
    }

    /* Create temporary registers. */
    c->temp_regs = rzalloc_array(c, nir_def *,
                                 c->prog->arb.NumTemporaries);

    for (unsigned i = 0; i < c->prog->arb.NumTemporaries; i++) {
       c->temp_regs[i] = nir_decl_reg(b, 4, 32, 0);
    }

    /* Create the address register (for ARB_vertex_program). This is uniquely a
     * scalar, requiring special handling for stores.
     */
    c->addr_reg = nir_decl_reg(b, 1, 32, 0);
 }

 struct nir_shader *
 prog_to_nir(const struct gl_context *ctx, const struct gl_program *prog)
 {
    const struct nir_shader_compiler_options *options =
       st_get_nir_compiler_options(ctx->st, prog->info.stage);
    struct ptn_compile *c;
    struct nir_shader *s;
    gl_shader_stage stage = _mesa_program_enum_to_shader_stage(prog->Target);

    c = rzalloc(NULL, struct ptn_compile);
    if (!c)
       return NULL;
    c->prog = prog;
    c->ctx = ctx;

    c->build = nir_builder_init_simple_shader(stage, options, NULL);

    /* Copy the shader_info from the gl_program */
    c->build.shader->info = prog->info;

    s = c->build.shader;

    if (prog->Parameters->NumParameters > 0) {
       const struct glsl_type *type =
          glsl_array_type(glsl_vec4_type(), prog->Parameters->NumParameters, 0);
       c->parameters =
          nir_variable_create(s, nir_var_uniform, type,
                              prog->Parameters->Parameters[0].Name);
    }

    setup_registers_and_variables(c);
    if (unlikely(c->error))
       goto fail;

    for (unsigned int i = 0; i < prog->arb.NumInstructions; i++) {
       ptn_emit_instruction(c, &prog->arb.Instructions[i]);

       if (unlikely(c->error))
          break;
    }

    ptn_add_output_stores(c);

    s->info.name = ralloc_asprintf(s, "ARB%d", prog->Id);
    s->info.num_textures = util_last_bit(prog->SamplersUsed);
    s->info.num_ubos = 0;
    s->info.num_abos = 0;
    s->info.num_ssbos = 0;
    s->info.num_images = 0;
    s->info.uses_texture_gather = false;
    s->info.clip_distance_array_size = 0;
    s->info.cull_distance_array_size = 0;
    s->info.separate_shader = true;
    s->info.io_lowered = true;
    s->info.internal = false;

    /* ARB_vp: */
    if (prog->arb.IsPositionInvariant) {
       NIR_PASS(_, s, st_nir_lower_position_invariant,
                  ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS,
                  prog->Parameters);
    }

    /* Add OPTION ARB_fog_exp code */
    if (prog->arb.Fog)
       NIR_PASS(_, s, st_nir_lower_fog, prog->arb.Fog, prog->Parameters);

 fail:
    if (c->error) {
       ralloc_free(s);
       s = NULL;
    }
    ralloc_free(c);
    return s;
 }