/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef MI_BUILDER_H
#define MI_BUILDER_H

#include "dev/intel_device_info.h"
#include "genxml/genX_bits.h"
#include "util/bitscan.h"
#include "util/fast_idiv_by_const.h"
#include "util/u_math.h"

#ifndef MI_BUILDER_NUM_ALLOC_GPRS
/** The number of GPRs the MI builder is allowed to allocate
 *
 * This may be set by a user of this API so that it can reserve some GPRs at
 * the top end for its own use.  It sizes mi_builder::gpr_refs and bounds the
 * indices handed out by mi_new_gpr() and accepted by mi_reserve_gpr().
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 16
#endif

#ifndef MI_BUILDER_DEFAULT_WRITE_CHECK
/** Initial value of mi_builder::write_check, as set by mi_builder_init()
 *
 * This may be set by a user of this API before including this header.  The
 * flag only exists on Gfx12+.
 */
#define MI_BUILDER_DEFAULT_WRITE_CHECK true
#endif

#ifndef MI_BUILDER_RAW_MEM_FENCING
/** Whether memory reads must be fenced against earlier MI memory writes
 *
 * Enabled by default on Gfx20+.  This may be set by a user of this API.  The
 * expansion is parenthesized so that the macro stays a single operand when
 * it is combined with other operators (for example when negated).
 */
#define MI_BUILDER_RAW_MEM_FENCING (GFX_VER >= 20)
#endif

/** These must be defined by the user of the builder
 *
 * void *__gen_get_batch_dwords(__gen_user_data *user_data,
 *                              unsigned num_dwords);
 *
 * __gen_address_type
 * __gen_address_offset(__gen_address_type addr, uint64_t offset);
 *
 *
 * If self-modifying batches are supported, we must be able to pass batch
 * addresses around as void*s so pinning as well as batch chaining or some
 * other mechanism for ensuring batch pointers remain valid during building is
 * required. The following function must also be defined, it returns an
 * address in canonical form:
 *
 * __gen_address_type
 * __gen_get_batch_address(__gen_user_data *user_data, void *location);
 *
 * Also, __gen_combine_address must accept a location value of NULL and return
 * a fully valid 64-bit address.
 */

/**
 * On Gfx20+ this must also be defined by the user of the builder
 *
 * bool *
 * __gen_get_write_fencing_status(__gen_user_data *user_data);
 *
 * Returns a pointer to a boolean tracking the status of fencing for MI
 * commands writing to memory.
 */

/*
 * Start of the actual MI builder
 */

/* Token-pasting helpers that derive the genxml-generated length constant,
 * header initializer and pack function names from a command name.
 */
#define __genxml_cmd_length(cmd) cmd ## _length
#define __genxml_cmd_header(cmd) cmd ## _header
#define __genxml_cmd_pack(cmd) cmd ## _pack

/* Fill in and pack a command into dst.
 *
 * Expands to a for-loop header that must be followed by a statement or
 * block.  The loop declares a "struct cmd name" initialized with the command
 * header and a "_dst" pointer to the destination.  The body runs once (when
 * dst is not NULL) so the caller can set fields on "name"; afterwards the
 * command is packed into dst and "_dst" is cleared to terminate the loop.
 * If dst is NULL the body is skipped and nothing is packed.
 */
#define mi_builder_pack(b, cmd, dst, name)                          \
   for (struct cmd name = { __genxml_cmd_header(cmd) },                 \
        *_dst = (struct cmd *)(dst); __builtin_expect(_dst != NULL, 1); \
        __genxml_cmd_pack(cmd)((b)->user_data, (void *)_dst, &name),    \
        _dst = NULL)

/* Get the instruction pointer inside a mi_builder_pack() block */
#define mi_builder_get_inst_ptr(b) \
   ((uint8_t *)_dst)

/* Same as mi_builder_pack() but the destination is freshly allocated from
 * the batch with __gen_get_batch_dwords(), sized by the command's length.
 */
#define mi_builder_emit(b, cmd, name)                               \
   mi_builder_pack((b), cmd, __gen_get_batch_dwords((b)->user_data, __genxml_cmd_length(cmd)), name)

/** Kind of operand described by a struct mi_value */
enum mi_value_type {
   /* 64-bit immediate constant, stored in mi_value::imm */
   MI_VALUE_TYPE_IMM,
   /* 32-bit value in memory at mi_value::addr */
   MI_VALUE_TYPE_MEM32,
   /* 64-bit value in memory at mi_value::addr */
   MI_VALUE_TYPE_MEM64,
   /* 32-bit register at offset mi_value::reg */
   MI_VALUE_TYPE_REG32,
   /* 64-bit register (two consecutive dwords) starting at mi_value::reg */
   MI_VALUE_TYPE_REG64,
};

/** An operand or result of an MI builder operation */
struct mi_value {
   /* Selects which member of the union below is valid */
   enum mi_value_type type;

   union {
      /* Valid for MI_VALUE_TYPE_IMM */
      uint64_t imm;
      /* Valid for MI_VALUE_TYPE_MEM32 and MI_VALUE_TYPE_MEM64 */
      __gen_address_type addr;
      /* Valid for MI_VALUE_TYPE_REG32 and MI_VALUE_TYPE_REG64 */
      uint32_t reg;
   };

#if GFX_VERx10 >= 75
   /* When set, the value stands for the bitwise inverse of what is stored.
    * Toggled by mi_inot() and resolved lazily when the value is loaded into
    * the ALU or by mi_resolve_invert().
    */
   bool invert;
#endif
};

/** Register offset in the form expected by the MI register commands
 *
 * Produced by mi_adjust_reg_num().
 */
struct mi_reg_num {
   /* Register offset to program into the command */
   uint32_t num;
#if GFX_VER >= 11
   /* True when num was rebased by 0x2000; fed to the commands'
    * AddCSMMIOStartOffset fields.
    */
   bool cs;
#endif
};

static inline struct mi_reg_num
mi_adjust_reg_num(uint32_t reg)
{
#if GFX_VER >= 11
   bool cs = reg >= 0x2000 && reg < 0x4000;
   return (struct mi_reg_num) {
      .num = reg - (cs ? 0x2000 : 0),
      .cs = cs,
   };
#else
   return (struct mi_reg_num) { .num = reg, };
#endif
}

/* Capacity, in dwords, of the mi_builder::math_dwords buffer in which ALU
 * instructions are accumulated before being flushed as a single MI_MATH
 * command by mi_builder_flush_math().
 */
#if GFX_VER >= 9
#define MI_BUILDER_MAX_MATH_DWORDS 256
#else
#define MI_BUILDER_MAX_MATH_DWORDS 64
#endif

/** State of the MI command builder */
struct mi_builder {
   /* Device description handed to mi_builder_init() */
   const struct intel_device_info *devinfo;
   /* Opaque pointer passed through to the user-provided __gen_* callbacks */
   __gen_user_data *user_data;

   /* While true, mi_ensure_write_fence() does not emit any fence.  Set
    * temporarily by mi_memcpy().
    */
   bool no_read_write_fencing;

#if GFX_VERx10 >= 75
   /* Bitmask of currently allocated GPRs (bit N set => GPR N is in use) */
   uint32_t gprs;
   /* Reference count of each allocatable GPR */
   uint8_t gpr_refs[MI_BUILDER_NUM_ALLOC_GPRS];

   /* ALU instruction dwords queued for the next MI_MATH command */
   unsigned num_math_dwords;
   uint32_t math_dwords[MI_BUILDER_MAX_MATH_DWORDS];
#endif

#if GFX_VERx10 >= 125
   /* MOCS value programmed into emitted MI_MATH commands */
   uint32_t mocs;
#endif

#if GFX_VER >= 12
   /* Value of ForceWriteCompletionCheck for immediate memory stores */
   bool write_check;
#endif
};

/** Initialize a MI builder
 *
 * Resets the whole builder to zero, then records the device info and the
 * user data pointer that is later handed to the __gen_* callbacks.  On
 * Gfx12+ the write check flag starts out as MI_BUILDER_DEFAULT_WRITE_CHECK.
 */
static inline void
mi_builder_init(struct mi_builder *b,
                const struct intel_device_info *devinfo,
                __gen_user_data *user_data)
{
   /* Start from a fully cleared state: no GPRs allocated, no pending math */
   memset(b, 0, sizeof(*b));

   b->user_data = user_data;
   b->devinfo = devinfo;
   b->no_read_write_fencing = false;

#if GFX_VERx10 >= 75
   b->num_math_dwords = 0;
   b->gprs = 0;
#endif

#if GFX_VER >= 12
   b->write_check = MI_BUILDER_DEFAULT_WRITE_CHECK;
#endif
}

/** Emit all queued ALU instructions as a single MI_MATH command
 *
 * Does nothing when no ALU instruction is pending (and on generations
 * without MI_MATH).  Otherwise one header dword plus the queued dwords are
 * allocated from the batch and the queue is emptied.
 */
static inline void
mi_builder_flush_math(struct mi_builder *b)
{
#if GFX_VERx10 >= 75
   const unsigned num_alu_dwords = b->num_math_dwords;

   if (num_alu_dwords != 0) {
      /* One dword for the MI_MATH header followed by the ALU instructions */
      const unsigned total_dwords = 1 + num_alu_dwords;
      uint32_t *batch =
         (uint32_t *)__gen_get_batch_dwords(b->user_data, total_dwords);

      mi_builder_pack(b, GENX(MI_MATH), batch, math) {
#if GFX_VERx10 >= 125
         math.MOCS = b->mocs;
#endif
         math.DWordLength = total_dwords - GENX(MI_MATH_length_bias);
      }

      memcpy(&batch[1], b->math_dwords, num_alu_dwords * sizeof(uint32_t));
      b->num_math_dwords = 0;
   }
#endif
}

/**
 * Set the MOCS index used by the MI builder
 *
 * This is required when a MI_MATH instruction will be emitted and
 * the code is used in GFX 12.5 or newer.  It is a no-op on older
 * generations.
 */
static inline void
mi_builder_set_mocs(UNUSED struct mi_builder *b, UNUSED uint32_t mocs)
{
#if GFX_VERx10 >= 125
   /* Pending math was queued for the previous (non-zero) MOCS: emit it
    * before switching to a different value.
    */
   const bool mocs_changed = b->mocs != 0 && b->mocs != mocs;
   if (mocs_changed) {
      mi_builder_flush_math(b);
   }

   b->mocs = mocs;
#endif
}

/**
 * Set write checks on immediate writes
 *
 * The flag is copied into the ForceWriteCompletionCheck field of the
 * MI_STORE_DATA_IMM commands emitted by the builder.  When enabled, this is
 * meant to ensure that the next memory write will complete only when all
 * previously emitted memory writes are complete (NOTE(review): confirm the
 * exact ordering guarantee against the hardware documentation).
 *
 * Only has an effect on Gfx12+.
 */
static inline void
mi_builder_set_write_check(UNUSED struct mi_builder *b, UNUSED bool check)
{
#if GFX_VER >= 12
   b->write_check = check;
#endif
}

/** Return whether write checks on immediate writes are currently enabled
 *
 * Always false before Gfx12, where the feature does not exist.
 */
static inline bool
mi_builder_write_checked(UNUSED struct mi_builder *b)
{
   bool checked = false;

#if GFX_VER >= 12
   checked = b->write_check;
#endif

   return checked;
}

/* Register offset of the first GPR; GPR N lives at base + N * 8 */
#define _MI_BUILDER_GPR_BASE 0x2600
/* The actual hardware limit on GPRs */
#define _MI_BUILDER_NUM_HW_GPRS 16

#if GFX_VERx10 >= 75

/** Return true if val refers to a register (32 or 64-bit) */
static inline bool
mi_value_is_reg(struct mi_value val)
{
   switch (val.type) {
   case MI_VALUE_TYPE_REG32:
   case MI_VALUE_TYPE_REG64:
      return true;
   default:
      return false;
   }
}

/** Return true if val is a register located in the hardware GPR range */
static inline bool
mi_value_is_gpr(struct mi_value val)
{
   if (!mi_value_is_reg(val))
      return false;

   /* Each GPR is 8 bytes wide */
   return val.reg >= _MI_BUILDER_GPR_BASE &&
          val.reg < _MI_BUILDER_GPR_BASE + _MI_BUILDER_NUM_HW_GPRS * 8;
}

/** Return true if val is a register within the GPRs the builder may
 * allocate, i.e. the first MI_BUILDER_NUM_ALLOC_GPRS hardware GPRs.
 */
static inline bool
_mi_value_is_allocated_gpr(struct mi_value val)
{
   if (!mi_value_is_reg(val))
      return false;

   /* Each GPR is 8 bytes wide */
   return val.reg >= _MI_BUILDER_GPR_BASE &&
          val.reg < _MI_BUILDER_GPR_BASE + MI_BUILDER_NUM_ALLOC_GPRS * 8;
}

/** Return the index of the GPR that val lives in
 *
 * val must be a GPR register value whose offset is dword aligned.
 */
static inline uint32_t
_mi_value_as_gpr(struct mi_value val)
{
   assert(mi_value_is_gpr(val));

   /* Some of the GRL metakernels will generate 64bit value in a GP register,
    * then use only half of that as the last operation on that value. So allow
    * unref on part of a GP register: the offset only has to be dword aligned
    * and either half maps to the same GPR index.
    */
   assert(val.reg % 4 == 0);

   const uint32_t byte_offset = val.reg - _MI_BUILDER_GPR_BASE;
   return byte_offset / 8;
}

static inline struct mi_value
mi_new_gpr(struct mi_builder *b)
{
   unsigned gpr = ffs(~b->gprs) - 1;
   assert(gpr < MI_BUILDER_NUM_ALLOC_GPRS);
   assert(b->gpr_refs[gpr] == 0);
   b->gprs |= (1u << gpr);
   b->gpr_refs[gpr] = 1;

   return (struct mi_value) {
      .type = MI_VALUE_TYPE_REG64,
      .reg = _MI_BUILDER_GPR_BASE + gpr * 8,
   };
}

/** Reserve a specific GPR
 *
 * Marks GPR number gpr as allocated so that mi_new_gpr() never hands it out,
 * and returns it as a 64-bit register value.  The GPR must be allocatable
 * (gpr < MI_BUILDER_NUM_ALLOC_GPRS) and must not be in use already.
 */
static inline struct mi_value
mi_reserve_gpr(struct mi_builder *b, unsigned gpr)
{
   assert(gpr < MI_BUILDER_NUM_ALLOC_GPRS);
   /* Use an unsigned shift like every other access to the allocation mask */
   assert(!(b->gprs & (1u << gpr)));
   assert(b->gpr_refs[gpr] == 0);
   b->gprs |= (1u << gpr);
   b->gpr_refs[gpr] = 128; /* Enough that we won't unref it */

   return (struct mi_value) {
      .type = MI_VALUE_TYPE_REG64,
      .reg = _MI_BUILDER_GPR_BASE + gpr * 8,
   };
}
#endif /* GFX_VERx10 >= 75 */

/** Take num_refs references to a mi_value
 *
 * The MI builder uses reference counting to automatically free ALU GPRs for
 * re-use in calculations.  All mi_* math functions consume the reference
 * they are handed for each source and return a reference to a value which the
 * caller must consume.  In particular, if you pass the same value into a
 * single mi_* math function twice (say to add a number to itself), you
 * are responsible for calling mi_value_ref() to get a second reference
 * because the mi_* math function will consume it twice.
 *
 * Values that are not builder-allocatable GPRs are not reference counted and
 * are left untouched.
 */
static inline void
mi_value_add_refs(struct mi_builder *b, struct mi_value val, unsigned num_refs)
{
#if GFX_VERx10 >= 75
   if (_mi_value_is_allocated_gpr(val)) {
      unsigned gpr = _mi_value_as_gpr(val);
      assert(gpr < MI_BUILDER_NUM_ALLOC_GPRS);
      assert(b->gprs & (1u << gpr));
      /* The counter is only 8 bits wide: the whole increment has to fit,
       * not just a single reference.
       */
      assert(num_refs <= (unsigned)(UINT8_MAX - b->gpr_refs[gpr]));
      b->gpr_refs[gpr] += num_refs;
   }
#endif /* GFX_VERx10 >= 75 */
}

/** Take one additional reference to val and return val
 *
 * Convenience wrapper around mi_value_add_refs() that can be used inline in
 * an expression.
 */
static inline struct mi_value
mi_value_ref(struct mi_builder *b, struct mi_value val)
{
   mi_value_add_refs(b, val, 1);
   return val;
}


/** Drop a reference to a mi_value
 *
 * When the last reference to a builder-allocated GPR goes away, the GPR is
 * returned to the pool.  Values that are not builder-allocatable GPRs are
 * ignored.
 *
 * See also mi_value_ref.
 */
static inline void
mi_value_unref(struct mi_builder *b, struct mi_value val)
{
#if GFX_VERx10 >= 75
   if (!_mi_value_is_allocated_gpr(val))
      return;

   const unsigned idx = _mi_value_as_gpr(val);
   assert(idx < MI_BUILDER_NUM_ALLOC_GPRS);
   assert(b->gprs & (1u << idx));
   assert(b->gpr_refs[idx] > 0);

   b->gpr_refs[idx]--;
   if (b->gpr_refs[idx] == 0) {
      /* Last reference dropped: release the GPR */
      b->gprs &= ~(1u << idx);
   }
#endif /* GFX_VERx10 >= 75 */
}

/* On Gfx20+ memory reads and writes can be processed out of order, so we
 * need to track the writes to memory to make sure any memory read will see
 * the effect of a previous write.
 *
 * This records that a memory write was emitted and has not been fenced yet.
 */
static inline void
mi_builder_set_write(struct mi_builder *b)
{
#if MI_BUILDER_RAW_MEM_FENCING
   bool *write_pending = __gen_get_write_fencing_status(b->user_data);
   *write_pending = true;
#endif
}

/** Fence previous MI memory writes before a memory read, if needed
 *
 * Emits a MI_MEM_FENCE of type MI_WRITE and clears the pending-write status
 * when a write has been recorded by mi_builder_set_write().  Nothing is
 * emitted while fencing is suppressed through b->no_read_write_fencing, nor
 * when raw memory fencing is compiled out.
 */
static inline void
mi_ensure_write_fence(struct mi_builder *b)
{
#if MI_BUILDER_RAW_MEM_FENCING
   if (b->no_read_write_fencing)
      return;

   if (!*__gen_get_write_fencing_status(b->user_data))
      return;

   mi_builder_emit(b, GENX(MI_MEM_FENCE), fence) {
      fence.FenceType = FENCE_TYPE_MI_WRITE;
   }

   *__gen_get_write_fencing_status(b->user_data) = false;
#endif
}

static inline struct mi_value
mi_imm(uint64_t imm)
{
   return (struct mi_value) {
      .type = MI_VALUE_TYPE_IMM,
      .imm = imm,
   };
}

static inline struct mi_value
mi_reg32(uint32_t reg)
{
   struct mi_value val = {
      .type = MI_VALUE_TYPE_REG32,
      .reg = reg,
   };
#if GFX_VERx10 >= 75
   assert(!_mi_value_is_allocated_gpr(val));
#endif
   return val;
}

static inline struct mi_value
mi_reg64(uint32_t reg)
{
   struct mi_value val = {
      .type = MI_VALUE_TYPE_REG64,
      .reg = reg,
   };
#if GFX_VERx10 >= 75
   assert(!_mi_value_is_allocated_gpr(val));
#endif
   return val;
}

/** Build a mi_value referring to the 32-bit memory location at addr */
static inline struct mi_value
mi_mem32(__gen_address_type addr)
{
   const struct mi_value mem_val = {
      .addr = addr,
      .type = MI_VALUE_TYPE_MEM32,
   };
   return mem_val;
}

/** Build a mi_value referring to the 64-bit memory location at addr */
static inline struct mi_value
mi_mem64(__gen_address_type addr)
{
   const struct mi_value mem_val = {
      .addr = addr,
      .type = MI_VALUE_TYPE_MEM64,
   };
   return mem_val;
}

/** Return one 32-bit half of a value
 *
 * top_32_bits selects the upper half, otherwise the lower half is returned.
 * 64-bit memory and register values become their 32-bit counterparts
 * (offset by 4 bytes for the upper half); immediates are shifted or masked.
 * Values that are already 32 bits wide only have a lower half.
 */
static inline struct mi_value
mi_value_half(struct mi_value value, bool top_32_bits)
{
   switch (value.type) {
   case MI_VALUE_TYPE_MEM32:
   case MI_VALUE_TYPE_REG32:
      /* Already 32 bits wide: there is no upper half to select */
      assert(!top_32_bits);
      return value;

   case MI_VALUE_TYPE_IMM:
      value.imm = top_32_bits ? (value.imm >> 32)
                              : (value.imm & 0xffffffffu);
      return value;

   case MI_VALUE_TYPE_MEM64:
      value.type = MI_VALUE_TYPE_MEM32;
      if (top_32_bits)
         value.addr = __gen_address_offset(value.addr, 4);
      return value;

   case MI_VALUE_TYPE_REG64:
      value.type = MI_VALUE_TYPE_REG32;
      if (top_32_bits)
         value.reg += 4;
      return value;
   }

   unreachable("Invalid mi_value type");
}

/** Emit the commands copying src into dst, without touching reference counts
 *
 * This is the workhorse behind mi_store().  Any pending MI_MATH is flushed
 * first, and a write fence is requested when the source is memory.  The
 * command emitted then depends on the (dst, src) type pair:
 *
 *  - 64-bit destinations are written with a single command when the source
 *    is an immediate (where possible), and otherwise split into two 32-bit
 *    copies.  A 32-bit source is zero-extended.
 *  - 32-bit destinations map to MI_STORE_DATA_IMM, MI_COPY_MEM_MEM,
 *    MI_STORE_REGISTER_MEM, MI_LOAD_REGISTER_IMM, MI_LOAD_REGISTER_MEM or
 *    MI_LOAD_REGISTER_REG.  Combinations a generation cannot express hit
 *    unreachable().
 *
 * Neither value may have its invert flag set.  Finally, a write to memory is
 * recorded with mi_builder_set_write() unless it was an immediate write with
 * write checks enabled.
 */
static inline void
_mi_copy_no_unref(struct mi_builder *b,
                  struct mi_value dst, struct mi_value src)
{
#if GFX_VERx10 >= 75
   /* TODO: We could handle src.invert by emitting a bit of math if we really
    * wanted to.
    */
   assert(!dst.invert && !src.invert);
#endif
   /* Queued ALU work may produce the value being copied: emit it first */
   mi_builder_flush_math(b);

   /* Reading memory: make sure earlier MI writes are visible */
   if (src.type == MI_VALUE_TYPE_MEM64 ||
       src.type == MI_VALUE_TYPE_MEM32)
      mi_ensure_write_fence(b);

   switch (dst.type) {
   case MI_VALUE_TYPE_IMM:
      unreachable("Cannot copy to an immediate");

   case MI_VALUE_TYPE_MEM64:
   case MI_VALUE_TYPE_REG64:
      switch (src.type) {
      case MI_VALUE_TYPE_IMM:
         if (dst.type == MI_VALUE_TYPE_REG64) {
            /* One MI_LOAD_REGISTER_IMM carrying two (register, value) pairs:
             * the low dword, then the high dword at register + 4.
             */
            uint32_t *dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                                              GENX(MI_LOAD_REGISTER_IMM_length) + 2);
            struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
            mi_builder_pack(b, GENX(MI_LOAD_REGISTER_IMM), dw, lri) {
               lri.DWordLength = GENX(MI_LOAD_REGISTER_IMM_length) + 2 -
                                 GENX(MI_LOAD_REGISTER_IMM_length_bias);
#if GFX_VER >= 11
               lri.AddCSMMIOStartOffset = reg.cs;
#endif
            }
            dw[1] = reg.num;
            dw[2] = src.imm;
            dw[3] = reg.num + 4;
            dw[4] = src.imm >> 32;
         } else {
#if GFX_VER >= 8
            /* One MI_STORE_DATA_IMM with an extra data dword (qword store) */
            assert(dst.type == MI_VALUE_TYPE_MEM64);
            uint32_t *dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                                              GENX(MI_STORE_DATA_IMM_length) + 1);
            mi_builder_pack(b, GENX(MI_STORE_DATA_IMM), dw, sdi) {
               sdi.DWordLength = GENX(MI_STORE_DATA_IMM_length) + 1 -
                                 GENX(MI_STORE_DATA_IMM_length_bias);
               sdi.StoreQword = true;
               sdi.Address = dst.addr;
#if GFX_VER >= 12
               sdi.ForceWriteCompletionCheck = b->write_check;
#endif
            }
            dw[3] = src.imm;
            dw[4] = src.imm >> 32;
#else
         /* No qword store here: write the two halves separately */
         _mi_copy_no_unref(b, mi_value_half(dst, false),
                              mi_value_half(src, false));
         _mi_copy_no_unref(b, mi_value_half(dst, true),
                              mi_value_half(src, true));
#endif
         }
         break;
      case MI_VALUE_TYPE_REG32:
      case MI_VALUE_TYPE_MEM32:
         /* 32-bit source into 64-bit destination: zero-extend */
         _mi_copy_no_unref(b, mi_value_half(dst, false),
                              mi_value_half(src, false));
         _mi_copy_no_unref(b, mi_value_half(dst, true),
                              mi_imm(0));
         break;
      case MI_VALUE_TYPE_REG64:
      case MI_VALUE_TYPE_MEM64:
         /* 64-bit to 64-bit: copy each half */
         _mi_copy_no_unref(b, mi_value_half(dst, false),
                              mi_value_half(src, false));
         _mi_copy_no_unref(b, mi_value_half(dst, true),
                              mi_value_half(src, true));
         break;
      default:
         unreachable("Invalid mi_value type");
      }
      break;

   case MI_VALUE_TYPE_MEM32:
      switch (src.type) {
      case MI_VALUE_TYPE_IMM:
         mi_builder_emit(b, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address = dst.addr;
#if GFX_VER >= 12
            sdi.ForceWriteCompletionCheck = b->write_check;
#endif
            sdi.ImmediateData = src.imm;
         }
         break;

      case MI_VALUE_TYPE_MEM32:
      case MI_VALUE_TYPE_MEM64:
#if GFX_VER >= 8
         mi_builder_emit(b, GENX(MI_COPY_MEM_MEM), cmm) {
            cmm.DestinationMemoryAddress = dst.addr;
            cmm.SourceMemoryAddress = src.addr;
         }
#elif GFX_VERx10 == 75
         {
            /* No MI_COPY_MEM_MEM: bounce through a temporary GPR */
            struct mi_value tmp = mi_new_gpr(b);
            _mi_copy_no_unref(b, tmp, src);
            _mi_copy_no_unref(b, dst, tmp);
            mi_value_unref(b, tmp);
         }
#else
         unreachable("Cannot do mem <-> mem copy on IVB and earlier");
#endif
         break;

      case MI_VALUE_TYPE_REG32:
      case MI_VALUE_TYPE_REG64:
         /* Only the dword at src.reg is stored */
         mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
            struct mi_reg_num reg = mi_adjust_reg_num(src.reg);
            srm.RegisterAddress = reg.num;
#if GFX_VER >= 11
            srm.AddCSMMIOStartOffset = reg.cs;
#endif
            srm.MemoryAddress = dst.addr;
         }
         break;

      default:
         unreachable("Invalid mi_value type");
      }
      break;

   case MI_VALUE_TYPE_REG32:
      switch (src.type) {
      case MI_VALUE_TYPE_IMM:
         mi_builder_emit(b, GENX(MI_LOAD_REGISTER_IMM), lri) {
            struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
            lri.RegisterOffset = reg.num;
#if GFX_VER >= 11
            lri.AddCSMMIOStartOffset = reg.cs;
#endif
            lri.DataDWord = src.imm;
         }
         break;

      case MI_VALUE_TYPE_MEM32:
      case MI_VALUE_TYPE_MEM64:
#if GFX_VER >= 7
         mi_builder_emit(b, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
            lrm.RegisterAddress = reg.num;
#if GFX_VER >= 11
            lrm.AddCSMMIOStartOffset = reg.cs;
#endif
            lrm.MemoryAddress = src.addr;
         }
#else
         unreachable("Cannot load do mem -> reg copy on SNB and earlier");
#endif
         break;

      case MI_VALUE_TYPE_REG32:
      case MI_VALUE_TYPE_REG64:
#if GFX_VERx10 >= 75
         /* Copying a register onto itself needs no command */
         if (src.reg != dst.reg) {
            mi_builder_emit(b, GENX(MI_LOAD_REGISTER_REG), lrr) {
               struct mi_reg_num reg = mi_adjust_reg_num(src.reg);
               lrr.SourceRegisterAddress = reg.num;
#if GFX_VER >= 11
               lrr.AddCSMMIOStartOffsetSource = reg.cs;
#endif
               reg = mi_adjust_reg_num(dst.reg);
               lrr.DestinationRegisterAddress = reg.num;
#if GFX_VER >= 11
               lrr.AddCSMMIOStartOffsetDestination = reg.cs;
#endif
            }
         }
#else
         unreachable("Cannot do reg <-> reg copy on IVB and earlier");
#endif
         break;

      default:
         unreachable("Invalid mi_value type");
      }
      break;

   default:
      unreachable("Invalid mi_value type");
   }


   if (dst.type == MI_VALUE_TYPE_MEM64 ||
       dst.type == MI_VALUE_TYPE_MEM32) {
      /* Immediate writes can already wait for writes, so no need to do
       * additional fencing later.
       */
      if (src.type != MI_VALUE_TYPE_IMM || !mi_builder_write_checked(b))
         mi_builder_set_write(b);
   }
}

#if GFX_VERx10 >= 75
/* Forward declaration: mi_store() needs it but the definition lives in the
 * MI_MATH section further down.
 */
static inline struct mi_value
mi_resolve_invert(struct mi_builder *b, struct mi_value src);
#endif

/** Store the value in src to the value represented by dst
 *
 * If the bit size of src and dst mismatch, this function does an unsigned
 * integer cast.  If src has more bits than dst, it takes the bottom bits.  If
 * src has fewer bits than dst, it fills the top bits with zeros.
 *
 * On Haswell+ a source carrying a pending inversion (see mi_inot()) is
 * resolved with MI_MATH before being copied.
 *
 * This function consumes one reference for each of src and dst.
 */
static inline void
mi_store(struct mi_builder *b, struct mi_value dst, struct mi_value src)
{
#if GFX_VERx10 >= 75
   src = mi_resolve_invert(b, src);
#endif
   _mi_copy_no_unref(b, dst, src);
   mi_value_unref(b, src);
   mi_value_unref(b, dst);
}

/** Fill size bytes of memory at dst with the 32-bit pattern value
 *
 * The fill is done one dword at a time with immediate stores, so size must
 * be a multiple of 4.  No MI_MATH may be pending when this is called.
 */
static inline void
mi_memset(struct mi_builder *b, __gen_address_type dst,
          uint32_t value, uint32_t size)
{
#if GFX_VERx10 >= 75
   assert(b->num_math_dwords == 0);
#endif

   /* This memset operates in units of dwords. */
   assert(size % 4 == 0);

   const struct mi_value fill = mi_imm(value);

   for (uint32_t offset = 0; offset < size; offset += 4) {
      const struct mi_value dword = mi_mem32(__gen_address_offset(dst, offset));
      mi_store(b, dword, fill);
   }
}

/** Copy size bytes from src to dst, one dword at a time
 *
 * size must be a multiple of 4 and no MI_MATH may be pending.  Previous MI
 * writes are fenced once up front and per-copy fencing is suppressed for the
 * duration of the copy.  Write checks are held off until the last dword and
 * the caller's write check setting is always restored on return, including
 * when size is 0.
 *
 * NOTE: On IVB, this function stomps GFX7_3DPRIM_BASE_VERTEX
 */
static inline void
mi_memcpy(struct mi_builder *b, __gen_address_type dst,
          __gen_address_type src, uint32_t size)
{
#if GFX_VERx10 >= 75
   assert(b->num_math_dwords == 0);
#endif

   /* Flush once only */
   mi_ensure_write_fence(b);
   b->no_read_write_fencing = true;

   /* Hold off write checks until the last write. */
   bool write_check = mi_builder_write_checked(b);
   mi_builder_set_write_check(b, false);

   /* This memcpy operates in units of dwords. */
   assert(size % 4 == 0);

   for (uint32_t i = 0; i < size; i += 4) {
      if (i == size - 4)
         mi_builder_set_write_check(b, write_check);

      struct mi_value dst_val = mi_mem32(__gen_address_offset(dst, i));
      struct mi_value src_val = mi_mem32(__gen_address_offset(src, i));
#if GFX_VERx10 >= 75
      mi_store(b, dst_val, src_val);
#else
      /* IVB does not have a general purpose register for command streamer
       * commands. Therefore, we use an alternate temporary register.
       */
      struct mi_value tmp_reg = mi_reg32(0x2440); /* GFX7_3DPRIM_BASE_VERTEX */
      mi_store(b, tmp_reg, src_val);
      mi_store(b, dst_val, tmp_reg);
#endif
   }

   /* The loop only restores the setting on its last iteration, which never
    * happens for an empty copy.  Restore it unconditionally so that a
    * zero-sized memcpy does not leave write checks disabled.
    */
   mi_builder_set_write_check(b, write_check);

   b->no_read_write_fencing = false;
}

/*
 * MI_MATH Section.  Only available on Haswell+
 */

#if GFX_VERx10 >= 75

/**
 * Perform a predicated store (assuming the condition is already loaded
 * in the MI_PREDICATE_RESULT register) of the value in src to the memory
 * location specified by dst.  Non-memory destinations are not supported.
 *
 * This function consumes one reference for each of src and dst.
 */
static inline void
mi_store_if(struct mi_builder *b, struct mi_value dst, struct mi_value src)
{
   assert(!dst.invert && !src.invert);

   mi_builder_flush_math(b);

   /* We can only predicate MI_STORE_REGISTER_MEM, so restrict the
    * destination to be memory, and resolve the source to a temporary
    * register if it isn't in one already.
    */
   assert(dst.type == MI_VALUE_TYPE_MEM64 ||
          dst.type == MI_VALUE_TYPE_MEM32);

   if (src.type != MI_VALUE_TYPE_REG32 &&
       src.type != MI_VALUE_TYPE_REG64) {
      struct mi_value tmp = mi_new_gpr(b);
      _mi_copy_no_unref(b, tmp, src);
      src = tmp;
   }

   if (dst.type == MI_VALUE_TYPE_MEM64) {
      mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
         struct mi_reg_num reg = mi_adjust_reg_num(src.reg);
         srm.RegisterAddress = reg.num;
#if GFX_VER >= 11
         srm.AddCSMMIOStartOffset = reg.cs;
#endif
         srm.MemoryAddress = dst.addr;
         srm.PredicateEnable = true;
      }
      mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
         struct mi_reg_num reg = mi_adjust_reg_num(src.reg + 4);
         srm.RegisterAddress = reg.num;
#if GFX_VER >= 11
         srm.AddCSMMIOStartOffset = reg.cs;
#endif
         srm.MemoryAddress = __gen_address_offset(dst.addr, 4);
         srm.PredicateEnable = true;
      }
   } else {
      mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
         struct mi_reg_num reg = mi_adjust_reg_num(src.reg);
         srm.RegisterAddress = reg.num;
#if GFX_VER >= 11
         srm.AddCSMMIOStartOffset = reg.cs;
#endif
         srm.MemoryAddress = dst.addr;
         srm.PredicateEnable = true;
      }
   }

   mi_builder_set_write(b);

   mi_value_unref(b, src);
   mi_value_unref(b, dst);
}

/** Queue ALU instruction dwords for the next MI_MATH command
 *
 * If the new dwords do not fit in the remaining space of the queue, the
 * queue is first flushed with mi_builder_flush_math().
 */
static inline void
_mi_builder_push_math(struct mi_builder *b,
                      const uint32_t *dwords,
                      unsigned num_dwords)
{
   assert(num_dwords < MI_BUILDER_MAX_MATH_DWORDS);

   const unsigned needed = b->num_math_dwords + num_dwords;
   if (needed > MI_BUILDER_MAX_MATH_DWORDS) {
      mi_builder_flush_math(b);
   }

   /* Computed after the potential flush, which resets num_math_dwords */
   uint32_t *tail = &b->math_dwords[b->num_math_dwords];
   memcpy(tail, dwords, num_dwords * sizeof(*dwords));
   b->num_math_dwords += num_dwords;
}

static inline uint32_t
_mi_pack_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
{
   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
      .Operand2 = operand2,
      .Operand1 = operand1,
      .ALUOpcode = opcode,
   };

   uint32_t dw;
   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);

   return dw;
}

static inline struct mi_value
mi_value_to_gpr(struct mi_builder *b, struct mi_value val)
{
   if (mi_value_is_gpr(val))
      return val;

   /* Save off the invert flag because it makes copy() grumpy */
   bool invert = val.invert;
   val.invert = false;

   struct mi_value tmp = mi_new_gpr(b);
   _mi_copy_no_unref(b, tmp, val);
   tmp.invert = invert;

   return tmp;
}

/** Return the 64-bit constant held by an immediate value, applying its
 * pending inversion if any.
 */
static inline uint64_t
mi_value_to_u64(struct mi_value val)
{
   assert(val.type == MI_VALUE_TYPE_IMM);

   if (val.invert)
      return ~val.imm;

   return val.imm;
}

/** Build the ALU instruction loading *val into ALU source src
 *
 * Immediates that are all zeros or all ones are loaded with the dedicated
 * LOAD0/LOAD1 opcodes.  Any other value is first moved to a GPR (updating
 * *val so the caller can release it) and loaded with LOAD, or LOADINV when
 * its invert flag is set.
 */
static inline uint32_t
_mi_math_load_src(struct mi_builder *b, unsigned src, struct mi_value *val)
{
   const bool trivial_imm = val->type == MI_VALUE_TYPE_IMM &&
                            (val->imm == 0 || val->imm == UINT64_MAX);

   if (trivial_imm) {
      const uint64_t resolved = val->invert ? ~val->imm : val->imm;
      return _mi_pack_alu(resolved ? MI_ALU_LOAD1 : MI_ALU_LOAD0, src, 0);
   }

   *val = mi_value_to_gpr(b, *val);
   return _mi_pack_alu(val->invert ? MI_ALU_LOADINV : MI_ALU_LOAD,
                       src, _mi_value_as_gpr(*val));
}

static inline struct mi_value
mi_math_binop(struct mi_builder *b, uint32_t opcode,
              struct mi_value src0, struct mi_value src1,
              uint32_t store_op, uint32_t store_src)
{
   struct mi_value dst = mi_new_gpr(b);

   uint32_t dw[4];
   dw[0] = _mi_math_load_src(b, MI_ALU_SRCA, &src0);
   dw[1] = _mi_math_load_src(b, MI_ALU_SRCB, &src1);
   dw[2] = _mi_pack_alu(opcode, 0, 0);
   dw[3] = _mi_pack_alu(store_op, _mi_value_as_gpr(dst), store_src);
   _mi_builder_push_math(b, dw, 4);

   mi_value_unref(b, src0);
   mi_value_unref(b, src1);

   return dst;
}

/** Bitwise NOT
 *
 * Immediates are folded.  For anything else no command is emitted: the
 * invert flag of the value is simply toggled.
 */
static inline struct mi_value
mi_inot(struct mi_builder *b, struct mi_value val)
{
   if (val.type != MI_VALUE_TYPE_IMM) {
      val.invert = !val.invert;
      return val;
   }

   return mi_imm(~mi_value_to_u64(val));
}

/** Materialize a pending inversion
 *
 * Values without the invert flag are returned untouched.  Otherwise the
 * inverted value is computed with the ALU (by adding zero to it) and the
 * result is returned in a new GPR.  Inverted immediates are not supported.
 */
static inline struct mi_value
mi_resolve_invert(struct mi_builder *b, struct mi_value src)
{
   if (src.invert) {
      assert(src.type != MI_VALUE_TYPE_IMM);
      return mi_math_binop(b, MI_ALU_ADD, src, mi_imm(0),
                           MI_ALU_STORE, MI_ALU_ACCU);
   }

   return src;
}

/** 64-bit integer addition: src0 + src1 (folded when both are immediates) */
static inline struct mi_value
mi_iadd(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   const bool both_imm = src0.type == MI_VALUE_TYPE_IMM &&
                         src1.type == MI_VALUE_TYPE_IMM;
   if (both_imm) {
      const uint64_t lhs = mi_value_to_u64(src0);
      const uint64_t rhs = mi_value_to_u64(src1);
      return mi_imm(lhs + rhs);
   }

   return mi_math_binop(b, MI_ALU_ADD, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}

/** Add the constant N to src; adding zero returns src unchanged */
static inline struct mi_value
mi_iadd_imm(struct mi_builder *b,
            struct mi_value src, uint64_t N)
{
   if (N != 0)
      return mi_iadd(b, src, mi_imm(N));

   return src;
}

/** 64-bit integer subtraction: src0 - src1 (folded when both are
 * immediates)
 */
static inline struct mi_value
mi_isub(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   const bool both_imm = src0.type == MI_VALUE_TYPE_IMM &&
                         src1.type == MI_VALUE_TYPE_IMM;
   if (both_imm) {
      const uint64_t lhs = mi_value_to_u64(src0);
      const uint64_t rhs = mi_value_to_u64(src1);
      return mi_imm(lhs - rhs);
   }

   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}

/** Equality comparison: src0 == src1
 *
 * When both sources are immediates the result is folded to all ones (true)
 * or zero (false).
 */
static inline struct mi_value
mi_ieq(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   const bool both_imm = src0.type == MI_VALUE_TYPE_IMM &&
                         src1.type == MI_VALUE_TYPE_IMM;
   if (both_imm) {
      const uint64_t lhs = mi_value_to_u64(src0);
      const uint64_t rhs = mi_value_to_u64(src1);
      return mi_imm(lhs == rhs ? ~0ull : 0);
   }

   /* Compute "equal" by subtracting and storing the zero bit */
   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                        MI_ALU_STORE, MI_ALU_ZF);
}

/** Inequality comparison: src0 != src1
 *
 * When both sources are immediates the result is folded to all ones (true)
 * or zero (false).
 */
static inline struct mi_value
mi_ine(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   const bool both_imm = src0.type == MI_VALUE_TYPE_IMM &&
                         src1.type == MI_VALUE_TYPE_IMM;
   if (both_imm) {
      const uint64_t lhs = mi_value_to_u64(src0);
      const uint64_t rhs = mi_value_to_u64(src1);
      return mi_imm(lhs != rhs ? ~0ull : 0);
   }

   /* Compute "not equal" by subtracting and storing the inverse zero bit */
   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                        MI_ALU_STOREINV, MI_ALU_ZF);
}

/** Unsigned less-than comparison: src0 < src1
 *
 * When both sources are immediates the result is folded to all ones (true)
 * or zero (false).
 */
static inline struct mi_value
mi_ult(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   const bool both_imm = src0.type == MI_VALUE_TYPE_IMM &&
                         src1.type == MI_VALUE_TYPE_IMM;
   if (both_imm) {
      const uint64_t lhs = mi_value_to_u64(src0);
      const uint64_t rhs = mi_value_to_u64(src1);
      return mi_imm(lhs < rhs ? ~0ull : 0);
   }

   /* Compute "less than" by subtracting and storing the carry bit */
   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                        MI_ALU_STORE, MI_ALU_CF);
}

/** Unsigned greater-or-equal comparison: src0 >= src1
 *
 * When both sources are immediates the result is folded to all ones (true)
 * or zero (false).
 */
static inline struct mi_value
mi_uge(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   const bool both_imm = src0.type == MI_VALUE_TYPE_IMM &&
                         src1.type == MI_VALUE_TYPE_IMM;
   if (both_imm) {
      const uint64_t lhs = mi_value_to_u64(src0);
      const uint64_t rhs = mi_value_to_u64(src1);
      return mi_imm(lhs >= rhs ? ~0ull : 0);
   }

   /* Compute "greater than or equal" by subtracting and storing the inverse
    * of the carry bit
    */
   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                        MI_ALU_STOREINV, MI_ALU_CF);
}

/** Bitwise AND: src0 & src1 (folded when both are immediates) */
static inline struct mi_value
mi_iand(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   const bool both_imm = src0.type == MI_VALUE_TYPE_IMM &&
                         src1.type == MI_VALUE_TYPE_IMM;
   if (both_imm) {
      const uint64_t lhs = mi_value_to_u64(src0);
      const uint64_t rhs = mi_value_to_u64(src1);
      return mi_imm(lhs & rhs);
   }

   return mi_math_binop(b, MI_ALU_AND, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}

/** Test for non-zero: all ones if src != 0, zero otherwise
 *
 * Immediates are folded.  Otherwise zero is added to src and the inverse of
 * the resulting zero flag is stored.
 */
static inline struct mi_value
mi_nz(struct mi_builder *b, struct mi_value src)
{
   if (src.type == MI_VALUE_TYPE_IMM) {
      const uint64_t value = mi_value_to_u64(src);
      return mi_imm(value != 0 ? ~0ull : 0);
   }

   return mi_math_binop(b, MI_ALU_ADD, src, mi_imm(0),
                        MI_ALU_STOREINV, MI_ALU_ZF);
}

/** Test for zero: all ones if src == 0, zero otherwise
 *
 * Immediates are folded.  Otherwise zero is added to src and the resulting
 * zero flag is stored.
 */
static inline struct mi_value
mi_z(struct mi_builder *b, struct mi_value src)
{
   if (src.type == MI_VALUE_TYPE_IMM) {
      const uint64_t value = mi_value_to_u64(src);
      return mi_imm(value == 0 ? ~0ull : 0);
   }

   return mi_math_binop(b, MI_ALU_ADD, src, mi_imm(0),
                        MI_ALU_STORE, MI_ALU_ZF);
}

/** Bitwise OR: src0 | src1 (folded when both are immediates) */
static inline struct mi_value
mi_ior(struct mi_builder *b,
       struct mi_value src0, struct mi_value src1)
{
   const bool both_imm = src0.type == MI_VALUE_TYPE_IMM &&
                         src1.type == MI_VALUE_TYPE_IMM;
   if (both_imm) {
      const uint64_t lhs = mi_value_to_u64(src0);
      const uint64_t rhs = mi_value_to_u64(src1);
      return mi_imm(lhs | rhs);
   }

   return mi_math_binop(b, MI_ALU_OR, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}

#if GFX_VERx10 >= 125
static inline struct mi_value
mi_ishl(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src1.type == MI_VALUE_TYPE_IMM) {
      assert(util_is_power_of_two_or_zero(mi_value_to_u64(src1)));
      assert(mi_value_to_u64(src1) <= 32);
   }

   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) << mi_value_to_u64(src1));

   return mi_math_binop(b, MI_ALU_SHL, src0, src1,
                           MI_ALU_STORE, MI_ALU_ACCU);
}

/** Logical (unsigned) right shift: src0 >> src1
 *
 * An immediate shift amount must be zero or a power of two no larger
 * than 32.  The result is folded when both sources are immediates.
 */
static inline struct mi_value
mi_ushr(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   const bool imm_shift = src1.type == MI_VALUE_TYPE_IMM;

   if (imm_shift) {
      assert(util_is_power_of_two_or_zero(mi_value_to_u64(src1)));
      assert(mi_value_to_u64(src1) <= 32);
   }

   if (imm_shift && src0.type == MI_VALUE_TYPE_IMM) {
      const uint64_t value = mi_value_to_u64(src0);
      const uint64_t shift = mi_value_to_u64(src1);
      return mi_imm(value >> shift);
   }

   return mi_math_binop(b, MI_ALU_SHR, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}

/** Logical (unsigned) right shift of src by a constant number of bits
 *
 * Arbitrary shift amounts are supported: the shift is decomposed into the
 * power-of-two shifts provided by mi_ushr().  Shifting by 64 or more yields
 * zero.
 *
 * Like the other mi_* math functions, this consumes one reference to src.
 */
static inline struct mi_value
mi_ushr_imm(struct mi_builder *b, struct mi_value src, uint32_t shift)
{
   if (shift == 0)
      return src;

   if (shift >= 64) {
      /* The result does not depend on src but its reference still has to be
       * consumed, otherwise a builder-allocated GPR is never freed.
       */
      mi_value_unref(b, src);
      return mi_imm(0);
   }

   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) >> shift);

   struct mi_value res = mi_value_to_gpr(b, src);

   /* Annoyingly, we only have power-of-two shifts */
   while (shift) {
      int bit = u_bit_scan(&shift);
      assert(bit <= 5);
      res = mi_ushr(b, res, mi_imm(1ULL << bit));
   }

   return res;
}

/* Arithmetic (sign-propagating) right shift of src0 by src1 using the
 * MI_ALU_SAR opcode.
 *
 * When the shift amount is an immediate it is asserted to be zero or a power
 * of two, and no larger than 32.  If both operands are immediates the shift
 * is folded on the CPU on the value reinterpreted as int64_t.
 */
static inline struct mi_value
mi_ishr(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src1.type == MI_VALUE_TYPE_IMM) {
      assert(util_is_power_of_two_or_zero(mi_value_to_u64(src1)));
      assert(mi_value_to_u64(src1) <= 32);

      /* Both operands known at build time: no command needed */
      if (src0.type == MI_VALUE_TYPE_IMM) {
         const int64_t value = (int64_t)mi_value_to_u64(src0);
         const uint64_t amount = mi_value_to_u64(src1);
         return mi_imm(value >> amount);
      }
   }

   return mi_math_binop(b, MI_ALU_SAR, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}

/* Arithmetic (sign-propagating) right shift of src by a shift amount known
 * on the CPU.
 *
 * \param b      builder the commands are emitted into
 * \param src    value to shift; ownership is taken by this function
 * \param shift  number of bits to shift by
 *
 * Returns src unchanged for a zero shift, a folded immediate when src is an
 * immediate, and otherwise the result of a chain of mi_ishr() operations.
 */
static inline struct mi_value
mi_ishr_imm(struct mi_builder *b, struct mi_value src, uint32_t shift)
{
   if (shift == 0)
      return src;

   /* An arithmetic shift by 64 or more leaves nothing but copies of the sign
    * bit, which is exactly what a shift by 63 produces.  Returning a plain 0
    * here would be wrong for negative inputs and would also drop src without
    * releasing its reference, so clamp and take the regular path instead.
    */
   if (shift >= 64)
      shift = 63;

   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm((int64_t)mi_value_to_u64(src) >> shift);

   struct mi_value res = mi_value_to_gpr(b, src);

   /* Annoyingly, we only have power-of-two shifts, so decompose the shift
    * amount into its set bits and apply one shift per bit.  shift < 64 here,
    * hence the highest possible bit index is 5.
    */
   while (shift) {
      int bit = u_bit_scan(&shift);
      assert(bit <= 5);
      res = mi_ishr(b, res, mi_imm(1 << bit));
   }

   return res;
}
#endif /* if GFX_VERx10 >= 125 */

static inline struct mi_value
mi_imul_imm(struct mi_builder *b, struct mi_value src, uint32_t N)
{
   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) * N);

   if (N == 0) {
      mi_value_unref(b, src);
      return mi_imm(0);
   }

   if (N == 1)
      return src;

   src = mi_value_to_gpr(b, src);

   struct mi_value res = mi_value_ref(b, src);

   unsigned top_bit = 31 - __builtin_clz(N);
   for (int i = top_bit - 1; i >= 0; i--) {
      res = mi_iadd(b, res, mi_value_ref(b, res));
      if (N & (1 << i))
         res = mi_iadd(b, res, mi_value_ref(b, src));
   }

   mi_value_unref(b, src);

   return res;
}

/* Left shift of src by a shift amount known on the CPU.
 *
 * \param b      builder the commands are emitted into
 * \param src    value to shift; ownership is taken by this function
 * \param shift  number of bits to shift by
 *
 * Returns src unchanged for a zero shift, an immediate 0 when every bit is
 * shifted out (shift >= 64) and a folded immediate when src is an immediate.
 * Otherwise the shift is emitted as a chain of mi_ishl() operations when
 * GFX_VERx10 >= 125, or as repeated self-additions on older hardware.
 */
static inline struct mi_value
mi_ishl_imm(struct mi_builder *b, struct mi_value src, uint32_t shift)
{
   if (shift == 0)
      return src;

   if (shift >= 64) {
      /* The result does not depend on src any more, but we still own its
       * reference.  Release it like mi_imul_imm() does for N == 0 so that a
       * GPR-backed source is not leaked.
       */
      mi_value_unref(b, src);
      return mi_imm(0);
   }

   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) << shift);

   struct mi_value res = mi_value_to_gpr(b, src);

#if GFX_VERx10 >= 125
   /* Annoyingly, we only have power-of-two shifts, so decompose the shift
    * amount into its set bits and apply one shift per bit.  shift < 64 here,
    * hence the highest possible bit index is 5.
    */
   while (shift) {
      int bit = u_bit_scan(&shift);
      assert(bit <= 5);
      res = mi_ishl(b, res, mi_imm(1 << bit));
   }
#else
   /* No shift opcode is used here: double the value once per shifted bit */
   for (unsigned i = 0; i < shift; i++)
      res = mi_iadd(b, res, mi_value_ref(b, res));
#endif

   return res;
}

/* Logical right shift by a CPU-known amount, keeping only the low 32 bits of
 * the shifted value.
 *
 * \param b      builder the commands are emitted into
 * \param src    value to shift; ownership is taken by this function
 * \param shift  number of bits to shift by
 *
 * Returns src unchanged for a zero shift, an immediate 0 for shift >= 64 and
 * a folded immediate when src is an immediate.  Otherwise the result is a
 * new GPR whose upper half is zero.
 */
static inline struct mi_value
mi_ushr32_imm(struct mi_builder *b, struct mi_value src, uint32_t shift)
{
   if (shift == 0)
      return src;

   if (shift >= 64) {
      /* The result does not depend on src any more, but we still own its
       * reference.  Release it like mi_imul_imm() does for N == 0 so that a
       * GPR-backed source is not leaked.
       */
      mi_value_unref(b, src);
      return mi_imm(0);
   }

   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm((mi_value_to_u64(src) >> shift) & UINT32_MAX);

   if (shift > 32) {
      /* Shifting by more than 32 discards the whole low half.  Move the high
       * half down into a fresh GPR first so that only a shift of at most 32
       * is left to do.
       */
      struct mi_value tmp = mi_new_gpr(b);
      _mi_copy_no_unref(b, mi_value_half(tmp, false),
                               mi_value_half(src, true));
      _mi_copy_no_unref(b, mi_value_half(tmp, true), mi_imm(0));
      mi_value_unref(b, src);
      src = tmp;
      shift -= 32;
   }
   assert(shift <= 32);

   /* We right-shift by left-shifting by 32 - shift and taking the top 32 bits
    * of the result.
    */
   struct mi_value tmp = mi_ishl_imm(b, src, 32 - shift);
   struct mi_value dst = mi_new_gpr(b);
   _mi_copy_no_unref(b, mi_value_half(dst, false),
                            mi_value_half(tmp, true));
   _mi_copy_no_unref(b, mi_value_half(dst, true), mi_imm(0));
   mi_value_unref(b, tmp);
   return dst;
}

/* Unsigned division of a 32-bit value N by the CPU-known constant D.
 *
 * \param b  builder the commands are emitted into
 * \param N  dividend, assumed to hold a 32-bit value; ownership is taken
 * \param D  divisor
 *
 * Division by zero is invalid; it yields an immediate 0 rather than
 * crashing.  Immediates are folded on the CPU, powers of two become a right
 * shift, and every other divisor is turned into a multiply-and-shift
 * sequence using the parameters from util_compute_fast_udiv_info().
 */
static inline struct mi_value
mi_udiv32_imm(struct mi_builder *b, struct mi_value N, uint32_t D)
{
   /* This check has to come first: folding an immediate N below would
    * otherwise divide by zero on the CPU, which is undefined behaviour.
    */
   if (D == 0) {
      /* This is invalid but we should do something.  N is no longer needed,
       * so release the reference we were handed.
       */
      mi_value_unref(b, N);
      return mi_imm(0);
   }

   if (N.type == MI_VALUE_TYPE_IMM) {
      assert(mi_value_to_u64(N) <= UINT32_MAX);
      return mi_imm(mi_value_to_u64(N) / D);
   }

   /* We implicitly assume that N is only a 32-bit value */
   if (util_is_power_of_two_or_zero(D))
      return mi_ushr32_imm(b, N, util_logbase2(D));

   struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
   assert(m.multiplier <= UINT32_MAX);

   if (m.pre_shift)
      N = mi_ushr32_imm(b, N, m.pre_shift);

   /* Do the 32x32 multiply */
   N = mi_imul_imm(b, N, m.multiplier);

   /* (N + 1) * multiplier == N * multiplier + multiplier */
   if (m.increment)
      N = mi_iadd(b, N, mi_imm(m.multiplier));

   /* Keep the high 32 bits of the product */
   N = mi_ushr32_imm(b, N, 32);

   if (m.post_shift)
      N = mi_ushr32_imm(b, N, m.post_shift);

   return N;
}

#endif /* MI_MATH section */

/* This assumes addresses of strictly more than 32bits (aka. Gfx8+). */
#if MI_BUILDER_CAN_WRITE_BATCH

/* Handle returned by mi_store_relocated_imm() and consumed by
 * mi_relocate_store_imm() to fill in the immediate value afterwards.
 */
struct mi_reloc_imm_token {
   /* Type of the destination the store was emitted for; decides whether one
    * or both pointers below are used.
    */
   enum mi_value_type dst_type;
   /* Locations of the immediate dwords in the batch: ptr[0] receives the low
    * 32 bits, ptr[1] the high 32 bits (64-bit destinations only).
    */
   uint32_t *ptr[2];
};

/* Emits an immediate write to an address/register where the immediate value
 * can be updated later.
 *
 * dst must be a 32- or 64-bit memory or register value; any other type hits
 * unreachable().  The returned token records where the immediate dword(s)
 * live in the batch so that mi_relocate_store_imm() can write the final value
 * into them.  The reference on dst is released before returning.
 */
static inline struct mi_reloc_imm_token
mi_store_relocated_imm(struct mi_builder *b, struct mi_value dst)
{
   mi_builder_flush_math(b);

   struct mi_reloc_imm_token token = {
      .dst_type = dst.type,
   };

   uint32_t *dw;
   switch (dst.type) {
   case MI_VALUE_TYPE_MEM32:
      dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                              GENX(MI_STORE_DATA_IMM_length));
      mi_builder_pack(b, GENX(MI_STORE_DATA_IMM), dw, sdm) {
         sdm.DWordLength = GENX(MI_STORE_DATA_IMM_length) -
                           GENX(MI_STORE_DATA_IMM_length_bias);
         sdm.Address = dst.addr;
      }
      /* ImmediateData_start is divided by 32 to get a dword index */
      token.ptr[0] = dw + GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 32;
      mi_builder_set_write(b);
      break;

   case MI_VALUE_TYPE_MEM64:
      /* Same command as MEM32 with one extra dword for the high half */
      dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                              GENX(MI_STORE_DATA_IMM_length) + 1);
      mi_builder_pack(b, GENX(MI_STORE_DATA_IMM), dw, sdm) {
         sdm.DWordLength = GENX(MI_STORE_DATA_IMM_length) + 1 -
                           GENX(MI_STORE_DATA_IMM_length_bias);
         sdm.Address = dst.addr;
      }
      token.ptr[0] = &dw[GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 32];
      token.ptr[1] = &dw[GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 32 + 1];
      mi_builder_set_write(b);
      break;

   case MI_VALUE_TYPE_REG32:
      dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                              GENX(MI_LOAD_REGISTER_IMM_length));
      mi_builder_pack(b, GENX(MI_LOAD_REGISTER_IMM), dw, lri) {
         lri.DWordLength = GENX(MI_LOAD_REGISTER_IMM_length) -
                           GENX(MI_LOAD_REGISTER_IMM_length_bias);
         struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
#if GFX_VER >= 11
         lri.AddCSMMIOStartOffset = reg.cs;
#endif
         lri.RegisterOffset = reg.num;
      }
      /* The immediate is the third dword of the command */
      token.ptr[0] = &dw[2];
      break;

   case MI_VALUE_TYPE_REG64: {
      /* A single MI_LOAD_REGISTER_IMM extended by two dwords so that it
       * carries two (register offset, value) pairs.
       */
      dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                              GENX(MI_LOAD_REGISTER_IMM_length) + 2);
      struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
      mi_builder_pack(b, GENX(MI_LOAD_REGISTER_IMM), dw, lri) {
         lri.DWordLength = GENX(MI_LOAD_REGISTER_IMM_length) + 2 -
                           GENX(MI_LOAD_REGISTER_IMM_length_bias);
#if GFX_VER >= 11
         lri.AddCSMMIOStartOffset = reg.cs;
#endif
      }
      /* Register offsets are written by hand: dw[1]/dw[2] address and value
       * for the low half, dw[3]/dw[4] for the high half (offset + 4).
       */
      dw[1] = reg.num;
      dw[3] = reg.num + 4;
      token.ptr[0] = &dw[2];
      token.ptr[1] = &dw[4];
      break;
   }

   default:
      unreachable("Invalid value type");
   }

   mi_value_unref(b, dst);
   return token;
}

/* Writes the final immediate value into the batch locations recorded by
 * mi_store_relocated_imm().
 *
 * 32-bit destinations only receive the low 32 bits of value.  64-bit
 * destinations receive the high 32 bits through ptr[1] and then the low 32
 * bits through ptr[0].
 */
static inline void
mi_relocate_store_imm(struct mi_reloc_imm_token token, uint64_t value)
{
   switch (token.dst_type) {
   case MI_VALUE_TYPE_MEM32:
   case MI_VALUE_TYPE_REG32:
      *token.ptr[0] = value & 0xffffffff;
      return;

   case MI_VALUE_TYPE_MEM64:
   case MI_VALUE_TYPE_REG64:
      *token.ptr[1] = value >> 32;
      *token.ptr[0] = value & 0xffffffff;
      return;

   default:
      unreachable("Invalid value type");
   }
}

/* Handle returned by mi_store_relocated_address_reg64() and consumed by
 * mi_resolve_relocated_address_token() once the destination is known.
 */
struct mi_address_token {
   /* Pointers to address memory fields in the batch, one per emitted
    * MI_STORE_REGISTER_MEM (low 32-bit register first, then the high one).
    */
   uint64_t *ptrs[2];
};

/* Emits a 64bit memory write to a yet unknown address using a value from a
 * register
 */
static inline struct mi_address_token
mi_store_relocated_address_reg64(struct mi_builder *b, struct mi_value addr_reg)
{
   mi_builder_flush_math(b);

   assert(addr_reg.type == MI_VALUE_TYPE_REG64);

   struct mi_address_token token = {};

   for (unsigned i = 0; i < 2; i++) {
      mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = addr_reg.reg + (i * 4);

         const unsigned addr_dw =
            GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8;
         token.ptrs[i] = (uint64_t *)(mi_builder_get_inst_ptr(_dst) + addr_dw);
      }
   }

   mi_builder_set_write(b);
   mi_value_unref(b, addr_reg);
   return token;
}

static inline void
mi_self_mod_barrier(struct mi_builder *b, unsigned cs_prefetch_size)
{
   /* First make sure all the memory writes from previous modifying commands
    * have landed. We want to do this before going through the CS cache,
    * otherwise we could be fetching memory that hasn't been written to yet.
    */
   mi_builder_emit(b, GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
   }
   /* Documentation says Gfx11+ should be able to invalidate the command cache
    * but experiment show it doesn't work properly, so for now just get over
    * the CS prefetch.
    */
   for (uint32_t i = 0; i < (cs_prefetch_size / 4); i++)
      mi_builder_emit(b, GENX(MI_NOOP), noop);
}

/* Patches the two stores recorded in token so that they write to
 * batch_location.
 *
 * The batch address of batch_location is resolved once; the first recorded
 * address field receives it as is and the second one receives it plus 4, so
 * that the two 32-bit halves land next to each other.
 */
static inline void
mi_resolve_relocated_address_token(struct mi_builder *b,
                                   struct mi_address_token token,
                                   void *batch_location)
{
   __gen_address_type location_addr =
      __gen_get_batch_address(b->user_data, batch_location);
   const uint64_t target =
      __gen_combine_address(b->user_data, batch_location, location_addr, 0);

   for (unsigned half = 0; half < 2; half++)
      *(token.ptrs[half]) = target + (uint64_t)half * 4;
}

#endif /* MI_BUILDER_CAN_WRITE_BATCH */

#if GFX_VERx10 >= 125

/*
 * Indirect load/store.  Only available on XE_HP+
 */

/* Loads 64 bits from (addr + offset), where offset is an mi_value, into a
 * freshly allocated GPR which is returned to the caller.
 *
 * The address is computed in the ALU (SRCA = addr, SRCB = offset, ADD) and
 * the accumulator is then used as the address of a LOADIND, followed by a
 * read fence.  The reference on offset is released.
 */
MUST_CHECK static inline struct mi_value
mi_load_mem64_offset(struct mi_builder *b,
                     __gen_address_type addr, struct mi_value offset)
{
   mi_ensure_write_fence(b);

   struct mi_value base =
      mi_imm(__gen_combine_address(b->user_data, NULL, addr, 0));

   struct mi_value result = mi_new_gpr(b);

   uint32_t alu[5];
   unsigned count = 0;
   alu[count++] = _mi_math_load_src(b, MI_ALU_SRCA, &base);
   alu[count++] = _mi_math_load_src(b, MI_ALU_SRCB, &offset);
   alu[count++] = _mi_pack_alu(MI_ALU_ADD, 0, 0);
   alu[count++] = _mi_pack_alu(MI_ALU_LOADIND, _mi_value_as_gpr(result),
                               MI_ALU_ACCU);
   alu[count++] = _mi_pack_alu(MI_ALU_FENCE_RD, 0, 0);
   _mi_builder_push_math(b, alu, count);

   mi_value_unref(b, base);
   mi_value_unref(b, offset);

   return result;
}

/* Stores the 64-bit value data to (addr + offset), where offset is an
 * mi_value.
 *
 * data is resolved and moved into a GPR first.  The address is computed in
 * the ALU (SRCA = addr, SRCB = offset, ADD) and the accumulator is then used
 * as the address of a STOREIND, followed by a write fence.  The references on
 * offset and data are released.
 */
static inline void
mi_store_mem64_offset(struct mi_builder *b,
                          __gen_address_type addr, struct mi_value offset,
                          struct mi_value data)
{
   struct mi_value base =
      mi_imm(__gen_combine_address(b->user_data, NULL, addr, 0));

   data = mi_resolve_invert(b, data);
   data = mi_value_to_gpr(b, data);

   uint32_t alu[5];
   unsigned count = 0;
   alu[count++] = _mi_math_load_src(b, MI_ALU_SRCA, &base);
   alu[count++] = _mi_math_load_src(b, MI_ALU_SRCB, &offset);
   alu[count++] = _mi_pack_alu(MI_ALU_ADD, 0, 0);
   alu[count++] = _mi_pack_alu(MI_ALU_STOREIND, MI_ALU_ACCU,
                               _mi_value_as_gpr(data));
   alu[count++] = _mi_pack_alu(MI_ALU_FENCE_WR, 0, 0);
   _mi_builder_push_math(b, alu, count);

   mi_value_unref(b, base);
   mi_value_unref(b, offset);
   mi_value_unref(b, data);

   /* This is the only math case which has side-effects outside of regular
    * registers, so flush math right away to avoid confusing anyone.
    */
   mi_builder_flush_math(b);

   /* No mi_builder_set_write() here: the FENCE_WR in the ALU program above
    * already covers the write.
    */
}

#endif /* GFX_VERx10 >= 125 */

#if GFX_VER >= 9

/*
 * Control-flow Section.  Only available on Gfx9+
 */

/* A goto emitted before its target was placed.  It is patched by
 * mi_goto_target() once the target address is known.
 */
struct _mi_goto {
   /* Value to use for PredicationEnable when the jump is finally packed */
   bool predicated;
   /* Batch space reserved for the MI_BATCH_BUFFER_START command */
   void *mi_bbs;
};

/* A jump destination in the batch.  Gotos may be emitted before or after the
 * target is placed with mi_goto_target().
 */
struct mi_goto_target {
   /* Set once mi_goto_target() has recorded addr */
   bool placed;
   /* Number of valid entries in gotos[] */
   unsigned num_gotos;
   /* Gotos emitted before the target was placed (at most 8) */
   struct _mi_goto gotos[8];
   /* Batch address to jump to; only meaningful once placed is set */
   __gen_address_type addr;
};

/* Initializer for a target that is not placed and has no pending gotos */
#define MI_GOTO_TARGET_INIT ((struct mi_goto_target) {})

/* On >= Gfx12.5, the predication of MI_BATCH_BUFFER_START is driven by the
 * bit0 of the MI_SET_PREDICATE_RESULT register.
 *
 * ACM PRMs, Vol 2a: Command Reference: Instructions, MI_BATCH_BUFFER_START,
 * Predication Enable:
 *
 *   "This bit is used to enable predication of this command. If this bit is
 *    set and Bit 0 of the MI_SET_PREDICATE_RESULT register is set, this
 *    command is ignored. Otherwise the command is performed normally."
 *
 * The register offset is not listed in the PRMs, but BSpec places it at
 * 0x2418.
 *
 * On < Gfx12.5, the predication of MI_BATCH_BUFFER_START is driven by the
 * bit0 of MI_PREDICATE_RESULT_1.
 *
 * SKL PRMs, Vol 2a: Command Reference: Instructions, MI_BATCH_BUFFER_START,
 * Predication Enable:
 *
 *    "This bit is used to enable predication of this command. If this bit is
 *     set and Bit 0 of the MI_PREDICATE_RESULT_1 register is clear, this
 *     command is ignored. Otherwise the command is performed normally.
 *     Specific to the Render command stream only."
 *
 * The register offset is listed in the SKL PRMs, Vol 2c: Command Reference:
 * Registers, MI_PREDICATE_RESULT_1, at 0x241C.
 */
#if GFX_VERx10 >= 125
/* MI_SET_PREDICATE_RESULT */
#define MI_BUILDER_MI_PREDICATE_RESULT_num  0x2418
#else
/* MI_PREDICATE_RESULT_1 */
#define MI_BUILDER_MI_PREDICATE_RESULT_num  0x241C
#endif

/* Emits a jump to target t that is taken when cond is true.
 *
 * cond may be:
 *  - an immediate: 0 emits nothing, ~0 emits an unconditional jump (any
 *    other immediate trips an assert);
 *  - the 32-bit predicate result register itself, in which case whatever the
 *    caller has put there is used;
 *  - any other value, which is first stored to the predicate result register.
 *
 * If t has already been placed the jump is emitted directly.  Otherwise batch
 * space for it is reserved, zero-filled and recorded in t so that
 * mi_goto_target() can pack the command later.
 */
static inline void
mi_goto_if(struct mi_builder *b, struct mi_value cond,
           struct mi_goto_target *t)
{
   /* First, set up the predicate, if any */
   bool predicated;
   if (cond.type == MI_VALUE_TYPE_IMM) {
      /* If it's an immediate, the goto either doesn't happen or happens
       * unconditionally.
       */
      if (mi_value_to_u64(cond) == 0)
         return;

      assert(mi_value_to_u64(cond) == ~0ull);
      predicated = false;
   } else if (mi_value_is_reg(cond) &&
              cond.reg == MI_BUILDER_MI_PREDICATE_RESULT_num) {
      /* If it's MI_PREDICATE_RESULT, we use whatever predicate the client
       * provided us with
       */
      assert(cond.type == MI_VALUE_TYPE_REG32);
      predicated = true;
   } else {
      mi_store(b, mi_reg32(MI_BUILDER_MI_PREDICATE_RESULT_num), cond);
      predicated = true;
   }

#if GFX_VERx10 >= 125
   /* Turn predication on for the jump emitted (or reserved) below */
   if (predicated) {
      mi_builder_emit(b, GENX(MI_SET_PREDICATE), sp) {
         sp.PredicateEnable = NOOPOnResultClear;
      }
   }
#endif
   if (t->placed) {
      mi_builder_emit(b, GENX(MI_BATCH_BUFFER_START), bbs) {
         bbs.PredicationEnable         = predicated;
         bbs.AddressSpaceIndicator     = ASI_PPGTT;
         bbs.BatchBufferStartAddress   = t->addr;
      }
   } else {
      /* Target address not known yet: reserve the space and remember it */
      assert(t->num_gotos < ARRAY_SIZE(t->gotos));
      struct _mi_goto g = {
         .predicated = predicated,
         .mi_bbs = __gen_get_batch_dwords(b->user_data,
                                          GENX(MI_BATCH_BUFFER_START_length)),
      };
      memset(g.mi_bbs, 0, 4 * GENX(MI_BATCH_BUFFER_START_length));
      t->gotos[t->num_gotos++] = g;
   }
   /* Reached only when the jump was not taken: undo the predicate setup */
   if (predicated) {
#if GFX_VERx10 >= 125
      mi_builder_emit(b, GENX(MI_SET_PREDICATE), sp) {
         sp.PredicateEnable = NOOPNever;
      }
#else
      mi_store(b, mi_reg32(MI_BUILDER_MI_PREDICATE_RESULT_num), mi_imm(0));
#endif
   }
}

static inline void
mi_goto(struct mi_builder *b, struct mi_goto_target *t)
{
   mi_goto_if(b, mi_imm(-1), t);
}

/* Places target t at the current position in the batch and patches every
 * goto that was emitted before the target was placed.
 *
 * The command emitted at the target position is also what resets the
 * predicate state for code that arrives here through a predicated jump:
 * MI_SET_PREDICATE(NOOPNever) on Gfx12.5+, or an MI_NOOP followed by a store
 * of 0 to the predicate result register on older hardware.
 */
static inline void
mi_goto_target(struct mi_builder *b, struct mi_goto_target *t)
{
#if GFX_VERx10 >= 125
   mi_builder_emit(b, GENX(MI_SET_PREDICATE), sp) {
      sp.PredicateEnable = NOOPNever;
      /* Record the batch address of the command being emitted here */
      t->addr = __gen_get_batch_address(b->user_data,
                                        mi_builder_get_inst_ptr(b));
   }
#else
   mi_builder_emit(b, GENX(MI_NOOP), sp) {
      /* Record the batch address of the command being emitted here */
      t->addr = __gen_get_batch_address(b->user_data,
                                        mi_builder_get_inst_ptr(b));
   }
   mi_store(b, mi_reg32(MI_BUILDER_MI_PREDICATE_RESULT_num), mi_imm(0));
#endif
   t->placed = true;

   /* Template for the pending jumps; only PredicationEnable differs */
   struct GENX(MI_BATCH_BUFFER_START) bbs = { GENX(MI_BATCH_BUFFER_START_header) };
   bbs.AddressSpaceIndicator     = ASI_PPGTT;
   bbs.BatchBufferStartAddress   = t->addr;

   /* Pack the real command into the space reserved by mi_goto_if() */
   for (unsigned i = 0; i < t->num_gotos; i++) {
      bbs.PredicationEnable = t->gotos[i].predicated;
      GENX(MI_BATCH_BUFFER_START_pack)(b->user_data, t->gotos[i].mi_bbs, &bbs);
   }
}

static inline struct mi_goto_target
mi_goto_target_init_and_place(struct mi_builder *b)
{
   struct mi_goto_target t = MI_GOTO_TARGET_INIT;
   mi_goto_target(b, &t);
   return t;
}

/* Builds a loop in the batch.  Usage: mi_loop(b) { ...body... }
 *
 * The C for-statement declares two targets: __continue, placed right away at
 * the top of the loop, and __break, left unplaced.  The body is therefore
 * built exactly once on the CPU (the condition !__break.placed holds only on
 * the first pass).  When the body is done, the iteration expression emits a
 * jump back to __continue and then places __break just after it.
 *
 * mi_break*() and mi_continue*() refer to the __break and __continue targets
 * by name and so can only be used inside the body of an mi_loop().
 */
#define mi_loop(b) \
   for (struct mi_goto_target __break = MI_GOTO_TARGET_INIT, \
        __continue = mi_goto_target_init_and_place(b); !__break.placed; \
        mi_goto(b, &__continue), mi_goto_target(b, &__break))

/* Leave the enclosing mi_loop(), unconditionally or when cond is true */
#define mi_break(b) mi_goto(b, &__break)
#define mi_break_if(b, cond) mi_goto_if(b, cond, &__break)
/* Jump back to the top of the enclosing mi_loop() */
#define mi_continue(b) mi_goto(b, &__continue)
#define mi_continue_if(b, cond) mi_goto_if(b, cond, &__continue)

#endif /* GFX_VER >= 9 */

#endif /* MI_BUILDER_H */
