/*
* Copyright © 2023 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <string.h>
#include "intel/dev/i915/intel_device_info.h"
#include "intel/dev/intel_device_info.h"
#include "intel/dev/intel_hwconfig.h"
#include "intel/common/intel_gem.h"
#include "intel/common/i915/intel_gem.h"
#include "util/bitscan.h"
#include "util/log.h"
#include "util/os_misc.h"
#include "drm-uapi/i915_drm.h"
/* At some point in time, some people decided to redefine what topology means,
 * from useful HW-related information (slice, subslice, etc...), to much less
 * useful generic stuff that no one cares about (a single slice with lots of
 * subslices). Of course all of this was done without asking the people who
 * defined the topology query in the first place, to solve a lack of
 * information on Gfx10+. This function is here to work around the fact that
 * it's not possible to change people's minds even before this stuff goes
 * upstream. Sad times...
 */
static void
update_from_single_slice_topology(struct intel_device_info *devinfo,
const struct drm_i915_query_topology_info *topology,
const struct drm_i915_query_topology_info *geom_topology)
{
/* An array of bit masks of the subslices available for 3D
* workloads, analogous to intel_device_info::subslice_masks. This
* may differ from the set of enabled subslices on XeHP+ platforms
* with compute-only subslices.
*/
uint8_t geom_subslice_masks[ARRAY_SIZE(devinfo->subslice_masks)] = { 0 };
assert(devinfo->verx10 >= 125);
intel_device_info_topology_reset_masks(devinfo);
assert(topology->max_slices == 1);
assert(topology->max_subslices > 0);
assert(topology->max_eus_per_subslice > 0);
/* i915 gives us only one slice so we have to rebuild that out of groups of
* 4 dualsubslices.
*/
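/* E.g. a part reporting 16 subslices in its single slice is rebuilt below as
 * 4 slices of 4 subslices each (s = ss_idx / 4, ss = ss_idx % 4).
 */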
devinfo->max_subslices_per_slice = 4;
devinfo->max_eus_per_subslice = 16;
devinfo->subslice_slice_stride = 1;
devinfo->eu_slice_stride = DIV_ROUND_UP(16 * 4, 8);
devinfo->eu_subslice_stride = DIV_ROUND_UP(16, 8);
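/* Walk each DSS bit reported by i915 and scatter it into the synthetic
 * slice/subslice layout computed above, tracking the 3D-capable (geometry)
 * subslices separately from the full set.
 */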
for (uint32_t ss_idx = 0; ss_idx < topology->max_subslices; ss_idx++) {
const uint32_t s = ss_idx / 4;
const uint32_t ss = ss_idx % 4;
/* Determine whether ss_idx is enabled (ss_idx_available) and
* available for 3D workloads (geom_ss_idx_available), which may
* differ on XeHP+ if ss_idx is a compute-only DSS.
*/
const bool ss_idx_available =
(topology->data[topology->subslice_offset + ss_idx / 8] >>
(ss_idx % 8)) & 1;
const bool geom_ss_idx_available =
(geom_topology->data[geom_topology->subslice_offset + ss_idx / 8] >>
(ss_idx % 8)) & 1;
if (geom_ss_idx_available) {
assert(ss_idx_available);
geom_subslice_masks[s * devinfo->subslice_slice_stride +
ss / 8] |= 1u << (ss % 8);
}
if (!ss_idx_available)
continue;
devinfo->max_slices = MAX2(devinfo->max_slices, s + 1);
devinfo->slice_masks |= 1u << s;
devinfo->subslice_masks[s * devinfo->subslice_slice_stride +
ss / 8] |= 1u << (ss % 8);
for (uint32_t eu = 0; eu < devinfo->max_eus_per_subslice; eu++) {
const bool eu_available =
(topology->data[topology->eu_offset +
ss_idx * topology->eu_stride +
eu / 8] >> (eu % 8)) & 1;
if (!eu_available)
continue;
devinfo->eu_masks[s * devinfo->eu_slice_stride +
ss * devinfo->eu_subslice_stride +
eu / 8] |= 1u << (eu % 8);
}
}
intel_device_info_topology_update_counts(devinfo);
intel_device_info_update_pixel_pipes(devinfo, geom_subslice_masks);
intel_device_info_update_l3_banks(devinfo);
}
static void
update_from_topology(struct intel_device_info *devinfo,
const struct drm_i915_query_topology_info *topology)
{
intel_device_info_topology_reset_masks(devinfo);
assert(topology->max_slices > 0);
assert(topology->max_subslices > 0);
assert(topology->max_eus_per_subslice > 0);
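/* All strides are in bytes of mask data: one subslice mask of
 * subslice_stride bytes per slice, and one EU mask of eu_subslice_stride
 * bytes per (slice, subslice) pair.
 */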
devinfo->subslice_slice_stride = topology->subslice_stride;
devinfo->eu_subslice_stride = DIV_ROUND_UP(topology->max_eus_per_subslice, 8);
devinfo->eu_slice_stride = topology->max_subslices * devinfo->eu_subslice_stride;
assert(sizeof(devinfo->slice_masks) >= DIV_ROUND_UP(topology->max_slices, 8));
memcpy(&devinfo->slice_masks, topology->data, DIV_ROUND_UP(topology->max_slices, 8));
devinfo->max_slices = topology->max_slices;
devinfo->max_subslices_per_slice = topology->max_subslices;
devinfo->max_eus_per_subslice = topology->max_eus_per_subslice;
uint32_t subslice_mask_len =
topology->max_slices * topology->subslice_stride;
assert(sizeof(devinfo->subslice_masks) >= subslice_mask_len);
memcpy(devinfo->subslice_masks, &topology->data[topology->subslice_offset],
subslice_mask_len);
uint32_t eu_mask_len =
topology->eu_stride * topology->max_subslices * topology->max_slices;
assert(sizeof(devinfo->eu_masks) >= eu_mask_len);
memcpy(devinfo->eu_masks, &topology->data[topology->eu_offset], eu_mask_len);
/* Now that all the masks are in place, update the counts. */
intel_device_info_topology_update_counts(devinfo);
intel_device_info_update_pixel_pipes(devinfo, devinfo->subslice_masks);
intel_device_info_update_l3_banks(devinfo);
}
/* Generate detailed masks from the I915_PARAM_SLICE_MASK,
 * I915_PARAM_SUBSLICE_MASK & I915_PARAM_EU_TOTAL getparams.
 */
bool
intel_device_info_i915_update_from_masks(struct intel_device_info *devinfo, uint32_t slice_mask,
uint32_t subslice_mask, uint32_t n_eus)
{
struct drm_i915_query_topology_info *topology;
assert((slice_mask & 0xff) == slice_mask);
size_t data_length = 100;
topology = calloc(1, sizeof(*topology) + data_length);
if (!topology)
return false;
topology->max_slices = util_last_bit(slice_mask);
topology->max_subslices = util_last_bit(subslice_mask);
topology->subslice_offset = DIV_ROUND_UP(topology->max_slices, 8);
topology->subslice_stride = DIV_ROUND_UP(topology->max_subslices, 8);
uint32_t n_subslices = __builtin_popcount(slice_mask) *
__builtin_popcount(subslice_mask);
uint32_t max_eus_per_subslice = DIV_ROUND_UP(n_eus, n_subslices);
uint32_t eu_mask = (1U << max_eus_per_subslice) - 1;
topology->max_eus_per_subslice = max_eus_per_subslice;
topology->eu_offset = topology->subslice_offset +
topology->max_slices * DIV_ROUND_UP(topology->max_subslices, 8);
topology->eu_stride = DIV_ROUND_UP(max_eus_per_subslice, 8);
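/* The synthesized data[] blob mirrors the kernel's query layout: slice mask
 * bytes first, then one subslice mask per slice, then one EU mask per
 * (slice, subslice) pair. The 100-byte allocation above is a generous upper
 * bound for these masks. EUs are assumed to be uniformly distributed across
 * the enabled subslices.
 */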
/* Set slice mask in topology */
for (int b = 0; b < topology->subslice_offset; b++)
topology->data[b] = (slice_mask >> (b * 8)) & 0xff;
for (int s = 0; s < topology->max_slices; s++) {
/* Set subslice mask in topology */
for (int b = 0; b < topology->subslice_stride; b++) {
int subslice_offset = topology->subslice_offset +
s * topology->subslice_stride + b;
topology->data[subslice_offset] = (subslice_mask >> (b * 8)) & 0xff;
}
/* Set eu mask in topology */
for (int ss = 0; ss < topology->max_subslices; ss++) {
for (int b = 0; b < topology->eu_stride; b++) {
int eu_offset = topology->eu_offset +
(s * topology->max_subslices + ss) * topology->eu_stride + b;
topology->data[eu_offset] = (eu_mask >> (b * 8)) & 0xff;
}
}
}
update_from_topology(devinfo, topology);
free(topology);
return true;
}
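/* Thin wrapper around DRM_IOCTL_I915_GETPARAM that leaves *value untouched
 * on failure.
 */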
static bool
getparam(int fd, uint32_t param, int *value)
{
int tmp;
struct drm_i915_getparam gp = {
.param = param,
.value = &tmp,
};
int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
if (ret != 0)
return false;
*value = tmp;
return true;
}
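/* Query a context parameter via DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM. */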
static bool
get_context_param(int fd, uint32_t context, uint32_t param, uint64_t *value)
{
struct drm_i915_gem_context_param gp = {
.ctx_id = context,
.param = param,
};
int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &gp);
if (ret != 0)
return false;
*value = gp.value;
return true;
}
/**
 * For gfx8/gfx9, SLICE_MASK/SUBSLICE_MASK can be used to compute the topology
 * (kernel 4.13+).
 */
static bool
getparam_topology(struct intel_device_info *devinfo, int fd)
{
int slice_mask = 0;
if (!getparam(fd, I915_PARAM_SLICE_MASK, &slice_mask))
goto maybe_warn;
int n_eus;
if (!getparam(fd, I915_PARAM_EU_TOTAL, &n_eus))
goto maybe_warn;
int subslice_mask = 0;
if (!getparam(fd, I915_PARAM_SUBSLICE_MASK, &subslice_mask))
goto maybe_warn;
return intel_device_info_i915_update_from_masks(devinfo, slice_mask, subslice_mask, n_eus);
maybe_warn:
/* Only with Gfx8+ are we starting to see devices with fusing that can only
* be detected at runtime.
*/
if (devinfo->ver >= 8)
mesa_logw("Kernel 4.1 required to properly query GPU properties.");
return false;
}
/**
 * Preferred API for updating the topology in devinfo (kernel 4.17+).
 */
static bool
query_topology(struct intel_device_info *devinfo, int fd)
{
struct drm_i915_query_topology_info *topo_info =
intel_i915_query_alloc(fd, DRM_I915_QUERY_TOPOLOGY_INFO, NULL);
if (topo_info == NULL)
return false;
if (devinfo->verx10 >= 125) {
struct drm_i915_query_topology_info *geom_topo_info =
intel_i915_query_alloc(fd, DRM_I915_QUERY_GEOMETRY_SUBSLICES, NULL);
if (geom_topo_info == NULL) {
free(topo_info);
return false;
}
update_from_single_slice_topology(devinfo, topo_info, geom_topo_info);
free(geom_topo_info);
} else {
update_from_topology(devinfo, topo_info);
}
free(topo_info);
return true;
}
/**
 * Reports memory region info, allowing buffers to target system memory
 * and/or device-local memory.
 */
bool
intel_device_info_i915_query_regions(struct intel_device_info *devinfo, int fd, bool update)
{
struct drm_i915_query_memory_regions *meminfo =
intel_i915_query_alloc(fd, DRM_I915_QUERY_MEMORY_REGIONS, NULL);
if (meminfo == NULL) {
/* If the memory region uAPI query is not available, fall back to the os_*
 * utils to generate some numbers for system memory (SRAM) only.
 */
bool ret = intel_device_info_compute_system_memory(devinfo, false);
devinfo->mem.sram.mappable.size /= 2;
return ret;
}
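/* Record each reported region: system memory (SRAM) and device-local
 * memory (VRAM); any other memory class is ignored.
 */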
for (int i = 0; i < meminfo->num_regions; i++) {
const struct drm_i915_memory_region_info *mem = &meminfo->regions[i];
switch (mem->region.memory_class) {
case I915_MEMORY_CLASS_SYSTEM: {
if (!update) {
devinfo->mem.sram.mem.klass = mem->region.memory_class;
devinfo->mem.sram.mem.instance = mem->region.memory_instance;
/* i915 reports the whole RAM as the SRAM size but the Xe KMD only reports
 * half, so halve the i915 value to match the Xe KMD.
 */
devinfo->mem.sram.mappable.size = mem->probed_size / 2;
} else {
assert(devinfo->mem.sram.mem.klass == mem->region.memory_class);
assert(devinfo->mem.sram.mem.instance == mem->region.memory_instance);
assert(devinfo->mem.sram.mappable.size == mem->probed_size / 2);
}
/* If running without elevated privileges, i915 reports
 * unallocated_size == probed_size.
 */
devinfo->mem.sram.mappable.free = mem->unallocated_size;
break;
}
case I915_MEMORY_CLASS_DEVICE:
if (!update) {
devinfo->mem.vram.mem.klass = mem->region.memory_class;
devinfo->mem.vram.mem.instance = mem->region.memory_instance;
if (mem->probed_cpu_visible_size > 0) {
devinfo->mem.vram.mappable.size = mem->probed_cpu_visible_size;
devinfo->mem.vram.unmappable.size =
mem->probed_size - mem->probed_cpu_visible_size;
} else {
/* We are running on an older kernel without support for the
* small-bar uapi. These kernels only support systems where the
* entire vram is mappable.
*/
devinfo->mem.vram.mappable.size = mem->probed_size;
devinfo->mem.vram.unmappable.size = 0;
}
} else {
assert(devinfo->mem.vram.mem.klass == mem->region.memory_class);
assert(devinfo->mem.vram.mem.instance == mem->region.memory_instance);
assert((devinfo->mem.vram.mappable.size +
devinfo->mem.vram.unmappable.size) == mem->probed_size);
}
if (mem->unallocated_cpu_visible_size > 0) {
if (mem->unallocated_size != -1) {
devinfo->mem.vram.mappable.free = mem->unallocated_cpu_visible_size;
devinfo->mem.vram.unmappable.free =
mem->unallocated_size - mem->unallocated_cpu_visible_size;
}
} else {
/* We are running on an older kernel without support for the
* small-bar uapi. These kernels only support systems where the
* entire vram is mappable.
*/
if (mem->unallocated_size != -1) {
devinfo->mem.vram.mappable.free = mem->unallocated_size;
devinfo->mem.vram.unmappable.free = 0;
}
}
break;
default:
break;
}
}
free(meminfo);
devinfo->mem.use_class_instance = true;
return true;
}
static int
intel_get_aperture_size(int fd, uint64_t *size)
{
struct drm_i915_gem_get_aperture aperture = { 0 };
int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
if (ret == 0 && size)
*size = aperture.aper_size;
return ret;
}
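/* Detect bit-6 address swizzling by creating a scratch BO, requesting
 * X-tiling and reading back the swizzle mode the kernel selected.
 */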
static bool
has_bit6_swizzle(int fd)
{
struct drm_gem_close close;
struct drm_i915_gem_create gem_create = {
.size = 4096,
};
if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
unreachable("Failed to create GEM BO");
return false;
}
bool swizzled = false;
/* set_tiling overwrites the input on the error path, so we have to open
* code intel_ioctl.
*/
struct drm_i915_gem_set_tiling set_tiling = {
.handle = gem_create.handle,
.tiling_mode = I915_TILING_X,
.stride = 512,
};
if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling)) {
unreachable("Failed to set BO tiling");
goto close_and_return;
}
struct drm_i915_gem_get_tiling get_tiling = {
.handle = gem_create.handle,
};
if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) {
unreachable("Failed to get BO tiling");
goto close_and_return;
}
assert(get_tiling.tiling_mode == I915_TILING_X);
swizzled = get_tiling.swizzle_mode != I915_BIT_6_SWIZZLE_NONE;
close_and_return:
memset(&close, 0, sizeof(close));
close.handle = gem_create.handle;
intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
return swizzled;
}
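/* Probe whether the kernel exposes the get_tiling uAPI by issuing the
 * ioctl on a scratch BO.
 */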
static bool
has_get_tiling(int fd)
{
int ret;
struct drm_i915_gem_create gem_create = {
.size = 4096,
};
if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
unreachable("Failed to create GEM BO");
return false;
}
struct drm_i915_gem_get_tiling get_tiling = {
.handle = gem_create.handle,
};
ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling);
struct drm_gem_close close = {
.handle = gem_create.handle,
};
intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
return ret == 0;
}
static void
fixup_chv_device_info(struct intel_device_info *devinfo)
{
assert(devinfo->platform == INTEL_PLATFORM_CHV);
/* Cherryview is annoying. The number of EUs depends on fusing and isn't
 * determinable from the PCI ID alone. We default to the minimum available
 * for that PCI ID and then compute the real value from the subslice
 * information we get from the kernel.
 */
const uint32_t subslice_total = intel_device_info_subslice_total(devinfo);
const uint32_t eu_total = intel_device_info_eu_total(devinfo);
/* Logical CS threads = EUs per subslice * num threads per EU */
uint32_t max_cs_threads =
eu_total / subslice_total * devinfo->num_thread_per_eu;
/* Fuse configurations may give more threads than expected, never less. */
if (max_cs_threads > devinfo->max_cs_threads)
devinfo->max_cs_threads = max_cs_threads;
intel_device_info_update_cs_workgroup_threads(devinfo);
/* Braswell is even more annoying. Its marketing name isn't determinable
* from the PCI ID and is also dependent on fusing.
*/
if (devinfo->pci_device_id != 0x22B1)
return;
char *bsw_model;
switch (eu_total) {
case 16: bsw_model = "405"; break;
case 12: bsw_model = "400"; break;
default: bsw_model = " "; break;
}
char *needle = strstr(devinfo->name, "XXX");
assert(needle);
if (needle)
memcpy(needle, bsw_model, 3);
}
void *
intel_device_info_i915_query_hwconfig(int fd, int32_t *len)
{
return intel_i915_query_alloc(fd, DRM_I915_QUERY_HWCONFIG_BLOB, len);
}
bool
intel_device_info_i915_get_info_from_fd(int fd, struct intel_device_info *devinfo)
{
void *hwconfig_blob;
int32_t len;
int val;
if (getparam(fd, I915_PARAM_CS_TIMESTAMP_FREQUENCY, &val))
devinfo->timestamp_frequency = val;
else if (devinfo->ver >= 10) {
mesa_loge("Kernel 4.15 required to read the CS timestamp frequency.");
return false;
}
if (!getparam(fd, I915_PARAM_REVISION, &devinfo->revision))
devinfo->revision = 0;
if (!query_topology(devinfo, fd)) {
if (devinfo->ver >= 10) {
/* topology uAPI required for CNL+ (kernel 4.17+) */
return false;
}
/* Otherwise use the kernel 4.13+ getparam API for gfx8+. For older
 * kernels, the topology will be wrong, affecting GPU metrics; in this
 * case, fail silently.
 */
getparam_topology(devinfo, fd);
}
hwconfig_blob = intel_device_info_i915_query_hwconfig(fd, &len);
if (hwconfig_blob) {
if (intel_hwconfig_process_table(devinfo, hwconfig_blob, len))
intel_device_info_update_after_hwconfig(devinfo);
free(hwconfig_blob);
}
intel_device_info_i915_query_regions(devinfo, fd, false);
if (devinfo->platform == INTEL_PLATFORM_CHV)
fixup_chv_device_info(devinfo);
/* Broadwell PRM says:
*
* "Before Gfx8, there was a historical configuration control field to
* swizzle address bit[6] for in X/Y tiling modes. This was set in three
* different places: TILECTL[1:0], ARB_MODE[5:4], and
* DISP_ARB_CTL[14:13].
*
* For Gfx8 and subsequent generations, the swizzle fields are all
* reserved, and the CPU's memory controller performs all address
* swizzling modifications."
*/
devinfo->has_bit6_swizzle = devinfo->ver < 8 && has_bit6_swizzle(fd);
intel_get_aperture_size(fd, &devinfo->aperture_bytes);
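/* ctx_id 0 queries the default context of this fd. */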
get_context_param(fd, 0, I915_CONTEXT_PARAM_GTT_SIZE, &devinfo->gtt_size);
devinfo->has_tiling_uapi = has_get_tiling(fd);
devinfo->has_caching_uapi =
devinfo->platform < INTEL_PLATFORM_DG2_START && !devinfo->has_local_mem;
if (devinfo->ver > 12 || intel_device_info_is_mtl_or_arl(devinfo))
devinfo->has_set_pat_uapi = true;
if (getparam(fd, I915_PARAM_MMAP_GTT_VERSION, &val))
devinfo->has_mmap_offset = val >= 4;
if (getparam(fd, I915_PARAM_HAS_USERPTR_PROBE, &val))
devinfo->has_userptr_probe = val;
if (getparam(fd, I915_PARAM_HAS_CONTEXT_ISOLATION, &val))
devinfo->has_context_isolation = val;
/* TODO: We might be able to reduce alignment to 4KB on DG1. */
if (devinfo->verx10 >= 125)
devinfo->mem_alignment = 64 * 1024;
else if (devinfo->has_local_mem)
devinfo->mem_alignment = 64 * 1024;
else
devinfo->mem_alignment = 4096;
return true;
}