| /* |
| * Copyright © 2020 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #pragma once |
| |
| #include <stdint.h> |
| |
| #include "compiler/shader_enums.h" |
| #include "util/macros.h" |
| |
| #ifdef __cplusplus |
| extern "C" { |
| #endif |
| |
/** Size of one SBT shader group handle.
 *
 * Vulkan defines shaderGroupHandleSize = 32.
 */
#define BRW_RT_SBT_HANDLE_SIZE 32

/** Size of RT_DISPATCH_GLOBALS (see gen_rt.xml) */
#define BRW_RT_DISPATCH_GLOBALS_SIZE 80

/** Offset, past the RT dispatch globals, at which the "push" constants live */
#define BRW_RT_PUSH_CONST_OFFSET 128

/** Stride of the resume SBT */
#define BRW_BTD_RESUME_SBT_STRIDE 8
| |
/** BVH levels
 *
 * Vulkan always uses exactly two levels of BVH: world and object.  At the
 * API level, these are referred to as top and bottom respectively.
 */
enum brw_rt_bvh_level {
   BRW_RT_BVH_LEVEL_WORLD  = 0,
   BRW_RT_BVH_LEVEL_OBJECT = 1,
};

/** Number of BVH levels enumerated by brw_rt_bvh_level */
#define BRW_RT_MAX_BVH_LEVELS 2
| |
/** BVH node types
 *
 * Note that the numbering is not contiguous: no enumerator is assigned the
 * value 2.
 */
enum brw_rt_bvh_node_type {
   BRW_RT_BVH_NODE_TYPE_INTERNAL   = 0,
   BRW_RT_BVH_NODE_TYPE_INSTANCE   = 1,
   BRW_RT_BVH_NODE_TYPE_PROCEDURAL = 3,
   BRW_RT_BVH_NODE_TYPE_QUAD       = 4,
};
| |
/** HitKind values reported for hits on triangle geometry
 *
 * The values here must stay in sync with the SPIR-V enum.
 */
enum brw_rt_hit_kind {
   BRW_RT_HIT_KIND_FRONT_FACE = 0xFE,
   BRW_RT_HIT_KIND_BACK_FACE  = 0xFF,
};
| |
/** Ray flags
 *
 * Each flag occupies a single bit.  The values must stay in sync with the
 * SPIR-V RayFlags enum.
 */
enum brw_rt_ray_flags {
   BRW_RT_RAY_FLAG_FORCE_OPAQUE                = 1 << 0,
   BRW_RT_RAY_FLAG_FORCE_NON_OPAQUE            = 1 << 1,
   BRW_RT_RAY_FLAG_TERMINATE_ON_FIRST_HIT      = 1 << 2,
   BRW_RT_RAY_FLAG_SKIP_CLOSEST_HIT_SHADER     = 1 << 3,
   BRW_RT_RAY_FLAG_CULL_BACK_FACING_TRIANGLES  = 1 << 4,
   BRW_RT_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES = 1 << 5,
   BRW_RT_RAY_FLAG_CULL_OPAQUE                 = 1 << 6,
   BRW_RT_RAY_FLAG_CULL_NON_OPAQUE             = 1 << 7,
   BRW_RT_RAY_FLAG_SKIP_TRIANGLES              = 1 << 8,
   BRW_RT_RAY_FLAG_SKIP_AABBS                  = 1 << 9,
};
| |
/** Description of how the RT scratch memory area is carved up
 *
 * All offsets and sizes are expressed in bytes.
 */
struct brw_rt_scratch_layout {
   /** How many stack IDs each DSS has */
   uint32_t stack_ids_per_dss;

   /** Byte offset at which the hardware MemRay stacks begin */
   uint32_t ray_stack_start;

   /** Byte stride between consecutive hardware MemRay stacks */
   uint32_t ray_stack_stride;

   /** Byte offset at which the SW stacks begin */
   uint64_t sw_stack_start;

   /** Byte size of the SW stack belonging to one shader invocation */
   uint32_t sw_stack_size;

   /** Byte size of the whole RT scratch memory area */
   uint64_t total_size;
};
| |
/** Parameters handed to the raygen trampoline shader
 *
 * This struct is carefully constructed to be 32B and must be passed to the
 * raygen trampoline shader as inline constant data.
 */
struct brw_rt_raygen_trampoline_params {
   /** GPU address of the RT_DISPATCH_GLOBALS */
   uint64_t rt_disp_globals_addr;

   /** GPU address of the raygen shader's BINDLESS_SHADER_RECORD */
   uint64_t raygen_bsr_addr;

   /** 1 for an indirect dispatch, 0 otherwise */
   uint8_t is_indirect;

   /** Integer log2 of the local group size, one entry per dimension
    *
    * Ray-tracing shaders have no notion of local vs. global workgroup
    * size; there is only a single 3D launch size.  The raygen trampoline
    * shader is always dispatched with a local workgroup size equal to the
    * SIMD width, but the shape of that local workgroup is chosen at
    * dispatch time from the shape of the launch and handed to the
    * trampoline through this field.  (A Z dimension on the local workgroup
    * would be pointless for a 2D launch.)
    *
    * The integer log2 is stored rather than the size itself because
    * non-power-of-two sizes serve no purpose and shifts are cheaper than
    * division.
    */
   uint8_t local_group_size_log2[3];

   /** Padding which brings the struct up to 32B */
   uint32_t pad[3];
};
| |
/** Size of the "hot zone" in bytes
 *
 * The hot zone is a SW-defined data structure: a single uvec4 holding two
 * pieces of information.
 *
 *  - hotzone.x: Stack offset (in bytes)
 *
 *    The offset (in bytes) into the per-thread scratch space at which the
 *    current shader's stack starts.  The calling shader increments it
 *    before any shader call type instruction, and the resume shader
 *    decrements it again as part of completing the return operation.
 *
 *  - hotzone.yzw: The launch ID associated with the current thread
 *
 *    Inside a bindless shader, all we know is the DSS ID from the hardware
 *    EU and a per-DSS stack ID.  In particular, the three-dimensional
 *    launch ID is lost as soon as we leave the raygen trampoline.
 */
#define BRW_RT_SIZEOF_HOTZONE 16
| |
/* Sizes of the memory-based ray-tracing data structures.
 *
 * From the BSpec "Address Computation for Memory Based Data Structures:
 * Ray and TraversalStack (Async Ray Tracing)":
 *
 *    sizeof(Ray) = 64B, sizeof(HitInfo) = 32B, sizeof(TravStack) = 32B.
 */
#define BRW_RT_SIZEOF_RAY        64
#define BRW_RT_SIZEOF_HIT_INFO   32
#define BRW_RT_SIZEOF_TRAV_STACK 32
| |
/* Size of a ray query (synchronous) stack.
 *
 * From the BSpec:
 *
 *    syncStackSize = (maxBVHLevels % 2 == 1) ?
 *       (sizeof(HitInfo) * 2 +
 *          (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels + 32B) :
 *       (sizeof(HitInfo) * 2 +
 *          (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels);
 *
 * The select is only there to align the result to 64B.
 */
#define BRW_RT_SIZEOF_RAY_QUERY                                              \
   (2 * BRW_RT_SIZEOF_HIT_INFO +                                             \
    BRW_RT_MAX_BVH_LEVELS * (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) + \
    ((BRW_RT_MAX_BVH_LEVELS % 2) ? 32 : 0))

/* Same computation as BRW_RT_SIZEOF_RAY_QUERY, minus the 64B alignment
 * padding.
 */
#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY                                       \
   (2 * BRW_RT_SIZEOF_HIT_INFO +                                             \
    BRW_RT_MAX_BVH_LEVELS * (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK))

/* Two HitInfo structures plus one Ray and one TravStack per BVH level */
#define BRW_RT_SIZEOF_HW_STACK                                               \
   (2 * BRW_RT_SIZEOF_HIT_INFO +                                             \
    BRW_RT_MAX_BVH_LEVELS * BRW_RT_SIZEOF_RAY +                              \
    BRW_RT_MAX_BVH_LEVELS * BRW_RT_SIZEOF_TRAV_STACK)
| |
/* Mesa-defined region for hit attribute data.  It is placed directly after
 * the HW stack.
 */
#define BRW_RT_SIZEOF_HIT_ATTRIB_DATA   64
#define BRW_RT_OFFSETOF_HIT_ATTRIB_DATA BRW_RT_SIZEOF_HW_STACK

/* End of the hit attribute region, rounded up to a multiple of 64 */
#define BRW_RT_ASYNC_STACK_STRIDE                                    \
   ALIGN_POT(BRW_RT_OFFSETOF_HIT_ATTRIB_DATA +                       \
             BRW_RT_SIZEOF_HIT_ATTRIB_DATA, 64)
| |
| static inline void |
| brw_rt_compute_scratch_layout(struct brw_rt_scratch_layout *layout, |
| const struct intel_device_info *devinfo, |
| uint32_t stack_ids_per_dss, |
| uint32_t sw_stack_size) |
| { |
| layout->stack_ids_per_dss = stack_ids_per_dss; |
| |
| const uint32_t dss_count = intel_device_info_dual_subslice_id_bound(devinfo); |
| const uint32_t num_stack_ids = dss_count * stack_ids_per_dss; |
| |
| uint64_t size = 0; |
| |
| /* The first thing in our scratch area is an array of "hot zones" which |
| * store the stack offset as well as the launch IDs for each active |
| * invocation. |
| */ |
| size += BRW_RT_SIZEOF_HOTZONE * num_stack_ids; |
| |
| /* Next, we place the HW ray stacks */ |
| assert(size % 64 == 0); /* Cache-line aligned */ |
| assert(size < UINT32_MAX); |
| layout->ray_stack_start = size; |
| layout->ray_stack_stride = BRW_RT_ASYNC_STACK_STRIDE; |
| size += num_stack_ids * layout->ray_stack_stride; |
| |
| /* Finally, we place the SW stacks for the individual ray-tracing shader |
| * invocations. We align these to 64B to ensure that we don't have any |
| * shared cache lines which could hurt performance. |
| */ |
| assert(size % 64 == 0); |
| layout->sw_stack_start = size; |
| layout->sw_stack_size = ALIGN(sw_stack_size, 64); |
| |
| /* Currently it's always the case that sw_stack_size is a power of |
| * two, but power-of-two SW stack sizes are prone to causing |
| * collisions in the hashing function used by the L3 to map memory |
| * addresses to banks, which can cause stack accesses from most |
| * DSSes to bottleneck on a single L3 bank. Fix it by padding the |
| * SW stack by a single cacheline if it was a power of two. |
| */ |
| if (layout->sw_stack_size > 64 && |
| util_is_power_of_two_nonzero(layout->sw_stack_size)) |
| layout->sw_stack_size += 64; |
| |
| size += num_stack_ids * layout->sw_stack_size; |
| |
| layout->total_size = size; |
| } |
| |
| static inline uint32_t |
| brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo) |
| { |
| /* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids |
| * which includes all the threads. |
| */ |
| uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE]; |
| uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */ |
| return max_eu_id * max_simd_size * BRW_RT_SIZEOF_RAY_QUERY; |
| } |
| |
| static inline uint32_t |
| brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo) |
| { |
| /* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids |
| * which includes all the threads. |
| */ |
| uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE]; |
| uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */ |
| return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY; |
| } |
| |
/** Total size of the shadow stacks needed for `ray_queries` ray queries
 *
 * The result is the shadow stack storage (only when there is more than one
 * query) plus 4 bytes of Ctrl + Level data per query.
 */
static inline uint32_t
brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
                                      uint32_t ray_queries)
{
   const uint32_t one_shadow_stack =
      brw_rt_ray_queries_shadow_stack_size(devinfo);

   /* Ctrl + Level data */
   const uint32_t ctrl_level_size = ray_queries * 4;

   /* Don't bother with a shadow stack if we only have a single query: in
    * that case we can write directly into the HW buffer.
    */
   uint32_t shadowed_queries = 0;
   if (ray_queries > 1)
      shadowed_queries = ray_queries;

   return shadowed_queries * one_shadow_stack + ctrl_level_size;
}
| |
| #ifdef __cplusplus |
| } |
| #endif |