/*
* Copyright © 2008 Jérôme Glisse
* Copyright © 2010 Marek Olšák <[email protected]>
* Copyright © 2015 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
#include "amdgpu_cs.h"
#include "util/detect_os.h"
#include "amdgpu_winsys.h"
#include "util/os_time.h"
#include <inttypes.h>
#include <stdio.h>
#include "amd/common/sid.h"
/* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
* codes in the kernel).
*/
#if DETECT_OS_OPENBSD
#define ENODATA ENOTSUP
#elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
#define ENODATA ECONNREFUSED
#endif
/* FENCES */
void amdgpu_fence_destroy(struct amdgpu_fence *fence)
{
ac_drm_cs_destroy_syncobj(fence->aws->fd, fence->syncobj);
if (fence->ctx)
amdgpu_ctx_reference(&fence->ctx, NULL);
util_queue_fence_destroy(&fence->submitted);
FREE(fence);
}
static struct pipe_fence_handle *
amdgpu_fence_create(struct amdgpu_cs *cs)
{
struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
struct amdgpu_ctx *ctx = cs->ctx;
if (!fence)
return NULL;
fence->reference.count = 1;
fence->aws = ctx->aws;
amdgpu_ctx_reference(&fence->ctx, ctx);
fence->ip_type = cs->ip_type;
if (ac_drm_cs_create_syncobj2(ctx->aws->fd, 0, &fence->syncobj)) {
amdgpu_ctx_reference(&fence->ctx, NULL);
FREE(fence);
return NULL;
}
util_queue_fence_init(&fence->submitted);
util_queue_fence_reset(&fence->submitted);
fence->queue_index = cs->queue_index;
return (struct pipe_fence_handle *)fence;
}
static struct pipe_fence_handle *
amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
{
struct amdgpu_winsys *aws = amdgpu_winsys(rws);
struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
int r;
if (!fence)
return NULL;
pipe_reference_init(&fence->reference, 1);
fence->aws = aws;
fence->ip_type = 0xffffffff;
r = ac_drm_cs_import_syncobj(aws->fd, fd, &fence->syncobj);
if (r) {
FREE(fence);
return NULL;
}
util_queue_fence_init(&fence->submitted);
fence->imported = true;
return (struct pipe_fence_handle*)fence;
}
static struct pipe_fence_handle *
amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
{
struct amdgpu_winsys *aws = amdgpu_winsys(rws);
struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
if (!fence)
return NULL;
pipe_reference_init(&fence->reference, 1);
fence->aws = aws;
/* fence->ctx == NULL means that the fence is syncobj-based. */
/* Convert sync_file into syncobj. */
int r = ac_drm_cs_create_syncobj(aws->fd, &fence->syncobj);
if (r) {
FREE(fence);
return NULL;
}
r = ac_drm_cs_syncobj_import_sync_file(aws->fd, fence->syncobj, fd);
if (r) {
ac_drm_cs_destroy_syncobj(aws->fd, fence->syncobj);
FREE(fence);
return NULL;
}
util_queue_fence_init(&fence->submitted);
fence->imported = true;
return (struct pipe_fence_handle*)fence;
}
static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
struct pipe_fence_handle *pfence)
{
struct amdgpu_winsys *aws = amdgpu_winsys(rws);
struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
int fd, r;
util_queue_fence_wait(&fence->submitted);
/* Convert syncobj into sync_file. */
r = ac_drm_cs_syncobj_export_sync_file(aws->fd, fence->syncobj, &fd);
return r ? -1 : fd;
}
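/* Create an already-signalled syncobj, export it as a sync_file fd and destroy the
* syncobj again. The returned fd represents an always-signalled fence.
*/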
static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
{
struct amdgpu_winsys *aws = amdgpu_winsys(rws);
uint32_t syncobj;
int fd = -1;
int r = ac_drm_cs_create_syncobj2(aws->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
&syncobj);
if (r) {
return -1;
}
r = ac_drm_cs_syncobj_export_sync_file(aws->fd, syncobj, &fd);
if (r) {
fd = -1;
}
ac_drm_cs_destroy_syncobj(aws->fd, syncobj);
return fd;
}
static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
uint64_t seq_no,
uint64_t *user_fence_cpu_address)
{
struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
afence->seq_no = seq_no;
afence->user_fence_cpu_address = user_fence_cpu_address;
util_queue_fence_signal(&afence->submitted);
}
static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
{
struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
afence->signalled = true;
util_queue_fence_signal(&afence->submitted);
}
bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
bool absolute)
{
struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
int64_t abs_timeout;
uint64_t *user_fence_cpu;
if (afence->signalled)
return true;
if (absolute)
abs_timeout = timeout;
else
abs_timeout = os_time_get_absolute_timeout(timeout);
/* The fence might not have a number assigned if its IB is being
* submitted in the other thread right now. Wait until the submission
* is done. */
if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout))
return false;
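/* Fast path: the GPU writes the completed sequence number to the user fence memory.
* If it has already reached seq_no, the IB is done and the wait ioctl can be skipped.
*/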
user_fence_cpu = afence->user_fence_cpu_address;
if (user_fence_cpu) {
if (*user_fence_cpu >= afence->seq_no) {
afence->signalled = true;
return true;
}
/* No timeout, just query: no need for the ioctl. */
if (!absolute && !timeout)
return false;
}
if ((uint64_t)abs_timeout == OS_TIMEOUT_INFINITE)
abs_timeout = INT64_MAX;
if (ac_drm_cs_syncobj_wait(afence->aws->fd, &afence->syncobj, 1,
abs_timeout, 0, NULL))
return false;
/* Check that guest-side syncobj agrees with the user fence. */
if (user_fence_cpu && afence->aws->info.is_virtio)
assert(afence->seq_no <= *user_fence_cpu);
afence->signalled = true;
return true;
}
static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
struct pipe_fence_handle *fence,
uint64_t timeout)
{
return amdgpu_fence_wait(fence, timeout, false);
}
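/* Return the fence of the next flush. It's created lazily and cached in cs->next_fence,
* so repeated calls before the flush return the same fence.
*/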
static struct pipe_fence_handle *
amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct pipe_fence_handle *fence = NULL;
if (cs->noop)
return NULL;
if (cs->next_fence) {
amdgpu_fence_reference(&fence, cs->next_fence);
return fence;
}
fence = amdgpu_fence_create(cs);
if (!fence)
return NULL;
amdgpu_fence_reference(&cs->next_fence, fence);
return fence;
}
/* CONTEXTS */
static uint32_t
radeon_to_amdgpu_priority(enum radeon_ctx_priority radeon_priority)
{
switch (radeon_priority) {
case RADEON_CTX_PRIORITY_REALTIME:
return AMDGPU_CTX_PRIORITY_VERY_HIGH;
case RADEON_CTX_PRIORITY_HIGH:
return AMDGPU_CTX_PRIORITY_HIGH;
case RADEON_CTX_PRIORITY_MEDIUM:
return AMDGPU_CTX_PRIORITY_NORMAL;
case RADEON_CTX_PRIORITY_LOW:
return AMDGPU_CTX_PRIORITY_LOW;
default:
unreachable("Invalid context priority");
}
}
static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *rws,
enum radeon_ctx_priority priority,
bool allow_context_lost)
{
struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
int r;
struct amdgpu_bo_alloc_request alloc_buffer = {};
uint32_t amdgpu_priority = radeon_to_amdgpu_priority(priority);
ac_drm_device *dev;
ac_drm_bo buf_handle;
if (!ctx)
return NULL;
ctx->aws = amdgpu_winsys(rws);
ctx->reference.count = 1;
ctx->allow_context_lost = allow_context_lost;
dev = ctx->aws->dev;
r = ac_drm_cs_ctx_create2(dev, amdgpu_priority, &ctx->ctx_handle);
if (r) {
fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r);
goto error_create;
}
alloc_buffer.alloc_size = ctx->aws->info.gart_page_size;
alloc_buffer.phys_alignment = ctx->aws->info.gart_page_size;
alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
r = ac_drm_bo_alloc(dev, &alloc_buffer, &buf_handle);
if (r) {
fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
goto error_user_fence_alloc;
}
ctx->user_fence_cpu_address_base = NULL;
r = ac_drm_bo_cpu_map(dev, buf_handle, (void**)&ctx->user_fence_cpu_address_base);
if (r) {
fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
goto error_user_fence_map;
}
memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
ctx->user_fence_bo = buf_handle;
ac_drm_bo_export(dev, buf_handle, amdgpu_bo_handle_type_kms, &ctx->user_fence_bo_kms_handle);
return (struct radeon_winsys_ctx*)ctx;
error_user_fence_map:
ac_drm_bo_free(dev, buf_handle);
error_user_fence_alloc:
ac_drm_cs_ctx_free(dev, ctx->ctx_handle);
error_create:
FREE(ctx);
return NULL;
}
static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
{
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
amdgpu_ctx_reference(&ctx, NULL);
}
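/* Pad the IB with NOP packets so that its size is a multiple of the alignment required
* by the IP (ib_pad_dw_mask). leave_dw_space dwords that the caller will still append
* (e.g. the chaining INDIRECT_BUFFER packet) are included in the alignment calculation.
*/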
static void amdgpu_pad_gfx_compute_ib(struct amdgpu_winsys *aws, enum amd_ip_type ip_type,
uint32_t *ib, uint32_t *num_dw, unsigned leave_dw_space)
{
unsigned pad_dw_mask = aws->info.ip[ip_type].ib_pad_dw_mask;
unsigned unaligned_dw = (*num_dw + leave_dw_space) & pad_dw_mask;
if (unaligned_dw) {
int remaining = pad_dw_mask + 1 - unaligned_dw;
/* Only pad by 1 dword with the type-2 NOP if necessary. */
if (remaining == 1 && aws->info.gfx_ib_pad_with_type2) {
ib[(*num_dw)++] = PKT2_NOP_PAD;
} else {
/* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
* packet. The size of the packet body after the header is always count + 1.
* If count == -1, there is no packet body. NOP is the only packet that can have
* count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
*/
ib[(*num_dw)++] = PKT3(PKT3_NOP, remaining - 2, 0);
*num_dw += remaining - 1;
}
}
assert(((*num_dw + leave_dw_space) & pad_dw_mask) == 0);
}
static int amdgpu_submit_gfx_nop(struct amdgpu_ctx *ctx)
{
struct amdgpu_bo_alloc_request request = {0};
struct drm_amdgpu_bo_list_in bo_list_in;
struct drm_amdgpu_cs_chunk_ib ib_in = {0};
ac_drm_bo bo;
amdgpu_va_handle va_handle = NULL;
struct drm_amdgpu_cs_chunk chunks[2];
struct drm_amdgpu_bo_list_entry list;
unsigned noop_dw_size;
void *cpu = NULL;
uint64_t seq_no;
uint64_t va;
int r;
/* Older amdgpu doesn't report if the reset is complete or not. Detect
* it by submitting a no-op job. If it reports an error, then assume
* that the reset is not complete.
*/
uint32_t temp_ctx_handle;
r = ac_drm_cs_ctx_create2(ctx->aws->dev, AMDGPU_CTX_PRIORITY_NORMAL, &temp_ctx_handle);
if (r)
return r;
request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
request.alloc_size = 4096;
request.phys_alignment = 4096;
r = ac_drm_bo_alloc(ctx->aws->dev, &request, &bo);
if (r)
goto destroy_ctx;
r = ac_drm_va_range_alloc(ctx->aws->dev, amdgpu_gpu_va_range_general,
request.alloc_size, request.phys_alignment,
0, &va, &va_handle,
AMDGPU_VA_RANGE_32_BIT | AMDGPU_VA_RANGE_HIGH);
if (r)
goto destroy_bo;
uint32_t kms_handle;
ac_drm_bo_export(ctx->aws->dev, bo, amdgpu_bo_handle_type_kms, &kms_handle);
r = ac_drm_bo_va_op_raw(ctx->aws->dev, kms_handle, 0, request.alloc_size, va,
AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE,
AMDGPU_VA_OP_MAP);
if (r)
goto destroy_bo;
r = ac_drm_bo_cpu_map(ctx->aws->dev, bo, &cpu);
if (r)
goto destroy_bo;
noop_dw_size = ctx->aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
((uint32_t*)cpu)[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
ac_drm_bo_cpu_unmap(ctx->aws->dev, bo);
list.bo_handle = kms_handle;
list.bo_priority = 0;
bo_list_in.list_handle = ~0;
bo_list_in.bo_number = 1;
bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)&list;
ib_in.ip_type = AMD_IP_GFX;
ib_in.ib_bytes = noop_dw_size * 4;
ib_in.va_start = va;
chunks[0].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
chunks[0].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
chunks[0].chunk_data = (uintptr_t)&bo_list_in;
chunks[1].chunk_id = AMDGPU_CHUNK_ID_IB;
chunks[1].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
chunks[1].chunk_data = (uintptr_t)&ib_in;
r = ac_drm_cs_submit_raw2(ctx->aws->dev, temp_ctx_handle, 0, 2, chunks, &seq_no);
destroy_bo:
if (va_handle)
ac_drm_va_range_free(va_handle);
ac_drm_bo_free(ctx->aws->dev, bo);
destroy_ctx:
ac_drm_cs_ctx_free(ctx->aws->dev, temp_ctx_handle);
return r;
}
static void
amdgpu_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
const char *format, ...)
{
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
/* Don't overwrite the last reset status. */
if (ctx->sw_status != PIPE_NO_RESET)
return;
ctx->sw_status = status;
if (!ctx->allow_context_lost) {
va_list args;
va_start(args, format);
vfprintf(stderr, format, args);
va_end(args);
/* Non-robust contexts are allowed to terminate the process. The only alternative is
* to skip command submission, which would look like a freeze because nothing is drawn,
* i.e. a hang without any reset.
*/
abort();
}
}
static enum pipe_reset_status
amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only,
bool *needs_reset, bool *reset_completed)
{
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
if (needs_reset)
*needs_reset = false;
if (reset_completed)
*reset_completed = false;
/* Return a failure due to a GPU hang. */
uint64_t flags;
if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) {
/* If the caller is only interested in full resets (= wants to ignore soft
* recoveries), the SW reset status can serve as a quick first check.
*/
return PIPE_NO_RESET;
}
/*
* ctx->sw_status is updated on alloc/ioctl failures.
*
* We only rely on amdgpu_cs_query_reset_state2 to tell us
* that the context reset is complete.
*/
if (ctx->sw_status != PIPE_NO_RESET) {
int r = ac_drm_cs_query_reset_state2(ctx->aws->dev, ctx->ctx_handle, &flags);
if (!r) {
if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
if (reset_completed) {
/* The ARB_robustness spec says:
*
* If a reset status other than NO_ERROR is returned and subsequent
* calls return NO_ERROR, the context reset was encountered and
* completed. If a reset status is repeatedly returned, the context may
* be in the process of resetting.
*
* Starting with drm_minor >= 54 amdgpu reports if the reset is complete,
* so don't do anything special. On older kernels, submit a no-op cs. If it
* succeeds then assume the reset is complete.
*/
if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS))
*reset_completed = true;
if (ctx->aws->info.drm_minor < 54 && ctx->aws->info.has_graphics)
*reset_completed = amdgpu_submit_gfx_nop(ctx) == 0;
}
}
} else {
fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r);
}
/* Return a failure due to SW issues. */
if (needs_reset)
*needs_reset = true;
return ctx->sw_status;
}
if (needs_reset)
*needs_reset = false;
return PIPE_NO_RESET;
}
/* COMMAND SUBMISSION */
static bool amdgpu_cs_has_user_fence(struct amdgpu_cs *acs)
{
return acs->ip_type == AMD_IP_GFX ||
acs->ip_type == AMD_IP_COMPUTE ||
acs->ip_type == AMD_IP_SDMA;
}
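/* Number of dwords reserved at the end of the IB for the chaining INDIRECT_BUFFER packet
* (header + 2 address dwords + 1 size dword), see amdgpu_cs_check_space.
*/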
static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *cs)
{
if (cs->has_chaining)
return 4; /* for chaining */
return 0;
}
static struct amdgpu_cs_buffer *
amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
struct amdgpu_buffer_list *list)
{
int num_buffers = list->num_buffers;
struct amdgpu_cs_buffer *buffers = list->buffers;
unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
int i = cs->buffer_indices_hashlist[hash];
/* A negative index means no buffer with this hash has been added, i.e. not found. */
if (i < 0)
return NULL;
if (i < num_buffers && buffers[i].bo == bo)
return &buffers[i];
/* Hash collision, look for the BO in the list of buffers linearly. */
for (int i = num_buffers - 1; i >= 0; i--) {
if (buffers[i].bo == bo) {
/* Put this buffer in the hash list.
* This will prevent additional hash collisions if there are
* several consecutive lookup_buffer calls for the same buffer.
*
* Example: Assuming buffers A,B,C collide in the hash list,
* the following sequence of buffers:
* AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
* will collide here: ^ and here: ^,
* meaning that we should get very few collisions in the end. */
cs->buffer_indices_hashlist[hash] = i & 0x7fff;
return &buffers[i];
}
}
return NULL;
}
struct amdgpu_cs_buffer *
amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
{
return amdgpu_lookup_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)]);
}
static struct amdgpu_cs_buffer *
amdgpu_do_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
struct amdgpu_buffer_list *list, bool add_ref)
{
/* New buffer, check if the backing array is large enough. */
if (unlikely(list->num_buffers >= list->max_buffers)) {
unsigned new_max =
MAX2(list->max_buffers + 16, (unsigned)(list->max_buffers * 1.3));
struct amdgpu_cs_buffer *new_buffers;
new_buffers = (struct amdgpu_cs_buffer *)
REALLOC(list->buffers, list->max_buffers * sizeof(*new_buffers),
new_max * sizeof(*new_buffers));
if (!new_buffers) {
fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n");
return NULL;
}
list->max_buffers = new_max;
list->buffers = new_buffers;
}
unsigned idx = list->num_buffers++;
struct amdgpu_cs_buffer *buffer = &list->buffers[idx];
if (add_ref)
p_atomic_inc(&bo->base.reference.count);
buffer->bo = bo;
buffer->usage = 0;
unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
cs->buffer_indices_hashlist[hash] = idx & 0x7fff;
return buffer;
}
static struct amdgpu_cs_buffer *
amdgpu_lookup_or_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
struct amdgpu_buffer_list *list, bool add_ref)
{
struct amdgpu_cs_buffer *buffer = amdgpu_lookup_buffer(cs, bo, list);
return buffer ? buffer : amdgpu_do_add_buffer(cs, bo, list, add_ref);
}
static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs,
struct pb_buffer_lean *buf,
unsigned usage,
enum radeon_bo_domain domains)
{
/* Don't use the "domains" parameter. Amdgpu doesn't support changing
* the buffer placement during command submission.
*/
struct amdgpu_cs_context *cs = (struct amdgpu_cs_context*)rcs->csc;
struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
struct amdgpu_cs_buffer *buffer;
/* Fast exit for no-op calls.
* This is very effective with suballocators and linear uploaders that
* are outside of the winsys.
*/
if (bo == cs->last_added_bo &&
(usage & cs->last_added_bo_usage) == usage)
return 0;
buffer = amdgpu_lookup_or_add_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)], true);
if (!buffer)
return 0;
buffer->usage |= usage;
cs->last_added_bo_usage = buffer->usage;
cs->last_added_bo = bo;
return 0;
}
static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *aws,
struct amdgpu_ib *main_ib,
struct amdgpu_cs *cs)
{
struct pb_buffer_lean *pb;
uint8_t *mapped;
unsigned buffer_size;
/* Always create a buffer that is at least as large as the maximum seen IB size,
* aligned to a power of two.
*/
buffer_size = util_next_power_of_two(main_ib->max_ib_bytes);
/* Multiply by 4 to reduce internal fragmentation if chaining is not available. */
if (!cs->has_chaining)
buffer_size *= 4;
const unsigned min_size = MAX2(main_ib->max_check_space_size, 32 * 1024);
/* This is the maximum size that fits into the INDIRECT_BUFFER packet. */
const unsigned max_size = 2 * 1024 * 1024;
buffer_size = MIN2(buffer_size, max_size);
buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */
/* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU.
* The speed of writing to GTT WC is somewhere between no difference and very slow, while
* VRAM is very slow a lot more often.
*
* Bypass GL2 because command buffers are read only once. Bypassing GL2 has better latency
* and doesn't have to wait for cached GL2 requests to be processed.
*/
enum radeon_bo_domain domain = RADEON_DOMAIN_GTT;
unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING |
RADEON_FLAG_GL2_BYPASS;
if (cs->ip_type == AMD_IP_GFX ||
cs->ip_type == AMD_IP_COMPUTE ||
cs->ip_type == AMD_IP_SDMA) {
/* Avoids hangs with "rendercheck -t cacomposite -f a8r8g8b8" via glamor
* on Navi 14
*/
flags |= RADEON_FLAG_32BIT;
}
pb = amdgpu_bo_create(aws, buffer_size,
aws->info.gart_page_size,
domain, (radeon_bo_flag)flags);
if (!pb)
return false;
mapped = (uint8_t*)amdgpu_bo_map(&aws->dummy_sws.base, pb, NULL, PIPE_MAP_WRITE);
if (!mapped) {
radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
return false;
}
radeon_bo_reference(&aws->dummy_sws.base, &main_ib->big_buffer, pb);
radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
main_ib->gpu_address = amdgpu_bo_get_va(main_ib->big_buffer);
main_ib->big_buffer_cpu_ptr = mapped;
main_ib->used_ib_space = 0;
return true;
}
static bool amdgpu_get_new_ib(struct amdgpu_winsys *aws,
struct radeon_cmdbuf *rcs,
struct amdgpu_ib *main_ib,
struct amdgpu_cs *cs)
{
struct drm_amdgpu_cs_chunk_ib *chunk_ib = &cs->csc->chunk_ib[IB_MAIN];
/* This is the minimum size of a contiguous IB. */
unsigned ib_size = 16 * 1024;
/* Always allocate at least the size of the biggest cs_check_space call,
* because precisely the last call might have requested this size.
*/
ib_size = MAX2(ib_size, main_ib->max_check_space_size);
if (!cs->has_chaining) {
ib_size = MAX2(ib_size, MIN2(util_next_power_of_two(main_ib->max_ib_bytes),
IB_MAX_SUBMIT_BYTES));
}
/* Decay the IB buffer size over time, so that memory usage decreases after
* a temporary peak.
*/
main_ib->max_ib_bytes = main_ib->max_ib_bytes - main_ib->max_ib_bytes / 32;
rcs->prev_dw = 0;
rcs->num_prev = 0;
rcs->current.cdw = 0;
rcs->current.buf = NULL;
/* Allocate a new buffer for IBs if the current buffer is all used. */
if (!main_ib->big_buffer ||
main_ib->used_ib_space + ib_size > main_ib->big_buffer->size) {
if (!amdgpu_ib_new_buffer(aws, main_ib, cs))
return false;
}
chunk_ib->va_start = main_ib->gpu_address + main_ib->used_ib_space;
chunk_ib->ib_bytes = 0;
/* ib_bytes is in dwords and the conversion to bytes will be done before
* the CS ioctl. */
main_ib->ptr_ib_size = &chunk_ib->ib_bytes;
main_ib->is_chained_ib = false;
amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
(radeon_bo_flag)(RADEON_USAGE_READ | RADEON_PRIO_IB),
(radeon_bo_domain)0);
rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
cs->csc->ib_main_addr = rcs->current.buf;
ib_size = main_ib->big_buffer->size - main_ib->used_ib_space;
rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs);
return true;
}
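/* Write the final command count through ptr_ib_size, which points either to the size
* dword of the previous chaining INDIRECT_BUFFER packet (chained IBs) or to
* chunk_ib->ib_bytes (see amdgpu_get_new_ib).
*/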
static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib)
{
if (ib->is_chained_ib) {
*ib->ptr_ib_size = rcs->current.cdw |
S_3F2_CHAIN(1) | S_3F2_VALID(1) |
S_3F2_PRE_ENA(((struct amdgpu_cs*)ib)->preamble_ib_bo != NULL);
} else {
*ib->ptr_ib_size = rcs->current.cdw;
}
}
static void amdgpu_ib_finalize(struct amdgpu_winsys *aws, struct radeon_cmdbuf *rcs,
struct amdgpu_ib *ib, enum amd_ip_type ip_type)
{
amdgpu_set_ib_size(rcs, ib);
ib->used_ib_space += rcs->current.cdw * 4;
ib->used_ib_space = align(ib->used_ib_space, aws->info.ip[ip_type].ib_alignment);
ib->max_ib_bytes = MAX2(ib->max_ib_bytes, (rcs->prev_dw + rcs->current.cdw) * 4);
}
static bool amdgpu_init_cs_context(struct amdgpu_winsys *aws,
struct amdgpu_cs_context *cs,
enum amd_ip_type ip_type)
{
for (unsigned i = 0; i < ARRAY_SIZE(cs->chunk_ib); i++) {
cs->chunk_ib[i].ip_type = ip_type;
cs->chunk_ib[i].flags = 0;
if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
/* The kernel shouldn't invalidate L2 and vL1. The proper place for cache invalidation
* is the beginning of IBs because completion of an IB doesn't care about the state of
* GPU caches, only the beginning of an IB does. Draw calls from multiple IBs can be
* executed in parallel, so draw calls from the current IB can finish after the next IB
* starts drawing, and so the cache flush at the end of IBs is usually late and thus
* useless.
*/
cs->chunk_ib[i].flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
}
}
cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE;
cs->last_added_bo = NULL;
return true;
}
static void cleanup_fence_list(struct amdgpu_fence_list *fences)
{
for (unsigned i = 0; i < fences->num; i++)
amdgpu_fence_drop_reference(fences->list[i]);
fences->num = 0;
}
static void amdgpu_cs_context_cleanup_buffers(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
{
for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) {
struct amdgpu_cs_buffer *buffers = cs->buffer_lists[i].buffers;
unsigned num_buffers = cs->buffer_lists[i].num_buffers;
for (unsigned j = 0; j < num_buffers; j++)
amdgpu_winsys_bo_drop_reference(aws, buffers[j].bo);
cs->buffer_lists[i].num_buffers = 0;
}
}
static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
{
cs->seq_no_dependencies.valid_fence_mask = 0;
cleanup_fence_list(&cs->syncobj_dependencies);
cleanup_fence_list(&cs->syncobj_to_signal);
amdgpu_fence_reference(&cs->fence, NULL);
cs->last_added_bo = NULL;
}
static void amdgpu_destroy_cs_context(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
{
amdgpu_cs_context_cleanup_buffers(aws, cs);
amdgpu_cs_context_cleanup(aws, cs);
for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++)
FREE(cs->buffer_lists[i].buffers);
FREE(cs->syncobj_dependencies.list);
FREE(cs->syncobj_to_signal.list);
}
static enum amd_ip_type amdgpu_cs_get_ip_type(struct radeon_cmdbuf *rcs)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
return cs->ip_type;
}
static bool ip_uses_alt_fence(enum amd_ip_type ip_type)
{
/* The alt_fence path can be tested thoroughly by enabling it for GFX here. */
return ip_type == AMD_IP_VCN_DEC ||
ip_type == AMD_IP_VCN_ENC ||
ip_type == AMD_IP_VCN_JPEG;
}
static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
if (!cs)
return;
amdgpu_cs_sync_flush(rcs);
util_queue_fence_destroy(&cs->flush_completed);
p_atomic_dec(&cs->aws->num_cs);
radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->preamble_ib_bo, NULL);
radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->main_ib.big_buffer, NULL);
FREE(rcs->prev);
amdgpu_destroy_cs_context(cs->aws, &cs->csc1);
amdgpu_destroy_cs_context(cs->aws, &cs->csc2);
amdgpu_fence_reference(&cs->next_fence, NULL);
FREE(cs);
}
static bool
amdgpu_cs_create(struct radeon_cmdbuf *rcs,
struct radeon_winsys_ctx *rwctx,
enum amd_ip_type ip_type,
void (*flush)(void *ctx, unsigned flags,
struct pipe_fence_handle **fence),
void *flush_ctx)
{
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
struct amdgpu_cs *cs;
cs = CALLOC_STRUCT(amdgpu_cs);
if (!cs) {
return false;
}
util_queue_fence_init(&cs->flush_completed);
cs->aws = ctx->aws;
cs->ctx = ctx;
cs->flush_cs = flush;
cs->flush_data = flush_ctx;
cs->ip_type = ip_type;
cs->noop = ctx->aws->noop_cs;
cs->has_chaining = ctx->aws->info.gfx_level >= GFX7 &&
(ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
/* Compute the queue index by counting the IPs that have queues. */
assert(ip_type < ARRAY_SIZE(ctx->aws->info.ip));
assert(ctx->aws->info.ip[ip_type].num_queues);
if (ip_uses_alt_fence(ip_type)) {
cs->queue_index = INT_MAX;
cs->uses_alt_fence = true;
} else {
cs->queue_index = 0;
for (unsigned i = 0; i < ARRAY_SIZE(ctx->aws->info.ip); i++) {
if (!ctx->aws->info.ip[i].num_queues || ip_uses_alt_fence((amd_ip_type)i))
continue;
if (i == ip_type)
break;
cs->queue_index++;
}
assert(cs->queue_index < AMDGPU_MAX_QUEUES);
}
ac_drm_cs_chunk_fence_info_to_data(cs->ctx->user_fence_bo_kms_handle, cs->ip_type * 4,
(struct drm_amdgpu_cs_chunk_data*)&cs->fence_chunk);
if (!amdgpu_init_cs_context(ctx->aws, &cs->csc1, ip_type)) {
FREE(cs);
return false;
}
if (!amdgpu_init_cs_context(ctx->aws, &cs->csc2, ip_type)) {
amdgpu_destroy_cs_context(ctx->aws, &cs->csc1);
FREE(cs);
return false;
}
memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
/* Set the first submission context as current. */
rcs->csc = cs->csc = &cs->csc1;
cs->cst = &cs->csc2;
/* Assign to both amdgpu_cs_context; only csc will use it. */
cs->csc1.buffer_indices_hashlist = cs->buffer_indices_hashlist;
cs->csc2.buffer_indices_hashlist = cs->buffer_indices_hashlist;
cs->csc1.aws = ctx->aws;
cs->csc2.aws = ctx->aws;
p_atomic_inc(&ctx->aws->num_cs);
if (!amdgpu_get_new_ib(ctx->aws, rcs, &cs->main_ib, cs))
goto fail;
/* Currently only the gfx, compute and sdma IPs support user queues. */
if (cs->aws->info.use_userq && ip_type <= AMD_IP_SDMA) {
if (!amdgpu_userq_init(cs->aws, &cs->aws->queues[cs->queue_index].userq, ip_type))
goto fail;
}
rcs->priv = cs;
return true;
fail:
amdgpu_cs_destroy(rcs);
return false;
}
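/* Set up a preamble IB that is submitted in front of the main IB of every CS and mark
* the main IB as preemptible (AMDGPU_IB_FLAG_PREEMPT) for mid-command-buffer preemption.
*/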
static bool
amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
unsigned preamble_num_dw)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_winsys *aws = cs->aws;
struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2};
unsigned size = align(preamble_num_dw * 4, aws->info.ip[AMD_IP_GFX].ib_alignment);
struct pb_buffer_lean *preamble_bo;
uint32_t *map;
/* Create the preamble IB buffer. */
preamble_bo = amdgpu_bo_create(aws, size, aws->info.ip[AMD_IP_GFX].ib_alignment,
RADEON_DOMAIN_VRAM,
(radeon_bo_flag)
(RADEON_FLAG_NO_INTERPROCESS_SHARING |
RADEON_FLAG_GTT_WC));
if (!preamble_bo)
return false;
map = (uint32_t*)amdgpu_bo_map(&aws->dummy_sws.base, preamble_bo, NULL,
(pipe_map_flags)(PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY));
if (!map) {
radeon_bo_reference(&aws->dummy_sws.base, &preamble_bo, NULL);
return false;
}
/* Upload the preamble IB. */
memcpy(map, preamble_ib, preamble_num_dw * 4);
/* Pad the IB. */
amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, map, &preamble_num_dw, 0);
amdgpu_bo_unmap(&aws->dummy_sws.base, preamble_bo);
for (unsigned i = 0; i < 2; i++) {
csc[i]->chunk_ib[IB_PREAMBLE].va_start = amdgpu_bo_get_va(preamble_bo);
csc[i]->chunk_ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4;
csc[i]->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT;
}
assert(!cs->preamble_ib_bo);
cs->preamble_ib_bo = preamble_bo;
amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
return true;
}
static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
{
return true;
}
static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_ib *main_ib = &cs->main_ib;
assert(rcs->current.cdw <= rcs->current.max_dw);
unsigned projected_size_dw = rcs->prev_dw + rcs->current.cdw + dw;
if (projected_size_dw * 4 > IB_MAX_SUBMIT_BYTES)
return false;
if (rcs->current.max_dw - rcs->current.cdw >= dw)
return true;
unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
/* 125% of the size for IB epilog. */
unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
main_ib->max_check_space_size = MAX2(main_ib->max_check_space_size, safe_byte_size);
main_ib->max_ib_bytes = MAX2(main_ib->max_ib_bytes, projected_size_dw * 4);
if (!cs->has_chaining)
return false;
/* Allocate a new chunk */
if (rcs->num_prev >= rcs->max_prev) {
unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
struct radeon_cmdbuf_chunk *new_prev;
new_prev = (struct radeon_cmdbuf_chunk*)
REALLOC(rcs->prev, sizeof(*new_prev) * rcs->max_prev,
sizeof(*new_prev) * new_max_prev);
if (!new_prev)
return false;
rcs->prev = new_prev;
rcs->max_prev = new_max_prev;
}
if (!amdgpu_ib_new_buffer(cs->aws, main_ib, cs))
return false;
assert(main_ib->used_ib_space == 0);
uint64_t va = main_ib->gpu_address;
/* This space was originally reserved. */
rcs->current.max_dw += cs_epilog_dw;
/* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
amdgpu_pad_gfx_compute_ib(cs->aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 4);
radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
radeon_emit(rcs, va);
radeon_emit(rcs, va >> 32);
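/* Reserve the size dword of the INDIRECT_BUFFER packet. It's patched by
* amdgpu_set_ib_size once the size of the next IB chunk is known.
*/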
uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];
assert((rcs->current.cdw & cs->aws->info.ip[cs->ip_type].ib_pad_dw_mask) == 0);
assert(rcs->current.cdw <= rcs->current.max_dw);
amdgpu_set_ib_size(rcs, main_ib);
main_ib->ptr_ib_size = new_ptr_ib_size;
main_ib->is_chained_ib = true;
/* Hook up the new chunk */
rcs->prev[rcs->num_prev].buf = rcs->current.buf;
rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
rcs->num_prev++;
rcs->prev_dw += rcs->current.cdw;
rcs->current.cdw = 0;
rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
rcs->current.max_dw = main_ib->big_buffer->size / 4 - cs_epilog_dw;
amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
return true;
}
static void amdgpu_add_slab_backing_buffers(struct amdgpu_cs_context *cs)
{
unsigned num_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
struct amdgpu_cs_buffer *buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
for (unsigned i = 0; i < num_buffers; i++) {
struct amdgpu_cs_buffer *slab_buffer = &buffers[i];
struct amdgpu_cs_buffer *real_buffer =
amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(slab_buffer->bo)->b,
&cs->buffer_lists[AMDGPU_BO_REAL], true);
/* We need to set the usage because it determines the BO priority.
*
* Mask out the SYNCHRONIZED flag because the backing buffer of slabs shouldn't add its
* BO fences to fence dependencies. Only the slab entries should do that.
*/
real_buffer->usage |= slab_buffer->usage & ~RADEON_USAGE_SYNCHRONIZED;
}
}
static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
struct radeon_bo_list_item *list)
{
struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
/* We do this in the CS thread, but since we need to return the final usage of all buffers
* here, do it here too. There is no harm in doing it again in the CS thread.
*/
amdgpu_add_slab_backing_buffers(cs);
struct amdgpu_buffer_list *real_buffers = &cs->buffer_lists[AMDGPU_BO_REAL];
unsigned num_real_buffers = real_buffers->num_buffers;
#if HAVE_AMDGPU_VIRTIO
assert(!cs->aws->info.is_virtio);
#endif
if (list) {
for (unsigned i = 0; i < num_real_buffers; i++) {
list[i].bo_size = real_buffers->buffers[i].bo->base.size;
list[i].vm_address =
amdgpu_va_get_start_addr(get_real_bo(real_buffers->buffers[i].bo)->va_handle);
list[i].priority_usage = real_buffers->buffers[i].usage;
}
}
return num_real_buffers;
}
static void add_fence_to_list(struct amdgpu_fence_list *fences,
struct amdgpu_fence *fence)
{
unsigned idx = fences->num++;
if (idx >= fences->max) {
unsigned size;
const unsigned increment = 8;
fences->max = idx + increment;
size = fences->max * sizeof(fences->list[0]);
fences->list = (struct pipe_fence_handle**)realloc(fences->list, size);
}
amdgpu_fence_set_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
}
static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rcs,
struct pipe_fence_handle *pfence)
{
struct amdgpu_cs *acs = amdgpu_cs(rcs);
struct amdgpu_cs_context *cs = acs->csc;
struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
util_queue_fence_wait(&fence->submitted);
if (!fence->imported) {
/* Ignore idle fences. This will only check the user fence in memory. */
if (!amdgpu_fence_wait((struct pipe_fence_handle *)fence, 0, false)) {
add_seq_no_to_list(acs->aws, &cs->seq_no_dependencies, fence->queue_index,
fence->queue_seq_no);
}
}
else
add_fence_to_list(&cs->syncobj_dependencies, fence);
}
static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws,
struct amdgpu_cs_context *cs,
unsigned queue_index_bit,
struct amdgpu_seq_no_fences *dependencies,
struct amdgpu_winsys_bo *bo, unsigned usage)
{
if (usage & RADEON_USAGE_SYNCHRONIZED) {
/* Add BO fences from queues other than 'queue_index' to dependencies. */
u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~queue_index_bit) {
add_seq_no_to_list(ws, dependencies, other_queue_idx,
bo->fences.seq_no[other_queue_idx]);
}
if (bo->alt_fence)
add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)bo->alt_fence);
}
}
static void amdgpu_set_bo_seq_no(unsigned queue_index, struct amdgpu_winsys_bo *bo,
uint_seq_no new_queue_seq_no)
{
bo->fences.seq_no[queue_index] = new_queue_seq_no;
bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index);
}
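/* Fill one kernel BO list entry. The highest RADEON_PRIO_* bit set in 'usage' is mapped
* to the kernel BO priority.
*/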
static void amdgpu_add_to_kernel_bo_list(struct drm_amdgpu_bo_list_entry *bo_entry,
struct amdgpu_winsys_bo *bo, unsigned usage)
{
bo_entry->bo_handle = get_real_bo(bo)->kms_handle;
bo_entry->bo_priority = (util_last_bit(usage & RADEON_ALL_PRIORITIES) - 1) / 2;
}
static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws,
struct pipe_fence_handle *fence)
{
struct amdgpu_cs *acs = amdgpu_cs(rws);
struct amdgpu_cs_context *cs = acs->csc;
add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence);
}
static int amdgpu_cs_submit_ib_kernelq(struct amdgpu_cs *acs,
unsigned num_real_buffers,
struct drm_amdgpu_bo_list_entry *bo_list_real,
uint64_t *seq_no)
{
struct amdgpu_winsys *aws = acs->aws;
struct amdgpu_cs_context *cs = acs->cst;
struct drm_amdgpu_bo_list_in bo_list_in;
struct drm_amdgpu_cs_chunk chunks[8];
unsigned num_chunks = 0;
/* BO list */
bo_list_in.operation = ~0;
bo_list_in.list_handle = ~0;
bo_list_in.bo_number = num_real_buffers;
bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)bo_list_real;
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
num_chunks++;
/* Syncobj dependencies. */
unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
if (num_syncobj_dependencies) {
struct drm_amdgpu_cs_chunk_sem *sem_chunk =
(struct drm_amdgpu_cs_chunk_sem *)
alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
struct amdgpu_fence *fence =
(struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
assert(util_queue_fence_is_signalled(&fence->submitted));
sem_chunk[i].handle = fence->syncobj;
}
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
num_chunks++;
}
/* Syncobj signals. */
unsigned num_syncobj_to_signal = 1 + cs->syncobj_to_signal.num;
struct drm_amdgpu_cs_chunk_sem *sem_chunk =
(struct drm_amdgpu_cs_chunk_sem *)
alloca(num_syncobj_to_signal * sizeof(sem_chunk[0]));
for (unsigned i = 0; i < num_syncobj_to_signal - 1; i++) {
struct amdgpu_fence *fence =
(struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
sem_chunk[i].handle = fence->syncobj;
}
sem_chunk[cs->syncobj_to_signal.num].handle = ((struct amdgpu_fence*)cs->fence)->syncobj;
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_to_signal;
chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
num_chunks++;
if (aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.shadow_va) {
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_CP_GFX_SHADOW;
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_cp_gfx_shadow) / 4;
chunks[num_chunks].chunk_data = (uintptr_t)&acs->mcbp_fw_shadow_chunk;
num_chunks++;
}
/* Fence */
if (amdgpu_cs_has_user_fence(acs)) {
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
num_chunks++;
}
/* IB */
if (cs->chunk_ib[IB_PREAMBLE].ib_bytes) {
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_PREAMBLE];
num_chunks++;
}
/* IB */
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_MAIN];
num_chunks++;
if (cs->secure) {
cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
cs->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
} else {
cs->chunk_ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
cs->chunk_ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
}
assert(num_chunks <= 8);
/* Submit the command buffer.
*
* The kernel returns -ENOMEM with many parallel processes using GDS such as test suites
* quite often, but it eventually succeeds after enough attempts. This happens frequently
* with dEQP using NGG streamout.
*/
int r = 0;
do {
/* Wait 1 ms and try again. */
if (r == -ENOMEM)
os_time_sleep(1000);
r = ac_drm_cs_submit_raw2(aws->dev, acs->ctx->ctx_handle, 0, num_chunks, chunks, seq_no);
} while (r == -ENOMEM);
return r;
}
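/* Write the user queue ring packets for one submission: wait for the given fences
* (FENCE_WAIT_MULTI), flush HDP, execute the main IB via INDIRECT_BUFFER, write the
* user fence with RELEASE_MEM and signal the kernel with a protected fence packet.
*/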
static void amdgpu_cs_add_userq_packets(struct amdgpu_userq *userq,
struct amdgpu_cs_context *cs,
uint64_t num_fences,
struct drm_amdgpu_userq_fence_info *fence_info)
{
amdgpu_pkt_begin();
if (userq->ip_type == AMD_IP_GFX || userq->ip_type == AMD_IP_COMPUTE) {
if (num_fences) {
unsigned num_fences_in_iter;
/* The FENCE_WAIT_MULTI packet supports at most 32 fences. */
for (unsigned i = 0; i < num_fences; i = i + 32) {
num_fences_in_iter = (i + 32 > num_fences) ? num_fences - i : 32;
amdgpu_pkt_add_dw(PKT3(PKT3_FENCE_WAIT_MULTI, num_fences_in_iter * 4, 0));
amdgpu_pkt_add_dw(S_D10_ENGINE_SEL(1) | S_D10_POLL_INTERVAL(4) | S_D10_PREEMPTABLE(1));
for (unsigned j = 0; j < num_fences_in_iter; j++) {
amdgpu_pkt_add_dw(fence_info[i + j].va);
amdgpu_pkt_add_dw(fence_info[i + j].va >> 32);
amdgpu_pkt_add_dw(fence_info[i + j].value);
amdgpu_pkt_add_dw(fence_info[i + j].value >> 32);
}
}
}
amdgpu_pkt_add_dw(PKT3(PKT3_HDP_FLUSH, 0, 0));
amdgpu_pkt_add_dw(0x0);
amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
amdgpu_pkt_add_dw(cs->chunk_ib[IB_MAIN].va_start);
amdgpu_pkt_add_dw(cs->chunk_ib[IB_MAIN].va_start >> 32);
if (userq->ip_type == AMD_IP_GFX)
amdgpu_pkt_add_dw((cs->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_INHERIT_VMID_MQD_GFX(1));
else
amdgpu_pkt_add_dw((cs->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_VALID_COMPUTE(1) |
S_3F3_INHERIT_VMID_MQD_COMPUTE(1));
/* Add 8 for the release mem packet and 2 for the protected fence signal packet.
* userq_fence_seq_num is calculated this way to match the kernel fence that is
* returned by the userq_wait ioctl.
*/
userq->user_fence_seq_num = *userq->wptr_bo_map + __num_dw_written + 8 + 2;
/* add release mem for user fence */
amdgpu_pkt_add_dw(PKT3(PKT3_RELEASE_MEM, 6, 0));
amdgpu_pkt_add_dw(S_490_EVENT_TYPE(V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT) |
S_490_EVENT_INDEX(5) | S_490_GLM_WB(1) | S_490_GLM_INV(1) |
S_490_GL2_WB(1) | S_490_SEQ(1) | S_490_CACHE_POLICY(3));
amdgpu_pkt_add_dw(S_030358_DATA_SEL(2));
amdgpu_pkt_add_dw(userq->user_fence_va);
amdgpu_pkt_add_dw(userq->user_fence_va >> 32);
amdgpu_pkt_add_dw(userq->user_fence_seq_num);
amdgpu_pkt_add_dw(userq->user_fence_seq_num >> 32);
amdgpu_pkt_add_dw(0);
/* Protected signal packet. This is a trusted RELEASE_MEM packet, i.e. the fence buffer
* is only accessible from the kernel through VMID 0.
*/
amdgpu_pkt_add_dw(PKT3(PKT3_PROTECTED_FENCE_SIGNAL, 0, 0));
amdgpu_pkt_add_dw(0);
} else {
fprintf(stderr, "amdgpu: unsupported userq ip submission = %d\n", userq->ip_type);
}
amdgpu_pkt_end();
}
static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq,
struct amdgpu_cs *acs,
uint32_t *shared_buf_kms_handles_write,
unsigned num_shared_buf_write,
uint32_t *shared_buf_kms_handles_read,
unsigned num_shared_buf_read,
uint64_t *seq_no,
uint64_t vm_timeline_point)
{
int r = 0;
struct amdgpu_winsys *aws = acs->aws;
struct amdgpu_cs_context *cs = acs->cst;
/* Syncobj dependencies. */
unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
uint32_t *syncobj_dependencies_list =
(uint32_t*)alloca(num_syncobj_dependencies * sizeof(uint32_t));
/* Currently only 1 vm timeline syncobj can be a dependency. */
uint16_t num_syncobj_timeline_dependencies = 1;
uint32_t syncobj_timeline_dependency;
uint64_t syncobj_timeline_dependency_point;
if (num_syncobj_dependencies) {
for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
struct amdgpu_fence *fence =
(struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
assert(util_queue_fence_is_signalled(&fence->submitted));
syncobj_dependencies_list[i] = fence->syncobj;
}
}
syncobj_timeline_dependency = aws->vm_timeline_syncobj;
syncobj_timeline_dependency_point = vm_timeline_point;
/* Syncobj signals. Adding 1 for cs submission fence. */
unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num + 1;
uint32_t *syncobj_signal_list =
(uint32_t*)alloca(num_syncobj_to_signal * sizeof(uint32_t));
for (unsigned i = 0; i < cs->syncobj_to_signal.num; i++) {
struct amdgpu_fence *fence =
(struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
syncobj_signal_list[i] = fence->syncobj;
}
syncobj_signal_list[num_syncobj_to_signal - 1] = ((struct amdgpu_fence*)cs->fence)->syncobj;
struct drm_amdgpu_userq_fence_info *fence_info;
struct drm_amdgpu_userq_wait userq_wait_data = {
.syncobj_handles = (uintptr_t)syncobj_dependencies_list,
.syncobj_timeline_handles = (uintptr_t)&syncobj_timeline_dependency,
.syncobj_timeline_points = (uintptr_t)&syncobj_timeline_dependency_point,
.bo_read_handles = (uintptr_t)shared_buf_kms_handles_read,
.bo_write_handles = (uintptr_t)shared_buf_kms_handles_write,
.num_syncobj_timeline_handles = num_syncobj_timeline_dependencies,
.num_fences = 0,
.num_syncobj_handles = num_syncobj_dependencies,
.num_bo_read_handles = num_shared_buf_read,
.num_bo_write_handles = num_shared_buf_write,
.out_fences = (uintptr_t)NULL,
};
/*
* Synchronization of shared buffers follows these rules:
* - read-only buffers wait for all previous writes to complete
* - written (including read-write) buffers wait for all previous reads to complete
* To implement this, amdgpu_userq_wait() is called before submitting a job and
* amdgpu_userq_signal() afterwards to indicate completion.
*/
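/* ac_drm_userq_wait is called twice: the first call only returns the number of fences
* to wait for, the second call fills fence_info with them.
*/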
r = ac_drm_userq_wait(aws->dev, &userq_wait_data);
if (r)
fprintf(stderr, "amdgpu: getting wait num_fences failed\n");
fence_info = (struct drm_amdgpu_userq_fence_info*)
alloca(userq_wait_data.num_fences * sizeof(struct drm_amdgpu_userq_fence_info));
userq_wait_data.out_fences = (uintptr_t)fence_info;
r = ac_drm_userq_wait(aws->dev, &userq_wait_data);
if (r)
fprintf(stderr, "amdgpu: getting wait fences failed\n");
simple_mtx_lock(&userq->lock);
amdgpu_cs_add_userq_packets(userq, cs, userq_wait_data.num_fences, fence_info);
struct drm_amdgpu_userq_signal userq_signal_data = {
.queue_id = userq->userq_handle,
.syncobj_handles = (uintptr_t)syncobj_signal_list,
.num_syncobj_handles = num_syncobj_to_signal,
.bo_read_handles = (uintptr_t)shared_buf_kms_handles_read,
.bo_write_handles = (uintptr_t)shared_buf_kms_handles_write,
.num_bo_read_handles = num_shared_buf_read,
.num_bo_write_handles = num_shared_buf_write,
};
r = ac_drm_userq_signal(aws->dev, &userq_signal_data);
if (!r)
userq->doorbell_bo_map[AMDGPU_USERQ_DOORBELL_INDEX] = *userq->wptr_bo_map;
*seq_no = userq->user_fence_seq_num;
simple_mtx_unlock(&userq->lock);
return r;
}
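/* How submissions are made and how BO usage is tracked:
* - KERNELQ: kernel queues, BO usage tracked with per-queue sequence numbers
* - KERNELQ_ALT_FENCE: kernel queues that track BO usage via amdgpu_winsys_bo::alt_fence
* (used by the VCN IPs, see ip_uses_alt_fence)
* - USERQ: user-mode queues, submitted by writing packets and ringing the doorbell,
* synchronized with ac_drm_userq_wait/ac_drm_userq_signal
*/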
enum queue_type {
KERNELQ,
KERNELQ_ALT_FENCE,
USERQ,
};
/* The template parameter determines whether the queue should skip code used by the default queue
* system that's based on sequence numbers, and instead use and update amdgpu_winsys_bo::alt_fence
* for all BOs.
*/
template<enum queue_type queue_type>
static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
{
struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
struct amdgpu_winsys *aws = acs->aws;
struct amdgpu_cs_context *cs = acs->cst;
int r;
uint64_t seq_no = 0;
bool has_user_fence = amdgpu_cs_has_user_fence(acs);
/* The maximum timeline point of VM updates for all BOs used in this submit. */
uint64_t vm_timeline_point = 0;
simple_mtx_lock(&aws->bo_fence_lock);
unsigned queue_index;
struct amdgpu_queue *queue;
uint_seq_no prev_seq_no, next_seq_no;
if (queue_type != KERNELQ_ALT_FENCE) {
queue_index = acs->queue_index;
queue = &aws->queues[queue_index];
prev_seq_no = queue->latest_seq_no;
/* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno,
* but the values aren't related.
*/
next_seq_no = prev_seq_no + 1;
/* Wait for the oldest fence to signal. This should always check the user fence, then wait
* via the ioctl. We have to do this because we are going to release the oldest fence and
* replace it with the latest fence in the ring.
*/
struct pipe_fence_handle **oldest_fence =
&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE];
if (*oldest_fence) {
if (!amdgpu_fence_wait(*oldest_fence, 0, false)) {
/* Take the reference because the fence can be released by other threads after we
* unlock the mutex.
*/
struct pipe_fence_handle *tmp_fence = NULL;
amdgpu_fence_reference(&tmp_fence, *oldest_fence);
/* Unlock the mutex before waiting. */
simple_mtx_unlock(&aws->bo_fence_lock);
amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false);
amdgpu_fence_reference(&tmp_fence, NULL);
simple_mtx_lock(&aws->bo_fence_lock);
}
/* Remove the idle fence from the ring. */
amdgpu_fence_reference(oldest_fence, NULL);
}
}
/* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
* sequence number per queue and removes all older ones.
*/
struct amdgpu_seq_no_fences seq_no_dependencies;
memcpy(&seq_no_dependencies, &cs->seq_no_dependencies, sizeof(seq_no_dependencies));
if (queue_type != KERNELQ_ALT_FENCE) {
/* Add a fence dependency on the previous IB if the IP has multiple physical queues to
* make it appear as if it had only 1 queue, or if the previous IB comes from a different
* context. The reasons are:
* - Our BO fence tracking only supports 1 queue per IP.
* - IBs from different contexts must wait for each other and can't execute in a random order.
*/
struct amdgpu_fence *prev_fence =
(struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
/* Add a dependency on a previous fence, unless we can determine that
* it's useless because the execution order is guaranteed.
*/
if (prev_fence) {
bool same_ctx = queue->last_ctx == acs->ctx;
/* userqueue submission mode uses a single queue per process. */
bool same_queue = aws->info.ip[acs->ip_type].num_queues > 1 &&
queue_type != USERQ;
if (!same_ctx || !same_queue)
add_seq_no_to_list(aws, &seq_no_dependencies, queue_index, prev_seq_no);
}
}
/* Since the kernel driver doesn't synchronize execution between different
* rings automatically, we have to add fence dependencies manually. This gathers sequence
* numbers from BOs and sets the next sequence number in the BOs.
*/
/* Slab entry BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
struct amdgpu_cs_buffer *slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
unsigned num_slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
unsigned initial_num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
unsigned queue_index_bit = (queue_type == KERNELQ_ALT_FENCE) ?
0 : BITFIELD_BIT(queue_index);
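/* The real buffer list is processed in three segments below: the buffers that were in
* the list when the submit started, then the backing buffers of slab entries added here,
* and finally the backing buffers of sparse buffers.
*/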
for (unsigned i = 0; i < num_slab_entry_buffers; i++) {
struct amdgpu_cs_buffer *buffer = &slab_entry_buffers[i];
struct amdgpu_winsys_bo *bo = buffer->bo;
amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
buffer->usage);
if (queue_type == KERNELQ_ALT_FENCE)
amdgpu_fence_reference(&bo->alt_fence, cs->fence);
else
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
/* We didn't add any slab entries into the real buffer list that will be submitted
* to the kernel. Do it now.
*/
struct amdgpu_cs_buffer *real_buffer =
amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(buffer->bo)->b,
&cs->buffer_lists[AMDGPU_BO_REAL], false);
/* We need to set the usage because it determines the BO priority. */
real_buffer->usage |= buffer->usage;
}
/* Sparse BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
unsigned num_real_buffers_except_sparse = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
struct amdgpu_cs_buffer *sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].buffers;
unsigned num_sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].num_buffers;
bool out_of_memory = false;
for (unsigned i = 0; i < num_sparse_buffers; i++) {
struct amdgpu_cs_buffer *buffer = &sparse_buffers[i];
struct amdgpu_winsys_bo *bo = buffer->bo;
amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
buffer->usage);
if (queue_type == KERNELQ_ALT_FENCE)
amdgpu_fence_reference(&bo->alt_fence, cs->fence);
else
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
/* Add backing buffers of sparse buffers to the buffer list.
*
* This is done late, during submission, to keep the buffer list short before
* submit, and to avoid managing fences for the backing buffers.
*/
struct amdgpu_bo_sparse *sparse_bo = get_sparse_bo(buffer->bo);
if (queue_type == USERQ) {
uint64_t bo_vm_point = p_atomic_read(&sparse_bo->vm_timeline_point);
vm_timeline_point = MAX2(vm_timeline_point, bo_vm_point);
}
simple_mtx_lock(&sparse_bo->commit_lock);
list_for_each_entry(struct amdgpu_sparse_backing, backing, &sparse_bo->backing, list) {
/* We can directly add the buffer here, because we know that each
* backing buffer occurs only once.
*/
struct amdgpu_cs_buffer *real_buffer =
amdgpu_do_add_buffer(cs, &backing->bo->b, &cs->buffer_lists[AMDGPU_BO_REAL], true);
if (!real_buffer) {
fprintf(stderr, "%s: failed to add sparse backing buffer\n", __func__);
r = -ENOMEM;
out_of_memory = true;
break; /* commit_lock is released below */
}
real_buffer->usage = buffer->usage;
}
simple_mtx_unlock(&sparse_bo->commit_lock);
}
/* Real BOs: Add fence dependencies, update seq_no in BOs except sparse backing BOs. */
unsigned num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
struct amdgpu_cs_buffer *real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].buffers;
struct drm_amdgpu_bo_list_entry *bo_list;
/* BO dependency management depends on the queue mode:
* - kernel queue: BOs used by the submit are passed to the kernel in a
* drm_amdgpu_bo_list_entry list. The inter-process synchronization is handled
* automatically by the kernel; intra-process sync is handled by Mesa.
* - user queue: intra-process sync is similar. Inter-process sync is handled
* using timeline points, amdgpu_userq_wait (before a submit) and
* amdgpu_userq_signal (after a submit).
*/
unsigned num_shared_buf_write;
unsigned num_shared_buf_read;
/* Store write handles at the beginning and read handles at the end of shared_buf_kms_handles.
* If the usage is both read and write, store the handle in the write list.
*/
*/
uint32_t *shared_buf_kms_handles;
if (queue_type != USERQ) {
bo_list = (struct drm_amdgpu_bo_list_entry *)
alloca(num_real_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
} else {
num_shared_buf_write = 0;
num_shared_buf_read = 0;
shared_buf_kms_handles = (uint32_t*)alloca(num_real_buffers * sizeof(uint32_t));
}
unsigned i;
for (i = 0; i < initial_num_real_buffers; i++) {
struct amdgpu_cs_buffer *buffer = &real_buffers[i];
struct amdgpu_winsys_bo *bo = buffer->bo;
amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
buffer->usage);
if (queue_type == KERNELQ_ALT_FENCE)
amdgpu_fence_reference(&bo->alt_fence, cs->fence);
else
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
if (queue_type != USERQ) {
amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
} else {
vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point);
if (!get_real_bo(bo)->is_shared)
continue;
if (buffer->usage & RADEON_USAGE_WRITE) {
shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle;
num_shared_buf_write++;
} else {
num_shared_buf_read++;
shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
get_real_bo(bo)->kms_handle;
}
}
}
/* These are backing buffers of slab entries. Don't add their fence dependencies. */
for (; i < num_real_buffers_except_sparse; i++) {
struct amdgpu_cs_buffer *buffer = &real_buffers[i];
struct amdgpu_winsys_bo *bo = buffer->bo;
if (queue_type == KERNELQ_ALT_FENCE)
get_real_bo_reusable_slab(bo)->b.b.slab_has_busy_alt_fences = true;
else
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
if (queue_type != USERQ) {
amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
} else {
vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point);
if (!get_real_bo(bo)->is_shared)
continue;
if (buffer->usage & RADEON_USAGE_WRITE) {
shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle;
num_shared_buf_write++;
} else {
num_shared_buf_read++;
shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
get_real_bo(bo)->kms_handle;
}
}
}
/* Sparse backing BOs are last. Don't update their fences because we don't use them. */
for (; i < num_real_buffers; ++i) {
struct amdgpu_cs_buffer *buffer = &real_buffers[i];
if (queue_type != USERQ) {
amdgpu_add_to_kernel_bo_list(&bo_list[i], buffer->bo, buffer->usage);
} else {
if (!get_real_bo(buffer->bo)->is_shared)
continue;
if (buffer->usage & RADEON_USAGE_WRITE) {
shared_buf_kms_handles[num_shared_buf_write] =
get_real_bo(buffer->bo)->kms_handle;
num_shared_buf_write++;
} else {
num_shared_buf_read++;
shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
get_real_bo(buffer->bo)->kms_handle;
}
}
}
#if 0 /* Debug code. */
printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no);
/* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */
for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
if (i == acs->queue_index)
continue;
struct pipe_fence_handle *fence = queue->fences[aws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE];
if (!fence) {
if (i <= 1)
printf(" queue %u doesn't have any fence at seq_no %u\n", i, ws->queues[i].latest_seq_no);
continue;
}
bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i);
uint_seq_no old = seq_no_dependencies.seq_no[i];
add_seq_no_to_list(aws, &seq_no_dependencies, i, aws->queues[i].latest_seq_no);
uint_seq_no updated = seq_no_dependencies.seq_no[i];
if (!valid)
printf(" missing dependency on queue=%u, seq_no=%u\n", i, updated);
else if (old != updated)
printf(" too old dependency on queue=%u, old=%u, new=%u\n", i, old, updated);
else
printf(" has dependency on queue=%u, seq_no=%u\n", i, old);
}
#endif
/* Convert the sequence numbers we gathered to fence dependencies. */
u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) {
struct pipe_fence_handle **fence = get_fence_from_ring(aws, &seq_no_dependencies, i);
if (fence) {
/* If it's idle, don't add it to the list of dependencies. */
if (amdgpu_fence_wait(*fence, 0, false))
amdgpu_fence_reference(fence, NULL);
else
add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)*fence);
}
}
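/* The alt-fence path skips the per-queue fence ring because cs->fence was already attached
* to every regular real BO above as bo->alt_fence.
*/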
if (queue_type != KERNELQ_ALT_FENCE) {
/* Finally, add the IB fence into the fence ring of the queue. */
amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
queue->latest_seq_no = next_seq_no;
((struct amdgpu_fence*)cs->fence)->queue_seq_no = next_seq_no;
/* Update the last used context in the queue. */
amdgpu_ctx_reference(&queue->last_ctx, acs->ctx);
}
simple_mtx_unlock(&aws->bo_fence_lock);
#if MESA_DEBUG
/* Prepare the buffer list. */
if (aws->debug_all_bos) {
/* The buffer list contains all buffers. This is a slow path that
* ensures that no buffer is missing in the BO list.
*/
simple_mtx_lock(&aws->global_bo_list_lock);
if (queue_type != USERQ) {
bo_list = (struct drm_amdgpu_bo_list_entry *)
alloca(aws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
num_real_buffers = 0;
list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) {
bo_list[num_real_buffers].bo_handle = bo->kms_handle;
bo_list[num_real_buffers].bo_priority = 0;
++num_real_buffers;
}
} else {
shared_buf_kms_handles = (uint32_t*)alloca(aws->num_buffers * sizeof(uint32_t));
num_shared_buf_write = 0;
num_shared_buf_read = 0;
list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) {
shared_buf_kms_handles[num_shared_buf_write] = bo->kms_handle;
num_shared_buf_write++;
}
}
simple_mtx_unlock(&aws->global_bo_list_lock);
}
#endif
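/* Presumably a driver statistic: count how many buffers GFX submissions reference. */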
if (acs->ip_type == AMD_IP_GFX)
aws->gfx_bo_list_counter += num_real_buffers;
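/* Decide the outcome of this submission: fail with -ENOMEM if an earlier step ran out of
* memory, fail with -ECANCELED if the context was already reset, skip the ioctl entirely
* for non-GFX no-op CSes, and otherwise submit through the kernel queue or user queue path.
*/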
if (out_of_memory) {
r = -ENOMEM;
} else if (unlikely(acs->ctx->sw_status != PIPE_NO_RESET)) {
r = -ECANCELED;
} else if (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX) {
r = 0;
} else {
if (queue_type != USERQ) {
/* Submit the command buffer.
*
* The kernel often returns -ENOMEM when many processes use GDS in parallel (e.g. test
* suites), but the submission eventually succeeds after enough attempts. This happens
* frequently with dEQP using NGG streamout.
*/
r = 0;
do {
/* Wait 1 ms and try again. */
if (r == -ENOMEM)
os_time_sleep(1000);
r = amdgpu_cs_submit_ib_kernelq(acs, num_real_buffers, bo_list, &seq_no);
} while (r == -ENOMEM);
if (!r) {
/* Success. */
uint64_t *user_fence = NULL;
/* Need to reserve 4 QWORDs for the user fence:
* QWORD[0]: completed fence
* QWORD[1]: preempted fence
* QWORD[2]: reset fence
* QWORD[3]: preempted then reset
*/
if (has_user_fence)
user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;
amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
}
} else {
struct amdgpu_userq *userq = &queue->userq;
r = amdgpu_cs_submit_ib_userq(userq, acs, shared_buf_kms_handles, num_shared_buf_write,
&shared_buf_kms_handles[num_real_buffers - num_shared_buf_read],
num_shared_buf_read, &seq_no, vm_timeline_point);
if (!r) {
/* Success. */
amdgpu_fence_submitted(cs->fence, seq_no, userq->user_fence_ptr);
}
}
}
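/* Translate submission failures into a context reset status, which is reported through the
* winsys context so the state tracker can observe the loss.
*/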
if (unlikely(r)) {
if (r == -ECANCELED) {
amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_INNOCENT_CONTEXT_RESET,
"amdgpu: The CS has cancelled because the context is lost. This context is innocent.\n");
} else if (r == -ENODATA) {
amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
"amdgpu: The CS has cancelled because the context is lost. This context is guilty of a soft recovery.\n");
} else if (r == -ETIME) {
amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
"amdgpu: The CS has cancelled because the context is lost. This context is guilty of a hard recovery.\n");
} else {
amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx,
PIPE_UNKNOWN_CONTEXT_RESET,
"amdgpu: The CS has been rejected, "
"see dmesg for more information (%i).\n",
r);
}
}
/* If there was an error or the submission was skipped (non-GFX no-op), signal the fence
* manually, because it won't be signalled by the hardware. */
if (r || (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX))
amdgpu_fence_signalled(cs->fence);
if (unlikely(aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.flags && r == 0))
acs->mcbp_fw_shadow_chunk.flags = 0;
cs->error_code = r;
/* Clear the buffer lists. */
for (unsigned list = 0; list < ARRAY_SIZE(cs->buffer_lists); list++) {
struct amdgpu_cs_buffer *buffers = cs->buffer_lists[list].buffers;
unsigned num_buffers = cs->buffer_lists[list].num_buffers;
if (list == AMDGPU_BO_REAL) {
/* Only decrement num_active_ioctls and unref where we incremented them.
* We did both for regular real BOs. We only incremented the refcount for sparse
* backing BOs.
*/
/* Regular real BOs. */
for (unsigned i = 0; i < initial_num_real_buffers; i++) {
p_atomic_dec(&buffers[i].bo->num_active_ioctls);
amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
}
/* Do nothing for slab BOs. */
/* Sparse backing BOs. */
for (unsigned i = num_real_buffers_except_sparse; i < num_buffers; i++)
amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
} else {
for (unsigned i = 0; i < num_buffers; i++) {
p_atomic_dec(&buffers[i].bo->num_active_ioctls);
amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
}
}
cs->buffer_lists[list].num_buffers = 0;
}
amdgpu_cs_context_cleanup(aws, cs);
}
/* Make sure the previous submission is completed. */
void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
/* Wait for any pending ioctl of this CS to complete. */
util_queue_fence_wait(&cs->flush_completed);
}
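/* Flush the current command stream: pad the IB to the required alignment, attach a fence,
* swap the CS contexts, and queue the submission on aws->cs_queue. When flushing
* synchronously, the submission's error code is returned; otherwise 0.
*/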
static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
unsigned flags,
struct pipe_fence_handle **fence)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_winsys *aws = cs->aws;
int error_code = 0;
uint32_t ib_pad_dw_mask = aws->info.ip[cs->ip_type].ib_pad_dw_mask;
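/* Make room for the padding/epilog dwords emitted below. */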
rcs->current.max_dw += amdgpu_cs_epilog_dws(cs);
/* Pad the IB according to the mask. */
switch (cs->ip_type) {
case AMD_IP_SDMA:
if (aws->info.gfx_level <= GFX6) {
while (rcs->current.cdw & ib_pad_dw_mask)
radeon_emit(rcs, 0xf0000000); /* NOP packet */
} else {
while (rcs->current.cdw & ib_pad_dw_mask)
radeon_emit(rcs, SDMA_NOP_PAD);
}
break;
case AMD_IP_GFX:
case AMD_IP_COMPUTE:
amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 0);
if (cs->ip_type == AMD_IP_GFX)
aws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
break;
case AMD_IP_UVD:
case AMD_IP_UVD_ENC:
while (rcs->current.cdw & ib_pad_dw_mask)
radeon_emit(rcs, 0x80000000); /* type2 nop packet */
break;
case AMD_IP_VCN_JPEG:
assert(rcs->current.cdw % 2 == 0);
while (rcs->current.cdw & ib_pad_dw_mask) {
radeon_emit(rcs, 0x60000000); /* nop packet */
radeon_emit(rcs, 0x00000000);
}
break;
case AMD_IP_VCN_DEC:
while (rcs->current.cdw & ib_pad_dw_mask)
radeon_emit(rcs, 0x81ff); /* nop packet */
break;
default:
break;
}
if (rcs->current.cdw > rcs->current.max_dw) {
fprintf(stderr, "amdgpu: command stream overflowed\n");
}
/* Submit only if the CS is non-empty, hasn't overflowed, and isn't a no-op flush. */
if (likely(radeon_emitted(rcs, 0) &&
rcs->current.cdw <= rcs->current.max_dw &&
!(flags & RADEON_FLUSH_NOOP))) {
struct amdgpu_cs_context *cur = cs->csc;
/* Set IB sizes. */
amdgpu_ib_finalize(aws, rcs, &cs->main_ib, cs->ip_type);
/* Create a fence. */
amdgpu_fence_reference(&cur->fence, NULL);
if (cs->next_fence) {
/* just move the reference */
cur->fence = cs->next_fence;
cs->next_fence = NULL;
} else {
cur->fence = amdgpu_fence_create(cs);
}
if (fence)
amdgpu_fence_reference(fence, cur->fence);
for (unsigned i = 0; i < ARRAY_SIZE(cur->buffer_lists); i++) {
unsigned num_buffers = cur->buffer_lists[i].num_buffers;
struct amdgpu_cs_buffer *buffers = cur->buffer_lists[i].buffers;
for (unsigned j = 0; j < num_buffers; j++)
p_atomic_inc(&buffers[j].bo->num_active_ioctls);
}
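/* Make sure the previous submission using cs->cst has completed, because cs->cst is
* reused below. */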
amdgpu_cs_sync_flush(rcs);
cur->chunk_ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
if (cs->noop && cs->ip_type == AMD_IP_GFX) {
/* Reduce the IB size and fill it with NOP to make it like an empty IB. */
unsigned noop_dw_size = aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
assert(cur->chunk_ib[IB_MAIN].ib_bytes / 4 >= noop_dw_size);
cur->ib_main_addr[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
cur->chunk_ib[IB_MAIN].ib_bytes = noop_dw_size * 4;
}
/* Swap command streams. "cst" is going to be submitted. */
rcs->csc = cs->csc = cs->cst;
cs->cst = cur;
/* Only GFX, compute, and SDMA queues are supported as user queues. */
if (aws->info.use_userq && cs->ip_type <= AMD_IP_SDMA) {
util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed,
amdgpu_cs_submit_ib<USERQ>, NULL, 0);
} else {
util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed,
cs->uses_alt_fence ?
amdgpu_cs_submit_ib<KERNELQ_ALT_FENCE>
: amdgpu_cs_submit_ib<KERNELQ>,
NULL, 0);
}
if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
cs->csc->secure = !cs->cst->secure;
else
cs->csc->secure = cs->cst->secure;
if (!(flags & PIPE_FLUSH_ASYNC)) {
amdgpu_cs_sync_flush(rcs);
error_code = cur->error_code;
}
} else {
if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
cs->csc->secure = !cs->csc->secure;
amdgpu_cs_context_cleanup_buffers(aws, cs->csc);
amdgpu_cs_context_cleanup(aws, cs->csc);
}
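/* Invalidate the buffer index hash (used for fast buffer lookups) for the new CS context. */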
memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
amdgpu_get_new_ib(aws, rcs, &cs->main_ib, cs);
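/* Re-add the preamble IB so it stays in the buffer list of the next submission. */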
if (cs->preamble_ib_bo) {
amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
}
if (cs->ip_type == AMD_IP_GFX)
aws->num_gfx_IBs++;
else if (cs->ip_type == AMD_IP_SDMA)
aws->num_sdma_IBs++;
return error_code;
}
static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
struct pb_buffer_lean *_buf,
unsigned usage)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;
return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
}
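/* Record the register shadow and CSA virtual addresses for firmware-based shadowing
* (mid-command-buffer preemption). INIT_SHADOW is cleared after the first successful
* submission in amdgpu_cs_submit_ib, so the shadow area is initialized only once.
*/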
static void amdgpu_cs_set_mcbp_reg_shadowing_va(struct radeon_cmdbuf *rcs,uint64_t regs_va,
uint64_t csa_va)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
cs->mcbp_fw_shadow_chunk.shadow_va = regs_va;
cs->mcbp_fw_shadow_chunk.csa_va = csa_va;
cs->mcbp_fw_shadow_chunk.gds_va = 0;
cs->mcbp_fw_shadow_chunk.flags = AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
}
static void amdgpu_winsys_fence_reference(struct radeon_winsys *rws,
struct pipe_fence_handle **dst,
struct pipe_fence_handle *src)
{
amdgpu_fence_reference(dst, src);
}
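/* Plug the amdgpu command-submission and fence entry points into the winsys vtable. */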
void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *sws)
{
sws->base.ctx_create = amdgpu_ctx_create;
sws->base.ctx_destroy = amdgpu_ctx_destroy;
sws->base.ctx_set_sw_reset_status = amdgpu_ctx_set_sw_reset_status;
sws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
sws->base.cs_create = amdgpu_cs_create;
sws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
sws->base.cs_destroy = amdgpu_cs_destroy;
sws->base.cs_add_buffer = amdgpu_cs_add_buffer;
sws->base.cs_validate = amdgpu_cs_validate;
sws->base.cs_check_space = amdgpu_cs_check_space;
sws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
sws->base.cs_flush = amdgpu_cs_flush;
sws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
sws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
sws->base.cs_sync_flush = amdgpu_cs_sync_flush;
sws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
sws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal;
sws->base.cs_get_ip_type = amdgpu_cs_get_ip_type;
sws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
sws->base.fence_reference = amdgpu_winsys_fence_reference;
sws->base.fence_import_syncobj = amdgpu_fence_import_syncobj;
sws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
sws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
sws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;
if (sws->aws->info.has_fw_based_shadowing)
sws->base.cs_set_mcbp_reg_shadowing_va = amdgpu_cs_set_mcbp_reg_shadowing_va;
}