| /* |
| * Copyright © 2008 Jérôme Glisse |
| * Copyright © 2010 Marek Olšák <[email protected]> |
| * Copyright © 2015 Advanced Micro Devices, Inc. |
| * |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include "amdgpu_cs.h" |
| #include "util/detect_os.h" |
| #include "amdgpu_winsys.h" |
| #include "util/os_time.h" |
| #include <inttypes.h> |
| #include <stdio.h> |
| |
| #include "amd/common/sid.h" |
| |
| /* Some BSDs don't define ENODATA (and ENODATA is replaced with different error |
| * codes in the kernel). |
| */ |
| #if DETECT_OS_OPENBSD |
| #define ENODATA ENOTSUP |
| #elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY |
| #define ENODATA ECONNREFUSED |
| #endif |
| |
| /* FENCES */ |
| |
| void amdgpu_fence_destroy(struct amdgpu_fence *fence) |
| { |
| ac_drm_cs_destroy_syncobj(fence->aws->fd, fence->syncobj); |
| |
| if (fence->ctx) |
| amdgpu_ctx_reference(&fence->ctx, NULL); |
| |
| util_queue_fence_destroy(&fence->submitted); |
| FREE(fence); |
| } |
| |
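| /* Create a syncobj-backed fence for a command stream. The fence holds a reference to the |
| * submitting context and starts in the "not submitted" state; it becomes waitable once the |
| * IB is handed to the kernel (see amdgpu_fence_submitted). |
| */ |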
| static struct pipe_fence_handle * |
| amdgpu_fence_create(struct amdgpu_cs *cs) |
| { |
| struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence); |
| struct amdgpu_ctx *ctx = cs->ctx; |
| |
| if (!fence) |
| return NULL; |
| |
| fence->reference.count = 1; |
| fence->aws = ctx->aws; |
| amdgpu_ctx_reference(&fence->ctx, ctx); |
| fence->ctx = ctx; |
| fence->ip_type = cs->ip_type; |
| if (ac_drm_cs_create_syncobj2(ctx->aws->fd, 0, &fence->syncobj)) { |
| amdgpu_ctx_reference(&fence->ctx, NULL); |
| FREE(fence); |
| return NULL; |
| } |
| |
| util_queue_fence_init(&fence->submitted); |
| util_queue_fence_reset(&fence->submitted); |
| fence->queue_index = cs->queue_index; |
| return (struct pipe_fence_handle *)fence; |
| } |
| |
| static struct pipe_fence_handle * |
| amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd) |
| { |
| struct amdgpu_winsys *aws = amdgpu_winsys(rws); |
| struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence); |
| int r; |
| |
| if (!fence) |
| return NULL; |
| |
| pipe_reference_init(&fence->reference, 1); |
| fence->aws = aws; |
| fence->ip_type = 0xffffffff; |
| |
| r = ac_drm_cs_import_syncobj(aws->fd, fd, &fence->syncobj); |
| if (r) { |
| FREE(fence); |
| return NULL; |
| } |
| |
| util_queue_fence_init(&fence->submitted); |
| fence->imported = true; |
| |
| return (struct pipe_fence_handle*)fence; |
| } |
| |
| static struct pipe_fence_handle * |
| amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd) |
| { |
| struct amdgpu_winsys *aws = amdgpu_winsys(rws); |
| struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence); |
| |
| if (!fence) |
| return NULL; |
| |
| pipe_reference_init(&fence->reference, 1); |
| fence->aws = aws; |
| /* fence->ctx == NULL means that the fence is syncobj-based. */ |
| |
| /* Convert sync_file into syncobj. */ |
| int r = ac_drm_cs_create_syncobj(aws->fd, &fence->syncobj); |
| if (r) { |
| FREE(fence); |
| return NULL; |
| } |
| |
| r = ac_drm_cs_syncobj_import_sync_file(aws->fd, fence->syncobj, fd); |
| if (r) { |
| ac_drm_cs_destroy_syncobj(aws->fd, fence->syncobj); |
| FREE(fence); |
| return NULL; |
| } |
| |
| util_queue_fence_init(&fence->submitted); |
| fence->imported = true; |
| |
| return (struct pipe_fence_handle*)fence; |
| } |
| |
| static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws, |
| struct pipe_fence_handle *pfence) |
| { |
| struct amdgpu_winsys *aws = amdgpu_winsys(rws); |
| struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence; |
| int fd, r; |
| |
| util_queue_fence_wait(&fence->submitted); |
| |
| /* Convert syncobj into sync_file. */ |
| r = ac_drm_cs_syncobj_export_sync_file(aws->fd, fence->syncobj, &fd); |
| return r ? -1 : fd; |
| } |
| |
| static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws) |
| { |
| struct amdgpu_winsys *aws = amdgpu_winsys(rws); |
| uint32_t syncobj; |
| int fd = -1; |
| |
| int r = ac_drm_cs_create_syncobj2(aws->fd, DRM_SYNCOBJ_CREATE_SIGNALED, |
| &syncobj); |
| if (r) { |
| return -1; |
| } |
| |
| r = ac_drm_cs_syncobj_export_sync_file(aws->fd, syncobj, &fd); |
| if (r) { |
| fd = -1; |
| } |
| |
| ac_drm_cs_destroy_syncobj(aws->fd, syncobj); |
| return fd; |
| } |
| |
| static void amdgpu_fence_submitted(struct pipe_fence_handle *fence, |
| uint64_t seq_no, |
| uint64_t *user_fence_cpu_address) |
| { |
| struct amdgpu_fence *afence = (struct amdgpu_fence*)fence; |
| |
| afence->seq_no = seq_no; |
| afence->user_fence_cpu_address = user_fence_cpu_address; |
| util_queue_fence_signal(&afence->submitted); |
| } |
| |
| static void amdgpu_fence_signalled(struct pipe_fence_handle *fence) |
| { |
| struct amdgpu_fence *afence = (struct amdgpu_fence*)fence; |
| |
| afence->signalled = true; |
| util_queue_fence_signal(&afence->submitted); |
| } |
| |
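| /* Wait for a fence with a relative or absolute timeout. The user fence in memory is checked |
| * first so that the syncobj wait ioctl can be skipped when the fence has already signalled. |
| */ |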
| bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, |
| bool absolute) |
| { |
| struct amdgpu_fence *afence = (struct amdgpu_fence*)fence; |
| int64_t abs_timeout; |
| uint64_t *user_fence_cpu; |
| |
| if (afence->signalled) |
| return true; |
| |
| if (absolute) |
| abs_timeout = timeout; |
| else |
| abs_timeout = os_time_get_absolute_timeout(timeout); |
| |
| /* The fence might not have a number assigned if its IB is being |
| * submitted in the other thread right now. Wait until the submission |
| * is done. */ |
| if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout)) |
| return false; |
| |
| user_fence_cpu = afence->user_fence_cpu_address; |
| if (user_fence_cpu) { |
| if (*user_fence_cpu >= afence->seq_no) { |
| afence->signalled = true; |
| return true; |
| } |
| |
| /* No timeout, just query: no need for the ioctl. */ |
| if (!absolute && !timeout) |
| return false; |
| } |
| |
| if ((uint64_t)abs_timeout == OS_TIMEOUT_INFINITE) |
| abs_timeout = INT64_MAX; |
| |
| if (ac_drm_cs_syncobj_wait(afence->aws->fd, &afence->syncobj, 1, |
| abs_timeout, 0, NULL)) |
| return false; |
| |
| /* Check that guest-side syncobj agrees with the user fence. */ |
| if (user_fence_cpu && afence->aws->info.is_virtio) |
| assert(afence->seq_no <= *user_fence_cpu); |
| |
| afence->signalled = true; |
| return true; |
| } |
| |
| static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws, |
| struct pipe_fence_handle *fence, |
| uint64_t timeout) |
| { |
| return amdgpu_fence_wait(fence, timeout, false); |
| } |
| |
| static struct pipe_fence_handle * |
| amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs) |
| { |
| struct amdgpu_cs *cs = amdgpu_cs(rcs); |
| struct pipe_fence_handle *fence = NULL; |
| |
| if (cs->noop) |
| return NULL; |
| |
| if (cs->next_fence) { |
| amdgpu_fence_reference(&fence, cs->next_fence); |
| return fence; |
| } |
| |
| fence = amdgpu_fence_create(cs); |
| if (!fence) |
| return NULL; |
| |
| amdgpu_fence_reference(&cs->next_fence, fence); |
| return fence; |
| } |
| |
| /* CONTEXTS */ |
| |
| static uint32_t |
| radeon_to_amdgpu_priority(enum radeon_ctx_priority radeon_priority) |
| { |
| switch (radeon_priority) { |
| case RADEON_CTX_PRIORITY_REALTIME: |
| return AMDGPU_CTX_PRIORITY_VERY_HIGH; |
| case RADEON_CTX_PRIORITY_HIGH: |
| return AMDGPU_CTX_PRIORITY_HIGH; |
| case RADEON_CTX_PRIORITY_MEDIUM: |
| return AMDGPU_CTX_PRIORITY_NORMAL; |
| case RADEON_CTX_PRIORITY_LOW: |
| return AMDGPU_CTX_PRIORITY_LOW; |
| default: |
| unreachable("Invalid context priority"); |
| } |
| } |
| |
| static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *rws, |
| enum radeon_ctx_priority priority, |
| bool allow_context_lost) |
| { |
| struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx); |
| int r; |
| struct amdgpu_bo_alloc_request alloc_buffer = {}; |
| uint32_t amdgpu_priority = radeon_to_amdgpu_priority(priority); |
| ac_drm_device *dev; |
| ac_drm_bo buf_handle; |
| |
| if (!ctx) |
| return NULL; |
| |
| ctx->aws = amdgpu_winsys(rws); |
| ctx->reference.count = 1; |
| ctx->allow_context_lost = allow_context_lost; |
| |
| dev = ctx->aws->dev; |
| |
| r = ac_drm_cs_ctx_create2(dev, amdgpu_priority, &ctx->ctx_handle); |
| if (r) { |
| fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r); |
| goto error_create; |
| } |
| |
| alloc_buffer.alloc_size = ctx->aws->info.gart_page_size; |
| alloc_buffer.phys_alignment = ctx->aws->info.gart_page_size; |
| alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT; |
| |
| r = ac_drm_bo_alloc(dev, &alloc_buffer, &buf_handle); |
| if (r) { |
| fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r); |
| goto error_user_fence_alloc; |
| } |
| |
| ctx->user_fence_cpu_address_base = NULL; |
| r = ac_drm_bo_cpu_map(dev, buf_handle, (void**)&ctx->user_fence_cpu_address_base); |
| if (r) { |
| fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r); |
| goto error_user_fence_map; |
| } |
| |
| memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size); |
| ctx->user_fence_bo = buf_handle; |
| ac_drm_bo_export(dev, buf_handle, amdgpu_bo_handle_type_kms, &ctx->user_fence_bo_kms_handle); |
| |
| return (struct radeon_winsys_ctx*)ctx; |
| |
| error_user_fence_map: |
| ac_drm_bo_free(dev, buf_handle); |
| |
| error_user_fence_alloc: |
| ac_drm_cs_ctx_free(dev, ctx->ctx_handle); |
| error_create: |
| FREE(ctx); |
| return NULL; |
| } |
| |
| static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx) |
| { |
| struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; |
| |
| amdgpu_ctx_reference(&ctx, NULL); |
| } |
| |
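| /* Pad a GFX/compute IB with NOPs so that its size, plus leave_dw_space dwords reserved by |
| * the caller, is a multiple of the required IB alignment (ib_pad_dw_mask + 1). |
| */ |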
| static void amdgpu_pad_gfx_compute_ib(struct amdgpu_winsys *aws, enum amd_ip_type ip_type, |
| uint32_t *ib, uint32_t *num_dw, unsigned leave_dw_space) |
| { |
| unsigned pad_dw_mask = aws->info.ip[ip_type].ib_pad_dw_mask; |
| unsigned unaligned_dw = (*num_dw + leave_dw_space) & pad_dw_mask; |
| |
| if (unaligned_dw) { |
| int remaining = pad_dw_mask + 1 - unaligned_dw; |
| |
| /* Only pad by 1 dword with the type-2 NOP if necessary. */ |
| if (remaining == 1 && aws->info.gfx_ib_pad_with_type2) { |
| ib[(*num_dw)++] = PKT2_NOP_PAD; |
| } else { |
| /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized |
| * packet. The size of the packet body after the header is always count + 1. |
| * If count == -1, there is no packet body. NOP is the only packet that can have |
| * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1). |
| */ |
| ib[(*num_dw)++] = PKT3(PKT3_NOP, remaining - 2, 0); |
| *num_dw += remaining - 1; |
| } |
| } |
| assert(((*num_dw + leave_dw_space) & pad_dw_mask) == 0); |
| } |
| |
| static int amdgpu_submit_gfx_nop(struct amdgpu_ctx *ctx) |
| { |
| struct amdgpu_bo_alloc_request request = {0}; |
| struct drm_amdgpu_bo_list_in bo_list_in; |
| struct drm_amdgpu_cs_chunk_ib ib_in = {0}; |
| ac_drm_bo bo; |
| amdgpu_va_handle va_handle = NULL; |
| struct drm_amdgpu_cs_chunk chunks[2]; |
| struct drm_amdgpu_bo_list_entry list; |
| unsigned noop_dw_size; |
| void *cpu = NULL; |
| uint64_t seq_no; |
| uint64_t va; |
| int r; |
| |
| /* Older amdgpu doesn't report if the reset is complete or not. Detect |
| * it by submitting a no-op job. If it reports an error, then assume |
| * that the reset is not complete. |
| */ |
| uint32_t temp_ctx_handle; |
| r = ac_drm_cs_ctx_create2(ctx->aws->dev, AMDGPU_CTX_PRIORITY_NORMAL, &temp_ctx_handle); |
| if (r) |
| return r; |
| |
| request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM; |
| request.alloc_size = 4096; |
| request.phys_alignment = 4096; |
| r = ac_drm_bo_alloc(ctx->aws->dev, &request, &bo); |
| if (r) |
| goto destroy_ctx; |
| |
| r = ac_drm_va_range_alloc(ctx->aws->dev, amdgpu_gpu_va_range_general, |
| request.alloc_size, request.phys_alignment, |
| 0, &va, &va_handle, |
| AMDGPU_VA_RANGE_32_BIT | AMDGPU_VA_RANGE_HIGH); |
| if (r) |
| goto destroy_bo; |
| |
| uint32_t kms_handle; |
| ac_drm_bo_export(ctx->aws->dev, bo, amdgpu_bo_handle_type_kms, &kms_handle); |
| |
| r = ac_drm_bo_va_op_raw(ctx->aws->dev, kms_handle, 0, request.alloc_size, va, |
| AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE, |
| AMDGPU_VA_OP_MAP); |
| if (r) |
| goto destroy_bo; |
| |
| r = ac_drm_bo_cpu_map(ctx->aws->dev, bo, &cpu); |
| if (r) |
| goto destroy_bo; |
| |
| noop_dw_size = ctx->aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1; |
| ((uint32_t*)cpu)[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0); |
| |
| ac_drm_bo_cpu_unmap(ctx->aws->dev, bo); |
| |
| list.bo_handle = kms_handle; |
| list.bo_priority = 0; |
| |
| bo_list_in.operation = ~0; |
| bo_list_in.list_handle = ~0; |
| bo_list_in.bo_number = 1; |
| bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry); |
| bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)&list; |
| |
| ib_in.ip_type = AMD_IP_GFX; |
| ib_in.ib_bytes = noop_dw_size * 4; |
| ib_in.va_start = va; |
| |
| chunks[0].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES; |
| chunks[0].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4; |
| chunks[0].chunk_data = (uintptr_t)&bo_list_in; |
| |
| chunks[1].chunk_id = AMDGPU_CHUNK_ID_IB; |
| chunks[1].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4; |
| chunks[1].chunk_data = (uintptr_t)&ib_in; |
| |
| r = ac_drm_cs_submit_raw2(ctx->aws->dev, temp_ctx_handle, 0, 2, chunks, &seq_no); |
| |
| destroy_bo: |
| if (va_handle) |
| ac_drm_va_range_free(va_handle); |
| ac_drm_bo_free(ctx->aws->dev, bo); |
| destroy_ctx: |
| ac_drm_cs_ctx_free(ctx->aws->dev, temp_ctx_handle); |
| |
| return r; |
| } |
| |
| static void |
| amdgpu_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status, |
| const char *format, ...) |
| { |
| struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; |
| |
| /* Don't overwrite the last reset status. */ |
| if (ctx->sw_status != PIPE_NO_RESET) |
| return; |
| |
| ctx->sw_status = status; |
| |
| if (!ctx->allow_context_lost) { |
| va_list args; |
| |
| va_start(args, format); |
| vfprintf(stderr, format, args); |
| va_end(args); |
| |
| /* Non-robust contexts are allowed to terminate the process. The only alternative is |
| * to skip command submission, which would look like a freeze because nothing is drawn, |
| * which looks like a hang without any reset. |
| */ |
| abort(); |
| } |
| } |
| |
| static enum pipe_reset_status |
| amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only, |
| bool *needs_reset, bool *reset_completed) |
| { |
| struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; |
| |
| if (needs_reset) |
| *needs_reset = false; |
| if (reset_completed) |
| *reset_completed = false; |
| |
| /* Return a failure due to a GPU hang. */ |
| uint64_t flags; |
| |
| if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) { |
| /* If the caller is only interested in full resets (= wants to ignore soft |
| * recoveries), we can use the SW reset status as a quick first check. |
| */ |
| return PIPE_NO_RESET; |
| } |
| |
| /* |
| * ctx->sw_status is updated on alloc/ioctl failures. |
| * |
| * We only rely on amdgpu_cs_query_reset_state2 to tell us |
| * that the context reset is complete. |
| */ |
| if (ctx->sw_status != PIPE_NO_RESET) { |
| int r = ac_drm_cs_query_reset_state2(ctx->aws->dev, ctx->ctx_handle, &flags); |
| if (!r) { |
| if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) { |
| if (reset_completed) { |
| /* The ARB_robustness spec says: |
| * |
| * If a reset status other than NO_ERROR is returned and subsequent |
| * calls return NO_ERROR, the context reset was encountered and |
| * completed. If a reset status is repeatedly returned, the context may |
| * be in the process of resetting. |
| * |
| * Starting with drm_minor >= 54 amdgpu reports if the reset is complete, |
| * so don't do anything special. On older kernels, submit a no-op cs. If it |
| * succeeds then assume the reset is complete. |
| */ |
| if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS)) |
| *reset_completed = true; |
| |
| if (ctx->aws->info.drm_minor < 54 && ctx->aws->info.has_graphics) |
| *reset_completed = amdgpu_submit_gfx_nop(ctx) == 0; |
| } |
| } |
| } else { |
| fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r); |
| } |
| |
| /* Return a failure due to SW issues. */ |
| if (needs_reset) |
| *needs_reset = true; |
| return ctx->sw_status; |
| } |
| |
| if (needs_reset) |
| *needs_reset = false; |
| return PIPE_NO_RESET; |
| } |
| |
| /* COMMAND SUBMISSION */ |
| |
| static bool amdgpu_cs_has_user_fence(struct amdgpu_cs *acs) |
| { |
| return acs->ip_type == AMD_IP_GFX || |
| acs->ip_type == AMD_IP_COMPUTE || |
| acs->ip_type == AMD_IP_SDMA; |
| } |
| |
| static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *cs) |
| { |
| if (cs->has_chaining) |
| return 4; /* for chaining */ |
| |
| return 0; |
| } |
| |
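| /* Look up a BO in the given buffer list using the per-CS hash list for O(1) lookups, |
| * falling back to a linear search on hash collisions. |
| */ |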
| static struct amdgpu_cs_buffer * |
| amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo, |
| struct amdgpu_buffer_list *list) |
| { |
| int num_buffers = list->num_buffers; |
| struct amdgpu_cs_buffer *buffers = list->buffers; |
| unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1); |
| int i = cs->buffer_indices_hashlist[hash]; |
| |
| /* A negative index means the buffer is not in the hash list. */ |
| if (i < 0) |
| return NULL; |
| |
| if (i < num_buffers && buffers[i].bo == bo) |
| return &buffers[i]; |
| |
| /* Hash collision, look for the BO in the list of buffers linearly. */ |
| for (int i = num_buffers - 1; i >= 0; i--) { |
| if (buffers[i].bo == bo) { |
| /* Put this buffer in the hash list. |
| * This will prevent additional hash collisions if there are |
| * several consecutive lookup_buffer calls for the same buffer. |
| * |
| * Example: Assuming buffers A,B,C collide in the hash list, |
| * the following sequence of buffers: |
| * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC |
| * will collide here: ^ and here: ^, |
| * meaning that we should get very few collisions in the end. */ |
| cs->buffer_indices_hashlist[hash] = i & 0x7fff; |
| return &buffers[i]; |
| } |
| } |
| return NULL; |
| } |
| |
| struct amdgpu_cs_buffer * |
| amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo) |
| { |
| return amdgpu_lookup_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)]); |
| } |
| |
| static struct amdgpu_cs_buffer * |
| amdgpu_do_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo, |
| struct amdgpu_buffer_list *list, bool add_ref) |
| { |
| /* New buffer, check if the backing array is large enough. */ |
| if (unlikely(list->num_buffers >= list->max_buffers)) { |
| unsigned new_max = |
| MAX2(list->max_buffers + 16, (unsigned)(list->max_buffers * 1.3)); |
| struct amdgpu_cs_buffer *new_buffers; |
| |
| new_buffers = (struct amdgpu_cs_buffer *) |
| REALLOC(list->buffers, list->max_buffers * sizeof(*new_buffers), |
| new_max * sizeof(*new_buffers)); |
| if (!new_buffers) { |
| fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n"); |
| return NULL; |
| } |
| |
| list->max_buffers = new_max; |
| list->buffers = new_buffers; |
| } |
| |
| unsigned idx = list->num_buffers++; |
| struct amdgpu_cs_buffer *buffer = &list->buffers[idx]; |
| if (add_ref) |
| p_atomic_inc(&bo->base.reference.count); |
| buffer->bo = bo; |
| buffer->usage = 0; |
| |
| unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1); |
| cs->buffer_indices_hashlist[hash] = idx & 0x7fff; |
| return buffer; |
| } |
| |
| static struct amdgpu_cs_buffer * |
| amdgpu_lookup_or_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo, |
| struct amdgpu_buffer_list *list, bool add_ref) |
| { |
| struct amdgpu_cs_buffer *buffer = amdgpu_lookup_buffer(cs, bo, list); |
| |
| return buffer ? buffer : amdgpu_do_add_buffer(cs, bo, list, add_ref); |
| } |
| |
| static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs, |
| struct pb_buffer_lean *buf, |
| unsigned usage, |
| enum radeon_bo_domain domains) |
| { |
| /* Don't use the "domains" parameter. Amdgpu doesn't support changing |
| * the buffer placement during command submission. |
| */ |
| struct amdgpu_cs_context *cs = (struct amdgpu_cs_context*)rcs->csc; |
| struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; |
| struct amdgpu_cs_buffer *buffer; |
| |
| /* Fast exit for no-op calls. |
| * This is very effective with suballocators and linear uploaders that |
| * are outside of the winsys. |
| */ |
| if (bo == cs->last_added_bo && |
| (usage & cs->last_added_bo_usage) == usage) |
| return 0; |
| |
| buffer = amdgpu_lookup_or_add_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)], true); |
| if (!buffer) |
| return 0; |
| |
| buffer->usage |= usage; |
| |
| cs->last_added_bo_usage = buffer->usage; |
| cs->last_added_bo = bo; |
| return 0; |
| } |
| |
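| /* Allocate and map a new backing buffer for IBs. The size is derived from the maximum IB |
| * size seen so far and the maximum cs_check_space request, clamped to the INDIRECT_BUFFER |
| * size limit. |
| */ |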
| static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *aws, |
| struct amdgpu_ib *main_ib, |
| struct amdgpu_cs *cs) |
| { |
| struct pb_buffer_lean *pb; |
| uint8_t *mapped; |
| unsigned buffer_size; |
| |
| /* Always create a buffer that is at least as large as the maximum seen IB size, |
| * aligned to a power of two. |
| */ |
| buffer_size = util_next_power_of_two(main_ib->max_ib_bytes); |
| |
| /* Multiply by 4 to reduce internal fragmentation if chaining is not available. */ |
| if (!cs->has_chaining) |
| buffer_size *= 4; |
| |
| const unsigned min_size = MAX2(main_ib->max_check_space_size, 32 * 1024); |
| /* This is the maximum size that fits into the INDIRECT_BUFFER packet. */ |
| const unsigned max_size = 2 * 1024 * 1024; |
| |
| buffer_size = MIN2(buffer_size, max_size); |
| buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */ |
| |
| /* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU. |
| * The speed of writing to GTT WC ranges from no difference to very slow, while |
| * VRAM is very slow much more often. |
| * |
| * Bypass GL2 because command buffers are read only once. Bypassing GL2 has better latency |
| * and doesn't have to wait for cached GL2 requests to be processed. |
| */ |
| enum radeon_bo_domain domain = RADEON_DOMAIN_GTT; |
| unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING | |
| RADEON_FLAG_GL2_BYPASS; |
| |
| if (cs->ip_type == AMD_IP_GFX || |
| cs->ip_type == AMD_IP_COMPUTE || |
| cs->ip_type == AMD_IP_SDMA) { |
| /* Avoids hangs with "rendercheck -t cacomposite -f a8r8g8b8" via glamor |
| * on Navi 14 |
| */ |
| flags |= RADEON_FLAG_32BIT; |
| } |
| |
| pb = amdgpu_bo_create(aws, buffer_size, |
| aws->info.gart_page_size, |
| domain, (radeon_bo_flag)flags); |
| if (!pb) |
| return false; |
| |
| mapped = (uint8_t*)amdgpu_bo_map(&aws->dummy_sws.base, pb, NULL, PIPE_MAP_WRITE); |
| if (!mapped) { |
| radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL); |
| return false; |
| } |
| |
| radeon_bo_reference(&aws->dummy_sws.base, &main_ib->big_buffer, pb); |
| radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL); |
| |
| main_ib->gpu_address = amdgpu_bo_get_va(main_ib->big_buffer); |
| main_ib->big_buffer_cpu_ptr = mapped; |
| main_ib->used_ib_space = 0; |
| |
| return true; |
| } |
| |
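| /* Point rcs at fresh space in the big IB buffer and initialize the main IB chunk, |
| * allocating a new backing buffer if the current one is exhausted. |
| */ |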
| static bool amdgpu_get_new_ib(struct amdgpu_winsys *aws, |
| struct radeon_cmdbuf *rcs, |
| struct amdgpu_ib *main_ib, |
| struct amdgpu_cs *cs) |
| { |
| struct drm_amdgpu_cs_chunk_ib *chunk_ib = &cs->csc->chunk_ib[IB_MAIN]; |
| /* This is the minimum size of a contiguous IB. */ |
| unsigned ib_size = 16 * 1024; |
| |
| /* Always allocate at least the size of the biggest cs_check_space call, |
| * because precisely the last call might have requested this size. |
| */ |
| ib_size = MAX2(ib_size, main_ib->max_check_space_size); |
| |
| if (!cs->has_chaining) { |
| ib_size = MAX2(ib_size, MIN2(util_next_power_of_two(main_ib->max_ib_bytes), |
| IB_MAX_SUBMIT_BYTES)); |
| } |
| |
| /* Decay the IB buffer size over time, so that memory usage decreases after |
| * a temporary peak. |
| */ |
| main_ib->max_ib_bytes = main_ib->max_ib_bytes - main_ib->max_ib_bytes / 32; |
| |
| rcs->prev_dw = 0; |
| rcs->num_prev = 0; |
| rcs->current.cdw = 0; |
| rcs->current.buf = NULL; |
| |
| /* Allocate a new buffer for IBs if the current buffer is all used. */ |
| if (!main_ib->big_buffer || |
| main_ib->used_ib_space + ib_size > main_ib->big_buffer->size) { |
| if (!amdgpu_ib_new_buffer(aws, main_ib, cs)) |
| return false; |
| } |
| |
| chunk_ib->va_start = main_ib->gpu_address + main_ib->used_ib_space; |
| chunk_ib->ib_bytes = 0; |
| /* ib_bytes is in dwords and the conversion to bytes will be done before |
| * the CS ioctl. */ |
| main_ib->ptr_ib_size = &chunk_ib->ib_bytes; |
| main_ib->is_chained_ib = false; |
| |
| amdgpu_cs_add_buffer(rcs, main_ib->big_buffer, |
| (radeon_bo_flag)(RADEON_USAGE_READ | RADEON_PRIO_IB), |
| (radeon_bo_domain)0); |
| |
| rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space); |
| |
| cs->csc->ib_main_addr = rcs->current.buf; |
| |
| ib_size = main_ib->big_buffer->size - main_ib->used_ib_space; |
| rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs); |
| return true; |
| } |
| |
| static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib) |
| { |
| if (ib->is_chained_ib) { |
| *ib->ptr_ib_size = rcs->current.cdw | |
| S_3F2_CHAIN(1) | S_3F2_VALID(1) | |
| S_3F2_PRE_ENA(((struct amdgpu_cs*)ib)->preamble_ib_bo != NULL); |
| } else { |
| *ib->ptr_ib_size = rcs->current.cdw; |
| } |
| } |
| |
| static void amdgpu_ib_finalize(struct amdgpu_winsys *aws, struct radeon_cmdbuf *rcs, |
| struct amdgpu_ib *ib, enum amd_ip_type ip_type) |
| { |
| amdgpu_set_ib_size(rcs, ib); |
| ib->used_ib_space += rcs->current.cdw * 4; |
| ib->used_ib_space = align(ib->used_ib_space, aws->info.ip[ip_type].ib_alignment); |
| ib->max_ib_bytes = MAX2(ib->max_ib_bytes, (rcs->prev_dw + rcs->current.cdw) * 4); |
| } |
| |
| static bool amdgpu_init_cs_context(struct amdgpu_winsys *aws, |
| struct amdgpu_cs_context *cs, |
| enum amd_ip_type ip_type) |
| { |
| for (unsigned i = 0; i < ARRAY_SIZE(cs->chunk_ib); i++) { |
| cs->chunk_ib[i].ip_type = ip_type; |
| cs->chunk_ib[i].flags = 0; |
| |
| if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) { |
| /* The kernel shouldn't invalidate L2 and vL1. The proper place for cache invalidation |
| * is the beginning of IBs because completion of an IB doesn't care about the state of |
| * GPU caches, only the beginning of an IB does. Draw calls from multiple IBs can be |
| * executed in parallel, so draw calls from the current IB can finish after the next IB |
| * starts drawing, and so the cache flush at the end of IBs is usually late and thus |
| * useless. |
| */ |
| cs->chunk_ib[i].flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE; |
| } |
| } |
| |
| cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE; |
| cs->last_added_bo = NULL; |
| return true; |
| } |
| |
| static void cleanup_fence_list(struct amdgpu_fence_list *fences) |
| { |
| for (unsigned i = 0; i < fences->num; i++) |
| amdgpu_fence_drop_reference(fences->list[i]); |
| fences->num = 0; |
| } |
| |
| static void amdgpu_cs_context_cleanup_buffers(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs) |
| { |
| for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) { |
| struct amdgpu_cs_buffer *buffers = cs->buffer_lists[i].buffers; |
| unsigned num_buffers = cs->buffer_lists[i].num_buffers; |
| |
| for (unsigned j = 0; j < num_buffers; j++) |
| amdgpu_winsys_bo_drop_reference(aws, buffers[j].bo); |
| |
| cs->buffer_lists[i].num_buffers = 0; |
| } |
| } |
| |
| static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs) |
| { |
| cs->seq_no_dependencies.valid_fence_mask = 0; |
| cleanup_fence_list(&cs->syncobj_dependencies); |
| cleanup_fence_list(&cs->syncobj_to_signal); |
| amdgpu_fence_reference(&cs->fence, NULL); |
| cs->last_added_bo = NULL; |
| } |
| |
| static void amdgpu_destroy_cs_context(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs) |
| { |
| amdgpu_cs_context_cleanup_buffers(aws, cs); |
| amdgpu_cs_context_cleanup(aws, cs); |
| for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) |
| FREE(cs->buffer_lists[i].buffers); |
| FREE(cs->syncobj_dependencies.list); |
| FREE(cs->syncobj_to_signal.list); |
| } |
| |
| |
| static enum amd_ip_type amdgpu_cs_get_ip_type(struct radeon_cmdbuf *rcs) |
| { |
| struct amdgpu_cs *cs = amdgpu_cs(rcs); |
| return cs->ip_type; |
| } |
| |
| static bool ip_uses_alt_fence(enum amd_ip_type ip_type) |
| { |
| /* The alt_fence path can be tested thoroughly by enabling it for GFX here. */ |
| return ip_type == AMD_IP_VCN_DEC || |
| ip_type == AMD_IP_VCN_ENC || |
| ip_type == AMD_IP_VCN_JPEG; |
| } |
| |
| static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs) |
| { |
| struct amdgpu_cs *cs = amdgpu_cs(rcs); |
| |
| if (!cs) |
| return; |
| |
| amdgpu_cs_sync_flush(rcs); |
| util_queue_fence_destroy(&cs->flush_completed); |
| p_atomic_dec(&cs->aws->num_cs); |
| radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->preamble_ib_bo, NULL); |
| radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->main_ib.big_buffer, NULL); |
| FREE(rcs->prev); |
| amdgpu_destroy_cs_context(cs->aws, &cs->csc1); |
| amdgpu_destroy_cs_context(cs->aws, &cs->csc2); |
| amdgpu_fence_reference(&cs->next_fence, NULL); |
| FREE(cs); |
| } |
| |
| static bool |
| amdgpu_cs_create(struct radeon_cmdbuf *rcs, |
| struct radeon_winsys_ctx *rwctx, |
| enum amd_ip_type ip_type, |
| void (*flush)(void *ctx, unsigned flags, |
| struct pipe_fence_handle **fence), |
| void *flush_ctx) |
| { |
| struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; |
| struct amdgpu_cs *cs; |
| |
| cs = CALLOC_STRUCT(amdgpu_cs); |
| if (!cs) { |
| return false; |
| } |
| |
| util_queue_fence_init(&cs->flush_completed); |
| |
| cs->aws = ctx->aws; |
| cs->ctx = ctx; |
| cs->flush_cs = flush; |
| cs->flush_data = flush_ctx; |
| cs->ip_type = ip_type; |
| cs->noop = ctx->aws->noop_cs; |
| cs->has_chaining = ctx->aws->info.gfx_level >= GFX7 && |
| (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE); |
| |
| /* Compute the queue index by counting the IPs that have queues. */ |
| assert(ip_type < ARRAY_SIZE(ctx->aws->info.ip)); |
| assert(ctx->aws->info.ip[ip_type].num_queues); |
| |
| if (ip_uses_alt_fence(ip_type)) { |
| cs->queue_index = INT_MAX; |
| cs->uses_alt_fence = true; |
| } else { |
| cs->queue_index = 0; |
| |
| for (unsigned i = 0; i < ARRAY_SIZE(ctx->aws->info.ip); i++) { |
| if (!ctx->aws->info.ip[i].num_queues || ip_uses_alt_fence((amd_ip_type)i)) |
| continue; |
| |
| if (i == ip_type) |
| break; |
| |
| cs->queue_index++; |
| } |
| assert(cs->queue_index < AMDGPU_MAX_QUEUES); |
| } |
| |
| ac_drm_cs_chunk_fence_info_to_data(cs->ctx->user_fence_bo_kms_handle, cs->ip_type * 4, |
| (struct drm_amdgpu_cs_chunk_data*)&cs->fence_chunk); |
| |
| if (!amdgpu_init_cs_context(ctx->aws, &cs->csc1, ip_type)) { |
| FREE(cs); |
| return false; |
| } |
| |
| if (!amdgpu_init_cs_context(ctx->aws, &cs->csc2, ip_type)) { |
| amdgpu_destroy_cs_context(ctx->aws, &cs->csc1); |
| FREE(cs); |
| return false; |
| } |
| |
| memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist)); |
| |
| /* Set the first submission context as current. */ |
| rcs->csc = cs->csc = &cs->csc1; |
| cs->cst = &cs->csc2; |
| |
| /* Assign to both amdgpu_cs_context; only csc will use it. */ |
| cs->csc1.buffer_indices_hashlist = cs->buffer_indices_hashlist; |
| cs->csc2.buffer_indices_hashlist = cs->buffer_indices_hashlist; |
| |
| cs->csc1.aws = ctx->aws; |
| cs->csc2.aws = ctx->aws; |
| |
| p_atomic_inc(&ctx->aws->num_cs); |
| |
| if (!amdgpu_get_new_ib(ctx->aws, rcs, &cs->main_ib, cs)) |
| goto fail; |
| |
| /* Currently only the gfx, compute and sdma IPs support user queues. */ |
| if (cs->aws->info.use_userq && ip_type <= AMD_IP_SDMA) { |
| if (!amdgpu_userq_init(cs->aws, &cs->aws->queues[cs->queue_index].userq, ip_type)) |
| goto fail; |
| } |
| |
| rcs->priv = cs; |
| return true; |
| fail: |
| amdgpu_cs_destroy(rcs); |
| return false; |
| } |
| |
| static bool |
| amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib, |
| unsigned preamble_num_dw) |
| { |
| struct amdgpu_cs *cs = amdgpu_cs(rcs); |
| struct amdgpu_winsys *aws = cs->aws; |
| struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2}; |
| unsigned size = align(preamble_num_dw * 4, aws->info.ip[AMD_IP_GFX].ib_alignment); |
| struct pb_buffer_lean *preamble_bo; |
| uint32_t *map; |
| |
| /* Create the preamble IB buffer. */ |
| preamble_bo = amdgpu_bo_create(aws, size, aws->info.ip[AMD_IP_GFX].ib_alignment, |
| RADEON_DOMAIN_VRAM, |
| (radeon_bo_flag) |
| (RADEON_FLAG_NO_INTERPROCESS_SHARING | |
| RADEON_FLAG_GTT_WC)); |
| if (!preamble_bo) |
| return false; |
| |
| map = (uint32_t*)amdgpu_bo_map(&aws->dummy_sws.base, preamble_bo, NULL, |
| (pipe_map_flags)(PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY)); |
| if (!map) { |
| radeon_bo_reference(&aws->dummy_sws.base, &preamble_bo, NULL); |
| return false; |
| } |
| |
| /* Upload the preamble IB. */ |
| memcpy(map, preamble_ib, preamble_num_dw * 4); |
| |
| /* Pad the IB. */ |
| amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, map, &preamble_num_dw, 0); |
| amdgpu_bo_unmap(&aws->dummy_sws.base, preamble_bo); |
| |
| for (unsigned i = 0; i < 2; i++) { |
| csc[i]->chunk_ib[IB_PREAMBLE].va_start = amdgpu_bo_get_va(preamble_bo); |
| csc[i]->chunk_ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4; |
| |
| csc[i]->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT; |
| } |
| |
| assert(!cs->preamble_ib_bo); |
| cs->preamble_ib_bo = preamble_bo; |
| |
| amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, |
| RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0); |
| return true; |
| } |
| |
| static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs) |
| { |
| return true; |
| } |
| |
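| /* Ensure there is space for 'dw' more dwords in the current IB. If the IB is full and the |
| * queue supports chaining, allocate a new IB buffer and chain to it with an INDIRECT_BUFFER |
| * packet; otherwise return false so that the caller flushes. |
| */ |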
| static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw) |
| { |
| struct amdgpu_cs *cs = amdgpu_cs(rcs); |
| struct amdgpu_ib *main_ib = &cs->main_ib; |
| |
| assert(rcs->current.cdw <= rcs->current.max_dw); |
| |
| unsigned projected_size_dw = rcs->prev_dw + rcs->current.cdw + dw; |
| |
| if (projected_size_dw * 4 > IB_MAX_SUBMIT_BYTES) |
| return false; |
| |
| if (rcs->current.max_dw - rcs->current.cdw >= dw) |
| return true; |
| |
| unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs); |
| unsigned need_byte_size = (dw + cs_epilog_dw) * 4; |
| /* 125% of the size for IB epilog. */ |
| unsigned safe_byte_size = need_byte_size + need_byte_size / 4; |
| main_ib->max_check_space_size = MAX2(main_ib->max_check_space_size, safe_byte_size); |
| main_ib->max_ib_bytes = MAX2(main_ib->max_ib_bytes, projected_size_dw * 4); |
| |
| if (!cs->has_chaining) |
| return false; |
| |
| /* Allocate a new chunk */ |
| if (rcs->num_prev >= rcs->max_prev) { |
| unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev); |
| struct radeon_cmdbuf_chunk *new_prev; |
| |
| new_prev = (struct radeon_cmdbuf_chunk*) |
| REALLOC(rcs->prev, sizeof(*new_prev) * rcs->max_prev, |
| sizeof(*new_prev) * new_max_prev); |
| if (!new_prev) |
| return false; |
| |
| rcs->prev = new_prev; |
| rcs->max_prev = new_max_prev; |
| } |
| |
| if (!amdgpu_ib_new_buffer(cs->aws, main_ib, cs)) |
| return false; |
| |
| assert(main_ib->used_ib_space == 0); |
| uint64_t va = main_ib->gpu_address; |
| |
| /* This space was originally reserved. */ |
| rcs->current.max_dw += cs_epilog_dw; |
| |
| /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */ |
| amdgpu_pad_gfx_compute_ib(cs->aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 4); |
| |
| radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)); |
| radeon_emit(rcs, va); |
| radeon_emit(rcs, va >> 32); |
| uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++]; |
| |
| assert((rcs->current.cdw & cs->aws->info.ip[cs->ip_type].ib_pad_dw_mask) == 0); |
| assert(rcs->current.cdw <= rcs->current.max_dw); |
| |
| amdgpu_set_ib_size(rcs, main_ib); |
| main_ib->ptr_ib_size = new_ptr_ib_size; |
| main_ib->is_chained_ib = true; |
| |
| /* Hook up the new chunk */ |
| rcs->prev[rcs->num_prev].buf = rcs->current.buf; |
| rcs->prev[rcs->num_prev].cdw = rcs->current.cdw; |
| rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */ |
| rcs->num_prev++; |
| |
| rcs->prev_dw += rcs->current.cdw; |
| rcs->current.cdw = 0; |
| |
| rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space); |
| rcs->current.max_dw = main_ib->big_buffer->size / 4 - cs_epilog_dw; |
| |
| amdgpu_cs_add_buffer(rcs, main_ib->big_buffer, |
| RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0); |
| |
| return true; |
| } |
| |
| static void amdgpu_add_slab_backing_buffers(struct amdgpu_cs_context *cs) |
| { |
| unsigned num_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers; |
| struct amdgpu_cs_buffer *buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers; |
| |
| for (unsigned i = 0; i < num_buffers; i++) { |
| struct amdgpu_cs_buffer *slab_buffer = &buffers[i]; |
| struct amdgpu_cs_buffer *real_buffer = |
| amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(slab_buffer->bo)->b, |
| &cs->buffer_lists[AMDGPU_BO_REAL], true); |
| |
| /* We need to set the usage because it determines the BO priority. |
| * |
| * Mask out the SYNCHRONIZED flag because the backing buffer of slabs shouldn't add its |
| * BO fences to fence dependencies. Only the slab entries should do that. |
| */ |
| real_buffer->usage |= slab_buffer->usage & ~RADEON_USAGE_SYNCHRONIZED; |
| } |
| } |
| |
| static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs, |
| struct radeon_bo_list_item *list) |
| { |
| struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc; |
| |
| /* We do this in the CS thread, but since we need to return the final usage of all buffers |
| * here, do it here too. There is no harm in doing it again in the CS thread. |
| */ |
| amdgpu_add_slab_backing_buffers(cs); |
| |
| struct amdgpu_buffer_list *real_buffers = &cs->buffer_lists[AMDGPU_BO_REAL]; |
| unsigned num_real_buffers = real_buffers->num_buffers; |
| |
| #if HAVE_AMDGPU_VIRTIO |
| assert(!cs->aws->info.is_virtio); |
| #endif |
| |
| if (list) { |
| for (unsigned i = 0; i < num_real_buffers; i++) { |
| list[i].bo_size = real_buffers->buffers[i].bo->base.size; |
| list[i].vm_address = |
| amdgpu_va_get_start_addr(get_real_bo(real_buffers->buffers[i].bo)->va_handle); |
| list[i].priority_usage = real_buffers->buffers[i].usage; |
| } |
| } |
| return num_real_buffers; |
| } |
| |
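| /* Append a fence to a fence list, growing the backing array in increments of 8 elements. */ |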
| static void add_fence_to_list(struct amdgpu_fence_list *fences, |
| struct amdgpu_fence *fence) |
| { |
| unsigned idx = fences->num++; |
| |
| if (idx >= fences->max) { |
| unsigned size; |
| const unsigned increment = 8; |
| |
| fences->max = idx + increment; |
| size = fences->max * sizeof(fences->list[0]); |
| fences->list = (struct pipe_fence_handle**)realloc(fences->list, size); |
| } |
| amdgpu_fence_set_reference(&fences->list[idx], (struct pipe_fence_handle*)fence); |
| } |
| |
| static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rcs, |
| struct pipe_fence_handle *pfence) |
| { |
| struct amdgpu_cs *acs = amdgpu_cs(rcs); |
| struct amdgpu_cs_context *cs = acs->csc; |
| struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence; |
| |
| util_queue_fence_wait(&fence->submitted); |
| |
| if (!fence->imported) { |
| /* Ignore idle fences. This will only check the user fence in memory. */ |
| if (!amdgpu_fence_wait((struct pipe_fence_handle *)fence, 0, false)) { |
| add_seq_no_to_list(acs->aws, &cs->seq_no_dependencies, fence->queue_index, |
| fence->queue_seq_no); |
| } |
| } |
| else |
| add_fence_to_list(&cs->syncobj_dependencies, fence); |
| } |
| |
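| /* For a synchronized BO, add its fences from all queues other than the current one, plus |
| * its alt_fence if any, as dependencies of the current submission. |
| */ |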
| static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws, |
| struct amdgpu_cs_context *cs, |
| unsigned queue_index_bit, |
| struct amdgpu_seq_no_fences *dependencies, |
| struct amdgpu_winsys_bo *bo, unsigned usage) |
| { |
| if (usage & RADEON_USAGE_SYNCHRONIZED) { |
| /* Add BO fences from queues other than 'queue_index' to dependencies. */ |
| u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~queue_index_bit) { |
| add_seq_no_to_list(ws, dependencies, other_queue_idx, |
| bo->fences.seq_no[other_queue_idx]); |
| } |
| |
| if (bo->alt_fence) |
| add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)bo->alt_fence); |
| } |
| } |
| |
| static void amdgpu_set_bo_seq_no(unsigned queue_index, struct amdgpu_winsys_bo *bo, |
| uint_seq_no new_queue_seq_no) |
| { |
| bo->fences.seq_no[queue_index] = new_queue_seq_no; |
| bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index); |
| } |
| |
| static void amdgpu_add_to_kernel_bo_list(struct drm_amdgpu_bo_list_entry *bo_entry, |
| struct amdgpu_winsys_bo *bo, unsigned usage) |
| { |
| bo_entry->bo_handle = get_real_bo(bo)->kms_handle; |
| bo_entry->bo_priority = (util_last_bit(usage & RADEON_ALL_PRIORITIES) - 1) / 2; |
| } |
| |
| static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws, |
| struct pipe_fence_handle *fence) |
| { |
| struct amdgpu_cs *acs = amdgpu_cs(rws); |
| struct amdgpu_cs_context *cs = acs->csc; |
| |
| add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence); |
| } |
| |
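| /* Submit the IB through the kernel queue CS ioctl. The submission is described by a list of |
| * chunks: the BO list, syncobj dependencies and signals, the optional FW shadow and user |
| * fence chunks, and the preamble and main IB chunks. |
| */ |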
| static int amdgpu_cs_submit_ib_kernelq(struct amdgpu_cs *acs, |
| unsigned num_real_buffers, |
| struct drm_amdgpu_bo_list_entry *bo_list_real, |
| uint64_t *seq_no) |
| { |
| struct amdgpu_winsys *aws = acs->aws; |
| struct amdgpu_cs_context *cs = acs->cst; |
| struct drm_amdgpu_bo_list_in bo_list_in; |
| struct drm_amdgpu_cs_chunk chunks[8]; |
| unsigned num_chunks = 0; |
| |
| /* BO list */ |
| bo_list_in.operation = ~0; |
| bo_list_in.list_handle = ~0; |
| bo_list_in.bo_number = num_real_buffers; |
| bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry); |
| bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)bo_list_real; |
| |
| chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES; |
| chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4; |
| chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in; |
| num_chunks++; |
| |
| /* Syncobj dependencies. */ |
| unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num; |
| if (num_syncobj_dependencies) { |
| struct drm_amdgpu_cs_chunk_sem *sem_chunk = |
| (struct drm_amdgpu_cs_chunk_sem *) |
| alloca(num_syncobj_dependencies * sizeof(sem_chunk[0])); |
| |
| for (unsigned i = 0; i < num_syncobj_dependencies; i++) { |
| struct amdgpu_fence *fence = |
| (struct amdgpu_fence*)cs->syncobj_dependencies.list[i]; |
| |
| assert(util_queue_fence_is_signalled(&fence->submitted)); |
| sem_chunk[i].handle = fence->syncobj; |
| } |
| |
| chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN; |
| chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies; |
| chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk; |
| num_chunks++; |
| } |
| |
| /* Syncobj signals. */ |
| unsigned num_syncobj_to_signal = 1 + cs->syncobj_to_signal.num; |
| struct drm_amdgpu_cs_chunk_sem *sem_chunk = |
| (struct drm_amdgpu_cs_chunk_sem *) |
| alloca(num_syncobj_to_signal * sizeof(sem_chunk[0])); |
| |
| for (unsigned i = 0; i < num_syncobj_to_signal - 1; i++) { |
| struct amdgpu_fence *fence = |
| (struct amdgpu_fence*)cs->syncobj_to_signal.list[i]; |
| |
| sem_chunk[i].handle = fence->syncobj; |
| } |
| sem_chunk[cs->syncobj_to_signal.num].handle = ((struct amdgpu_fence*)cs->fence)->syncobj; |
| |
| chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT; |
| chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_to_signal; |
| chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk; |
| num_chunks++; |
| |
| if (aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.shadow_va) { |
| chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_CP_GFX_SHADOW; |
| chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_cp_gfx_shadow) / 4; |
| chunks[num_chunks].chunk_data = (uintptr_t)&acs->mcbp_fw_shadow_chunk; |
| num_chunks++; |
| } |
| |
| /* Fence */ |
| if (amdgpu_cs_has_user_fence(acs)) { |
| chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE; |
| chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4; |
| chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk; |
| num_chunks++; |
| } |
| |
| /* IB */ |
| if (cs->chunk_ib[IB_PREAMBLE].ib_bytes) { |
| chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB; |
| chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4; |
| chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_PREAMBLE]; |
| num_chunks++; |
| } |
| |
| /* IB */ |
| chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB; |
| chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4; |
| chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_MAIN]; |
| num_chunks++; |
| |
| if (cs->secure) { |
| cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE; |
| cs->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE; |
| } else { |
| cs->chunk_ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE; |
| cs->chunk_ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE; |
| } |
| |
| assert(num_chunks <= 8); |
| |
| /* Submit the command buffer. |
| * |
| * The kernel returns -ENOMEM with many parallel processes using GDS such as test suites |
| * quite often, but it eventually succeeds after enough attempts. This happens frequently |
| * with dEQP using NGG streamout. |
| */ |
| int r = 0; |
| |
| do { |
| /* Wait 1 ms and try again. */ |
| if (r == -ENOMEM) |
| os_time_sleep(1000); |
| |
| r = ac_drm_cs_submit_raw2(aws->dev, acs->ctx->ctx_handle, 0, num_chunks, chunks, seq_no); |
| } while (r == -ENOMEM); |
| |
| return r; |
| } |
| |
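| /* Write the packets for one user queue submission into the queue ring: FENCE_WAIT_MULTI |
| * packets for the fences returned by the userq_wait ioctl, an HDP flush, the INDIRECT_BUFFER |
| * packet pointing at the main IB, a RELEASE_MEM packet that writes the user fence, and a |
| * protected fence signal packet. |
| */ |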
| static void amdgpu_cs_add_userq_packets(struct amdgpu_userq *userq, |
| struct amdgpu_cs_context *cs, |
| uint64_t num_fences, |
| struct drm_amdgpu_userq_fence_info *fence_info) |
| { |
| amdgpu_pkt_begin(); |
| |
| if (userq->ip_type == AMD_IP_GFX || userq->ip_type == AMD_IP_COMPUTE) { |
| if (num_fences) { |
| unsigned num_fences_in_iter; |
| /* The FENCE_WAIT_MULTI packet supports at most 32 fences. */ |
| for (unsigned i = 0; i < num_fences; i = i + 32) { |
| num_fences_in_iter = (i + 32 > num_fences) ? num_fences - i : 32; |
| amdgpu_pkt_add_dw(PKT3(PKT3_FENCE_WAIT_MULTI, num_fences_in_iter * 4, 0)); |
| amdgpu_pkt_add_dw(S_D10_ENGINE_SEL(1) | S_D10_POLL_INTERVAL(4) | S_D10_PREEMPTABLE(1)); |
| for (unsigned j = 0; j < num_fences_in_iter; j++) { |
| amdgpu_pkt_add_dw(fence_info[i + j].va); |
| amdgpu_pkt_add_dw(fence_info[i + j].va >> 32); |
| amdgpu_pkt_add_dw(fence_info[i + j].value); |
| amdgpu_pkt_add_dw(fence_info[i + j].value >> 32); |
| } |
| } |
| } |
| |
| amdgpu_pkt_add_dw(PKT3(PKT3_HDP_FLUSH, 0, 0)); |
| amdgpu_pkt_add_dw(0x0); |
| |
| amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0)); |
| amdgpu_pkt_add_dw(cs->chunk_ib[IB_MAIN].va_start); |
| amdgpu_pkt_add_dw(cs->chunk_ib[IB_MAIN].va_start >> 32); |
| if (userq->ip_type == AMD_IP_GFX) |
| amdgpu_pkt_add_dw((cs->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_INHERIT_VMID_MQD_GFX(1)); |
| else |
| amdgpu_pkt_add_dw((cs->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_VALID_COMPUTE(1) | |
| S_3F3_INHERIT_VMID_MQD_COMPUTE(1)); |
| |
| /* Add 8 dwords for the RELEASE_MEM packet and 2 for the protected fence signal packet. |
| * Calculating userq_fence_seq_num this way matches the kernel fence value that is |
| * returned by the userq_wait ioctl. |
| */ |
| userq->user_fence_seq_num = *userq->wptr_bo_map + __num_dw_written + 8 + 2; |
| |
| /* Add a RELEASE_MEM packet that writes the user fence. */ |
| amdgpu_pkt_add_dw(PKT3(PKT3_RELEASE_MEM, 6, 0)); |
| amdgpu_pkt_add_dw(S_490_EVENT_TYPE(V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT) | |
| S_490_EVENT_INDEX(5) | S_490_GLM_WB(1) | S_490_GLM_INV(1) | |
| S_490_GL2_WB(1) | S_490_SEQ(1) | S_490_CACHE_POLICY(3)); |
| amdgpu_pkt_add_dw(S_030358_DATA_SEL(2)); |
| amdgpu_pkt_add_dw(userq->user_fence_va); |
| amdgpu_pkt_add_dw(userq->user_fence_va >> 32); |
| amdgpu_pkt_add_dw(userq->user_fence_seq_num); |
| amdgpu_pkt_add_dw(userq->user_fence_seq_num >> 32); |
| amdgpu_pkt_add_dw(0); |
| |
| /* Protected fence signal packet. This is a trusted RELEASE_MEM packet, i.e. the fence |
| * buffer is only accessible from the kernel through VMID 0. |
| */ |
| amdgpu_pkt_add_dw(PKT3(PKT3_PROTECTED_FENCE_SIGNAL, 0, 0)); |
| amdgpu_pkt_add_dw(0); |
| } else { |
| fprintf(stderr, "amdgpu: unsupported userq ip submission = %d\n", userq->ip_type); |
| } |
| |
| amdgpu_pkt_end(); |
| } |
| |
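| /* Submit the IB through a user queue: wait for the fences of shared BOs and syncobj |
| * dependencies with the userq_wait ioctl, write the submission packets to the ring, register |
| * the signalled syncobjs and shared BOs with the userq_signal ioctl, and ring the doorbell. |
| */ |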
| static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq, |
| struct amdgpu_cs *acs, |
| uint32_t *shared_buf_kms_handles_write, |
| unsigned num_shared_buf_write, |
| uint32_t *shared_buf_kms_handles_read, |
| unsigned num_shared_buf_read, |
| uint64_t *seq_no, |
| uint64_t vm_timeline_point) |
| { |
| int r = 0; |
| struct amdgpu_winsys *aws = acs->aws; |
| struct amdgpu_cs_context *cs = acs->cst; |
| |
| /* Syncobj dependencies. */ |
| unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num; |
| uint32_t *syncobj_dependencies_list = |
| (uint32_t*)alloca(num_syncobj_dependencies * sizeof(uint32_t)); |
| |
| /* Currently only 1 vm timeline syncobj can be a dependency. */ |
| uint16_t num_syncobj_timeline_dependencies = 1; |
| uint32_t syncobj_timeline_dependency; |
| uint64_t syncobj_timeline_dependency_point; |
| |
| if (num_syncobj_dependencies) { |
| for (unsigned i = 0; i < num_syncobj_dependencies; i++) { |
| struct amdgpu_fence *fence = |
| (struct amdgpu_fence*)cs->syncobj_dependencies.list[i]; |
| |
| assert(util_queue_fence_is_signalled(&fence->submitted)); |
| syncobj_dependencies_list[i] = fence->syncobj; |
| } |
| } |
| syncobj_timeline_dependency = aws->vm_timeline_syncobj; |
| syncobj_timeline_dependency_point = vm_timeline_point; |
| |
| /* Syncobj signals. Adding 1 for cs submission fence. */ |
| unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num + 1; |
| uint32_t *syncobj_signal_list = |
| (uint32_t*)alloca(num_syncobj_to_signal * sizeof(uint32_t)); |
| |
| for (unsigned i = 0; i < cs->syncobj_to_signal.num; i++) { |
| struct amdgpu_fence *fence = |
| (struct amdgpu_fence*)cs->syncobj_to_signal.list[i]; |
| |
| syncobj_signal_list[i] = fence->syncobj; |
| } |
| syncobj_signal_list[num_syncobj_to_signal - 1] = ((struct amdgpu_fence*)cs->fence)->syncobj; |
| |
| struct drm_amdgpu_userq_fence_info *fence_info; |
| struct drm_amdgpu_userq_wait userq_wait_data = { |
| .syncobj_handles = (uintptr_t)syncobj_dependencies_list, |
| .syncobj_timeline_handles = (uintptr_t)&syncobj_timeline_dependency, |
| .syncobj_timeline_points = (uintptr_t)&syncobj_timeline_dependency_point, |
| .bo_read_handles = (uintptr_t)shared_buf_kms_handles_read, |
| .bo_write_handles = (uintptr_t)shared_buf_kms_handles_write, |
| .num_syncobj_timeline_handles = num_syncobj_timeline_dependencies, |
| .num_fences = 0, |
| .num_syncobj_handles = num_syncobj_dependencies, |
| .num_bo_read_handles = num_shared_buf_read, |
| .num_bo_write_handles = num_shared_buf_write, |
| .out_fences = (uintptr_t)NULL, |
| }; |
| |
| /* |
| * Synchronization of shared buffers follows these rules: |
| * - read-only buffers wait for all previous writes to complete |
| * - write (and read-write) buffers wait for all previous reads to complete |
| * To implement this strategy, we call amdgpu_userq_wait() before submitting |
| * a job and amdgpu_userq_signal() afterwards to indicate completion. |
| */ |
| r = ac_drm_userq_wait(aws->dev, &userq_wait_data); |
| if (r) |
| fprintf(stderr, "amdgpu: getting wait num_fences failed\n"); |
| |
| fence_info = (struct drm_amdgpu_userq_fence_info*) |
| alloca(userq_wait_data.num_fences * sizeof(struct drm_amdgpu_userq_fence_info)); |
| userq_wait_data.out_fences = (uintptr_t)fence_info; |
| |
| r = ac_drm_userq_wait(aws->dev, &userq_wait_data); |
| if (r) |
| fprintf(stderr, "amdgpu: getting wait fences failed\n"); |
| |
| simple_mtx_lock(&userq->lock); |
| amdgpu_cs_add_userq_packets(userq, cs, userq_wait_data.num_fences, fence_info); |
| struct drm_amdgpu_userq_signal userq_signal_data = { |
| .queue_id = userq->userq_handle, |
| .syncobj_handles = (uintptr_t)syncobj_signal_list, |
| .num_syncobj_handles = num_syncobj_to_signal, |
| .bo_read_handles = (uintptr_t)shared_buf_kms_handles_read, |
| .bo_write_handles = (uintptr_t)shared_buf_kms_handles_write, |
| .num_bo_read_handles = num_shared_buf_read, |
| .num_bo_write_handles = num_shared_buf_write, |
| }; |
| |
| r = ac_drm_userq_signal(aws->dev, &userq_signal_data); |
| if (!r) |
| userq->doorbell_bo_map[AMDGPU_USERQ_DOORBELL_INDEX] = *userq->wptr_bo_map; |
| |
| *seq_no = userq->user_fence_seq_num; |
| simple_mtx_unlock(&userq->lock); |
| |
| return r; |
| } |
| |
| enum queue_type { |
| KERNELQ, |
| KERNELQ_ALT_FENCE, |
| USERQ, |
| }; |
| |
| /* The template parameter selects the submission path. KERNELQ uses the default fence tracking |
| * based on per-queue sequence numbers, KERNELQ_ALT_FENCE skips it and instead uses and updates |
| * amdgpu_winsys_bo::alt_fence for all BOs, and USERQ submits through a user queue. |
| */ |
| template<enum queue_type queue_type> |
| static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index) |
| { |
| struct amdgpu_cs *acs = (struct amdgpu_cs*)job; |
| struct amdgpu_winsys *aws = acs->aws; |
| struct amdgpu_cs_context *cs = acs->cst; |
| int r; |
| uint64_t seq_no = 0; |
| bool has_user_fence = amdgpu_cs_has_user_fence(acs); |
| /* The maximum timeline point of VM updates for all BOs used in this submit. */ |
| uint64_t vm_timeline_point = 0; |
| |
| simple_mtx_lock(&aws->bo_fence_lock); |
| unsigned queue_index; |
| struct amdgpu_queue *queue; |
| uint_seq_no prev_seq_no, next_seq_no; |
| |
| if (queue_type != KERNELQ_ALT_FENCE) { |
| queue_index = acs->queue_index; |
| queue = &aws->queues[queue_index]; |
| prev_seq_no = queue->latest_seq_no; |
| |
| /* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno, |
| * but the values aren't related. |
| */ |
| next_seq_no = prev_seq_no + 1; |
| |
| /* Wait for the oldest fence to signal. This should always check the user fence, then wait |
| * via the ioctl. We have to do this because we are going to release the oldest fence and |
| * replace it with the latest fence in the ring. |
| */ |
| struct pipe_fence_handle **oldest_fence = |
| &queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE]; |
| |
| if (*oldest_fence) { |
| if (!amdgpu_fence_wait(*oldest_fence, 0, false)) { |
| /* Take the reference because the fence can be released by other threads after we |
| * unlock the mutex. |
| */ |
| struct pipe_fence_handle *tmp_fence = NULL; |
| amdgpu_fence_reference(&tmp_fence, *oldest_fence); |
| |
| /* Unlock the mutex before waiting. */ |
| simple_mtx_unlock(&aws->bo_fence_lock); |
| amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false); |
| amdgpu_fence_reference(&tmp_fence, NULL); |
| simple_mtx_lock(&aws->bo_fence_lock); |
| } |
| |
| /* Remove the idle fence from the ring. */ |
| amdgpu_fence_reference(oldest_fence, NULL); |
| } |
| } |
| |
| /* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest |
| * sequence number per queue and removes all older ones. |
| */ |
| struct amdgpu_seq_no_fences seq_no_dependencies; |
| memcpy(&seq_no_dependencies, &cs->seq_no_dependencies, sizeof(seq_no_dependencies)); |
| |
| if (queue_type != KERNELQ_ALT_FENCE) { |
| /* Add a fence dependency on the previous IB if the IP has multiple physical queues to |
| * make it appear as if it had only 1 queue, or if the previous IB comes from a different |
| * context. The reasons are: |
| * - Our BO fence tracking only supports 1 queue per IP. |
| * - IBs from different contexts must wait for each other and can't execute in a random order. |
| */ |
| struct amdgpu_fence *prev_fence = |
| (struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE]; |
| |
| /* Add a dependency on a previous fence, unless we can determine that |
| * it's useless because the execution order is guaranteed. |
| */ |
| if (prev_fence) { |
| bool same_ctx = queue->last_ctx == acs->ctx; |
| /* userqueue submission mode uses a single queue per process. */ |
| bool same_queue = aws->info.ip[acs->ip_type].num_queues > 1 && |
| queue_type != USERQ; |
| if (!same_ctx || !same_queue) |
| add_seq_no_to_list(aws, &seq_no_dependencies, queue_index, prev_seq_no); |
| } |
| } |
| |
| /* Since the kernel driver doesn't synchronize execution between different |
| * rings automatically, we have to add fence dependencies manually. This gathers sequence |
| * numbers from BOs and sets the next sequence number in the BOs. |
| */ |
| |
| /* Slab entry BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */ |
| struct amdgpu_cs_buffer *slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers; |
| unsigned num_slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers; |
| unsigned initial_num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers; |
| unsigned queue_index_bit = (queue_type == KERNELQ_ALT_FENCE) ? |
| 0 : BITFIELD_BIT(queue_index); |
| |
| for (unsigned i = 0; i < num_slab_entry_buffers; i++) { |
| struct amdgpu_cs_buffer *buffer = &slab_entry_buffers[i]; |
| struct amdgpu_winsys_bo *bo = buffer->bo; |
| |
| amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo, |
| buffer->usage); |
| if (queue_type == KERNELQ_ALT_FENCE) |
| amdgpu_fence_reference(&bo->alt_fence, cs->fence); |
| else |
| amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no); |
| |
| /* We didn't add any slab entries into the real buffer list that will be submitted |
| * to the kernel. Do it now. |
| */ |
| struct amdgpu_cs_buffer *real_buffer = |
| amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(buffer->bo)->b, |
| &cs->buffer_lists[AMDGPU_BO_REAL], false); |
| |
| /* We need to set the usage because it determines the BO priority. */ |
| real_buffer->usage |= buffer->usage; |
| } |
| |
| /* Sparse BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */ |
| unsigned num_real_buffers_except_sparse = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers; |
| struct amdgpu_cs_buffer *sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].buffers; |
| unsigned num_sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].num_buffers; |
| bool out_of_memory = false; |
| |
| for (unsigned i = 0; i < num_sparse_buffers; i++) { |
| struct amdgpu_cs_buffer *buffer = &sparse_buffers[i]; |
| struct amdgpu_winsys_bo *bo = buffer->bo; |
| |
| amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo, |
| buffer->usage); |
| if (queue_type == KERNELQ_ALT_FENCE) |
| amdgpu_fence_reference(&bo->alt_fence, cs->fence); |
| else |
| amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no); |
| |
| /* Add backing buffers of sparse buffers to the buffer list. |
| * |
| * This is done late, during submission, to keep the buffer list short before |
| * submit, and to avoid managing fences for the backing buffers. |
| */ |
| struct amdgpu_bo_sparse *sparse_bo = get_sparse_bo(buffer->bo); |
| |
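| /* For user queues, track the highest VM timeline point of the sparse BOs; it is passed |
| * to amdgpu_cs_submit_ib_userq so the submission waits for their pending VM updates. |
| */ |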
| if (queue_type == USERQ) { |
| uint64_t bo_vm_point = p_atomic_read(&sparse_bo->vm_timeline_point); |
| vm_timeline_point = MAX2(vm_timeline_point, bo_vm_point); |
| } |
| |
| simple_mtx_lock(&sparse_bo->commit_lock); |
| list_for_each_entry(struct amdgpu_sparse_backing, backing, &sparse_bo->backing, list) { |
| /* We can directly add the buffer here, because we know that each |
| * backing buffer occurs only once. |
| */ |
| struct amdgpu_cs_buffer *real_buffer = |
| amdgpu_do_add_buffer(cs, &backing->bo->b, &cs->buffer_lists[AMDGPU_BO_REAL], true); |
| if (!real_buffer) { |
| fprintf(stderr, "%s: failed to add sparse backing buffer\n", __func__); |
| r = -ENOMEM; |
| out_of_memory = true; |
| /* Don't dereference the NULL buffer; the commit_lock is released after the loop. */ |
| break; |
| } |
| |
| real_buffer->usage = buffer->usage; |
| } |
| simple_mtx_unlock(&sparse_bo->commit_lock); |
| } |
| |
| /* Real BOs: Add fence dependencies, update seq_no in BOs except sparse backing BOs. */ |
| unsigned num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers; |
| struct amdgpu_cs_buffer *real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].buffers; |
| struct drm_amdgpu_bo_list_entry *bo_list; |
| /* BO dependency management depends on the queue mode: |
| * - kernel queue: BO used by the submit are passed to the kernel in a |
| * drm_amdgpu_bo_list_entry list. The inter-process synchronization is handled |
| * automatically by the kernel; intra-process sync is handled by Mesa. |
| * - user queue: intra-process sync is similar. Inter-process sync is handled |
| * using timeline points, amdgpu_userq_wait (before a submit) and |
| * amdgpu_userq_signal (after a submit). |
| */ |
| unsigned num_shared_buf_write; |
| unsigned num_shared_buf_read; |
| /* Store write handles at the beginning and read handles at the end of shared_buf_kms_handles. |
| * If the usage is both read and write, the handle goes into the write list. |
| */ |
| uint32_t *shared_buf_kms_handles; |
| if (queue_type != USERQ) { |
| bo_list = (struct drm_amdgpu_bo_list_entry *) |
| alloca(num_real_buffers * sizeof(struct drm_amdgpu_bo_list_entry)); |
| } else { |
| num_shared_buf_write = 0; |
| num_shared_buf_read = 0; |
| shared_buf_kms_handles = (uint32_t*)alloca(num_real_buffers * sizeof(uint32_t)); |
| } |
| unsigned i; |
| |
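| /* The real buffer list now consists of three consecutive ranges: |
| * [0, initial_num_real_buffers): real buffers added during command recording, |
| * [initial_num_real_buffers, num_real_buffers_except_sparse): slab backing BOs, |
| * [num_real_buffers_except_sparse, num_real_buffers): sparse backing BOs. |
| */ |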
| for (i = 0; i < initial_num_real_buffers; i++) { |
| struct amdgpu_cs_buffer *buffer = &real_buffers[i]; |
| struct amdgpu_winsys_bo *bo = buffer->bo; |
| |
| amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo, |
| buffer->usage); |
| if (queue_type == KERNELQ_ALT_FENCE) |
| amdgpu_fence_reference(&bo->alt_fence, cs->fence); |
| else |
| amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no); |
| |
| if (queue_type != USERQ) { |
| amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage); |
| } else { |
| vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point); |
| |
| if (!get_real_bo(bo)->is_shared) |
| continue; |
| |
| if (buffer->usage & RADEON_USAGE_WRITE) { |
| shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle; |
| num_shared_buf_write++; |
| } else { |
| num_shared_buf_read++; |
| shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] = |
| get_real_bo(bo)->kms_handle; |
| } |
| } |
| } |
| |
| /* These are backing buffers of slab entries. Don't add their fence dependencies. */ |
| for (; i < num_real_buffers_except_sparse; i++) { |
| struct amdgpu_cs_buffer *buffer = &real_buffers[i]; |
| struct amdgpu_winsys_bo *bo = buffer->bo; |
| |
| if (queue_type == KERNELQ_ALT_FENCE) |
| get_real_bo_reusable_slab(bo)->b.b.slab_has_busy_alt_fences = true; |
| else |
| amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no); |
| |
| if (queue_type != USERQ) { |
| amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage); |
| } else { |
| vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point); |
| |
| if (!get_real_bo(bo)->is_shared) |
| continue; |
| |
| if (buffer->usage & RADEON_USAGE_WRITE) { |
| shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle; |
| num_shared_buf_write++; |
| } else { |
| num_shared_buf_read++; |
| shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] = |
| get_real_bo(bo)->kms_handle; |
| } |
| } |
| } |
| |
| /* Sparse backing BOs are last. Don't update their fences because we don't use them. */ |
| for (; i < num_real_buffers; ++i) { |
| struct amdgpu_cs_buffer *buffer = &real_buffers[i]; |
| |
| if (queue_type != USERQ) { |
| amdgpu_add_to_kernel_bo_list(&bo_list[i], buffer->bo, buffer->usage); |
| } else { |
| if (!get_real_bo(buffer->bo)->is_shared) |
| continue; |
| if (buffer->usage & RADEON_USAGE_WRITE) { |
| shared_buf_kms_handles[num_shared_buf_write] = |
| get_real_bo(buffer->bo)->kms_handle; |
| num_shared_buf_write++; |
| } else { |
| num_shared_buf_read++; |
| shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] = |
| get_real_bo(buffer->bo)->kms_handle; |
| } |
| } |
| } |
| |
| #if 0 /* Debug code. */ |
| printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no); |
| |
| /* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */ |
| for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) { |
| if (i == acs->queue_index) |
| continue; |
| |
| struct pipe_fence_handle *fence = aws->queues[i].fences[aws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE]; |
| if (!fence) { |
| if (i <= 1) |
| printf(" queue %u doesn't have any fence at seq_no %u\n", i, aws->queues[i].latest_seq_no); |
| continue; |
| } |
| |
| bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i); |
| uint_seq_no old = seq_no_dependencies.seq_no[i]; |
| add_seq_no_to_list(aws, &seq_no_dependencies, i, aws->queues[i].latest_seq_no); |
| uint_seq_no new_seq_no = seq_no_dependencies.seq_no[i]; |
| |
| if (!valid) |
| printf(" missing dependency on queue=%u, seq_no=%u\n", i, new_seq_no); |
| else if (old != new_seq_no) |
| printf(" too old dependency on queue=%u, old=%u, new=%u\n", i, old, new_seq_no); |
| else |
| printf(" has dependency on queue=%u, seq_no=%u\n", i, old); |
| } |
| #endif |
| |
| /* Convert the sequence numbers we gathered to fence dependencies. */ |
| u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) { |
| struct pipe_fence_handle **fence = get_fence_from_ring(aws, &seq_no_dependencies, i); |
| |
| if (fence) { |
| /* If it's idle, don't add it to the list of dependencies. */ |
| if (amdgpu_fence_wait(*fence, 0, false)) |
| amdgpu_fence_reference(fence, NULL); |
| else |
| add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)*fence); |
| } |
| } |
| |
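| /* In KERNELQ_ALT_FENCE mode the fence was already stored in each BO's alt_fence above, |
| * so it isn't added to the queue's fence ring and the queue sequence number isn't advanced. |
| */ |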
| if (queue_type != KERNELQ_ALT_FENCE) { |
| /* Finally, add the IB fence into the fence ring of the queue. */ |
| amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence); |
| queue->latest_seq_no = next_seq_no; |
| ((struct amdgpu_fence*)cs->fence)->queue_seq_no = next_seq_no; |
| |
| /* Update the last used context in the queue. */ |
| amdgpu_ctx_reference(&queue->last_ctx, acs->ctx); |
| } |
| simple_mtx_unlock(&aws->bo_fence_lock); |
| |
| #if MESA_DEBUG |
| /* Prepare the buffer list. */ |
| if (aws->debug_all_bos) { |
| /* The buffer list contains all buffers. This is a slow path that |
| * ensures that no buffer is missing in the BO list. |
| */ |
| simple_mtx_lock(&aws->global_bo_list_lock); |
| if (queue_type != USERQ) { |
| bo_list = (struct drm_amdgpu_bo_list_entry *) |
| alloca(aws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry)); |
| num_real_buffers = 0; |
| list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) { |
| bo_list[num_real_buffers].bo_handle = bo->kms_handle; |
| bo_list[num_real_buffers].bo_priority = 0; |
| ++num_real_buffers; |
| } |
| } else { |
| shared_buf_kms_handles = (uint32_t*)alloca(aws->num_buffers * sizeof(uint32_t)); |
| num_shared_buf_write = 0; |
| num_shared_buf_read = 0; |
| list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) { |
| shared_buf_kms_handles[num_shared_buf_write] = bo->kms_handle; |
| num_shared_buf_write++; |
| } |
| } |
| simple_mtx_unlock(&aws->global_bo_list_lock); |
| } |
| #endif |
| |
| if (acs->ip_type == AMD_IP_GFX) |
| aws->gfx_bo_list_counter += num_real_buffers; |
| |
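| /* Skip the submission on out-of-memory, a lost context, or a non-GFX no-op CS. A GFX |
| * no-op CS was already reduced to an empty NOP IB in amdgpu_cs_flush and still goes |
| * through the normal submission path. |
| */ |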
| if (out_of_memory) { |
| r = -ENOMEM; |
| } else if (unlikely(acs->ctx->sw_status != PIPE_NO_RESET)) { |
| r = -ECANCELED; |
| } else if (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX) { |
| r = 0; |
| } else { |
| if (queue_type != USERQ) { |
| /* Submit the command buffer. |
| * |
| * The kernel quite often returns -ENOMEM when many processes use GDS in parallel (e.g. |
| * test suites), but the submission eventually succeeds after enough retries. This happens |
| * frequently with dEQP using NGG streamout. |
| */ |
| r = 0; |
| |
| do { |
| /* Wait 1 ms and try again. */ |
| if (r == -ENOMEM) |
| os_time_sleep(1000); |
| |
| r = amdgpu_cs_submit_ib_kernelq(acs, num_real_buffers, bo_list, &seq_no); |
| } while (r == -ENOMEM); |
| |
| if (!r) { |
| /* Success. */ |
| uint64_t *user_fence = NULL; |
| |
| /* Need to reserve 4 QWORD for user fence: |
| * QWORD[0]: completed fence |
| * QWORD[1]: preempted fence |
| * QWORD[2]: reset fence |
| * QWORD[3]: preempted then reset |
| */ |
| if (has_user_fence) |
| user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4; |
| amdgpu_fence_submitted(cs->fence, seq_no, user_fence); |
| } |
| } else { |
| struct amdgpu_userq *userq = &queue->userq; |
| r = amdgpu_cs_submit_ib_userq(userq, acs, shared_buf_kms_handles, num_shared_buf_write, |
| &shared_buf_kms_handles[num_real_buffers - num_shared_buf_read], |
| num_shared_buf_read, &seq_no, vm_timeline_point); |
| if (!r) { |
| /* Success. */ |
| amdgpu_fence_submitted(cs->fence, seq_no, userq->user_fence_ptr); |
| } |
| } |
| } |
| |
| if (unlikely(r)) { |
| if (r == -ECANCELED) { |
| amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_INNOCENT_CONTEXT_RESET, |
| "amdgpu: The CS has cancelled because the context is lost. This context is innocent.\n"); |
| } else if (r == -ENODATA) { |
| amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET, |
| "amdgpu: The CS has cancelled because the context is lost. This context is guilty of a soft recovery.\n"); |
| } else if (r == -ETIME) { |
| amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET, |
| "amdgpu: The CS has cancelled because the context is lost. This context is guilty of a hard recovery.\n"); |
| } else { |
| amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, |
| PIPE_UNKNOWN_CONTEXT_RESET, |
| "amdgpu: The CS has been rejected, " |
| "see dmesg for more information (%i).\n", |
| r); |
| } |
| } |
| |
| /* If there was an error or the CS was a skipped no-op, signal the fence because it |
| * won't be signalled by the hardware. */ |
| if (r || (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX)) |
| amdgpu_fence_signalled(cs->fence); |
| |
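| /* Clear the register shadowing flags after a successful submission so the shadow init |
| * request is only sent with the first submit. |
| */ |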
| if (unlikely(aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.flags && r == 0)) |
| acs->mcbp_fw_shadow_chunk.flags = 0; |
| |
| cs->error_code = r; |
| |
| /* Clear the buffer lists. */ |
| for (unsigned list = 0; list < ARRAY_SIZE(cs->buffer_lists); list++) { |
| struct amdgpu_cs_buffer *buffers = cs->buffer_lists[list].buffers; |
| unsigned num_buffers = cs->buffer_lists[list].num_buffers; |
| |
| if (list == AMDGPU_BO_REAL) { |
| /* Only decrement num_active_ioctls and unref where we incremented them. |
| * We did both for regular real BOs. We only incremented the refcount for sparse |
| * backing BOs. |
| */ |
| /* Regular real BOs. */ |
| for (unsigned i = 0; i < initial_num_real_buffers; i++) { |
| p_atomic_dec(&buffers[i].bo->num_active_ioctls); |
| amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo); |
| } |
| |
| /* Do nothing for slab BOs. */ |
| |
| /* Sparse backing BOs. */ |
| for (unsigned i = num_real_buffers_except_sparse; i < num_buffers; i++) |
| amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo); |
| } else { |
| for (unsigned i = 0; i < num_buffers; i++) { |
| p_atomic_dec(&buffers[i].bo->num_active_ioctls); |
| amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo); |
| } |
| } |
| |
| cs->buffer_lists[list].num_buffers = 0; |
| } |
| |
| amdgpu_cs_context_cleanup(aws, cs); |
| } |
| |
| /* Make sure the previous submission is completed. */ |
| void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs) |
| { |
| struct amdgpu_cs *cs = amdgpu_cs(rcs); |
| |
| /* Wait for any pending ioctl of this CS to complete. */ |
| util_queue_fence_wait(&cs->flush_completed); |
| } |
| |
| static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs, |
| unsigned flags, |
| struct pipe_fence_handle **fence) |
| { |
| struct amdgpu_cs *cs = amdgpu_cs(rcs); |
| struct amdgpu_winsys *aws = cs->aws; |
| int error_code = 0; |
| uint32_t ib_pad_dw_mask = aws->info.ip[cs->ip_type].ib_pad_dw_mask; |
| |
| rcs->current.max_dw += amdgpu_cs_epilog_dws(cs); |
| |
| /* Pad the IB according to the mask: ib_pad_dw_mask acts as an alignment mask, so |
| * IP-specific NOP packets are emitted until the IB size in dwords is a multiple of |
| * (ib_pad_dw_mask + 1). |
| */ |
| switch (cs->ip_type) { |
| case AMD_IP_SDMA: |
| if (aws->info.gfx_level <= GFX6) { |
| while (rcs->current.cdw & ib_pad_dw_mask) |
| radeon_emit(rcs, 0xf0000000); /* NOP packet */ |
| } else { |
| while (rcs->current.cdw & ib_pad_dw_mask) |
| radeon_emit(rcs, SDMA_NOP_PAD); |
| } |
| break; |
| case AMD_IP_GFX: |
| case AMD_IP_COMPUTE: |
| amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 0); |
| if (cs->ip_type == AMD_IP_GFX) |
| aws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4; |
| break; |
| case AMD_IP_UVD: |
| case AMD_IP_UVD_ENC: |
| while (rcs->current.cdw & ib_pad_dw_mask) |
| radeon_emit(rcs, 0x80000000); /* type2 nop packet */ |
| break; |
| case AMD_IP_VCN_JPEG: |
| assert(rcs->current.cdw % 2 == 0); |
| while (rcs->current.cdw & ib_pad_dw_mask) { |
| radeon_emit(rcs, 0x60000000); /* nop packet */ |
| radeon_emit(rcs, 0x00000000); |
| } |
| break; |
| case AMD_IP_VCN_DEC: |
| while (rcs->current.cdw & ib_pad_dw_mask) |
| radeon_emit(rcs, 0x81ff); /* nop packet */ |
| break; |
| default: |
| break; |
| } |
| |
| if (rcs->current.cdw > rcs->current.max_dw) { |
| fprintf(stderr, "amdgpu: command stream overflowed\n"); |
| } |
| |
| /* If the CS is not empty, hasn't overflowed, and isn't a no-op flush... */ |
| if (likely(radeon_emitted(rcs, 0) && |
| rcs->current.cdw <= rcs->current.max_dw && |
| !(flags & RADEON_FLUSH_NOOP))) { |
| struct amdgpu_cs_context *cur = cs->csc; |
| |
| /* Set IB sizes. */ |
| amdgpu_ib_finalize(aws, rcs, &cs->main_ib, cs->ip_type); |
| |
| /* Create a fence. */ |
| amdgpu_fence_reference(&cur->fence, NULL); |
| if (cs->next_fence) { |
| /* just move the reference */ |
| cur->fence = cs->next_fence; |
| cs->next_fence = NULL; |
| } else { |
| cur->fence = amdgpu_fence_create(cs); |
| } |
| if (fence) |
| amdgpu_fence_reference(fence, cur->fence); |
| |
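| /* Mark every buffer in the CS as having a pending ioctl; the counters are decremented |
| * again when the submit job has finished processing the buffer lists. |
| */ |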
| for (unsigned i = 0; i < ARRAY_SIZE(cur->buffer_lists); i++) { |
| unsigned num_buffers = cur->buffer_lists[i].num_buffers; |
| struct amdgpu_cs_buffer *buffers = cur->buffer_lists[i].buffers; |
| |
| for (unsigned j = 0; j < num_buffers; j++) |
| p_atomic_inc(&buffers[j].bo->num_active_ioctls); |
| } |
| |
| amdgpu_cs_sync_flush(rcs); |
| |
| cur->chunk_ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */ |
| if (cs->noop && cs->ip_type == AMD_IP_GFX) { |
| /* Reduce the IB size and fill it with NOP to make it like an empty IB. */ |
| unsigned noop_dw_size = aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1; |
| assert(cur->chunk_ib[IB_MAIN].ib_bytes / 4 >= noop_dw_size); |
| |
| cur->ib_main_addr[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0); |
| cur->chunk_ib[IB_MAIN].ib_bytes = noop_dw_size * 4; |
| } |
| |
| /* Swap command streams. "cst" is going to be submitted. */ |
| rcs->csc = cs->csc = cs->cst; |
| cs->cst = cur; |
| |
| /* Only GFX, compute and SDMA are supported with user queues; these are the first three |
| * values of the IP enum, hence the "<= AMD_IP_SDMA" check. |
| */ |
| if (aws->info.use_userq && cs->ip_type <= AMD_IP_SDMA) { |
| util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed, |
| amdgpu_cs_submit_ib<USERQ>, NULL, 0); |
| } else { |
| util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed, |
| cs->uses_alt_fence ? |
| amdgpu_cs_submit_ib<KERNELQ_ALT_FENCE> |
| : amdgpu_cs_submit_ib<KERNELQ>, |
| NULL, 0); |
| } |
| |
| if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION) |
| cs->csc->secure = !cs->cst->secure; |
| else |
| cs->csc->secure = cs->cst->secure; |
| |
| if (!(flags & PIPE_FLUSH_ASYNC)) { |
| amdgpu_cs_sync_flush(rcs); |
| error_code = cur->error_code; |
| } |
| } else { |
| if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION) |
| cs->csc->secure = !cs->csc->secure; |
| |
| amdgpu_cs_context_cleanup_buffers(aws, cs->csc); |
| amdgpu_cs_context_cleanup(aws, cs->csc); |
| } |
| |
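| /* Reset the buffer lookup hashlist for the new recording context. */ |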
| memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist)); |
| |
| amdgpu_get_new_ib(aws, rcs, &cs->main_ib, cs); |
| |
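| /* Re-add the preamble IB buffer so it is referenced by every subsequent submission. */ |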
| if (cs->preamble_ib_bo) { |
| amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, |
| RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0); |
| } |
| |
| if (cs->ip_type == AMD_IP_GFX) |
| aws->num_gfx_IBs++; |
| else if (cs->ip_type == AMD_IP_SDMA) |
| aws->num_sdma_IBs++; |
| |
| return error_code; |
| } |
| |
| static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs, |
| struct pb_buffer_lean *_buf, |
| unsigned usage) |
| { |
| struct amdgpu_cs *cs = amdgpu_cs(rcs); |
| struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf; |
| |
| return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage); |
| } |
| |
| static void amdgpu_cs_set_mcbp_reg_shadowing_va(struct radeon_cmdbuf *rcs, uint64_t regs_va, |
| uint64_t csa_va) |
| { |
| struct amdgpu_cs *cs = amdgpu_cs(rcs); |
| cs->mcbp_fw_shadow_chunk.shadow_va = regs_va; |
| cs->mcbp_fw_shadow_chunk.csa_va = csa_va; |
| cs->mcbp_fw_shadow_chunk.gds_va = 0; |
| cs->mcbp_fw_shadow_chunk.flags = AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW; |
| } |
| |
| static void amdgpu_winsys_fence_reference(struct radeon_winsys *rws, |
| struct pipe_fence_handle **dst, |
| struct pipe_fence_handle *src) |
| { |
| amdgpu_fence_reference(dst, src); |
| } |
| |
| void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *sws) |
| { |
| sws->base.ctx_create = amdgpu_ctx_create; |
| sws->base.ctx_destroy = amdgpu_ctx_destroy; |
| sws->base.ctx_set_sw_reset_status = amdgpu_ctx_set_sw_reset_status; |
| sws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status; |
| sws->base.cs_create = amdgpu_cs_create; |
| sws->base.cs_setup_preemption = amdgpu_cs_setup_preemption; |
| sws->base.cs_destroy = amdgpu_cs_destroy; |
| sws->base.cs_add_buffer = amdgpu_cs_add_buffer; |
| sws->base.cs_validate = amdgpu_cs_validate; |
| sws->base.cs_check_space = amdgpu_cs_check_space; |
| sws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list; |
| sws->base.cs_flush = amdgpu_cs_flush; |
| sws->base.cs_get_next_fence = amdgpu_cs_get_next_fence; |
| sws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced; |
| sws->base.cs_sync_flush = amdgpu_cs_sync_flush; |
| sws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency; |
| sws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal; |
| sws->base.cs_get_ip_type = amdgpu_cs_get_ip_type; |
| sws->base.fence_wait = amdgpu_fence_wait_rel_timeout; |
| sws->base.fence_reference = amdgpu_winsys_fence_reference; |
| sws->base.fence_import_syncobj = amdgpu_fence_import_syncobj; |
| sws->base.fence_import_sync_file = amdgpu_fence_import_sync_file; |
| sws->base.fence_export_sync_file = amdgpu_fence_export_sync_file; |
| sws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file; |
| |
| if (sws->aws->info.has_fw_based_shadowing) |
| sws->base.cs_set_mcbp_reg_shadowing_va = amdgpu_cs_set_mcbp_reg_shadowing_va; |
| } |