| // |
| // Copyright 2021 The ANGLE Project Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| // |
| // CLCommandQueueVk.cpp: Implements the class methods for CLCommandQueueVk. |
| |
| #include "common/PackedCLEnums_autogen.h" |
| #include "common/system_utils.h" |
| |
| #include "libANGLE/renderer/vulkan/CLCommandQueueVk.h" |
| #include "libANGLE/renderer/vulkan/CLContextVk.h" |
| #include "libANGLE/renderer/vulkan/CLDeviceVk.h" |
| #include "libANGLE/renderer/vulkan/CLEventVk.h" |
| #include "libANGLE/renderer/vulkan/CLKernelVk.h" |
| #include "libANGLE/renderer/vulkan/CLMemoryVk.h" |
| #include "libANGLE/renderer/vulkan/CLProgramVk.h" |
| #include "libANGLE/renderer/vulkan/CLSamplerVk.h" |
| #include "libANGLE/renderer/vulkan/cl_types.h" |
| #include "libANGLE/renderer/vulkan/clspv_utils.h" |
| #include "libANGLE/renderer/vulkan/vk_cache_utils.h" |
| #include "libANGLE/renderer/vulkan/vk_cl_utils.h" |
| #include "libANGLE/renderer/vulkan/vk_helpers.h" |
| #include "libANGLE/renderer/vulkan/vk_renderer.h" |
| #include "libANGLE/renderer/vulkan/vk_wrapper.h" |
| |
| #include "libANGLE/renderer/serial_utils.h" |
| |
| #include "libANGLE/CLBuffer.h" |
| #include "libANGLE/CLCommandQueue.h" |
| #include "libANGLE/CLContext.h" |
| #include "libANGLE/CLEvent.h" |
| #include "libANGLE/CLImage.h" |
| #include "libANGLE/CLKernel.h" |
| #include "libANGLE/CLSampler.h" |
| #include "libANGLE/Error.h" |
| #include "libANGLE/cl_types.h" |
| #include "libANGLE/cl_utils.h" |
| |
| #include "spirv/unified1/NonSemanticClspvReflection.h" |
| #include "vulkan/vulkan_core.h" |
| |
| #include <chrono> |
| |
| namespace rx |
| { |
| |
| namespace |
| { |
| static constexpr size_t kTimeoutInMS = 10000; |
| static constexpr size_t kSleepInMS = 500; |
| static constexpr size_t kTimeoutCheckIterations = kTimeoutInMS / kSleepInMS; |
| |
| angle::Result SetEventsWithQueueSerialToState(const cl::EventPtrs &eventList, |
| const QueueSerial &queueSerial, |
| cl::ExecutionStatus state) |
| { |
| |
| ASSERT(state < cl::ExecutionStatus::EnumCount); |
| |
    for (const cl::EventPtr &event : eventList)
| { |
| CLEventVk *eventVk = &event->getImpl<CLEventVk>(); |
| if (!eventVk->isUserEvent() && eventVk->usedByCommandBuffer(queueSerial)) |
| { |
| ANGLE_TRY(eventVk->setStatusAndExecuteCallback(cl::ToCLenum(state))); |
| } |
| } |
| return angle::Result::Continue; |
| } |
| |
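// DispatchWorkThread runs a background worker thread that waits on submitted queue serials
// and finishes them, so event statuses and callbacks can progress without the application
// having to call clFinish.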
| DispatchWorkThread::DispatchWorkThread(CLCommandQueueVk *commandQueue) |
| : mCommandQueue(commandQueue), |
| mIsTerminating(false), |
| mQueueSerials(kFixedQueueLimit), |
| mQueueSerialIndex(kInvalidQueueSerialIndex) |
| {} |
| |
| DispatchWorkThread::~DispatchWorkThread() |
| { |
| ASSERT(mIsTerminating); |
| } |
| |
| angle::Result DispatchWorkThread::init() |
| { |
| mQueueSerialIndex = mCommandQueue->getQueueSerialIndex(); |
| ASSERT(mQueueSerialIndex != kInvalidQueueSerialIndex); |
| |
| mWorkerThread = std::thread(&DispatchWorkThread::finishLoop, this); |
| |
| return angle::Result::Continue; |
| } |
| |
| void DispatchWorkThread::terminate() |
| { |
| // Terminate the background thread |
| { |
| std::unique_lock<std::mutex> lock(mThreadMutex); |
| mIsTerminating = true; |
| } |
| mHasWorkSubmitted.notify_all(); |
| if (mWorkerThread.joinable()) |
| { |
| mWorkerThread.join(); |
| } |
| } |
| |
| angle::Result DispatchWorkThread::notify(QueueSerial queueSerial) |
| { |
| ASSERT(queueSerial.getIndex() == mQueueSerialIndex); |
| |
    // QueueSerials are always received in order; each is either the same as or greater than
    // the last one.
| std::unique_lock<std::mutex> ul(mThreadMutex); |
| if (!mQueueSerials.empty()) |
| { |
| QueueSerial &lastSerial = mQueueSerials.back(); |
| ASSERT(queueSerial >= lastSerial); |
| if (queueSerial == lastSerial) |
| { |
| return angle::Result::Continue; |
| } |
| } |
| |
    // If the queue stays full, the device may be lost; wait with a bounded timeout.
| size_t numIterations = 0; |
| while (mQueueSerials.full() && numIterations < kTimeoutCheckIterations) |
| { |
| mHasEmptySlot.wait_for(ul, std::chrono::milliseconds(kSleepInMS), |
| [this]() { return !mQueueSerials.full(); }); |
| numIterations++; |
| } |
    if (mQueueSerials.full())
    {
        ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
    }
| |
| mQueueSerials.push(queueSerial); |
| mHasWorkSubmitted.notify_one(); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result DispatchWorkThread::finishLoop() |
| { |
| angle::SetCurrentThreadName("ANGLE-CL-CQD"); |
| |
| while (true) |
| { |
| std::unique_lock<std::mutex> ul(mThreadMutex); |
| mHasWorkSubmitted.wait(ul, [this]() { return !mQueueSerials.empty() || mIsTerminating; }); |
| |
| while (!mQueueSerials.empty()) |
| { |
| QueueSerial queueSerial = mQueueSerials.front(); |
| mQueueSerials.pop(); |
| mHasEmptySlot.notify_one(); |
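            // Drop the lock while finishing this serial so notify() can keep queueing new
            // serials without blocking.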
| ul.unlock(); |
| // finish the work associated with the queue serial |
| ANGLE_TRY(mCommandQueue->finishQueueSerial(queueSerial)); |
| ul.lock(); |
| } |
| |
| if (mIsTerminating) |
| { |
| break; |
| } |
| } |
| return angle::Result::Continue; |
| } |
| |
| } // namespace |
| |
| CLCommandQueueVk::CLCommandQueueVk(const cl::CommandQueue &commandQueue) |
| : CLCommandQueueImpl(commandQueue), |
| mContext(&commandQueue.getContext().getImpl<CLContextVk>()), |
| mDevice(&commandQueue.getDevice().getImpl<CLDeviceVk>()), |
| mPrintfBuffer(nullptr), |
| mComputePassCommands(nullptr), |
| mQueueSerialIndex(kInvalidQueueSerialIndex), |
| mNeedPrintfHandling(false), |
| mPrintfInfos(nullptr), |
| mFinishHandler(this) |
| {} |
| |
| angle::Result CLCommandQueueVk::init() |
| { |
| vk::Renderer *renderer = mContext->getRenderer(); |
| ASSERT(renderer); |
| |
| ANGLE_CL_IMPL_TRY_ERROR(vk::OutsideRenderPassCommandBuffer::InitializeCommandPool( |
| mContext, &mCommandPool.outsideRenderPassPool, |
| renderer->getQueueFamilyIndex(), getProtectionType()), |
| CL_OUT_OF_RESOURCES); |
| |
| ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getOutsideRenderPassCommandBufferHelper( |
| mContext, &mCommandPool.outsideRenderPassPool, |
| &mOutsideRenderPassCommandsAllocator, &mComputePassCommands), |
| CL_OUT_OF_RESOURCES); |
| |
| // Generate initial QueueSerial for command buffer helper |
| ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->allocateQueueSerialIndex(&mQueueSerialIndex), |
| CL_OUT_OF_RESOURCES); |
| // and set an initial queue serial for the compute pass commands |
| mComputePassCommands->setQueueSerial( |
| mQueueSerialIndex, mContext->getRenderer()->generateQueueSerial(mQueueSerialIndex)); |
| |
| // Initialize serials to be valid but appear submitted and finished. |
| mLastFlushedQueueSerial = QueueSerial(mQueueSerialIndex, Serial()); |
| mLastSubmittedQueueSerial = mLastFlushedQueueSerial; |
| |
| ANGLE_TRY(mFinishHandler.init()); |
| |
| return angle::Result::Continue; |
| } |
| |
| CLCommandQueueVk::~CLCommandQueueVk() |
| { |
| mFinishHandler.terminate(); |
| |
| ASSERT(mComputePassCommands->empty()); |
| ASSERT(!mNeedPrintfHandling); |
| |
| if (mPrintfBuffer) |
| { |
        // The printf buffer's lifetime is scoped to the command queue; release and destroy it.
| const bool wasLastUser = mPrintfBuffer->release(); |
| ASSERT(wasLastUser); |
| delete mPrintfBuffer; |
| } |
| |
| VkDevice vkDevice = mContext->getDevice(); |
| |
| if (mQueueSerialIndex != kInvalidQueueSerialIndex) |
| { |
| mContext->getRenderer()->releaseQueueSerialIndex(mQueueSerialIndex); |
| mQueueSerialIndex = kInvalidQueueSerialIndex; |
| } |
| |
| // Recycle the current command buffers |
| mContext->getRenderer()->recycleOutsideRenderPassCommandBufferHelper(&mComputePassCommands); |
| mCommandPool.outsideRenderPassPool.destroy(vkDevice); |
| } |
| |
| angle::Result CLCommandQueueVk::setProperty(cl::CommandQueueProperties properties, cl_bool enable) |
| { |
| // NOTE: "clSetCommandQueueProperty" has been deprecated as of OpenCL 1.1 |
| // http://man.opencl.org/deprecated.html |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueReadBuffer(const cl::Buffer &buffer, |
| bool blocking, |
| size_t offset, |
| size_t size, |
| void *ptr, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>(); |
| |
| if (blocking) |
| { |
| ANGLE_TRY(finishInternal()); |
| ANGLE_TRY(bufferVk->copyTo(ptr, offset, size)); |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete)); |
| } |
| else |
| { |
| // Stage a transfer routine |
| HostTransferConfig transferConfig; |
| transferConfig.type = CL_COMMAND_READ_BUFFER; |
| transferConfig.offset = offset; |
| transferConfig.size = size; |
| transferConfig.dstHostPtr = ptr; |
| ANGLE_TRY(addToHostTransferList(bufferVk, transferConfig)); |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued)); |
| } |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueWriteBuffer(const cl::Buffer &buffer, |
| bool blocking, |
| size_t offset, |
| size_t size, |
| const void *ptr, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| auto bufferVk = &buffer.getImpl<CLBufferVk>(); |
| |
| if (blocking) |
| { |
| ANGLE_TRY(finishInternal()); |
| ANGLE_TRY(bufferVk->copyFrom(ptr, offset, size)); |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete)); |
| } |
| else |
| { |
| // Stage a transfer routine |
| HostTransferConfig config; |
| config.type = CL_COMMAND_WRITE_BUFFER; |
| config.offset = offset; |
| config.size = size; |
| config.srcHostPtr = ptr; |
| ANGLE_TRY(addToHostTransferList(bufferVk, config)); |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued)); |
| } |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueReadBufferRect(const cl::Buffer &buffer, |
| bool blocking, |
| const cl::MemOffsets &bufferOrigin, |
| const cl::MemOffsets &hostOrigin, |
| const cl::Coordinate ®ion, |
| size_t bufferRowPitch, |
| size_t bufferSlicePitch, |
| size_t hostRowPitch, |
| size_t hostSlicePitch, |
| void *ptr, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| auto bufferVk = &buffer.getImpl<CLBufferVk>(); |
| |
| cl::BufferRect bufferRect{cl::Offset{bufferOrigin.x, bufferOrigin.y, bufferOrigin.z}, |
| cl::Extents{region.x, region.y, region.z}, bufferRowPitch, |
| bufferSlicePitch, 1}; |
| |
| cl::BufferRect ptrRect{cl::Offset{hostOrigin.x, hostOrigin.y, hostOrigin.z}, |
| cl::Extents{region.x, region.y, region.z}, hostRowPitch, hostSlicePitch, |
| 1}; |
| |
| if (blocking) |
| { |
| ANGLE_TRY(finishInternal()); |
| ANGLE_TRY(bufferVk->getRect(bufferRect, ptrRect, ptr)); |
| } |
| else |
| { |
| // Stage a transfer routine |
| HostTransferConfig config; |
| config.type = CL_COMMAND_READ_BUFFER_RECT; |
| config.srcRect = bufferRect; |
| config.dstRect = ptrRect; |
| config.dstHostPtr = ptr; |
| config.size = bufferVk->getSize(); |
| ANGLE_TRY(addToHostTransferList(bufferVk, config)); |
| } |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, |
| blocking ? cl::ExecutionStatus::Complete : cl::ExecutionStatus::Queued)); |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueWriteBufferRect(const cl::Buffer &buffer, |
| bool blocking, |
| const cl::MemOffsets &bufferOrigin, |
| const cl::MemOffsets &hostOrigin, |
| const cl::Coordinate ®ion, |
| size_t bufferRowPitch, |
| size_t bufferSlicePitch, |
| size_t hostRowPitch, |
| size_t hostSlicePitch, |
| const void *ptr, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| auto bufferVk = &buffer.getImpl<CLBufferVk>(); |
| |
| cl::BufferRect bufferRect{cl::Offset{bufferOrigin.x, bufferOrigin.y, bufferOrigin.z}, |
| cl::Extents{region.x, region.y, region.z}, bufferRowPitch, |
| bufferSlicePitch, 1}; |
| |
| cl::BufferRect ptrRect{cl::Offset{hostOrigin.x, hostOrigin.y, hostOrigin.z}, |
| cl::Extents{region.x, region.y, region.z}, hostRowPitch, hostSlicePitch, |
| 1}; |
| |
| if (blocking) |
| { |
| ANGLE_TRY(finishInternal()); |
| ANGLE_TRY(bufferVk->setRect(ptr, ptrRect, bufferRect)); |
| } |
| else |
| { |
| // Stage a transfer routine |
| HostTransferConfig config; |
| config.type = CL_COMMAND_WRITE_BUFFER_RECT; |
| config.srcRect = ptrRect; |
| config.dstRect = bufferRect; |
| config.srcHostPtr = ptr; |
| config.size = bufferVk->getSize(); |
| ANGLE_TRY(addToHostTransferList(bufferVk, config)); |
| } |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, |
| blocking ? cl::ExecutionStatus::Complete : cl::ExecutionStatus::Queued)); |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueCopyBuffer(const cl::Buffer &srcBuffer, |
| const cl::Buffer &dstBuffer, |
| size_t srcOffset, |
| size_t dstOffset, |
| size_t size, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| CLBufferVk *srcBufferVk = &srcBuffer.getImpl<CLBufferVk>(); |
| CLBufferVk *dstBufferVk = &dstBuffer.getImpl<CLBufferVk>(); |
| |
| vk::CommandBufferAccess access; |
| if (srcBufferVk->isSubBuffer() && dstBufferVk->isSubBuffer() && |
| (srcBufferVk->getParent() == dstBufferVk->getParent())) |
| { |
        // This is a self-copy: both sub-buffers share the same parent buffer
| access.onBufferSelfCopy(&srcBufferVk->getBuffer()); |
| } |
| else |
| { |
| access.onBufferTransferRead(&srcBufferVk->getBuffer()); |
| access.onBufferTransferWrite(&dstBufferVk->getBuffer()); |
| } |
| |
| vk::OutsideRenderPassCommandBuffer *commandBuffer; |
| ANGLE_TRY(getCommandBuffer(access, &commandBuffer)); |
| |
| VkBufferCopy copyRegion = {srcOffset, dstOffset, size}; |
| // update the offset in the case of sub-buffers |
| if (srcBufferVk->getOffset()) |
| { |
| copyRegion.srcOffset += srcBufferVk->getOffset(); |
| } |
| if (dstBufferVk->getOffset()) |
| { |
| copyRegion.dstOffset += dstBufferVk->getOffset(); |
| } |
| commandBuffer->copyBuffer(srcBufferVk->getBuffer().getBuffer(), |
| dstBufferVk->getBuffer().getBuffer(), 1, ©Region); |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueCopyBufferRect(const cl::Buffer &srcBuffer, |
| const cl::Buffer &dstBuffer, |
| const cl::MemOffsets &srcOrigin, |
| const cl::MemOffsets &dstOrigin, |
| const cl::Coordinate ®ion, |
| size_t srcRowPitch, |
| size_t srcSlicePitch, |
| size_t dstRowPitch, |
| size_t dstSlicePitch, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| ANGLE_TRY(finishInternal()); |
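    // The rect copy is currently performed on the host: finish pending work, then map the
    // source buffer and write the region into the destination.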
| |
| cl::BufferRect srcRect{cl::Offset{srcOrigin.x, srcOrigin.y, srcOrigin.z}, |
| cl::Extents{region.x, region.y, region.z}, srcRowPitch, srcSlicePitch, |
| 1}; |
| |
| cl::BufferRect dstRect{cl::Offset{dstOrigin.x, dstOrigin.y, dstOrigin.z}, |
| cl::Extents{region.x, region.y, region.z}, dstRowPitch, dstSlicePitch, |
| 1}; |
| |
| auto srcBufferVk = &srcBuffer.getImpl<CLBufferVk>(); |
| auto dstBufferVk = &dstBuffer.getImpl<CLBufferVk>(); |
| uint8_t *mapPointer = nullptr; |
| ANGLE_TRY(srcBufferVk->map(mapPointer)); |
| ASSERT(mapPointer); |
| ANGLE_TRY(dstBufferVk->setRect(static_cast<const void *>(mapPointer), srcRect, dstRect)); |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete)); |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueFillBuffer(const cl::Buffer &buffer, |
| const void *pattern, |
| size_t patternSize, |
| size_t offset, |
| size_t size, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>(); |
| |
| // Stage a transfer routine |
| HostTransferConfig config; |
| config.type = CL_COMMAND_FILL_BUFFER; |
| config.patternSize = patternSize; |
| config.offset = offset; |
| config.size = size; |
| config.srcHostPtr = pattern; |
| ANGLE_TRY(addToHostTransferList(bufferVk, config)); |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueMapBuffer(const cl::Buffer &buffer, |
| bool blocking, |
| cl::MapFlags mapFlags, |
| size_t offset, |
| size_t size, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc, |
| void *&mapPtr) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| cl::ExecutionStatus eventComplete = cl::ExecutionStatus::Queued; |
| if (blocking || !eventCreateFunc) |
| { |
| ANGLE_TRY(finishInternal()); |
| eventComplete = cl::ExecutionStatus::Complete; |
| } |
| |
| CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>(); |
| uint8_t *mapPointer = nullptr; |
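    // With CL_MEM_USE_HOST_PTR, the user-provided host pointer backs the mapping; sync the
    // device contents into it before returning the pointer.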
| if (buffer.getFlags().intersects(CL_MEM_USE_HOST_PTR)) |
| { |
| ANGLE_TRY(finishInternal()); |
| mapPointer = static_cast<uint8_t *>(buffer.getHostPtr()) + offset; |
| ANGLE_TRY(bufferVk->copyTo(mapPointer, offset, size)); |
| eventComplete = cl::ExecutionStatus::Complete; |
| } |
| else |
| { |
| ANGLE_TRY(bufferVk->map(mapPointer, offset)); |
| } |
| mapPtr = static_cast<void *>(mapPointer); |
| |
| if (bufferVk->isCurrentlyInUse()) |
| { |
| eventComplete = cl::ExecutionStatus::Queued; |
| } |
| ANGLE_TRY(createEvent(eventCreateFunc, eventComplete)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::copyImageToFromBuffer(CLImageVk &imageVk, |
| vk::BufferHelper &buffer, |
| const cl::MemOffsets &origin, |
| const cl::Coordinate ®ion, |
| size_t bufferOffset, |
| ImageBufferCopyDirection direction) |
| { |
| vk::CommandBufferAccess access; |
| vk::OutsideRenderPassCommandBuffer *commandBuffer; |
| VkImageAspectFlags aspectFlags = imageVk.getImage().getAspectFlags(); |
| if (direction == ImageBufferCopyDirection::ToBuffer) |
| { |
| access.onImageTransferRead(aspectFlags, &imageVk.getImage()); |
| access.onBufferTransferWrite(&buffer); |
| } |
| else |
| { |
| access.onImageTransferWrite(gl::LevelIndex(0), 1, 0, |
| static_cast<uint32_t>(imageVk.getArraySize()), aspectFlags, |
| &imageVk.getImage()); |
| access.onBufferTransferRead(&buffer); |
| } |
| ANGLE_TRY(getCommandBuffer(access, &commandBuffer)); |
| |
| VkBufferImageCopy copyRegion = {}; |
| copyRegion.bufferOffset = bufferOffset; |
| copyRegion.bufferRowLength = 0; |
| copyRegion.bufferImageHeight = 0; |
| copyRegion.imageExtent = cl_vk::GetExtent(imageVk.getExtentForCopy(region)); |
| copyRegion.imageOffset = cl_vk::GetOffset(imageVk.getOffsetForCopy(origin)); |
| copyRegion.imageSubresource = imageVk.getSubresourceLayersForCopy( |
| origin, region, imageVk.getType(), ImageCopyWith::Buffer); |
| if (imageVk.isWritable()) |
| { |
        // We need an execution barrier if the image can be written by a kernel
| ANGLE_TRY(insertBarrier()); |
| } |
| |
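    // Make the copy result visible: to the host for image-to-buffer reads, or to subsequent
    // kernels for buffer-to-image writes.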
| VkMemoryBarrier memBarrier = {}; |
| memBarrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; |
| memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT; |
| memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT; |
| if (direction == ImageBufferCopyDirection::ToBuffer) |
| { |
| commandBuffer->copyImageToBuffer(imageVk.getImage().getImage(), |
| VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, |
| buffer.getBuffer().getHandle(), 1, ©Region); |
| |
| mComputePassCommands->getCommandBuffer().pipelineBarrier( |
| VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, 1, &memBarrier, 0, |
| nullptr, 0, nullptr); |
| } |
| else |
| { |
| commandBuffer->copyBufferToImage(buffer.getBuffer().getHandle(), |
| imageVk.getImage().getImage(), |
| VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ©Region); |
| |
| mComputePassCommands->getCommandBuffer().pipelineBarrier( |
| VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memBarrier, |
| 0, nullptr, 0, nullptr); |
| } |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::addToHostTransferList(CLBufferVk *srcBuffer, |
| HostTransferConfig transferConfig) |
| { |
| // TODO(aannestrand): Flush here if we reach some max-transfer-buffer heuristic |
| // http://anglebug.com/377545840 |
| |
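    // Create a transfer (staging) buffer matching the full source buffer size. The
    // device-side copy is recorded now; the host side is synced later at finish time
    // (see syncHostBuffers).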
| cl::Memory *transferBufferHandle = |
| cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer( |
| nullptr, cl::MemFlags{CL_MEM_READ_WRITE}, srcBuffer->getSize(), nullptr)); |
| if (transferBufferHandle == nullptr) |
| { |
| ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES); |
| } |
| HostTransferEntry transferEntry{transferConfig, cl::MemoryPtr{transferBufferHandle}}; |
| mCommandsStateMap[mComputePassCommands->getQueueSerial()].hostTransferList.emplace_back( |
| transferEntry); |
| |
| // Release initialization reference, lifetime controlled by RefPointer. |
| transferBufferHandle->release(); |
| |
    // We need an execution barrier if the buffer can be written by a kernel
| if (!mComputePassCommands->getCommandBuffer().empty() && srcBuffer->isWritable()) |
| { |
| // TODO(aannestrand): Look into combining these kernel execution barriers |
| // http://anglebug.com/377545840 |
| VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, |
| VK_ACCESS_SHADER_WRITE_BIT, |
| VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT}; |
| mComputePassCommands->getCommandBuffer().pipelineBarrier( |
| VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, |
| &memoryBarrier, 0, nullptr, 0, nullptr); |
| } |
| |
| // Enqueue blit/transfer cmd |
| VkPipelineStageFlags srcStageMask = {}; |
| VkPipelineStageFlags dstStageMask = {}; |
| VkMemoryBarrier memBarrier = {}; |
| memBarrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; |
| CLBufferVk &transferBufferHandleVk = transferBufferHandle->getImpl<CLBufferVk>(); |
| switch (transferConfig.type) |
| { |
| case CL_COMMAND_WRITE_BUFFER: |
| { |
| VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset, |
| transferConfig.size}; |
| ANGLE_TRY(transferBufferHandleVk.copyFrom(transferConfig.srcHostPtr, |
| transferConfig.offset, transferConfig.size)); |
| copyRegion.srcOffset += transferBufferHandleVk.getOffset(); |
| copyRegion.dstOffset += srcBuffer->getOffset(); |
| mComputePassCommands->getCommandBuffer().copyBuffer( |
| transferBufferHandleVk.getBuffer().getBuffer(), srcBuffer->getBuffer().getBuffer(), |
| 1, ©Region); |
| |
| srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; |
| dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT; |
| memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT; |
| break; |
| } |
| case CL_COMMAND_WRITE_BUFFER_RECT: |
| { |
| ANGLE_TRY(transferBufferHandleVk.setRect( |
| transferConfig.srcHostPtr, transferConfig.srcRect, transferConfig.dstRect)); |
| for (VkBufferCopy ©Region : |
| transferBufferHandleVk.rectCopyRegions(transferConfig.dstRect)) |
| { |
| copyRegion.srcOffset += transferBufferHandleVk.getOffset(); |
| copyRegion.dstOffset += srcBuffer->getOffset(); |
| mComputePassCommands->getCommandBuffer().copyBuffer( |
| transferBufferHandleVk.getBuffer().getBuffer(), |
| srcBuffer->getBuffer().getBuffer(), 1, ©Region); |
| } |
| |
| // Config transfer barrier |
| srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; |
| dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT; |
| memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT; |
| break; |
| } |
| case CL_COMMAND_READ_BUFFER: |
| { |
| VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset, |
| transferConfig.size}; |
| copyRegion.srcOffset += srcBuffer->getOffset(); |
| copyRegion.dstOffset += transferBufferHandleVk.getOffset(); |
| mComputePassCommands->getCommandBuffer().copyBuffer( |
| srcBuffer->getBuffer().getBuffer(), transferBufferHandleVk.getBuffer().getBuffer(), |
| 1, ©Region); |
| |
| srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| dstStageMask = VK_PIPELINE_STAGE_HOST_BIT; |
| memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT; |
| memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT; |
| break; |
| } |
| case CL_COMMAND_READ_BUFFER_RECT: |
| { |
| for (VkBufferCopy ©Region : |
| transferBufferHandleVk.rectCopyRegions(transferConfig.srcRect)) |
| { |
| copyRegion.srcOffset += srcBuffer->getOffset(); |
| copyRegion.dstOffset += transferBufferHandleVk.getOffset(); |
| mComputePassCommands->getCommandBuffer().copyBuffer( |
| srcBuffer->getBuffer().getBuffer(), |
| transferBufferHandleVk.getBuffer().getBuffer(), 1, ©Region); |
| } |
| |
| // Config transfer barrier |
| srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| dstStageMask = VK_PIPELINE_STAGE_HOST_BIT; |
| memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT; |
| memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT; |
| break; |
| } |
| case CL_COMMAND_FILL_BUFFER: |
| { |
| VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset, |
| transferConfig.size}; |
| ANGLE_TRY(transferBufferHandleVk.fillWithPattern( |
| transferConfig.srcHostPtr, transferConfig.patternSize, transferConfig.offset, |
| transferConfig.size)); |
| copyRegion.srcOffset += transferBufferHandleVk.getOffset(); |
| copyRegion.dstOffset += srcBuffer->getOffset(); |
| mComputePassCommands->getCommandBuffer().copyBuffer( |
| transferBufferHandleVk.getBuffer().getBuffer(), srcBuffer->getBuffer().getBuffer(), |
| 1, ©Region); |
| |
| // Config transfer barrier |
| srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; |
| dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT; |
| memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT; |
| break; |
| } |
| default: |
| UNIMPLEMENTED(); |
| break; |
| } |
| |
| // TODO(aannestrand): Look into combining these transfer barriers |
| // http://anglebug.com/377545840 |
| mComputePassCommands->getCommandBuffer().pipelineBarrier(srcStageMask, dstStageMask, 0, 1, |
| &memBarrier, 0, nullptr, 0, nullptr); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::addToHostTransferList(CLImageVk *srcImage, |
| HostTransferConfig transferConfig) |
| { |
| // TODO(aannestrand): Flush here if we reach some max-transfer-buffer heuristic |
| // http://anglebug.com/377545840 |
| CommandsState &commandsState = mCommandsStateMap[mComputePassCommands->getQueueSerial()]; |
| |
| cl::Memory *transferBufferHandle = |
| cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer( |
| nullptr, cl::MemFlags{CL_MEM_READ_WRITE}, srcImage->getSize(), nullptr)); |
| if (transferBufferHandle == nullptr) |
| { |
| ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES); |
| } |
| |
| HostTransferEntry transferEntry{transferConfig, cl::MemoryPtr{transferBufferHandle}}; |
| commandsState.hostTransferList.emplace_back(transferEntry); |
| |
| // Release initialization reference, lifetime controlled by RefPointer. |
| transferBufferHandle->release(); |
| |
| // Enqueue blit |
| CLBufferVk &transferBufferHandleVk = transferBufferHandle->getImpl<CLBufferVk>(); |
| ANGLE_TRY(copyImageToFromBuffer(*srcImage, transferBufferHandleVk.getBuffer(), |
| transferConfig.origin, transferConfig.region, 0, |
| ImageBufferCopyDirection::ToBuffer)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueReadImage(const cl::Image &image, |
| bool blocking, |
| const cl::MemOffsets &origin, |
| const cl::Coordinate ®ion, |
| size_t rowPitch, |
| size_t slicePitch, |
| void *ptr, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| CLImageVk &imageVk = image.getImpl<CLImageVk>(); |
| size_t size = (region.x * region.y * region.z * imageVk.getElementSize()); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
    if (!imageVk.isStagingBufferInitialized())
| { |
| ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize())); |
| } |
| |
| if (blocking) |
| { |
| ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), origin, region, 0, |
| ImageBufferCopyDirection::ToBuffer)); |
| ANGLE_TRY(finishInternal()); |
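        // A row/slice pitch of zero means the host memory is tightly packed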
| if (rowPitch == 0 && slicePitch == 0) |
| { |
| ANGLE_TRY(imageVk.copyStagingTo(ptr, 0, size)); |
| } |
| else |
| { |
| ANGLE_TRY(imageVk.copyStagingToFromWithPitch(ptr, region, rowPitch, slicePitch, |
| StagingBufferCopyDirection::ToHost)); |
| } |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete)); |
| } |
| else |
| { |
| // Create a transfer buffer and push it in update list |
| HostTransferConfig transferConfig; |
| transferConfig.type = CL_COMMAND_READ_IMAGE; |
| transferConfig.size = size; |
| transferConfig.dstHostPtr = ptr; |
| transferConfig.origin = origin; |
| transferConfig.region = region; |
| transferConfig.rowPitch = rowPitch; |
| transferConfig.slicePitch = slicePitch; |
| transferConfig.elementSize = imageVk.getElementSize(); |
| ANGLE_TRY(addToHostTransferList(&imageVk, transferConfig)); |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued)); |
| } |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueWriteImage(const cl::Image &image, |
| bool blocking, |
| const cl::MemOffsets &origin, |
| const cl::Coordinate ®ion, |
| size_t inputRowPitch, |
| size_t inputSlicePitch, |
| const void *ptr, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| CLImageVk &imageVk = image.getImpl<CLImageVk>(); |
| size_t size = (region.x * region.y * region.z * imageVk.getElementSize()); |
| cl::ExecutionStatus eventInitialState = cl::ExecutionStatus::Queued; |
    if (!imageVk.isStagingBufferInitialized())
| { |
| ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize())); |
| } |
| |
| if (inputRowPitch == 0 && inputSlicePitch == 0) |
| { |
        ANGLE_TRY(imageVk.copyStagingFrom(const_cast<void *>(ptr), 0, size));
| } |
| else |
| { |
        ANGLE_TRY(imageVk.copyStagingToFromWithPitch(const_cast<void *>(ptr), region,
                                                     inputRowPitch, inputSlicePitch,
                                                     StagingBufferCopyDirection::ToStagingBuffer));
| } |
| |
| ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), origin, region, 0, |
| ImageBufferCopyDirection::ToImage)); |
| |
| if (blocking) |
| { |
| ANGLE_TRY(finishInternal()); |
| eventInitialState = cl::ExecutionStatus::Complete; |
| } |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, eventInitialState)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueCopyImage(const cl::Image &srcImage, |
| const cl::Image &dstImage, |
| const cl::MemOffsets &srcOrigin, |
| const cl::MemOffsets &dstOrigin, |
| const cl::Coordinate ®ion, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| auto srcImageVk = &srcImage.getImpl<CLImageVk>(); |
| auto dstImageVk = &dstImage.getImpl<CLImageVk>(); |
| |
| vk::CommandBufferAccess access; |
| vk::OutsideRenderPassCommandBuffer *commandBuffer; |
    VkImageAspectFlags dstAspectFlags = dstImageVk->getImage().getAspectFlags();
    VkImageAspectFlags srcAspectFlags = srcImageVk->getImage().getAspectFlags();
| access.onImageTransferWrite(gl::LevelIndex(0), 1, 0, 1, dstAspectFlags, |
| &dstImageVk->getImage()); |
| access.onImageTransferRead(srcAspectFlags, &srcImageVk->getImage()); |
| ANGLE_TRY(getCommandBuffer(access, &commandBuffer)); |
| |
| VkImageCopy copyRegion = {}; |
| copyRegion.extent = cl_vk::GetExtent(srcImageVk->getExtentForCopy(region)); |
| copyRegion.srcOffset = cl_vk::GetOffset(srcImageVk->getOffsetForCopy(srcOrigin)); |
| copyRegion.dstOffset = cl_vk::GetOffset(dstImageVk->getOffsetForCopy(dstOrigin)); |
| copyRegion.srcSubresource = srcImageVk->getSubresourceLayersForCopy( |
| srcOrigin, region, dstImageVk->getType(), ImageCopyWith::Image); |
| copyRegion.dstSubresource = dstImageVk->getSubresourceLayersForCopy( |
| dstOrigin, region, srcImageVk->getType(), ImageCopyWith::Image); |
| if (srcImageVk->isWritable() || dstImageVk->isWritable()) |
| { |
        // We need an execution barrier if either image can be written by a kernel
| ANGLE_TRY(insertBarrier()); |
| } |
| |
| commandBuffer->copyImage( |
| srcImageVk->getImage().getImage(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, |
| dstImageVk->getImage().getImage(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ©Region); |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueFillImage(const cl::Image &image, |
| const void *fillColor, |
| const cl::MemOffsets &origin, |
| const cl::Coordinate ®ion, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| CLImageVk &imageVk = image.getImpl<CLImageVk>(); |
| PixelColor packedColor; |
| cl::Extents extent = imageVk.getImageExtent(); |
| |
| imageVk.packPixels(fillColor, &packedColor); |
| |
    if (!imageVk.isStagingBufferInitialized())
| { |
| ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize())); |
| } |
| |
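    // Fill is implemented as read-modify-write: copy the image into its staging buffer,
    // fill the requested region on the host, then copy the result back into the image.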
| ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero, |
| {extent.width, extent.height, extent.depth}, 0, |
| ImageBufferCopyDirection::ToBuffer)); |
| ANGLE_TRY(finishInternal()); |
| |
| uint8_t *mapPointer = nullptr; |
| ANGLE_TRY(imageVk.map(mapPointer, 0)); |
| imageVk.fillImageWithColor(origin, region, mapPointer, &packedColor); |
| imageVk.unmap(); |
| mapPointer = nullptr; |
| ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero, |
| {extent.width, extent.height, extent.depth}, 0, |
| ImageBufferCopyDirection::ToImage)); |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueCopyImageToBuffer(const cl::Image &srcImage, |
| const cl::Buffer &dstBuffer, |
| const cl::MemOffsets &srcOrigin, |
| const cl::Coordinate ®ion, |
| size_t dstOffset, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| CLImageVk &srcImageVk = srcImage.getImpl<CLImageVk>(); |
| CLBufferVk &dstBufferVk = dstBuffer.getImpl<CLBufferVk>(); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| ANGLE_TRY(copyImageToFromBuffer(srcImageVk, dstBufferVk.getBuffer(), srcOrigin, region, |
| dstOffset, ImageBufferCopyDirection::ToBuffer)); |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueCopyBufferToImage(const cl::Buffer &srcBuffer, |
| const cl::Image &dstImage, |
| size_t srcOffset, |
| const cl::MemOffsets &dstOrigin, |
| const cl::Coordinate ®ion, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| CLBufferVk &srcBufferVk = srcBuffer.getImpl<CLBufferVk>(); |
| CLImageVk &dstImageVk = dstImage.getImpl<CLImageVk>(); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| ANGLE_TRY(copyImageToFromBuffer(dstImageVk, srcBufferVk.getBuffer(), dstOrigin, region, |
| srcOffset, ImageBufferCopyDirection::ToImage)); |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueMapImage(const cl::Image &image, |
| bool blocking, |
| cl::MapFlags mapFlags, |
| const cl::MemOffsets &origin, |
| const cl::Coordinate ®ion, |
| size_t *imageRowPitch, |
| size_t *imageSlicePitch, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc, |
| void *&mapPtr) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| // TODO: Look into better enqueue handling of this map-op if non-blocking |
| // https://anglebug.com/376722715 |
| CLImageVk *imageVk = &image.getImpl<CLImageVk>(); |
| cl::Extents extent = imageVk->getImageExtent(); |
| if (blocking) |
| { |
| ANGLE_TRY(finishInternal()); |
| } |
| |
| mComputePassCommands->imageRead(mContext, imageVk->getImage().getAspectFlags(), |
| vk::ImageLayout::TransferSrc, &imageVk->getImage()); |
| |
    if (!imageVk->isStagingBufferInitialized())
| { |
| ANGLE_TRY(imageVk->createStagingBuffer(imageVk->getSize())); |
| } |
| |
| ANGLE_TRY(copyImageToFromBuffer(*imageVk, imageVk->getStagingBuffer(), cl::kMemOffsetsZero, |
| {extent.width, extent.height, extent.depth}, 0, |
| ImageBufferCopyDirection::ToBuffer)); |
| if (blocking) |
| { |
| ANGLE_TRY(finishInternal()); |
| } |
| |
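    // Compute a tightly packed row pitch and the offset/size of the mapped region within
    // the staging buffer.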
| uint8_t *mapPointer = nullptr; |
| size_t elementSize = imageVk->getElementSize(); |
| size_t rowPitch = (extent.width * elementSize); |
| size_t offset = |
| (origin.x * elementSize) + (origin.y * rowPitch) + (origin.z * extent.height * rowPitch); |
| size_t size = (region.x * region.y * region.z * elementSize); |
| |
| if (image.getFlags().intersects(CL_MEM_USE_HOST_PTR)) |
| { |
| mapPointer = static_cast<uint8_t *>(image.getHostPtr()) + offset; |
| ANGLE_TRY(imageVk->copyTo(mapPointer, offset, size)); |
| } |
| else |
| { |
| ANGLE_TRY(imageVk->map(mapPointer, offset)); |
| } |
| mapPtr = static_cast<void *>(mapPointer); |
| |
| *imageRowPitch = rowPitch; |
| |
| switch (imageVk->getDescriptor().type) |
| { |
| case cl::MemObjectType::Image1D: |
| case cl::MemObjectType::Image1D_Buffer: |
| case cl::MemObjectType::Image2D: |
| if (imageSlicePitch != nullptr) |
| { |
| *imageSlicePitch = 0; |
| } |
| break; |
| case cl::MemObjectType::Image2D_Array: |
| case cl::MemObjectType::Image3D: |
| *imageSlicePitch = (extent.height * (*imageRowPitch)); |
| break; |
| case cl::MemObjectType::Image1D_Array: |
| *imageSlicePitch = *imageRowPitch; |
| break; |
| default: |
| UNREACHABLE(); |
| break; |
| } |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueUnmapMemObject(const cl::Memory &memory, |
| void *mappedPtr, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| cl::ExecutionStatus eventComplete = cl::ExecutionStatus::Queued; |
| if (!eventCreateFunc) |
| { |
| ANGLE_TRY(finishInternal()); |
| eventComplete = cl::ExecutionStatus::Complete; |
| } |
| |
| if (memory.getType() == cl::MemObjectType::Buffer) |
| { |
| CLBufferVk &bufferVk = memory.getImpl<CLBufferVk>(); |
| if (memory.getFlags().intersects(CL_MEM_USE_HOST_PTR)) |
| { |
| ANGLE_TRY(finishInternal()); |
| ANGLE_TRY(bufferVk.copyFrom(memory.getHostPtr(), 0, bufferVk.getSize())); |
| eventComplete = cl::ExecutionStatus::Complete; |
| } |
| } |
| else if (memory.getType() != cl::MemObjectType::Pipe) |
| { |
        // Otherwise the memory object is an image type
| CLImageVk &imageVk = memory.getImpl<CLImageVk>(); |
| if (memory.getFlags().intersects(CL_MEM_USE_HOST_PTR)) |
| { |
| uint8_t *mapPointer = static_cast<uint8_t *>(memory.getHostPtr()); |
| ANGLE_TRY(imageVk.copyStagingFrom(mapPointer, 0, imageVk.getSize())); |
| } |
| cl::Extents extent = imageVk.getImageExtent(); |
| ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero, |
| {extent.width, extent.height, extent.depth}, 0, |
| ImageBufferCopyDirection::ToImage)); |
| ANGLE_TRY(finishInternal()); |
| eventComplete = cl::ExecutionStatus::Complete; |
| } |
| else |
| { |
        // Pipe memory objects are not supported; creating one should have failed earlier
| UNREACHABLE(); |
| } |
| |
| memory.getImpl<CLMemoryVk>().unmap(); |
| ANGLE_TRY(createEvent(eventCreateFunc, eventComplete)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueMigrateMemObjects(const cl::MemoryPtrs &memObjects, |
| cl::MemMigrationFlags flags, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| if (mCommandQueue.getContext().getDevices().size() > 1) |
| { |
| // TODO(aannestrand): Later implement support to allow migration of mem objects across |
| // different devices. http://anglebug.com/377942759 |
| UNIMPLEMENTED(); |
| ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES); |
| } |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueNDRangeKernel(const cl::Kernel &kernel, |
| const cl::NDRange &ndrange, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| |
| vk::PipelineCacheAccess pipelineCache; |
| vk::PipelineHelper *pipelineHelper = nullptr; |
| CLKernelVk &kernelImpl = kernel.getImpl<CLKernelVk>(); |
| const CLProgramVk::DeviceProgramData *devProgramData = |
| kernelImpl.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative()); |
| ASSERT(devProgramData != nullptr); |
| cl::NDRange enqueueNDRange(ndrange); |
| |
| // Start with Workgroup size (WGS) from kernel attribute (if available) |
| cl::WorkgroupSize workgroupSize = |
| devProgramData->getCompiledWorkgroupSize(kernelImpl.getKernelName()); |
| if (workgroupSize != cl::WorkgroupSize{0, 0, 0}) |
| { |
| // Local work size (LWS) was valid, use that as WGS |
| enqueueNDRange.localWorkSize = workgroupSize; |
| } |
| else |
| { |
| if (enqueueNDRange.nullLocalWorkSize) |
| { |
            // A NULL value was passed, in which case the OpenCL implementation determines
            // how to break the global work-items into appropriate work-group instances.
| enqueueNDRange.localWorkSize = |
| mCommandQueue.getDevice().getImpl<CLDeviceVk>().selectWorkGroupSize(enqueueNDRange); |
| } |
| // At this point, we should have a non-zero Workgroup size |
| ASSERT((enqueueNDRange.localWorkSize != cl::WorkgroupSize{0, 0, 0})); |
| } |
| |
    // Printf storage is set up for single-time use, so drive any existing usage to
    // completion if the kernel uses printf.
| if (kernelImpl.usesPrintf() && mNeedPrintfHandling) |
| { |
| ANGLE_TRY(finishInternal()); |
| } |
| |
| // Fetch or create compute pipeline (if we miss in cache) |
| ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getPipelineCache(mContext, &pipelineCache), |
| CL_OUT_OF_RESOURCES); |
| |
| ANGLE_TRY(processKernelResources(kernelImpl)); |
| ANGLE_TRY(processGlobalPushConstants(kernelImpl, enqueueNDRange)); |
| |
| // Create uniform dispatch region(s) based on VkLimits for WorkgroupCount |
| const uint32_t *maxComputeWorkGroupCount = |
| mContext->getRenderer()->getPhysicalDeviceProperties().limits.maxComputeWorkGroupCount; |
| for (cl::NDRange &uniformRegion : enqueueNDRange.createUniformRegions( |
| {maxComputeWorkGroupCount[0], maxComputeWorkGroupCount[1], |
| maxComputeWorkGroupCount[2]})) |
| { |
| cl::WorkgroupCount uniformRegionWorkgroupCount = uniformRegion.getWorkgroupCount(); |
| const VkPushConstantRange *pushConstantRegionOffset = |
| devProgramData->getRegionOffsetRange(); |
| if (pushConstantRegionOffset != nullptr) |
| { |
| // The sum of the global ID offset into the NDRange for this uniform region and |
| // the global offset of the NDRange |
| // https://github.com/google/clspv/blob/main/docs/OpenCLCOnVulkan.md#module-scope-push-constants |
| uint32_t regionOffsets[3] = { |
| enqueueNDRange.globalWorkOffset[0] + uniformRegion.globalWorkOffset[0], |
| enqueueNDRange.globalWorkOffset[1] + uniformRegion.globalWorkOffset[1], |
| enqueueNDRange.globalWorkOffset[2] + uniformRegion.globalWorkOffset[2]}; |
| mComputePassCommands->getCommandBuffer().pushConstants( |
| kernelImpl.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, |
| pushConstantRegionOffset->offset, pushConstantRegionOffset->size, ®ionOffsets); |
| } |
| const VkPushConstantRange *pushConstantRegionGroupOffset = |
| devProgramData->getRegionGroupOffsetRange(); |
| if (pushConstantRegionGroupOffset != nullptr) |
| { |
| // The 3D group ID offset into the NDRange for this region |
| // https://github.com/google/clspv/blob/main/docs/OpenCLCOnVulkan.md#module-scope-push-constants |
| ASSERT(enqueueNDRange.localWorkSize[0] > 0 && enqueueNDRange.localWorkSize[1] > 0 && |
| enqueueNDRange.localWorkSize[2] > 0); |
| ASSERT(uniformRegion.globalWorkOffset[0] % enqueueNDRange.localWorkSize[0] == 0 && |
| uniformRegion.globalWorkOffset[1] % enqueueNDRange.localWorkSize[1] == 0 && |
| uniformRegion.globalWorkOffset[2] % enqueueNDRange.localWorkSize[2] == 0); |
| uint32_t regionGroupOffsets[3] = { |
| uniformRegion.globalWorkOffset[0] / enqueueNDRange.localWorkSize[0], |
| uniformRegion.globalWorkOffset[1] / enqueueNDRange.localWorkSize[1], |
| uniformRegion.globalWorkOffset[2] / enqueueNDRange.localWorkSize[2]}; |
| mComputePassCommands->getCommandBuffer().pushConstants( |
| kernelImpl.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, |
| pushConstantRegionGroupOffset->offset, pushConstantRegionGroupOffset->size, |
| ®ionGroupOffsets); |
| } |
| |
| ANGLE_TRY(kernelImpl.getOrCreateComputePipeline( |
| &pipelineCache, uniformRegion, mCommandQueue.getDevice(), &pipelineHelper)); |
| mComputePassCommands->retainResource(pipelineHelper); |
| mComputePassCommands->getCommandBuffer().bindComputePipeline(pipelineHelper->getPipeline()); |
| mComputePassCommands->getCommandBuffer().dispatch(uniformRegionWorkgroupCount[0], |
| uniformRegionWorkgroupCount[1], |
| uniformRegionWorkgroupCount[2]); |
| } |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueTask(const cl::Kernel &kernel, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| constexpr size_t globalWorkSize[3] = {1, 0, 0}; |
| constexpr size_t localWorkSize[3] = {1, 0, 0}; |
| cl::NDRange ndrange(1, nullptr, globalWorkSize, localWorkSize); |
| return enqueueNDRangeKernel(kernel, ndrange, waitEvents, eventCreateFunc); |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueNativeKernel(cl::UserFunc userFunc, |
| void *args, |
| size_t cbArgs, |
| const cl::BufferPtrs &buffers, |
| const std::vector<size_t> &bufferPtrOffsets, |
| const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| UNIMPLEMENTED(); |
| ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES); |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueMarkerWithWaitList(const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueMarker(CLEventImpl::CreateFunc &eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
    // This deprecated API is essentially a superset of clEnqueueBarrier that also returns
    // an event object (i.e. a marker), which clEnqueueBarrier does not provide.
| ANGLE_TRY(insertBarrier()); |
| |
| ANGLE_TRY(createEvent(&eventCreateFunc, cl::ExecutionStatus::Queued)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueWaitForEvents(const cl::EventPtrs &events) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| // Unlike clWaitForEvents, this routine is non-blocking |
| ANGLE_TRY(processWaitlist(events)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueBarrierWithWaitList(const cl::EventPtrs &waitEvents, |
| CLEventImpl::CreateFunc *eventCreateFunc) |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
    // The barrier command either waits for a list of events to complete or, if the list is
    // empty, waits for all commands previously enqueued in command_queue to complete before
    // it completes.
| if (waitEvents.empty()) |
| { |
| ANGLE_TRY(insertBarrier()); |
| } |
| else |
| { |
| ANGLE_TRY(processWaitlist(waitEvents)); |
| } |
| |
| ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::insertBarrier() |
| { |
| VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, |
| VK_ACCESS_SHADER_WRITE_BIT, |
| VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT}; |
| mComputePassCommands->getCommandBuffer().pipelineBarrier( |
| VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, |
| &memoryBarrier, 0, nullptr, 0, nullptr); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::enqueueBarrier() |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRY(insertBarrier()); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::flush() |
| { |
| ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::flush"); |
| |
| QueueSerial lastSubmittedQueueSerial; |
| { |
| std::unique_lock<std::mutex> ul(mCommandQueueMutex); |
| |
| ANGLE_TRY(flushInternal()); |
| lastSubmittedQueueSerial = mLastSubmittedQueueSerial; |
| } |
| |
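    // Notify the finish handler outside the queue lock; notify() can block when its serial
    // ring is full, and the worker thread needs to make progress to drain it.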
| return mFinishHandler.notify(lastSubmittedQueueSerial); |
| } |
| |
| angle::Result CLCommandQueueVk::finish() |
| { |
| std::scoped_lock<std::mutex> sl(mCommandQueueMutex); |
| |
| ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::finish"); |
| |
| // Blocking finish |
| return finishInternal(); |
| } |
| |
| angle::Result CLCommandQueueVk::syncHostBuffers(HostTransferEntries &hostTransferList) |
| { |
| if (!hostTransferList.empty()) |
| { |
| for (const HostTransferEntry &hostTransferEntry : hostTransferList) |
| { |
| const HostTransferConfig &transferConfig = hostTransferEntry.transferConfig; |
| CLBufferVk &transferBufferVk = |
| hostTransferEntry.transferBufferHandle->getImpl<CLBufferVk>(); |
| switch (hostTransferEntry.transferConfig.type) |
| { |
| case CL_COMMAND_FILL_BUFFER: |
| case CL_COMMAND_WRITE_BUFFER: |
| case CL_COMMAND_WRITE_BUFFER_RECT: |
| // Nothing left to do here |
| break; |
| case CL_COMMAND_READ_BUFFER: |
| case CL_COMMAND_READ_IMAGE: |
| if (transferConfig.rowPitch == 0 && transferConfig.slicePitch == 0) |
| { |
| ANGLE_TRY(transferBufferVk.copyTo( |
| transferConfig.dstHostPtr, transferConfig.offset, transferConfig.size)); |
| } |
| else |
| { |
| ANGLE_TRY(transferBufferVk.copyToWithPitch( |
| transferConfig.dstHostPtr, transferConfig.offset, transferConfig.size, |
| transferConfig.rowPitch, transferConfig.slicePitch, |
| transferConfig.region, transferConfig.elementSize)); |
| } |
| break; |
| case CL_COMMAND_READ_BUFFER_RECT: |
| ANGLE_TRY(transferBufferVk.getRect( |
| transferConfig.srcRect, transferConfig.dstRect, transferConfig.dstHostPtr)); |
| break; |
| default: |
| UNIMPLEMENTED(); |
| break; |
| } |
| } |
| } |
| hostTransferList.clear(); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::addMemoryDependencies(cl::Memory *clMem) |
| { |
| cl::Memory *parentMem = clMem->getParent() ? clMem->getParent().get() : nullptr; |
| |
    // Retain the memory object until the associated commands finish
| mCommandsStateMap[mComputePassCommands->getQueueSerial()].memories.emplace_back(clMem); |
| |
| // Handle possible resource RAW hazard |
| bool needsBarrier = false; |
| if (clMem->getFlags().intersects(CL_MEM_READ_WRITE)) |
| { |
| // Texel buffers have backing buffer objects |
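        // Barrier if this memory object (or its parent) was already used in the current
        // pass, or if the tracker is full and a hazard must be conservatively assumed.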
| if (mDependencyTracker.contains(clMem) || mDependencyTracker.contains(parentMem) || |
| mDependencyTracker.size() == kMaxDependencyTrackerSize) |
| { |
| needsBarrier = true; |
| mDependencyTracker.clear(); |
| } |
| mDependencyTracker.insert(clMem); |
| if (parentMem) |
| { |
| mDependencyTracker.insert(parentMem); |
| } |
| } |
| |
| // Insert a layout transition for images |
| if (cl::IsImageType(clMem->getType())) |
| { |
| CLImageVk &vkMem = clMem->getImpl<CLImageVk>(); |
| mComputePassCommands->imageWrite(mContext, gl::LevelIndex(0), 0, 1, |
| vkMem.getImage().getAspectFlags(), |
| vk::ImageLayout::ComputeShaderWrite, &vkMem.getImage()); |
| } |
| if (needsBarrier) |
| { |
| ANGLE_TRY(insertBarrier()); |
| } |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::processKernelResources(CLKernelVk &kernelVk) |
| { |
| bool podBufferPresent = false; |
| uint32_t podBinding = 0; |
| VkDescriptorType podDescriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; |
| const CLProgramVk::DeviceProgramData *devProgramData = |
| kernelVk.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative()); |
| ASSERT(devProgramData != nullptr); |
| |
| // Set the descriptor set layouts and allocate descriptor sets |
    // The descriptor set layouts are set up in the order of their appearance, as Vulkan
    // requires them to point to valid handles.
| angle::EnumIterator<DescriptorSetIndex> layoutIndex(DescriptorSetIndex::LiteralSampler); |
| for (DescriptorSetIndex index : angle::AllEnums<DescriptorSetIndex>()) |
| { |
| if (!kernelVk.getDescriptorSetLayoutDesc(index).empty()) |
| { |
| // Setup the descriptor layout |
| ANGLE_CL_IMPL_TRY_ERROR(mContext->getDescriptorSetLayoutCache()->getDescriptorSetLayout( |
| mContext, kernelVk.getDescriptorSetLayoutDesc(index), |
| &kernelVk.getDescriptorSetLayouts()[*layoutIndex]), |
| CL_INVALID_OPERATION); |
| ASSERT(kernelVk.getDescriptorSetLayouts()[*layoutIndex]->valid()); |
| |
| // Allocate descriptor set |
| ANGLE_TRY(mContext->allocateDescriptorSet(&kernelVk, index, layoutIndex, |
| mComputePassCommands)); |
| ++layoutIndex; |
| } |
| } |
| |
| // Setup the pipeline layout |
| ANGLE_CL_IMPL_TRY_ERROR(kernelVk.initPipelineLayout(), CL_INVALID_OPERATION); |
| |
| // Retain kernel object until we finish executing it later |
| mCommandsStateMap[mComputePassCommands->getQueueSerial()].kernels.emplace_back( |
| &kernelVk.getFrontendObject()); |
| |
| // Process descriptor sets used by the kernel |
| vk::DescriptorSetArray<UpdateDescriptorSetsBuilder> updateDescriptorSetsBuilders; |
| |
| UpdateDescriptorSetsBuilder &literalSamplerDescSetBuilder = |
| updateDescriptorSetsBuilders[DescriptorSetIndex::LiteralSampler]; |
| |
| // Create/Setup Literal Sampler |
| for (const ClspvLiteralSampler &literalSampler : devProgramData->reflectionData.literalSamplers) |
| { |
| cl::SamplerPtr clLiteralSampler = |
| cl::SamplerPtr(cl::Sampler::Cast(this->mContext->getFrontendObject().createSampler( |
| literalSampler.normalizedCoords, literalSampler.addressingMode, |
| literalSampler.filterMode))); |
| |
        ASSERT(clLiteralSampler != nullptr);

        // Release the initialization reference; the RefPointer now controls the lifetime
        clLiteralSampler->release();
| CLSamplerVk &vkLiteralSampler = clLiteralSampler->getImpl<CLSamplerVk>(); |
| |
| VkDescriptorImageInfo &samplerInfo = |
| literalSamplerDescSetBuilder.allocDescriptorImageInfo(); |
| samplerInfo.sampler = vkLiteralSampler.getSamplerHelper().get().getHandle(); |
| samplerInfo.imageView = VK_NULL_HANDLE; |
| samplerInfo.imageLayout = VK_IMAGE_LAYOUT_UNDEFINED; |
| |
| VkWriteDescriptorSet &writeDescriptorSet = |
| literalSamplerDescSetBuilder.allocWriteDescriptorSet(); |
| writeDescriptorSet.descriptorCount = 1; |
| writeDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER; |
| writeDescriptorSet.pImageInfo = &samplerInfo; |
| writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; |
| writeDescriptorSet.dstSet = kernelVk.getDescriptorSet(DescriptorSetIndex::LiteralSampler); |
| writeDescriptorSet.dstBinding = literalSampler.binding; |
| |
| mCommandsStateMap[mComputePassCommands->getQueueSerial()].samplers.emplace_back( |
| clLiteralSampler); |
| } |
| |
| const CLKernelArguments &args = kernelVk.getArgs(); |
| UpdateDescriptorSetsBuilder &kernelArgDescSetBuilder = |
| updateDescriptorSetsBuilders[DescriptorSetIndex::KernelArguments]; |
| for (size_t index = 0; index < args.size(); index++) |
| { |
| const auto &arg = args.at(index); |
| switch (arg.type) |
| { |
| case NonSemanticClspvReflectionArgumentUniform: |
| case NonSemanticClspvReflectionArgumentStorageBuffer: |
| { |
| cl::Memory *clMem = cl::Buffer::Cast(static_cast<const cl_mem>(arg.handle)); |
| CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>(); |
| |
| ANGLE_TRY(addMemoryDependencies(clMem)); |
| |
| // Update buffer/descriptor info |
| VkDescriptorBufferInfo &bufferInfo = |
| kernelArgDescSetBuilder.allocDescriptorBufferInfo(); |
| bufferInfo.range = clMem->getSize(); |
| bufferInfo.offset = clMem->getOffset(); |
| bufferInfo.buffer = vkMem.getBuffer().getBuffer().getHandle(); |
| VkWriteDescriptorSet &writeDescriptorSet = |
| kernelArgDescSetBuilder.allocWriteDescriptorSet(); |
| writeDescriptorSet.descriptorCount = 1; |
| writeDescriptorSet.descriptorType = |
| arg.type == NonSemanticClspvReflectionArgumentUniform |
| ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER |
| : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; |
| writeDescriptorSet.pBufferInfo = &bufferInfo; |
| writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; |
| writeDescriptorSet.dstSet = |
| kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments); |
| writeDescriptorSet.dstBinding = arg.descriptorBinding; |
| break; |
| } |
| case NonSemanticClspvReflectionArgumentPodPushConstant: |
| { |
| ASSERT(!podBufferPresent); |
| |
| // The Vulkan spec requires the push-constant offset and size to be multiples of |
| // 4; round the offset down and the size up to guarantee this |
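| // For example, a POD argument with pushConstOffset 6 and pushConstantSize 3 |
| // yields offset 4 and size 8, covering bytes [4, 12). |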
| uint32_t offset = roundDownPow2(arg.pushConstOffset, 4u); |
| uint32_t size = |
| roundUpPow2(arg.pushConstOffset + arg.pushConstantSize, 4u) - offset; |
| ASSERT(offset + size <= kernelVk.getPodArgumentPushConstantsData().size()); |
| mComputePassCommands->getCommandBuffer().pushConstants( |
| kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, offset, size, |
| &kernelVk.getPodArgumentPushConstantsData()[offset]); |
| break; |
| } |
| case NonSemanticClspvReflectionArgumentWorkgroup: |
| { |
| // Nothing to do here (this is already taken care of during clSetKernelArg) |
| break; |
| } |
| case NonSemanticClspvReflectionArgumentSampler: |
| { |
| cl::Sampler *clSampler = |
| cl::Sampler::Cast(*static_cast<const cl_sampler *>(arg.handle)); |
| CLSamplerVk &vkSampler = clSampler->getImpl<CLSamplerVk>(); |
| VkDescriptorImageInfo &samplerInfo = |
| kernelArgDescSetBuilder.allocDescriptorImageInfo(); |
| samplerInfo.sampler = vkSampler.getSamplerHelper().get().getHandle(); |
| VkWriteDescriptorSet &writeDescriptorSet = |
| kernelArgDescSetBuilder.allocWriteDescriptorSet(); |
| writeDescriptorSet.descriptorCount = 1; |
| writeDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER; |
| writeDescriptorSet.pImageInfo = &samplerInfo; |
| writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; |
| writeDescriptorSet.dstSet = |
| kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments); |
| writeDescriptorSet.dstBinding = arg.descriptorBinding; |
| |
| const VkPushConstantRange *samplerMaskRange = |
| devProgramData->getNormalizedSamplerMaskRange(index); |
| if (samplerMaskRange != nullptr) |
| { |
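| // This kernel expects a sampler with normalized coordinates here plus a |
| // sampler-mask push constant; if the app's sampler is unnormalized, bind a |
| // normalized variant and let the generated shader recover the original |
| // behavior from the mask pushed below. |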
| if (!clSampler->getNormalizedCoords()) |
| { |
| ANGLE_TRY(vkSampler.createNormalized()); |
| samplerInfo.sampler = |
| vkSampler.getSamplerHelperNormalized().get().getHandle(); |
| } |
| uint32_t mask = vkSampler.getSamplerMask(); |
| mComputePassCommands->getCommandBuffer().pushConstants( |
| kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, |
| samplerMaskRange->offset, samplerMaskRange->size, &mask); |
| } |
| break; |
| } |
| case NonSemanticClspvReflectionArgumentStorageImage: |
| case NonSemanticClspvReflectionArgumentSampledImage: |
| { |
| cl::Memory *clMem = cl::Image::Cast(static_cast<const cl_mem>(arg.handle)); |
| CLImageVk &vkMem = clMem->getImpl<CLImageVk>(); |
| |
| ANGLE_TRY(addMemoryDependencies(clMem)); |
| |
| cl_image_format imageFormat = vkMem.getFormat(); |
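| // clspv-generated kernels may need the CL image format at runtime (e.g. for |
| // conversions in read/write helpers); pass the channel order and data type via |
| // push constants when the reflection data requests them. |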
| const VkPushConstantRange *imageDataChannelOrderRange = |
| devProgramData->getImageDataChannelOrderRange(index); |
| if (imageDataChannelOrderRange != nullptr) |
| { |
| mComputePassCommands->getCommandBuffer().pushConstants( |
| kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, |
| imageDataChannelOrderRange->offset, imageDataChannelOrderRange->size, |
| &imageFormat.image_channel_order); |
| } |
| |
| const VkPushConstantRange *imageDataChannelDataTypeRange = |
| devProgramData->getImageDataChannelDataTypeRange(index); |
| if (imageDataChannelDataTypeRange != nullptr) |
| { |
| mComputePassCommands->getCommandBuffer().pushConstants( |
| kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, |
| imageDataChannelDataTypeRange->offset, imageDataChannelDataTypeRange->size, |
| &imageFormat.image_channel_data_type); |
| } |
| |
| // Update image/descriptor info |
| VkDescriptorImageInfo &imageInfo = |
| kernelArgDescSetBuilder.allocDescriptorImageInfo(); |
| imageInfo.imageLayout = arg.type == NonSemanticClspvReflectionArgumentStorageImage |
| ? VK_IMAGE_LAYOUT_GENERAL |
| : vkMem.getImage().getCurrentLayout(); |
| imageInfo.imageView = vkMem.getImageView().getHandle(); |
| imageInfo.sampler = VK_NULL_HANDLE; |
| VkWriteDescriptorSet &writeDescriptorSet = |
| kernelArgDescSetBuilder.allocWriteDescriptorSet(); |
| writeDescriptorSet.descriptorCount = 1; |
| writeDescriptorSet.descriptorType = |
| arg.type == NonSemanticClspvReflectionArgumentStorageImage |
| ? VK_DESCRIPTOR_TYPE_STORAGE_IMAGE |
| : VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; |
| writeDescriptorSet.pImageInfo = &imageInfo; |
| writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; |
| writeDescriptorSet.dstSet = |
| kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments); |
| writeDescriptorSet.dstBinding = arg.descriptorBinding; |
| break; |
| } |
| case NonSemanticClspvReflectionArgumentUniformTexelBuffer: |
| case NonSemanticClspvReflectionArgumentStorageTexelBuffer: |
| { |
| cl::Memory *clMem = cl::Image::Cast(static_cast<const cl_mem>(arg.handle)); |
| CLImageVk &vkMem = clMem->getImpl<CLImageVk>(); |
| |
| ANGLE_TRY(addMemoryDependencies(clMem)); |
| |
| VkBufferView &bufferView = kernelArgDescSetBuilder.allocBufferView(); |
| const vk::BufferView *vkBufferView = nullptr; |
| ANGLE_TRY(vkMem.getBufferView(&vkBufferView)); |
| bufferView = vkBufferView->getHandle(); |
| |
| VkWriteDescriptorSet &writeDescriptorSet = |
| kernelArgDescSetBuilder.allocWriteDescriptorSet(); |
| writeDescriptorSet.descriptorCount = 1; |
| writeDescriptorSet.descriptorType = |
| arg.type == NonSemanticClspvReflectionArgumentStorageTexelBuffer |
| ? VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER |
| : VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER; |
| writeDescriptorSet.pImageInfo = nullptr; |
| writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; |
| writeDescriptorSet.dstSet = |
| kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments); |
| writeDescriptorSet.dstBinding = arg.descriptorBinding; |
| writeDescriptorSet.pTexelBufferView = &bufferView; |
| |
| break; |
| } |
| case NonSemanticClspvReflectionArgumentPodUniform: |
| case NonSemanticClspvReflectionArgumentPodStorageBuffer: |
| { |
| if (!podBufferPresent) |
| { |
| podBufferPresent = true; |
| podBinding = arg.descriptorBinding; |
| podDescriptorType = arg.type == NonSemanticClspvReflectionArgumentPodUniform |
| ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER |
| : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; |
| } |
| break; |
| } |
| case NonSemanticClspvReflectionArgumentPointerUniform: |
| case NonSemanticClspvReflectionArgumentPointerPushConstant: |
| default: |
| { |
| UNIMPLEMENTED(); |
| break; |
| } |
| } |
| } |
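| // All POD arguments are packed into a single backing buffer at one binding, so a |
| // single descriptor write covers all of them. |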
| if (podBufferPresent) |
| { |
| cl::MemoryPtr clMem = kernelVk.getPodBuffer(); |
| ASSERT(clMem != nullptr); |
| CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>(); |
| |
| VkDescriptorBufferInfo &bufferInfo = kernelArgDescSetBuilder.allocDescriptorBufferInfo(); |
| bufferInfo.range = clMem->getSize(); |
| bufferInfo.offset = clMem->getOffset(); |
| bufferInfo.buffer = vkMem.getBuffer().getBuffer().getHandle(); |
| |
| ANGLE_TRY(addMemoryDependencies(clMem.get())); |
| |
| VkWriteDescriptorSet &writeDescriptorSet = |
| kernelArgDescSetBuilder.allocWriteDescriptorSet(); |
| writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; |
| writeDescriptorSet.pNext = nullptr; |
| writeDescriptorSet.dstSet = kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments); |
| writeDescriptorSet.dstBinding = podBinding; |
| writeDescriptorSet.dstArrayElement = 0; |
| writeDescriptorSet.descriptorCount = 1; |
| writeDescriptorSet.descriptorType = podDescriptorType; |
| writeDescriptorSet.pImageInfo = nullptr; |
| writeDescriptorSet.pBufferInfo = &bufferInfo; |
| } |
| |
| // Process the printf storage buffer |
| if (kernelVk.usesPrintf()) |
| { |
| UpdateDescriptorSetsBuilder &printfDescSetBuilder = |
| updateDescriptorSetsBuilders[DescriptorSetIndex::Printf]; |
| |
| cl::MemoryPtr clMem = getOrCreatePrintfBuffer(); |
| CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>(); |
| uint8_t *mapPointer = nullptr; |
| ANGLE_TRY(vkMem.map(mapPointer, 0)); |
| // The spec calls out *The first 4 bytes of the buffer should be zero-initialized.* |
| memset(mapPointer, 0, 4); |
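| // clspv uses these four bytes as the running byte count of printf output, so |
| // zeroing them effectively resets the buffer. |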
| |
| auto &bufferInfo = printfDescSetBuilder.allocDescriptorBufferInfo(); |
| bufferInfo.range = clMem->getSize(); |
| bufferInfo.offset = clMem->getOffset(); |
| bufferInfo.buffer = vkMem.getBuffer().getBuffer().getHandle(); |
| |
| auto &writeDescriptorSet = printfDescSetBuilder.allocWriteDescriptorSet(); |
| writeDescriptorSet.descriptorCount = 1; |
| writeDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; |
| writeDescriptorSet.pBufferInfo = &bufferInfo; |
| writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; |
| writeDescriptorSet.dstSet = kernelVk.getDescriptorSet(DescriptorSetIndex::Printf); |
| writeDescriptorSet.dstBinding = kernelVk.getProgram() |
| ->getDeviceProgramData(kernelVk.getKernelName().c_str()) |
| ->reflectionData.printfBufferStorage.binding; |
| |
| mNeedPrintfHandling = true; |
| mPrintfInfos = kernelVk.getProgram()->getPrintfDescriptors(kernelVk.getKernelName()); |
| } |
| |
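| // Flush all pending descriptor-set updates and bind each allocated set. The running |
| // descriptorSetIndex must mirror the compacted order used during allocation above. |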
| angle::EnumIterator<DescriptorSetIndex> descriptorSetIndex(DescriptorSetIndex::LiteralSampler); |
| for (DescriptorSetIndex index : angle::AllEnums<DescriptorSetIndex>()) |
| { |
| if (!kernelVk.getDescriptorSetLayoutDesc(index).empty()) |
| { |
| mContext->getPerfCounters().writeDescriptorSets = |
| updateDescriptorSetsBuilders[index].flushDescriptorSetUpdates( |
| mContext->getRenderer()->getDevice()); |
| |
| VkDescriptorSet descriptorSet = kernelVk.getDescriptorSet(index); |
| mComputePassCommands->getCommandBuffer().bindDescriptorSets( |
| kernelVk.getPipelineLayout(), VK_PIPELINE_BIND_POINT_COMPUTE, *descriptorSetIndex, |
| 1, &descriptorSet, 0, nullptr); |
| |
| ++descriptorSetIndex; |
| } |
| } |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::processGlobalPushConstants(CLKernelVk &kernelVk, |
| const cl::NDRange &ndrange) |
| { |
| const CLProgramVk::DeviceProgramData *devProgramData = |
| kernelVk.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative()); |
| ASSERT(devProgramData != nullptr); |
| |
| const VkPushConstantRange *globalOffsetRange = devProgramData->getGlobalOffsetRange(); |
| if (globalOffsetRange != nullptr) |
| { |
| mComputePassCommands->getCommandBuffer().pushConstants( |
| kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, globalOffsetRange->offset, |
| globalOffsetRange->size, ndrange.globalWorkOffset.data()); |
| } |
| |
| const VkPushConstantRange *globalSizeRange = devProgramData->getGlobalSizeRange(); |
| if (globalSizeRange != nullptr) |
| { |
| mComputePassCommands->getCommandBuffer().pushConstants( |
| kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, globalSizeRange->offset, |
| globalSizeRange->size, ndrange.globalWorkSize.data()); |
| } |
| |
| const VkPushConstantRange *enqueuedLocalSizeRange = devProgramData->getEnqueuedLocalSizeRange(); |
| if (enqueuedLocalSizeRange != nullptr) |
| { |
| mComputePassCommands->getCommandBuffer().pushConstants( |
| kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, |
| enqueuedLocalSizeRange->offset, enqueuedLocalSizeRange->size, |
| ndrange.localWorkSize.data()); |
| } |
| |
| const VkPushConstantRange *numWorkgroupsRange = devProgramData->getNumWorkgroupsRange(); |
| if (numWorkgroupsRange != nullptr) |
| { |
| // We support non-uniform workgroups, thus take the ceil of the quotient |
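| // For example, globalWorkSize {100, 1, 1} with localWorkSize {32, 1, 1} yields |
| // {4, 1, 1} workgroups; the last workgroup covers the remaining 4 invocations. |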
| uint32_t numWorkgroups[3] = { |
| UnsignedCeilDivide(ndrange.globalWorkSize[0], ndrange.localWorkSize[0]), |
| UnsignedCeilDivide(ndrange.globalWorkSize[1], ndrange.localWorkSize[1]), |
| UnsignedCeilDivide(ndrange.globalWorkSize[2], ndrange.localWorkSize[2])}; |
| mComputePassCommands->getCommandBuffer().pushConstants( |
| kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, numWorkgroupsRange->offset, |
| numWorkgroupsRange->size, &numWorkgroups); |
| } |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::flushComputePassCommands() |
| { |
| if (mComputePassCommands->empty()) |
| { |
| return angle::Result::Continue; |
| } |
| |
| // Flush writes to host-visible buffers by adding the appropriate barrier |
| if (mComputePassCommands->getAndResetHasHostVisibleBufferWrite()) |
| { |
| // Make sure all writes to host-visible buffers are flushed. |
| VkMemoryBarrier memoryBarrier = {}; |
| memoryBarrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; |
| memoryBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT; |
| memoryBarrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT; |
| |
| mComputePassCommands->getCommandBuffer().memoryBarrier( |
| VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, |
| VK_PIPELINE_STAGE_HOST_BIT, memoryBarrier); |
| } |
| |
| // Record the queue serial being flushed; the command buffer is reset with a new serial |
| // after the flush |
| mLastFlushedQueueSerial = mComputePassCommands->getQueueSerial(); |
| // Flush our compute commands into the renderer's primary command buffer |
| ANGLE_TRY(mContext->getRenderer()->flushOutsideRPCommands( |
| mContext, getProtectionType(), egl::ContextPriority::Medium, &mComputePassCommands)); |
| |
| mContext->getPerfCounters().flushedOutsideRenderPassCommandBuffers++; |
| |
| // Generate new serial for next batch of cmds |
| mComputePassCommands->setQueueSerial( |
| mQueueSerialIndex, mContext->getRenderer()->generateQueueSerial(mQueueSerialIndex)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::processWaitlist(const cl::EventPtrs &waitEvents) |
| { |
| if (!waitEvents.empty()) |
| { |
| bool insertedBarrier = false; |
| for (const cl::EventPtr &event : waitEvents) |
| { |
| if (event->getImpl<CLEventVk>().isUserEvent() || |
| event->getCommandQueue() != &mCommandQueue) |
| { |
| // We cannot use a barrier in these cases, so defer the event handling until |
| // submission time |
| // TODO: Perhaps we could utilize VkEvents here instead and have GPU wait(s) |
| // https://anglebug.com/42267109 |
| mExternalEvents.push_back(event); |
| } |
| else if (event->getCommandQueue() == &mCommandQueue && !insertedBarrier) |
| { |
| // As long as there is at least one dependent command in the same queue, a |
| // single execution barrier suffices |
| ANGLE_TRY(insertBarrier()); |
| |
| insertedBarrier = true; |
| } |
| } |
| } |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::submitCommands() |
| { |
| ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::submitCommands()"); |
| |
| ASSERT(hasCommandsPendingSubmission()); |
| |
| // Kick off renderer submit |
| ANGLE_TRY(mContext->getRenderer()->submitCommands(mContext, getProtectionType(), |
| egl::ContextPriority::Medium, nullptr, |
| nullptr, {}, mLastFlushedQueueSerial)); |
| |
| mLastSubmittedQueueSerial = mLastFlushedQueueSerial; |
| |
| // Now that commands have been submitted, some of the pending garbage may no longer be |
| // pending and should be moved to the garbage list. |
| mContext->getRenderer()->cleanupPendingSubmissionGarbage(); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::createEvent(CLEventImpl::CreateFunc *createFunc, |
| cl::ExecutionStatus initialStatus) |
| { |
| if (createFunc != nullptr) |
| { |
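| // Capture the queue serial of the command batch currently being recorded; the |
| // event's status transitions are then driven by that serial's flush, submit, and |
| // finish milestones. |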
| *createFunc = [initialStatus, queueSerial = mComputePassCommands->getQueueSerial()]( |
| const cl::Event &event) { |
| auto eventVk = new (std::nothrow) CLEventVk(event, initialStatus, queueSerial); |
| if (eventVk == nullptr) |
| { |
| ERR() << "Failed to create cmd event obj!"; |
| return CLEventImpl::Ptr(nullptr); |
| } |
| return CLEventImpl::Ptr(eventVk); |
| }; |
| } |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::submitEmptyCommand() |
| { |
| // This is called as part of resetting the command buffer, which must be empty at this |
| // point. |
| ASSERT(mComputePassCommands->empty()); |
| |
| // There is nothing to flush; mark the current serial as flushed and submit so that |
| // the queue serial still gets signaled |
| mLastFlushedQueueSerial = mComputePassCommands->getQueueSerial(); |
| ANGLE_TRY(submitCommands()); |
| ANGLE_TRY(finishQueueSerialInternal(mLastSubmittedQueueSerial)); |
| |
| // Generate a new queue serial for the next command batch |
| mComputePassCommands->setQueueSerial( |
| mQueueSerialIndex, mContext->getRenderer()->generateQueueSerial(mQueueSerialIndex)); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::resetCommandBufferWithError(cl_int errorCode) |
| { |
| // An error occurred, so reset the command buffer and report the error back to all |
| // associated events |
| ASSERT(errorCode != CL_SUCCESS); |
| |
| QueueSerial currentSerial = mComputePassCommands->getQueueSerial(); |
| mComputePassCommands->getCommandBuffer().reset(); |
| |
| for (cl::EventPtr event : mCommandsStateMap[currentSerial].events) |
| { |
| CLEventVk *eventVk = &event->getImpl<CLEventVk>(); |
| if (!eventVk->isUserEvent()) |
| { |
| ANGLE_TRY( |
| eventVk->setStatusAndExecuteCallback(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)); |
| } |
| } |
| mCommandsStateMap.erase(currentSerial); |
| mExternalEvents.clear(); |
| |
| // The command buffer has been reset, so its associated queue serial would never get |
| // signaled, leading to causality issues. Submit an empty command to keep the queue |
| // serial timeline intact. |
| ANGLE_TRY(submitEmptyCommand()); |
| |
| ANGLE_CL_RETURN_ERROR(errorCode); |
| } |
| |
| angle::Result CLCommandQueueVk::finishQueueSerialInternal(const QueueSerial queueSerial) |
| { |
| // Queue serial must belong to this queue and work must have been submitted. |
| ASSERT(queueSerial.getIndex() == mQueueSerialIndex); |
| ASSERT(mContext->getRenderer()->hasQueueSerialSubmitted(queueSerial)); |
| |
| ANGLE_TRY(mContext->getRenderer()->finishQueueSerial(mContext, queueSerial)); |
| |
| // Ensure memory objects are synced back to host CPU |
| ANGLE_TRY(syncHostBuffers(mCommandsStateMap[queueSerial].hostTransferList)); |
| |
| if (mNeedPrintfHandling) |
| { |
| ANGLE_TRY(processPrintfBuffer()); |
| mNeedPrintfHandling = false; |
| } |
| |
| // Events associated with this queue serial are now ready to be marked complete |
| ANGLE_TRY(SetEventsWithQueueSerialToState(mCommandsStateMap[queueSerial].events, queueSerial, |
| cl::ExecutionStatus::Complete)); |
| |
| mExternalEvents.clear(); |
| mCommandsStateMap.erase(queueSerial); |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::finishQueueSerial(const QueueSerial queueSerial) |
| { |
| ASSERT(queueSerial.getIndex() == getQueueSerialIndex()); |
| ASSERT(mContext->getRenderer()->hasQueueSerialSubmitted(queueSerial)); |
| |
| ANGLE_TRY(mContext->getRenderer()->finishQueueSerial(mContext, queueSerial)); |
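| // The wait above happens before taking the lock so other threads are not blocked |
| // while the GPU work completes; the repeated wait inside finishQueueSerialInternal |
| // then returns immediately. |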
| |
| std::lock_guard<std::mutex> sl(mCommandQueueMutex); |
| |
| return finishQueueSerialInternal(queueSerial); |
| } |
| |
| angle::Result CLCommandQueueVk::flushInternal() |
| { |
| if (!mComputePassCommands->empty()) |
| { |
| // If we still have dependent events, handle them now |
| if (!mExternalEvents.empty()) |
| { |
| for (const auto &depEvent : mExternalEvents) |
| { |
| if (depEvent->getImpl<CLEventVk>().isUserEvent()) |
| { |
| // Wait here for the user to set the event status |
| cl_int status = CL_QUEUED; |
| ANGLE_TRY(depEvent->getImpl<CLEventVk>().waitForUserEventStatus()); |
| ANGLE_TRY(depEvent->getImpl<CLEventVk>().getCommandExecutionStatus(status)); |
| if (status < 0) |
| { |
| ERR() << "Invalid dependant user-event (" << depEvent.get() |
| << ") status encountered!"; |
| ANGLE_TRY(resetCommandBufferWithError( |
| CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)); |
| } |
| } |
| else |
| { |
| // Otherwise, submit/finish the dependent events' queues that are not |
| // associated with this queue |
| ANGLE_TRY(depEvent->getCommandQueue()->finish()); |
| } |
| } |
| mExternalEvents.clear(); |
| } |
| |
| ANGLE_TRY(flushComputePassCommands()); |
| CommandsState commandsState = mCommandsStateMap[mLastFlushedQueueSerial]; |
| ANGLE_TRY(SetEventsWithQueueSerialToState(commandsState.events, mLastFlushedQueueSerial, |
| cl::ExecutionStatus::Submitted)); |
| |
| ANGLE_TRY(submitCommands()); |
| ASSERT(!hasCommandsPendingSubmission()); |
| ANGLE_TRY(SetEventsWithQueueSerialToState(commandsState.events, mLastSubmittedQueueSerial, |
| cl::ExecutionStatus::Running)); |
| } |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::finishInternal() |
| { |
| ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::finish"); |
| ANGLE_TRY(flushInternal()); |
| |
| return finishQueueSerialInternal(mLastSubmittedQueueSerial); |
| } |
| |
| // Helper function to insert appropriate memory barriers before accessing the resources in the |
| // command buffer. |
| angle::Result CLCommandQueueVk::onResourceAccess(const vk::CommandBufferAccess &access) |
| { |
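| // Hazard handling: rather than inserting barriers, a flush (which submits the current |
| // command buffer) serves as the synchronization point. Reads flush only after a prior |
| // write; writes flush after any prior use of the buffer. |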
| // Buffers |
| for (const vk::CommandBufferBufferAccess &bufferAccess : access.getReadBuffers()) |
| { |
| if (mComputePassCommands->usesBufferForWrite(*bufferAccess.buffer)) |
| { |
| // Read access only needs a new command buffer if the buffer was previously written |
| ANGLE_TRY(flushInternal()); |
| } |
| |
| mComputePassCommands->bufferRead(mContext, bufferAccess.accessType, bufferAccess.stage, |
| bufferAccess.buffer); |
| } |
| |
| for (const vk::CommandBufferBufferAccess &bufferAccess : access.getWriteBuffers()) |
| { |
| if (mComputePassCommands->usesBuffer(*bufferAccess.buffer)) |
| { |
| // Write access needs a new command buffer if the buffer was previously used at all |
| ANGLE_TRY(flushInternal()); |
| } |
| |
| mComputePassCommands->bufferWrite(mContext, bufferAccess.accessType, bufferAccess.stage, |
| bufferAccess.buffer); |
| // Note: all CL buffers are currently host-visible, so no additional handling is |
| // needed here for host-visible writes. |
| } |
| |
| for (const vk::CommandBufferBufferExternalAcquireRelease &bufferAcquireRelease : |
| access.getExternalAcquireReleaseBuffers()) |
| { |
| mComputePassCommands->retainResourceForWrite(bufferAcquireRelease.buffer); |
| } |
| |
| for (const vk::CommandBufferResourceAccess &resourceAccess : access.getAccessResources()) |
| { |
| mComputePassCommands->retainResource(resourceAccess.resource); |
| } |
| |
| return angle::Result::Continue; |
| } |
| |
| angle::Result CLCommandQueueVk::processPrintfBuffer() |
| { |
| ASSERT(mPrintfBuffer); |
| ASSERT(mNeedPrintfHandling); |
| ASSERT(mPrintfInfos); |
| |
| cl::MemoryPtr clMem = getOrCreatePrintfBuffer(); |
| CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>(); |
| |
| uint8_t *data = nullptr; |
| ANGLE_TRY(vkMem.map(data, 0)); |
| ANGLE_TRY(ClspvProcessPrintfBuffer(data, vkMem.getSize(), mPrintfInfos)); |
| vkMem.unmap(); |
| |
| return angle::Result::Continue; |
| } |
| |
| // A single CL buffer of size kPrintfBufferSize is set up for each command queue. This |
| // can be expanded later if more storage is needed. |
| cl::MemoryPtr CLCommandQueueVk::getOrCreatePrintfBuffer() |
| { |
| if (!mPrintfBuffer) |
| { |
| mPrintfBuffer = cl::Buffer::Cast(mContext->getFrontendObject().createBuffer( |
| nullptr, cl::MemFlags(CL_MEM_READ_WRITE), kPrintfBufferSize, nullptr)); |
| } |
| return cl::MemoryPtr(mPrintfBuffer); |
| } |
| |
| bool CLCommandQueueVk::hasUserEventDependency() const |
| { |
| return std::any_of(mExternalEvents.begin(), mExternalEvents.end(), |
| [](const cl::EventPtr &event) { return event->isUserEvent(); }); |
| } |
| |
| void CLCommandQueueVk::addEventReference(CLEventVk &eventVk) |
| { |
| ASSERT(eventVk.getQueueSerial().valid()); |
| ASSERT(eventVk.getQueueSerial().getIndex() == mQueueSerialIndex); |
| |
| std::lock_guard<std::mutex> lock(mCommandQueueMutex); |
| |
| mCommandsStateMap[eventVk.getQueueSerial()].events.emplace_back(&eventVk.getFrontendObject()); |
| } |
| |
| } // namespace rx |