| .\" Copyright (C) 2019 Jens Axboe <axboe@kernel.dk> |
| .\" Copyright (C) 2019 Jon Corbet <corbet@lwn.net> |
| .\" Copyright (C) 2019 Red Hat, Inc. |
| .\" |
| .\" SPDX-License-Identifier: LGPL-2.0-or-later |
| .\" |
| .TH IO_URING_SETUP 2 2019-01-29 "Linux" "Linux Programmer's Manual" |
| .SH NAME |
| io_uring_setup \- setup a context for performing asynchronous I/O |
| .SH SYNOPSIS |
| .nf |
| .BR "#include <linux/io_uring.h>" |
| .PP |
| .BI "int io_uring_setup(u32 " entries ", struct io_uring_params *" params ); |
| .fi |
| .PP |
| .SH DESCRIPTION |
| .PP |
| The io_uring_setup() system call sets up a submission queue (SQ) and |
| completion queue (CQ) with at least |
| .I entries |
| entries, and returns a file descriptor which can be used to perform |
| subsequent operations on the io_uring instance. The submission and |
| completion queues are shared between userspace and the kernel, which |
| eliminates the need to copy data when initiating and completing I/O. |
| |
| .I params |
| is used by the application to pass options to the kernel, and by the |
| kernel to convey information about the ring buffers. |
| .PP |
| .in +4n |
| .EX |
| struct io_uring_params { |
| __u32 sq_entries; |
| __u32 cq_entries; |
| __u32 flags; |
| __u32 sq_thread_cpu; |
| __u32 sq_thread_idle; |
| __u32 features; |
| __u32 resv[4]; |
| struct io_sqring_offsets sq_off; |
| struct io_cqring_offsets cq_off; |
| }; |
| .EE |
| .in |
| .PP |
| The |
| .IR flags , |
| .IR sq_thread_cpu , |
| and |
| .I sq_thread_idle |
| fields are used to configure the io_uring instance. |
| .I flags |
| is a bit mask of 0 or more of the following values ORed |
| together: |
| .TP |
| .B IORING_SETUP_IOPOLL |
| Perform busy-waiting for an I/O completion, as opposed to getting |
| notifications via an asynchronous IRQ (Interrupt Request). The file |
| system (if any) and block device must support polling in order for |
| this to work. Busy-waiting provides lower latency, but may consume |
| more CPU resources than interrupt driven I/O. Currently, this feature |
| is usable only on a file descriptor opened using the |
| .B O_DIRECT |
| flag. When a read or write is submitted to a polled context, the |
| application must poll for completions on the CQ ring by calling |
| .BR io_uring_enter (2). |
| It is illegal to mix and match polled and non-polled I/O on an io_uring |
| instance. |
| |
| .TP |
| .B IORING_SETUP_SQPOLL |
| When this flag is specified, a kernel thread is created to perform |
| submission queue polling. An io_uring instance configured in this way |
| enables an application to issue I/O without ever context switching |
| into the kernel. By using the submission queue to fill in new |
| submission queue entries and watching for completions on the |
| completion queue, the application can submit and reap I/Os without |
| doing a single system call. |
| |
| If the kernel thread is idle for more than |
| .I sq_thread_idle |
| milliseconds, it will set the |
| .B IORING_SQ_NEED_WAKEUP |
| bit in the |
| .I flags |
| field of the |
| .IR "struct io_sq_ring" . |
| When this happens, the application must call |
| .BR io_uring_enter (2) |
| to wake the kernel thread. If I/O is kept busy, the kernel thread |
| will never sleep. An application making use of this feature will need |
| to guard the |
| .BR io_uring_enter (2) |
| call with the following code sequence: |
| |
| .in +4n |
| .EX |
| /* |
| * Ensure that the wakeup flag is read after the tail pointer |
| * has been written. It's important to use memory load acquire |
| * semantics for the flags read, as otherwise the application |
| * and the kernel might not agree on the consistency of the |
| * wakeup flag. |
| */ |
| unsigned flags = atomic_load_acquire(sq_ring->flags); |
| if (flags & IORING_SQ_NEED_WAKEUP) |
| io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP); |
| .EE |
| .in |
| |
| where |
| .I sq_ring |
| is a submission queue ring setup using the |
| .I struct io_sqring_offsets |
| described below. |
| .IP |
| Before version 5.11 of the Linux kernel, to successfully use this feature, the |
| application must register a set of files to be used for IO through |
| .BR io_uring_register (2) |
| using the |
| .B IORING_REGISTER_FILES |
| opcode. Failure to do so will result in submitted IO being errored with |
| .BR EBADF . |
| The presence of this feature can be detected by the |
| .B IORING_FEAT_SQPOLL_NONFIXED |
| feature flag. |
| In version 5.11 and later, it is no longer necessary to register files to use |
| this feature. 5.11 also allows using this as non-root, if the user has the |
| .B CAP_SYS_NICE |
| capability. |
| .TP |
| .B IORING_SETUP_SQ_AFF |
| If this flag is specified, then the poll thread will be bound to the |
| cpu set in the |
| .I sq_thread_cpu |
| field of the |
| .IR "struct io_uring_params" . |
| This flag is only meaningful when |
| .B IORING_SETUP_SQPOLL |
| is specified. When cgroup setting |
| .I cpuset.cpus |
| changes (typically in container environment), the bounded cpu set may be |
| changed as well. |
| .TP |
| .B IORING_SETUP_CQSIZE |
| Create the completion queue with |
| .IR "struct io_uring_params.cq_entries" |
| entries. The value must be greater than |
| .IR entries , |
| and may be rounded up to the next power-of-two. |
| .TP |
| .B IORING_SETUP_CLAMP |
| If this flag is specified, and if |
| .IR entries |
| exceeds |
| .B IORING_MAX_ENTRIES , |
| then |
| .IR entries |
| will be clamped at |
| .B IORING_MAX_ENTRIES . |
| If the flag |
| .BR IORING_SETUP_SQPOLL |
| is set, and if the value of |
| .IR "struct io_uring_params.cq_entries" |
| exceeds |
| .B IORING_MAX_CQ_ENTRIES , |
| then it will be clamped at |
| .B IORING_MAX_CQ_ENTRIES . |
| .TP |
| .B IORING_SETUP_ATTACH_WQ |
| This flag should be set in conjunction with |
| .IR "struct io_uring_params.wq_fd" |
| being set to an existing io_uring ring file descriptor. When set, the |
| io_uring instance being created will share the asynchronous worker |
| thread backend of the specified io_uring ring, rather than create a new |
| separate thread pool. |
| .TP |
| .B IORING_SETUP_R_DISABLED |
| If this flag is specified, the io_uring ring starts in a disabled state. |
| In this state, restrictions can be registered, but submissions are not allowed. |
| See |
| .BR io_uring_register (2) |
| for details on how to enable the ring. Available since 5.10. |
| .PP |
| If no flags are specified, the io_uring instance is setup for |
| interrupt driven I/O. I/O may be submitted using |
| .BR io_uring_enter (2) |
| and can be reaped by polling the completion queue. |
| |
| The |
| .I resv |
| array must be initialized to zero. |
| |
| .I features |
| is filled in by the kernel, which specifies various features supported |
| by current kernel version. |
| .TP |
| .B IORING_FEAT_SINGLE_MMAP |
| If this flag is set, the two SQ and CQ rings can be mapped with a single |
| .BR mmap (2) |
| call. The SQEs must still be allocated separately. This brings the necessary |
| .BR mmap (2) |
| calls down from three to two. |
| .TP |
| .B IORING_FEAT_NODROP |
| If this flag is set, io_uring supports never dropping completion events. |
| If a completion event occurs and the CQ ring is full, the kernel stores |
| the event internally until such a time that the CQ ring has room for more |
| entries. If this overflow condition is entered, attempting to submit more |
| IO will fail with the |
| .B -EBUSY |
| error value, if it can't flush the overflown events to the CQ ring. If this |
| happens, the application must reap events from the CQ ring and attempt the |
| submit again. |
| .TP |
| .B IORING_FEAT_SUBMIT_STABLE |
| If this flag is set, applications can be certain that any data for |
| async offload has been consumed when the kernel has consumed the SQE. |
| .TP |
| .B IORING_FEAT_RW_CUR_POS |
| If this flag is set, applications can specify |
| .I offset |
| == -1 with |
| .BR IORING_OP_{READV,WRITEV} , |
| .BR IORING_OP_{READ,WRITE}_FIXED , |
| and |
| .B IORING_OP_{READ,WRITE} |
| to mean current file position, which behaves like |
| .BR preadv2 (2) |
| and |
| .BR pwritev2 (2) |
| with |
| .I offset |
| == -1. It'll use (and update) the current file position. This obviously comes |
| with the caveat that if the application has multiple reads or writes in flight, |
| then the end result will not be as expected. This is similar to threads sharing |
| a file descriptor and doing IO using the current file position. |
| .TP |
| .B IORING_FEAT_CUR_PERSONALITY |
| If this flag is set, then io_uring guarantees that both sync and async |
| execution of a request assumes the credentials of the task that called |
| .BR io_uring_enter (2) |
| to queue the requests. If this flag isn't set, then requests are issued with |
| the credentials of the task that originally registered the io_uring. If only |
| one task is using a ring, then this flag doesn't matter as the credentials |
| will always be the same. Note that this is the default behavior, tasks can |
| still register different personalities through |
| .BR io_uring_register (2) |
| with |
| .B IORING_REGISTER_PERSONALITY |
| and specify the personality to use in the sqe. |
| .TP |
| .B IORING_FEAT_FAST_POLL |
| If this flag is set, then io_uring supports using an internal poll mechanism |
| to drive data/space readiness. This means that requests that cannot read or |
| write data to a file no longer need to be punted to an async thread for |
| handling, instead they will begin operation when the file is ready. This is |
| similar to doing poll + read/write in userspace, but eliminates the need to do |
| so. If this flag is set, requests waiting on space/data consume a lot less |
| resources doing so as they are not blocking a thread. |
| .TP |
| .B IORING_FEAT_POLL_32BITS |
| If this flag is set, the |
| .B IORING_OP_POLL_ADD |
| command accepts the full 32-bit range of epoll based flags. Most notably |
| .B EPOLLEXCLUSIVE |
| which allows exclusive (waking single waiters) behavior. |
| .TP |
| .B IORING_FEAT_SQPOLL_NONFIXED |
| If this flag is set, the |
| .B IORING_SETUP_SQPOLL |
| feature no longer requires the use of fixed files. Any normal file descriptor |
| can be used for IO commands without needing registration. |
| |
| .PP |
| The rest of the fields in the |
| .I struct io_uring_params |
| are filled in by the kernel, and provide the information necessary to |
| memory map the submission queue, completion queue, and the array of |
| submission queue entries. |
| .I sq_entries |
| specifies the number of submission queue entries allocated. |
| .I sq_off |
| describes the offsets of various ring buffer fields: |
| .PP |
| .in +4n |
| .EX |
| struct io_sqring_offsets { |
| __u32 head; |
| __u32 tail; |
| __u32 ring_mask; |
| __u32 ring_entries; |
| __u32 flags; |
| __u32 dropped; |
| __u32 array; |
| __u32 resv[3]; |
| }; |
| .EE |
| .in |
| .PP |
| Taken together, |
| .I sq_entries |
| and |
| .I sq_off |
| provide all of the information necessary for accessing the submission |
| queue ring buffer and the submission queue entry array. The |
| submission queue can be mapped with a call like: |
| .PP |
| .in +4n |
| .EX |
| ptr = mmap(0, sq_off.array + sq_entries * sizeof(__u32), |
| PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, |
| ring_fd, IORING_OFF_SQ_RING); |
| .EE |
| .in |
| .PP |
| where |
| .I sq_off |
| is the |
| .I io_sqring_offsets |
| structure, and |
| .I ring_fd |
| is the file descriptor returned from |
| .BR io_uring_setup (2). |
| The addition of |
| .I sq_off.array |
| to the length of the region accounts for the fact that the ring is |
| located at the end of the data structure. As an example, the ring |
| buffer head pointer can be accessed by adding |
| .I sq_off.head |
| to the address returned from |
| .BR mmap (2): |
| .PP |
| .in +4n |
| .EX |
| head = ptr + sq_off.head; |
| .EE |
| .in |
| |
| The |
| .I flags |
| field is used by the kernel to communicate state information to the |
| application. Currently, it is used to inform the application when a |
| call to |
| .BR io_uring_enter (2) |
| is necessary. See the documentation for the |
| .B IORING_SETUP_SQPOLL |
| flag above. |
| The |
| .I dropped |
| member is incremented for each invalid submission queue entry |
| encountered in the ring buffer. |
| |
| The head and tail track the ring buffer state. The tail is |
| incremented by the application when submitting new I/O, and the head |
| is incremented by the kernel when the I/O has been successfully |
| submitted. Determining the index of the head or tail into the ring is |
| accomplished by applying a mask: |
| .PP |
| .in +4n |
| .EX |
| index = tail & ring_mask; |
| .EE |
| .in |
| .PP |
| The array of submission queue entries is mapped with: |
| .PP |
| .in +4n |
| .EX |
| sqentries = mmap(0, sq_entries * sizeof(struct io_uring_sqe), |
| PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, |
| ring_fd, IORING_OFF_SQES); |
| .EE |
| .in |
| .PP |
| The completion queue is described by |
| .I cq_entries |
| and |
| .I cq_off |
| shown here: |
| .PP |
| .in +4n |
| .EX |
| struct io_cqring_offsets { |
| __u32 head; |
| __u32 tail; |
| __u32 ring_mask; |
| __u32 ring_entries; |
| __u32 overflow; |
| __u32 cqes; |
| __u32 flags; |
| __u32 resv[3]; |
| }; |
| .EE |
| .in |
| .PP |
| The completion queue is simpler, since the entries are not separated |
| from the queue itself, and can be mapped with: |
| .PP |
| .in +4n |
| .EX |
| ptr = mmap(0, cq_off.cqes + cq_entries * sizeof(struct io_uring_cqe), |
| PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, ring_fd, |
| IORING_OFF_CQ_RING); |
| .EE |
| .in |
| .PP |
| Closing the file descriptor returned by |
| .BR io_uring_setup (2) |
| will free all resources associated with the io_uring context. |
| .PP |
| .SH RETURN VALUE |
| .BR io_uring_setup (2) |
| returns a new file descriptor on success. The application may then |
| provide the file descriptor in a subsequent |
| .BR mmap (2) |
| call to map the submission and completion queues, or to the |
| .BR io_uring_register (2) |
| or |
| .BR io_uring_enter (2) |
| system calls. |
| |
| On error, -1 is returned and |
| .I errno |
| is set appropriately. |
| .PP |
| .SH ERRORS |
| .TP |
| .B EFAULT |
| params is outside your accessible address space. |
| .TP |
| .B EINVAL |
| The resv array contains non-zero data, |
| .I params.flags |
| contains an unsupported flag, |
| .I entries |
| is out of bounds, |
| .B IORING_SETUP_SQ_AFF |
| was specified, but |
| .B IORING_SETUP_SQPOLL |
| was not, or |
| .B IORING_SETUP_CQSIZE |
| was specified, but |
| .I io_uring_params.cq_entries |
| was invalid. |
| .TP |
| .B EMFILE |
| The per-process limit on the number of open file descriptors has been |
| reached (see the description of |
| .B RLIMIT_NOFILE |
| in |
| .BR getrlimit (2)). |
| .TP |
| .B ENFILE |
| The system-wide limit on the total number of open files has been |
| reached. |
| .TP |
| .B ENOMEM |
| Insufficient kernel resources are available. |
| .TP |
| .B EPERM |
| .B IORING_SETUP_SQPOLL |
| was specified, but the effective user ID of the caller did not have sufficient |
| privileges. |
| .SH SEE ALSO |
| .BR io_uring_register (2), |
| .BR io_uring_enter (2) |