| .\" Copyright (C) 2019 Jens Axboe <[email protected]> |
| .\" Copyright (C) 2019 Red Hat, Inc. |
| .\" |
| .\" SPDX-License-Identifier: LGPL-2.0-or-later |
| .\" |
| .TH IO_URING_ENTER 2 2019-01-22 "Linux" "Linux Programmer's Manual" |
| .SH NAME |
| io_uring_enter \- initiate and/or complete asynchronous I/O |
| .SH SYNOPSIS |
| .nf |
| .BR "#include <linux/io_uring.h>" |
| .PP |
| .BI "int io_uring_enter(unsigned int " fd ", unsigned int " to_submit , |
| .BI " unsigned int " min_complete ", unsigned int " flags , |
| .BI " sigset_t *" sig ); |
| .fi |
| .PP |
| .SH DESCRIPTION |
| .PP |
| .BR io_uring_enter () |
| is used to initiate and complete I/O using the shared submission and |
| completion queues setup by a call to |
| .BR io_uring_setup (2). |
| A single call can both submit new I/O and wait for completions of I/O |
| initiated by this call or previous calls to |
| .BR io_uring_enter (). |
| |
| .I fd |
| is the file descriptor returned by |
| .BR io_uring_setup (2). |
| .I to_submit |
| specifies the number of I/Os to submit from the submission queue. |
| .I flags |
| is a bitmask of the following values: |
| .TP |
| .B IORING_ENTER_GETEVENTS |
| If this flag is set, then the system call will wait for the specified |
| number of events in |
| .I min_complete |
| before returning. This flag can be set along with |
| .I to_submit |
| to both submit and complete events in a single system call. |
| .TP |
| .B IORING_ENTER_SQ_WAKEUP |
| If the ring has been created with |
| .B IORING_SETUP_SQPOLL, |
| then this flag asks the kernel to wake up the SQ kernel thread to submit IO. |
| .TP |
| .B IORING_ENTER_SQ_WAIT |
| If the ring has been created with |
| .B IORING_SETUP_SQPOLL, |
| then the application has no real insight into when the SQ kernel thread has |
| consumed entries from the SQ ring. This can lead to a situation where the |
| application can no longer get a free SQE entry to submit, without knowing |
| when one becomes available as the SQ kernel thread consumes them. If |
| the system call is used with this flag set, then it will wait until at least |
| one entry is free in the SQ ring. |
| .PP |
| .PP |
| If the io_uring instance was configured for polling, by specifying |
| .B IORING_SETUP_IOPOLL |
| in the call to |
| .BR io_uring_setup (2), |
| then min_complete has a slightly different meaning. Passing a value |
| of 0 instructs the kernel to return any events which are already complete, |
| without blocking. If |
| .I min_complete |
| is a non-zero value, the kernel will still return immediately if any |
| completion events are available. If no event completions are |
| available, then the call will poll either until one or more |
| completions become available, or until the process has exceeded its |
| scheduler time slice. |
| |
| Note that, for interrupt driven I/O (where |
| .B IORING_SETUP_IOPOLL |
| was not specified in the call to |
| .BR io_uring_setup (2)), |
| an application may check the completion queue for event completions |
| without entering the kernel at all. |
| .PP |
| When the system call returns that a certain number of SQEs have been |
| consumed and submitted, it's safe to reuse SQE entries in the ring. This is |
| true even if the actual IO submission had to be punted to async context, |
| which means that the SQE may in fact not have been submitted yet. If the |
| kernel requires later use of a particular SQE entry, it will have made a |
| private copy of it. |
| |
| .I sig |
| is a pointer to a signal mask (see |
| .BR sigprocmask (2)); |
| if |
| .I sig |
| is not NULL, |
| .BR io_uring_enter () |
| first replaces the current signal mask by the one pointed to by |
| .IR sig , |
| then waits for events to become available in the completion queue, and |
| then restores the original signal mask. The following |
| .BR io_uring_enter () |
| call: |
| .PP |
| .in +4n |
| .EX |
| ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, &sig); |
| .EE |
| .in |
| .PP |
| is equivalent to |
| .I atomically |
| executing the following calls: |
| .PP |
| .in +4n |
| .EX |
| pthread_sigmask(SIG_SETMASK, &sig, &orig); |
| ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, NULL); |
| pthread_sigmask(SIG_SETMASK, &orig, NULL); |
| .EE |
| .in |
| .PP |
| See the description of |
| .BR pselect (2) |
| for an explanation of why the |
| .I sig |
| parameter is necessary. |
| |
| Submission queue entries are represented using the following data |
| structure: |
| .PP |
| .in +4n |
| .EX |
| /* |
| * IO submission data structure (Submission Queue Entry) |
| */ |
| struct io_uring_sqe { |
| __u8 opcode; /* type of operation for this sqe */ |
| __u8 flags; /* IOSQE_ flags */ |
| __u16 ioprio; /* ioprio for the request */ |
| __s32 fd; /* file descriptor to do IO on */ |
| union { |
| __u64 off; /* offset into file */ |
| __u64 addr2; |
| }; |
| union { |
| __u64 addr; /* pointer to buffer or iovecs */ |
| __u64 splice_off_in; |
| }; |
| __u32 len; /* buffer size or number of iovecs */ |
| union { |
| __kernel_rwf_t rw_flags; |
| __u32 fsync_flags; |
| __u16 poll_events; /* compatibility */ |
| __u32 poll32_events; /* word-reversed for BE */ |
| __u32 sync_range_flags; |
| __u32 msg_flags; |
| __u32 timeout_flags; |
| __u32 accept_flags; |
| __u32 cancel_flags; |
| __u32 open_flags; |
| __u32 statx_flags; |
| __u32 fadvise_advice; |
| __u32 splice_flags; |
| }; |
| __u64 user_data; /* data to be passed back at completion time */ |
| union { |
| struct { |
| /* index into fixed buffers, if used */ |
| union { |
| /* index into fixed buffers, if used */ |
| __u16 buf_index; |
| /* for grouped buffer selection */ |
| __u16 buf_group; |
| }; |
| /* personality to use, if used */ |
| __u16 personality; |
| __s32 splice_fd_in; |
| }; |
| __u64 __pad2[3]; |
| }; |
| }; |
| .EE |
| .in |
| .PP |
| The |
| .I opcode |
| describes the operation to be performed. It can be one of: |
| .TP |
| .B IORING_OP_NOP |
| Do not perform any I/O. This is useful for testing the performance of |
| the io_uring implementation itself. |
| .TP |
| .B IORING_OP_READV |
| .TP |
| .B IORING_OP_WRITEV |
| Vectored read and write operations, similar to |
| .BR preadv2 (2) |
| and |
| .BR pwritev2 (2). |
| If the file is not seekable, |
| .I off |
| must be set to zero. |
| |
| .TP |
| .B IORING_OP_READ_FIXED |
| .TP |
| .B IORING_OP_WRITE_FIXED |
| Read from or write to pre-mapped buffers. See |
| .BR io_uring_register (2) |
| for details on how to setup a context for fixed reads and writes. |
| |
| .TP |
| .B IORING_OP_FSYNC |
| File sync. See also |
| .BR fsync (2). |
| Note that, while I/O is initiated in the order in which it appears in |
| the submission queue, completions are unordered. For example, an |
| application which places a write I/O followed by an fsync in the |
| submission queue cannot expect the fsync to apply to the write. The |
| two operations execute in parallel, so the fsync may complete before |
| the write is issued to the storage. The same is also true for |
| previously issued writes that have not completed prior to the fsync. |
| |
| .TP |
| .B IORING_OP_POLL_ADD |
| Poll the |
| .I fd |
| specified in the submission queue entry for the events |
| specified in the |
| .I poll_events |
| field. Unlike poll or epoll without |
| .BR EPOLLONESHOT , |
| this interface always works in one shot mode. That is, once the poll |
| operation is completed, it will have to be resubmitted. This command works like |
| an async |
| .BR poll (2) |
| and the completion event result is the returned mask of events. |
| |
| .TP |
| .B IORING_OP_POLL_REMOVE |
| Remove an existing poll request. If found, the |
| .I res |
| field of the |
| .I "struct io_uring_cqe" |
| will contain 0. If not found, |
| .I res |
| will contain |
| .B -ENOENT. |
| |
| .TP |
| .B IORING_OP_EPOLL_CTL |
| Add, remove or modify entries in the interest list of |
| .BR epoll (7). |
| See |
| .BR epoll_ctl (2) |
| for details of the system call. |
| .I fd |
| holds the file descriptor that represents the epoll instance, |
| .I addr |
| holds the file descriptor to add, remove or modify, |
| .I len |
| holds the operation (EPOLL_CTL_ADD, EPOLL_CTL_DEL, EPOLL_CTL_MOD) to perform and, |
| .I off |
| holds a pointer to the |
| .I epoll_event |
| structure. Available since 5.6. |
| |
| .TP |
| .B IORING_OP_SYNC_FILE_RANGE |
| Issue the equivalent of a \fBsync_file_range\fR (2) on the file descriptor. The |
| .I fd |
| field is the file descriptor to sync, the |
| .I off |
| field holds the offset in bytes, the |
| .I len |
| field holds the length in bytes, and the |
| .I sync_range_flags |
| field holds the flags for the command. See also |
| .BR sync_file_range (2) |
| for the general description of the related system call. Available since 5.2. |
| |
| .TP |
| .B IORING_OP_SENDMSG |
| Issue the equivalent of a |
| .BR sendmsg(2) |
| system call. |
| .I fd |
| must be set to the socket file descriptor, |
| .I addr |
| must contain a pointer to the msghdr structure, and |
| .I msg_flags |
| holds the flags associated with the system call. See also |
| .BR sendmsg (2) |
| for the general description of the related system call. Available since 5.3. |
| |
| .TP |
| .B IORING_OP_RECVMSG |
| Works just like IORING_OP_SENDMSG, except for |
| .BR recvmsg(2) |
| instead. See the description of IORING_OP_SENDMSG. Available since 5.3. |
| |
| .TP |
| .B IORING_OP_SEND |
| Issue the equivalent of a |
| .BR send(2) |
| system call. |
| .I fd |
| must be set to the socket file descriptor, |
| .I addr |
| must contain a pointer to the buffer, |
| .I len |
| denotes the length of the buffer to send, and |
| .I msg_flags |
| holds the flags associated with the system call. See also |
| .BR send(2) |
| for the general description of the related system call. Available since 5.6. |
| |
| .TP |
| .B IORING_OP_RECV |
| Works just like IORING_OP_SEND, except for |
| .BR recv(2) |
| instead. See the description of IORING_OP_SEND. Available since 5.6. |
| |
| .TP |
| .B IORING_OP_TIMEOUT |
| This command will register a timeout operation. The |
| .I addr |
| field must contain a pointer to a struct timespec64 structure, |
| .I len |
| must contain 1 to signify one timespec64 structure, |
| .I timeout_flags |
| may contain IORING_TIMEOUT_ABS |
| for an absolute timeout value, or 0 for a relative timeout. |
| .I off |
| may contain a completion event count. A timeout |
| will trigger a wakeup event on the completion ring for anyone waiting for |
| events. A timeout condition is met when either the specified timeout expires, |
| or the specified number of events have completed. Either condition will |
| trigger the event. If set to 0, completed events are not counted, which |
| effectively acts like a timer. io_uring timeouts use the |
| .B CLOCK_MONOTONIC |
| clock source. The request will complete with |
| .I -ETIME |
| if the timeout got completed through expiration of the timer, or |
| .I 0 |
| if the timeout got completed through requests completing on their own. If |
| the timeout was cancelled before it expired, the request will complete with |
| .I -ECANCELED. |
| Available since 5.4. |
| |
| .TP |
| .B IORING_OP_TIMEOUT_REMOVE |
| If |
| .I timeout_flags |
| are zero, then it attempts to remove an existing timeout |
| operation. |
| .I addr |
| must contain the |
| .I user_data |
| field of the previously issued timeout operation. If the specified timeout |
| request is found and cancelled successfully, this request will terminate |
| with a result value of |
| .IR 0 . |
| If the timeout request was found but expiration was already in progress, |
| this request will terminate with a result value of |
| .IR -EBUSY . |
| If the timeout request wasn't found, the request will terminate with a result |
| value of |
| .IR -ENOENT . |
| Available since 5.5. |
| |
| If |
| .I timeout_flags |
| contain |
| .I IORING_TIMEOUT_UPDATE, |
| instead of removing an existing operation it updates it. |
| .I addr |
| and return values are same as before. |
| .I addr2 |
| field must contain a pointer to a struct timespec64 structure. |
| .I timeout_flags |
| may also contain IORING_TIMEOUT_ABS. |
| Available since 5.11. |
| |
| .TP |
| .B IORING_OP_ACCEPT |
| Issue the equivalent of an |
| .BR accept4(2) |
| system call. |
| .I fd |
| must be set to the socket file descriptor, |
| .I addr |
| must contain the pointer to the sockaddr structure, and |
| .I addr2 |
| must contain a pointer to the socklen_t addrlen field. See also |
| .BR accept4(2) |
| for the general description of the related system call. Available since 5.5. |
| |
| .TP |
| .B IORING_OP_ASYNC_CANCEL |
| Attempt to cancel an already issued request. |
| .I addr |
| must contain the |
| .I user_data |
| field of the request that should be cancelled. The cancellation request will |
| complete with one of the following result codes. If found, the |
| .I res |
| field of the cqe will contain 0. If not found, |
| .I res |
| will contain -ENOENT. If found and cancellation was attempted, the |
| .I res |
| field will contain -EALREADY. In this case, the request may or may not |
| terminate. In general, requests that are interruptible (like socket IO) will |
| get cancelled, while disk IO requests cannot be cancelled if already started. |
| Available since 5.5. |
| |
| .TP |
| .B IORING_OP_LINK_TIMEOUT |
| This request must be linked with another request through |
| .I IOSQE_IO_LINK |
| which is described below. Unlike |
| .I IORING_OP_TIMEOUT, |
| .I IORING_OP_LINK_TIMEOUT |
| acts on the linked request, not the completion queue. The format of the command |
| is otherwise like |
| .I IORING_OP_TIMEOUT, |
| except there's no completion event count as it's tied to a specific request. |
| If used, the timeout specified in the command will cancel the linked command, |
| unless the linked command completes before the timeout. The timeout will |
| complete with |
| .I -ETIME |
| if the timer expired and an attempt was made to cancel the linked request, or |
| .I -ECANCELED |
| if the timer got cancelled because of completion of the linked request. Like |
| .B IORING_OP_TIMEOUT |
| the clock source used is |
| .BR CLOCK_MONOTONIC . |
| Available since 5.5. |
| |
| |
| .TP |
| .B IORING_OP_CONNECT |
| Issue the equivalent of a |
| .BR connect(2) |
| system call. |
| .I fd |
| must be set to the socket file descriptor, |
| .I addr |
| must contain the const pointer to the sockaddr structure, and |
| .I off |
| must contain the socklen_t addrlen field. See also |
| .BR connect(2) |
| for the general description of the related system call. Available since 5.5. |
| |
| .TP |
| .B IORING_OP_FALLOCATE |
| Issue the equivalent of a |
| .BR fallocate(2) |
| system call. |
| .I fd |
| must be set to the file descriptor, |
| .I len |
| must contain the mode associated with the operation, |
| .I off |
| must contain the offset on which to operate, and |
| .I addr |
| must contain the length. See also |
| .BR fallocate(2) |
| for the general description of the related system call. Available since 5.6. |
| |
| .TP |
| .B IORING_OP_FADVISE |
| Issue the equivalent of a |
| .BR posix_fadvise(2) |
| system call. |
| .I fd |
| must be set to the file descriptor, |
| .I off |
| must contain the offset on which to operate, |
| .I len |
| must contain the length, and |
| .I fadvise_advice |
| must contain the advice associated with the operation. See also |
| .BR posix_fadvise(2) |
| for the general description of the related system call. Available since 5.6. |
| |
| .TP |
| .B IORING_OP_MADVISE |
| Issue the equivalent of a |
| .BR madvise(2) |
| system call. |
| .I addr |
| must contain the address to operate on, |
| .I len |
| must contain the length on which to operate, |
| and |
| .I fadvise_advice |
| must contain the advice associated with the operation. See also |
| .BR madvise(2) |
| for the general description of the related system call. Available since 5.6. |
| |
| .TP |
| .B IORING_OP_OPENAT |
| Issue the equivalent of a |
| .BR openat(2) |
| system call. |
| .I fd |
| is the |
| .I dirfd |
| argument, |
| .I addr |
| must contain a pointer to the |
| .I *pathname |
| argument, |
| .I open_flags |
| should contain any flags passed in, and |
| .I len |
| is the access mode of the file. See also |
| .BR openat(2) |
| for the general description of the related system call. Available since 5.6. |
| |
| .TP |
| .B IORING_OP_OPENAT2 |
| Issue the equivalent of a |
| .BR openat2(2) |
| system call. |
| .I fd |
| is the |
| .I dirfd |
| argument, |
| .I addr |
| must contain a pointer to the |
| .I *pathname |
| argument, |
| .I len |
| should contain the size of the open_how structure, and |
| .I off |
| should be set to the address of the open_how structure. See also |
| .BR openat2(2) |
| for the general description of the related system call. Available since 5.6. |
| |
| .TP |
| .B IORING_OP_CLOSE |
| Issue the equivalent of a |
| .BR close(2) |
| system call. |
| .I fd |
| is the file descriptor to be closed. See also |
| .BR close(2) |
| for the general description of the related system call. Available since 5.6. |
| |
| .TP |
| .B IORING_OP_STATX |
| Issue the equivalent of a |
| .BR statx(2) |
| system call. |
| .I fd |
| is the |
| .I dirfd |
| argument, |
| .I addr |
| must contain a pointer to the |
| .I *pathname |
| string, |
| .I statx_flags |
| is the |
| .I flags |
| argument, |
| .I len |
| should be the |
| .I mask |
| argument, and |
| .I off |
| must contain a pointer to the |
| .I statxbuf |
| to be filled in. See also |
| .BR statx(2) |
| for the general description of the related system call. Available since 5.6. |
| |
| .TP |
| .B IORING_OP_READ |
| .TP |
| .B IORING_OP_WRITE |
| Issue the equivalent of a |
| .BR pread(2) |
| or |
| .BR pwrite(2) |
| system call. |
| .I fd |
| is the file descriptor to be operated on, |
| .I addr |
| contains the buffer in question, |
| .I len |
| contains the length of the IO operation, and |
| .I off |
| contains the read or write offset. If |
| .I fd |
| does not refer to a seekable file, |
| .I off |
| must be set to zero. If |
| .I off |
| is set to -1, the offset will use (and advance) the file position, like the |
| .BR read(2) |
| and |
| .BR write(2) |
| system calls. These are non-vectored versions of the |
| .B IORING_OP_READV |
| and |
| .B IORING_OP_WRITEV |
| opcodes. See also |
| .BR read(2) |
| and |
| .BR write(2) |
| for the general description of the related system call. Available since 5.6. |
| |
| .TP |
| .B IORING_OP_SPLICE |
| Issue the equivalent of a |
| .BR splice(2) |
| system call. |
| .I splice_fd_in |
| is the file descriptor to read from, |
| .I splice_off_in |
| is an offset to read from, |
| .I fd |
| is the file descriptor to write to, |
| .I off |
| is an offset from which to start writing to. A sentinel value of -1 is used |
| to pass the equivalent of a NULL for the offsets to |
| .BR splice(2). |
| .I len |
| contains the number of bytes to copy. |
| .I splice_flags |
| contains a bit mask for the flag field associated with the system call. |
| Please note that one of the file descriptors must refer to a pipe. |
| See also |
| .BR splice(2) |
| for the general description of the related system call. Available since 5.7. |
| |
| .TP |
| .B IORING_OP_TEE |
| Issue the equivalent of a |
| .BR tee(2) |
| system call. |
| .I splice_fd_in |
| is the file descriptor to read from, |
| .I fd |
| is the file descriptor to write to, |
| .I len |
| contains the number of bytes to copy, and |
| .I splice_flags |
| contains a bit mask for the flag field associated with the system call. |
| Please note that both of the file descriptors must refer to a pipe. |
| See also |
| .BR tee(2) |
| for the general description of the related system call. Available since 5.8. |
| |
| .TP |
| .B IORING_OP_FILES_UPDATE |
| This command is an alternative to using |
| .B IORING_REGISTER_FILES_UPDATE |
| which then works in an async fashion, like the rest of the io_uring commands. |
| The arguments passed in are the same. |
| .I addr |
| must contain a pointer to the array of file descriptors, |
| .I len |
| must contain the length of the array, and |
| .I off |
| must contain the offset at which to operate. Note that the array of file |
| descriptors pointed to in |
| .I addr |
| must remain valid until this operation has completed. Available since 5.6. |
| |
| .TP |
| .B IORING_OP_PROVIDE_BUFFERS |
| This command allows an application to register a group of buffers to be used |
| by commands that read/receive data. Using buffers in this manner can eliminate |
| the need to separate the poll + read, which provides a convenient point in |
| time to allocate a buffer for a given request. It's often infeasible to have |
| as many buffers available as pending reads or receive. With this feature, the |
| application can have its pool of buffers ready in the kernel, and when the |
| file or socket is ready to read/receive data, a buffer can be selected for the |
| operation. |
| .I fd |
| must contain the number of buffers to provide, |
| .I addr |
| must contain the starting address to add buffers from, |
| .I len |
| must contain the length of each buffer to add from the range, |
| .I buf_group |
| must contain the group ID of this range of buffers, and |
| .I off |
| must contain the starting buffer ID of this range of buffers. With that set, |
| the kernel adds buffers starting with the memory address in |
| .I addr, |
| each with a length of |
| .I len. |
| Hence the application should provide |
| .I len * fd |
| worth of memory in |
| .I addr. |
| Buffers are grouped by the group ID, and each buffer within this group will be |
| identical in size according to the above arguments. This allows the application |
| to provide different groups of buffers, and this is often used to have |
| differently sized buffers available depending on what the expectations are of |
| the individual request. When submitting a request that should use a provided |
| buffer, the |
| .B IOSQE_BUFFER_SELECT |
| flag must be set, and |
| .I buf_group |
| must be set to the desired buffer group ID where the buffer should be selected |
| from. Available since 5.7. |
| |
| .TP |
| .B IORING_OP_REMOVE_BUFFERS |
| Remove buffers previously registered with |
| .B IORING_OP_PROVIDE_BUFFERS. |
| .I fd |
| must contain the number of buffers to remove, and |
| .I buf_group |
| must contain the buffer group ID from which to remove the buffers. Available |
| since 5.7. |
| |
| .TP |
| .B IORING_OP_SHUTDOWN |
| Issue the equivalent of a |
| .BR shutdown(2) |
| system call. |
| .I fd |
| is the file descriptor to the socket being shutdown, no other fields should |
| be set. Available since 5.11. |
| |
| .TP |
| .B IORING_OP_RENAMEAT |
| Issue the equivalent of a |
| .BR renameat2(2) |
| system call. |
| .I fd |
| should be set to the |
| .I olddirfd, |
| .I addr |
| should be set to the |
| .I oldpath, |
| .I len |
| should be set to the |
| .I newdirfd, |
| .I addr2 |
| should be set to the |
| .I newpath, |
| and finally |
| .I rename_flags |
| should be set to the |
| .I flags |
| passed in to |
| .BR renameat2(2). |
| Available since 5.11. |
| |
| .TP |
| .B IORING_OP_UNLINKAT |
| Issue the equivalent of a |
| .BR unlinkat (2) |
| system call. |
| .I fd |
| should be set to the |
| .I dirfd, |
| .I addr |
| should be set to the |
| .I pathname, |
| and |
| .I unlink_flags |
| should be set to the |
| .I flags |
| being passed in to |
| .BR unlinkat(2). |
| Available since 5.11. |
| |
| .PP |
| The |
| .I flags |
| field is a bit mask. The supported flags are: |
| .TP |
| .B IOSQE_FIXED_FILE |
| When this flag is specified, |
| .I fd |
| is an index into the files array registered with the io_uring instance (see the |
| .B IORING_REGISTER_FILES |
| section of the |
| .BR io_uring_register (2) |
| man page). Available since 5.1. |
| .TP |
| .B IOSQE_IO_DRAIN |
| When this flag is specified, the SQE will not be started before previously |
| submitted SQEs have completed, and new SQEs will not be started before this |
| one completes. Available since 5.2. |
| .TP |
| .B IOSQE_IO_LINK |
| When this flag is specified, it forms a link with the next SQE in the |
| submission ring. That next SQE will not be started before this one completes. |
| This, in effect, forms a chain of SQEs, which can be arbitrarily long. The tail |
| of the chain is denoted by the first SQE that does not have this flag set. |
| This flag has no effect on previous SQE submissions, nor does it impact SQEs |
| that are outside of the chain tail. This means that multiple chains can be |
| executing in parallel, or chains and individual SQEs. Only members inside the |
| chain are serialized. A chain of SQEs will be broken, if any request in that |
| chain ends in error. io_uring considers any unexpected result an error. This |
| means that, eg, a short read will also terminate the remainder of the chain. |
| If a chain of SQE links is broken, the remaining unstarted part of the chain |
| will be terminated and completed with |
| .B -ECANCELED |
| as the error code. Available since 5.3. |
| .TP |
| .B IOSQE_IO_HARDLINK |
| Like IOSQE_IO_LINK, but it doesn't sever regardless of the completion result. |
| Note that the link will still sever if we fail submitting the parent request, |
| hard links are only resilient in the presence of completion results for |
| requests that did submit correctly. IOSQE_IO_HARDLINK implies IOSQE_IO_LINK. |
| Available since 5.5. |
| .TP |
| .B IOSQE_ASYNC |
| Normal operation for io_uring is to try and issue an sqe as non-blocking first, |
| and if that fails, execute it in an async manner. To support more efficient |
| overlapped operation of requests that the application knows/assumes will |
| always (or most of the time) block, the application can ask for an sqe to be |
| issued async from the start. Available since 5.6. |
| .TP |
| .B IOSQE_BUFFER_SELECT |
| Used in conjunction with the |
| .B IORING_OP_PROVIDE_BUFFERS |
| command, which registers a pool of buffers to be used by commands that read |
| or receive data. When buffers are registered for this use case, and this |
| flag is set in the command, io_uring will grab a buffer from this pool when |
| the request is ready to receive or read data. If successful, the resulting CQE |
| will have |
| .B IORING_CQE_F_BUFFER |
| set in the flags part of the struct, and the upper |
| .B IORING_CQE_BUFFER_SHIFT |
| bits will contain the ID of the selected buffers. This allows the application |
| to know exactly which buffer was selected for the operation. If no buffers |
| are available and this flag is set, then the request will fail with |
| .B -ENOBUFS |
| as the error code. Once a buffer has been used, it is no longer available in |
| the kernel pool. The application must re-register the given buffer again when |
| it is ready to recycle it (eg has completed using it). Available since 5.7. |
| |
| .PP |
| .I ioprio |
| specifies the I/O priority. See |
| .BR ioprio_get (2) |
| for a description of Linux I/O priorities. |
| |
| .I fd |
| specifies the file descriptor against which the operation will be |
| performed, with the exception noted above. |
| |
| If the operation is one of |
| .B IORING_OP_READ_FIXED |
| or |
| .BR IORING_OP_WRITE_FIXED , |
| .I addr |
| and |
| .I len |
| must fall within the buffer located at |
| .I buf_index |
| in the fixed buffer array. If the operation is either |
| .B IORING_OP_READV |
| or |
| .BR IORING_OP_WRITEV , |
| then |
| .I addr |
| points to an iovec array of |
| .I len |
| entries. |
| |
| .IR rw_flags , |
| specified for read and write operations, contains a bitwise OR of |
| per-I/O flags, as described in the |
| .BR preadv2 (2) |
| man page. |
| |
| The |
| .I fsync_flags |
| bit mask may contain either 0, for a normal file integrity sync, or |
| .B IORING_FSYNC_DATASYNC |
| to provide data sync only semantics. See the descriptions of |
| .B O_SYNC |
| and |
| .B O_DSYNC |
| in the |
| .BR open (2) |
| manual page for more information. |
| |
| The bits that may be set in |
| .I poll_events |
| are defined in \fI<poll.h>\fP, and documented in |
| .BR poll (2). |
| |
| .I user_data |
| is an application-supplied value that will be copied into |
| the completion queue entry (see below). |
| .I buf_index |
| is an index into an array of fixed buffers, and is only valid if fixed |
| buffers were registered. |
| .I personality |
| is the credentials id to use for this operation. See |
| .BR io_uring_register (2) |
| for how to register personalities with io_uring. If set to 0, the current |
| personality of the submitting task is used. |
| .PP |
| Once the submission queue entry is initialized, I/O is submitted by |
| placing the index of the submission queue entry into the tail of the |
| submission queue. After one or more indexes are added to the queue, |
| and the queue tail is advanced, the |
| .BR io_uring_enter (2) |
| system call can be invoked to initiate the I/O. |
| |
| Completions use the following data structure: |
| .PP |
| .in +4n |
| .EX |
| /* |
| * IO completion data structure (Completion Queue Entry) |
| */ |
| struct io_uring_cqe { |
| __u64 user_data; /* sqe->data submission passed back */ |
| __s32 res; /* result code for this event */ |
| __u32 flags; |
| }; |
| .EE |
| .in |
| .PP |
| .I user_data |
| is copied from the field of the same name in the submission queue |
| entry. The primary use case is to store data that the application |
| will need to access upon completion of this particular I/O. The |
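| .I flags |
| field carries operation-specific flags, such as |
| .B IORING_CQE_F_BUFFER |
| when a buffer was selected for a request submitted with |
| .BR IOSQE_BUFFER_SELECT ; |
| all other bits are reserved for future use. |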
| .I res |
| is the operation-specific result, but io_uring-specific errors |
| (e.g. flags or opcode invalid) are returned through this field. |
| They are described in section |
| .B CQE ERRORS. |
| .PP |
| For read and write opcodes, the |
| return values match those documented in the |
| .BR preadv2 (2) |
| and |
| .BR pwritev2 (2) |
| man pages. |
| Return codes for the io_uring-specific opcodes are documented in the |
| description of the opcodes above. |
| .PP |
| .SH RETURN VALUE |
| .BR io_uring_enter () |
| returns the number of I/Os successfully consumed. This can be zero |
| if |
| .I to_submit |
| was zero or if the submission queue was empty. Note that if the ring was |
| created with |
| .B IORING_SETUP_SQPOLL |
| specified, then the return value will generally be the same as |
| .I to_submit |
| as submission happens outside the context of the system call. |
| |
| The errors related to a submission queue entry will be returned through a |
| completion queue entry (see section |
| .B CQE ERRORS), |
| rather than through the system call itself. |
| |
| Errors that occur not on behalf of a submission queue entry are returned via the |
| system call directly. On such an error, -1 is returned and |
| .I errno |
| is set appropriately. |
| .PP |
| .SH ERRORS |
| These are the errors returned by the |
| .BR io_uring_enter () |
| system call. |
| .TP |
| .B EAGAIN |
| The kernel was unable to allocate memory for the request, or otherwise ran out |
| of resources to handle it. The application should wait for some completions and |
| try again. |
| .TP |
| .B EBADF |
| .I fd |
| is not a valid file descriptor. |
| .TP |
| .B EBADFD |
| .I fd |
| is a valid file descriptor, but the io_uring ring is not in the right state |
| (enabled). See |
| .BR io_uring_register (2) |
| for details on how to enable the ring. |
| .TP |
| .B EBUSY |
| The application is attempting to overcommit the number of requests it can have |
| pending. The application should wait for some completions and try again. May |
| occur if the application tries to queue more requests than we have room for in |
| the CQ ring, or if the application attempts to wait for more events without |
| having reaped the ones already present in the CQ ring. |
| .TP |
| .B EINVAL |
| Some bits in the |
| .I flags |
| argument are invalid. |
| .TP |
| .B EFAULT |
| An invalid user space address was specified for the |
| .I sig |
| argument. |
| .TP |
| .B ENXIO |
| The io_uring instance is in the process of being torn down. |
| .TP |
| .B EOPNOTSUPP |
| .I fd |
| does not refer to an io_uring instance. |
| .TP |
| .B EINTR |
| The operation was interrupted by a delivery of a signal before it could |
| complete; see |
| .BR signal (7). |
| Can happen while waiting for events with |
| .B IORING_ENTER_GETEVENTS. |
| |
| .SH CQE ERRORS |
| These io_uring-specific errors are returned as a negative value in the |
| .I res |
| field of the completion queue entry. |
| .TP |
| .B EACCES |
| The |
| .I flags |
| field or |
| .I opcode |
| in a submission queue entry is not allowed due to registered restrictions. |
| See |
| .BR io_uring_register (2) |
| for details on how restrictions work. |
| .TP |
| .B EBADF |
| The |
| .I fd |
| field in the submission queue entry is invalid, or the |
| .B IOSQE_FIXED_FILE |
| flag was set in the submission queue entry, but no files were registered |
| with the io_uring instance. |
| .TP |
| .B EFAULT |
| The buffer is outside of the process' accessible address space. |
| .TP |
| .B EFAULT |
| .B IORING_OP_READ_FIXED |
| or |
| .B IORING_OP_WRITE_FIXED |
| was specified in the |
| .I opcode |
| field of the submission queue entry, but either buffers were not |
| registered for this io_uring instance, or the address range described |
| by |
| .I addr |
| and |
| .I len |
| does not fit within the buffer registered at |
| .IR buf_index . |
| .TP |
| .B EINVAL |
| The |
| .I flags |
| field or |
| .I opcode |
| in a submission queue entry is invalid. |
| .TP |
| .B EINVAL |
| The |
| .I buf_index |
| member of the submission queue entry is invalid. |
| .TP |
| .B EINVAL |
| The |
| .I personality |
| field in a submission queue entry is invalid. |
| .TP |
| .B EINVAL |
| .B IORING_OP_NOP |
| was specified in the submission queue entry, but the io_uring context |
| was setup for polling |
| .RB ( IORING_SETUP_IOPOLL |
| was specified in the call to io_uring_setup). |
| .TP |
| .B EINVAL |
| .B IORING_OP_READV |
| or |
| .B IORING_OP_WRITEV |
| was specified in the submission queue entry, but the io_uring instance |
| has fixed buffers registered. |
| .TP |
| .B EINVAL |
| .B IORING_OP_READ_FIXED |
| or |
| .B IORING_OP_WRITE_FIXED |
| was specified in the submission queue entry, and the |
| .I buf_index |
| is invalid. |
| .TP |
| .B EINVAL |
| .BR IORING_OP_READV , |
| .BR IORING_OP_WRITEV , |
| .BR IORING_OP_READ_FIXED , |
| .B IORING_OP_WRITE_FIXED |
| or |
| .B IORING_OP_FSYNC |
| was specified in the submission queue entry, but the io_uring instance |
| was configured for IOPOLLing, or any of |
| .IR addr , |
| .IR ioprio , |
| .IR off , |
| .IR len , |
| or |
| .I buf_index |
| was set in the submission queue entry. |
| .TP |
| .B EINVAL |
| .B IORING_OP_POLL_ADD |
| or |
| .B IORING_OP_POLL_REMOVE |
| was specified in the |
| .I opcode |
| field of the submission queue entry, but the io_uring instance was |
| configured for busy-wait polling |
| .RB ( IORING_SETUP_IOPOLL ), |
| or any of |
| .IR ioprio , |
| .IR off , |
| .IR len , |
| or |
| .I buf_index |
| was non-zero in the submission queue entry. |
| .TP |
| .B EINVAL |
| .B IORING_OP_POLL_ADD |
| was specified in the |
| .I opcode |
| field of the submission queue entry, and the |
| .I addr |
| field was non-zero. |
| .TP |
| .B EOPNOTSUPP |
| .I opcode |
| is valid, but not supported by this kernel. |
| .TP |
| .B EOPNOTSUPP |
| .B IOSQE_BUFFER_SELECT |
| was set in the |
| .I flags |
| field of the submission queue entry, but the |
| .I opcode |
| doesn't support buffer selection. |